diff --git a/General_Features.md b/General_Features.md
deleted file mode 100644
index f7de1ca4..00000000
--- a/General_Features.md
+++ /dev/null
@@ -1,8 +0,0 @@
-This package enforces a few features over all macros:
-
- - Ready for both Persistent Staging Areas and Transient Staging Areas, due to the allowance of multiple deltas in all macros, without losing any intermediate changes
- - Enforcing standards in naming conventions by implementing global variables for technical columns
- - Following the insert-only-approach by using a mix of tables and views
- - Creating a snapshot-based Business interface by using a centralized snapshot table supporting logarithmic logic
- - Optimizing incremental loads by implementing a high-water-mark that also works for entities that are loaded from multiple sources
-
diff --git a/macros/internal/metadata_processing/metadata_processing.yml b/macros/internal/metadata_processing/metadata_processing.yml
new file mode 100644
index 00000000..6fbd0a53
--- /dev/null
+++ b/macros/internal/metadata_processing/metadata_processing.yml
@@ -0,0 +1,21 @@
+version: 2
+
+macros:
+  - name: yaml_metadata_parser
+    description: A macro that parses yaml-metadata into individual parameters. Used in top-level front-end macros.
+    arguments:
+      - name: name
+        type: string
+        description: The name of the parameter you want to extract from the yaml-metadata.
+      - name: yaml_metadata
+        type: string
+        description: The yaml-string that holds the definition of other parameters. Needs to be valid yaml.
+      - name: parameter
+        type: variable
+        description: The forwarded parameter of the top-level macro. This is used if the yaml-metadata is none.
+      - name: required
+        type: boolean
+        description: Whether this parameter is required for the top-level macro. Default is False.
+      - name: documentation
+        type: string
+        description: A string that holds the documentation of this parameter.
diff --git a/macros/internal/metadata_processing/yaml_metadata_parser.sql b/macros/internal/metadata_processing/yaml_metadata_parser.sql
new file mode 100644
index 00000000..1b35d0ee
--- /dev/null
+++ b/macros/internal/metadata_processing/yaml_metadata_parser.sql
@@ -0,0 +1,28 @@
+{% macro yaml_metadata_parser(name=none, yaml_metadata=none, parameter=none, required=False, documentation=none) %}
+
+    {% if datavault4dbt.is_something(yaml_metadata) %}
+        {%- set metadata_dict = fromyaml(yaml_metadata) -%}
+        {% if name in metadata_dict.keys() %}
+            {% set return_value = metadata_dict.get(name) %}
+            {% if datavault4dbt.is_something(parameter)%}
+                {{ log("[" ~ this ~ "] Parameter '" ~ name ~ "' defined both in yaml-metadata and separately. Value from yaml-metadata will be used, and separate parameter is ignored.", info=False) }}
+            {% endif %}
+        {% elif datavault4dbt.is_something(parameter) %}
+            {% set return_value = parameter %}
+            {{ log("[" ~ this ~ "] yaml-metadata given, but parameter '" ~ name ~ "' not defined in there. Applying '" ~ parameter ~ "' which is either a parameter passed separately or the default value.", info=False) }}
+        {% elif required %}
+            {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: yaml-metadata given, but required parameter '" ~ name ~ "' not defined in there or outside in the parameter. 
\n Description of parameter '" ~ name ~ "': \n" ~ documentation ) }} + {% else %} + {% set return_value = None %} + {% endif %} + {% elif datavault4dbt.is_something(parameter) %} + {% set return_value = parameter %} + {% elif required %} + {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: Required parameter '" ~ name ~ "' not defined. Define it either directly, or inside yaml-metadata. \n Description of parameter '" ~ name ~ "': \n" ~ documentation ) }} + {% else %} + {% set return_value = None %} + {% endif %} + + {{ return(return_value) }} + +{% endmacro %} diff --git a/macros/staging/bigquery/stage.sql b/macros/staging/bigquery/stage.sql index a6403ebc..78bd66c3 100644 --- a/macros/staging/bigquery/stage.sql +++ b/macros/staging/bigquery/stage.sql @@ -96,17 +96,19 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} @@ -183,6 +185,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. 
#} @@ -224,8 +228,6 @@ ldts_rsrc_data AS ( {%- set last_cte = "ldts_rsrc_data" -%} {%- set final_columns_to_select = alias_columns + final_columns_to_select %} - {%- set final_columns_to_select = datavault4dbt.process_columns_to_select(final_columns_to_select, derived_column_names) | list -%} - {%- set columns_without_excluded_columns_tmp = [] -%} {%- for column in columns_without_excluded_columns -%} {%- if column.name | lower not in derived_column_names | map('lower') -%} @@ -256,6 +258,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -263,19 +266,43 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} + {%- endif -%} + + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} FROM {{ last_cte }} lcte - {% for col, vals in prejoined_columns.items() %} + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. 
Allowed are: @@ -296,28 +323,25 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. #} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endfor %} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} @@ -444,65 +468,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 
'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -514,62 +534,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not 
loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} diff --git a/macros/staging/databricks/stage.sql b/macros/staging/databricks/stage.sql index 72dc7cbc..4511fdda 100644 --- a/macros/staging/databricks/stage.sql +++ b/macros/staging/databricks/stage.sql @@ -96,17 +96,19 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} @@ -177,8 +179,13 @@ {% set error_value_rsrc = var('datavault4dbt.default_error_rsrc', 'ERROR') %} {% set unknown_value_rsrc = var('datavault4dbt.default_unknown_rsrc', 'SYSTEM') %} -{# Setting the rsrc default datatype #} -{% set rsrc_default_dtype = datavault4dbt.string_default_dtype(type=rsrc) %} +{# Setting the rsrc default datatype and length #} +{% set rsrc_default_dtype = datavault4dbt.string_default_dtype(type='rsrc') %} + +{# Setting the ldts default datatype #} +{% set 
ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} + +{{ datavault4dbt.prepend_generated_by() }} WITH @@ -206,7 +213,7 @@ source_data AS ( ldts_rsrc_data AS ( SELECT - {{ ldts }} AS {{ load_datetime_col_name}}, + CAST( {{ ldts }} as {{ ldts_default_dtype }} ) AS {{ load_datetime_col_name }}, CAST( {{ rsrc }} as {{ rsrc_default_dtype }} ) AS {{ record_source_col_name }} {%- if datavault4dbt.is_something(sequence) %}, {{ sequence }} AS edwSequence @@ -221,8 +228,7 @@ ldts_rsrc_data AS ( {%- set last_cte = "ldts_rsrc_data" -%} {%- set final_columns_to_select = alias_columns + final_columns_to_select %} - {%- set final_columns_to_select = datavault4dbt.process_columns_to_select(final_columns_to_select, derived_column_names) | list -%} - + {%- set columns_without_excluded_columns_tmp = [] -%} {%- for column in columns_without_excluded_columns -%} {%- if column.name | lower not in derived_column_names | map('lower') -%} @@ -253,6 +259,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -260,19 +267,43 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} + {%- endif -%} + + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} FROM {{ last_cte }} lcte - {% for col, vals in prejoined_columns.items() %} + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -293,28 +324,25 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. #} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endfor %} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} @@ -441,65 +469,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% 
endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -511,62 +535,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - 
{%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} diff --git a/macros/staging/exasol/stage.sql b/macros/staging/exasol/stage.sql index 3b6571df..0b21c53d 100644 --- a/macros/staging/exasol/stage.sql +++ b/macros/staging/exasol/stage.sql @@ -90,7 +90,7 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -99,9 +99,11 @@ {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = 
datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} @@ -178,6 +180,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -188,6 +192,12 @@ source_data AS ( FROM {{ source_relation }} + {% if is_incremental() %} + WHERE {{ ldts }} > (SELECT max({{ load_datetime_col_name}}) + FROM {{ this }} + WHERE {{ load_datetime_col_name}} != {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} ) + {%- endif -%} + {% set last_cte = "source_data" -%} ), @@ -244,6 +254,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -251,19 +262,43 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS "{{ col | upper }}" - {% endfor -%} + {%- endif -%} + + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} FROM {{ last_cte }} lcte - {% for col, vals in prejoined_columns.items() %} + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -284,25 +319,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. #} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endfor %} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} @@ -429,63 +461,64 @@ hashed_columns AS ( {%- if enable_ghost_records and not is_incremental() %} {# Creating Ghost Record for unknown case, based on datatype #} unknown_values AS ( + SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, 
ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=column.dtype, col_size=column.char_size, ghost_record_type='unknown') }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name|upper, datatype=column.dtype, col_size=column.char_size, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -497,62 +530,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in 
prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=column.dtype, col_size=column.char_size, ghost_record_type='error') -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=column.char_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -582,12 +614,13 @@ columns_to_select AS ( {%- if enable_ghost_records and not is_incremental() %} UNION ALL + SELECT {{ datavault4dbt.print_list(datavault4dbt.escape_column_names(final_columns_to_select)) }} FROM ghost_records -{%- endif -%} +{% endif %} ) SELECT * FROM columns_to_select diff --git a/macros/staging/fabric/stage.sql b/macros/staging/fabric/stage.sql index 2a33caf2..6735e76a 100644 --- a/macros/staging/fabric/stage.sql +++ b/macros/staging/fabric/stage.sql @@ -89,12 +89,11 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = 
datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} - {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} @@ -134,8 +133,11 @@ {%- set only_include_from_source = (derived_input_columns + hashed_input_columns + prejoined_input_columns + ma_keys) | unique | list -%} {%- else -%} + {%- set only_include_from_source = (derived_input_columns + hashed_input_columns + prejoined_input_columns) | unique | list -%} + {%- endif -%} + {%- set source_columns_to_select = only_include_from_source -%} {%- endif-%} @@ -253,28 +255,54 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} +{# Prejoining Business Keys of other source objects for Link purposes #} {%- set final_columns_to_select = (final_columns_to_select + derived_input_columns) | unique | list -%} -{# Prejoining Business Keys of other source objects for Link purposes #} + prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{datavault4dbt.escape_column_names(vals['bk'])}} AS {{datavault4dbt.escape_column_names(col)}} - {% endfor -%} + {%- endif -%} + + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ datavault4dbt.escape_column_names(column) }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ datavault4dbt.escape_column_names(prejoin['aliases'][loop.index0]) }} {% endif -%} + {%- endfor -%} + {%- endfor %} FROM {{ last_cte }} lcte - {% for col, vals in prejoined_columns.items() %} + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -295,28 +323,25 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. #} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=datavault4dbt.escape_column_names(vals['this_column_name']), prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=datavault4dbt.escape_column_names(vals['ref_column_name'])) }} - - {% endfor %} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=datavault4dbt.escape_column_names(prejoin['this_column_name']), prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=datavault4dbt.escape_column_names(prejoin['ref_column_name'])) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} @@ -443,65 +468,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} 
{%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} {%- for column in pj_relation_columns -%} - - {%- if column.name|lower == vals['bk']|lower -%} - {{- log('column found? yes, for column :' ~ column.name , false) -}} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', alias=datavault4dbt.escape_column_names(col)) }} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', alias=datavault4dbt.escape_column_names(prejoin['aliases'][prejoin_col_index])) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{%- endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -513,62 +534,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if 
datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', alias=datavault4dbt.escape_column_names(col)) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', alias=datavault4dbt.escape_column_names(prejoin['aliases'][prejoin_col_index])) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -595,6 +615,7 @@ columns_to_select AS ( {{ datavault4dbt.print_list(datavault4dbt.escape_column_names(final_columns_to_select)) }} FROM {{ last_cte }} + {% if enable_ghost_records and not is_incremental() %} UNION ALL diff --git a/macros/staging/oracle/stage.sql b/macros/staging/oracle/stage.sql index ca070b88..a8d339fb 100644 --- a/macros/staging/oracle/stage.sql +++ b/macros/staging/oracle/stage.sql @@ -96,19 +96,19 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set 
hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} - {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} - {{ log('source_columns_to_select when include_source_columns=true: '~ source_columns_to_select, false) }} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} @@ -189,6 +189,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -263,6 +265,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -270,19 +273,43 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} + {%- endif -%} + + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} FROM {{ last_cte }} lcte - {% for col, vals in prejoined_columns.items() %} + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -303,28 +330,25 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. #} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - - left join {{ relation }} {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endfor %} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} @@ -452,65 +476,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} 
+ ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -523,62 +543,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation 
= source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -621,4 +640,4 @@ columns_to_select AS ( SELECT * FROM columns_to_select -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} diff --git a/macros/staging/postgres/stage.sql b/macros/staging/postgres/stage.sql index cec4b2c7..b60a8a6c 100644 --- a/macros/staging/postgres/stage.sql +++ b/macros/staging/postgres/stage.sql @@ -96,18 +96,19 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = 
derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} - {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} @@ -184,6 +185,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -256,6 +259,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -263,19 +267,43 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} + {%- endif -%} + + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} FROM {{ last_cte }} lcte - {% for col, vals in prejoined_columns.items() %} + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -296,25 +324,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} - {%- do exceptions.raise_compiler_error(error_message) -%} + {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. #} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endfor %} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} @@ -444,65 +469,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ 
datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -514,62 +535,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - 
{%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} - {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} - {%- endif -%} - {%- endfor -%} - {%- if not loop.last -%},{%- endif %} - {% endfor -%} + {% for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} + {%- endif -%} - {%- endif -%} + {%- endfor -%} + {% endfor -%} + {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} diff --git a/macros/staging/redshift/stage.sql b/macros/staging/redshift/stage.sql index 199bcb7d..2079a590 100644 --- a/macros/staging/redshift/stage.sql +++ b/macros/staging/redshift/stage.sql @@ -95,18 +95,20 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = 
(all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} @@ -183,6 +185,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -255,26 +259,51 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( SELECT - {% if final_columns_to_select | length > 0 -%} + {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} + {%- endif -%} + + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} FROM {{ last_cte }} lcte - {% for col, vals in prejoined_columns.items() %} + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -295,28 +324,25 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. #} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endfor %} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} @@ -455,65 +481,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, - {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column and derived_columns #} + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} + {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ 
datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -525,62 +547,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) 
-%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} diff --git a/macros/staging/snowflake/stage.sql b/macros/staging/snowflake/stage.sql index 408f0814..ea629c51 100644 --- a/macros/staging/snowflake/stage.sql +++ b/macros/staging/snowflake/stage.sql @@ -96,19 +96,20 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = 
adapter.get_columns_in_relation( source_relation ) -%} {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} - {{ log('source_columns_to_select when include_source_columns=true: '~ source_columns_to_select, false) }} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} @@ -189,6 +190,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -263,6 +266,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -270,19 +274,43 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} + {%- endif -%} + + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} FROM {{ last_cte }} lcte - {% for col, vals in prejoined_columns.items() %} + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -303,28 +331,25 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. #} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endfor %} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} @@ -395,65 +420,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% 
endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -465,62 +486,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = 
source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} diff --git a/macros/staging/stage.sql b/macros/staging/stage.sql index c6e39879..4b06cfe8 100644 --- a/macros/staging/stage.sql +++ b/macros/staging/stage.sql @@ -1,15 +1,21 @@ {# This macro creates the staging layer for the Data Vault model. This layer is mainly for hashing, and additionally gives the option to create derived columns, conduct prejoins and add NULL values for missing columns. Always create one stage per source table that you want to add to the Data Vault model. The staging layer is not to harmonize data. That will be done in the later layers. + #} + - Parameters: +{%- macro stage(yaml_metadata=none, ldts=none, rsrc=none, source_model=none, include_source_columns=true, hashed_columns=none, derived_columns=none, sequence=none, prejoined_columns=none, missing_columns=none, multi_active_config=none, enable_ghost_records=true) -%} + + {% set ldts_description = " ldts::string Name of the column inside the source data, that holds information about the Load Date Timestamp. Can also be a SQL expression. 
Examples: 'edwLoadDate' Uses the column called 'edwLoadDate' as it is from the source model. 'PARSE_TIMESTAMP('%Y-%m-%dT%H-%M-%S', edwLoadDate)' Applies the SQL function 'PARSE_TIMESTAMP' on the input column 'edwLoadDate'. + " %} + {% set rsrc_description = " rsrc::string Name of the column inside the source data, that holds information about the Record Source. Can also be a SQL expression or a static string. A static string must begin with a '!'. @@ -17,7 +23,9 @@ 'edwRecordSource' Uses the column called 'edwRecordSource' as it is from the source model. '!SAP.Accounts' Uses the static string 'SAP.Customers' as rsrc. 'CONCAT(source_system, '||', source_object)' Applies the SQL function 'CONCAT' to concatenate two source columns. + " %} + {% set source_model_description = " source_model::string | dictionary Can be just a string holding the name of the referred dbt model to use as a source. But if the 'source' functionality inside the .yml file is used, it must be a dictionary with 'source_name': 'source_table'. @@ -25,10 +33,14 @@ 'source_account' The source model that you want to use for the stage is available as another dbt model with the name 'source_account'. {'source_data': 'source_account'} The source model that you want to use for the stage is available as a source defined inside the .yml file with the name 'source_data', and you select the table 'source_account' out of that source. + " %} + {% set include_source_columns_description = " include_source_columns::boolean Defines if all columns from the referred source table should be included in the result table, or if only the added columns should be part of the result table. By default the source columns should be included. + " %} + {% set hashed_columns_description = " hashed_columns::dictionary Defines the names and input for all hashkeys and hashdiffs to create. The key of each hash column is the name of the hash column. The value for Hashkeys is a list of input Business Keys, for Hashdiffs another dictionary with the pairs 'is_hashdiff:true' and 'columns: '. @@ -38,7 +50,9 @@ 'hd_account_s': {'is_hashdiff': true, keys 'account_number' and 'account_key'. A hashdiff called 'hd_account_s' is calculated 'columns': ['name', 'address', 'phone', 'email']}} out of the descriptive attributes 'name', 'address', 'phone', and 'email'. More hashkeys and hashdiffs would be added as other keys of the dictionary. + " %} + {% set derived_columns_description = " derived_columns::dictionary Defines values and datatypes for derived ('added' or 'calculated') columns. The values of this dictionary are the desired column names, the value is another dictionary with the keys 'value' (holding a column name, a SQL expression, or a static string beginning with '!') and 'datatype' (holding a valid SQL datatype for the target database). @@ -48,15 +62,19 @@ 'datatype': 'INT64'}, the number of days between two columns available inside the source data. 'country_isocode': {'value': '!GER', The column 'country_isocode' inserts the static string 'EUR' for all rows. 'datatype': 'STRING'}, The column 'account_name' duplicates an already existing column and gives - 'account_name': {'value': 'name', it another name. More derived columns can be added as other keys of + 'account_name': {'value': 'name', it another name. More derived columns can be added as additional keys of 'datatype': 'String'}} the dictionary. 
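As a compact sketch, a derived_columns definition along the lines of this description could look as follows inside a stage model. The 'duration_in_days' entry and its DATETIME_DIFF expression are hypothetical stand-ins for "the number of days between two columns", while the other two entries reuse the example values from the description.

{# Hypothetical stage model snippet; 'duration_in_days' and its expression are illustrative only. #}
{%- set derived_columns = {
    'duration_in_days': {'value': 'DATETIME_DIFF(closing_date, opening_date, DAY)', 'datatype': 'INT64'},
    'country_isocode': {'value': '!GER', 'datatype': 'STRING'},
    'account_name': {'value': 'name', 'datatype': 'STRING'}
} -%}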
+ " %} + {% set sequence_description = " sequence::string Name of the column inside the source data, that holds a sequence number that was generated during the data source extraction process. Optional and not required. Example: 'edwSequence' Uses the column 'edwSequence' that is available inside the source data. + " %} + {% set prejoined_columns_description = " prejoined_columns::dictionary Defines information about information that needs to be prejoined. Most commonly used to create links, when the source data does not hold the Business Key, but the technical key of the referred object. The values of the dict are the aliases you want to give the prejoined columns. Typically, but not always, this should be the same as the name of the prejoined column inside the prejoined entity. For each prejoined column @@ -71,13 +89,15 @@ 'bk': 'contractnumber', name (specified in 'bk') from the source table 'contract' in the source 'source_data' 'this_column_name': 'ContractId', by joining on 'this.ContractId = contract.Id'. In this case the prejoined 'ref_column_name': 'Id'}, column alias equals the name of the original business key column, which should be - 'master_account_key' {'ref_model': 'account_prep', or a self-prejoin happens, and then you would have to rename the final columns to not + 'master_account_key': {'ref_model': 'account_prep', or a self-prejoin happens, and then you would have to rename the final columns to not 'bk': 'account_key', have duplicate column names. The column 'master_account_key' holds values of the column 'this_column_name': 'master_account_id', 'account_key' inside the pre-populated dbt model 'account_prep'. If this prejoin is done inside account, 'ref_column_name': 'Id'}} we would now have a self-prejoin ON 'account.master_account_id = account.Id'. Because the table 'account' already has a column 'account_key', we rename the prejoined column - to 'master_account_key'. More prejoined columns can be added as other keys of the dictionary. + to 'master_account_key'. More prejoined columns can be added as additional keys of the dictionary. + " %} + {% set missing_columns_description = " missing_columns::dictionary If the schema of the source changes over time and columns are disappearing, this parameter gives you the option to create additional columns holding NULL values, that replace columns that were previously there. By this procedure, hashdiff calculations and satellite payloads wont break. The dictionary holds the names of those columns as keys, and the SQL datatypes of these columns as values. @@ -85,7 +105,9 @@ Example: {'legacy_account_uuid': 'INT64', Two additional columns are added to the source table holding NULL values. The column 'legacy_account_uuid' will 'shipping_address' : 'STRING'} have the datatype 'INT64' and the column 'shipping_address' will have the datatype 'STRING'. + " %} + {% set multi_active_config_description = " multi_active_config::dictionary If the source data holds multi-active data, define here the column(s) holding the multi-active key and the main hashkey column. If the source data is multi-active but has no natural multi-active key, create one using the row_number SQL function (or similar) one layer before. Then insert the name of that artificial column into the multi-active-key parameter. The combination of the multi-active key(s), the main-hashkey and the ldts column should be unique in the final result satellite. 
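Ahead of the tabulated examples, a sketch of how such a multi-active configuration could be passed in a stage model; the contact-related names follow the examples given below and are not prescribed by the macro.

{# Sketch of a single-key multi-active configuration; names follow the contact example in this description. #}
{%- set multi_active_config = {
    'multi_active_key': 'phonetype',
    'main_hashkey_column': 'hk_contact_h'
} -%}

For sources with more than one multi-active key, 'multi_active_key' can instead be a list, as in the second example below.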
@@ -96,43 +118,57 @@ 'main_hashkey_column': 'hk_contact_h'} That means, that the combination of main_hashkey, ldts and 'phonetype' is unique inside the source system. {'multi_active_key': ['phonetype', 'company'], This source data comes with two multi-active keys. The combination of those two, the main_hashkey and ldts is unique - 'main_hashkey_column': 'hk_contact_h'} inside the source system. - - enable_ghost_records::boolean If set to true, the stage will be created with ghost records. By default, ghost records are enabled. Optional Parameter - - #} - - - - {%- macro stage(ldts, rsrc, source_model, include_source_columns=true, hashed_columns=none, derived_columns=none, sequence=none, prejoined_columns=none, missing_columns=none, multi_active_config=none, enable_ghost_records=true) -%} - - {# If include_source_columns is passed but its empty then it is set with the default value (true) #} - {%- if include_source_columns is none or include_source_columns == "" -%} - {%- set include_source_columns = true -%} - {%- endif -%} - - {# If enable_ghost_records is passed but its empty then it is set with the default value (true) #} - {%- if enable_ghost_records is none or enable_ghost_records == "" -%} - {%- set enable_ghost_records = true -%} - {%- endif -%} - - {# If ldts is empty replace it with the current timestamp #} - {%- if datavault4dbt.is_nothing(ldts) -%} - {%- set ldts = datavault4dbt.current_timestamp() -%} - {%- endif -%} - - {% set hashed_columns= datavault4dbt.exclude_hashdiff_columns(source_model,hashed_columns)%} + 'main_hashkey_column': 'hk_contact_h'} inside the source system. + " %} + + {% set enable_ghost_records_description = " + enable_ghost_records::boolean If set to true, the stage will be created with ghost records. By default, ghost records are enabled. Optional Parameter. 
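With all parameter descriptions in place, the reworked macro accepts them either as separate arguments or bundled into a single yaml_metadata string. Below is a minimal sketch of such a stage model; the file name and all source, column and hash names are illustrative only and reuse the examples from the descriptions above. ldts, rsrc and source_model are the only required entries, matching the required=True flags in the parser calls that follow.

{# Hypothetical stage model, e.g. models/stages/stage_account.sql; all names are illustrative. #}
{%- set yaml_metadata -%}
source_model:
    source_data: source_account
ldts: edwLoadDate
rsrc: edwRecordSource
hashed_columns:
    hk_account_h:
        - account_number
        - account_key
    hd_account_s:
        is_hashdiff: true
        columns:
            - name
            - address
            - phone
            - email
{%- endset -%}

{{ datavault4dbt.stage(yaml_metadata=yaml_metadata) }}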
+ " %} + + {%- set ldts = datavault4dbt.yaml_metadata_parser(name='ldts', yaml_metadata=yaml_metadata, parameter=ldts, required=True, documentation=ldts_description) -%} + {%- set rsrc = datavault4dbt.yaml_metadata_parser(name='rsrc', yaml_metadata=yaml_metadata, parameter=rsrc, required=True, documentation=rsrc_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set include_source_columns = datavault4dbt.yaml_metadata_parser(name='include_source_columns', yaml_metadata=yaml_metadata, parameter=include_source_columns, required=False, documentation=include_source_columns_description) -%} + {%- set hashed_columns = datavault4dbt.yaml_metadata_parser(name='hashed_columns', yaml_metadata=yaml_metadata, parameter=hashed_columns, required=False, documentation=hashed_columns_description) -%} + {%- set derived_columns = datavault4dbt.yaml_metadata_parser(name='derived_columns', yaml_metadata=yaml_metadata, parameter=derived_columns, required=False, documentation=derived_columns_description) -%} + {%- set sequence = datavault4dbt.yaml_metadata_parser(name='sequence', yaml_metadata=yaml_metadata, parameter=sequence, required=False, documentation=sequence_description) -%} + {%- set prejoined_columns = datavault4dbt.yaml_metadata_parser(name='prejoined_columns', yaml_metadata=yaml_metadata, parameter=prejoined_columns, required=False, documentation=prejoined_columns_description) -%} + {%- set missing_columns = datavault4dbt.yaml_metadata_parser(name='missing_columns', yaml_metadata=yaml_metadata, parameter=missing_columns, required=False, documentation=missing_columns_description) -%} + {%- set multi_active_config = datavault4dbt.yaml_metadata_parser(name='multi_active_config', yaml_metadata=yaml_metadata, parameter=multi_active_config, required=False, documentation=multi_active_config_description) -%} + {%- set enable_ghost_records = datavault4dbt.yaml_metadata_parser(name='enable_ghost_records', yaml_metadata=yaml_metadata, parameter=enable_ghost_records, required=False, documentation=enable_ghost_records_description) -%} + + {# If include_source_columns is passed but its empty then it is set with the default value (true) #} + {%- if include_source_columns is none or include_source_columns == "" -%} + {%- set include_source_columns = true -%} + {%- endif -%} + + {# If enable_ghost_records is passed but its empty then it is set with the default value (true) #} + {%- if enable_ghost_records is none or enable_ghost_records == "" -%} + {%- set enable_ghost_records = true -%} + {%- endif -%} + + {# If ldts is empty replace it with the current timestamp #} + {%- if datavault4dbt.is_nothing(ldts) -%} + {%- set ldts = datavault4dbt.current_timestamp() -%} + {%- endif -%} - {{- adapter.dispatch('stage', 'datavault4dbt')(include_source_columns=include_source_columns, - ldts=ldts, - rsrc=rsrc, - source_model=source_model, - hashed_columns=hashed_columns, - derived_columns=derived_columns, - sequence=sequence, - prejoined_columns=prejoined_columns, - missing_columns=missing_columns, - multi_active_config=multi_active_config, - enable_ghost_records=enable_ghost_records) -}} + {# To parse the list syntax of prejoined columns #} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {%- set prejoined_columns = datavault4dbt.process_prejoined_columns(prejoined_columns) -%} + {%- endif -%} + + {% set hashed_columns= 
datavault4dbt.exclude_hashdiff_columns(source_model,hashed_columns) %} + + {{- adapter.dispatch('stage', 'datavault4dbt')(include_source_columns=include_source_columns, + ldts=ldts, + rsrc=rsrc, + source_model=source_model, + hashed_columns=hashed_columns, + derived_columns=derived_columns, + sequence=sequence, + prejoined_columns=prejoined_columns, + missing_columns=missing_columns, + multi_active_config=multi_active_config, + enable_ghost_records=enable_ghost_records) -}} {%- endmacro -%} diff --git a/macros/internal/helpers/stage_processing_macros.sql b/macros/staging/stage_processing_macros.sql similarity index 56% rename from macros/internal/helpers/stage_processing_macros.sql rename to macros/staging/stage_processing_macros.sql index 6de3d6a0..4072762d 100644 --- a/macros/internal/helpers/stage_processing_macros.sql +++ b/macros/staging/stage_processing_macros.sql @@ -54,24 +54,27 @@ {# Do nothing. No source column required. #} {%- elif value is mapping and value.is_hashdiff -%} {%- do extracted_input_columns.append(value['columns']) -%} - {%- elif value is mapping and 'this_column_name' in value.keys() -%} - {%- if datavault4dbt.is_list(value['this_column_name'])-%} - {%- for column in value['this_column_name'] -%} - {%- do extracted_input_columns.append(column) -%} - {%- endfor -%} - {%- else -%} - {%- do extracted_input_columns.append(value['this_column_name']) -%} - {%- endif -%} {%- else -%} {%- do extracted_input_columns.append(value) -%} {%- endif -%} {%- endfor -%} - - {%- do return(extracted_input_columns) -%} + + {%- elif datavault4dbt.is_list(columns_dict) -%} + {% for prejoin in columns_dict %} + {%- if datavault4dbt.is_list(prejoin['this_column_name'])-%} + {%- for column in prejoin['this_column_name'] -%} + {%- do extracted_input_columns.append(column) -%} + {%- endfor -%} + {%- else -%} + {%- do extracted_input_columns.append(prejoin['this_column_name']) -%} + {%- endif -%} + {% endfor %} {%- else -%} {%- do return([]) -%} {%- endif -%} + {%- do return(extracted_input_columns) -%} + {%- endmacro -%} @@ -132,4 +135,89 @@ {%- endif %} {%- endfor -%} -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} + + +{%- macro process_prejoined_columns(prejoined_columns=none) -%} + {# Check if the old syntax is used for prejoined columns + If so parse it to new list syntax #} + + {% if datavault4dbt.is_list(prejoined_columns) %} + {% do return(prejoined_columns) %} + {% else %} + {% set output = [] %} + + {% for key, value in prejoined_columns.items() %} + {% set ref_model = value.get('ref_model') %} + {% set src_name = value.get('src_name') %} + {% set src_table = value.get('src_table') %} + {%- if 'operator' not in value.keys() -%} + {%- do value.update({'operator': 'AND'}) -%} + {%- set operator = 'AND' -%} + {%- else -%} + {%- set operator = value.get('operator') -%} + {%- endif -%} + + {% set match_criteria = ( + ref_model and output | selectattr('ref_model', 'equalto', ref_model) or + src_name and output | selectattr('src_name', 'equalto', src_name) | selectattr('src_table', 'equalto', src_table) + ) | selectattr('this_column_name', 'equalto', value.this_column_name) + | selectattr('ref_column_name', 'equalto', value.ref_column_name) + | selectattr('operator', 'equalto', value.operator) + | list | first %} + + {% if match_criteria %} + {% do match_criteria['extract_columns'].append(value.bk) %} + {% do match_criteria['aliases'].append(key) %} + {% else %} + {% set new_item = { + 'extract_columns': [value.bk], + 'aliases': [key], + 'this_column_name': 
value.this_column_name, + 'ref_column_name': value.ref_column_name, + 'operator': operator + } %} + + {% if ref_model %} + {% do new_item.update({'ref_model': ref_model}) %} + {% elif src_name and src_table %} + {% do new_item.update({'src_name': src_name, 'src_table': src_table}) %} + {% endif %} + + {% do output.append(new_item) %} + {% endif %} + {% endfor %} + {% endif %} + + {%- do return(output) -%} + +{%- endmacro -%} + + +{%- macro extract_prejoin_column_names(prejoined_columns=none) -%} + + {%- set extracted_column_names = [] -%} + + {% if not datavault4dbt.is_something(prejoined_columns) %} + {%- do return(extracted_column_names) -%} + {% endif %} + + {% for prejoin in prejoined_columns %} + {% if datavault4dbt.is_list(prejoin['aliases']) %} + {% for alias in prejoin['aliases'] %} + {%- do extracted_column_names.append(alias) -%} + {% endfor %} + {% elif datavault4dbt.is_something(prejoin['aliases']) %} + {%- do extracted_column_names.append(prejoin['aliases']) -%} + {% elif datavault4dbt.is_list(prejoin['extract_columns']) %} + {% for column in prejoin['extract_columns'] %} + {%- do extracted_column_names.append(column) -%} + {% endfor %} + {% else %} + {%- do extracted_column_names.append(prejoin['extract_columns']) -%} + {% endif %} + {%- endfor -%} + + {%- do return(extracted_column_names) -%} + +{%- endmacro -%} diff --git a/macros/staging/staging.yml b/macros/staging/staging.yml new file mode 100644 index 00000000..86988615 --- /dev/null +++ b/macros/staging/staging.yml @@ -0,0 +1,23 @@ +version: 2 + +macros: + - name: process_prejoined_columns + description: > + A macro to process prejoined columns. If a list of dictionaries (new syntax) is provided it will do nothing and return the list. + If a dictionary of dictionaries is provided (old syntax) it will be transformed to the new syntax. + When multiple columns are to be extracted from the same prejoin target with the same conditions (columns and operator), they will be combined into one item. + arguments: + - name: prejoined_columns + type: list or dictionary + description: The value of the prejoined_columns as defined in the yaml_metadata of the stage-model. + + - name: extract_prejoin_column_names + description: > + A macro to extract the names of the prejoined columns of each staging-model. + Takes a list of prejoins and will add the aliases of the prejoins to the return-list. + If no aliases are present it will return the names of the extracted columns. + Returns an empty list if the passed parameter is empty.
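To illustrate the conversion that process_prejoined_columns performs, a sketch using the contract example from the stage documentation; all names come from that example, and the second block shows the normalised structure the macro is expected to return.

{# Old dictionary syntax (one key per prejoined column), as documented for the stage macro: #}
{%- set prejoined_columns_old = {
    'contractnumber': {
        'src_name': 'source_data',
        'src_table': 'contract',
        'bk': 'contractnumber',
        'this_column_name': 'ContractId',
        'ref_column_name': 'Id'
    }
} -%}

{# Equivalent list syntax after process_prejoined_columns, one item per prejoin target and join condition: #}
{%- set prejoined_columns_new = [
    {
        'src_name': 'source_data',
        'src_table': 'contract',
        'extract_columns': ['contractnumber'],
        'aliases': ['contractnumber'],
        'this_column_name': 'ContractId',
        'ref_column_name': 'Id',
        'operator': 'AND'
    }
] -%}

When several old-style entries target the same relation with identical this_column_name, ref_column_name and operator, their bk/alias pairs are merged into one item's extract_columns and aliases lists, so only one join is generated for them.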
+ arguments: + - name: prejoined_columns + type: list + description: The prejoined_columns as process by the process_prejoined_columns-macro \ No newline at end of file diff --git a/macros/staging/synapse/stage.sql b/macros/staging/synapse/stage.sql index 4aac64b9..27839c37 100644 --- a/macros/staging/synapse/stage.sql +++ b/macros/staging/synapse/stage.sql @@ -89,7 +89,7 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -179,6 +179,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -255,6 +257,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {%- set final_columns_to_select = (final_columns_to_select + derived_input_columns) | unique | list -%} @@ -264,19 +267,43 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} + {%- endif -%} + + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} FROM {{ last_cte }} lcte - {% for col, vals in prejoined_columns.items() %} + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -297,28 +324,25 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. #} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endfor %} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} @@ -445,65 +469,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} - {%- 
if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -515,62 +535,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = 
source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} diff --git a/macros/supporting/ghost_record_per_datatype.sql b/macros/supporting/ghost_record_per_datatype.sql index 03698646..b12fac84 100644 --- a/macros/supporting/ghost_record_per_datatype.sql +++ b/macros/supporting/ghost_record_per_datatype.sql @@ -32,6 +32,7 @@ {%- if ghost_record_type == 'unknown' -%} {%- if datatype == 'TIMESTAMP' %} {{ datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }} as {{ alias }} + {%- elif datatype == 'DATETIME'%} CAST({{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} AS {{ datatype }}) as {{ alias }} {%- elif datatype == 'DATE'-%} PARSE_DATE('{{date_format}}','{{ beginning_of_all_times_date }}') as {{ alias }} {%- elif datatype == 'STRING' %} '{{unknown_value__STRING}}' as {{ alias }} {%- elif datatype == 'INT64' %} CAST({{unknown_value__numeric}} as INT64) as {{ alias }} @@ -41,6 +42,7 @@ {% endif %} {%- elif ghost_record_type == 'error' -%} {%- if datatype == 'TIMESTAMP' %} {{ datavault4dbt.string_to_timestamp( timestamp_format , 
end_of_all_times) }} as {{ alias }} + {%- elif datatype == 'DATETIME'%} CAST({{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} AS {{ datatype }}) as {{ alias }} {%- elif datatype == 'DATE'-%} PARSE_DATE('{{date_format}}', '{{ end_of_all_times_date }}') as {{ alias }} {%- elif datatype == 'STRING' %} '{{error_value__STRING}}' as {{ alias }} {%- elif datatype == 'INT64' %} CAST({{error_value__numeric}} as INT64) as {{ alias }} @@ -465,7 +467,7 @@ {%- if ghost_record_type == 'unknown' -%} - {%- if datatype in ['DATETIME2'] %} CONVERT(datetime2(6),{{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }}) as {{ alias }} + {%- if 'DATETIME2' in datatype %} CONVERT(datetime2(6),{{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }}) as {{ alias }} {%- elif datatype in ['DATETIMEOFFSET'] %} CONVERT({{ datatype }},{{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }}) as {{ alias }} {%- elif 'CHAR' in datatype -%} {%- if col_size is not none -%} @@ -497,7 +499,7 @@ {%- elif ghost_record_type == 'error' -%} - {%- if datatype in ['DATETIME2'] %} CONVERT(datetime2(6),{{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }}) as {{ alias }} + {%- if 'DATETIME2' in datatype %} CONVERT(datetime2(6),{{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }}) as {{ alias }} {%- elif datatype in ['DATETIMEOFFSET'] %} CONVERT({{ datatype }},{{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }}) as {{ alias }} {%- elif 'CHAR' in datatype -%} {%- if col_size is not none -%} @@ -553,8 +555,8 @@ {%- set error_value__numeric = var('datavault4dbt.error_value__numeric', -2) -%} {%- set hash = datavault4dbt.hash_method() -%} -{%- set hash_default_values = datavault4dbt.hash_default_values(hash_function=hash) -%} -{%- set unknown_value__HASHTYPE = hash_default_values['unknown_key'] -%} +{%- set hash_default_values = fromjson(datavault4dbt.hash_default_values(hash_function=hash)) -%} +{%- set unknown_value__HASHTYPE = hash_default_values.get('unknown_key') -%} {%- set error_value__HASHTYPE = hash_default_values['error_key'] -%} {%- set datatype = datatype | string | upper | trim -%} @@ -566,7 +568,7 @@ {%- elif datatype in ['INT', 'SMALLINT', 'TINYINT', 'BIGINT', 'DOUBLE', 'FLOAT'] %} CAST('{{unknown_value__numeric}}' as {{ datatype}}) as {{ alias }} {%- elif datatype.upper().startswith('DECIMAL') %} CAST('{{unknown_value__numeric}}' as DECIMAL) as {{ alias }} {%- elif datatype == 'BOOLEAN' %} CAST('FALSE' as BOOLEAN) as {{ alias }} - {%- elif datatype == 'BINARY' %} CAST('{{ unknown_value__HASHTYPE }}') as {{ alias }} + {%- elif datatype == 'BINARY' %} CAST('{{ unknown_value__HASHTYPE }}' as BINARY) as {{ alias }} {%- else %} CAST(NULL as {{ datatype }}) as {{ alias }} {% endif %} {%- elif ghost_record_type == 'error' -%} @@ -576,7 +578,7 @@ {%- elif datatype in ['INT', 'SMALLINT', 'TINYINT', 'BIGINT', 'DOUBLE', 'FLOAT'] %} CAST('{{error_value__numeric}}' as {{ datatype}}) as {{ alias }} {%- elif datatype.upper().startswith('DECIMAL') %} CAST('{{error_value__numeric}}' as DECIMAL) as {{ alias }} {%- elif datatype == 'BOOLEAN' %} CAST('FALSE' as BOOLEAN) as {{ alias }} - {%- elif datatype == 'BINARY' %} CAST('{{ error_value__HASHTYPE }}') as {{ alias }} + {%- elif datatype == 'BINARY' %} CAST('{{ error_value__HASHTYPE }}' as BINARY) as {{ alias }} {%- else %} CAST(NULL as {{ datatype }}) as {{ alias }} {% endif %} {%- else -%} @@ -605,26 
+607,18 @@ {%- set error_value__numeric = var('datavault4dbt.error_value__numeric', '-2') -%} {%- if ghost_record_type == 'unknown' -%} - {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH TIMEZONE' or datatype == 'TIMESTAMP WITH LOCAL TIMEZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }} as {{ alias }} + {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH TIME ZONE' or datatype == 'TIMESTAMP WITH LOCAL TIME ZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }} as {{ alias }} {%- elif datatype == 'DATE'-%} TO_DATE('{{ beginning_of_all_times_date }}', '{{ date_format }}' ) as "{{ alias }}" - {%- elif datatype == 'VARCHAR'-%} '{{unknown_value__STRING}}' as {{ alias }} - {%- elif datatype == 'VARCHAR2'-%} '{{unknown_value__STRING}}' as {{ alias }} - {%- elif datatype == 'NVARCHAR2' %} '{{unknown_value__STRING}}' as {{ alias }} - {%- elif datatype == 'CHAR' %} '{{unknown_value__STRING}}' as {{ alias }} - {%- elif datatype == 'NCHAR' %} '{{unknown_value__STRING}}' as {{ alias }} + {%- elif 'CHAR' in datatype %} '{{unknown_value__STRING}}' as {{ alias }} {%- elif datatype == 'LONG' %} '{{unknown_value__STRING}}' as {{ alias }} {%- elif datatype == 'NUMBER' %} CAST('{{unknown_value__numeric}}' as NUMBER) as {{ alias }} {%- elif datatype == 'FLOAT' %} CAST('{{unknown_value__numeric}}' as FLOAT) as {{ alias }} {%- else %} CAST(NULL as {{ datatype }}) as {{ alias }} {% endif %} {%- elif ghost_record_type == 'error' -%} - {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH TIMEZONE' or datatype == 'TIMESTAMP WITH LOCAL TIMEZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }} as {{ alias }} + {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH TIME ZONE' or datatype == 'TIMESTAMP WITH LOCAL TIME ZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }} as {{ alias }} {%- elif datatype == 'DATE'-%} TO_DATE('{{ end_of_all_times_date }}', '{{ date_format }}' ) as "{{ alias }}" - {%- elif datatype == 'VARCHAR'-%} CAST('{{error_value__STRING}}' as VARCHAR2(40)) as {{ alias }} - {%- elif datatype == 'VARCHAR2'-%} '{{error_value__STRING}}' as {{ alias }} - {%- elif datatype == 'NVARCHAR2' %} '{{error_value__STRING}}' as {{ alias }} - {%- elif datatype == 'CHAR' %} '{{error_value__STRING}}' as {{ alias }} - {%- elif datatype == 'NCHAR' %} '{{error_value__STRING}}' as {{ alias }} + {%- elif 'CHAR' in datatype %} '{{error_value__STRING}}' as {{ alias }} {%- elif datatype == 'LONG' %} '{{error_value__STRING}}' as {{ alias }} {%- elif datatype == 'NUMBER' %} CAST('{{error_value__numeric}}' as NUMBER) as {{ alias }} {%- elif datatype == 'FLOAT' %} CAST('{{error_value__numeric}}' as FLOAT) as {{ alias }} @@ -636,4 +630,4 @@ {%- endif %} {%- endif -%} -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} diff --git a/macros/supporting/hash_standardization.sql b/macros/supporting/hash_standardization.sql index a2cbb75f..bef0d50a 100644 --- a/macros/supporting/hash_standardization.sql +++ b/macros/supporting/hash_standardization.sql @@ -447,15 +447,15 @@ CONCAT('\"', REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(TRIM(CAST([EXPRESSION] AS STR {%- else -%} {%- if case_sensitive -%} - {%- set standardise_prefix = "IFNULL({}(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(UPPER(CONCAT(".format(hash_alg)-%} + {%- set standardise_prefix = 
"IFNULL(CAST({}(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(UPPER(CONCAT(".format(hash_alg)-%} {%- if alias is not none -%} - {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')), CAST({} AS {})) AS {}".format(zero_key, datatype, alias)-%} + {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')) as {}), CAST({} AS {})) AS {}".format(datatype, zero_key, datatype, alias)-%} {%- else -%} - {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')), CAST({} AS {}))".format(zero_key, datatype)-%} + {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')) as {}), CAST({} AS {}))".format(datatype, zero_key, datatype)-%} {%- endif -%} {%- else -%} - {%- set standardise_prefix = "IFNULL({}(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CONCAT(".format(hash_alg)-%} - {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')), CAST({} AS {})) AS {}".format(zero_key, datatype, alias)-%} + {%- set standardise_prefix = "IFNULL(CAST({}(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CONCAT(".format(hash_alg)-%} + {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')) as {}), CAST({} AS {})) AS {}".format(datatype, zero_key, datatype, alias)-%} {%- endif -%} {%- endif -%} @@ -934,20 +934,20 @@ CONCAT('\"', REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(TRIM(CAST([EXPRESSION] AS STR {%- else -%} {%- if case_sensitive -%} - {%- set standardise_prefix = "IFNULL({}(ARRAY_JOIN(SORT_ARRAY(ARRAY_AGG(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(UPPER(CONCAT(".format(hash_alg)-%} + {%- set standardise_prefix = "IFNULL(CAST({}(ARRAY_JOIN(SORT_ARRAY(ARRAY_AGG(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(UPPER(CONCAT(".format(hash_alg)-%} {%- if alias is not none -%} - {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')), {}) AS {}".format(zero_key, alias)-%} + {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), CAST({} AS {})) AS {}".format(datatype, zero_key, datatype, alias)-%} {%- else -%} - {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')), {})".format(zero_key)-%} + {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), CAST({} AS {}))".format(datatype, zero_key, datatype)-%} {%- endif -%} {%- else -%} - {%- set standardise_prefix = "IFNULL({}(ARRAY_JOIN(SORT_ARRAY(ARRAY_AGG(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CONCAT(".format(hash_alg)-%} + {%- set standardise_prefix = "IFNULL(CAST({}(ARRAY_JOIN(SORT_ARRAY(ARRAY_AGG(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CONCAT(".format(hash_alg)-%} {%- if alias is not none -%} - {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')), {}) AS {}".format(zero_key, alias)-%} + {%- set standardise_suffix = "\n), 
r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), CAST({} AS {})) AS {}".format(datatype, zero_key, datatype, alias)-%} {%- else -%} - {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')), {})".format(zero_key)-%} + {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), CAST({} AS {}))".format(datatype, zero_key, datatype)-%} {%- endif -%} {%- endif -%} @@ -984,4 +984,4 @@ CONCAT('\"', REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(TRIM(CAST([EXPRESSION] AS STR {%- endif -%} {%- do dict_result.update({"standardise_suffix": standardise_suffix, "standardise_prefix": standardise_prefix }) -%} {{ return(dict_result | tojson ) }} -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} diff --git a/macros/tables/bigquery/nh_link.sql b/macros/tables/bigquery/nh_link.sql index e05de7dc..59584184 100644 --- a/macros/tables/bigquery/nh_link.sql +++ b/macros/tables/bigquery/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro default__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro default__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -25,6 +25,18 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower == 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION DISTINCT' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} + + {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} @@ -213,7 +225,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/control_snap_v0.sql b/macros/tables/control_snap_v0.sql index 2a4a8f3e..423899de 100644 --- a/macros/tables/control_snap_v0.sql +++ b/macros/tables/control_snap_v0.sql @@ -40,29 +40,44 @@ logic that is applied in the version 1 snapshot table on top of this one. This column is automatically set to TRUE. - Parameters: +#} - start_date::timestamp Defines the earliest timestamp that should be available inside the snapshot_table. The time part of this - timestamp needs to be set to '00:00:00'. The format of this timestamp must equal to the timestamp format - defined in the global variable 'datavault4dbt.timestamp_format'. +{%- macro control_snap_v0(yaml_metadata=none, start_date=none, daily_snapshot_time=none, sdts_alias=none, end_date=none) -%} - Examples: - '2015-01-01T00-00-00' This snapshot table would hold daily snapshots beginning at 2015. - daily_snapshot_time::time Defines the time that your daily snapshots should have. Usually this is either something right before - daily business starts, or after daily business is over. 
+ {% set start_date_description = " + start_date::timestamp Defines the earliest timestamp that should be available inside the snapshot_table. The time part of this + timestamp needs to be set to '00:00:00'. The format of this timestamp must equal to the timestamp format + defined in the global variable 'datavault4dbt.timestamp_format'. - Examples: - '07:30:00' The snapshots inside this table would all have the time '07:30:00'. - '23:00:00' The snapshots inside this table would all have the time '23:00:00'. - - sdts_alias::string Defines the name of the snapshot date timestamp column inside the snapshot_table. It is optional, - if not set will use the global variable `datavault4dbt.sdts_alias` set inside dbt_project.yml + Examples: + '2015-01-01T00-00-00' This snapshot table would hold daily snapshots beginning at 2015. + " %} -#} + {% set daily_snapshot_time_description = " + daily_snapshot_time::time Defines the time that your daily snapshots should have. Usually this is either something right before + daily business starts, or after daily business is over. + + Examples: + '07:30:00' The snapshots inside this table would all have the time '07:30:00'. + '23:00:00' The snapshots inside this table would all have the time '23:00:00'. + " %} + + {% set sdts_alias_description = " + sdts_alias::string Defines the name of the snapshot date timestamp column inside the snapshot_table. It is optional, + if not set will use the global variable `datavault4dbt.sdts_alias` set inside dbt_project.yml + " %} + + {% set end_date_description = " + end_date::timestamp Defines the latest timestamp that should be available inside the snapshot_table. + " %} + + + {%- set start_date = datavault4dbt.yaml_metadata_parser(name='start_date', yaml_metadata=yaml_metadata, parameter=start_date, required=True, documentation=start_date_description) -%} + {%- set daily_snapshot_time = datavault4dbt.yaml_metadata_parser(name='daily_snapshot_time', yaml_metadata=yaml_metadata, parameter=daily_snapshot_time, required=True, documentation=daily_snapshot_time_description) -%} + {%- set sdts_alias = datavault4dbt.yaml_metadata_parser(name='sdts_alias', yaml_metadata=yaml_metadata, parameter=sdts_alias, required=False, documentation=sdts_alias_description) -%} + {%- set end_date = datavault4dbt.yaml_metadata_parser(name='end_date', yaml_metadata=yaml_metadata, parameter=end_date, required=False, documentation=end_date_description) -%} -{%- macro control_snap_v0(start_date, daily_snapshot_time, sdts_alias=none, end_date=none) -%} - {%- set sdts_alias = datavault4dbt.replace_standard(sdts_alias, 'datavault4dbt.sdts_alias', 'sdts') -%} {{ adapter.dispatch('control_snap_v0', 'datavault4dbt')(start_date=start_date, diff --git a/macros/tables/control_snap_v1.sql b/macros/tables/control_snap_v1.sql index d33775c9..146b1c5d 100644 --- a/macros/tables/control_snap_v1.sql +++ b/macros/tables/control_snap_v1.sql @@ -26,54 +26,62 @@ is_last_rolling_year::boolean Captures if a sdts is inside the range that starts two years ago (from the current date) and ranges until one year ago (from the current date). - - Parameters: - - control_snap_v0::string The name of the underlying version 0 control snapshot table. Needs to be - available as a dbt model. - - log_logic::dictionary Defining the desired durations of each granularity. Available granularities - are 'daily', 'weekly', 'monthly', and 'yearly'. For each granularity the - duration can be defined as an integer, and the time unit for that duration. 
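For illustration, a snapshot model built on the reworked control_snap_v0 signature above could hand over both required parameters as a single YAML block. This is a minimal sketch; the values are taken from the examples in the parameter descriptions and the exact model setup is an assumption about your project:

{%- set yaml_metadata -%}
start_date: '2015-01-01T00-00-00'
daily_snapshot_time: '07:30:00'
{%- endset -%}

{{ datavault4dbt.control_snap_v0(yaml_metadata=yaml_metadata) }}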
- The units include (in BigQuery): DAY, WEEK, MONTH, QUARTER, YEAR. Besides - defining a duration and a unit for each granularity, there is also the option - to set a granularity to 'forever'. E.g. reporting requires daily snapshots - for 3 months, and after that the monthly snapshots should be kept forever. - - If log_logic is not set, no logic will be applied, and all snapshots will stay - active. The other dynamic columns are calculated anyway. - - The duration is always counted from the current date. - - EXASOL: Due to a missing "DAY OF WEEK" Function in Exasol, is_weekly is currently - not supported and needs to be left out of the log_logic definition. - - Examples: - {'daily': {'duration': 3, This configuration would keep daily - 'unit': 'MONTH', snapshots for 3 months, weekly snapshots - 'forever': 'FALSE'}, for 1 year, monthly snapshots for 5 - 'weekly': {'duration': 1, years and yearly snapshots forever. - 'unit': 'YEAR'}, If 'forever' is not defined here, it - 'monthly': {'duration': 5, is automatically set to 'FALSE'. - 'unit': 'YEAR'}, therefore it could have been left out - 'yearly': {'forever': 'TRUE'} } in the configuration for daily snapshots. - - {'daily': {'duration': 90, This would keep daily snapshots for 90 - 'unit': 'DAY'}, days, and monthly snapshots forever. - 'monthly': {'forever': 'TRUE'}} - - sdts_alias::string Defines the name of the snapshot date timestamp column inside the snapshot_table. - It is optional, if not set will use the global variable `datavault4dbt.sdts_alias` - set inside dbt_project.yml - #} -{%- macro control_snap_v1(control_snap_v0, log_logic=none, sdts_alias=none) -%} - -{%- set sdts_alias = datavault4dbt.replace_standard(sdts_alias, 'datavault4dbt.sdts_alias', 'sdts') -%} - -{{ adapter.dispatch('control_snap_v1', 'datavault4dbt')(control_snap_v0=control_snap_v0, - log_logic=log_logic, - sdts_alias=sdts_alias) }} +{%- macro control_snap_v1(yaml_metadata=none, control_snap_v0=none, log_logic=none, sdts_alias=none) -%} + + {% set control_snap_v0_description = " + control_snap_v0::string The name of the underlying version 0 control snapshot table. Needs to be + available as a dbt model. + " %} + + {% set log_logic_description = " + log_logic::dictionary Defining the desired durations of each granularity. Available granularities + are 'daily', 'weekly', 'monthly', and 'yearly'. For each granularity the + duration can be defined as an integer, and the time unit for that duration. + The units include (in BigQuery): DAY, WEEK, MONTH, QUARTER, YEAR. Besides + defining a duration and a unit for each granularity, there is also the option + to set a granularity to 'forever'. E.g. reporting requires daily snapshots + for 3 months, and after that the monthly snapshots should be kept forever. + + If log_logic is not set, no logic will be applied, and all snapshots will stay + active. The other dynamic columns are calculated anyway. + + The duration is always counted from the current date. + + EXASOL: Due to a missing 'DAY OF WEEK' Function in Exasol, is_weekly is currently + not supported and needs to be left out of the log_logic definition. + + Examples: + {'daily': {'duration': 3, This configuration would keep daily + 'unit': 'MONTH', snapshots for 3 months, weekly snapshots + 'forever': 'FALSE'}, for 1 year, monthly snapshots for 5 + 'weekly': {'duration': 1, years and yearly snapshots forever. + 'unit': 'YEAR'}, If 'forever' is not defined here, it + 'monthly': {'duration': 5, is automatically set to 'FALSE'. 
+ 'unit': 'YEAR'}, therefore it could have been left out + 'yearly': {'forever': 'TRUE'} } in the configuration for daily snapshots. + + {'daily': {'duration': 90, This would keep daily snapshots for 90 + 'unit': 'DAY'}, days, and monthly snapshots forever. + 'monthly': {'forever': 'TRUE'}} + " %} + + {% set sdts_alias_description = " + sdts_alias::string Defines the name of the snapshot date timestamp column inside the snapshot_table. + It is optional, if not set will use the global variable `datavault4dbt.sdts_alias` + set inside dbt_project.yml + " %} + + {% set control_snap_v0 = datavault4dbt.yaml_metadata_parser(name='control_snap_v0', yaml_metadata=yaml_metadata, parameter=control_snap_v0, required=True, documentation=control_snap_v0_description) %} + {% set log_logic = datavault4dbt.yaml_metadata_parser(name='log_logic', yaml_metadata=yaml_metadata, parameter=log_logic, required=False, documentation=log_logic_description) %} + {% set sdts_alias = datavault4dbt.yaml_metadata_parser(name='sdts_alias', yaml_metadata=yaml_metadata, parameter=sdts_alias, required=False, documentation=sdts_alias_description) %} + + + {%- set sdts_alias = datavault4dbt.replace_standard(sdts_alias, 'datavault4dbt.sdts_alias', 'sdts') -%} + + {{ adapter.dispatch('control_snap_v1', 'datavault4dbt')(control_snap_v0=control_snap_v0, + log_logic=log_logic, + sdts_alias=sdts_alias) }} {%- endmacro -%} diff --git a/macros/tables/databricks/nh_link.sql b/macros/tables/databricks/nh_link.sql index 450f362a..00f02889 100644 --- a/macros/tables/databricks/nh_link.sql +++ b/macros/tables/databricks/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro databricks__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro databricks__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -26,6 +26,17 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower == 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION DISTINCT' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. 
UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} + {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} {%- endif -%} @@ -213,7 +224,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/eff_sat_v0.sql b/macros/tables/eff_sat_v0.sql index 2857e263..2905065b 100644 --- a/macros/tables/eff_sat_v0.sql +++ b/macros/tables/eff_sat_v0.sql @@ -1,5 +1,13 @@ -{%- macro eff_sat_v0(source_model, tracked_hashkey, src_ldts=none, src_rsrc=none, is_active_alias=none, source_is_single_batch=true, disable_hwm=false) -%} +{%- macro eff_sat_v0(yaml_metadata=none, source_model=none, tracked_hashkey=none, src_ldts=none, src_rsrc=none, is_active_alias=none, source_is_single_batch=true, disable_hwm=false) -%} + {% set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation='Name of the source model') %} + {% set tracked_hashkey = datavault4dbt.yaml_metadata_parser(name='tracked_hashkey', yaml_metadata=yaml_metadata, parameter=tracked_hashkey, required=True, documentation='Name of the hashkey column to be tracked') %} + {% set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation='Name of the loaddate column in the source model. Optional.') %} + {% set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation='Name of the record source column in the source model. Optional.') %} + {% set is_active_alias = datavault4dbt.yaml_metadata_parser(name='is_active_alias', yaml_metadata=yaml_metadata, parameter=is_active_alias, required=False, documentation='Name of the new active flag column. Optional.') %} + {% set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default True.') %} + {% set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be disabled or not. Optional.') %} + {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. 
#} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} {%- set src_rsrc = datavault4dbt.replace_standard(src_rsrc, 'datavault4dbt.rsrc_alias', 'rsrc') -%} @@ -13,4 +21,5 @@ source_is_single_batch=source_is_single_batch, disable_hwm=disable_hwm) ) }} + {%- endmacro -%} \ No newline at end of file diff --git a/macros/tables/exasol/nh_link.sql b/macros/tables/exasol/nh_link.sql index 24bffbee..fb28e493 100644 --- a/macros/tables/exasol/nh_link.sql +++ b/macros/tables/exasol/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro exasol__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro exasol__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -26,6 +26,17 @@ {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower == 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} + {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} {%- endif -%} @@ -213,7 +224,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/fabric/nh_link.sql b/macros/tables/fabric/nh_link.sql index 1317f2b1..e73615a8 100644 --- a/macros/tables/fabric/nh_link.sql +++ b/macros/tables/fabric/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro fabric__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro fabric__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -27,6 +27,17 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower == 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. 
UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} + {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} {%- endif -%} @@ -221,7 +232,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/hub.sql b/macros/tables/hub.sql index fa9f00a9..f6221320 100644 --- a/macros/tables/hub.sql +++ b/macros/tables/hub.sql @@ -7,15 +7,19 @@ - Supports multiple updates per batch and therefore initial loading - Can use a dynamic high-water-mark to optimize loading performance of multiple loads - Allows source mappings for deviations between source column names and hub column names +#} + - Parameters: +{%- macro hub(yaml_metadata=none, hashkey=none, business_keys=none, source_models=none, src_ldts=none, src_rsrc=none, disable_hwm=false) -%} + {% set hashkey_description = " hashkey::string Name of the hashkey column inside the stage, that should be used as PK of the Hub. Examples: 'hk_account_h' This hashkey column was created before inside the corresponding staging area, using the stage macro. + " %} - + {% set business_keys_description = " business_keys::string|list of strings Name(s) of the business key columns that should be loaded into the hub and are the input of the hashkey column. Needs to be available inside the stage model. If the names differ between multiple sources, you should define here how the business keys should be called inside the final hub model. The actual input column names need to be defined inside the 'source_model' @@ -25,8 +29,9 @@ 'account_key' This hub only has one business key and therefore only one is defined here. ['account_key', 'account_number'] This hub has two business keys which are both defined here. + " %} - + {% set source_models_description = " source_models::dictionary Dictionary with information about the source models. The keys of the dict are the names of the source models, and the value of each source model is another dictionary. This inner dictionary requires the key 'bk_columns' to be set (which contains the name of the business keys of that source model), and can have the optional keys 'hk_column', 'rsrc_static'. @@ -68,17 +73,24 @@ If the record source is the same over all loads, then it might look something like this: 'SAP/Accounts/'. Here everything would be static over all loads and therefore the rsrc_static can be set to 'SAP/Accounts/' without any wildcards in place. + " %} + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. 
- -#} - - -{%- macro hub(hashkey, business_keys, source_models, src_ldts=none, src_rsrc=none, disable_hwm=false) -%} + " %} + + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set business_keys = datavault4dbt.yaml_metadata_parser(name='business_keys', yaml_metadata=yaml_metadata, parameter=business_keys, required=True, documentation=business_keys_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/link.sql b/macros/tables/link.sql index 7818f06e..38c0ba13 100644 --- a/macros/tables/link.sql +++ b/macros/tables/link.sql @@ -3,16 +3,20 @@ if multiple sources share the same business definitions. Typically a link would only be loaded by multiple sources, if those multiple sources also share the business definitions of the hubs, and therefore load the connected hubs together as well. If multiple sources are used, it is required that they all have the same number of foreign keys inside, otherwise they would not share the same business definition of that link. +#} - Parameters: +{%- macro link(yaml_metadata=none, link_hashkey=none, foreign_hashkeys=none, source_models=none, src_ldts=none, src_rsrc=none, disable_hwm=false) -%} + {% set link_hashkey_description = " link_hashkey::string Name of the link hashkey column inside the stage. Should get calculated out of all business keys inside the link. Examples: 'hk_account_contact_l' This hashkey column belongs to the link between account and contact, and was created at the staging layer by the stage macro. + " %} + {% set foreign_hashkeys_description = " foreign_hashkeys::list of strings List of all hashkey columns inside the link, that refer to other hub entities. All hashkey columns must be available inside the stage area. @@ -20,7 +24,9 @@ ['hk_account_h', 'hk_contact_h'] The link between account and contact needs to contain both the hashkey of account and contact to enable joins the corresponding hub entities. + " %} + {% set source_models_description = " source_models::dictionary Dictionary with information about the source models. The keys of the dict are the names of the source models, and the value of each source model is another dictionary. This inner dictionary requires to have the keys 'rsrc_static', and optionally the keys 'hk_column' and 'fk_columns'. @@ -59,16 +65,24 @@ If my rsrc would be the same over all loads, then it might look something like this: 'SAP/Accounts/'. Here everything would be static over all loads and therefore I would set rsrc_static to 'SAP/Accounts/' without any wildcards in place. 
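Following the same yaml_metadata pattern for the hub macro refactored above, a hub model might look like the sketch below. The business key and hashkey names come from the examples in the descriptions; the stage name and rsrc_static pattern are assumptions:

{%- set yaml_metadata -%}
hashkey: 'hk_account_h'
business_keys:
  - account_key
  - account_number
source_models:
  stage_account:
    bk_columns:
      - account_key
      - account_number
    rsrc_static: '*/SAP/Accounts/*'
{%- endset -%}

{{ datavault4dbt.hub(yaml_metadata=yaml_metadata) }}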
+ " %} + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set rsrc_description = " src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. - -#} - -{%- macro link(link_hashkey, foreign_hashkeys, source_models, src_ldts=none, src_rsrc=none, disable_hwm=false) -%} + " %} + + {%- set link_hashkey = datavault4dbt.yaml_metadata_parser(name='link_hashkey', yaml_metadata=yaml_metadata, parameter=link_hashkey, required=True, documentation=link_hashkey_description) -%} + {%- set foreign_hashkeys = datavault4dbt.yaml_metadata_parser(name='foreign_hashkeys', yaml_metadata=yaml_metadata, parameter=foreign_hashkeys, required=True, documentation=foreign_hashkeys_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/ma_sat_v0.sql b/macros/tables/ma_sat_v0.sql index 4bdcf1c9..fa535870 100644 --- a/macros/tables/ma_sat_v0.sql +++ b/macros/tables/ma_sat_v0.sql @@ -8,9 +8,11 @@ Features: - Can handle multiple updates per batch, without losing intermediate changes. therefore initial loading is supported. - Using a dynamic high-water-mark to optimize loading performance of multiple loads +#} - Parameters: +{%- macro ma_sat_v0(yaml_metadata=none, parent_hashkey=none, src_hashdiff=none, src_ma_key=none, src_payload=none, source_model=none, src_ldts=none, src_rsrc=none) -%} + {% set parent_hashkey_description = " parent_hashkey::string Name of the hashkey column inside the stage of the object that this satellite is attached to. Examples: @@ -19,7 +21,9 @@ 'hk_account_contact_l' The satellite would be attached to the link between account and contact, which has the column 'hk_account_contact_l' as a hashkey column. + " %} + {% set src_hashdiff_description = " src_hashdiff::string Name of the hashdiff column of this satellite, that was created inside the staging area and is calculated out of the entire payload of this satellite. The stage must hold one hashdiff per satellite entity. @@ -28,7 +32,9 @@ 'hd_account_data_sfdc_s' Since we recommend naming the hashdiff column similar to the name of the satellite entity, just with a prefix, this would be the hashdiff column of the data satellite for account. + " %} + {% set src_ma_key_description = " src_ma_key::string|list of strings Name(s) of the multi-active keys inside the staging area. 
Need to be the same ones, as defined in the stage model. @@ -39,7 +45,9 @@ ['phonetype', 'company'] In this case, the combination of the two columns 'phonetype' and 'company' is treated as the multi-active key. + " %} + {% set src_payload_description = " src_payload::list of strings A list of all the descriptive attributes that should be included in this satellite. Needs to be the columns that are fed into the hashdiff calculation of this satellite. Do not include the multi-active key in the payload of a multi-active satellite, it is included automatically! @@ -48,23 +56,32 @@ ['name', 'address', 'country', 'phone', 'email'] This satellite would hold the columns 'name', 'address', 'country', 'phone' and 'email', coming out of the underlying staging area. + " %} + {% set source_model_description = " source_model::string Name of the underlying staging model, must be available inside dbt as a model. Examples: 'stage_account' This satellite is loaded out of the stage for account. + " %} + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source model. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source model. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. - - - -#} - -{%- macro ma_sat_v0(parent_hashkey, src_hashdiff, src_ma_key, src_payload, source_model, src_ldts=none, src_rsrc=none) -%} + " %} + + {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} + {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} + {%- set src_ma_key = datavault4dbt.yaml_metadata_parser(name='src_ma_key', yaml_metadata=yaml_metadata, parameter=src_ma_key, required=True, documentation=src_ma_key_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. 
#} diff --git a/macros/tables/ma_sat_v1.sql b/macros/tables/ma_sat_v1.sql index ce411472..53ba79ef 100644 --- a/macros/tables/ma_sat_v1.sql +++ b/macros/tables/ma_sat_v1.sql @@ -7,15 +7,19 @@ - Calculates virtualized load-end-dates to correctly identify multiple active records per batch - Enforces insert-only approach by view materialization - Allows multiple attributes to be used as the multi-active-attribute +#} - Parameters: +{%- macro ma_sat_v1(yaml_metadata=none, sat_v0=none, hashkey=none, hashdiff=none, ma_attribute=none, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false) -%} + {% set sat_v0_description = " sat_v0::string Name of the underlying version 0 multi-active satellite. Examples: 'contact_phonenumber_0_s' This satellite would be the version 1 satellite of the underlying version 0 phone number satellite for contacts. + " %} + {% set hashkey_description = " hashkey::string Name of the parent hashkey column inside the version 0 satellite. Would either be the hashkey of a hub or a link. Needs to be similar to the 'parent_hashkey' parameter inside the sat_v0 model. @@ -25,7 +29,9 @@ 'hk_order_contact_l' The satellite would be attached to the link between order and contact, which has the column 'hk_order_contact_l' as a hashkey column. + " %} + {% set hashdiff_description = " hashdiff::string Name of the hashdiff column inside the underlying version 0 satellite. Needs to be similar to the 'src_hashdiff' parameter inside the sat_v0 model. Must not include the ma_attribute in calculation. @@ -33,7 +39,9 @@ 'hd_contact_phonenumber_s' Since we recommend naming the hashdiff column similar to the name of the satellite entity, just with a prefix, this would be the hashdiff column of the phone number satellite for contacts. + " %} + {% set ma_attribute_description = " ma_attribute::string|list of strings Name of the multi active attribute inside the v0 satellite. This needs to be identified under the requirement that the combination of hashkey + ldts + ma_attribute is unique over the entire stage / satellite. @@ -46,25 +54,38 @@ ['phone_type', 'iid'] If a contact could have multiple mobile phone numbers, the phone_type alone would not be enough to uniquely identify a record inside a hashkey+ldts combination. Additionally the attribute iid, which is an increasing identifier within a phone_type, is added as a ma_attribute. + " %} - + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set ledts_alias_description = " ledts_alias::string Desired alias for the load end date column. Is optional, will use the global variable 'datavault4dbt.ledts_alias' if set here. + " %} + {% set add_is_current_flag_description = " add_is_current_flag::boolean Optional parameter to add a new column to the v1 sat based on the load end date timestamp (ledts). Default is false. If set to true it will add this is_current flag to the v1 sat. For each record this column will be set to true if the load end date time stamp is equal to the variable end of all times. 
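As a sketch of the reworked ma_sat_v1 interface above, the example values from its parameter descriptions could be supplied as one YAML block (the entity names are assumptions taken from those examples):

{%- set yaml_metadata -%}
sat_v0: 'contact_phonenumber_0_s'
hashkey: 'hk_contact_h'
hashdiff: 'hd_contact_phonenumber_s'
ma_attribute: 'phone_type'
add_is_current_flag: true
{%- endset -%}

{{ datavault4dbt.ma_sat_v1(yaml_metadata=yaml_metadata) }}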
If its not, then the record is not current therefore it will be set to false. - -#} - -{%- macro ma_sat_v1(sat_v0, hashkey, hashdiff, ma_attribute, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false) -%} + " %} + + {%- set sat_v0 = datavault4dbt.yaml_metadata_parser(name='sat_v0', yaml_metadata=yaml_metadata, parameter=sat_v0, required=True, documentation=sat_v0_description) -%} + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} + {%- set ma_attribute = datavault4dbt.yaml_metadata_parser(name='ma_attribute', yaml_metadata=yaml_metadata, parameter=ma_attribute, required=True, documentation=ma_attribute_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} + {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. #} diff --git a/macros/tables/nh_link.sql b/macros/tables/nh_link.sql index d6ebef77..1a9ff691 100644 --- a/macros/tables/nh_link.sql +++ b/macros/tables/nh_link.sql @@ -4,23 +4,29 @@ number of foreign keys inside, otherwise they would not share the same business definition of that non-historized link. In the background a non-historized link uses exactly the same loading logic as a regular link, but adds the descriptive attributes as additional payload. +#} - Parameters: - +{%- macro nh_link(yaml_metadata=none, link_hashkey=none, payload=none, source_models=none, foreign_hashkeys=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false, union_strategy='all') -%} + + {% set link_hashkey_description = " link_hashkey::string Name of the non-historized link hashkey column inside the stage. Should get calculated out of all business keys inside the link. Examples: 'hk_transaction_account_nl' This hashkey column belongs to the non-historized link between transaction and account, and was created at the staging layer by the stage macro. + " %} + {% set foreign_hashkeys_description = " foreign_hashkeys::list of strings List of all hashkey columns inside the non-historized link, that refer to other hub entities. All hashkey columns must be available inside the stage area. Examples: ['hk_transaction_h', 'hk_account_h'] The non-historized link between transaction and account needs to contain both the hashkey of transaction and account to enable joins to the corresponding hub entities. + " %} + {% set payload_description = " payload::list of strings A list of all the descriptive attributes that should be the payload of this non-historized link. 
If the names differ between source models, this list will define how the columns are named inside the resulting non-historized link. The mapping of which columns to use from which source model then needs to be defined inside the parameter 'payload' inside the variable 'source_models'. @@ -28,7 +34,9 @@ Examples: ['currency_isocode', 'amount', 'purpose', 'transaction_date'] The non-historized link will be enriched by the descriptive attributes 'currency_isocode', 'amount', 'purpose' and 'transaction_date'. + " %} + {% set source_models_description = " source_models::dictionary Dictionary with information about the source models. The keys of the dict are the names of the source models, and the value of each source model is another dictionary. This inner dictionary optionally has the keys 'hk_column', 'fk_columns', 'payload' and 'rsrc_static'. @@ -72,17 +80,33 @@ If the record source is the same over all loads, then it might look something like this: 'SAP/Accounts/'. Here everything would be static over all loads and therefore the rsrc_static would be set to 'SAP/Accounts/' without any wildcards in place. + " %} - + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + + {% set union_strategy_description = " + union_strategy::'all' | 'distinct' Defines how multiple sources should be unioned. 'all' will result in a UNION ALL and represents the default value. Should only be changed if you have duplicates across + source systems and don't want to deduplicate them upfront. + " %} + + {%- set link_hashkey = datavault4dbt.yaml_metadata_parser(name='link_hashkey', yaml_metadata=yaml_metadata, parameter=link_hashkey, required=True, documentation=link_hashkey_description) -%} + {%- set payload = datavault4dbt.yaml_metadata_parser(name='payload', yaml_metadata=yaml_metadata, parameter=payload, required=True, documentation=payload_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set foreign_hashkeys = datavault4dbt.yaml_metadata_parser(name='foreign_hashkeys', yaml_metadata=yaml_metadata, parameter=foreign_hashkeys, required=False, documentation=foreign_hashkeys_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. 
Optional, default False.') -%} + {%- set union_strategy = datavault4dbt.yaml_metadata_parser(name='union_strategy', yaml_metadata=yaml_metadata, parameter=union_strategy, required=False, documentation=union_strategy_description) -%} -#} - -{%- macro nh_link(link_hashkey, payload, source_models, foreign_hashkeys=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} @@ -95,6 +119,7 @@ src_rsrc=src_rsrc, source_models=source_models, disable_hwm=disable_hwm, - source_is_single_batch=source_is_single_batch) -}} + source_is_single_batch=source_is_single_batch, + union_strategy=union_strategy) -}} {%- endmacro -%} diff --git a/macros/tables/nh_sat.sql b/macros/tables/nh_sat.sql index 2f059a6f..3b1d1214 100644 --- a/macros/tables/nh_sat.sql +++ b/macros/tables/nh_sat.sql @@ -6,9 +6,11 @@ Features: - High-Perfomance loading of non-historized satellite data +#} - Parameters: +{%- macro nh_sat(yaml_metadata=none, parent_hashkey=none, src_payload=none, source_model=none, src_ldts=none, src_rsrc=none, source_is_single_batch=false) -%} + {% set parent_hashkey_description = " parent_hashkey::string Name of the hashkey column inside the stage of the object that this satellite is attached to. Examples: @@ -17,30 +19,40 @@ 'hk_account_contact_l' The satellite would be attached to the link between account and contact, which has the column 'hk_account_contact_l' as a hashkey column. + " %} + {% set src_payload_description = " src_payload::list of strings A list of all the descriptive attributes that should be included in this satellite. Examples: ['name', 'address', 'country', 'phone', 'email'] This satellite would hold the columns 'name', 'address', 'country', 'phone' and 'email', coming out of the underlying staging area. + "%} + {% set source_model_description = " source_model::string Name of the underlying staging model, must be available inside dbt as a model. Examples: 'stage_account' This satellite is loaded out of the stage for account. + "%} + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source model. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} - src_rsrc::string Name of the rsrc column inside the source model. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source model. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. 
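To illustrate the union_strategy option wired through nh_link above: a non-historized link fed by two sources that may deliver identical records could request a distinct union instead of the default UNION ALL. The hashkey and payload names come from the descriptions; the source model names and rsrc_static patterns are assumptions:

{%- set yaml_metadata -%}
link_hashkey: 'hk_transaction_account_nl'
foreign_hashkeys:
  - hk_transaction_h
  - hk_account_h
payload:
  - currency_isocode
  - amount
  - purpose
  - transaction_date
source_models:
  stage_transactions_erp:
    rsrc_static: '*/ERP/Transactions/*'
  stage_transactions_shop:
    rsrc_static: '*/SHOP/Transactions/*'
union_strategy: 'distinct'
{%- endset -%}

{{ datavault4dbt.nh_link(yaml_metadata=yaml_metadata) }}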
- - - -#} - -{%- macro nh_sat(parent_hashkey, src_payload, source_model, src_ldts=none, src_rsrc=none, source_is_single_batch=false) -%} + " %} + + {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/oracle/nh_link.sql b/macros/tables/oracle/nh_link.sql index 86a77276..c60a794e 100644 --- a/macros/tables/oracle/nh_link.sql +++ b/macros/tables/oracle/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro oracle__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro oracle__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -26,6 +26,16 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower == 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} @@ -214,7 +224,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/pit.sql b/macros/tables/pit.sql index c2a42b3f..18c83652 100644 --- a/macros/tables/pit.sql +++ b/macros/tables/pit.sql @@ -9,49 +9,78 @@ - Strongly improves performance if upstream queries require many JOIN operations - Creates a unique dimension key to optimize loading performance of incremental loads - Allows to insert a static string as record source column, matching business vault definition of a record source +#} - Parameters: - - pit_type::string String to insert into the 'pit_type' column. Has to be prefixed by a !. 
- Allows for future implementations of other PIT variants, like T-PITs etc. - Can be set freely, something like 'PIT' could be the default. - Is optional, if not set, no column will be added. +{%- macro pit(yaml_metadata=none, tracked_entity=none, hashkey=none, sat_names=none, snapshot_relation=none, dimension_key=none, snapshot_trigger_column=none, ldts=none, custom_rsrc=none, ledts=none, sdts=none, pit_type=none) -%} + {% set tracked_entity_description = " tracked_entity::string Name of the tracked Hub entity. Must be available as a model inside the dbt project. + " %} + {% set hashkey_description = " hashkey::string The name of the hashkey column inside the previously referred Hub entity. + " %} + {% set sat_names_description = " sat_names::list of strings A list of all the satellites that should be included in this PIT table. Can only be satellites that are attached to the tracked Hub, and should typically include all those satellites. You should always refer here to the version 1 satellites, since those hold the load-end-date. The macro currently supports regular satellites and nh-satellites. + " %} + {% set snapshot_relation_description = " snapshot_relation::string The name of the snapshot relation. It needs to be available as a model inside this dbt project. + " %} + {% set snapshot_trigger_column_description = " snapshot_trigger_column::string The name of the column inside the previously mentioned snapshot relation, that is boolean and identifies the snapshots that should be included in the PIT table. + " %} + {% set dimension_key_description = " dimension_key::string The desired name of the dimension key inside the PIT table. Should follow some naming conventions. Recommended is the name of the hashkey with a '_d' suffix. + " %} + {% set ldts_description = " ldts::string Name of the ldts column inside all source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set custom_rsrc_description = " custom_rsrc::string A custom string that should be inserted into the 'rsrc' column inside the PIT table. Since a PIT table is a business vault entity, the technical record source is no longer used here. Is optional, if not defined, no column is added. + " %} + {% set ledts_description = " ledts::string Name of the load-end-date column inside the satellites. Is optional, will use the global variable 'datavault4dbt.ledts_alias' if not set here. - + " %} + + {% set sdts_description = " sdts::string Name of the snapshot date timestamp column inside the snapshot table. It is optional, will use the global variable 'datavault4dbt.sdts_alias' if not set here. + " %} -#} - - - -{%- macro pit(tracked_entity, hashkey, sat_names, snapshot_relation, dimension_key, snapshot_trigger_column=none, ldts=none, custom_rsrc=none, ledts=none, sdts=none, pit_type=none) -%} + {% set pit_type_description = " + pit_type::string String to insert into the 'pit_type' column. Has to be prefixed by a !. + Allows for future implementations of other PIT variants, like T-PITs etc. + Can be set freely, something like 'PIT' could be the default. + Is optional, if not set, no column will be added. 
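A PIT model using the yaml_metadata variant of the pit macro above might look like the following sketch; every entity, satellite and snapshot name here is an assumption, the dimension key simply follows the recommended '_d' suffix, and pit_type carries the '!' prefix described above:

{%- set yaml_metadata -%}
pit_type: '!PIT'
tracked_entity: 'account_h'
hashkey: 'hk_account_h'
sat_names:
  - account_data_sfdc_s
snapshot_relation: 'control_snap_v1'
snapshot_trigger_column: 'is_active'
dimension_key: 'hk_account_h_d'
{%- endset -%}

{{ datavault4dbt.pit(yaml_metadata=yaml_metadata) }}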
+ " %} + + {%- set tracked_entity = datavault4dbt.yaml_metadata_parser(name='tracked_entity', yaml_metadata=yaml_metadata, parameter=tracked_entity, required=True, documentation=tracked_entity_description) -%} + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set sat_names = datavault4dbt.yaml_metadata_parser(name='sat_names', yaml_metadata=yaml_metadata, parameter=sat_names, required=True, documentation=sat_names_description) -%} + {%- set snapshot_relation = datavault4dbt.yaml_metadata_parser(name='snapshot_relation', yaml_metadata=yaml_metadata, parameter=snapshot_relation, required=True, documentation=snapshot_relation_description) -%} + {%- set dimension_key = datavault4dbt.yaml_metadata_parser(name='dimension_key', yaml_metadata=yaml_metadata, parameter=dimension_key, required=True, documentation=dimension_key_description) -%} + {%- set snapshot_trigger_column = datavault4dbt.yaml_metadata_parser(name='snapshot_trigger_column', yaml_metadata=yaml_metadata, parameter=snapshot_trigger_column, required=False, documentation=snapshot_trigger_column_description) -%} + {%- set ldts = datavault4dbt.yaml_metadata_parser(name='ldts', yaml_metadata=yaml_metadata, parameter=ldts, required=False, documentation=ldts_description) -%} + {%- set custom_rsrc = datavault4dbt.yaml_metadata_parser(name='custom_rsrc', yaml_metadata=yaml_metadata, parameter=custom_rsrc, required=False, documentation=custom_rsrc_description) -%} + {%- set ledts = datavault4dbt.yaml_metadata_parser(name='ledts', yaml_metadata=yaml_metadata, parameter=ledts, required=False, documentation=ledts_description) -%} + {%- set sdts = datavault4dbt.yaml_metadata_parser(name='sdts', yaml_metadata=yaml_metadata, parameter=sdts, required=False, documentation=sdts_description) -%} + {%- set pit_type = datavault4dbt.yaml_metadata_parser(name='pit_type', yaml_metadata=yaml_metadata, parameter=pit_type, required=False, documentation=pit_type_description) -%} {# Applying the default aliases as stored inside the global variables, if ldts, sdts and ledts are not set. #} diff --git a/macros/tables/postgres/nh_link.sql b/macros/tables/postgres/nh_link.sql index 24c4e3d1..7147fb0e 100644 --- a/macros/tables/postgres/nh_link.sql +++ b/macros/tables/postgres/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro postgres__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro postgres__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -26,6 +26,16 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower == 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. 
UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} @@ -214,7 +224,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/rec_track_sat.sql b/macros/tables/rec_track_sat.sql index 13d024b2..b02bb75d 100644 --- a/macros/tables/rec_track_sat.sql +++ b/macros/tables/rec_track_sat.sql @@ -11,28 +11,32 @@ - Supports multiple updates per batch and therefore initial loading - Using a dynamic high-water-mark to optimize loading performance of multiple loads - Can either track link- or hub-hashkeys +#} - Parameters: +{%- macro rec_track_sat(yaml_metadata=none, tracked_hashkey=none, source_models=none, src_ldts=none, src_rsrc=none, src_stg=none, disable_hwm=false) -%} + {% set tracked_hashkey_description = " tracked_hashkey::string The name of the hashkey column you want to track. Needs to be available in the underlying staging layer. If you want to track multiple hashkeys out of one stage, you need to create one record tracking satellite for each hashkey. Examples: - "hk_contact_h" This record tracking satellite tracks the appearance of the hashkey for the contact hub. + 'hk_contact_h' This record tracking satellite tracks the appearance of the hashkey for the contact hub. - "hk_contact_account_l" This record tracking satellite tracks the appearance of the hashkey for the link between contacts and accounts. + 'hk_contact_account_l' This record tracking satellite tracks the appearance of the hashkey for the link between contacts and accounts. + " %} + {% set source_models_description = " source_models::dictionary Dictionary with information about the source model. The key of the dict is the name of the source model, and the value is another dictionary. This inner dictionary must have the key 'rsrc_static', and optionally the key 'hk_column'. Examples: - {'stage_account': {'hk_column': 'hk_account_h', This record tracking satellite tracks the hashkey "hk_account_h" inside the - 'rsrc_static': '*/SAP/Accounts/*'}} source model named "stage_account". + {'stage_account': {'hk_column': 'hk_account_h', This record tracking satellite tracks the hashkey 'hk_account_h' inside the + 'rsrc_static': '*/SAP/Accounts/*'}} source model named 'stage_account'. {'stage_contact': {'rsrc_static': '*/SALESFORCE/Contact/*'}, This tracks the appearance of one hub hashkey that is loaded from the two source - 'stage_partner': {'hk_column': 'hk_partner_h', models "stage_contact" and "stage_partner". For "stage_account" no 'hk_column' is defined, + 'stage_partner': {'hk_column': 'hk_partner_h', models 'stage_contact' and 'stage_partner'. For 'stage_contact' no 'hk_column' is defined, 'rsrc_static': '*/SALESFORCE/Partners/*'}} and therefore the input of the upper level variable 'tracked_hashkey' will be used. - For "stage_partner" the name of the hashkey column differs from the upper level definition + For 'stage_partner' the name of the hashkey column differs from the upper level definition and therefore this other name is set under the variable 'hk_column'. The 'rsrc_static' attribute defines a STRING or a list of strings that will always be @@ -53,19 +57,28 @@ If the rsrc_static is not set in one of the source models, then the assumption is made that for this source there is always the same value for any record in the record source column.
The macro will then automatically retrieve this unique value by querying the source model. - + " %} + {% set src_ldts_description = " + src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_stg_description = " + src_stg::string Name of the source stage model. Is optional, will use the global variable 'datavault4dbt.stg_alias'. - -#} - -{%- macro rec_track_sat(tracked_hashkey, source_models, src_ldts=none, src_rsrc=none, src_stg=none, disable_hwm=false) -%} + " %} + + {%- set tracked_hashkey = datavault4dbt.yaml_metadata_parser(name='tracked_hashkey', yaml_metadata=yaml_metadata, parameter=tracked_hashkey, required=True, documentation=tracked_hashkey_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set src_stg = datavault4dbt.yaml_metadata_parser(name='src_stg', yaml_metadata=yaml_metadata, parameter=src_stg, required=False, documentation=src_stg_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/redshift/eff_sat_v0.sql b/macros/tables/redshift/eff_sat_v0.sql index 3483db4a..8e734972 100644 --- a/macros/tables/redshift/eff_sat_v0.sql +++ b/macros/tables/redshift/eff_sat_v0.sql @@ -49,23 +49,13 @@ source_data AS ( In all incremental cases, the current status for each hashkey is selected from the existing Effectivity Satellite. #} {%- if is_incremental() %} -current_status_prep AS ( - - SELECT - {{ tracked_hashkey }}, - {{ is_active_alias}}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) as rn - FROM {{ this }} - -), - current_status AS ( SELECT {{ tracked_hashkey }}, {{ is_active_alias }} - FROM current_status_prep - WHERE rn = 1 + FROM {{ this }} redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY ROW_NUMBER() OVER(PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) = 1 ), {% endif %} @@ -136,32 +126,19 @@ current_status AS ( {# The rows are deduplicated on the is_active_alias, to only include status changes. - Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status.
#} - deduplicated_incoming_prep AS ( - - SELECT - is_active.{{ tracked_hashkey }}, - is_active.{{ src_ldts }}, - is_active.{{ is_active_alias }}, - LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active - - FROM is_active - - ), - deduplicated_incoming AS ( SELECT - deduplicated_incoming_prep.{{ tracked_hashkey }}, - deduplicated_incoming_prep.{{ src_ldts }}, - deduplicated_incoming_prep.{{ is_active_alias }} - - FROM - deduplicated_incoming_prep - WHERE - deduplicated_incoming_prep.{{ is_active_alias }} != deduplicated_incoming_prep.lag_is_active - OR deduplicated_incoming_prep.lag_is_active IS NULL + ia.{{ tracked_hashkey }}, + ia.{{ src_ldts }}, + ia.{{ is_active_alias }} + FROM is_active ia + QUALIFY + CASE + WHEN ia.{{ is_active_alias }} = LAG(ia.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END ), diff --git a/macros/tables/redshift/hub.sql b/macros/tables/redshift/hub.sql index dc45edf9..9418f59e 100644 --- a/macros/tables/redshift/hub.sql +++ b/macros/tables/redshift/hub.sql @@ -207,21 +207,17 @@ source_new_union AS ( {%- endif %} -earliest_hk_over_all_sources_prep AS ( - SELECT - lcte.*, - ROW_NUMBER() OVER (PARTITION BY {{ hashkey }} ORDER BY {{ src_ldts - }}) as rn - FROM {{ ns.last_cte }} AS lcte), - earliest_hk_over_all_sources AS ( {#- Deduplicate the unionized records again to only insert the earliest one. #} SELECT lcte.* - FROM earliest_hk_over_all_sources_prep AS lcte - WHERE rn = 1 - {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%}), + FROM {{ ns.last_cte }} AS lcte + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%} + +), records_to_insert AS ( {#- Select everything from the previous CTE, if incremental filter for hashkeys that are not already in the hub. #} diff --git a/macros/tables/redshift/link.sql b/macros/tables/redshift/link.sql index 05ccbe8a..9e67e1d9 100644 --- a/macros/tables/redshift/link.sql +++ b/macros/tables/redshift/link.sql @@ -210,21 +210,17 @@ source_new_union AS ( {%- endif %} -earliest_hk_over_all_sources_prep AS ( - SELECT - lcte.*, - ROW_NUMBER() OVER (PARTITION BY {{ link_hashkey }} ORDER BY {{ src_ldts - }}) as rn - FROM {{ ns.last_cte }} AS lcte), - earliest_hk_over_all_sources AS ( {#- Deduplicate the unionized records again to only insert the earliest one. #} SELECT lcte.* - FROM earliest_hk_over_all_sources_prep AS lcte - WHERE rn = 1 - {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%}), + FROM {{ ns.last_cte }} AS lcte + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ link_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%} + +), records_to_insert AS ( {# Select everything from the previous CTE, if incremental filter for hashkeys that are not already in the link. #} diff --git a/macros/tables/redshift/ma_sat_v0.sql b/macros/tables/redshift/ma_sat_v0.sql index 5e322ab2..4f59994c 100644 --- a/macros/tables/redshift/ma_sat_v0.sql +++ b/macros/tables/redshift/ma_sat_v0.sql @@ -41,44 +41,29 @@ source_data AS ( {# Get the latest record for each parent hashkey in existing sat, if incremental. 
#} {%- if is_incremental() %} -latest_entries_in_sat_prep AS ( - - SELECT - {{ parent_hashkey }}, - {{ ns.hdiff_alias }}, - ROW_NUMBER() OVER(PARTITION BY {{ parent_hashkey|lower }} ORDER BY {{ src_ldts }} DESC) as rn - FROM - {{ this }} -), - latest_entries_in_sat AS ( SELECT {{ parent_hashkey }}, {{ ns.hdiff_alias }} FROM - latest_entries_in_sat_prep - WHERE rn = 1 + {{ this }} redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY ROW_NUMBER() OVER(PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }} DESC) = 1 ), {%- endif %} {# Get a list of all distinct hashdiffs that exist for each parent_hashkey. #} - lag_source_data AS ( - SELECT - {{ parent_hashkey }}, - {{ src_ldts }}, - {{ ns.hdiff_alias }}, - LAG({{ ns.hdiff_alias }}) OVER (PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }}) as prev_ns_hdiff_alias - FROM source_data -), - deduped_row_hashdiff AS ( + SELECT {{ parent_hashkey }}, {{ src_ldts }}, {{ ns.hdiff_alias }} - FROM lag_source_data - WHERE {{ ns.hdiff_alias }} != prev_ns_hdiff_alias OR prev_ns_hdiff_alias IS NULL + FROM source_data redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY CASE + WHEN {{ ns.hdiff_alias }} = LAG({{ ns.hdiff_alias }}) OVER (PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END ), {# Dedupe the source data regarding non-delta groups. #} diff --git a/macros/tables/redshift/nh_link.sql b/macros/tables/redshift/nh_link.sql index 38ec0368..74e5e9dc 100644 --- a/macros/tables/redshift/nh_link.sql +++ b/macros/tables/redshift/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro redshift__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro redshift__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -25,6 +25,16 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower == 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} @@ -213,7 +223,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} @@ -226,21 +236,17 @@ source_new_union AS ( {%- if not source_is_single_batch %} -earliest_hk_over_all_sources_prep AS ( - SELECT - lcte.*, - ROW_NUMBER() OVER (PARTITION BY {{ link_hashkey }} ORDER BY {{ src_ldts - }}) as rn - FROM {{ ns.last_cte }} AS lcte), - earliest_hk_over_all_sources AS ( - {#- Deduplicate the unionized records again to only insert the earliest one. #} +{#- Deduplicate the unionized records again to only insert the earliest one. 
#} SELECT lcte.* - FROM earliest_hk_over_all_sources_prep AS lcte - WHERE rn = 1 - {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%}), + FROM {{ ns.last_cte }} AS lcte + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ link_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%} + +), {%- endif %} diff --git a/macros/tables/redshift/ref_sat_v0.sql b/macros/tables/redshift/ref_sat_v0.sql index 6ce47239..4fceff2d 100644 --- a/macros/tables/redshift/ref_sat_v0.sql +++ b/macros/tables/redshift/ref_sat_v0.sql @@ -46,18 +46,6 @@ source_data AS ( {# Get the latest record for each parent ref key combination in existing sat, if incremental. #} {%- if is_incremental() %} -latest_entries_in_sat_prep AS ( - - SELECT - {% for ref_key in parent_ref_keys %} - {{ref_key}}, - {% endfor %} - {{ ns.hdiff_alias }}, - ROW_NUMBER() OVER(PARTITION BY {%- for ref_key in parent_ref_keys %} {{ref_key|lower}} {%- if not loop.last %}, {% endif %}{% endfor %} ORDER BY {{ src_ldts }} DESC) as rn - FROM - {{ this }} -), - latest_entries_in_sat AS ( SELECT @@ -66,8 +54,8 @@ latest_entries_in_sat AS ( {% endfor %} {{ ns.hdiff_alias }} FROM - latest_entries_in_sat_prep - WHERE rn = 1 + {{ this }} redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY ROW_NUMBER() OVER(PARTITION BY {%- for ref_key in parent_ref_keys %} {{ref_key}} {%- if not loop.last %}, {% endif %}{% endfor %} ORDER BY {{ src_ldts }} DESC) = 1 ), {%- endif %} @@ -75,7 +63,7 @@ latest_entries_in_sat AS ( Deduplicate source by comparing each hashdiff to the hashdiff of the previous record, for each parent ref key combination. Additionally adding a row number based on that order, if incremental. #} -deduplicated_numbered_source_prep AS ( +deduplicated_numbered_source AS ( SELECT {% for ref_key in parent_ref_keys %} @@ -86,24 +74,12 @@ deduplicated_numbered_source_prep AS ( {% if is_incremental() -%} , ROW_NUMBER() OVER(PARTITION BY {%- for ref_key in parent_ref_keys %} {{ref_key}} {%- if not loop.last %}, {% endif %}{% endfor %} ORDER BY {{ src_ldts }}) as rn {%- endif %} - , LAG({{ ns.hdiff_alias }}) OVER(PARTITION BY {%- for ref_key in parent_ref_keys %} {{ref_key|lower}} {%- if not loop.last %}, {% endif %}{% endfor %} ORDER BY {{ src_ldts }}) as prev_hashdiff - FROM source_data -), - -deduplicated_numbered_source AS ( - - SELECT - {% for ref_key in parent_ref_keys %} - {{ref_key}}, - {% endfor %} - {{ ns.hdiff_alias }}, - {{ datavault4dbt.print_list(source_cols) }} - FROM deduplicated_numbered_source_prep - WHERE 1=1 - AND {{ ns.hdiff_alias }} <> prev_hashdiff OR prev_hashdiff IS NULL - {% if is_incremental() -%} - AND rn = 1 - {%- endif %} + FROM source_data redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY + CASE + WHEN {{ ns.hdiff_alias }} = LAG({{ ns.hdiff_alias }}) OVER(PARTITION BY {%- for ref_key in parent_ref_keys %} {{ref_key}} {%- if not loop.last %}, {% endif %}{% endfor %} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END ), {# @@ -128,7 +104,7 @@ records_to_insert AS ( AND {{ datavault4dbt.multikey(ref_key, prefix=['latest_entries_in_sat', 'deduplicated_numbered_source'], condition='=') }} {% endfor %} AND {{ datavault4dbt.multikey(ns.hdiff_alias, prefix=['latest_entries_in_sat', 'deduplicated_numbered_source'], condition='=') }} - ) + AND deduplicated_numbered_source.rn = 1) {%- endif %} ) diff --git a/macros/tables/redshift/sat_v0.sql b/macros/tables/redshift/sat_v0.sql index e04d4f66..8833c635 100644 --- 
a/macros/tables/redshift/sat_v0.sql +++ b/macros/tables/redshift/sat_v0.sql @@ -42,24 +42,14 @@ source_data AS ( {# Get the latest record for each parent hashkey in existing sat, if incremental. #} {%- if is_incremental() %} -latest_entries_in_sat_prep AS ( - - SELECT - {{ parent_hashkey }}, - {{ ns.hdiff_alias }}, - ROW_NUMBER() OVER(PARTITION BY {{ parent_hashkey|lower }} ORDER BY {{ src_ldts }} DESC) as rn - FROM - {{ this }} -), - latest_entries_in_sat AS ( SELECT {{ parent_hashkey }}, {{ ns.hdiff_alias }} FROM - latest_entries_in_sat_prep - WHERE rn = 1 + {{ this }} redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY ROW_NUMBER() OVER(PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }} DESC) = 1 ), {%- endif %} @@ -67,7 +57,7 @@ latest_entries_in_sat AS ( Deduplicate source by comparing each hashdiff to the hashdiff of the previous record, for each hashkey. Additionally adding a row number based on that order, if incremental. #} -deduplicated_numbered_source_prep AS ( +deduplicated_numbered_source AS ( SELECT {{ parent_hashkey }}, @@ -76,23 +66,12 @@ deduplicated_numbered_source_prep AS ( {% if is_incremental() -%} , ROW_NUMBER() OVER(PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }}) as rn {%- endif %} - , LAG({{ ns.hdiff_alias }}) OVER(PARTITION BY {{ parent_hashkey|lower }} ORDER BY {{ src_ldts }}) as prev_hashdiff - FROM source_data - -), - -deduplicated_numbered_source AS ( - - SELECT - {{ parent_hashkey }}, - {{ ns.hdiff_alias }}, - {{ datavault4dbt.print_list(source_cols) }} - FROM deduplicated_numbered_source_prep - WHERE 1=1 - AND {{ ns.hdiff_alias }} <> prev_hashdiff OR prev_hashdiff IS NULL - {% if is_incremental() -%} - AND rn = 1 - {%- endif %} + FROM source_data redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY + CASE + WHEN {{ ns.hdiff_alias }} = LAG({{ ns.hdiff_alias }}) OVER(PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END ), {# @@ -111,7 +90,8 @@ records_to_insert AS ( SELECT 1 FROM latest_entries_in_sat WHERE {{ datavault4dbt.multikey(parent_hashkey, prefix=['latest_entries_in_sat', 'deduplicated_numbered_source'], condition='=') }} - AND {{ datavault4dbt.multikey(ns.hdiff_alias, prefix=['latest_entries_in_sat', 'deduplicated_numbered_source'], condition='=') }}) + AND {{ datavault4dbt.multikey(ns.hdiff_alias, prefix=['latest_entries_in_sat', 'deduplicated_numbered_source'], condition='=') }} + AND deduplicated_numbered_source.rn = 1) {%- endif %} ) diff --git a/macros/tables/ref_hub.sql b/macros/tables/ref_hub.sql index ccd7f488..84d13687 100644 --- a/macros/tables/ref_hub.sql +++ b/macros/tables/ref_hub.sql @@ -1,35 +1,27 @@ -{# -Example model: - -{{ config(materialized='incremental', - schema='Core') }} - -{%- set yaml_metadata -%} -source_models: stg_nation -ref_keys: N_NATIONKEY -{%- endset -%} - -{% set metadata_dict = fromyaml(yaml_metadata) %} - -{{ datavault4dbt.ref_hub(source_models=metadata_dict['source_models'], - ref_keys=metadata_dict['ref_keys']) }} - -#} - - - - - - - - - - - - - - -{%- macro ref_hub(ref_keys, source_models, src_ldts=none, src_rsrc=none) -%} +{%- macro ref_hub(yaml_metadata=none, ref_keys=none, source_models=none, src_ldts=none, src_rsrc=none) -%} + + {% set ref_keys_description = " + ref_keys::string|list of strings Name of the reference key(s) available in the source model(s). + " %} + + {% set source_models_description = " + source_models::dictionary Similar to other source_models parameters, e.g. in Hubs or Links. 
+ " %} + + {% set src_ldts_description = " + src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + {%- set ref_keys = datavault4dbt.yaml_metadata_parser(name='ref_keys', yaml_metadata=yaml_metadata, parameter=ref_keys, required=True, documentation=ref_keys_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/ref_sat_v0.sql b/macros/tables/ref_sat_v0.sql index 474620d7..7c8e43f7 100644 --- a/macros/tables/ref_sat_v0.sql +++ b/macros/tables/ref_sat_v0.sql @@ -1,32 +1,55 @@ -{# -Example model: +{%- macro ref_sat_v0(yaml_metadata=none, parent_ref_keys=none, src_hashdiff=none, src_payload=none, source_model=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} -{{ config(materialized='incremental', - schema='Core') }} + {% set parent_ref_keys_description = " + parent_ref_keys::string|list of strings Name of the reference key(s) of the parent ref_hub. + " %} -{%- set yaml_metadata -%} -source_model: stg_nation -parent_ref_keys: N_NATIONKEY -src_hashdiff: hd_nation_rs -src_payload: - - N_COMMENT - - N_NAME - - N_REGIONKEY -{%- endset -%} + {% set src_hashdiff_description = " + src_hashdiff::string Name of the hashdiff column of this ref satellite, that was created inside the staging area and is + calculated out of the entire payload of this ref satellite. The stage must hold one hashdiff per + ref satellite entity. -{% set metadata_dict = fromyaml(yaml_metadata) %} + Examples: + 'hd_nation_sfdc_rs' Since we recommend naming the hashdiff column similar to the name + of the ref satellite entity, just with a prefix, this would be the + hashdiff column of the ref satellite for nation. + " %} -{{ datavault4dbt.ref_sat_v0(source_model=metadata_dict['source_model'], - parent_ref_keys=metadata_dict['parent_ref_keys'], - src_hashdiff=metadata_dict['src_hashdiff'], - src_payload=metadata_dict['src_payload']) }} + {% set src_payload_description = " + src_payload::list of strings A list of all the descriptive attributes that should be included in this ref satellite. Needs to be the + columns that are fed into the hashdiff calculation of this ref satellite. -#} + Examples: + ['name', 'continent', 'area'] This ref satellite would hold the columns 'name', 'continent', and 'area' + coming out of the underlying staging area. + " %} + {% set source_model_description = " + source_model::string Name of the underlying staging model, must be available inside dbt as a model.
+ Examples: + 'stage_nation' This ref satellite is loaded out of the stage for nation. + " %} + {% set src_ldts_description = " + src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} -{%- macro ref_sat_v0(parent_ref_keys, src_hashdiff, src_payload, source_model, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + + {%- set parent_ref_keys = datavault4dbt.yaml_metadata_parser(name='parent_ref_keys', yaml_metadata=yaml_metadata, parameter=parent_ref_keys, required=True, documentation=parent_ref_keys_description) -%} + {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set.
#} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} diff --git a/macros/tables/ref_sat_v1.sql b/macros/tables/ref_sat_v1.sql index 7b3ea907..abe56d81 100644 --- a/macros/tables/ref_sat_v1.sql +++ b/macros/tables/ref_sat_v1.sql @@ -1,29 +1,47 @@ -{# -Example model: - -{{ config(materialized='view', - schema='Core') }} - -{%- set yaml_metadata -%} -ref_sat_v0: nation_rs -ref_keys: N_NATIONKEY -hashdiff: hd_nation_rs -add_is_current_flag: true -{%- endset -%} - -{% set metadata_dict = fromyaml(yaml_metadata) %} - -{{ datavault4dbt.ref_sat_v1(ref_sat_v0=metadata_dict['ref_sat_v0'], - ref_keys=metadata_dict['ref_keys'], - hashdiff=metadata_dict['hashdiff'], - add_is_current_flag=metadata_dict['add_is_current_flag']) }} - -#} - - - -{%- macro ref_sat_v1(ref_sat_v0, ref_keys, hashdiff, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false) -%} - +{%- macro ref_sat_v1(yaml_metadata=none, ref_sat_v0=none, ref_keys=none, hashdiff=none, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false) -%} + + {% set ref_sat_v0_description = " + ref_sat_v0::string Name of the underlying ref_sat_v0 dbt model + " %} + + {% set ref_keys_description = " + ref_keys::string | list of strings Name(s) of the reference key(s) in the underlying reference sat v0. + " %} + + {% set hashdiff_description = " + hashdiff::string Name of the Hashdiff column in the underlying reference sat v0. + "%} + + {% set src_ldts_description = " + src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + {% set ledts_alias_description = " + ledts_alias::string Desired alias for the load end date column. Is optional, will use the global variable 'datavault4dbt.ledts_alias' if + not set here. + " %} + + {% set add_is_current_flag_description = " + add_is_current_flag::boolean Optional parameter to add a new column to the v1 sat based on the load end date timestamp (ledts). Default is false. If + set to true it will add this is_current flag to the v1 sat. For each record this column will be set to true if the load + end date time stamp is equal to the variable end of all times. If it's not, then the record is not current and therefore it + will be set to false.
+ " %} + + {%- set ref_sat_v0 = datavault4dbt.yaml_metadata_parser(name='ref_sat_v0', yaml_metadata=yaml_metadata, parameter=ref_sat_v0, required=True, documentation=ref_sat_v0_description) -%} + {%- set ref_keys = datavault4dbt.yaml_metadata_parser(name='ref_keys', yaml_metadata=yaml_metadata, parameter=ref_keys, required=True, documentation=ref_keys_description) -%} + {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} + {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} + {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. #} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} diff --git a/macros/tables/ref_table.sql b/macros/tables/ref_table.sql index 567851ac..0cdfe7d4 100644 --- a/macros/tables/ref_table.sql +++ b/macros/tables/ref_table.sql @@ -1,92 +1,43 @@ -{# -Example models: +{%- macro ref_table(yaml_metadata=none, ref_hub=none, ref_satellites=none, src_ldts=none, src_rsrc=none, historized='latest', snapshot_relation=none, snapshot_trigger_column=none) -%} + + {% set ref_hub_description = " + ref_hub::string Name of the underlying ref_hub model. + " %} + + {% set ref_satellites_description = " + ref_satellites::string|list of strings Name(s) of the reference satellites to be included in this ref_table. Optional: 'include' & 'exclude' as dictionary keys for each satellite. + " %} + + {% set src_ldts_description = " + src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + {% set historized_description = " + historized::string Possible values are 'full', 'latest', or 'snapshot'. Influences how much history this reference table will hold. + " %} + + {% set snapshot_relation_description = " + snapshot_relation::string Only required if 'historized' is set to 'snapshot'. Name of the snapshot_v1 model to be used. + " %} + + {% set snapshot_trigger_column_description = " + snapshot_trigger_column::string Only required if 'historized' is set to 'snapshot'. Defaults to the global variable 'datavault4dbt.sdts_alias'. Only needs to be set if the alias deviates from the global variable.
+ " %} + + {%- set ref_hub = datavault4dbt.yaml_metadata_parser(name='ref_hub', yaml_metadata=yaml_metadata, parameter=ref_hub, required=True, documentation=ref_hub_description) -%} + {%- set ref_satellites = datavault4dbt.yaml_metadata_parser(name='ref_satellites', yaml_metadata=yaml_metadata, parameter=ref_satellites, required=True, documentation=ref_satellites_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set historized = datavault4dbt.yaml_metadata_parser(name='historized', yaml_metadata=yaml_metadata, parameter=historized, required=False, documentation=historized_description) -%} + {%- set snapshot_relation = datavault4dbt.yaml_metadata_parser(name='snapshot_relation', yaml_metadata=yaml_metadata, parameter=snapshot_relation, required=False, documentation=snapshot_relation_description) -%} + {%- set snapshot_trigger_column = datavault4dbt.yaml_metadata_parser(name='snapshot_trigger_column', yaml_metadata=yaml_metadata, parameter=snapshot_trigger_column, required=False, documentation=snapshot_trigger_column_description) -%} -Fully historized: - - {{ config(schema='core', materialized='view') }} - - {%- set yaml_metadata -%} - ref_hub: 'nation_rh' - ref_satellites: - - nation_rs1 - - nation_p_rs - historized: 'full' - {%- endset -%} - - {% set metadata_dict = fromyaml(yaml_metadata) %} - - {{ datavault4dbt.ref_table(ref_hub=metadata_dict['ref_hub'], - ref_satellites=metadata_dict['ref_satellites'], - historized=metadata_dict['historized'], - snapshot_relation=metadata_dict['snapshot_relation']) }} - -Only latest data: - - {{ config(schema='core', materialized='view') }} - - {%- set yaml_metadata -%} - ref_hub: 'nation_rh' - ref_satellites: - - nation_rs1 - - nation_p_rs - historized: 'latest' - {%- endset -%} - - {% set metadata_dict = fromyaml(yaml_metadata) %} - - {{ datavault4dbt.ref_table(ref_hub=metadata_dict['ref_hub'], - ref_satellites=metadata_dict['ref_satellites'], - historized=metadata_dict['historized'], - snapshot_relation=metadata_dict['snapshot_relation']) }} - -Snapshot Based: - - {{ config(schema='core', materialized='view') }} - - {%- set yaml_metadata -%} - ref_hub: 'nation_rh' - ref_satellites: - - nation_rs1 - - nation_p_rs - historized: 'snapshot' - snapshot_relation: snap_v1 - {%- endset -%} - - {% set metadata_dict = fromyaml(yaml_metadata) %} - - {{ datavault4dbt.ref_table(ref_hub=metadata_dict['ref_hub'], - ref_satellites=metadata_dict['ref_satellites'], - historized=metadata_dict['historized'], - snapshot_relation=metadata_dict['snapshot_relation']) }} - -Include / Exclude per Satellite: - - {{ config(schema='core', materialized='view') }} - - {%- set yaml_metadata -%} - ref_hub: 'nation_rh' - ref_satellites: - nation_rs1: - exclude: - - N_NAME - nation_p_rs: - include: - - N_NAME - historized: 'full' - {%- endset -%} - - {% set metadata_dict = fromyaml(yaml_metadata) %} - - {{ datavault4dbt.ref_table(ref_hub=metadata_dict['ref_hub'], - ref_satellites=metadata_dict['ref_satellites'], - historized=metadata_dict['historized'], - snapshot_relation=metadata_dict['snapshot_relation']) }} - - -#} - - -{%- macro ref_table(ref_hub, ref_satellites, src_ldts=none, src_rsrc=none, historized='latest', snapshot_relation=none, 
snapshot_trigger_column=none) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} diff --git a/macros/tables/sat_v0.sql b/macros/tables/sat_v0.sql index cf1a8c71..de5e156b 100644 --- a/macros/tables/sat_v0.sql +++ b/macros/tables/sat_v0.sql @@ -7,9 +7,11 @@ Features: - Can handle multiple updates per batch, without losing intermediate changes. therefore initial loading is supported. - Using a dynamic high-water-mark to optimize loading performance of multiple loads +#} - Parameters: +{%- macro sat_v0(yaml_metadata=none, parent_hashkey=none, src_hashdiff=none, src_payload=none, source_model=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} + {% set parent_hashkey_description = " parent_hashkey::string Name of the hashkey column inside the stage of the object that this satellite is attached to. Examples: @@ -18,7 +20,9 @@ 'hk_account_contact_l' The satellite would be attached to the link between account and contact, which has the column 'hk_account_contact_l' as a hashkey column. + " %} + {% set src_hashdiff_description = " src_hashdiff::string Name of the hashdiff column of this satellite, that was created inside the staging area and is calculated out of the entire payload of this satellite. The stage must hold one hashdiff per satellite entity. @@ -27,7 +31,9 @@ 'hd_account_data_sfdc_s' Since we recommend naming the hashdiff column similar to the name of the satellite entity, just with a prefix, this would be the hashdiff column of the data satellite for account. + " %} + {% set src_payload_description = " src_payload::list of strings A list of all the descriptive attributes that should be included in this satellite. Needs to be the columns that are fed into the hashdiff calculation of this satellite. @@ -35,21 +41,33 @@ ['name', 'address', 'country', 'phone', 'email'] This satellite would hold the columns 'name', 'address', 'country', 'phone' and 'email', coming out of the underlying staging area. + " %} + {% set source_model_description = " source_model::string Name of the underlying staging model, must be available inside dbt as a model. Examples: 'stage_account' This satellite is loaded out of the stage for account. + " %} - src_ldts::string Name of the ldts column inside the source model. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. + {% set src_ldts_description = " + src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} - src_rsrc::string Name of the rsrc column inside the source model. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. 
- -#} - -{%- macro sat_v0(parent_hashkey, src_hashdiff, src_payload, source_model, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} + " %} + + {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} + {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. #} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} diff --git a/macros/tables/sat_v1.sql b/macros/tables/sat_v1.sql index 6f743ec0..a573974d 100644 --- a/macros/tables/sat_v1.sql +++ b/macros/tables/sat_v1.sql @@ -2,15 +2,19 @@ This macro calculates a virtualized load end date on top of a version 0 satellite. This column is generated for usage in the PIT tables, and only virtualized to follow the insert-only approach. Usually one version 1 sat would be created for each version 0 sat. A version 1 satellite should be materialized as a view by default. +#} - Parameters: +{%- macro sat_v1(yaml_metadata=none, sat_v0=none, hashkey=none, hashdiff=none, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false, include_payload=true) -%} + {% set sat_v0_description = " sat_v0::string Name of the underlying version 0 satellite. Examples: 'account_data_sfdc_0_s' This satellite would be the version 1 satellite of the underlying version 0 data satellite for account. + " %} + {% set hashkey_description = " hashkey::string Name of the parent hashkey column inside the version 0 satellite. Would either be the hashkey of a hub or a link. Needs to be similar to the 'parent_hashkey' parameter inside the sat_v0 model. @@ -20,7 +24,9 @@ 'hk_account_contact_l' The satellite would be attached to the link between account and contact, which has the column 'hk_account_contact_l' as a hashkey column. + " %} + {% set hashdiff_description = " hashdiff::string Name of the hashdiff column inside the underlying version 0 satellite. 
Needs to be similar to the 'src_hashdiff' parameter inside the sat_v0 model. @@ -28,25 +34,42 @@ 'hd_account_data_sfdc_s' Since we recommend naming the hashdiff column similar to the name of the satellite entity, just with a prefix, this would be the hashdiff column of the data satellite for account. + " %} + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set ledts_alias_description = " ledts_alias::string Desired alias for the load end date column. Is optional, will use the global variable 'datavault4dbt.ledts_alias' if not set here. - + " %} + + {% set add_is_current_flag_description = " add_is_current_flag::boolean Optional parameter to add a new column to the v1 sat based on the load end date timestamp (ledts). Default is false. If set to true it will add this is_current flag to the v1 sat. For each record this column will be set to true if the load end date time stamp is equal to the variable end of all times. If it's not, then the record is not current and therefore it will be set to false. - + " %} + + {% set include_payload_description = " include_payload::boolean Optional parameter to specify if the v1 sat should have the payload columns from sat v0 or not. Default is true. -#} + " %} -{%- macro sat_v1(sat_v0, hashkey, hashdiff, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false, include_payload=true) -%} + {%- set sat_v0 = datavault4dbt.yaml_metadata_parser(name='sat_v0', yaml_metadata=yaml_metadata, parameter=sat_v0, required=True, documentation=sat_v0_description) -%} + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} + {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} + {%- set include_payload = datavault4dbt.yaml_metadata_parser(name='include_payload', yaml_metadata=yaml_metadata, parameter=include_payload, required=False, documentation=include_payload_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set.
#} diff --git a/macros/tables/snowflake/nh_link.sql b/macros/tables/snowflake/nh_link.sql index 701041d9..0b38718d 100644 --- a/macros/tables/snowflake/nh_link.sql +++ b/macros/tables/snowflake/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro snowflake__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro snowflake__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -25,6 +25,17 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower == 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} + {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} {%- endif -%} @@ -212,7 +223,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/synapse/nh_link.sql b/macros/tables/synapse/nh_link.sql index bf66d726..2f44bdf1 100644 --- a/macros/tables/synapse/nh_link.sql +++ b/macros/tables/synapse/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro synapse__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro synapse__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -26,6 +26,16 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower == 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} @@ -214,7 +224,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%}
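Usage sketch for the new yaml_metadata parameter on the pit macro: only the required parameters (tracked_entity, hashkey, sat_names, snapshot_relation, dimension_key) are shown, and all entity and column names below are hypothetical placeholders rather than names taken from this change.

{%- set yaml_metadata -%}
tracked_entity: contact_h
hashkey: hk_contact_h
sat_names:
    - contact_data_sfdc_0_s
snapshot_relation: snapshot_v1
dimension_key: hk_contact_h_d
{%- endset -%}

{{ datavault4dbt.pit(yaml_metadata=yaml_metadata) }}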
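The adapter implementations of nh_link (postgres, redshift, snowflake, synapse) now accept a union_strategy argument: 'all' keeps UNION ALL, 'distinct' switches to UNION, and anything else warns and falls back to UNION ALL. Assuming the top-level nh_link macro forwards this parameter (that dispatcher is not part of this diff), a model could opt into duplicate elimination roughly like this; all entity, payload, and stage names are placeholders.

{{ datavault4dbt.nh_link(link_hashkey='hk_transaction_nl',
                         payload=['amount', 'currency'],
                         source_models='stage_transaction',
                         union_strategy='distinct') }}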
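A sketch of rec_track_sat driven by the same yaml_metadata block; the source_models structure (rsrc_static, optional hk_column) reuses the example from the macro's own docstring, while the tracked hashkey name is a placeholder.

{%- set yaml_metadata -%}
tracked_hashkey: hk_contact_h
source_models:
    stage_contact:
        rsrc_static: '*/SALESFORCE/Contact/*'
    stage_partner:
        hk_column: hk_partner_h
        rsrc_static: '*/SALESFORCE/Partners/*'
{%- endset -%}

{{ datavault4dbt.rec_track_sat(yaml_metadata=yaml_metadata) }}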
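Several of the Redshift macros above collapse a two-CTE ROW_NUMBER()/LAG() preparation step into a single QUALIFY clause; the deliberately verbose alias in the diff documents that Redshift rejects a QUALIFY placed directly after an unaliased FROM. A minimal sketch of the deduplication pattern, with placeholder table and column names:

SELECT src.hk_example_h, src.hd_example_s
FROM existing_satellite AS src
QUALIFY ROW_NUMBER() OVER (PARTITION BY src.hk_example_h ORDER BY src.ldts DESC) = 1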
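The example model removed from ref_sat_v0.sql can be expressed more compactly with the new yaml_metadata parameter; the names below (stg_nation, N_NATIONKEY, hd_nation_rs and the payload columns) come from that removed example.

{{ config(materialized='incremental', schema='Core') }}

{%- set yaml_metadata -%}
source_model: stg_nation
parent_ref_keys: N_NATIONKEY
src_hashdiff: hd_nation_rs
src_payload:
    - N_COMMENT
    - N_NAME
    - N_REGIONKEY
{%- endset -%}

{{ datavault4dbt.ref_sat_v0(yaml_metadata=yaml_metadata) }}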
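Likewise, the snapshot-based example removed from ref_table.sql maps onto the new signature as a single yaml_metadata block; the model and satellite names are taken from that removed example.

{{ config(schema='core', materialized='view') }}

{%- set yaml_metadata -%}
ref_hub: 'nation_rh'
ref_satellites:
    - nation_rs1
    - nation_p_rs
historized: 'snapshot'
snapshot_relation: snap_v1
{%- endset -%}

{{ datavault4dbt.ref_table(yaml_metadata=yaml_metadata) }}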
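A sketch of sat_v1 with yaml_metadata, using the satellite, hashkey, and hashdiff names from the macro's own docstring examples; the view materialization follows the recommendation in that docstring, and the model itself is hypothetical.

{{ config(materialized='view', schema='Core') }}

{%- set yaml_metadata -%}
sat_v0: account_data_sfdc_0_s
hashkey: hk_account_h
hashdiff: hd_account_data_sfdc_s
add_is_current_flag: true
{%- endset -%}

{{ datavault4dbt.sat_v1(yaml_metadata=yaml_metadata) }}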