From 9b23ec8721d88cdd4dc9eaee4d299025a6dda477 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Mon, 17 May 2021 12:42:53 +0000 Subject: [PATCH 01/13] support `cross_join()` --- R/step-join.R | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/R/step-join.R b/R/step-join.R index 9da8944ac..ef972ac92 100644 --- a/R/step-join.R +++ b/R/step-join.R @@ -23,6 +23,24 @@ step_join <- function(x, y, on, style, suffix = c(".x", ".y")) { ) } +cross_join <- function(x, y) { + # TODO use `right_join()` as it produces a shorter query + # needs issue 243 to be fixed + # https://github.com/tidyverse/dtplyr/issues/243 + xy <- left_join( + mutate(x, .cross_join_col = 1), + mutate(y, .cross_join_col = 1), + by = ".cross_join_col" + ) + + # use custom select to produce way shorter query + step_subset_j( + xy, + vars = setdiff(xy$vars, ".cross_join_col"), + j = expr(!".cross_join_col") + ) +} + #' @export dt_sources.dtplyr_step_join <- function(x) { # TODO: need to throw error if same name refers to different tables. @@ -87,6 +105,10 @@ left_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE, suffix = c y <- dtplyr_auto_copy(x, y, copy = copy) by <- dtplyr_common_by(by, x, y) + if (is_empty(by)) { + return(cross_join(x, y)) + } + if (join_is_simple(x$vars, y$vars, by)) { col_order <- unique(c(x$vars, y$vars)) out <- step_subset_on(y, x, i = y, on = by) From 95e56ef9efcd0207b63d9be03fc9b0abed6c2577 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Mon, 17 May 2021 12:43:07 +0000 Subject: [PATCH 02/13] support for `nesting()` --- R/step-subset-expand.R | 148 ++++++++++++++++++++--- tests/testthat/test-step-subset-expand.R | 11 +- 2 files changed, 141 insertions(+), 18 deletions(-) diff --git a/R/step-subset-expand.R b/R/step-subset-expand.R index fefa753fa..b281032b3 100644 --- a/R/step-subset-expand.R +++ b/R/step-subset-expand.R @@ -38,12 +38,125 @@ #' fruits %>% dplyr::right_join(all) # exported onLoad expand.dtplyr_step <- function(data, ..., .name_repair = "check_unique") { - dots <- capture_dots(data, ..., .j = FALSE) - dots <- dots[!vapply(dots, is_null, logical(1))] + dots <- prepare_expand_dots(data, ..., .name_repair = .name_repair) + + # TODO handle factors if (length(dots) == 0) { return(data) } + expanded_simple <- expand_no_nesting(data, dots$simple) + expanded_nesting <- expand_nesting(data, dots$nesting, .name_repair = .name_repair) + + if (is_empty(expanded_simple)) { + out <- expanded_nesting + } else if (is_empty(expanded_nesting)) { + out <- expanded_simple + } else { + out <- left_join(expanded_nesting, expanded_simple, by = character()) + } + + renamed <- names(dots$select) != unname(dots$select) + relocated <- unname(dots$select) != out$vars + if (any(renamed) || any(relocated)) { + out <- select(out, !!!dots$select) + } + + out +} + +# exported onLoad +expand.data.table <- function(data, ..., .name_repair = "check_unique") { + data <- lazy_dt(data) + tidyr::expand(data, ..., .name_repair = .name_repair) +} + +prepare_expand_dots <- function(data, ..., .name_repair) { + dot_names_tidyr <- names(exprs(..., .named = TRUE)) + dots <- capture_dots(data, ..., .j = FALSE) + + dot_is_null <- vapply(dots, is_null, logical(1)) + dots <- dots[!dot_is_null] + dot_names_tidyr <- dot_names_tidyr[!dot_is_null] + + if (is_null(dots)) { + return(NULL) + } + + is_nesting <- purrr::map_lgl(dots, ~ is_call(.x, "nesting")) + + dots_df <- tibble( + expr = dots, + name_tidyr = ifelse(is_nesting, NA_character_, dot_names_tidyr), + position = seq_along(expr) + ) + + dots_df_nesting <- dots_df[is_nesting, ] + nesting_vars <- purrr::map(dots_df_nesting$expr, get_nesting_vars) + dots_df_nesting$name_tidyr <- purrr::map(nesting_vars, names) + dots_df_nesting$var <- purrr::map(nesting_vars, unlist) + + dots_df_simple <- dots_df[!is_nesting, ] + simple_vars <- dt_dot_names(dots_df_simple$expr) + dots_df_simple$name_dt <- names(simple_vars) + dots_df_simple$var <- simple_vars + + meta_df <- bind_rows( + dots_df_simple, + tidyr::unnest(dots_df_nesting, name_tidyr) + ) + + meta_df <- arrange(meta_df, position) + meta_df$name_dt <- coalesce(meta_df$name_dt, meta_df$name_tidyr) + + meta_df$name_tidyr <- vctrs::vec_as_names(meta_df$name_tidyr, repair = .name_repair) + + + list( + simple = dots_df_simple$var, + nesting = dots_df_nesting$var, + select = purrr::set_names(meta_df$name_dt, meta_df$name_tidyr) + ) +} + +get_nesting_vars <- function(expr) { + args <- call_args(expr) + + repair <- args[[".name_repair"]] %||% "check_unique" + args[[".name_repair"]] <- NULL + + vars <- exprs_auto_name(args) + nms <- vctrs::vec_as_names(names(vars), repair = repair) + purrr::set_names(vars, nms) +} + +expand_nesting <- function(data, vars, .name_repair) { + if (is_empty(vars)) { + return(NULL) + } + + # now that `nesting()` has been unpacked resolve name conflicts + out_names <- names(exprs_auto_name(purrr::flatten(vars))) + out_names_repaired <- vctrs::vec_as_names(out_names, repair = .name_repair) + + ns <- lengths(vars) + indices <- vctrs::vec_rep_each(seq_along(vars), ns) + out_names_list <- vctrs::vec_split(out_names_repaired, indices)$val + + distinct_tables <- purrr::map2( + vars, out_names_list, + ~ { + args <- set_names(.x, .y) + data_distinct <- distinct(data, !!!args) + # ensure same ordering as for data frame + arrange(data_distinct, across(), .by_group = TRUE) + } + ) + + purrr::reduce(distinct_tables, left_join, by = group_vars(data)) +} + +dt_dot_names <- function(dots, .name_repair) { named_dots <- have_name(dots) if (any(!named_dots)) { # Auto-names generated by enquos() don't always work with the CJ() step @@ -55,24 +168,31 @@ expand.dtplyr_step <- function(data, ..., .name_repair = "check_unique") { names(dots)[needs_v_name] <- v_names[needs_v_name] names(dots)[symbol_dots] <- lapply(dots[symbol_dots], as_name) } - names(dots) <- vctrs::vec_as_names(names(dots), repair = .name_repair) - - on <- names(dots) - cj <- expr(CJ(!!!syms(on), unique = TRUE)) + dots +} - out <- distinct(data, !!!syms(data$groups), !!!dots) +expand_no_nesting <- function(data, dots, .name_repair) { if (length(data$groups) == 0) { - out <- step_subset(out, i = cj, on = on) + dt_vars <- names(dots) + + dt_auto_names <- names(dt_dot_names(unname(dots))) + name_needed <- dt_auto_names != dt_vars + names(dots)[!name_needed] <- "" + + out <- step_subset_j( + parent = data, + vars = dt_vars, + j = expr(CJ(!!!dots, unique = TRUE)) + ) } else { + out <- distinct(data, !!!syms(data$groups), !!!dots) + + on <- names(dots) + cj <- expr(CJ(!!!syms(on), unique = TRUE)) + on <- call2(".", !!!syms(on)) out <- step_subset(out, j = expr(.SD[!!cj, on = !!on])) } out } - -# exported onLoad -expand.data.table <- function(data, ..., .name_repair = "check_unique") { - data <- lazy_dt(data) - tidyr::expand(data, ..., .name_repair = .name_repair) -} diff --git a/tests/testthat/test-step-subset-expand.R b/tests/testthat/test-step-subset-expand.R index 69086da5c..a14922357 100644 --- a/tests/testthat/test-step-subset-expand.R +++ b/tests/testthat/test-step-subset-expand.R @@ -6,7 +6,8 @@ test_that("expand completes all values", { expect_equal( show_query(step), - expr(unique(DT[, .(x, y)])[CJ(x, y, unique = TRUE), on = .(x, y)]) + # expr(unique(DT[, .(x, y)])[CJ(x, y, unique = TRUE), on = .(x, y)]) + expr(DT[, CJ(x, y, unique = TRUE)]) ) expect_equal(step$vars, c("x", "y")) expect_equal(nrow(out), 4) @@ -29,9 +30,10 @@ test_that("works with unnamed vectors", { expect_equal( show_query(step), - expr(unique(DT[, .(x = x, V2 = 1:2)])[CJ(x, V2, unique = TRUE), on = .(x, V2)]) + # expr(unique(DT[, .(x = x, V2 = 1:2)])[CJ(x, V2, unique = TRUE), on = .(x, V2)]) + expr(DT[, CJ(x, 1:2, unique = TRUE)][, .(x, `1:2` = V2)]) ) - expect_equal(step$vars, c("x", "V2")) + expect_equal(step$vars, c("x", "1:2")) expect_equal(nrow(out), 4) }) @@ -43,7 +45,8 @@ test_that("works with named vectors", { expect_equal( show_query(step), - expr(unique(DT[, .(x = x, val = 1:2)])[CJ(x, val, unique = TRUE), on = .(x, val)]) + # expr(unique(DT[, .(x = x, val = 1:2)])[CJ(x, val, unique = TRUE), on = .(x, val)]) + expr(DT[, CJ(x, val = 1:2, unique = TRUE)]) ) expect_equal(step$vars, c("x", "val")) expect_equal(nrow(out), 4) From 06ce20382ef74010dd20178ffbb6b4ebc0f82d09 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 19 May 2021 08:25:49 +0000 Subject: [PATCH 03/13] refactor --- R/step-subset-expand.R | 56 ++++++++++-------------------------------- 1 file changed, 13 insertions(+), 43 deletions(-) diff --git a/R/step-subset-expand.R b/R/step-subset-expand.R index b281032b3..39c0bc9a1 100644 --- a/R/step-subset-expand.R +++ b/R/step-subset-expand.R @@ -45,16 +45,12 @@ expand.dtplyr_step <- function(data, ..., .name_repair = "check_unique") { return(data) } - expanded_simple <- expand_no_nesting(data, dots$simple) - expanded_nesting <- expand_nesting(data, dots$nesting, .name_repair = .name_repair) + tbl_list <- c( + list(expand_no_nesting(data, dots$simple)), + expand_nesting(data, dots$nesting) + ) - if (is_empty(expanded_simple)) { - out <- expanded_nesting - } else if (is_empty(expanded_nesting)) { - out <- expanded_simple - } else { - out <- left_join(expanded_nesting, expanded_simple, by = character()) - } + out <- purrr::reduce(tbl_list, left_join, by = group_vars(data)) renamed <- names(dots$select) != unname(dots$select) relocated <- unname(dots$select) != out$vars @@ -75,20 +71,15 @@ prepare_expand_dots <- function(data, ..., .name_repair) { dot_names_tidyr <- names(exprs(..., .named = TRUE)) dots <- capture_dots(data, ..., .j = FALSE) - dot_is_null <- vapply(dots, is_null, logical(1)) - dots <- dots[!dot_is_null] - dot_names_tidyr <- dot_names_tidyr[!dot_is_null] - + dots <- dots[!vapply(dots, is_null, logical(1))] if (is_null(dots)) { return(NULL) } is_nesting <- purrr::map_lgl(dots, ~ is_call(.x, "nesting")) - dots_df <- tibble( expr = dots, - name_tidyr = ifelse(is_nesting, NA_character_, dot_names_tidyr), - position = seq_along(expr) + position = seq_along(dots) ) dots_df_nesting <- dots_df[is_nesting, ] @@ -100,22 +91,19 @@ prepare_expand_dots <- function(data, ..., .name_repair) { simple_vars <- dt_dot_names(dots_df_simple$expr) dots_df_simple$name_dt <- names(simple_vars) dots_df_simple$var <- simple_vars + dots_df_simple$name_tidyr <- dot_names_tidyr[!is_nesting] meta_df <- bind_rows( dots_df_simple, tidyr::unnest(dots_df_nesting, name_tidyr) ) - - meta_df <- arrange(meta_df, position) - meta_df$name_dt <- coalesce(meta_df$name_dt, meta_df$name_tidyr) - - meta_df$name_tidyr <- vctrs::vec_as_names(meta_df$name_tidyr, repair = .name_repair) - + names_dt <- coalesce(meta_df$name_dt, meta_df$name_tidyr) + names_tidyr <- vctrs::vec_as_names(meta_df$name_tidyr, repair = .name_repair) list( simple = dots_df_simple$var, nesting = dots_df_nesting$var, - select = purrr::set_names(meta_df$name_dt, meta_df$name_tidyr) + select = purrr::set_names(names_dt, names_tidyr)[order(meta_df$position)] ) } @@ -130,30 +118,12 @@ get_nesting_vars <- function(expr) { purrr::set_names(vars, nms) } -expand_nesting <- function(data, vars, .name_repair) { +expand_nesting <- function(data, vars) { if (is_empty(vars)) { return(NULL) } - # now that `nesting()` has been unpacked resolve name conflicts - out_names <- names(exprs_auto_name(purrr::flatten(vars))) - out_names_repaired <- vctrs::vec_as_names(out_names, repair = .name_repair) - - ns <- lengths(vars) - indices <- vctrs::vec_rep_each(seq_along(vars), ns) - out_names_list <- vctrs::vec_split(out_names_repaired, indices)$val - - distinct_tables <- purrr::map2( - vars, out_names_list, - ~ { - args <- set_names(.x, .y) - data_distinct <- distinct(data, !!!args) - # ensure same ordering as for data frame - arrange(data_distinct, across(), .by_group = TRUE) - } - ) - - purrr::reduce(distinct_tables, left_join, by = group_vars(data)) + purrr::map(vars, ~ distinct(data, !!!.x)) } dt_dot_names <- function(dots, .name_repair) { From 3f8149166ee6a8776a15fa6ff3152b4a0af62e5d Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 19 May 2021 12:06:38 +0000 Subject: [PATCH 04/13] `prepare_expand_dots()` respects groups & handles NULL --- R/step-subset-expand.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/R/step-subset-expand.R b/R/step-subset-expand.R index 39c0bc9a1..335cc6f59 100644 --- a/R/step-subset-expand.R +++ b/R/step-subset-expand.R @@ -68,10 +68,11 @@ expand.data.table <- function(data, ..., .name_repair = "check_unique") { } prepare_expand_dots <- function(data, ..., .name_repair) { - dot_names_tidyr <- names(exprs(..., .named = TRUE)) dots <- capture_dots(data, ..., .j = FALSE) - dots <- dots[!vapply(dots, is_null, logical(1))] + dot_is_null <- vapply(dots, is_null, logical(1)) + dots <- dots[!dot_is_null] + dot_names_tidyr <- names(exprs(..., .named = TRUE))[!dot_is_null] if (is_null(dots)) { return(NULL) } @@ -97,13 +98,18 @@ prepare_expand_dots <- function(data, ..., .name_repair) { dots_df_simple, tidyr::unnest(dots_df_nesting, name_tidyr) ) - names_dt <- coalesce(meta_df$name_dt, meta_df$name_tidyr) - names_tidyr <- vctrs::vec_as_names(meta_df$name_tidyr, repair = .name_repair) + groups <- group_vars(data) + names_dt <- c(groups, coalesce(meta_df$name_dt, meta_df$name_tidyr)) + names_tidyr <- vctrs::vec_as_names( + c(groups, meta_df$name_tidyr), + repair = .name_repair + ) + order <- c(seq_along(groups), length(groups) + order(meta_df$position)) list( simple = dots_df_simple$var, nesting = dots_df_nesting$var, - select = purrr::set_names(names_dt, names_tidyr)[order(meta_df$position)] + select = purrr::set_names(names_dt, names_tidyr)[order] ) } From 874526709298c617943e3d25b8d99644d9c9ddfc Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 19 May 2021 12:45:18 +0000 Subject: [PATCH 05/13] fix `check()` issues --- R/step-subset-expand.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/R/step-subset-expand.R b/R/step-subset-expand.R index 335cc6f59..7efbae270 100644 --- a/R/step-subset-expand.R +++ b/R/step-subset-expand.R @@ -50,7 +50,7 @@ expand.dtplyr_step <- function(data, ..., .name_repair = "check_unique") { expand_nesting(data, dots$nesting) ) - out <- purrr::reduce(tbl_list, left_join, by = group_vars(data)) + out <- Reduce(function(x, y) left_join(x, y, by = group_vars(data)), tbl_list) renamed <- names(dots$select) != unname(dots$select) relocated <- unname(dots$select) != out$vars @@ -77,16 +77,16 @@ prepare_expand_dots <- function(data, ..., .name_repair) { return(NULL) } - is_nesting <- purrr::map_lgl(dots, ~ is_call(.x, "nesting")) - dots_df <- tibble( + is_nesting <- vapply(dots, function(x) is_call(x, "nesting"), logical(1)) + dots_df <- tibble::tibble( expr = dots, position = seq_along(dots) ) dots_df_nesting <- dots_df[is_nesting, ] - nesting_vars <- purrr::map(dots_df_nesting$expr, get_nesting_vars) - dots_df_nesting$name_tidyr <- purrr::map(nesting_vars, names) - dots_df_nesting$var <- purrr::map(nesting_vars, unlist) + nesting_vars <- lapply(dots_df_nesting$expr, get_nesting_vars) + dots_df_nesting$name_tidyr <- lapply(nesting_vars, names) + dots_df_nesting$var <- lapply(nesting_vars, unlist) dots_df_simple <- dots_df[!is_nesting, ] simple_vars <- dt_dot_names(dots_df_simple$expr) @@ -94,12 +94,12 @@ prepare_expand_dots <- function(data, ..., .name_repair) { dots_df_simple$var <- simple_vars dots_df_simple$name_tidyr <- dot_names_tidyr[!is_nesting] - meta_df <- bind_rows( + meta_df <- dplyr::bind_rows( dots_df_simple, - tidyr::unnest(dots_df_nesting, name_tidyr) + tidyr::unnest(dots_df_nesting, "name_tidyr") ) groups <- group_vars(data) - names_dt <- c(groups, coalesce(meta_df$name_dt, meta_df$name_tidyr)) + names_dt <- c(groups, dplyr::coalesce(meta_df$name_dt, meta_df$name_tidyr)) names_tidyr <- vctrs::vec_as_names( c(groups, meta_df$name_tidyr), repair = .name_repair @@ -109,7 +109,7 @@ prepare_expand_dots <- function(data, ..., .name_repair) { list( simple = dots_df_simple$var, nesting = dots_df_nesting$var, - select = purrr::set_names(names_dt, names_tidyr)[order] + select = set_names(names_dt, names_tidyr)[order] ) } @@ -121,7 +121,7 @@ get_nesting_vars <- function(expr) { vars <- exprs_auto_name(args) nms <- vctrs::vec_as_names(names(vars), repair = repair) - purrr::set_names(vars, nms) + set_names(vars, nms) } expand_nesting <- function(data, vars) { @@ -129,7 +129,7 @@ expand_nesting <- function(data, vars) { return(NULL) } - purrr::map(vars, ~ distinct(data, !!!.x)) + lapply(vars, function(x) distinct(data, !!!x)) } dt_dot_names <- function(dots, .name_repair) { From bda39f32102620b6ec37332ee06a02f84e7c7302 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 25 May 2021 06:53:46 +0000 Subject: [PATCH 06/13] `group_by()` can handle empty groups --- R/step-group.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/step-group.R b/R/step-group.R index 9e291c9ae..10c03bc15 100644 --- a/R/step-group.R +++ b/R/step-group.R @@ -84,7 +84,7 @@ group_by.dtplyr_step <- function(.data, ..., .add = FALSE, add = deprecated(), a dots[!existing] <- syms(names(dots[!existing])) } - groups <- c(if (.add) .data$groups, names(dots)) + groups <- c(if (.add) .data$groups, names(dots)) %||% character() arranged <- if (!is.null(.data$arrange)) .data$arrange && arrange else arrange step_group(.data, groups, arranged) From 6c5413c59006e53d6b7f63b4dbbdf15f90b21e1b Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 25 May 2021 06:59:41 +0000 Subject: [PATCH 07/13] add `nest()` --- R/compat-purrr.R | 13 ++++++++ R/compat-tidyr.R | 9 ++++++ R/step-nest.R | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ R/zzz.R | 2 ++ 4 files changed, 106 insertions(+) create mode 100644 R/compat-purrr.R create mode 100644 R/compat-tidyr.R create mode 100644 R/step-nest.R diff --git a/R/compat-purrr.R b/R/compat-purrr.R new file mode 100644 index 000000000..dc8432ad0 --- /dev/null +++ b/R/compat-purrr.R @@ -0,0 +1,13 @@ +imap <- function(.x, .f, ...) { + map2(.x, names(.x) %||% seq_along(.x), .f, ...) +} + +map2 <- function(.x, .y, .f, ...) { + .f <- as_function(.f, env = global_env()) + out <- mapply(.f, .x, .y, MoreArgs = list(...), SIMPLIFY = FALSE) + if (length(out) == length(.x)) { + set_names(out, names(.x)) + } else { + set_names(out, NULL) + } +} diff --git a/R/compat-tidyr.R b/R/compat-tidyr.R new file mode 100644 index 000000000..c5ff3f7c1 --- /dev/null +++ b/R/compat-tidyr.R @@ -0,0 +1,9 @@ +strip_names <- function(df, base, names_sep) { + base <- paste0(base, names_sep) + names <- names(df) + + has_prefix <- regexpr(base, names, fixed = TRUE) == 1L + names[has_prefix] <- substr(names[has_prefix], nchar(base) + 1, nchar(names[has_prefix])) + + set_names(df, names) +} diff --git a/R/step-nest.R b/R/step-nest.R new file mode 100644 index 000000000..ae4d3d5e6 --- /dev/null +++ b/R/step-nest.R @@ -0,0 +1,82 @@ +#' Nest +#' +#' @description +#' This is a method for the tidyr `nest()` generic. +#' +#' @inheritParams tidyr::nest +#' @param data A [lazy_dt()]. +#' @examples +#' if (require("tidyr", quietly = TRUE)) { +#' dt <- lazy_dt(tibble(x = c(1, 2, 1), y = c("a", "a", "b"))) +#' dt %>% nest(data = y) +#' +#' dt %>% dplyr::group_by(x) %>% nest() +#' } +# exported onLoad +nest.dtplyr_step <- function(.data, ..., .names_sep = NULL) { + cols <- eval_nest_dots(.data, ...) + + cols <- lapply(cols, set_names) + if (!is.null(.names_sep)) { + cols <- imap(cols, strip_names, .names_sep) + } + + if (length(cols) == 1 && is.null(.names_sep)) { + # use `.SD` as it is shorter and faster + nm <- names(cols) + j_exprs <- exprs(!!nm := .(.SD)) + } else { + j_exprs <- imap( + cols, + function(x, name) { + x <- simplify_names2(x) + expr(.(data.table(!!!syms(x)))) + } + ) + } + + asis <- setdiff(.data$vars, unlist(cols)) + out <- step_subset_j( + .data, + vars = c(asis, names(cols)), + j = expr(.(!!!j_exprs)), + groups = asis, + arrange = FALSE + ) + + groups <- intersect(out$vars, group_vars(.data)) + group_by(out, !!!syms(groups)) +} + +# exported onLoad +nest.data.table <- function(.data, ..., .names_sep = NULL) { + .data <- lazy_dt(.data) + tidyr::nest(.data, ...) +} + +eval_nest_dots <- function(.data, ...) { + if (missing(...)) { + groups <- group_vars(.data) + if (is_empty(groups)) { + warn(paste0( + "`...` must not be empty for ungrouped data frames.\n", + "Did you want `data = everything()`?" + )) + } + + nest_vars <- setdiff(.data$vars, groups) + list(data = nest_vars) + } else { + cols <- enquos(...) + sim_data <- simulate_vars(.data) + lapply(cols, function(.x) names(tidyselect::eval_select(.x, sim_data))) + } +} + +simplify_names2 <- function(x) { + # TODO this could be used in more places + auto_names <- rlang::exprs_auto_name(x) + name_unnecessary <- names2(x) == auto_names + names(x)[name_unnecessary] <- "" + x +} diff --git a/R/zzz.R b/R/zzz.R index 82c810ca8..5c97aa644 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -11,6 +11,7 @@ register_s3_method("tidyr", "pivot_longer", "data.table") register_s3_method("tidyr", "pivot_wider", "data.table") register_s3_method("tidyr", "replace_na", "data.table") + register_s3_method("tidyr", "nest", "data.table") register_s3_method("dplyr", "filter", "dtplyr_step") register_s3_method("dplyr", "intersect", "dtplyr_step") @@ -23,6 +24,7 @@ register_s3_method("tidyr", "pivot_longer", "dtplyr_step") register_s3_method("tidyr", "pivot_wider", "dtplyr_step") register_s3_method("tidyr", "replace_na", "dtplyr_step") + register_s3_method("tidyr", "nest", "dtplyr_step") } register_s3_method <- function(pkg, generic, class, fun = NULL) { From 1566611f3a7867e86e367364e2ba024a55f0f77d Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 25 May 2021 06:59:52 +0000 Subject: [PATCH 08/13] add tests --- tests/testthat/test-step-nest.R | 84 +++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tests/testthat/test-step-nest.R diff --git a/tests/testthat/test-step-nest.R b/tests/testthat/test-step-nest.R new file mode 100644 index 000000000..508dfdbba --- /dev/null +++ b/tests/testthat/test-step-nest.R @@ -0,0 +1,84 @@ +test_that("nest turns grouped values into one list-df", { + ldt <- lazy_dt(tibble(x = c(1, 1, 1), y = 1:3), "DT") + out <- nest(ldt, data = y) + outc <- collect(out) + + expect_equal(show_query(out), expr(DT[, .(data = .(.SD)), by = .(x)])) + + expect_equal(group_vars(out), character()) + expect_equal(out$vars, c("x", "data")) + + expect_equal(outc$x, 1) + expect_equal(length(outc$data), 1L) + expect_equal(outc$data[[1L]], data.table(y = 1:3)) +}) + +test_that("nest uses grouping vars if present", { + ldt <- lazy_dt(tibble(x = c(1, 1, 1), y = 1:3), "DT") + out <- nest(dplyr::group_by(ldt, x)) + + expect_equal(group_vars(out), "x") + expect_equal(show_query(out), expr(DT[, .(data = .(.SD)), by = .(x)])) +}) + +test_that("provided grouping vars override grouped defaults", { + ldt <- tibble(x = 1, y = 2, z = 3) %>% group_by(x) %>% lazy_dt("DT") + out <- nest(ldt, data = y) + + expect_equal(show_query(out), expr(DT[, .(data = .(.SD)), by = .(x, z)])) + expect_equal(group_vars(out), "x") + expect_equal(out$vars, c("x", "z", "data")) +}) + +test_that("puts data into the correct row", { + ldt <- tibble(x = 1:3, y = c("B", "A", "A")) %>% lazy_dt() + out <- nest(ldt, data = x) %>% collect() %>% dplyr::filter(y == "B") + expect_equal(out$data[[1]]$x, 1) +}) + +test_that("nesting everything yields a simple data frame", { + dt <- data.table(x = 1:3, y = c("B", "A", "A")) + ldt <- lazy_dt(dt, "DT") + out <- nest(ldt, data = c(x, y)) + + expect_equal(show_query(out), expr(DT[, .(data = .(.SD))])) + expect_equal(out$vars, "data") + + expect_equal(collect(out)$data, list(dt)) +}) + +test_that("nest preserves order of data", { + ldt <- lazy_dt(tibble(x = c(1, 3, 2, 3, 2), y = 1:5), "DT") + out <- nest(ldt, data = y) + expect_equal(collect(out)$x, c(1, 3, 2)) +}) + +test_that("can strip names", { + ldt <- lazy_dt(tibble(x = c(1, 1, 1), ya = 1:3, yb = 4:6), "DT") + out <- nest(ldt, y = starts_with("y"), .names_sep = "") + + expect_equal( + show_query(out), + expr(DT[, .(y = .(data.table(a = ya, b = yb))), by = .(x)]) + ) + + expect_named(collect(out)$y[[1]], c("a", "b")) +}) + +test_that("can nest multiple columns", { + ldt <- lazy_dt(tibble(x = 1, a1 = 1, a2 = 2, b1 = 1, b2 = 2), "DT") + out <- ldt %>% nest(a = c(a1, a2), b = c(b1, b2)) + + expect_equal( + show_query(out), + expr(DT[, .(a = .(data.table(a1, a2)), b = .(data.table(b1, b2))), by = .(x)]) + ) + expect_equal(out$vars, c("x", "a", "b")) +}) + +test_that("nesting no columns nests all inputs", { + # included only for backward compatibility + ldt <- lazy_dt(tibble(a1 = 1, a2 = 2, b1 = 1, b2 = 2), "DT") + expect_warning(out <- nest(ldt), "must not be empty") + expect_equal(show_query(out), expr(DT[, .(data = .(.SD))])) +}) From 6f94f1a78607e7654f2c989f8b0d36dbc51b400d Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 25 May 2021 07:01:56 +0000 Subject: [PATCH 09/13] add news --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 7bbde4968..a5a543585 100644 --- a/NEWS.md +++ b/NEWS.md @@ -43,6 +43,8 @@ * `pivot_longer()` (@markfairbanks, #204) * `replace_na()` (@markfairbanks, #202) + + * `nest()` (@mgirlich, #251) # dtplyr 1.1.0 From 46a2110b56be4c1167132a1d9a667d8937efc1a3 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 25 May 2021 09:04:09 +0200 Subject: [PATCH 10/13] Revert "add news" This reverts commit 6f94f1a78607e7654f2c989f8b0d36dbc51b400d. --- NEWS.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index a5a543585..7bbde4968 100644 --- a/NEWS.md +++ b/NEWS.md @@ -43,8 +43,6 @@ * `pivot_longer()` (@markfairbanks, #204) * `replace_na()` (@markfairbanks, #202) - - * `nest()` (@mgirlich, #251) # dtplyr 1.1.0 From 24d65d98abdd3bae34b058d48af2989b97fece52 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 25 May 2021 09:04:18 +0200 Subject: [PATCH 11/13] Revert "add tests" This reverts commit 1566611f3a7867e86e367364e2ba024a55f0f77d. --- tests/testthat/test-step-nest.R | 84 --------------------------------- 1 file changed, 84 deletions(-) delete mode 100644 tests/testthat/test-step-nest.R diff --git a/tests/testthat/test-step-nest.R b/tests/testthat/test-step-nest.R deleted file mode 100644 index 508dfdbba..000000000 --- a/tests/testthat/test-step-nest.R +++ /dev/null @@ -1,84 +0,0 @@ -test_that("nest turns grouped values into one list-df", { - ldt <- lazy_dt(tibble(x = c(1, 1, 1), y = 1:3), "DT") - out <- nest(ldt, data = y) - outc <- collect(out) - - expect_equal(show_query(out), expr(DT[, .(data = .(.SD)), by = .(x)])) - - expect_equal(group_vars(out), character()) - expect_equal(out$vars, c("x", "data")) - - expect_equal(outc$x, 1) - expect_equal(length(outc$data), 1L) - expect_equal(outc$data[[1L]], data.table(y = 1:3)) -}) - -test_that("nest uses grouping vars if present", { - ldt <- lazy_dt(tibble(x = c(1, 1, 1), y = 1:3), "DT") - out <- nest(dplyr::group_by(ldt, x)) - - expect_equal(group_vars(out), "x") - expect_equal(show_query(out), expr(DT[, .(data = .(.SD)), by = .(x)])) -}) - -test_that("provided grouping vars override grouped defaults", { - ldt <- tibble(x = 1, y = 2, z = 3) %>% group_by(x) %>% lazy_dt("DT") - out <- nest(ldt, data = y) - - expect_equal(show_query(out), expr(DT[, .(data = .(.SD)), by = .(x, z)])) - expect_equal(group_vars(out), "x") - expect_equal(out$vars, c("x", "z", "data")) -}) - -test_that("puts data into the correct row", { - ldt <- tibble(x = 1:3, y = c("B", "A", "A")) %>% lazy_dt() - out <- nest(ldt, data = x) %>% collect() %>% dplyr::filter(y == "B") - expect_equal(out$data[[1]]$x, 1) -}) - -test_that("nesting everything yields a simple data frame", { - dt <- data.table(x = 1:3, y = c("B", "A", "A")) - ldt <- lazy_dt(dt, "DT") - out <- nest(ldt, data = c(x, y)) - - expect_equal(show_query(out), expr(DT[, .(data = .(.SD))])) - expect_equal(out$vars, "data") - - expect_equal(collect(out)$data, list(dt)) -}) - -test_that("nest preserves order of data", { - ldt <- lazy_dt(tibble(x = c(1, 3, 2, 3, 2), y = 1:5), "DT") - out <- nest(ldt, data = y) - expect_equal(collect(out)$x, c(1, 3, 2)) -}) - -test_that("can strip names", { - ldt <- lazy_dt(tibble(x = c(1, 1, 1), ya = 1:3, yb = 4:6), "DT") - out <- nest(ldt, y = starts_with("y"), .names_sep = "") - - expect_equal( - show_query(out), - expr(DT[, .(y = .(data.table(a = ya, b = yb))), by = .(x)]) - ) - - expect_named(collect(out)$y[[1]], c("a", "b")) -}) - -test_that("can nest multiple columns", { - ldt <- lazy_dt(tibble(x = 1, a1 = 1, a2 = 2, b1 = 1, b2 = 2), "DT") - out <- ldt %>% nest(a = c(a1, a2), b = c(b1, b2)) - - expect_equal( - show_query(out), - expr(DT[, .(a = .(data.table(a1, a2)), b = .(data.table(b1, b2))), by = .(x)]) - ) - expect_equal(out$vars, c("x", "a", "b")) -}) - -test_that("nesting no columns nests all inputs", { - # included only for backward compatibility - ldt <- lazy_dt(tibble(a1 = 1, a2 = 2, b1 = 1, b2 = 2), "DT") - expect_warning(out <- nest(ldt), "must not be empty") - expect_equal(show_query(out), expr(DT[, .(data = .(.SD))])) -}) From 43faf0000d6972665c280f11ece175d56db4bb13 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 25 May 2021 09:04:26 +0200 Subject: [PATCH 12/13] Revert "add `nest()`" This reverts commit 6c5413c59006e53d6b7f63b4dbbdf15f90b21e1b. --- R/compat-purrr.R | 13 -------- R/compat-tidyr.R | 9 ------ R/step-nest.R | 82 ------------------------------------------------ R/zzz.R | 2 -- 4 files changed, 106 deletions(-) delete mode 100644 R/compat-purrr.R delete mode 100644 R/compat-tidyr.R delete mode 100644 R/step-nest.R diff --git a/R/compat-purrr.R b/R/compat-purrr.R deleted file mode 100644 index dc8432ad0..000000000 --- a/R/compat-purrr.R +++ /dev/null @@ -1,13 +0,0 @@ -imap <- function(.x, .f, ...) { - map2(.x, names(.x) %||% seq_along(.x), .f, ...) -} - -map2 <- function(.x, .y, .f, ...) { - .f <- as_function(.f, env = global_env()) - out <- mapply(.f, .x, .y, MoreArgs = list(...), SIMPLIFY = FALSE) - if (length(out) == length(.x)) { - set_names(out, names(.x)) - } else { - set_names(out, NULL) - } -} diff --git a/R/compat-tidyr.R b/R/compat-tidyr.R deleted file mode 100644 index c5ff3f7c1..000000000 --- a/R/compat-tidyr.R +++ /dev/null @@ -1,9 +0,0 @@ -strip_names <- function(df, base, names_sep) { - base <- paste0(base, names_sep) - names <- names(df) - - has_prefix <- regexpr(base, names, fixed = TRUE) == 1L - names[has_prefix] <- substr(names[has_prefix], nchar(base) + 1, nchar(names[has_prefix])) - - set_names(df, names) -} diff --git a/R/step-nest.R b/R/step-nest.R deleted file mode 100644 index ae4d3d5e6..000000000 --- a/R/step-nest.R +++ /dev/null @@ -1,82 +0,0 @@ -#' Nest -#' -#' @description -#' This is a method for the tidyr `nest()` generic. -#' -#' @inheritParams tidyr::nest -#' @param data A [lazy_dt()]. -#' @examples -#' if (require("tidyr", quietly = TRUE)) { -#' dt <- lazy_dt(tibble(x = c(1, 2, 1), y = c("a", "a", "b"))) -#' dt %>% nest(data = y) -#' -#' dt %>% dplyr::group_by(x) %>% nest() -#' } -# exported onLoad -nest.dtplyr_step <- function(.data, ..., .names_sep = NULL) { - cols <- eval_nest_dots(.data, ...) - - cols <- lapply(cols, set_names) - if (!is.null(.names_sep)) { - cols <- imap(cols, strip_names, .names_sep) - } - - if (length(cols) == 1 && is.null(.names_sep)) { - # use `.SD` as it is shorter and faster - nm <- names(cols) - j_exprs <- exprs(!!nm := .(.SD)) - } else { - j_exprs <- imap( - cols, - function(x, name) { - x <- simplify_names2(x) - expr(.(data.table(!!!syms(x)))) - } - ) - } - - asis <- setdiff(.data$vars, unlist(cols)) - out <- step_subset_j( - .data, - vars = c(asis, names(cols)), - j = expr(.(!!!j_exprs)), - groups = asis, - arrange = FALSE - ) - - groups <- intersect(out$vars, group_vars(.data)) - group_by(out, !!!syms(groups)) -} - -# exported onLoad -nest.data.table <- function(.data, ..., .names_sep = NULL) { - .data <- lazy_dt(.data) - tidyr::nest(.data, ...) -} - -eval_nest_dots <- function(.data, ...) { - if (missing(...)) { - groups <- group_vars(.data) - if (is_empty(groups)) { - warn(paste0( - "`...` must not be empty for ungrouped data frames.\n", - "Did you want `data = everything()`?" - )) - } - - nest_vars <- setdiff(.data$vars, groups) - list(data = nest_vars) - } else { - cols <- enquos(...) - sim_data <- simulate_vars(.data) - lapply(cols, function(.x) names(tidyselect::eval_select(.x, sim_data))) - } -} - -simplify_names2 <- function(x) { - # TODO this could be used in more places - auto_names <- rlang::exprs_auto_name(x) - name_unnecessary <- names2(x) == auto_names - names(x)[name_unnecessary] <- "" - x -} diff --git a/R/zzz.R b/R/zzz.R index 5c97aa644..82c810ca8 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -11,7 +11,6 @@ register_s3_method("tidyr", "pivot_longer", "data.table") register_s3_method("tidyr", "pivot_wider", "data.table") register_s3_method("tidyr", "replace_na", "data.table") - register_s3_method("tidyr", "nest", "data.table") register_s3_method("dplyr", "filter", "dtplyr_step") register_s3_method("dplyr", "intersect", "dtplyr_step") @@ -24,7 +23,6 @@ register_s3_method("tidyr", "pivot_longer", "dtplyr_step") register_s3_method("tidyr", "pivot_wider", "dtplyr_step") register_s3_method("tidyr", "replace_na", "dtplyr_step") - register_s3_method("tidyr", "nest", "dtplyr_step") } register_s3_method <- function(pkg, generic, class, fun = NULL) { From 401e733f660a947dcf49546adc620738922cc811 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 25 May 2021 09:04:34 +0200 Subject: [PATCH 13/13] Revert "`group_by()` can handle empty groups" This reverts commit bda39f32102620b6ec37332ee06a02f84e7c7302. --- R/step-group.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/step-group.R b/R/step-group.R index 10c03bc15..9e291c9ae 100644 --- a/R/step-group.R +++ b/R/step-group.R @@ -84,7 +84,7 @@ group_by.dtplyr_step <- function(.data, ..., .add = FALSE, add = deprecated(), a dots[!existing] <- syms(names(dots[!existing])) } - groups <- c(if (.add) .data$groups, names(dots)) %||% character() + groups <- c(if (.add) .data$groups, names(dots)) arranged <- if (!is.null(.data$arrange)) .data$arrange && arrange else arrange step_group(.data, groups, arranged)