From 27f72a0237eddd1704700d49e14b11aa657fea9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Mon, 29 Aug 2022 04:07:17 +0200 Subject: [PATCH 1/5] Framework --- R/subsetting.R | 6 ++++++ src/init.c | 1 + src/tbl_subassign_col.c | 5 +++++ src/tibble.h | 1 + 4 files changed, 13 insertions(+) create mode 100644 src/tbl_subassign_col.c diff --git a/R/subsetting.R b/R/subsetting.R index 4b1b4c985..8cc79c1c9 100644 --- a/R/subsetting.R +++ b/R/subsetting.R @@ -636,6 +636,12 @@ is_tight_sequence_at_end <- function(i_new, n) { } tbl_subassign_col <- function(x, j, value) { + c <- .Call(`tibble_tbl_subassign_col`, x, j, value) + r <- tbl_subassign_col_r(x, j, value) + r +} + +tbl_subassign_col_r <- function(x, j, value) { nrow <- fast_nrow(x) # Grow, assign new names diff --git a/src/init.c b/src/init.c index 4588296a2..79c3ce574 100644 --- a/src/init.c +++ b/src/init.c @@ -9,6 +9,7 @@ static const R_CallMethodDef CallEntries[] = { {"tibble_string_to_indices", (DL_FUNC) &tibble_string_to_indices, 1}, {"tibble_restore_impl", (DL_FUNC) &tibble_restore_impl, 2}, {"tibble_need_coerce", (DL_FUNC) &tibble_need_coerce, 1}, + {"tibble_tbl_subassign_col", (DL_FUNC) &tbl_subassign_col, 3}, {NULL, NULL, 0} }; diff --git a/src/tbl_subassign_col.c b/src/tbl_subassign_col.c new file mode 100644 index 000000000..201b0dd8d --- /dev/null +++ b/src/tbl_subassign_col.c @@ -0,0 +1,5 @@ +#include "tibble.h" + +SEXP tbl_subassign_col(SEXP x, SEXP j, SEXP value) { + return R_NilValue; +} diff --git a/src/tibble.h b/src/tibble.h index b1c762fb2..beffbb8ad 100644 --- a/src/tibble.h +++ b/src/tibble.h @@ -9,5 +9,6 @@ SEXP tibble_string_to_indices(SEXP x); SEXP tibble_need_coerce(SEXP x); SEXP tibble_update_attrs(SEXP x, SEXP dots); SEXP tibble_restore_impl(SEXP xo, SEXP x); +SEXP tbl_subassign_col(SEXP x, SEXP j, SEXP value); #endif /* TIBBLE_H */ From 68da28620b58d775af906f537011b44f3117fc6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Mon, 29 Aug 2022 05:24:14 +0200 Subject: [PATCH 2/5] Refactor: order j --- R/subsetting.R | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/R/subsetting.R b/R/subsetting.R index 8cc79c1c9..39cc38cc4 100644 --- a/R/subsetting.R +++ b/R/subsetting.R @@ -646,11 +646,22 @@ tbl_subassign_col_r <- function(x, j, value) { # Grow, assign new names new <- attr(j, "new") + if (is.null(new)) { + n_new <- 0 + } else { + n_new <- max(j[new]) - length(x) + } + + # Grow, assign new names if (!is.null(new)) { - length(x) <- max(j[new]) + length(x) <- length(x) + n_new names(x)[j[new]] <- names2(j)[new] } + order_j <- order(j) + value <- value[order_j] + j <- j[order_j] + # Update to_remove <- integer() for (jj in seq_along(value)) { From 8128870ec83d1f058b4ee5bc2910a3fe43c1e111 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Tue, 30 Aug 2022 04:32:30 +0200 Subject: [PATCH 3/5] Refactor: copy only once --- R/subsetting.R | 72 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/R/subsetting.R b/R/subsetting.R index 39cc38cc4..ed35a6c32 100644 --- a/R/subsetting.R +++ b/R/subsetting.R @@ -636,7 +636,13 @@ is_tight_sequence_at_end <- function(i_new, n) { } tbl_subassign_col <- function(x, j, value) { - c <- .Call(`tibble_tbl_subassign_col`, x, j, value) + if (length(j) > 1) { + order_j <- order(j) + value <- value[order_j] + j <- j[order_j] + } + + # c <- .Call(`tibble_tbl_subassign_col`, x, j, value) r <- tbl_subassign_col_r(x, j, value) r } @@ -644,44 +650,52 @@ tbl_subassign_col <- function(x, j, value) { tbl_subassign_col_r <- function(x, j, value) { nrow <- fast_nrow(x) - # Grow, assign new names - new <- attr(j, "new") - if (is.null(new)) { - n_new <- 0 - } else { - n_new <- max(j[new]) - length(x) + j_max <- max(c(j, length(x))) + n_old <- sum(vapply(value, is.null, NA)) + + xo_idx <- integer(j_max - n_old) + + jj <- 1 + xoj <- 1 + xj <- 1 + while(xoj <= length(xo_idx)) { + if (jj <= length(j) && xj == j[[jj]]) { + xj <- xj + 1 + if (!is.null(value[[jj]])) { + xo_idx[[xoj]] <- -jj + xoj <- xoj + 1 + } + jj <- jj + 1 + } else { + xo_idx[[xoj]] <- xj + xoj <- xoj + 1 + xj <- xj + 1 + } } - # Grow, assign new names - if (!is.null(new)) { - length(x) <- length(x) + n_new - names(x)[j[new]] <- names2(j)[new] + while(jj <= length(j)) { + stopifnot(is.null(value[[jj]])) + jj <- jj + 1 } - order_j <- order(j) - value <- value[order_j] - j <- j[order_j] + xo <- vector("list", length(xo_idx)) + mostattributes(xo) <- attributes(x) + names(xo) <- rep("", length(xo_idx)) - # Update - to_remove <- integer() - for (jj in seq_along(value)) { - ji <- j[[jj]] - value_jj <- value[[jj]] - if (!is.null(value_jj)) { - x[[ji]] <- value_jj + for (xoj in seq_along(xo_idx)) { + xj <- xo_idx[[xoj]] + if (xj > 0) { + xo[[xoj]] <- x[[xj]] + names(xo)[[xoj]] <- names(x)[[xj]] } else { - to_remove <- c(to_remove, ji) + xo[[xoj]] <- value[[-xj]] + names(xo)[[xoj]] <- names(j)[[-xj]] } } - # Remove - if (length(to_remove) > 0) { - x <- x[-to_remove] - } - # Can be destroyed by setting length - attr(x, "row.names") <- .set_row_names(nrow) - x + attr(xo, "row.names") <- .set_row_names(nrow) + xo } tbl_expand_to_nrow <- function(x, i) { From be92ee834f18ff7a2ca4f335a4aed91cebbfa976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Tue, 30 Aug 2022 11:45:47 +0200 Subject: [PATCH 4/5] Implement tbl_subassign_col in R --- R/subsetting.R | 55 +-------------------- src/tbl_subassign_col.c | 106 +++++++++++++++++++++++++++++++++++++++- src/tibble.h | 2 + 3 files changed, 107 insertions(+), 56 deletions(-) diff --git a/R/subsetting.R b/R/subsetting.R index ed35a6c32..e849482dd 100644 --- a/R/subsetting.R +++ b/R/subsetting.R @@ -642,60 +642,7 @@ tbl_subassign_col <- function(x, j, value) { j <- j[order_j] } - # c <- .Call(`tibble_tbl_subassign_col`, x, j, value) - r <- tbl_subassign_col_r(x, j, value) - r -} - -tbl_subassign_col_r <- function(x, j, value) { - nrow <- fast_nrow(x) - - j_max <- max(c(j, length(x))) - n_old <- sum(vapply(value, is.null, NA)) - - xo_idx <- integer(j_max - n_old) - - jj <- 1 - xoj <- 1 - xj <- 1 - while(xoj <= length(xo_idx)) { - if (jj <= length(j) && xj == j[[jj]]) { - xj <- xj + 1 - if (!is.null(value[[jj]])) { - xo_idx[[xoj]] <- -jj - xoj <- xoj + 1 - } - jj <- jj + 1 - } else { - xo_idx[[xoj]] <- xj - xoj <- xoj + 1 - xj <- xj + 1 - } - } - - while(jj <= length(j)) { - stopifnot(is.null(value[[jj]])) - jj <- jj + 1 - } - - xo <- vector("list", length(xo_idx)) - mostattributes(xo) <- attributes(x) - names(xo) <- rep("", length(xo_idx)) - - for (xoj in seq_along(xo_idx)) { - xj <- xo_idx[[xoj]] - if (xj > 0) { - xo[[xoj]] <- x[[xj]] - names(xo)[[xoj]] <- names(x)[[xj]] - } else { - xo[[xoj]] <- value[[-xj]] - names(xo)[[xoj]] <- names(j)[[-xj]] - } - } - - # Can be destroyed by setting length - attr(xo, "row.names") <- .set_row_names(nrow) - xo + .Call(`tibble_tbl_subassign_col`, x, j, value) } tbl_expand_to_nrow <- function(x, i) { diff --git a/src/tbl_subassign_col.c b/src/tbl_subassign_col.c index 201b0dd8d..4c14297f9 100644 --- a/src/tbl_subassign_col.c +++ b/src/tbl_subassign_col.c @@ -1,5 +1,107 @@ #include "tibble.h" -SEXP tbl_subassign_col(SEXP x, SEXP j, SEXP value) { - return R_NilValue; +R_xlen_t fast_nrow(SEXP x); + +SEXP tbl_subassign_col(SEXP x, SEXP j_, SEXP value) { + R_xlen_t nrow = fast_nrow(x); + R_xlen_t n_x = Rf_xlength(x); + + R_xlen_t n_j = Rf_xlength(j_); + R_xlen_t* j = (R_xlen_t*)R_alloc(n_j + 1, sizeof(R_xlen_t)); + if (TYPEOF(j_) == INTSXP) { + const int* j_int = INTEGER(j_); + for (R_xlen_t jj = 0; jj < n_j; ++jj) { + j[jj] = j_int[jj] - 1; + } + } else if (TYPEOF(j_) == REALSXP) { + const double* j_real = REAL(j_); + for (R_xlen_t jj = 0; jj < n_j; ++jj) { + j[jj] = (R_xlen_t)j_real[jj] - 1; + } + } else { + Rf_error("Internal: tbl_subassign_col: invalid type for j_."); + } + + // Add sentinel value + j[n_j] = -1; + + // Compute n_new and n_old + R_xlen_t j_max = n_x; + R_xlen_t n_old = 0; + for (R_xlen_t jj = 0; jj < n_j; ++jj) { + if (j_max <= j[jj]) { + j_max = j[jj] + 1; + } + if (VECTOR_ELT(value, jj) == R_NilValue) { + ++n_old; + } + } + + R_xlen_t n_xo = j_max - n_old; + + R_xlen_t* xo_idx = (R_xlen_t*)R_alloc(n_xo, sizeof(R_xlen_t)); + bool* xo_is_value = (bool*)R_alloc(n_xo, sizeof(bool)); + + for (R_xlen_t jj = 0, xoj = 0, xj = 0; xoj < n_xo; ++xj) { + bool is_value = (xj == j[jj]); + xo_is_value[xoj] = is_value; + + if (is_value) { + if (VECTOR_ELT(value, jj) != R_NilValue) { + xo_idx[xoj++] = jj; + } + jj++; + } else { + xo_idx[xoj++] = xj; + } + } + + SEXP xo = Rf_allocVector(VECSXP, n_xo); + Rf_copyMostAttrib(x, xo); + + SEXP xo_names = Rf_allocVector(STRSXP, n_xo); + SEXP j_names = Rf_getAttrib(j_, R_NamesSymbol); + SEXP x_names = Rf_getAttrib(x, R_NamesSymbol); + + for (R_xlen_t xoj = 0; xoj < n_xo; ++xoj) { + R_xlen_t xj = xo_idx[xoj]; + if (xo_is_value[xoj]) { + SET_VECTOR_ELT(xo, xoj, VECTOR_ELT(value, xj)); + SET_STRING_ELT(xo_names, xoj, STRING_ELT(j_names, xj)); + } else { + if (xj >= n_x) { + Rf_error("Internal: tbl_subassign_col: x_j >= n_x, %d >= %d", xj, n_x); + } + SET_VECTOR_ELT(xo, xoj, VECTOR_ELT(x, xj)); + SET_STRING_ELT(xo_names, xoj, STRING_ELT(x_names, xj)); + } + } + + Rf_setAttrib(xo, R_NamesSymbol, xo_names); + SEXP new_row_names_ = Rf_allocVector(INTSXP, 2); + int* new_row_names = INTEGER(new_row_names_); + + new_row_names[0] = NA_INTEGER; + new_row_names[1] = -nrow; + Rf_setAttrib(xo, R_RowNamesSymbol, new_row_names_); + + return xo; +} + +R_xlen_t fast_nrow(SEXP x) { + SEXP row_names = Rf_getAttrib(x, R_RowNamesSymbol); + + if (Rf_xlength(row_names) == 0) { + return 0; + } + + if (TYPEOF(row_names) == INTSXP && INTEGER_ELT(row_names, 0) == NA_INTEGER) { + int out = INTEGER_ELT(row_names, 1); + if (out < 0) { + out = -out; + } + return out; + } else { + return Rf_xlength(row_names); + } } diff --git a/src/tibble.h b/src/tibble.h index beffbb8ad..5fb0f8061 100644 --- a/src/tibble.h +++ b/src/tibble.h @@ -4,6 +4,8 @@ #define R_NO_REMAP #include +#include "stdbool.h" + SEXP tibble_matrixToDataFrame(SEXP xSEXP); SEXP tibble_string_to_indices(SEXP x); SEXP tibble_need_coerce(SEXP x); From a87aba53c600753e79f4071a3e812d4515adac6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 3 Sep 2022 05:30:32 +0200 Subject: [PATCH 5/5] Comments --- src/tbl_subassign_col.c | 78 +++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/src/tbl_subassign_col.c b/src/tbl_subassign_col.c index 4c14297f9..184914a0b 100644 --- a/src/tbl_subassign_col.c +++ b/src/tbl_subassign_col.c @@ -2,21 +2,36 @@ R_xlen_t fast_nrow(SEXP x); +// Performs a variant of x[j] <- value, growing x as necessary +// NULL values are supported +// First, creates the storage for the resulting x +// Then, populates that storage by copying the correct values exactly once +// +// Requires j to be sorted on input SEXP tbl_subassign_col(SEXP x, SEXP j_, SEXP value) { + // Naming conventions: + // n_x: length of x + // x_idx: index into vector x + // x_names: names of x + // n_old: number of elements with some property + // j_max: maximum value of j + // j_/j: SEXP and native version of bare vector + // xo: output vector R_xlen_t nrow = fast_nrow(x); R_xlen_t n_x = Rf_xlength(x); + // Compute j from j_, converting 1-based j_ to zero-based j R_xlen_t n_j = Rf_xlength(j_); R_xlen_t* j = (R_xlen_t*)R_alloc(n_j + 1, sizeof(R_xlen_t)); if (TYPEOF(j_) == INTSXP) { const int* j_int = INTEGER(j_); - for (R_xlen_t jj = 0; jj < n_j; ++jj) { - j[jj] = j_int[jj] - 1; + for (R_xlen_t j_idx = 0; j_idx < n_j; ++j_idx) { + j[j_idx] = j_int[j_idx] - 1; } } else if (TYPEOF(j_) == REALSXP) { const double* j_real = REAL(j_); - for (R_xlen_t jj = 0; jj < n_j; ++jj) { - j[jj] = (R_xlen_t)j_real[jj] - 1; + for (R_xlen_t j_idx = 0; j_idx < n_j; ++j_idx) { + j[j_idx] = (R_xlen_t)j_real[j_idx] - 1; } } else { Rf_error("Internal: tbl_subassign_col: invalid type for j_."); @@ -25,37 +40,42 @@ SEXP tbl_subassign_col(SEXP x, SEXP j_, SEXP value) { // Add sentinel value j[n_j] = -1; - // Compute n_new and n_old + // Compute j_max and n_old R_xlen_t j_max = n_x; R_xlen_t n_old = 0; - for (R_xlen_t jj = 0; jj < n_j; ++jj) { - if (j_max <= j[jj]) { - j_max = j[jj] + 1; + for (R_xlen_t j_idx = 0; j_idx < n_j; ++j_idx) { + if (j_max <= j[j_idx]) { + j_max = j[j_idx] + 1; } - if (VECTOR_ELT(value, jj) == R_NilValue) { + if (VECTOR_ELT(value, j_idx) == R_NilValue) { ++n_old; } } R_xlen_t n_xo = j_max - n_old; - R_xlen_t* xo_idx = (R_xlen_t*)R_alloc(n_xo, sizeof(R_xlen_t)); + // For each target element, position of source element in the x or value vectors + R_xlen_t* xo_src_idx = (R_xlen_t*)R_alloc(n_xo, sizeof(R_xlen_t)); bool* xo_is_value = (bool*)R_alloc(n_xo, sizeof(bool)); - for (R_xlen_t jj = 0, xoj = 0, xj = 0; xoj < n_xo; ++xj) { - bool is_value = (xj == j[jj]); - xo_is_value[xoj] = is_value; + for (R_xlen_t j_idx = 0, xo_idx = 0, x_idx = 0; xo_idx < n_xo; ++x_idx) { + // Is the next xo element taken from x or from value? + bool is_value = (x_idx == j[j_idx]); + xo_is_value[xo_idx] = is_value; if (is_value) { - if (VECTOR_ELT(value, jj) != R_NilValue) { - xo_idx[xoj++] = jj; + // If the next value is not NULL, use it, otherwise skip it + if (VECTOR_ELT(value, j_idx) != R_NilValue) { + xo_src_idx[xo_idx++] = j_idx; } - jj++; + j_idx++; } else { - xo_idx[xoj++] = xj; + // Use next x + xo_src_idx[xo_idx++] = x_idx; } } + // Allocate output vector and output names with final size SEXP xo = Rf_allocVector(VECSXP, n_xo); Rf_copyMostAttrib(x, xo); @@ -63,24 +83,28 @@ SEXP tbl_subassign_col(SEXP x, SEXP j_, SEXP value) { SEXP j_names = Rf_getAttrib(j_, R_NamesSymbol); SEXP x_names = Rf_getAttrib(x, R_NamesSymbol); - for (R_xlen_t xoj = 0; xoj < n_xo; ++xoj) { - R_xlen_t xj = xo_idx[xoj]; - if (xo_is_value[xoj]) { - SET_VECTOR_ELT(xo, xoj, VECTOR_ELT(value, xj)); - SET_STRING_ELT(xo_names, xoj, STRING_ELT(j_names, xj)); + // Populate xo, copying from x or value as planned + for (R_xlen_t xo_idx = 0; xo_idx < n_xo; ++xo_idx) { + R_xlen_t x_idx = xo_src_idx[xo_idx]; + if (xo_is_value[xo_idx]) { + SET_VECTOR_ELT(xo, xo_idx, VECTOR_ELT(value, x_idx)); + SET_STRING_ELT(xo_names, xo_idx, STRING_ELT(j_names, x_idx)); } else { - if (xj >= n_x) { - Rf_error("Internal: tbl_subassign_col: x_j >= n_x, %d >= %d", xj, n_x); + if (x_idx >= n_x) { + Rf_error("Internal: tbl_subassign_col: x_j >= n_x, %d >= %d", x_idx, n_x); } - SET_VECTOR_ELT(xo, xoj, VECTOR_ELT(x, xj)); - SET_STRING_ELT(xo_names, xoj, STRING_ELT(x_names, xj)); + SET_VECTOR_ELT(xo, xo_idx, VECTOR_ELT(x, x_idx)); + SET_STRING_ELT(xo_names, xo_idx, STRING_ELT(x_names, x_idx)); } } + // Set output names Rf_setAttrib(xo, R_NamesSymbol, xo_names); + + // Set output row names + // FIXME: Reuse original vector? SEXP new_row_names_ = Rf_allocVector(INTSXP, 2); int* new_row_names = INTEGER(new_row_names_); - new_row_names[0] = NA_INTEGER; new_row_names[1] = -nrow; Rf_setAttrib(xo, R_RowNamesSymbol, new_row_names_);