Helper to build data.table-capable non-SQL nodes.

rq_df_funciton_node(
  .,
  f,
  ...,
  f_db = NULL,
  columns_produced,
  display_form,
  orig_columns = FALSE
)

Arguments

.

relop (rquery operator tree) or data.frame input.

f

function that takes a data.table to a data.frame (or data.table).

...

force later arguments to bind by name.

f_db

implementation signature: f_db(db, incoming_table_name, outgoing_table_name, nd, ...) (db being a database handle). NULL defaults to using f.

columns_produced

character vector naming the columns produced by f.

display_form

display form for node.

orig_columns

logical; if TRUE, assume all input columns are also present in the derived table.

Value

relop non-sql node implementation.

Examples


# A node generator is something an expert can write once and that
# part-time R users can then drop into their own pipelines.
#
# grouped_regression_node() builds a non-SQL node that fits one linear
# regression of `yvar` on `xvar` for each distinct value of `group_col`
# and stacks the per-group coefficient tables into a single table.
#
# Arguments:
#   .          incoming pipeline stage (or data.frame); handed unchanged to
#              rq_df_funciton_node().
#   group_col  name of the column whose values define the groups.
#   xvar       name of the explanatory variable (right side of the formula).
#   yvar       name of the response variable (left side of the formula).
#
# Returns the value of rq_df_funciton_node() for the constructed node.
grouped_regression_node <- function(., group_col = "group", xvar = "x", yvar = "y") {
  # Evaluate the argument now so the closure `f` below captures its value
  # rather than a still-pending promise.
  force(group_col)
  # Building the string here also evaluates xvar and yvar up front.
  formula_str <- paste(yvar, "~", xvar)

  # Per-table worker: one lm() fit per group, returned as stacked
  # coefficient rows. `nd` is accepted for the node-function signature
  # but is not used here.
  f <- function(df, nd = NULL) {
    dlist <- split(df, df[[group_col]])
    clist <- lapply(
      dlist,
      function(di) {
        mi <- stats::lm(stats::as.formula(formula_str), data = di)
        ci <- as.data.frame(summary(mi)$coefficients)
        # Move the term names out of the row names into a real column so
        # they survive the row-binding step.
        ci$Variable <- rownames(ci)
        rownames(ci) <- NULL
        # Every row of `di` shares one group value; record it on each
        # coefficient row.
        ci[[group_col]] <- di[[group_col]][[1]]
        ci
      }
    )
    data.table::rbindlist(clist)
  }

  # Declared in the same order `f` emits them: the four coefficient-table
  # columns, then the term name, then the grouping column.
  columns_produced <- c(
    "Estimate", "Std. Error", "t value", "Pr(>|t|)", "Variable", group_col
  )

  rq_df_funciton_node(
    ., f,
    columns_produced = columns_produced,
    display_form = paste0(yvar, "~", xvar, " grouped by ", group_col)
  )
}

# work an example
# Fix the RNG seed so the simulated data, and therefore the coefficients
# printed below, are reproducible.
set.seed(3265)
# 1000 rows: x and y are independent standard-normal draws, and group is a
# label sampled with replacement from "a".."e".
d <- data.frame(x = rnorm(1000),
                y = rnorm(1000),
                group = sample(letters[1:5], 1000, replace = TRUE),
                stringsAsFactors = FALSE)

# Build the pipeline: start from a table description of d and append the
# grouped-regression node produced by the generator defined above
# (default arguments: group_col = "group", xvar = "x", yvar = "y").
rquery_pipeline <- local_td(d) %.>%
  grouped_regression_node(.)

# Print the pipeline's text form; the custom node is rendered using its
# display_form string.
cat(format(rquery_pipeline))
#> mk_td("d", c(
#>   "x",
#>   "y",
#>   "group")) %.>%
#>  non_sql_node(., y~x grouped by group)

# Apply the pipeline to the in-memory data.frame. The result has two rows
# per group: one for the intercept and one for the slope on x.
d %.>% rquery_pipeline
#>       Estimate Std. Error    t value   Pr(>|t|)    Variable group
#> 1   0.05921097 0.06246165  0.9479572 0.34421552 (Intercept)     a
#> 2  -0.02301646 0.06093971 -0.3776924 0.70603174           x     a
#> 3   0.09793586 0.06666844  1.4689988 0.14335117 (Intercept)     b
#> 4   0.05703537 0.06963630  0.8190466 0.41370179           x     b
#> 5  -0.05184909 0.07556010 -0.6861967 0.49348193 (Intercept)     c
#> 6   0.05554476 0.08019680  0.6926057 0.48945965           x     c
#> 7   0.15331654 0.07004124  2.1889469 0.02985964 (Intercept)     d
#> 8   0.02056881 0.06921107  0.2971896 0.76665700           x     d
#> 9   0.02250647 0.06919627  0.3252556 0.74531773 (Intercept)     e
#> 10 -0.08785792 0.06864886 -1.2798162 0.20204920           x     e