title: "statar"
author: "Matthieu Gomez"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Summary function}
%\VignetteEngine{knitr::rmarkdown}

%\usepackage[utf8]{inputenc}

Summary functions

Vector functions

The package adds the following vector functions

# sample_mode returns the statistical mode (the most frequent value) of a vector
sample_mode(c(1, 2, 2))  # 2 occurs most often
sample_mode(c(1, 2))  # tie: 1 and 2 each occur once
sample_mode(c(NA, NA, 1))  # NA is the most frequent entry of this input
sample_mode(c(NA, NA, 1), na.rm = TRUE)  # na.rm = TRUE: presumably ignores NAs, as in base R

# bin() assigns each element to an integer quantile category (Stata's xtile).
# The example vector holds one missing value followed by the integers 1 to 10.
v <- c(NA, seq_len(10))

# 3 groups based on terciles
bin(v, n_quantiles = 3)
# 3 groups based on two quantiles
bin(v, probs = c(0.3, 0.7))
# 3 groups based on two cutpoints
bin(v, cutpoints = c(2, 3))

# winsorize (default based on 5 x interquartile range)
# Example vector: 1 to 4 plus one extreme value (99)
v <- c(1:4, 99)
winsorize(v)  # default rule
winsorize(v, replace = NA)  # presumably replaces outliers by NA instead of the bounds -- TODO confirm
winsorize(v, probs = c(0.01, 0.99))  # bounds given as quantile probabilities
winsorize(v, cutpoints = c(1, 50))  # bounds given as explicit values

# demean() removes group means; `fe` takes one grouping vector or a list of
# several (i.e. multiple fixed effects).

# a single fixed effect
demean(c(1, 2), fe = c(1, 1))
# two fixed effects, missing value in the demeaned vector
demean(c(NA, 2), fe = list(c(1, 2), c(1, 3)))
# two fixed effects, missing value in one of the grouping vectors
demean(c(1, 2), fe = list(c(NA, 2), c(1, 3)))

data.table functions

Keep and remove

setkeep and setdiscard keep or remove certain columns in place

# setkeep() and setdiscard() modify their input in place, so every example
# starts again from a fresh copy of the same table. Chaining the three calls on
# one table would make the later calls refer to columns that an earlier call
# has already removed.
base_DT <- data.table(
  id = c(1, 2),
  v1 = c(1, 1),
  v2 = c(2, 1)
)

DT <- copy(base_DT)
setkeep(DT, id, v2)  # DT now holds only id and v2

DT <- copy(base_DT)
setkeep(DT, -id)  # dplyr-style negative selection: every column except id

DT <- copy(base_DT)
setdiscard(DT, v1)  # DT now holds every column except v1

keep and discard create a new table with certain columns

# Two-row table with an id and two value columns
DT <- data.table(id = c(1, 2), v1 = c(1, 1), v2 = c(2, 1))

# select columns by name
keep(DT, id, v2)
# select with a negated name
keep(DT, -id)
# remove a column by name
discard(DT, v1)

keep_if and discard_if create a new table with certain rows

# Three-row table; the row with id 2 has a missing v1, so any condition on v1
# evaluates to NA for it.
DT <- data.table(id = c(1, 2, 1), v1 = c(1, NA, 2))

# row filters on a plain condition, then on a condition evaluated within id
keep_if(DT, v1 == 1)
keep_if(DT, v1 == min(v1), by = id)

# same conditions, used to remove rows instead
discard_if(DT, v1 == 1)
discard_if(DT, v1 == min(v1), by = id)

discard_if(condition) differs from keep_if(!(condition)): it keeps rows where the condition evaluates to NA.

Summarize

# sum_up() prints detailed summary statistics (Stata's summarize).
# Example data: 100 rows, a small discrete variable v1 and a wide-range v2.
N <- 100
DT <- data.table(
  id = seq_len(N),
  v1 = sample(5, N, replace = TRUE),
  v2 = sample(1e6, N, replace = TRUE)
)

# every column
sum_up(DT)
# one column, with d = TRUE
sum_up(DT, v2, d = TRUE)
# columns whose name starts with "v", by value of v1
sum_up(DT, starts_with("v"), by = v1)


# duplicates returns duplicated rows
# Example data: a takes each of the values 1 and 2 three times; b is unique per row
DT <- data.table(a = rep(1:2, each = 3), b = 1:6)
duplicates(DT, a)  # duplicates with respect to a alone
duplicates(DT, a, b)  # duplicates with respect to the pair (a, b), which is unique in this table

Visual exploration

graph is a wrapper for ggplot2 functionalities, useful for interactive exploration of datasets

# Simulated data: a three-level id, a discrete v1 and two continuous
# variables v2 and v3.
N <- 10000
DT <- data.table(
  id = sample(c("id1", "id2", "id3"), N, replace = TRUE),
  v1 = sample(1:5, N, replace = TRUE),
  v2 = rnorm(N, sd = 20),
  v3 = sample(runif(100, max = 100), N, replace = TRUE)
)
# v4 tracks v2 (plus noise) on rows where id is "id1" and is pure noise elsewhere
DT[, v4 := (id == "id1") * v2 + rnorm(N, sd = 5)]

# whole table
graph(DT)

# split by id
graph(DT, by = id)

# split by id, drawn as boxplots
graph(DT, by = id, type = "boxplot")

# v3 and v4 along with v2
graph(DT, list(v3, v4), along_with = v2)

# v3 and v4 along with v2, split by id, with type "loess"
graph(DT, list(v3, v4), along_with = v2, by = id, type = "loess")

Join

join is a wrapper for data.table merge functionalities.

Syntax

Functions with the prefix set modify the input data.table in place.
Functions select variables with a syntax similar to dplyr's. Each function also has a version that accepts strings, formulas or quoted expressions: its name is the original function's name with the suffix _ (see the dplyr vignette for more details). For instance, the SE version of sum_up is sum_up_.

# Non-standard evaluation: columns are given as bare names
sum_up(DT, list(v2, v3), by = list(id, v1))
# Standard evaluation (trailing underscore): columns are given as strings
sum_up_(DT, c("v2", "v3"), by = c("id", "v1"))