The package adds the following vector functions
# fmode returns the statistical mode
fmode(c(1, 2, 2))
fmode(c(1, 2))
fmode(c(NA, NA, 1))
fmode(c(NA, NA, 1), na.rm = TRUE)
# fbin creates an integer variable for quantile categories (corresponds to Stata xtile)
v <- c(NA, 1:10)
fbin(v, n_quantiles = 3) # 3 groups based on terciles
fbin(v, probs = c(0.3, 0.7)) # 3 groups based on two quantiles
fbin(v, cutpoints = c(2, 3)) # 3 groups based on two cutpoints
# winsorize (default based on 5 x interquartile range)
v <- c(1:4, 99)
winsorize(v)
winsorize(v, replace = NA)
winsorize(v, probs = c(0.01, 0.99))
winsorize(v, cutpoints = c(1, 50))
# demean on multiple groups (i.e. multiple fixed effects)
demean(c(1,2), fe = c(1,1))
demean(c(NA,2), fe = list(c(1,2), c(1,3)))
demean(c(1,2), fe = list(c(NA,2), c(1,3)))
`setkeep` and `setdiscard` keep or discard certain columns in place
DT <- data.table(
id = c(1,2),
v1 = c(1,1),
v2 = c(2,1)
)
setkeep(DT, id, v2)
setkeep(DT, -id)
setdiscard(DT, v1)
keep and discard create a new table with certain columns
DT <- data.table(
id = c(1,2),
v1 = c(1,1),
v2 = c(2,1)
)
keep(DT, id, v2)
keep(DT, -id)
discard(DT, v1)
`keep_if` and `discard_if` create a new table with certain rows
DT <- data.table(
id = c(1,2,1),
v1 = c(1,NA,2)
)
keep_if(DT, v1 == 1)
keep_if(DT, v1 == min(v1), by = id)
discard_if(DT, v1 == 1)
discard_if(DT, v1 == min(v1), by = id)
`discard_if(condition)` differs from `keep_if(!(condition))`: it keeps rows where the condition evaluates to NA.

### Summarize
# sum_up prints detailed summary statistics (corresponds to Stata summarize)
N <- 100
DT <- data.table(
id = 1:N,
v1 = sample(5, N, TRUE),
v2 = sample(1e6, N, TRUE)
)
sum_up(DT)
sum_up(DT, v2, d = TRUE)
sum_up(DT, starts_with("v"), by = v1)
# duplicates returns duplicated rows
DT <- data.table(a = rep(1:2, each = 3), b = 1:6)
duplicates(DT, a)
duplicates(DT, a, b)
`graph` is a wrapper for ggplot2 functionalities, useful for interactive exploration of datasets
N <- 10000
DT <- data.table(
id = sample(c("id1","id2","id3"), N, TRUE),
v1 = sample(c(1:5), N, TRUE),
v2 = rnorm(N, sd = 20),
v3 = sample(runif(100, max=100), N, TRUE)
)
DT[, v4 := (id=="id1")* v2 + rnorm(N, sd = 5)]
graph(DT)
graph(DT, by = id)
graph(DT, by = id, type = "boxplot")
graph(DT, v3, v4, along_with = v2)
You can also regress one variable on another after partialling out the control variables specified in `formula`:
graph(DT, v3, along_with = v2, by = id, type = "felm", formula = ~v4|v1)
`join` is a wrapper for data.table merge functionalities.
| Stata | statar |
|---|---|
| merge v1 | `join(x, y, kind = "outer")` |
| merge v1, keep(master matched) | `join(x, y, kind = "left")` |
| merge v1, keep(matched using) | `join(x, y, kind = "right")` |
| merge v1, keep(matched) | `join(x, y, kind = "inner")` |
| merge v1, keep(matched) keepusing(v1) | `join(x, y, kind = "semi")` |
| merge v1, keep(master) keepusing(v1) | `join(x, y, kind = "anti")` |
| crossby | `join(x, y, kind = "cross")` |
```r
# merge m:1 v1
join(x, y, kind = "outer", check = m~1)
```
- The option “gen” specifies the name of a new variable that identifies non matched and matched rows (as in Stata).
```r
# merge m:1 v1, gen(_merge)
join(x, y, kind = "outer", gen = "_merge")
```
Functions with the prefix `set` modify the input data.table in place. Functions select variables using a syntax similar to dplyr's. Each function has a version that accepts strings, formulas or quoted expressions: its name is the original function's name with the suffix `_` (see the dplyr vignette for more details). For instance, the SE version of `sum_up` is `sum_up_`.
# NSE version
sum_up(DT, list(v2, v3), by = list(id,v1))
# SE version
sum_up_(DT, c("v2","v3"), by = c("id","v1"))