The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
This vignette replicates the figures found in “The q-q boxplot” (citation coming soon).
First load the ‘qqboxplot’ package and the relevant packages from the ‘tidyverse’.
library(dplyr)
library(ggplot2)
library(qqboxplot)
Figure 1 is meant to be a quick comparison of boxplots, q-q plots, and q-q boxplots. It uses a random sample of genes from one autism and one control patient and determines if the log of the gene expression counts can be modeled as lognormal.
#set value for text size so consistent across figures (only applies to figure 1)
<- 12
eltext
#q-q boxplot
<- expression_data %>%
qqbox ggplot(aes(specimen, log_count)) + geom_qqboxplot(varwidth = TRUE, notch = TRUE) +
ylab('logged normalized expression') + ggtitle("c) q-q boxplot") +
theme(plot.title=element_text(size=eltext, face="plain", hjust=0.5), axis.title.x = element_text(size=eltext), axis.title.y = element_text(size=eltext),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid.major = element_line(colour = "grey70"),
panel.grid.minor = element_line(colour="grey80"))
# regular boxplot
<- expression_data %>%
box ggplot(aes(specimen, log_count)) + geom_boxplot(varwidth = TRUE, notch = TRUE) +
ylab('logged normalized expression') + ggtitle('a) boxplot') +
theme(plot.title=element_text(size=eltext, face="plain", hjust=0.5), axis.title.x = element_text(size=eltext), axis.title.y = element_text(size=eltext),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid.major = element_line(colour = "grey70"),
panel.grid.minor = element_line(colour="grey80"))
<- c(16, 17)
override.shape #q-q plot
<- expression_data %>%
qq ggplot(aes(sample=log_count)) + geom_qq(aes(color=specimen, shape=specimen)) +
xlab('theoretical normal distribution') +
ylab('logged normalized expression') + ggtitle('b) q-q plot') +
labs(color="specimen") +
guides(color = guide_legend(override.aes = list(shape=override.shape)), shape=FALSE) +
theme(plot.title=element_text(size=eltext, face="plain", hjust=0.5), axis.title.x = element_text(size=eltext), axis.title.y = element_text(size=eltext),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid.major = element_line(colour = "grey70"),
panel.grid.minor = element_line(colour="grey80"),
legend.position = c(0.8, 0.2))
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
library(gridExtra)
#>
#> Attaching package: 'gridExtra'
#> The following object is masked from 'package:dplyr':
#>
#> combine
# combine the plots
::grid.arrange(box, qq, qqbox, ncol=3) gridExtra
Figure 2 in the paper shows boxplots for simulated t-distributions
(and one simulated normal distribution with mean=2). simulated_data
contains two columns, “y” and “group”.
“group” specifies the distribution the data (“y”) comes from.
%>%
simulated_data ggplot(aes(factor(group, levels=c("normal, mean=2", "t distribution, df=32", "t distribution, df=16", "t distribution, df=8", "t distribution, df=4")), y=y)) +
geom_boxplot(notch=TRUE, varwidth = TRUE) +
xlab(NULL) +
ylab(NULL) +
theme(axis.text.x = element_text(angle = 23, size = 15), axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Figure 3 is the same data, visualized with a q-q plot compared with the theoretical normal distribution.
<- c(16, 17, 15, 3, 7)
override.shape
%>% ggplot(aes(sample=y, color=factor(group, levels=c("normal, mean=2", "t distribution, df=32", "t distribution, df=16", "t distribution, df=8", "t distribution, df=4")),
simulated_data shape=factor(group, levels=c("normal, mean=2", "t distribution, df=32", "t distribution, df=16", "t distribution, df=8", "t distribution, df=4")))) +
geom_qq() + geom_qq_line() + labs(color="distribution") +
xlab("Normal Distribution") +
ylab("Simulated Datasets") +
guides(color = guide_legend(override.aes = list(shape=override.shape)), shape=FALSE) +
theme(axis.title.x = element_text(size=15), axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
Figure 4 is the same data, visualized with q-q boxplots, compared with the theoretical normal distribution. Note in this figure that reference_dist = “norm” is chosen to specify that the normal distribution should be the reference distribution.
%>%
simulated_data ggplot(aes(factor(group, levels=c("normal, mean=2", "t distribution, df=32", "t distribution, df=16", "t distribution, df=8", "t distribution, df=4")), y=y)) +
geom_qqboxplot(notch=TRUE, varwidth = TRUE, reference_dist="norm") +
xlab("reference: normal distribution") +
ylab(NULL) +
guides(color=FALSE) +
theme(axis.text.x = element_text(angle = 23, size = 15), axis.title.y = element_text(size=15),
axis.title.x = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
simulated data was created by running the following code:
tibble(y=c(rnorm(1000, mean=2), rt(1000, 16), rt(500, 4),
rt(1000, 8), rt(1000, 32)),
group=c(rep("normal, mean=2", 1000),
rep("t distribution, df=16", 1000),
rep("t distribution, df=4", 500),
rep("t distribution, df=8", 1000),
rep("t distribution, df=32", 1000)))
Figure 5 shows the same data from Figure 4, but compared against a
simulated normal distribution, with mean=5 and variance=1. Note that the
reference dataset comparison_dataset
is a separate vector
and is not contained in the dataset simulated_data
.
%>%
simulated_data ggplot(aes(factor(group, levels=c("normal, mean=2", "t distribution, df=32", "t distribution, df=16", "t distribution, df=8", "t distribution, df=4")), y=y)) +
geom_qqboxplot(notch=TRUE, varwidth = TRUE, compdata=comparison_dataset) +
xlab("reference: simulated normal dataset") +
ylab(NULL) +
theme(axis.text.x = element_text(angle = 23, size = 15), axis.title.y = element_text(size=15),
axis.title.x = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
The vector comparison_dataset
was simulated as
follows
rnorm(1000, 5)
Figure 6 uses dataset “indicators” from this package. Male, 2008 serves as a base case. Note that the deviation whiskers for Male 2008 are straight since it is being compared with itself.
<- indicators %>% filter(year==2008 & `Series Code`=="SL.TLF.ACTI.1524.MA.NE.ZS")
comparison_data
%>%
indicators #change the labels in series name to shorter titles
mutate(`Series Name`= ifelse(
`Series Name`=="Labor force participation rate for ages 15-24, male (%) (national estimate)",
"Male ages 15-24",
"Female ages 15-24")) %>%
ggplot(aes(as.factor(year), y=indicator))+
geom_qqboxplot(notch=TRUE, varwidth = TRUE, compdata=comparison_data$indicator) +
xlab("Year") +
ylab("Country level labor force\nparticipation rate (%)") +
facet_wrap(~factor(`Series Name`, levels = c("Male ages 15-24", "Female ages 15-24"))) +
theme(strip.text = element_text(size=12), axis.text.x = element_text(size = 15), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=12),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Figure 7 is the same data visualized with a violin plot.
%>%
indicators #change the labels in series name to shorter titles
mutate(`Series Name`= ifelse(
`Series Name`=="Labor force participation rate for ages 15-24, male (%) (national estimate)",
"Male ages 15-24",
"Female ages 15-24")) %>%
ggplot()+
geom_violin(aes(x=factor(year),y=indicator),fill='grey',trim=F, draw_quantiles = c(.25, .5, .75))+
geom_segment(aes(
x=match(factor(year),levels(factor(year)))-0.1,
xend=match(factor(year),levels(factor(year)))+0.1,
y=indicator,yend=indicator),
col='black'
+
) xlab("Year") +
ylab("Country level labor force\nparticipation rate (%)") +
facet_wrap(~factor(`Series Name`, levels = c("Male ages 15-24", "Female ages 15-24"))) +
theme(strip.text = element_text(size=12), axis.text.x = element_text(size = 15), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=12),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Figure 8 looks at spike counts from the V1 region of the brain under various grating orientations shown to a anesthetized macaque. The question her is if the spike counts can for each orientation can be reasonably modeled by a normal distribution (i.e. is the firing rate high enough to be reasonably approximated by a Gaussian)
%>% filter(region=="V1") %>%
spike_data ggplot(aes(factor(orientation), nspikes)) +
geom_qqboxplot(notch=TRUE, varwidth = TRUE, reference_dist="norm") +
xlab("orientation") +
ylab("spike count") +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Same data, visualized with a bean plot.
%>% filter(region=="V1") %>%
spike_data ggplot()+
geom_violin(aes(x=factor(orientation),y=nspikes),fill='grey',trim=F, draw_quantiles = c(.25, .5, .75))+
geom_segment(aes(
x=match(factor(orientation),levels(factor(orientation)))-0.1,
xend=match(factor(orientation),levels(factor(orientation)))+0.1,
y=nspikes,yend=nspikes),
col='black'
+
) xlab("orientation") +
ylab("spike count") +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Figure 10 is the same setup but is looking at neurons in V2. Here we see that the firing rate is too low to be reasonably modeled as Gaussian.
%>% filter(region=="V2") %>%
spike_data ggplot(aes(factor(orientation), nspikes)) +
geom_qqboxplot(notch=TRUE, varwidth = TRUE, reference_dist="norm") +
xlab("orientation") +
ylab("spike count") +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Same data as Figure 10, visualized with a bean plot.
%>% filter(region=="V2") %>%
spike_data ggplot()+
geom_violin(aes(x=factor(orientation),y=nspikes),fill='grey',trim=F, draw_quantiles = c(.25, .5, .75))+
geom_segment(aes(
x=match(factor(orientation),levels(factor(orientation)))-0.1,
xend=match(factor(orientation),levels(factor(orientation)))+0.1,
y=nspikes,yend=nspikes),
col='black'
+
) xlab("orientation") +
ylab("spike count") +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Figure 12 look at the firing rate for a population of neurons and asks if the population can be reasonably modeled as lognormal. It appears that, in general, this is not a reasonable distributional assumption.
%>%
population_brain_data ggplot(aes(x=ecephys_structure_acronym, y=log(rate))) +
geom_qqboxplot(notch=TRUE, varwidth = TRUE, reference_dist="norm") +
xlab("Brain regions") +
ylab("Log firing rate") +
facet_wrap(~fr_type) +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14),
axis.title.x = element_text(size=15)
axis.title.y = element_text(size=15),
, panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"),
strip.text.x = element_text(size = 14))
Same data as Figure 12, visualized with a violin plot.
%>%
population_brain_data ggplot(aes(x=ecephys_structure_acronym, y=log(rate))) +
geom_violin(fill='grey',trim=F, draw_quantiles = c(.25, .5, .75))+
xlab("Brain regions") +
ylab("Log firing rate") +
facet_wrap(~fr_type) +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14),
axis.title.x = element_text(size=15)
axis.title.y = element_text(size=15),
, panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"),
strip.text.x = element_text(size = 14))
library(scales)
# set grid
<- seq(from=-2, to=2, length.out = 101)
h_grid <- seq(from=-2, to=2, length.out = 101)
g_grid <- c(-.67448, .67448)
z_grid
# transformation function
<- function(h, g, z){
zfun ifelse(g==0,
*exp(h*z^2/2),
zexp(h*z^2/2)*((exp(g*z)-1))/g)
}
# expand grid, calculate transformation of IQR. Group the data by g and h values
# and then calculate the IQR for the transformed data and the IQR for the normal
# distribution (the second is the same for all groups, but is calculated here
# to avoid hardcoding). Finally, compute the IQR ratio.
<- as_tibble(expand.grid(h=h_grid, g=g_grid, z=z_grid)) %>%
data mutate(zvals = zfun(h, g, z)) %>% group_by(h, g) %>%
summarize(iqr_mod = max(zvals) - min(zvals), iqr=max(z)-min(z)) %>%
ungroup() %>%
mutate(iqr_ratio = (iqr_mod) / iqr)
#> `summarise()` has grouped output by 'h'. You can override using the `.groups`
#> argument.
%>%
data ggplot(aes(g, h)) + geom_raster(aes(fill=iqr_ratio)) + scale_fill_gradient2(low=muted("blue"), mid="white", high=muted("red"), midpoint=1) +
guides(fill=guide_colorbar(title="iqr ratio")) +
theme(axis.title.x = element_text(size=15), axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.