This RMarkdown document demonstrates how key elements from the notebook for case study 1 in the EpiGraphDB paper can be achieved using the R package. For detailed explanations of the case study please refer to the paper or the case study notebook.
Mendelian randomization (MR), a technique to evaluate the causal role of modifiable exposures on a health outcome using a set of genetic instruments, tacitly assumes that a genetic variant (e.g. SNP) is only related to an outcome of interest through an exposure (i.e. the “exclusion restriction criterion”). Horizontal pleiotropy, where a SNP is associated with multiple phenotypes independently of the exposure of interest, potentially violates this assumption delivering misleading conclusions. In contrast, vertical pleiotropy, where a SNP is associated with multiple phenotypes on the same biological pathway, does not violate this assumption.
Here, we use external evidence on biological pathways and protein-protein interactions to assess the pleiotropic profile for a genetic variant based on the genes (and proteins) with which that variant is associated. In a graph representation, we will show that:
This case study goes as follows:
library("magrittr")
library("dplyr")
library("purrr")
library("glue")
library("igraph")
library("epigraphdb")
Here we configure the parameters used in the case study example. In this example we will look at the variant rs12720356, a SNP located in chromosome 19 that has been associated with Crohn’s disease and psoriasis.**
<- "rs12720356"
SNP
<- c("ZGLP1", "FDX1L", "MRPL4", "ICAM5", "TYK2", "GRAP2", "KRI1", "TMED1", "ICAM1")
GENELIST
<- 1 PPI_N_INTERMEDIATE_PROTEINS
The first step of the analysis is to map each of these genes to their protein product.
<- function(genelist) {
get_gene_protein <- "/mappings/gene-to-protein"
endpoint <- list(
params gene_name_list = genelist %>% I()
)<- query_epigraphdb(
r route = endpoint,
params = params,
mode = "table",
method = "POST"
)<- r
protein_df if (nrow(protein_df) > 0) {
<- protein_df %>%
res_df select(gene_name = `gene.name`, uniprot_id = `protein.uniprot_id`)
else {
} <- tibble() %>% set_names(c("gene_name", "uniprot_id"))
res_df
}
res_df
}
<- get_gene_protein(genelist = GENELIST)
gene_protein_df
gene_protein_df#> # A tibble: 9 x 2
#> gene_name uniprot_id
#> <chr> <chr>
#> 1 ZGLP1 P0C6A0
#> 2 FDX1L Q6P4F2
#> 3 MRPL4 Q9BYD3
#> 4 ICAM5 Q9UMF0
#> 5 TYK2 P29597
#> 6 GRAP2 O75791
#> 7 KRI1 Q8N9T8
#> 8 TMED1 Q13445
#> 9 ICAM1 P05362
For each protein we retrieve the pathways they are involved in.
<- function(gene_protein_df) {
get_protein_pathway <- "/protein/in-pathway"
endpoint <- list(
params uniprot_id_list = gene_protein_df %>% pull(`uniprot_id`) %>% I()
)<- query_epigraphdb(route = endpoint, params = params, mode = "table", method = "POST")
df
if (nrow(df) > 0) {
<- gene_protein_df %>%
res_df select(`uniprot_id`) %>%
left_join(df, by = c("uniprot_id"))
else {
} <- gene_protein_df %>%
res_df select(`uniprot_id`) %>%
mutate(pathway_count = NA_integer_, pathway_reactome_id = NA_character_)
}<- res_df %>%
res_df mutate(
pathway_count = ifelse(is.na(pathway_count), 0L, as.integer(pathway_count)),
pathway_reactome_id = ifelse(is.na(pathway_reactome_id), c(), pathway_reactome_id)
)
res_df
}<- get_protein_pathway(gene_protein_df = gene_protein_df)
pathway_df
pathway_df#> # A tibble: 9 x 3
#> uniprot_id pathway_count pathway_reactome_id
#> <chr> <int> <list>
#> 1 P0C6A0 0 <NULL>
#> 2 Q6P4F2 15 <chr [15]>
#> 3 Q9BYD3 6 <chr [6]>
#> 4 Q9UMF0 5 <chr [5]>
#> 5 P29597 28 <chr [28]>
#> 6 O75791 15 <chr [15]>
#> 7 Q8N9T8 0 <NULL>
#> 8 Q13445 0 <NULL>
#> 9 P05362 11 <chr [11]>
Now for each pair of proteins we match the pathways they have in common.
<- function(pathway_df) {
get_shared_pathway # For the protein-pathway data
# Get protein-protein permutations where they share pathways
<- function(pathway_df, permutation) {
per_permutation <- pathway_df %>% filter(uniprot_id %in% permutation)
df <- pathway_df %>%
primary_pathway filter(uniprot_id == permutation[1]) %>%
pull(pathway_reactome_id) %>%
unlist()
<- pathway_df %>%
assoc_pathway filter(uniprot_id == permutation[2]) %>%
pull(pathway_reactome_id) %>%
unlist()
intersect(primary_pathway, assoc_pathway)
}
<- pathway_df %>%
pairwise_permutations pull(`uniprot_id`) %>%
::permutations(n = length(.), r = 2, v = .)
gtools<- tibble(
shared_pathway_df protein = pairwise_permutations[, 1],
assoc_protein = pairwise_permutations[, 2]
%>%
) mutate(
shared_pathway = map2(`protein`, `assoc_protein`, function(x, y) {
per_permutation(pathway_df = pathway_df, permutation = c(x, y))
}),combination = map2_chr(`protein`, `assoc_protein`, function(x, y) {
<- sort(c(x, y))
comb paste(comb, collapse = ",")
}),count = map_int(`shared_pathway`, function(x) na.omit(x) %>% length()),
connected = count > 0
)
shared_pathway_df
}<- get_shared_pathway(pathway_df)
shared_pathway_df <- length(shared_pathway_df %>% filter(count > 0))
n_pairs print(glue::glue("Num. shared_pathway pairs: {n_pairs}"))
#> Num. shared_pathway pairs: 6
%>% arrange(desc(count))
shared_pathway_df #> # A tibble: 72 x 6
#> protein assoc_protein shared_pathway combination count connected
#> <chr> <chr> <list> <chr> <int> <lgl>
#> 1 P05362 P29597 <chr [6]> P05362,P29597 6 TRUE
#> 2 P29597 P05362 <chr [6]> P05362,P29597 6 TRUE
#> 3 P05362 Q9UMF0 <chr [5]> P05362,Q9UMF0 5 TRUE
#> 4 Q9UMF0 P05362 <chr [5]> P05362,Q9UMF0 5 TRUE
#> 5 O75791 P05362 <chr [2]> O75791,P05362 2 TRUE
#> 6 O75791 P29597 <chr [2]> O75791,P29597 2 TRUE
#> 7 O75791 Q9UMF0 <chr [2]> O75791,Q9UMF0 2 TRUE
#> 8 P05362 O75791 <chr [2]> O75791,P05362 2 TRUE
#> 9 P29597 O75791 <chr [2]> O75791,P29597 2 TRUE
#> 10 Q9UMF0 O75791 <chr [2]> O75791,Q9UMF0 2 TRUE
#> # … with 62 more rows
We can further query EpiGraphDB regarding the detailed pathway information using GET /meta/nodes/Pathway/search
.
<- function(reactome_id) {
get_pathway_info <- "/meta/nodes/Pathway/search"
endpoint <- list(id = reactome_id)
params <- query_epigraphdb(route = endpoint, params = params, mode = "table")
df
df
}
<- shared_pathway_df %>%
pathway pull(shared_pathway) %>%
unlist() %>%
unique()
<- pathway %>% map_df(get_pathway_info)
pathway_info %>% print()
pathway_info #> # A tibble: 12 x 6
#> node._name node.name node._source node.id node._id node.url
#> <chr> <chr> <list> <chr> <chr> <chr>
#> 1 Immune System Immune System <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 2 Adaptive Immune … Adaptive Immune… <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 3 Signal Transduct… Signal Transduc… <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 4 Interferon Signa… Interferon Sign… <chr [1]> R-HSA-… R-HSA-9… https://rea…
#> 5 Interleukin-4 an… Interleukin-4 a… <chr [1]> R-HSA-… R-HSA-6… https://rea…
#> 6 Interleukin-10 s… Interleukin-10 … <chr [1]> R-HSA-… R-HSA-6… https://rea…
#> 7 Signaling by Int… Signaling by In… <chr [1]> R-HSA-… R-HSA-4… https://rea…
#> 8 Cytokine Signali… Cytokine Signal… <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 9 Integrin cell su… Integrin cell s… <chr [1]> R-HSA-… R-HSA-2… https://rea…
#> 10 Immunoregulatory… Immunoregulator… <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 11 Extracellular ma… Extracellular m… <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 12 Disease Disease <chr [1]> R-HSA-… R-HSA-1… https://rea…
In order to extract protein groups from the shared pathways, the last step for this query is to convert the shared pathway data into a graph where
We then count the number of nodes in each connected community and plot the graph.
<- function(df) {
protein_df_to_graph <- df %>%
df_connected filter(connected) %>%
distinct(`combination`, .keep_all = TRUE)
<- df %>%
nodes pull(protein) %>%
unique()
<- igraph::graph_from_data_frame(
graph
df_connected,directed = FALSE, vertices = nodes
)$layout <- igraph::layout_with_kk
graph
graph
}
<- function(graph) {
graph_to_protein_groups %>%
graph ::components() %>%
igraph::groups() %>%
igraphtibble(group_member = .) %>%
mutate(group_size = map_int(`group_member`, length)) %>%
arrange(desc(group_size))
}
<- shared_pathway_df %>% protein_df_to_graph()
pathway_protein_graph <- pathway_protein_graph %>% graph_to_protein_groups()
pathway_protein_groups %>% str()
pathway_protein_groups #> tibble [5 × 2] (S3: tbl_df/tbl/data.frame)
#> $ group_member:List of 5
#> ..$ 1: chr [1:5] "O75791" "P05362" "P29597" "Q6P4F2" ...
#> ..$ 2: chr "P0C6A0"
#> ..$ 3: chr "Q13445"
#> ..$ 4: chr "Q8N9T8"
#> ..$ 5: chr "Q9BYD3"
#> ..- attr(*, "dim")= int 5
#> ..- attr(*, "dimnames")=List of 1
#> .. ..$ : chr [1:5] "1" "2" "3" "4" ...
#> $ group_size : Named int [1:5] 5 1 1 1 1
#> ..- attr(*, "names")= chr [1:5] "1" "2" "3" "4" ...
plot(pathway_protein_graph)
sessionInfo
sessionInfo()
#> R version 4.0.4 (2021-02-15)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Manjaro Linux
#>
#> Matrix products: default
#> BLAS: /usr/lib/libblas.so.3.9.0
#> LAPACK: /usr/lib/liblapack.so.3.9.0
#>
#> locale:
#> [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] epigraphdb_0.2.2 igraph_1.2.6 glue_1.4.2 purrr_0.3.4
#> [5] dplyr_1.0.2 magrittr_2.0.1
#>
#> loaded via a namespace (and not attached):
#> [1] rstudioapi_0.13 knitr_1.31 tidyselect_1.1.0 debugme_1.1.0
#> [5] R6_2.5.0 rlang_0.4.10 fansi_0.4.2 highr_0.8
#> [9] httr_1.4.2 stringr_1.4.0 tools_4.0.4 xfun_0.21
#> [13] utf8_1.2.1 cli_2.3.1 gtools_3.8.2 htmltools_0.5.1.1
#> [17] ellipsis_0.3.1 assertthat_0.2.1 yaml_2.2.1 digest_0.6.27
#> [21] tibble_3.1.0 lifecycle_1.0.0 crayon_1.4.1 ps_1.6.0
#> [25] vctrs_0.3.6 curl_4.3 evaluate_0.14 rmarkdown_2.1
#> [29] stringi_1.5.3 compiler_4.0.4 pillar_1.5.1 generics_0.1.0
#> [33] jsonlite_1.7.2 pkgconfig_2.0.3