Previously, we demonstrated the harmonization of a single exposure-outcome pair. Here, we expand on this concept by harmonizing multiple exposure-outcome trait pairs using the map functions from the purrr package. However, it is important to note that due to the size of some summary statistic files, memory limits may become an issue when running these scripts locally. As the number of exposure-outcome pairs increases, it becomes even more important to invest in functional programming and develop reproducible, adaptable, and transparent computational workflows that can handle large-scale data analysis (Targets in R; Snakemake in Python). By doing so, we can increase efficiency, reduce errors, and make our research more accessible and reproducible to others.
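As a rough illustration of that idea, a {targets} pipeline for this workflow might be skeletonized as follows. This is a minimal sketch, not part of this tutorial: the target names and the import_and_clump()/import_outcomes() helpers are hypothetical placeholders.
# _targets.R: a minimal, hypothetical pipeline sketch
library(targets)
tar_option_set(packages = c("tidyverse", "TwoSampleMR", "LDlinkR"))
source("scripts/misc_functions.R")  # would hold the hypothetical import helpers

list(
  tar_target(exposure_files,
             c("resources/Graham2021ldl.chrall.CPRA_b37.tsv.gz",
               "resources/Willer2013ldl.chrall.CPRA_b37.tsv.gz"),
             format = "file"),
  tar_target(exposure_dat, import_and_clump(exposure_files)),   # hypothetical helper: import, filter, clump
  tar_target(outcome_files,
             c("resources/Kunkle2019load_stage123.chrall.CPRA_b37.tsv.gz",
               "resources/Bellenguez2022load.chrall.CPRA_b37.tsv.gz"),
             format = "file"),
  tar_target(outcome_dat, import_outcomes(outcome_files)),      # hypothetical helper: import outcomes + proxies
  tar_target(mr_dat, harmonise_data(exposure_dat, outcome_dat, action = 2))
)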
Load R Packages
library(tidyverse)   # Data wrangling
library(TwoSampleMR) # MR
library(LDlinkR)     # LD and proxy snps
library(glue)        # String interpolation for file paths
# User defined functions
source('scripts/munge_proxies.R')
source('scripts/misc_functions.R')
Our exposure of interest is LDL cholesterol levels, with GWAS summary statistics obtained from two separate GWAS conducted by the Global Lipids Genetics Consortium. The first GWAS, conducted by Willer et al. in 2013, included 188,577 individuals and identified 56 loci significantly associated with LDL cholesterol levels. The second GWAS, conducted by Graham et al. in 2021, included a much larger sample size of approximately 1.65 million individuals, of whom roughly 350,000 were of non-European ancestry. In the subset of individuals of European ancestry (n = 1,320,016), 403 loci were identified as significantly associated with LDL cholesterol levels.
The process of importing and clumping the exposure datasets remains the same as before, except that now we are excluding rare variants with a minor allele frequency (MAF) of less than 0.01. This is done to reduce the file size.
exposure_paths = c(
  "resources/Graham2021ldl.chrall.CPRA_b37.tsv.gz",
  "resources/Willer2013ldl.chrall.CPRA_b37.tsv.gz"
)

exposures <- str_extract(exposure_paths, "(?<=/).*(?=.chrall)")

exposure_ss <- map(exposure_paths, function(x){
  trait <- str_extract(x, "(?<=/).*(?=.chrall)")

  message("Importing: ", x, "....")
  ## Filter out problematic snps - MAF > 1%, SNVs, rsid
  ss <- read_tsv(
    x, comment = "##", col_types = coltypes,
    col_select = c(DBSNP_ID, CHROM, POS, REF, ALT, AF, BETA, SE, Z, P, N, TRAIT)) %>%
    filter(between(AF, 0.01, 0.99)) %>%
    filter(nchar(REF) == 1 & nchar(ALT) == 1) %>%
    filter(!is.na(DBSNP_ID)) %>%
    mutate(TRAIT = trait)

  message("Standardizing: ", x, "....")
  ss_out <- ss %>% format_data(.,
    type = "exposure",
    snps = NULL,
    header = TRUE,
    phenotype_col = "TRAIT",
    snp_col = "DBSNP_ID",
    beta_col = "BETA",
    se_col = "SE",
    eaf_col = "AF",
    effect_allele_col = "ALT",
    other_allele_col = "REF",
    pval_col = "P",
    samplesize_col = "N",
    z_col = "Z",
    chr_col = "CHROM",
    pos_col = "POS",
    log_pval = FALSE
  ) %>%
    as_tibble()

  rm(ss)
  ss_out
}) %>%
  magrittr::set_names(exposures)
# Perform LD clumping on SNP data, filter SNPs to make it run faster
exposure_clump <- map(exposure_ss, function(x){
  x %>%
    filter(pval.exposure < 1e-8) %>%
    clump_data(.,
      clump_kb = 10000,
      clump_r2 = 0.001,
      clump_p1 = 1,
      clump_p2 = 1,
      pop = "EUR"
    )
})

exposure_dat <- bind_rows(exposure_clump)
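As a quick sanity check (not part of the original workflow), you can tabulate how many independent instruments survive clumping for each exposure:
exposure_dat %>%
  count(exposure)   # number of clumped SNPs per exposure trait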
Our outcomes consist of GWAS of clinical late-onset Alzheimer's disease by Kunkle et al. (2019) and of AD/dementia by Bellenguez et al. (2022). Importing, identifying proxy variants, and harmonizing the datasets remain the same as before.
outcome_paths = c(
  "resources/Kunkle2019load_stage123.chrall.CPRA_b37.tsv.gz",
  "resources/Bellenguez2022load.chrall.CPRA_b37.tsv.gz"
)

outcomes <- str_extract(outcome_paths, "(?<=/).*(?=.chrall)")

outcome_ss <- map(outcome_paths, function(x){
  trait <- str_extract(x, "(?<=/).*(?=.chrall)")

  message("Importing: ", x, "....")
  ## Filter out problematic snps - MAF > 1%, SNVs, rsid
  ss <- read_tsv(
    x, comment = "##", col_types = coltypes, # n_max = 100,
    col_select = c(DBSNP_ID, CHROM, POS, REF, ALT, AF, BETA, SE, Z, P, N, TRAIT)) %>%
    filter(between(AF, 0.01, 0.99)) %>%
    filter(nchar(REF) == 1 & nchar(ALT) == 1) %>%
    filter(!is.na(DBSNP_ID)) %>%
    mutate(TRAIT = trait)

  message("Standardizing: ", x, "....")
  ss_out <- ss %>% format_data(.,
    type = "outcome",
    snps = NULL,
    header = TRUE,
    phenotype_col = "TRAIT",
    snp_col = "DBSNP_ID",
    beta_col = "BETA",
    se_col = "SE",
    eaf_col = "AF",
    effect_allele_col = "ALT",
    other_allele_col = "REF",
    pval_col = "P",
    samplesize_col = "N",
    z_col = "Z",
    chr_col = "CHROM",
    pos_col = "POS",
    log_pval = FALSE
  ) %>%
    as_tibble()

  rm(ss)
  ss_out
}) %>%
  magrittr::set_names(outcomes)
# Extract exposure SNPs present in outcome
outcome_clump <- map(outcome_ss, function(x){
  semi_join(
    x, exposure_dat, by = "SNP"
  )
})

# Exposure SNPs not present in outcome
exp_snps_wo <- map(outcome_ss, function(x){
  anti_join(
    exposure_dat, x, by = "SNP"
  )
})
# Use LDLinkR to identify proxy snps
map2(exp_snps_wo, outcomes, function(x, y){
  LDproxy_batch(x$SNP,
    pop = "CEU",             # Match population ancestries
    r2d = "r2",
    token = 'a6deee62cc4a',
    append = TRUE,           # Append the results of each LDlink query to a single file
    genome_build = "grch37") # Select genome build based on summary stats
  system(glue("mv combined_query_snp_list_grch37.txt data/exposure_{y}_proxy_snps.txt"))
})
# Munge proxy snp file
outcome_dat <- pmap(list(outcomes, outcome_ss, outcome_clump),
  function(file, sumstats, clumped){
    munge_proxies(glue("data/exposure_{file}_proxy_snps.txt"), sumstats, clumped)
  }) %>%
  bind_rows()
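A similar hypothetical check shows how many exposure SNPs, including recovered proxies, are available in each outcome:
outcome_dat %>%
  count(outcome)    # exposure SNPs (direct + proxy) available per outcome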
During the harmonization step, we now flag SNPs that are genome-wide significant (GWS) for the outcome, or that are located in the APOE region, for removal.
mr_dat <- harmonise_data(exposure_dat, outcome_dat, action = 2) %>%
  as_tibble() %>%
  mutate(
    apoe_region = case_when(
      chr.outcome == 19 & between(pos.outcome, 44912079, 45912079) ~ TRUE,
      TRUE ~ FALSE
    ),
    gws.outcome = ifelse(pval.outcome < 5e-8, TRUE, FALSE),
    mr_keep = ifelse(mr_keep == FALSE | apoe_region == TRUE | gws.outcome == TRUE, FALSE, TRUE)
  )
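To see what each flag removes for every exposure-outcome pair, a quick hypothetical tabulation:
mr_dat %>%
  count(exposure, outcome, mr_keep, apoe_region, gws.outcome)   # SNPs retained vs flagged per trait pair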