3 Harmonizing SumStats

In order to perform MR we need to combine our exposure and outcome datasets. Here we will use the TwoSampleMR procedure to harmonize GWAS summary stats.

Load R Packages

library(tidyverse)    # Data wrangling 
library(TwoSampleMR)  # MR 
library(LDlinkR)      # LD and proxy snps

# Define column types for summary statistics
coltypes = cols(
  ID = col_character(),
  CHROM = col_double(),
  POS = col_double(),
  REF = col_character(),
  ALT = col_character(),
  AF = col_double(),
  TRAIT = col_character(),
  BETA = col_double(),
  SE = col_double(),
  Z = col_double(),
  P = col_double(),
  N = col_double(),
  OR = col_double(),
  OR_L95 = col_double(),
  OR_U95 = col_double(),
  DIR = col_character(),
  G1000_ID = col_character(),
  G1000_VARIANT = col_character(),
  DBSNP_ID = col_character(),
  DBSNP_VARIANT = col_character(),
  OLD_ID = col_character(),
  OLD_VARIANT = col_character()
)

Function for munging LDlink output

munge_proxies <- function(LDLink_file, outcome, outcome_clump){
  LDLink_file_path <- LDLink_file
  proxy_snps <- read_tsv(LDLink_file_path, skip = 1, col_names = F) %>%
  rename(id = X1, func = X2, proxy_snp = X3, coord = X4, alleles = X5, maf = X6, 
         distance = X7, dprime = X8, rsq = X9, correlated_alleles = X10, FORGEdb = X11, RegulomeDB = X12) %>%
  separate(coord, c('chr', 'pos'), sep = ":") %>%
  mutate(snp = ifelse(id == 1, proxy_snp, NA), 
         chr = str_replace(chr, 'chr', ""), 
         chr = as.numeric(chr), 
         pos = as.numeric(pos)) %>%
  fill(snp, .direction = 'down') %>%
  relocate(snp, .before = proxy_snp) %>%
  dplyr::select(-id, -func, -FORGEdb, -RegulomeDB) %>%
  filter(rsq >= 0.8)

  # Munge proxy snp and outcome data
  proxy_outcome <- left_join(
    proxy_snps, outcome, by = c("proxy_snp" = "SNP")
  ) %>%
    separate(correlated_alleles, c("target_a1.outcome", "proxy_a1.outcome", 
                                   "target_a2.outcome", "proxy_a2.outcome"), sep = ",|=") %>%
    filter(!is.na(chr.outcome)) %>%
    arrange(snp, -rsq, abs(distance)) %>%
    group_by(snp) %>%
    slice(1) %>%
    ungroup() %>%
    mutate(
       proxy.outcome = TRUE,
       target_snp.outcome = snp,
       proxy_snp.outcome = proxy_snp, 
    ) %>% 
    mutate(
         new_effect_allele.outcome = case_when(
          proxy_a1.outcome == effect_allele.outcome & proxy_a2.outcome == other_allele.outcome ~ target_a1.outcome,
          proxy_a2.outcome == effect_allele.outcome & proxy_a1.outcome == other_allele.outcome ~ target_a2.outcome,
          TRUE ~ NA_character_
       ), 
        new_other_allele.outcome = case_when(
          proxy_a1.outcome == effect_allele.outcome & proxy_a2.outcome == other_allele.outcome ~ target_a2.outcome,
          proxy_a2.outcome == effect_allele.outcome & proxy_a1.outcome == other_allele.outcome ~ target_a1.outcome,
          TRUE ~ NA_character_
       ), 
       effect_allele.outcome = new_effect_allele.outcome, 
       other_allele.outcome = new_other_allele.outcome
    ) %>%
    dplyr::select(-proxy_snp, -chr, -pos, -alleles, -maf, -distance, -rsq, -dprime,  
           -new_effect_allele.outcome, -new_other_allele.outcome) %>%
    relocate(target_a1.outcome, proxy_a1.outcome, target_a2.outcome, proxy_a2.outcome, .after = proxy_snp.outcome) %>%
    rename(SNP = snp) %>%
    relocate(SNP, .after = samplesize.outcome)
  
  # Merge outcome and proxy outcomes
  outcome_dat <- bind_rows(
    outcome_clump, proxy_outcome
  ) %>% 
    arrange(chr.outcome, pos.outcome)
  
  outcome_dat
}

Exposure dataset

The exposure GWAS SumStats have previously been standardized to common format, however, they need to be converted to the format required by TwoSampleMR.

Import exposure GWAS SumStats

exposure_path = "resources/Willer2013tc.chrall.CPRA_b37.tsv.gz"
exposure_ss <- read_tsv(exposure_path, comment = "##", col_types = coltypes, 
                        col_select = c(DBSNP_ID, CHROM, POS, REF, ALT, AF, BETA, SE, Z, P, N, TRAIT))

# Format data to TwoSampleMR format
exposure <- exposure_ss %>%
  format_data(.,
    type = "exposure",
    snps = NULL,
    header = TRUE,
    phenotype_col = "TRAIT",
    snp_col = "DBSNP_ID",
    beta_col = "BETA",
    se_col = "SE",
    eaf_col = "AF",
    effect_allele_col = "ALT",
    other_allele_col = "REF",
    pval_col = "P",
    samplesize_col = "N",
    z_col = "Z",
    chr_col = "CHROM",
    pos_col = "POS",
    log_pval = FALSE
) %>%
  as_tibble()

Two-Sample MR methods usually utilize only independent genome-wide significant SNPs. The process of extracting these SNPs from the full genome-wide association study (GWAS) summary statistics involves linkage disequilibrium (LD) clumping. TwoSampleMR provides a clumping function, TwoSampleMR::clump_data(), which performs a stringent clumping procedure with a default window size of 10Mb and an r² threshold of 0.001. Setting clump_p1 = 1 and clump_p2 = 1 will perform clumping on all available SNPs. The GWAS summary statistics used in this analysis are from a population of European ancestry, and therefore the EUR reference panel is specified.

LD clump exposure

# Perform LD clumping on SNP data, filter SNPs to make it run faster
exposure_clump <- exposure %>% 
  filter(pval.exposure < 0.01) %>%
  clump_data(.,
  clump_kb = 10000,
  clump_r2 = 0.001,
  clump_p1 = 1,
  clump_p2 = 1,
  pop = "EUR"
)

# Should filter here at GWS, I dont here to demonstrate proxy snps
# exposure_dat <- filter(exposure_clump, pval.exposure < 5e-8)
exposure_dat <- exposure_clump

Note

Typically, in a Two-Sample MR analysis, one would only consider genome-wide significant SNPs (SNPs with pval.exposure < 5e-8). However, in this demonstration, all independent SNPs with pval.exposure < 0.01 are retained to showcase how to identify proxy variants, as all of the genome-wide significant SNPs are also present in the outcome GWAS.

Outcome datasets

Similarly, the outcome dataset needs to be converted to the TwoSampleMR format.

Import outcome GWAS SumStats

outcome_path = "resources/Kunkle2019load_stage123.chrall.CPRA_b37.tsv.gz"
outcome_ss <- read_tsv(outcome_path, comment = "##",  col_types = coltypes, 
                       col_select = c(DBSNP_ID, CHROM, POS, REF, ALT, AF, BETA, SE, Z, P, N, TRAIT))

# Format outcome
outcome <- outcome_ss %>%
  format_data(.,
    type = "outcome",
    snps = NULL,
    header = TRUE,
    phenotype_col = "TRAIT",
    snp_col = "DBSNP_ID",
    beta_col = "BETA",
    se_col = "SE",
    eaf_col = "AF",
    effect_allele_col = "ALT",
    other_allele_col = "REF",
    pval_col = "P",
    samplesize_col = "N",
    z_col = "Z",
    chr_col = "CHROM",
    pos_col = "POS",
    log_pval = FALSE
) %>%
  as_tibble()

LD Proxy SNPs

Proxy variants are genetic markers that are in strong linkage disequilibrium (LD) with the variants of interest, and can be used as a proxy to estimate the effect of the variants of interest on the outcome of interest. In some cases, variants in the exposure dataset may not be present in the outcome dataset due to different GWAS using different genotyping arrays, impuation methods, and QC practices. To overcome this, proxy variants in the outcome dataset can be substituted for the missing variant.

Here we first extract our exposure SNPs from the outcome GWAS and determine which SNPs are not presenting in the outcome dataset. We then query the LDLink web server to identify proxy variants that are in strong LD (r²) with the missing variant. The resulting proxy SNPs are then extracted from the outcome dataset and appended to the initial SNPs extracted from the outcome GWAS using munge_proxies() function.

Identify Proxy Variants

# extract exposure SNPs present in outcome
outcome_clump <- semi_join(
  outcome, exposure_dat, by = "SNP"
)

# Exposure SNPs not present in outomce
exp_snps_wo <- anti_join(
  exposure_dat, outcome, by = "SNP"
)

# Use LDLinkR to identify proxy snps
LDproxy_batch(exp_snps_wo$SNP, 
        pop = "CEU",             # Match population ancestries
        r2d = "r2", 
        token = 'a6deee62cc4a', 
        append = TRUE,           # We appended the results of each LDlink query to a single file
        genome_build = "grch37") # Select genome build based on summary stats
system("mv combined_query_snp_list_grch37.txt data/exposure_outcome_proxy_snps.txt")


# Munge proxy snp file
outcome_dat <- munge_proxies("data/exposure_outcome_proxy_snps.txt", outcome, outcome_clump)

Harmonize Exposure - Outcome Datasets

Finally, the effect of a SNP on an outcome and exposure must be harmonised to be relative to the same allele. We also specify to infer the positive strand alleles, using allele frequencies for palindromes rather then removing palindromic variants.

Harmonize

mr_dat <- harmonise_data(exposure_dat, outcome_dat, action = 2) %>% as_tibble() %>%
#  mutate(
#    apoe_region = case_when(
#      chr.outcome == 19 & between(pos.outcome, 44912079, 45912079) ~ TRUE,
#      TRUE ~ FALSE
#    ), 
#    gws.outcome = ifelse(pval.outcome < 5e-8, TRUE, FALSE), 
#   mr_keep = ifelse(mr_keep == FALSE | apoe_region == TRUE | gws.outcome == TRUE, FALSE, TRUE)
#  )
  filter(pval.exposure < 5e-8)

Note

The variable mr_keep is used to determine the SNPs to be included in the MR analysis. It is recommended to exclude variants with pval.outcome < 5e-8 as they violate the exclusion restriction assumption. For studies involving Alzheimer’s disease, variants in the APOE region (GRCh37 - CHR: 19, BP: 44912079-45912079) should also be removed due to its large effect size and pleiotropic nature. However, in this tutorial, this step is not performed to demonstrate how causal estimates can be influenced by horizontal pleiotropy.

And now we have a final harmonized dataset that can be used in our downstream MR analyses