Hi Florian!
I'm using the LD matrices with HapMap3+ variants together with summary statistics from here (paper here).
Unfortunately, it doesn't converge. Here is my code:
########################## 1. Set up environment ##############################
library(data.table)
library(bigsnpr) # bigsnpr and bigstatsr
library(bigreadr)
library(ggplot2)
library(dplyr)
pheno <- 'bmi'
trait_type <- 'continuous'
LDref_path <- '/mnt/project/penetrance_at_scale/data/44_familybased_PGS/ldref_hm3_plus/'
LDref_file <- 'map_hm3_plus.rds'
# summary statistics file, already filtered to variants present in the UKB genotype data
sumstats_file <- paste0('/mnt/project/penetrance_at_scale/data/44_familybased_PGS/tan.summary_statistics/tan_2024_', pheno, '.common')
tmp_path <- './'
results_path <- './'
NCORES <- nb_cores()
print(paste("Number of cores available:", NCORES))
############### 1. Load hapmap3+ data and summary statistics ###################
# Info on HapMap3+ variants (the variants provided in the LD reference)
#map_ldref <- readRDS(runonce::download_file("https://figshare.com/ndownloader/files/37802721", dir = "tmp-data", fname = "map_hm3_plus.rds"))
print('Loading HapMap3+ variants info')
map_ldref <- readRDS(paste0(LDref_path, LDref_file))
# External summary statistics
print('Loading Summary Statistics')
sumstats <- fread(sumstats_file, sep=" ", dec=".")
setnames(sumstats, "chromosome", "chr")
setnames(sumstats, "SNP", "rsid")
setnames(sumstats, "A1", "a1")
setnames(sumstats, "A2", "a0")
setnames(sumstats, "direct_Beta", "beta")
setnames(sumstats, "direct_SE", "beta_se")
setnames(sumstats, "direct_N", "n_eff")
setnames(sumstats, "direct_pval", "p")
sumstats <- sumstats[freq > 0.01, ]  # keep variants with allele frequency > 1%
########### 2. Matching LD matrices and summary statistics variants ############
# join_by_pos = FALSE means join by rsid
print('Merging HapMap3+ variants and summary statistics')
info_snp <- snp_match(sumstats, map_ldref, join_by_pos = FALSE) #, return_flip_and_rev = T?
# skipping tidyr::drop_na here: it would drop any row with an NA in any column
# (e.g. in pos_hg18 or pos_hg38, which are fine to retain)
# info_snp <- tidyr::drop_na(tibble::as_tibble(info_snp))
info_snp <- tibble::as_tibble(info_snp)
# QC
# SD REFERENCE
print('Removing bad variants')
sd_ldref <- with(info_snp, sqrt(2 * af_UKBB * (1 - af_UKBB)))
if (trait_type == 'binary') { # if the GWAS trait is binary
  cat("treating GWAS trait as binary\n", sep = '') #, file = file_log, append = TRUE)
  sd_ss <- with(info_snp, 2 / sqrt(n_eff * beta_se^2 + beta^2)) # sumstats SD
} else if (trait_type == 'continuous') { # if it is continuous
  cat("treating GWAS trait as continuous\n", sep = '') #, file = file_log, append = TRUE)
  # consistent with e.g. https://github.com/privefl/paper-misspec/blob/main/code/prepare-sumstats-bbj/height.R
  # NOTE: sd_ss / sqrt(info)???
  sd_ss <- with(info_snp, 1 / sqrt(n_eff * beta_se^2 + beta^2))
  sd_ss <- sd_ss / quantile(sd_ss, 0.99) * sqrt(0.5)
} else {
  stop('trait type not found') # fail fast instead of continuing with sd_ss undefined
}
is_bad <- sd_ss < (0.5 * sd_ldref) | sd_ss > (sd_ldref + 0.1) | sd_ss < 0.05 | sd_ldref < 0.05
# NOTE: the filter below is commented out, so flagged variants are NOT actually removed
# df_beta <- info_snp[!is_bad, ]
df_beta <- info_snp
# plot bad points
png(file = paste0(results_path, "/", pheno, ".removed_variants.png"), width = 800)
qplot(sd_ldref, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations derived from allele frequencies of the LD reference",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")
dev.off()
print(paste('Number of LD variants:', nrow(map_ldref)))
print(paste('Number of summary stats variants:', nrow(sumstats)))
print(paste('Number of merged variants:', nrow(info_snp)))
print(paste('Number of good variants:', nrow(df_beta)))
# Also restrict to the variants present in your test data as well (already done in bash script)
######################### 3. Computing LDpred2 scores ##########################
# LD matrices (correlations between pairs of genetic variants) for 1,444,196 HapMap3+ variants based on European individuals of the UK biobank.
# Pairs of variants further than 3 cM apart are assumed to have 0 correlation.
print("Loading LD matrices")
tmp <- tempfile(tmpdir = tmp_path)
for (chr in 1:22) {
  cat(chr, ".. ", sep = "")
  ## indices in 'df_beta'
  ind.chr <- which(df_beta$chr == chr)
  ## indices in 'map_ldref'
  ind.chr2 <- df_beta$`_NUM_ID_`[ind.chr]
  ## indices in 'corr_chr'
  ind.chr3 <- match(ind.chr2, which(map_ldref$chr == chr))
  corr_chr <- readRDS(paste0(LDref_path, "LD_with_blocks_chr", chr, ".rds"))[ind.chr3, ind.chr3]
  if (chr == 1) {
    corr <- as_SFBM(corr_chr, tmp, compact = TRUE)
  } else {
    corr$add_columns(corr_chr, nrow(corr))
  }
}
print(paste('Memory needed in GB:', file.size(corr$sbk) / 1024^3))
# Heritability estimation of LD score regression
# to be used as a starting value in LDpred2-auto
print('Estimating heritability')
(ldsc <- with(df_beta, snp_ldsc(ld, ld_size = nrow(map_ldref),
                                chi2 = (beta / beta_se)^2,
                                sample_size = n_eff,
                                ncores = NCORES)))
h2_est <- ldsc[["h2"]]
# LDpred2-auto
set.seed(1) # to get the same result every time
coef_shrink <- 0.95 # reduce this up to 0.4 if you have some (large) mismatch with the LD ref
print('Running LDpred2 auto')
multi_auto <- snp_ldpred2_auto(corr, df_beta, h2_init = h2_est,
                               vec_p_init = seq_log(1e-4, 0.2, length.out = 30),
                               allow_jump_sign = FALSE, shrink_corr = coef_shrink,
                               ncores = NCORES)
str(multi_auto[[1]], max.level = 1)
print('Filtering out bad chains')
# Filter for best chains and average remaining ones - new recommended way
# `range` should be between 0 and 2
(range <- sapply(multi_auto, function(auto) diff(range(auto$corr_est))))
(keep <- which(range > (0.95 * quantile(range, 0.95, na.rm = TRUE))))
print('range')
print(head(range, 10))
print('keep')
print(head(keep, 10))
print(length(keep))
beta_auto <- rowMeans(sapply(multi_auto[keep], function(auto) auto$beta_est))
#save final betas
print('Saving final weights')
final_beta_auto <- cbind(df_beta[, 1:4], beta_auto) # bind with chr:pos:a0:a1
write.table(final_beta_auto, paste0(results_path, "/", pheno, ".weights.txt"),
            col.names = TRUE, row.names = FALSE, quote = FALSE)
Results are:
List of 12
$ beta_est : num [1:1153813] NA NA NA NA NA NA NA NA NA NA ...
$ postp_est : num [1:1153813] NA NA NA NA NA NA NA NA NA NA ...
$ corr_est : num [1:1153813] NA NA NA NA NA NA NA NA NA NA ...
$ sample_beta :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
$ path_p_est : num [1:700] 0.000285 0.000392 0.000603 0.000783 0.000985 ...
$ path_h2_est : num [1:700] 0.0625 0.0997 0.1172 0.1443 0.1631 ...
$ path_alpha_est: num [1:700] -0.497 -0.566 -0.702 -0.79 -0.623 ...
$ h2_est : num NA
$ p_est : num NA
$ alpha_est : num NA
$ h2_init : num 0.256
$ p_init : num 1e-04
If I lower coef_shrink to 0.5, it does converge:
List of 12
$ beta_est : num [1:1152423] -2.92e-04 -1.83e-04 -2.12e-04 -9.44e-05 -8.15e-05 ...
$ postp_est : num [1:1152423] 0.0395 0.0244 0.0334 0.026 0.0234 ...
$ corr_est : num [1:1152423] -4.50e-04 6.16e-05 -3.44e-04 -2.89e-04 -3.36e-05 ...
$ sample_beta :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
$ path_p_est : num [1:700] 0.000177 0.000256 0.000477 0.000567 0.000818 ...
$ path_h2_est : num [1:700] 0.0998 0.1175 0.1087 0.1257 0.1315 ...
$ path_alpha_est: num [1:700] -1.127 -0.66 -0.819 -0.812 -0.788 ...
$ h2_est : num 0.494
$ p_est : num 0.0237
$ alpha_est : num -0.812
$ h2_init : num 0.256
$ p_init : num 1e-04
But I'm not sure this is the right approach. Could you please explain what coef_shrink and use_MLE = FALSE are for? What is best to modify for convergence?
Are these individuals of European ancestry? (the closer to NW Europe, the better); the differences in SD are quite strong.
About the sample size: you should QC on per-variant sample size, keeping only variants with N larger than 0.7 of max(N); it is never good to have very different per-variant sample sizes.
Finally, with small to moderate sample sizes, you want to use use_MLE = FALSE, which uses the two-parameter LDpred2 model (without the third parameter, which is difficult to estimate with small sample sizes and can lead to divergence).
You can try again with coef_shrink = 0.95 and use_MLE = FALSE, plus the threshold on per-variant N; a sketch of that call follows.