
Convergence issues. Modifying coef_shrink vs use_MLE. #532

Open
scienception opened this issue Jan 9, 2025 · 3 comments

@scienception commented Jan 9, 2025

Hi Florian!

I'm using the LD matrices with HapMap3+ variants, together with summary statistics from here (paper here).
Unfortunately, it doesn't converge. Here is my code:

########################## 1. Set up environment ##############################
library(data.table)
library(bigsnpr) # bigsnpr and bigstatsr
library(bigreadr)
library(ggplot2)
library(dplyr)

pheno='bmi'
trait_type = 'continuous'

LDref_path='/mnt/project/penetrance_at_scale/data/44_familybased_PGS/ldref_hm3_plus/'
LDref_file='map_hm3_plus.rds'
# summary statistics file with already filtered variants present in UKB genotype
sumstats_file=paste0('/mnt/project/penetrance_at_scale/data/44_familybased_PGS/tan.summary_statistics/tan_2024_', pheno, '.common')
tmp_path='./'
results_path='./'

NCORES <- nb_cores()
print(paste("Number of cores available:", NCORES))

############### 2. Load HapMap3+ data and summary statistics ###################

# Info on HapMap3+ variants (the variants provided in the LD reference)
#map_ldref <- readRDS(runonce::download_file("https://figshare.com/ndownloader/files/37802721", dir = "tmp-data", fname = "map_hm3_plus.rds"))
print('Loading HapMap3+ variants info')
map_ldref <- readRDS(paste0(LDref_path, LDref_file))

# External summary statistics
print('Loading Summary Statistics')
sumstats <- fread(sumstats_file, sep=" ",  dec=".")
setnames(sumstats, "chromosome", "chr")
setnames(sumstats, "SNP", "rsid")
setnames(sumstats, "A1", "a1")
setnames(sumstats, "A2", "a0")
setnames(sumstats, "direct_Beta", "beta")
setnames(sumstats, "direct_SE", "beta_se")
setnames(sumstats, "direct_N", "n_eff")
setnames(sumstats, "direct_pval", "p")

sumstats<-sumstats[freq > 0.01, ]

########### 3. Matching LD matrices and summary statistics variants ############
# join_by_pos = FALSE means join by rsid rather than by position
print('Merging HapMap3+ variants and summary statistics')
info_snp <- snp_match(sumstats, map_ldref, join_by_pos = FALSE) #, return_flip_and_rev = T?
# tidyr::drop_na would drop any row with an NA in any column (e.g. in pos_hg18 or pos_hg38, which are fine to retain), so it is skipped here
# info_snp <- tidyr::drop_na(tibble::as_tibble(info_snp))
info_snp <- tibble::as_tibble(info_snp)

# QC
# SD REFERENCE
print('Removing bad variants')
sd_ldref <- with(info_snp, sqrt(2 * af_UKBB * (1 - af_UKBB)))
if (trait_type == 'binary') {  # if GWAS trait is binary
  cat("treating GWAS trait as binary\n")  # optionally: file = file_log, append = TRUE
  sd_ss <- with(info_snp, 2 / sqrt(n_eff * beta_se^2 + beta^2))  # sumstats SD
} else if (trait_type == 'continuous') {  # if it is continuous
  cat("treating GWAS trait as continuous\n")  # optionally: file = file_log, append = TRUE
  # consistent with e.g.: https://github.com/privefl/paper-misspec/blob/main/code/prepare-sumstats-bbj/height.R
  # NOTE: sd_ss / sqrt(info)???
  sd_ss <- with(info_snp, 1 / sqrt(n_eff * beta_se^2 + beta^2))
  # rescale so the 99th percentile matches sqrt(0.5), the maximum SD of a
  # genotype (at MAF 0.5), since the phenotypic variance is unknown
  sd_ss <- sd_ss / quantile(sd_ss, 0.99) * sqrt(0.5)
} else {
  stop('trait type not found')  # fail early: sd_ss would be undefined below
}

is_bad <- sd_ss < (0.5 * sd_ldref) | sd_ss > (sd_ldref + 0.1) | sd_ss < 0.05 | sd_ldref < 0.05
# df_beta <- info_snp[!is_bad, ]
df_beta <- info_snp  # NOTE: QC filtering is disabled here; variants flagged by is_bad are kept

# plot bad points
png(file = paste0(results_path, "/", pheno, ".removed_variants.png"), width = 800)
qplot(sd_ldref, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations derived from allele frequencies of the LD reference",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")

dev.off()

print(paste('Number of LD variants:', nrow(map_ldref)))
print(paste('Number of summary stats variants:', nrow(sumstats)))
print(paste('Number of merged variants:', nrow(info_snp)))
print(paste('Number of good variants:', nrow(df_beta)))

# Also restrict to the variants present in the test data (already done in the bash script)


######################### 4. Computing LDpred2 scores ##########################

# LD matrices (correlations between pairs of genetic variants) for 1,444,196 HapMap3+ variants based on European individuals of the UK biobank.
# Pairs of variants further than 3 cM apart are assumed to have 0 correlation.

print("Loading LD matrices")
tmp <- tempfile(tmpdir = tmp_path)

for (chr in 1:22) {
  cat(chr, ".. ", sep = "")
  ## indices in 'df_beta'
  ind.chr <- which(df_beta$chr == chr)
  ## indices in 'map_ldref'
  ind.chr2 <- df_beta$`_NUM_ID_`[ind.chr]
  ## indices in 'corr_chr'
  ind.chr3 <- match(ind.chr2, which(map_ldref$chr == chr))
  corr_chr <- readRDS(paste0(LDref_path,"LD_with_blocks_chr", chr, ".rds"))[ind.chr3, ind.chr3]

  if (chr == 1) {
    corr <- as_SFBM(corr_chr, tmp, compact = TRUE)
  } else {
    corr$add_columns(corr_chr, nrow(corr))
  }
}

print(paste('Memory needed in GB:', file.size(corr$sbk) / 1024^3))

# Heritability estimation of LD score regression
# to be used as a starting value in LDpred2-auto
print('Estimating heritability')
(ldsc <- with(df_beta, snp_ldsc(ld, ld_size = nrow(map_ldref),
                                chi2 = (beta / beta_se)^2,
                                sample_size = n_eff,
                                ncores = NCORES)))
h2_est <- ldsc[["h2"]]


# LDpred2-auto
set.seed(1)  # to get the same result every time
coef_shrink <- 0.95  # reduce this down to 0.4 if there is some (large) mismatch with the LD reference
print('Running LDpred2 auto')
multi_auto <- snp_ldpred2_auto(corr, df_beta, h2_init = h2_est,
                               vec_p_init = seq_log(1e-4, 0.2, length.out = 30),
                               allow_jump_sign = FALSE, shrink_corr = coef_shrink,
                               ncores = NCORES)

print(str(multi_auto[[1]], max.level = 1))

print('Filtering out bad chains')
# Filter for best chains and average remaining ones - new recommended way
# `range` should be between 0 and 2
(range <- sapply(multi_auto, function(auto) diff(range(auto$corr_est))))
(keep <- which(range > (0.95 * quantile(range, 0.95, na.rm = TRUE))))

print('range')
print(head(range, 10))

print('keep')
print(head(keep, 10))
print(length(keep))

beta_auto <- rowMeans(sapply(multi_auto[keep], function(auto) auto$beta_est))
#save final betas
print('Saving final weights')
final_beta_auto <- cbind(df_beta[,1:4], beta_auto) #bind with chr:pos:a0:a1
write.table(final_beta_auto, paste0(results_path,"/",pheno,".weights.txt"), col.names=T,row.names=F,quote=F)

Results are:

List of 12
$ beta_est : num [1:1153813] NA NA NA NA NA NA NA NA NA NA ...
$ postp_est : num [1:1153813] NA NA NA NA NA NA NA NA NA NA ...
$ corr_est : num [1:1153813] NA NA NA NA NA NA NA NA NA NA ...
$ sample_beta :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
$ path_p_est : num [1:700] 0.000285 0.000392 0.000603 0.000783 0.000985 ...
$ path_h2_est : num [1:700] 0.0625 0.0997 0.1172 0.1443 0.1631 ...
$ path_alpha_est: num [1:700] -0.497 -0.566 -0.702 -0.79 -0.623 ...
$ h2_est : num NA
$ p_est : num NA
$ alpha_est : num NA
$ h2_init : num 0.256
$ p_init : num 1e-04

If I lower coef_shrink to 0.5, it does converge, but I'm not sure this is the right approach. Could you please explain what coef_shrink and use_MLE = FALSE are for? Which is best to modify for convergence? With coef_shrink = 0.5, the results are:

List of 12
$ beta_est : num [1:1152423] -2.92e-04 -1.83e-04 -2.12e-04 -9.44e-05 -8.15e-05 ...
$ postp_est : num [1:1152423] 0.0395 0.0244 0.0334 0.026 0.0234 ...
$ corr_est : num [1:1152423] -4.50e-04 6.16e-05 -3.44e-04 -2.89e-04 -3.36e-05 ...
$ sample_beta :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
$ path_p_est : num [1:700] 0.000177 0.000256 0.000477 0.000567 0.000818 ...
$ path_h2_est : num [1:700] 0.0998 0.1175 0.1087 0.1257 0.1315 ...
$ path_alpha_est: num [1:700] -1.127 -0.66 -0.819 -0.812 -0.788 ...
$ h2_est : num 0.494
$ p_est : num 0.0237
$ alpha_est : num -0.812
$ h2_init : num 0.256
$ p_init : num 1e-04
@privefl (Owner) commented Jan 9, 2025

  • What is the effective sample size of the GWAS summary statistics?
  • Which method were the GWAS sumstats derived from?
  • Can you show the QC plot?

@scienception (Author) commented Jan 9, 2025

  • What is the effective sample size of the GWAS summary statistics?
    I'm using continuous traits, and the effective sample size oscillates between 5k and 30k per variant.
  • Which method were the GWAS sumstats derived from?
    They were derived with FGWAS.
  • Can you show the QC plot?
    Here is the QC plot for BMI:
    [Screenshot: bmi removed_variants]

@privefl (Owner) commented Jan 10, 2025

  • Are these individuals of European ancestry? (the closer to NW Europe, the better); the differences in SD are quite strong.

  • About the sample size: you should QC by keeping variants with per-variant N above 0.7 * max(N); it is never good to have very different per-variant sample sizes.

  • Finally, with small to moderate sample sizes, you want to use use_MLE = FALSE, which corresponds to the two-parameter LDpred2 model (without the third parameter, which is difficult to estimate with small sample sizes and can lead to divergence).

You can try again with coef_shrink = 0.95 and use_MLE = FALSE and the threshold on per-variant N; see the sketch below.
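A minimal sketch combining these three changes, reusing df_beta, h2_est, and NCORES from the script above (the 0.7 threshold, coef_shrink = 0.95, and use_MLE = FALSE come from this comment; the rest is illustrative):

# Keep variants whose per-variant sample size is at least 70% of the maximum;
# do this BEFORE building `corr`, so that df_beta stays aligned with the SFBM
df_beta <- df_beta[df_beta$n_eff > 0.7 * max(df_beta$n_eff), ]

# ... rebuild `corr` with the per-chromosome loop above, then:

# LDpred2-auto with the two-parameter model (no third parameter alpha)
multi_auto <- snp_ldpred2_auto(corr, df_beta, h2_init = h2_est,
                               vec_p_init = seq_log(1e-4, 0.2, length.out = 30),
                               allow_jump_sign = FALSE,
                               shrink_corr = 0.95,  # keep coef_shrink at 0.95
                               use_MLE = FALSE,     # two-parameter model
                               ncores = NCORES)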
