Ancestry PRS Load

Author

Sarah Urbut

Published

March 7, 2024

LDL PRS analysis

Code
library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)
library(data.table)
library(ggridges)
load("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/output/merged_pheno_censor_final_withdrugs_smoke.rds")
pheno_prs=dfh
pheno2=readRDS("~/Library/CloudStorage/Dropbox-Personal/df_ukb_pheno_updated.rds")

pheno_prs=merge(pheno_prs,pheno2[,c("Identifier","Cad_hard_Any","Cad_hard_censor_age")],by.x="identifier",by.y="Identifier")
eth=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/ethnicity.rds")
eth=eth[!is.na(eth$f.21000.0.0),]
eth=eth[,c("identifier","meaning")]


baseline_levs=readRDS("~/Library/CloudStorage/Dropbox-Personal/dfukb_chol_bp_smoke.rds")
baseline_levs$hdl=baseline_levs$f.30760.0.0*38.4 # convert mmol/L to mg/dL
baseline_levs$ldl=baseline_levs$f.30780.0.0*38.4 # convert mmol/L to mg/dL

PCS=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/baseline_withPCS.rds")[,c(1,20:39)]
colnames(PCS)=c("identifier",paste0(rep("PC",20),seq(1:20)))
baseline_levs=merge(baseline_levs,PCS,by="identifier")

Phenotyping

We previously validated that the UKB phenotype implementation of Hard CAD matches the HARD CAD definition, so we use it here to be consistent with the AoU HARD CAD phenotype.

  • Because we are using age as the time scale (or a GLM with an 'ever event' outcome), follow-up time is effectively time from birth to death or censoring (see the sketch below).

  • We know that time is inhomogeneous: even if we use an OR instead of an HR, the OR will tend to attenuate over time because non-genetically mediated CAD events accumulate with age.
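
For intuition, here is a minimal sketch (not part of the pipeline; it assumes the m2 data frame constructed in the chunk below) contrasting age as the time scale, where entry is implicitly at birth, with a left-truncated model that starts follow-up at enrollment:

Code
library(survival)

# Age as the time scale: time 0 = birth, so the axis is attained age
fit_age <- coxph(Surv(Cad_0_censor_age, Cad_Sta) ~ f.31.0.0, data = m2)

# Left-truncated alternative: risk counted only from enrollment onward.
# Prevalent cases (event before enrollment) must be dropped first,
# otherwise Surv() errors because stop <= start.
m2_inc <- m2[as.numeric(m2$Cad_0_censor_age) > as.numeric(m2$enrollage), ]
fit_lt <- coxph(Surv(as.numeric(enrollage), Cad_0_censor_age, Cad_Sta) ~ f.31.0.0,
                data = m2_inc)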

Code
# m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","Cad_0_Any","Cad_0_censor_age","f.31.0.0")],eth,by.x="identifier",by.y="identifier")
# m2$Cad_Sta=ifelse(m2$Cad_0_Any==2,1,0)


m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","Cad_hard_Any","Cad_hard_censor_age","f.31.0.0")],eth,by.x="identifier",by.y="identifier")


m2$Cad_Sta=ifelse(m2$Cad_hard_Any==2,1,0)
m2$Cad_0_censor_age=m2$Cad_hard_censor_age



m2=m2[!is.na(m2$Cad_Sta),]
m2=m2[!is.na(m2$Cad_0_censor_age),]
summary(m2$Cad_0_censor_age[m2$Cad_Sta==1])
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
 0.04654 55.13689 62.57906 62.05269 70.03970 84.76112 
Code
summary(m2$Cad_0_censor_age[m2$Cad_Sta==0])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  40.38   63.13   70.51   69.60   76.13   88.63 
Code
## old phenotypes
# df2=fread("~/Library/CloudStorage/Dropbox-Personal/UKBB_HardCAD_202109.tsv")
# m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","f.31.0.0")],eth,by.x="identifier",by.y="identifier")
# m2=merge(m2,df2[,c("sample_id","has_disease","censor_age")],by.x = "identifier",by.y="sample_id")
# m2$Cad_Sta=m2$has_disease
# m2$Cad_0_censor_age=m2$censor_age

## get these labs ready
med_codes=readRDS("~/Library/CloudStorage/Dropbox-Personal/medcodes.rds")
#levels(as.factor(med_codes$meaning))
## what do you do about meds? I think we need to eliminate, but that will skew the distribution
baseline_meds=merge(med_codes,baseline_levs,by="identifier")
# adjust treated values: divide statin users' LDL by 0.8; add 10 mmHg to SBP of BP-med users
baseline_meds$ldl_adj=ifelse(baseline_meds$meaning%in%"Cholesterol lowering medication",baseline_meds$ldl/0.8,baseline_meds$ldl)
baseline_meds$sbp_adj=ifelse(baseline_meds$meaning%in%"Blood pressure medication",baseline_meds$f.4080.0.0+10,baseline_meds$f.4080.0.0)

## merge all
final_dat=merge(m2,baseline_meds[,c("identifier","sbp_adj","ldl_adj","hdl",paste0(rep("PC",10),seq(1:10)))],by.x="identifier",by.y="identifier")
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
Code
final_dat$yob=year(final_dat$Birthdate)

#final_dat=final_dat[year(final_dat$Birthdate)>1950,]

Now we want to calculate ancestry-specific scalings for the PRS. Let’s find, for each ancestry group, the PRS increment that corresponds to a 40 mg/dL increase in LDL cholesterol in the UK Biobank, using the AFR, SAS, EUR, AMR, and EAS individuals present.
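
Concretely, this is the rescaling implemented in the modeling loop below: if \[ \hat{\beta} \] is the estimated change in LDL-C (mg/dL) per unit of PRS from a linear regression, the score is rescaled as

\[ \text{PRS}_{40} = \text{PRS} \times \frac{\hat{\beta}}{40} \]

so that a one-unit increase in \[ \text{PRS}_{40} \] corresponds to a predicted 40 mg/dL increase in LDL-C.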

Code
# final_dat <- final_dat %>%
#   mutate(recoded_ethnicity = case_when(
#     meaning %in% c("African", "Any other Black background", "Black or Black British","Caribbean","Any other mixed background","Mixed") ~ "AFR",
#     meaning %in% c("White", "Irish", "Any other white background","British") ~ "White",
#     meaning %in% c("Indian", "Pakistani","Bangladeshi") ~ "South Asian",
#     meaning == "Chinese" ~ "Chinese",
#     TRUE ~ "Other" # TAMR will be used for any ethnicity not covered above
#   ))
# 
# final_dat$Cad_Sta=ifelse(final_dat$Cad_0_Any==2,1,0)
# final_dat$sex=final_dat$f.31.0.0
# 
# table(final_dat$recoded_ethnicity)
#ggplot(lt,aes(x=Var1,y=value,fill=Var1))+geom_bar(stat="identity")

gae=fread("~/Library/CloudStorage/Dropbox-Personal/ukb.kgp_projected.tsv.gz")
final_dat=merge(final_dat,gae[,c("eid","rf80")],by.x="identifier",by.y="eid")
table(final_dat$rf80)

   AFR    AMR    EAS    EUR    SAS 
  8426    707   2249 448106   9085 
Code
## with Wallace's scores

prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/SAS_SUM_SCORE_FINAL_mixed")
prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/AFR_SUM_SCORE_FINAL_mixed")
prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/EUR_SUM_SCORE_FINAL_mixed")
prs_EAS=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/EAS_SUM_SCORE_FINAL_mixed")
prs_AMR=fread("~/Library/CloudStorage/Dropbox-Personal/final_mixed_ethnicity_scores/HISP_SUM_SCORE_FINAL_mixed")


AFR=final_dat$identifier[final_dat$rf80%in%"AFR"]
EUR=final_dat$identifier[final_dat$rf80%in%"EUR"]
SAS=final_dat$identifier[final_dat$rf80%in%"SAS"]
AMR=final_dat$identifier[final_dat$rf80%in%"AMR"]
EAS=final_dat$identifier[final_dat$rf80%in%"EAS"]

colnames(prs_SAS)=colnames(prs_AFR)=colnames(prs_EUR)=colnames(prs_EAS)=colnames(prs_AMR)=c("IID","PRS")

SAS_dat=merge(prs_SAS,final_dat[final_dat$rf80%in%"SAS",],by.x="IID",by.y="identifier")
AFR_dat=merge(prs_AFR,final_dat[final_dat$rf80%in%"AFR",],by.x="IID",by.y="identifier")
EUR_dat=merge(prs_EUR,final_dat[final_dat$rf80%in%"EUR",],by.x="IID",by.y="identifier")
AMR_dat=merge(prs_AMR,final_dat[final_dat$rf80%in%"AMR",],by.x="IID",by.y="identifier")
EAS_dat=merge(prs_EAS,final_dat[final_dat$rf80%in%"EAS",],by.x="IID",by.y="identifier")


pop_data_list=list(SAS_dat,AFR_dat,EUR_dat,AMR_dat,EAS_dat)
names(pop_data_list)=c("SAS","AFR","EUR","AMR","EAS")

final_dat=final_dat[!is.na(final_dat$rf80),]
final_dat%>%group_by(rf80)%>%summarise("cad_rate"=mean(Cad_Sta),"mean_ldl"=mean(na.omit(ldl_adj)),"%M"=mean(na.omit(as.numeric(f.31.0.0))),"meanAge"=as.numeric(mean(enrollage)))
# A tibble: 5 × 5
  rf80  cad_rate mean_ldl  `%M` meanAge
  <chr>    <dbl>    <dbl> <dbl>   <dbl>
1 AFR     0.0405     129. 0.417    52.4
2 AMR     0.0368     142. 0.332    52.5
3 EAS     0.0293     135. 0.311    52.9
4 EUR     0.0772     142. 0.458    57.3
5 SAS     0.132      135. 0.539    54.0

Histograms

We note that the distributions of these PRS are skewed for non-EUR populations, but we will normalize them for our analysis:

Code
prs_AFR$Population <- 'AFR';prs_AFR=prs_AFR[prs_AFR$IID%in%AFR]
prs_EUR$Population <- 'EUR';prs_EUR=prs_EUR[prs_EUR$IID%in%EUR]
prs_SAS$Population <- 'SAS';prs_SAS=prs_SAS[prs_SAS$IID%in%SAS]
prs_AMR$Population = 'AMR'; prs_AMR=prs_AMR[prs_AMR$IID%in%AMR]
prs_EAS$Population = 'EAS'; prs_EAS=prs_EAS[prs_EAS$IID%in%EAS]


combined_data <- rbind(prs_AFR, prs_EUR, prs_SAS,prs_AMR,prs_EAS)

ggplot(combined_data, aes(x = PRS, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores",
         x = "Sum Score",
         y = "Population")
Picking joint bandwidth of 0.0469

Using residual PRS

Code
totaldata=do.call(rbind,pop_data_list)
pcmod=lm(PRS~PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10,data = totaldata)
totaldata$predictscore=predict(object = pcmod,newdata=totaldata)
totaldata$residscore=totaldata$PRS-totaldata$predictscore
totaldata$Population=as.factor(totaldata$rf80)
totaldata$scaled_resid=scale(totaldata$residscore)

ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores",
         x = "Sum Score",
         y = "Population")
Picking joint bandwidth of 0.0473

Code
totaldata%>%group_by(Population)%>%
summarise("event_rate"=mean(Cad_Sta),"total_subjects"=length(Cad_Sta),"percent_female"=mean(f.31.0.0==0),"enrollment_age"=mean(enrollage))
# A tibble: 5 × 5
  Population event_rate total_subjects percent_female enrollment_age
  <fct>           <dbl>          <int>          <dbl> <drtn>        
1 AFR            0.0405           8426          0.583 52.43066 days 
2 AMR            0.0368            707          0.668 52.53349 days 
3 EAS            0.0293           2249          0.689 52.87715 days 
4 EUR            0.0772         448106          0.542 57.28159 days 
5 SAS            0.132            9085          0.461 54.03807 days 
Code
totaldata$pop_scaled_prs=rep(0,nrow(totaldata))
totaldata$pop_scaled_prs[totaldata$Population%in%"AFR"]=scale(totaldata$residscore[totaldata$Population%in%"AFR"])
totaldata$pop_scaled_prs[totaldata$Population%in%"EUR"]=scale(totaldata$residscore[totaldata$Population%in%"EUR"])
totaldata$pop_scaled_prs[totaldata$Population%in%"SAS"]=scale(totaldata$residscore[totaldata$Population%in%"SAS"])
totaldata$pop_scaled_prs[totaldata$Population%in%"EAS"]=scale(totaldata$residscore[totaldata$Population%in%"EAS"])
totaldata$pop_scaled_prs[totaldata$Population%in%"AMR"]=scale(totaldata$residscore[totaldata$Population%in%"AMR"])

summary(totaldata$pop_scaled_prs)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-4.89947 -0.65175  0.03021  0.00000  0.68575  4.22746 
Code
ggplot(totaldata, aes(x = pop_scaled_prs, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores: Scaled for Population",
         x = "Sum Score",
         y = "Population")
Picking joint bandwidth of 0.154

Code
library(dplyr)
totaldata$enrollage=as.numeric(totaldata$enrollage)
totaldata%>%group_by(Population)%>%summarize(quantile(pop_scaled_prs,0.25),median(pop_scaled_prs),mean(pop_scaled_prs),quantile(pop_scaled_prs,0.75))
# A tibble: 5 × 5
  Population `quantile(pop_scaled_prs, 0.25)` `median(pop_scaled_prs)`
  <fct>                                 <dbl>                    <dbl>
1 AFR                                  -0.625                   0.0760
2 AMR                                  -0.608                   0.0546
3 EAS                                  -0.574                   0.0918
4 EUR                                  -0.653                   0.0288
5 SAS                                  -0.646                   0.0334
# ℹ 2 more variables: `mean(pop_scaled_prs)` <dbl>,
#   `quantile(pop_scaled_prs, 0.75)` <dbl>

Survival plot

Code
library(survival)
library(survminer)
Loading required package: ggpubr

Attaching package: 'survminer'
The following object is masked from 'package:survival':

    myeloma
Code
fit3 <- survfit( Surv(Cad_0_censor_age, Cad_Sta) ~ Population,data=totaldata)
ggsurvplot(fit3, data = totaldata, conf.int = FALSE,fun="event",
risk.table = TRUE, risk.table.col="strata",
ggtheme = theme_classic(),xlab="Age",palette = "nejm")

Table 1

Code
totaldata$followup=round(as.numeric(totaldata$Cad_0_censor_age)-as.numeric(totaldata$enrollage),2)
library(table1)

Attaching package: 'table1'
The following objects are masked from 'package:base':

    units, units<-
Code
dat <- totaldata
dat$Sex=ifelse(dat$f.31.0.0==1,"Male","Female")
dat$Sex=as.factor(dat$Sex)
dat$Cad_Sta=as.factor(dat$Cad_Sta)
dat$pop_scaled_prs <- as.numeric(as.character(round(dat$pop_scaled_prs,2)))
dat$enrollage=as.numeric(dat$enrollage)
dat$CAD_censor <- as.factor(dat$Cad_Sta)
label(dat$ldl_adj) <- "LDL-C (adjusted for Statin)"
label(dat$Cad_Sta) <- "CAD Ever"
label(dat$scaled_resid) <- "Overall Scaled Residual PRS"
label(dat$pop_scaled_prs) <- "Pop Scaled Residual PRS"
label(dat$enrollage) = "Age Enrolled"

# Continue with creating the table
table1(~ Sex + ldl_adj + Cad_Sta + enrollage + followup + pop_scaled_prs + scaled_resid | Population, data = dat)
                              AFR                  AMR                  EAS                  EUR                  SAS                  Overall
                              (N=8426)             (N=707)              (N=2249)             (N=448106)           (N=9085)             (N=468573)
Sex
  Female                      4914 (58.3%)         472 (66.8%)          1550 (68.9%)         242705 (54.2%)       4184 (46.1%)         253825 (54.2%)
  Male                        3512 (41.7%)         235 (33.2%)          699 (31.1%)          205401 (45.8%)       4901 (53.9%)         214748 (45.8%)
LDL-C (adjusted for Statin)
  Mean (SD)                   129 (33.1)           142 (34.5)           135 (30.5)           142 (32.1)           135 (32.0)           141 (32.1)
  Median [Min, Max]           126 [32.6, 310]      137 [54.1, 318]      134 [48.7, 305]      140 [10.2, 457]      134 [33.2, 330]      139 [10.2, 457]
  Missing                     536 (6.4%)           27 (3.8%)            112 (5.0%)           21226 (4.7%)         487 (5.4%)           22388 (4.8%)
CAD Ever
  0                           8085 (96.0%)         681 (96.3%)          2183 (97.1%)         413490 (92.3%)       7883 (86.8%)         432322 (92.3%)
  1                           341 (4.0%)           26 (3.7%)            66 (2.9%)            34616 (7.7%)         1202 (13.2%)         36251 (7.7%)
Age Enrolled
  Mean (SD)                   52.4 (8.05)          52.5 (8.06)          52.9 (7.79)          57.3 (8.02)          54.0 (8.44)          57.1 (8.07)
  Median [Min, Max]           51.0 [39.2, 70.4]    51.3 [40.3, 70.3]    52.3 [40.2, 70.1]    58.6 [39.7, 73.7]    53.7 [40.1, 72.1]    58.4 [39.2, 73.7]
followup
  Mean (SD)                   12.6 (3.09)          12.6 (3.09)          12.8 (2.77)          12.0 (4.64)          11.3 (5.66)          12.0 (4.63)
  Median [Min, Max]           13.0 [-45.1, 15.5]   13.1 [-17.9, 15.5]   13.2 [-17.1, 15.5]   13.3 [-67.8, 15.9]   13.0 [-37.2, 15.5]   13.2 [-67.8, 15.9]
Pop Scaled Residual PRS
  Mean (SD)                   0.0000641 (1.00)     -0.0000424 (1.00)    -0.0000889 (1.00)    0.000000937 (1.00)   0.0000209 (1.00)     0.00000196 (1.00)
  Median [Min, Max]           0.0800 [-4.89, 3.52] 0.0500 [-3.84, 2.92] 0.0900 [-4.05, 2.76] 0.0300 [-4.90, 4.23] 0.0300 [-4.31, 3.22] 0.0300 [-4.90, 4.23]
Overall Scaled Residual PRS
  Mean (SD)                   0.0404 (1.10)        -0.279 (1.02)        0.0307 (0.890)       -0.00121 (1.00)      0.0365 (0.723)       -7.51e-17 (1.00)
  Median [Min, Max]           0.124 [-5.35, 3.92]  -0.223 [-4.20, 2.70] 0.112 [-3.57, 2.48]  0.0277 [-4.92, 4.24] 0.0606 [-3.08, 2.37] 0.0303 [-5.35, 4.24]

Impact of PRS

To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (rescaled to a 40 mg/dL change), and then between the rescaled LDL-C PRS and CAD outcomes. This approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach this analysis for each population, e.g., European (EUR) and African (AFR):

Step 1: Associate LDL-C PRS with LDL-C Levels (per 40 mg/dL)

  1. Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.

  2. Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from this model to reflect a 40 mg/dL change in LDL-C.

Step 2: Associate Rescaled LDL-C PRS with CAD

  1. Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.

  2. Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.

Here’s an example in R; the loop below runs the same steps for each population:

Code
library(dplyr)
library(survival)

# Before the loop
results <- data.frame(
  Population = character(),
  PRS_Mean = numeric(),
  PRS_SD = numeric(),
  Percent_male = numeric(),
  LDL_Coef_CI_Lower = numeric(),
  LDL_Coef_CI_Upper = numeric(),
  Population_Scale_Factor = numeric(),
  LDL_Scaled_Coef_CI_Lower = numeric(),
  LDL_Scaled_Coef_CI_Upper = numeric(),
  HR=numeric(),
  HR_CI_Lower = numeric(),
  HR_CI_Upper = numeric(),
  OR = numeric(),
  OR_LCI = numeric(),
  OR_UCI = numeric(),
  stringsAsFactors = FALSE
)

# Assuming 'df' is your dataset

# List of unique populations
populations <- c("AFR","SAS","EUR","AMR","EAS")
library(broom)
library(dplyr)
# Initialize an empty data frame to store results

# Initialize an empty list for storing models and results
cox_models <- list()
glm_models = list()



for(pop in populations) {
  # Subset data for the specific population
  pop_data=totaldata[totaldata$rf80%in%pop,]
  

   pop_data$PRS=pop_data$pop_scaled_prs

  
  # Step 1: Linear Regression for LDL-C Levels
# Assuming 'pop' is defined elsewhere and 'pop_data' is your dataset
#if (pop == "EUR") {
    ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)
#} else {
#  pop_data$PRS=-1*pop_data$PRS
#    ldl_model <- lm(ldl_adj ~  PRS + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + #PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)
#}
    ldl_coef_ci <- confint(ldl_model, "PRS")

#print(paste("LDL_coef_CI for population:", pop))
if ("PRS" %in% names(coef(ldl_model))) {
    #print(confint(ldl_model, parm = "PRS"))
} else {
    #print("PRS coefficient is not available in the model.")
}


  
  # Calculate the PRS scaling factor
  prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
  #print(paste0(pop,":",prs_scale_factor))
  #prs_scale_factor=1
  # Scale the LDL-C PRS so that one unit of the scaled score corresponds to a 40 mg/dL increase
  pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
  
  
    ldl_model_2 <- lm(ldl_adj ~ scaled_ldl_prs + as.numeric(enrollage) + as.factor(f.31.0.0) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = pop_data)
    
  ldl_scaled_coef_ci <- confint(ldl_model_2, "scaled_ldl_prs")

  #print(summary(pop_data$scaled_ldl_prs))
  # Step 2: Cox Model for CAD -- inherently accounting for age by using age as the time scale
  cox_model <- coxph(Surv(Cad_0_censor_age, Cad_Sta) ~ scaled_ldl_prs
                     +as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)
  #cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial")
  cox_models[[pop]] <- cox_model
  
  
    

    glm_model <- glm(Cad_Sta ~ scaled_ldl_prs+as.numeric(enrollage) 
                     + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data,family = "binomial")
  glm_models[[pop]] <- glm_model
      log_or = summary(glm_model)$coefficients["scaled_ldl_prs", "Estimate"]
  log_or_se = summary(glm_model)$coefficients["scaled_ldl_prs", "Std. Error"]
  or = round(exp(log_or), 2)
  lci = round(exp(log_or - 1.96 * log_or_se), 2)
  uci = round(exp(log_or + 1.96 * log_or_se), 2)
  
  glmci= confint(glm_model)
  
    # Extract summary stats
  cox_summary <- summary(cox_model)
  glm_summary <- summary(glm_model)
  
  # Compile results into the data frame
  # Append to results
  results <- rbind(results, data.frame(
    Population = pop,
    PRS_Mean = mean(pop_data$PRS),
    PRS_SD = sd(pop_data$PRS),
    Percent_male = mean(pop_data$f.31.0.0==1),
    LDL_Coef_CI_Lower = ldl_coef_ci[1],
    LDL_Coef_CI_Upper = ldl_coef_ci[2],
    Population_Scale_Factor = prs_scale_factor,
    LDL_Scaled_Coef_CI_Lower = ldl_scaled_coef_ci[1], # Adjust according to your calculations
    LDL_Scaled_Coef_CI_Upper = ldl_scaled_coef_ci[2],
    HR = cox_summary$conf.int["scaled_ldl_prs", 1],
    HR_CI_Lower = cox_summary$conf.int["scaled_ldl_prs", 3], # Assuming 'cox_summary' is properly defined
    HR_CI_Upper = cox_summary$conf.int["scaled_ldl_prs", 4], # Assuming 'cox_summary' is properly defined
    OR = or,
    #OR_LCI = lci,
    #OR_UCI = uci
    
    OR_LCI = exp(glmci['scaled_ldl_prs',1]),
    OR_UCI = exp(glmci['scaled_ldl_prs',2])
))}
Waiting for profiling to be done...
Waiting for profiling to be done...
Waiting for profiling to be done...
Waiting for profiling to be done...
Waiting for profiling to be done...
Code
results
     Population      PRS_Mean PRS_SD Percent_male LDL_Coef_CI_Lower
PRS         AFR  1.532519e-16      1    0.4168051         12.114591
PRS1        SAS -2.932895e-18      1    0.5394606          8.154680
PRS2        EUR -1.460787e-17      1    0.4583759         11.582390
PRS3        AMR -1.191880e-16      1    0.3323904          8.792547
PRS4        EAS  9.320147e-17      1    0.3108048          8.997966
     LDL_Coef_CI_Upper Population_Scale_Factor LDL_Scaled_Coef_CI_Lower
PRS          13.506436                3.122435                 37.82703
PRS1          9.495848                4.532442                 36.96062
PRS2         11.759772                3.427275                 39.69603
PRS3         13.885337                3.527666                 31.01717
PRS4         11.405132                3.920973                 35.28078
     LDL_Scaled_Coef_CI_Upper        HR HR_CI_Lower HR_CI_Upper   OR    OR_LCI
PRS                  42.17297 1.3704630   0.9646008    1.947095 1.40 0.9793945
PRS1                 43.03938 1.8026748   1.3748214    2.363679 1.93 1.4216083
PRS2                 40.30397 1.6971991   1.6362720    1.760395 1.75 1.6809276
PRS3                 48.98283 3.3969733   0.8091850   14.260555 2.43 0.5411120
PRS4                 44.71922 0.9217767   0.3597363    2.361931 0.90 0.3414912
        OR_UCI
PRS   2.020129
PRS1  2.619395
PRS2  1.818275
PRS3 11.516530
PRS4  2.440573
Code
saveRDS(results,'results_UKB.rds')
# Access and summarize the Cox models for each population
#lapply(cox_models, function(x){exp(coef(x)["scaled_ldl_prs"])})

Now let’s look at a meta-analysis:

Code
# Create a data frame to store the results
results <- data.frame(Population = character(),
                      HR = numeric(),
                      Lower = numeric(),
                      Upper = numeric(),
                      stringsAsFactors = FALSE)

 #Extract the results from each model
 for (pop in names(cox_models)) {
   model_summary <- summary(cox_models[[pop]])
   hr <- model_summary$coefficients["scaled_ldl_prs", "exp(coef)"]
  lower <- model_summary$conf.int["scaled_ldl_prs","lower .95"]
  upper <- model_summary$conf.int["scaled_ldl_prs","upper .95"]

  # Add to the results data frame
  results <- rbind(results, data.frame(Population = pop, HR = hr, Lower = lower, Upper = upper))
}

ggplot(results, aes(x = Population, y = HR, ymin = Lower, ymax = Upper,color=Population)) +
  geom_point() + # Add points for hazard ratios
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.1) + # Add error bars
  coord_flip() + # Flip coordinates for a horizontal plot
  xlab("Population") +
  ylab("Hazard Ratio (HR)") +
  ggtitle("HR for LDL-C_40 on CAD across Populations:UKB") +
  theme_minimal()

And now with OR/GLM:

Code
#lapply(glm_models, function(x){exp(coef(x)["scaled_ldl_prs"])})



# 
results <- data.frame(Population = character(),
                      OR = numeric(),
                      Lower = numeric(),
                      Upper = numeric(),
                      stringsAsFactors = FALSE)

for (pop in names(cox_models)) {
    model_summary <- tidy(glm_models[[pop]], conf.int = TRUE, exponentiate = TRUE)

    if ("scaled_ldl_prs" %in% model_summary$term) {
        term_row <- model_summary %>% filter(term == "scaled_ldl_prs")

        results <- rbind(results, data.frame(Population = pop,
                                             OR = term_row$estimate,
                                             Lower = term_row$conf.low,
                                             Upper = term_row$conf.high))
    } else {
        results <- rbind(results, data.frame(Population = pop, OR = NA, Lower = NA, Upper = NA))
    }
}
# # Create the forest plot using ggplot2
# 
ggplot(results, aes(x = Population, y = OR, ymin = Lower, ymax = Upper,color=Population)) +
    geom_point() +
    geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.1) +
    coord_flip()+
    xlab("Population") +
    ylab("Odds Ratio (OR)") +
  ggtitle("OR for LDL-C PRS_40 on CAD:UKB")  +theme_classic()

Bayesian Meta-analysis

Now we want to take a two-step Bayesian approach: first calculate a global mean hazard ratio (HR) or odds ratio (OR) from the population-specific estimates, then use this global mean as a prior in a Bayesian adjustment of the population-specific estimates. In this case, weighting the global mean by the standard error (or variance) of each population-specific estimate is a good approach, as it accounts for the different levels of uncertainty in each population estimate.

  1. Calculate Weighted Global Mean HR/OR:

    • For each population, you have an HR/OR and its standard error (SE).

    • You can weight the HR/OR by the inverse of the variance (which is SE²).

    • The weighted mean HR/OR is then the sum of each weighted HR/OR divided by the sum of the weights.

  2. Use Weighted Mean as Prior for Bayesian Adjustment:

    • Use the weighted global mean HR/OR as the mean for your prior distribution in the Bayesian model for each population.

    • The variance of the prior could be based on the pooled standard errors or a measure of the variability of the HR/OR across populations.

Variance and Weights

The variance of each log-transformed odds ratio (OR) can be approximated from the confidence interval:

\[ \text{Variance} \approx \left( \frac{\log(\text{Upper CI}) - \log(\text{Lower CI})}{2 \times 1.96} \right)^2 \]

Based on this variance, the weight for each observation is:

\[ w_i = \frac{1}{\text{Variance}_i} \]

Weighted Mean

The weighted mean is then calculated as:

\[ \text{Weighted Mean} = \frac{\sum_{i=1}^{n} w_i x_i}{\sum_{i=1}^{n} w_i} \]

where \[ x_i \] is the log(OR) for each observation \[ i \].

Weighted Variance

The weighted variance is given by:

\[ \text{Weighted Variance} = \frac{\sum_{i=1}^{n} w_i}{\left( \sum_{i=1}^{n} w_i \right)^2 - \sum_{i=1}^{n} w_i^2} \sum_{i=1}^{n} w_i (x_i - \text{Weighted Mean})^2 \]

Here, \[ \sum_{i=1}^{n} w_i \] is the sum of the weights, \[ \sum_{i=1}^{n} w_i^2 \] is the sum of the squared weights, and \[ x_i \] is each individual log-transformed OR.
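
The per-population adjustment in the code below is then the standard normal–normal precision-weighted update (a sketch of the conjugate formula, where \[ \mu_0 \] and \[ \sigma_0^2 \] are the weighted prior mean and variance, and \[ x_i \] and \[ \text{Variance}_i \] are each population's log(OR) and its variance):

\[ \text{Posterior Variance}_i = \left( \frac{1}{\sigma_0^2} + \frac{1}{\text{Variance}_i} \right)^{-1}, \qquad \text{Posterior Mean}_i = \text{Posterior Variance}_i \left( \frac{\mu_0}{\sigma_0^2} + \frac{x_i}{\text{Variance}_i} \right) \]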

Code
# Assuming you have a dataframe 'results' with columns 'Population', 'OR', 'Lower', and 'Upper'

# Calculate variance and weights
results <- results %>%
  mutate(
    log_OR = log(OR),
    log_Lower = log(Lower),
    log_Upper = log(Upper),
    Variance = ((log_Upper - log_Lower) / (2 * 1.96))^2,
    Weight = 1 / Variance
  )


# Calculate weighted mean and variance for the prior
sum_weights <- sum(results$Weight)
sum_squared_weights <- sum(results$Weight^2)

weighted_mean_log_OR <- sum(results$log_OR * results$Weight) / sum_weights
weighted_variance <- (sum_weights / (sum_weights^2 - sum_squared_weights)) * 
                     sum(results$Weight * (results$log_OR - weighted_mean_log_OR)^2)

prior_mean <- weighted_mean_log_OR
prior_variance <- weighted_variance

# Perform Bayesian updating for each population
results <- results %>%
  mutate(
    Posterior_Variance = 1 / (1 / prior_variance + 1 / Variance),
    Posterior_Mean = Posterior_Variance * (prior_mean / prior_variance + log_OR / Variance)
  )

# Exponentiating the posterior mean to get back to the OR scale
results$Posterior_OR <- exp(results$Posterior_Mean)

results <- results %>%
  mutate(
    Posterior_Lower_Log = Posterior_Mean - 1.96 * sqrt(Posterior_Variance),
    Posterior_Upper_Log = Posterior_Mean + 1.96 * sqrt(Posterior_Variance),
    Posterior_Lower_OR = exp(Posterior_Lower_Log),
    Posterior_Upper_OR = exp(Posterior_Upper_Log)
  )
# Now 'results' contains the posterior mean and variance for each population
print(results)
  Population        OR     Lower     Upper     log_OR   log_Lower log_Upper
1        AFR 1.4025052 0.9793945  2.020129  0.3382601 -0.02082079 0.7031616
2        SAS 1.9280613 1.4216083  2.619395  0.6565150  0.35178885 0.9629436
3        EUR 1.7482240 1.6809276  1.818275  0.5586004  0.51934580 0.5978884
4        AMR 2.4325701 0.5411120 11.516530  0.8889483 -0.61412902 2.4437834
5        EAS 0.8964945 0.3414912  2.440573 -0.1092631 -1.07443323 0.8922330
     Variance      Weight Posterior_Variance Posterior_Mean Posterior_OR
1 0.034110165   29.316774       0.0145130506      0.4638344     1.590160
2 0.024306935   41.140523       0.0123873908      0.6076324     1.836079
3 0.000401456 2490.932910       0.0003951758      0.5585727     1.748176
4 0.608524320    1.643320       0.0242541561      0.5700684     1.768388
5 0.251703457    3.972929       0.0229570221      0.4960789     1.642269
  Posterior_Lower_Log Posterior_Upper_Log Posterior_Lower_OR Posterior_Upper_OR
1           0.2277129           0.6999558           1.255725           2.013664
2           0.3894870           0.8257777           1.476223           2.283656
3           0.5196098           0.5975356           1.681371           1.817634
4           0.2648229           0.8753138           1.303200           2.399628
5           0.1991080           0.7930497           1.220314           2.210126

Forest Plot

We can then plot the Bayesian meta-analysis estimates alongside the raw ORs:

Code
library(dplyr)
library(ggplot2)
library(forestplot)
Loading required package: grid
Loading required package: checkmate
Loading required package: abind
Code
library(forestplot)
library(ggplot2)
library(dplyr)

# Assuming 'results' is already loaded with your data

# We need to create a new data frame for ggplot
# First, we create two separate data frames for original and posterior results and then bind them together
# plot_data <- results %>%
#   select(Population, OR, Lower, Upper, Posterior_OR, Posterior_Lower_OR, Posterior_Upper_OR) %>%
#   gather(key = "Type", value = "Value", -Population, -Lower, -Upper, -Posterior_Lower_OR, -Posterior_Upper_OR) %>%
#   mutate(
#     ymin = ifelse(Type == "OR", Lower, Posterior_Lower_OR),
#     ymax = ifelse(Type == "OR", Upper, Posterior_Upper_OR),
#     Type = factor(Type, levels = c("OR", "Posterior_OR"))
#   )
# 
# # Now we plot with ggplot2
# ggplot(plot_data, aes(y = Population, x = Value, xmin = ymin, xmax = ymax, color = Type)) +
#   geom_errorbarh(height = 0.2) +
#   geom_point(size = 2.5) +
#   scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
#   theme_minimal() + lims(x=c(0.5,2.5))+
#   labs(x = "Odds Ratio", y = "", color = "Estimate Type") +
#   theme(legend.position = "bottom")



# Adding an identifier for each row
results$ID <- seq_along(results$Population)

# Create a long-format data frame for plotting
plot_data <- tidyr::pivot_longer(
  results, 
  cols = c("OR", "Posterior_OR"), 
  names_to = "Estimate_Type", 
  values_to = "Value"
) %>%
  mutate(
    ymin = ifelse(Estimate_Type == "OR", Lower, Posterior_Lower_OR),
    ymax = ifelse(Estimate_Type == "OR", Upper, Posterior_Upper_OR),
    Estimate_Type = factor(Estimate_Type, levels = c("OR", "Posterior_OR")),
    # Adjust the y position of the posterior estimates
    ypos = as.numeric(ID) + ifelse(Estimate_Type == "OR", -0.1, 0.1)
  )

# Create the plot
ggplot(plot_data, aes(x = Value, y = ypos, color = Estimate_Type)) +
  geom_point(size = 3) +
  geom_errorbar(aes(xmin = ymin, xmax = ymax), width = 0.2) +
  scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
  labs(x = "Odds Ratio", y = "") +
  theme_minimal() +
  theme(axis.text.y = element_text(vjust = 0.5), 
        axis.ticks.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "bottom") +
  scale_y_continuous(breaks = 1:length(unique(plot_data$ID)), labels = results$Population) 

We can do this in Stan (via brms) and calculate heterogeneity as well:

Code
library(brms)

results <- results %>%
mutate(se_log_OR = 1 / sqrt(Weight))

# Given prior study's log OR and its standard error
prior_log_or <- log(1/0.76)  # log effect size (OR) from the prior study
prior_sd <- (log(1/0.64) - log(1/0.91)) / (2 * 1.96)  # standard error of the log OR


# or try this:
#   
# rosuvastatin_with_event <- 235
# rosuvastatin_without_event <- 6361 - rosuvastatin_with_event
# 
# placebo_with_event <- 304
# placebo_without_event <- 6344 - placebo_with_event
# 
# # Compute the OR
# odds_case <- rosuvastatin_with_event / rosuvastatin_without_event
# odds_control <- placebo_with_event / placebo_without_event
# OR <- odds_control / odds_case
# 
# # Convert OR to log scale for the prior
# prior_log_or <- log(OR)
# 
# # Calculate standard error of the log OR
# prior_log_or_se <- sqrt(1 / placebo_with_event +
#                         1 / placebo_without_event +
#                         1 / rosuvastatin_with_event +
#                         1 / rosuvastatin_without_event)
# 
# # Print the prior log OR and its standard error
# print(prior_log_or)
# print(prior_log_or_se)
# 
# # Print the OR
# print(OR)
# # Define the priors based on the prior study for the global effect size
#prior_effect <- set_prior(paste("normal(", prior_log_or, ", ", prior_log_or_se, ")"), class = "Intercept")

# Alternative prior centered on the inverse-variance weighted mean computed above
prior_effect <- set_prior(paste("normal(", weighted_mean_log_OR, ", ", sqrt(weighted_variance), ")"), class = "Intercept")

# Define a non-informative prior for the heterogeneity
prior_heterogeneity <- set_prior("student_t(3, 0, 10)", class = "sd", group = "Population")

# use prior_sd defined above (prior_log_or_se exists only in the commented-out block)
prior_global <- set_prior(paste0("normal(", prior_log_or, ", ", prior_sd, ")"), class = "Intercept")

# Fit the model
meta_analysis_model <- brm(
  bf(log_OR | se(se_log_OR) ~ 1 + (1 | Population)),
  data = results,
  prior = c(prior_global, prior_heterogeneity),
  family = gaussian(),
  chains = 4,
  iter = 4000,
  warmup = 2000,
  control = list(adapt_delta = 0.999999) # adjust for convergence
)

# Check the summary of the model
summary(meta_analysis_model)

# Extract population-specific effects
ranef(meta_analysis_model)

global_effect= fixef(meta_analysis_model)[1,1]

# Get the random effects (population-specific deviations)
population_effects <- ranef(meta_analysis_model)

# Calculate the population-specific log ORs by adding the global effect to the random effects
population_specific_log_ORs <- global_effect + population_effects$Population[,,"Intercept"][,"Estimate"]

# Convert the log ORs to ORs
population_specific_ORs <- exp(population_specific_log_ORs)

# Now you have the population-specific ORs
print(population_specific_ORs)

Heterogeneity assessment:

Code
# Extract the standard deviation of the random effects
heterogeneity_estimate <- VarCorr(meta_analysis_model)$Population$sd

# Print the heterogeneity estimate
print(heterogeneity_estimate)

# Conduct posterior predictive checks
pp_check(meta_analysis_model)

# If you want to plot the distribution of the standard deviation of the random effects
posterior_samples <- posterior_samples(meta_analysis_model, pars = "sd_Population__Intercept")
hist(posterior_samples[, "sd_Population__Intercept"], breaks = 40,
     main = "Distribution of Heterogeneity (sd Population)",
     xlab = "Heterogeneity (SD)", col = "blue")

# Summarize the posterior distribution of the heterogeneity parameter
posterior_summary <- summary(posterior_samples[, "sd_Population__Intercept"])
print(posterior_summary)