Step 5 Conventional p-value procedure
Independent variable: this SNP’s minor allele count for everyone
Dependent variable: disease status for everyone
5.1 Conduct logistics regression on each SNP
# save odds ratio&p-values for later use
# Note: or is abbrev. for odds_ratio
p = c()
or = c()
for (col in (1:ncol(X))){
#short for SNP minor allele count
SNPmac = as.vector(X[,col])
#convert status into a vector
statusv = as.vector(status)
#fit a logistic regression model
model.glm <- glm(statusv ~ SNPmac, family = 'binomial')
#beta0 is the intercept of the logistic regression
beta0 = summary(model.glm)$coefficients[1,1]
#beta1 is the coeff of increment of 1 minor allele's effect on the log(p/1-p) = log(odds_ratio)
beta1 = summary(model.glm)$coefficients[2,1]
or = cbind(or,exp(beta1))
p = cbind(p, summary(model.glm)$coefficients[2,4])
}
or = as.vector(or)
p = as.vector(p)# Remember how many non-monomorphic SNPs are we using right now? Make sure we have an odds ratio and a p-value for each of them
# (i.e. The number of odds ratio and p-value should match the number of non-monomorphic SNPs)
length(or)## [1] 3582
length(p)## [1] 3582
# Great! Let's first check the estimated effect of our 4 causal SNPs
## Recall that the we set the 10th, 11th, 2000th, 3000th SNPs as causal SNPs
## Recall that we set their odds_ratio (effect size) to be log(1.5)
## Let's see if the logistics regressions can detect that
or[causal_idx]## [1] 303531777 84118113 67853034 92488570
# OMG! Really big odds ratios. Let's check other SNPs' odds ratios. Are they much smaller than the odds ratios of causal SNPs?
or[1000]## [1] 3.778327
or[1500]## [1] 1.652893
or[3500]## [1] 0.9224015
# exclude monomorphic SNPs
useful_snps = hapmap$map[-mono,]
# build a dataframe with snp.name, chromosome, position, p-values, then remove the causal SNPs by their row index
df = data.frame(snp.name = useful_snps$snp.name, chromosome = useful_snps$chromosome, position = useful_snps$position, pvalue = p, odds_ratio = or, z_value = z) %>% filter(!row_number() %in% causal_idx)
# preview the dataframe
head(df)