################################################################################################################## ## source('XHWE.txt') ## ## XHWE("ped.txt",loci=NULL,header=T,filename="results.txt") ## ## ## ## $Tstat ## ## LRT0 LRT1 LRT2 Z0 Z1 Z2 ## ## SNP1_1 0.8464845 0.6416829 0.2024129 0.8726769 0.6434745 0.2292024 ## ## SNP2_1 17.2407287 12.8338786 4.1913076 18.2702870 13.9685528 4.3017342 ## ## ## ## $Pvalue ## ## LRT0 LRT0b LRT1 LRT2 LRT2b Z0 Z1 Z2 ## ## SNP1_1 6.5492e-01 4.5700e-01 4.2310e-01 6.5278e-01 3.3300e-01 6.4640e-01 4.2246e-01 6.3212e-01 ## ## SNP2_1 1.8039e-04 0.0000e+00 3.4040e-04 4.0632e-02 2.1000e-02 1.0781e-04 1.8589e-04 3.8074e-02 ## ## ## ## $Estimates_H1 ## ## pm pf rho ## ## SNP1_1 0.53 0.4866667 0.02597213 ## ## SNP2_1 0.33 0.5233333 0.11807890 ## ## ## ## $Estimates_H01 ## ## p rho ## ## SNP1_1 0.4929953 0.02613709 ## ## SNP2_1 0.4928944 0.12117841 ## ## ## ## $Estimates_H02 ## ## pm pf ## ## SNP1_1 0.53 0.4866667 ## ## SNP2_1 0.33 0.5233333 ## ## ## ## $Estimates_H0 ## ## p ## ## SNP1_1 0.4928571 ## ## SNP2_1 0.4957143 ## ################################################################################################################## # # This code contains the likelihood ratio tests for Hardy-Weinberg equilibrium (HWE) at SNP markers on X chromosome # # The code only uses all the founders with the genotypes available from the input pedigree file # # The results may be different due to the parametric bootstrap techniques # # ped: the name of a pedigree file or a matrix/dataframe containing the pedigrees # the 5th column is the gender (1 = male, 2 = female, 0 = missing) # the 6th column is the affection status (1 = unaffected, 2 = affected) # from the 7th column, every consecutive two columns are the alleles at a SNP marker # for males, these two alleles at a SNP marker are the same, because males have only one X chromosome # ped is required # # loci: the name of a loci file # note that only SNP markers are suitable for the code # for each SNP locus, there are two alleles 1 and 2 # # header: if ped contains variable names, then set header = True (or T) # the default value is True (or T) # # status_missing: the input variable "status_missing" is the missing value for the affection status in the data files, # and the default value is 9 # it can take NA, but cannot take any other string values # # allele_missing: the input variable "allele_missing" represents the missing value for the allele # it may be 9 in some data files; or other numeric value; the default value is 0. # it cannot be NA or string values. # # # start.rho: the initial value of the inbreeding coefficient for iterations, which should be taken to be # larger than 0. The default value is 0.02. # # simuno: the number of bootstrap replications; the default value is 1000. # # dv: convergence criterion. The default value is 1e-7. # # itertime: the maximum number of iterations. The default value is 1000. # # filename: if the "filename" is assigned (should be text file, and the file name should be embraced by double # quotation marks, then the output results will be saved in the text file in the current directory. # Otherwise, the ouput results will be automatically saved in the "results.txt" file. # # In case of multiple markers, each marker will be analyzed seperately. # # References: # Xiao-Ping You, Qi-Lei Zou, Jian-Long Li and Ji-Yuan Zhou. Likelihood Ratio Tests for Hardy-Weinberg Equilibrium at Marker Loci on X Chromosome. submitted. # Gang Zheng, Jungnam Joo, Chun Zhang and Nancy L. Geller. Testing association for markers on the X chromosome. Genetic Epidemiology, 2007, 31(8): 834--843. `XHWE` <- function(ped,loci=NULL,header=T,status_missing=9,allele_missing=0,start.rho=0.02,simuno=1000,dv=1e-7,itertime=1000,filename="results.txt") { marker.name <- NULL # read in pedigree file if it is provided if (is.character(ped)) { pedfile <- ped ped <- read.table(pedfile,header=header,stringsAsFactors=FALSE) n.loci <- (dim(ped)[2] - 6) / 2 n.ind <- dim(ped)[1] if(header==F){names(ped)[5+2*(1:n.loci)] <- paste("SNP",1:n.loci)} marker.name <- names(ped)[5+2*(1:n.loci)] } else if(is.data.frame(ped)) { n.loci <- (dim(ped)[2] - 6) / 2 n.ind <- dim(ped)[1] names(ped)[5+2*(1:n.loci)] <- paste("SNP",1:n.loci) marker.name <- names(ped)[5+2*(1:n.loci)] # read in marker names from ped } else if(is.matrix(ped)) { n.loci <- (dim(ped)[2] - 6) / 2 n.ind <- dim(ped)[1] if(!is.null(dimnames(ped))) marker.name <- dimnames(ped)[[2]][5+2*(1:n.loci)] } if(is.na(status_missing)){ ped[which(ped[,6] == 1),6] <-0 ped[which(ped[,6] == 2),6] <-1 } else if (status_missing!=0){ ped[which(ped[,6] == 1),6] <-0 ped[which(ped[,6] == 2),6] <-1 } else if(status_missing==0){ ped[which(ped[,6] == 0),6] <-9 ped[which(ped[,6] == 1),6] <-0 ped[which(ped[,6] == 2),6] <-1 status_missing <- 9 } # read in loci file if it is provided if (is.character(loci)) { locifile <- loci loci.raw <- readLines(locifile) loci.raw2 <- strsplit(loci.raw, ",") for(i in 1:n.loci){ marker.name[i] <- loci.raw2[[1]][i] } } if (is.null(marker.name)) { marker.name <- paste("SNP",1:n.loci) } # Output Tstat <- matrix(0,ncol=6,nrow=n.loci) dimnames(Tstat) <- list(marker.name, c("LRT0","LRT1","LRT2","Z0","Z1","Z2")) pvalue <- matrix(0,ncol=8,nrow=n.loci) dimnames(pvalue) <- list(marker.name, c("LRT0","LRT0b","LRT1","LRT2","LRT2b","Z0","Z1","Z2")) Estimates_H1 <- matrix(0,ncol=3,nrow=n.loci) dimnames(Estimates_H1) <- list(marker.name, c("pm","pf","rho")) Estimates_H01 <- matrix(0,ncol=2,nrow=n.loci) dimnames(Estimates_H01) <- list(marker.name, c("p","rho")) Estimates_H02 <- matrix(0,ncol=2,nrow=n.loci) dimnames(Estimates_H02) <- list(marker.name, c("pm","pf")) Estimates_H0 <- matrix(0,ncol=1,nrow=n.loci) dimnames(Estimates_H0) <- list(marker.name, c("p")) family.id <- unique(ped[,1]) n.family <- length(family.id) select.fou <- which(ped[,3] == 0 & ped[,4] == 0 & ped[,7] != 0) ped <- ped[select.fou,] for(i in 1:n.loci){ j <- 2*i+5 result1 <- result(ped[,c(1:6,j:(j+1))],loci,dv,start.rho,simuno,status_missing,allele_missing,header,itertime,marker.name[i]) Tstat[i,1] <- result1[[1]]$LRT.test Tstat[i,2] <- result1[[1]]$LRT.test2 Tstat[i,3] <- result1[[1]]$LRT.test1 Tstat[i,4] <- result1[[1]]$z0 Tstat[i,5] <- result1[[1]]$z1 Tstat[i,6] <- result1[[1]]$z2 pvalue[i,1] <- result1[[2]]$Pvalue pvalue[i,2] <- result1[[2]]$Pvalue.boot pvalue[i,3] <- result1[[2]]$Pvalue2 pvalue[i,4] <- result1[[2]]$Pvalue1 pvalue[i,5] <- result1[[2]]$Pvalue1.boot pvalue[i,6] <- result1[[2]]$Pvalue.z0 pvalue[i,7] <- result1[[2]]$Pvalue.z1 pvalue[i,8] <- result1[[2]]$Pvalue.z2 Estimates_H1[i,1] <- result1[[3]]$pm Estimates_H1[i,2] <- result1[[3]]$pf1 Estimates_H1[i,3] <- result1[[3]]$rho1 Estimates_H01[i,1] <- result1[[3]]$p.01 Estimates_H01[i,2] <- result1[[3]]$rho.01 Estimates_H02[i,1] <- result1[[3]]$pm.02 Estimates_H02[i,2] <- result1[[3]]$pf.02 Estimates_H0[i,1] <- result1[[3]]$p.0 } pvalue1 <- noquote(format(pvalue, scientific = TRUE, digits=5)) results <- list(Tstat=Tstat,Pvalue=pvalue1,Estimates_H1=Estimates_H1,Estimates_H01=Estimates_H01,Estimates_H02=Estimates_H02,Estimates_H0=Estimates_H0) if (missing(filename)!=T){ sink(filename) print(results) sink() } print(results) } `result` <- function(ped,loci,dv,start.rho,simuno,status_missing,allele_missing,header,itertime,SNP_name) { n2f <- sum(ped[,5]==2 & ped[,7]==1 & ped[,8]==1) n1f <- sum((ped[,5]==2 & ped[,7]==1 & ped[,8]==2) | (ped[,5]==2 & ped[,7]==2 & ped[,8]==1)) n0f <- sum(ped[,5]==2 & ped[,7]==2 & ped[,8]==2) n1m <- sum(ped[,5]==1 & ped[,7]==1 ) n0m <- sum(ped[,5]==1 & ped[,7]==2 ) nf <- n2f + n1f + n0f nm <- n1m + n0m if(nm > 0 & nf < 0.1) { print(cbind(SNP_name,"Warning: all the female founders are missing!")) pm.male <- n1m / nm test.testa <- data.frame(LRT.test = 0, LRT.test2 = 0, LRT.test1 = 0, z0=0, z1=0, z2=0) test.pvalue <- data.frame(Pvalue = 1, Pvalue.boot = 1, Pvalue2 = 1, Pvalue1 = 1, Pvalue1.boot = 1, Pvalue.z0 = 1, Pvalue.z1 = 1, Pvalue.z2 = 1) test.para <- data.frame(p.0=NA,p.01=NA, rho.01=NA, pf.z.01=NA, rho.z.01=NA, pm.02=pm.male, pf.02=NA, pm=pm.male, pf1=NA, rho1=NA, pf=NA, rho.z=NA) } else if(nm < 0.1 & nf > 0) { print(cbind(SNP_name,"Warning: all the male founders are missing!")) ############################################# # Z2-stat with df being 1 nfT2 <- 2 * nf pf <- (n2f * 2 + n1f) / nfT2 pf2 <- (pf)^2 delta <- (n2f / nf) - pf2 pq <- pf * (1 - pf) Z2 <- sqrt(nf) * (delta + pq / nfT2) / pq z2 <- (Z2)^2 ############################################# rho.z <- delta/pq emf1 <- emf(0, 0, n2f, n1f, n0f, 0, nf, start.rho, dv, itertime) pf1.last <- emf1$pf.last rho1.last <- emf1$rho.last pf1qf1 <- pf1.last * (1 - pf1.last) tt1 <- rho1.last * pf1qf1 p2.e <- pf1.last^2 + tt1 p1.e <- 2 * (pf1qf1 - tt1) ############################################# # Chi-square test with df being 1 p.all <- pf p2.all <- p.all^2 p1.all <- 2 * p.all * (1 - p.all) LRT.num <- Likelihoodfun(0, p2.e, p1.e, 0, 0, n2f, n1f, n0f) LRT.den <- Likelihoodfun(0, p2.all, p1.all, 0, 0, n2f, n1f, n0f) LRT.test1 <- 2 * (LRT.num - LRT.den) ########################################################### #Z2 with df being 1 Pvalue.z2 <- pchisq(z2, df = 1,lower.tail=F) # Chi-square test with df being 1 for rho=0 Pvalue1 <- pchisq(LRT.test1, df = 1,lower.tail=F) test.testa <- data.frame(LRT.test = 0, LRT.test2 = 0, LRT.test1 = LRT.test1, z0=0, z1=0, z2=z2) test.pvalue <- data.frame(Pvalue = 1, Pvalue.boot = 1, Pvalue2 = 1, Pvalue1 = Pvalue1, Pvalue1.boot = 1, Pvalue.z0 = 1, Pvalue.z1 = 1, Pvalue.z2 = Pvalue.z2) test.para <- data.frame(p.0=NA, p.01=NA, rho.01=NA, pf.z.01=NA, rho.z.01=rho.z, pm.02=NA, pf.02=pf, pm=NA, pf1=pf1.last, rho1=rho1.last, pf=pf, rho.z=rho.z) } else if (nf > 0 & nm > 0) { LRT.array4 <- array(0, dim=c(simuno, 1)) LRT.array6 <- array(0, dim=c(simuno, 1)) ############################################# # Z1-stat with df being 1 nfT2 <- 2 * nf pf <- (n2f * 2 + n1f) / nfT2 pm <- n1m / nm pf2 <- (pf)^2 var1 <- pm * (1-pm) / nm var2 <- (pf - 2 * pf2 + n2f / nf) / nfT2 Z1 <- (pm - pf) / (sqrt(var1 + var2)) z1 <- (Z1)^2 ############################################# # Z2-stat with df being 1 delta <- (n2f / nf) - pf2 pq <- pf * (1 - pf) Z2 <- sqrt(nf) * (delta + pq / nfT2) / pq z2 <- (Z2)^2 ############################################# # Z0-stat with df being 2 z0 <- z1 + z2 rho.z <- delta/pq emf1 <- emf(n1m, n0m, n2f, n1f, n0f, nm, nf, start.rho, dv, itertime) pm1.last <- emf1$pm.last pf1.last <- emf1$pf.last rho1.last <- emf1$rho.last pf1qf1 <- pf1.last * (1 - pf1.last) tt1 <- rho1.last * pf1qf1 p2.e <- pf1.last^2 + tt1 p1.e <- 2 * (pf1qf1 - tt1) ############################################# # Chi-square test with df being 2 sum1 <- nm + nfT2 p.all <- (n1m + n2f * 2 + n1f)/sum1 p2.all <- p.all^2 p1.all <- 2 * p.all * (1 - p.all) LRT.num <- Likelihoodfun(pm1.last, p2.e, p1.e, n1m, n0m, n2f, n1f, n0f) LRT.den <- Likelihoodfun(p.all, p2.all, p1.all, n1m, n0m, n2f, n1f, n0f) LRT.test <- 2 * (LRT.num - LRT.den) ########################################################### # Chi-square test with df being 1 for rho = 0 pf1 <- 2 * pq LRT.den1 <- Likelihoodfun(pm, pf2, pf1, n1m, n0m, n2f, n1f, n0f) LRT.test1 <- 2 * (LRT.num - LRT.den1) ########################################################### # Chi-square test with df being 1 for pm = pf em2 <- emc(n1m, n0m, n2f, n1f, n0f, nm, nf, start.rho, dv, itertime) pc2.last <- em2$pc.last rho2.last <- em2$rho.last pcqc <- pc2.last * (1 - pc2.last) tt2 <- rho2.last * pcqc p2f2 <- pc2.last^2 + tt2 p1f2 <- 2 * (pcqc - tt2) LRT.den2 <- Likelihoodfun(pc2.last, p2f2, p1f2, n1m, n0m, n2f, n1f, n0f) LRT.test2 <- 2 * (LRT.num - LRT.den2) ########################################################### #BOOTSTRAP #rho=0 and pm=pf p0.all <- 1 - p2.all - p1.all pf0 <- 1 - pf2 - pf1 nm.all <- array(0,dim=c(simuno,2)) nm.a0 <- rbinom(simuno,nm,p.all) nm.all[,1] <- nm.a0 nm.all[,2] <- nm - nm.a0 nf.all <- rmultinom(simuno, size = nf, prob = c(p2.all,p1.all,p0.all)) n.all <- cbind(nm.all,t(nf.all)) nf.all02 <- rmultinom(simuno, size = nf, prob = c(pf0,pf1,pf2)) nf.all2 <- t(nf.all02) for(j in 1:simuno){ n1ma<-n.all[j,1] n0ma<-n.all[j,2] n2f.a<-n.all[j,3] n1f.a<-n.all[j,4] n0f.a<-n.all[j,5] pfa <- (n1ma + n2f.a * 2 + n1f.a)/sum1 pf2a <- (pfa)^2 pf1a <- 2 * pfa * (1 - pfa) emfa <- emf(n1ma, n0ma, n2f.a, n1f.a, n0f.a, nm, nf, start.rho, dv, itertime) pma.last <- emfa$pm.last pfa.last <- emfa$pf.last rhoa.last <- emfa$rho.last pfaqfa <- pfa.last * (1 - pfa.last) tt3 <- rhoa.last * pfaqfa p2.aa <- pfa.last^2 + tt3 p1.aa <- 2 * (pfaqfa - tt3) LRT.dena <- Likelihoodfun(pfa, pf2a, pf1a, n1ma, n0ma, n2f.a, n1f.a, n0f.a) LRT.numa <- Likelihoodfun(pma.last, p2.aa, p1.aa, n1ma, n0ma, n2f.a, n1f.a, n0f.a) LRT.testa <- 2 * (LRT.numa - LRT.dena) LRT.array6[j,1] <- LRT.testa #rho=0 n0fb <-nf.all2[j,1] n1fb <-nf.all2[j,2] n2fb <-nf.all2[j,3] pfb <- (n2fb * 2 + n1fb) / nfT2 pf2b <- (pfb)^2 pf1b <- 2 * pfb * (1 - pfb) emfb <- emf(n1m, n0m, n2fb, n1fb, n0fb, nm, nf, start.rho, dv, itertime) pmb.last <- emfb$pm.last pfb.last <- emfb$pf.last rhob.last <- emfb$rho.last pfbqfb <- pfb.last * (1 - pfb.last) tt4 <- rhob.last * pfbqfb p2.b <- pfb.last^2 + tt4 p1.b <- 2 * (pfbqfb - tt4) LRT.numb <- Likelihoodfun(pmb.last, p2.b, p1.b, n1m, n0m, n2fb, n1fb, n0fb) LRT.denb <- Likelihoodfun(pm, pf2b, pf1b, n1m, n0m, n2fb, n1fb, n0fb) LRT.testb <- 2 * (LRT.numb - LRT.denb) LRT.array4[j,1] <- LRT.testb } #Z1 with df being 1 Pvalue.z1 <- pchisq(z1, df = 1,lower.tail=F) #Z2 with df being 1 Pvalue.z2 <- pchisq(z2, df = 1,lower.tail=F) #Z0 with df being 2 Pvalue.z0 <- pchisq(z0, df = 2,lower.tail=F) # Chi-square test with df being 2 Pvalue <- pchisq(LRT.test, df = 2, lower.tail=F) Pvalue.boot <- sum(LRT.array6[,1] >= LRT.test) / simuno # Chi-square test with df being 1 for rho=0 Pvalue1 <- pchisq(LRT.test1, df = 1,lower.tail=F) Pvalue1.boot <- sum(LRT.array4[,1] >= LRT.test1) / simuno # Chi-square test with df being 1 for pm = pf Pvalue2 <- pchisq(LRT.test2, df = 1,lower.tail=F) test.testa <- data.frame(LRT.test = LRT.test, LRT.test2 = LRT.test2, LRT.test1 = LRT.test1, z0=z0, z1=z1, z2=z2) test.pvalue <- data.frame(Pvalue = Pvalue, Pvalue.boot = Pvalue.boot, Pvalue2 = Pvalue2, Pvalue1 = Pvalue1, Pvalue1.boot = Pvalue1.boot, Pvalue.z0 = Pvalue.z0, Pvalue.z1 = Pvalue.z1, Pvalue.z2 = Pvalue.z2) test.para <- data.frame(p.0=p.all, p.01=pc2.last, rho.01=rho2.last, pf.z.01=p.all, rho.z.01=rho.z, pm.02=pm, pf.02=pf, pm=pm1.last, pf1=pf1.last, rho1=rho1.last, pf=pf, rho.z=rho.z) } else { test.testa <- data.frame(LRT.test = 0, LRT.test2 = 0, LRT.test1 = 0, z0=0, z1=0, z2=0) test.pvalue <- data.frame(Pvalue = 1, Pvalue.boot = 1, Pvalue2 = 1, Pvalue1 = 1, Pvalue1.boot = 1, Pvalue.z0 = 1, Pvalue.z1 = 1, Pvalue.z2 = 1) test.para <- data.frame(p.0=NA, p.01=NA, rho.01=NA, pf.z.01=NA, rho.z.01=NA, pm.02=NA, pf.02=NA, pm=NA, pf1=NA, rho1=NA, pf=NA, rho.z=NA) } list(test.testa, test.pvalue, test.para) } # Likelihood function Likelihoodfun <- function(p1m, p2f, p1f, n1m, n0m, n2f, n1f, n0f) { if (n1m != 0 & n0m != 0){ LH <- log(p1m) * n1m + log(1 - p1m) * n0m + log(p2f) * n2f + log(p1f) * n1f + log(1 - p2f - p1f) * n0f return(LH) } else{ LH <- log(p2f) * n2f + log(p1f) * n1f + log(1 - p2f - p1f) * n0f return(LH) } } # EM algorithm emf <- function(n1m, n0m, n2f, n1f, n0f, nm, nf, rho, dv, itertime) { iter <- 0 nfT2 <- 2 * nf pf <- (n2f * 2 + n1f) / nfT2 # the initial value of pf nonstop <- TRUE while(nonstop & (iter <= itertime)) { temp <- rho * pf * (1 - pf) pf.sq <- pf^2 qf.sq <- (1 - pf)^2 deno1 <- pf.sq + temp deno2 <- qf.sq + temp e1 <- n2f * pf.sq / deno1 e2 <- n2f * temp / deno1 e3 <- n0f * qf.sq / deno2 e4 <- n0f * temp / deno2 sume2e4 <- e4 + e2 trho <- sume2e4 / (sume2e4 + n1f) tpf <- (2 * e1 + n1f + sume2e4) / nfT2 nonstop <- (abs(trho - rho) > dv | abs(tpf - pf)> dv) if(is.nan(nonstop)|is.na(nonstop)) break if(nonstop & (iter <= itertime)) { rho <- trho pf <- tpf iter <- iter + 1 } } rho.last <- rho pf.last <- pf if (nm == 0){ list(rho.last = rho.last, pf.last = pf.last) } else{ pm.last <- n1m / nm list(rho.last = rho.last,pm.last = pm.last,pf.last = pf.last) } } # EM algorithm for pm = pf emc <- function(n1m, n0m, n2f, n1f, n0f, nm, nf, rho, dv, itertime) { sum.all <- nm + 2 * nf pc <- (n1m + n2f * 2 + n1f)/sum.all # the initial value of pc iter <- 0 nonstop <- TRUE while(nonstop & (iter <= itertime)) { temp <- rho * pc * (1 - pc) pc.sq <- pc^2 qc.sq <- (1 - pc)^2 deno1 <- pc.sq + temp deno2 <- qc.sq + temp e1 <- n2f * pc.sq / deno1 e2 <- n2f * temp / deno1 e3 <- n0f * qc.sq / deno2 e4 <- n0f * temp / deno2 sume2e4 <- e4 + e2 trho <- sume2e4 / (sume2e4 + n1f) tpc <- (2 * e1 + n1f + sume2e4 + n1m) / sum.all nonstop <- (abs(trho - rho) > dv | abs(tpc - pc)> dv) if(is.nan(nonstop)|is.na(nonstop)) break if(nonstop & (iter <= itertime)) { rho <- trho pc <- tpc iter <- iter + 1 } } rho.last <- rho pc.last <- pc list(rho.last = rho.last, pc.last = pc.last) }