#################################################################################################################### ## ## ## R Code for Haplotype-Based HWE Tests ## ## ## #################################################################################################################### ## ## ## ## #################################################################################################################### ## > source("HAP-HWE.txt") ## ## > HAPHWE("ped.txt",header=T,missing=0,blockfile="block.txt",filename="results.txt") ## #################################################################################################################### ## Input variables: ## ## ## ## ped: The name of a standard linkage pedigree file or a matrix/dataframe containing pedigree relationship, ## ## genotype, and phenotype information, one row for each individual. The first 5 columns give the ## ## pedigree id, individual id, father id (0 if founder), mother id (0 if founder), and sex (1=male, ## ## 2=female), in that order. Note that these fields need to be numeric. The sixth column gives the ## ## affection status (1=unaffected; 2=affected; 0=unknown). The remaining columns code for marker genotypes ## ## (2 column per SNP), with the two alleles for each marker represented by two consecutive numbers ## ## (0=missing). The HAP-HWE software will automatically exclude the nonfounders from the analysis. ## ## ## ## header: If the data file has header, then it should be taken as T (or TRUE), otherwise F (or FALSE). ## ## The default value is T. ## ## ## ## missing: The missing value for alleles.The default value is 0. It could take any numeric value, except for ## ## 1, 2, NA and any other string value. If there is any missing values at some loci for an individual, ## ## then the individual will be excluded from the analysis. ## ## ## ## blockfile: haplotype block file with each row representing a haplotype block, the first column representing ## ## the ID of haplotype block and the columns after the first column representing the marker sequential ## ## numbers included in the haplotype block. If the "blockfile" is assigned, then the HAP-HWE software ## ## analyzes the haplotype blocks one by one. Otherwise, the HAP-HWE software will simutaneously ## ## analyze all the markers in the input file ("ped.txt") as a whole haplotype block. The default value ## ## of the "blockfile" is NULL. ## ## ## ## filename: If the "filename" is assigned (should be text file, and the file name should be embraced by double ## ## quotation marks, then the output results will be saved in the text file in the current directory. ## ## Otherwise, the ouput results will be automatically saved in the "results.txt" file. ## ## ## ## start.ratio.ab: the initial value of the parameter K=a/b under the Niu's Model for iterations. ## ## ## ## start.rho: the initial value of rho under the Inbreeding Model for iterations, which should be taken to be ## ## larger than 0. ## ## ## ## iter: the maximum number of iterations. The default value is 1000. ## ## ## ## dv: convergence criterion. The default value is 1e-8. ## ## ## ## ## #################################################################################################################### HAPHWE <- function(ped, header=T, missing=0, blockfile=NULL, filename="results.txt", start.ratio.ab = 1, start.rho = 0.01, iter = 1000, dv = 1e-8){ if (is.null(blockfile)) { blockname <- "block1" results <- HAPHWE0(ped, header, missing, blockfile, filename, start.ratio.ab, start.rho, iter, dv, blockname) if (missing(filename)!=T){ sink(filename) print(results) sink() } print(results) } else if (!is.null(blockfile)) { results <- NULL ped1 <- read.table(ped, header=header) n <- count.fields(blockfile) b <- read.table(blockfile,fill=T,col.name=1:max(n)) b.dim <- dim(b)[1] for (i in 1:b.dim){ snp <- b[i,2:max(n)][!is.na(b[i,2:max(n)])] snppos <- 2*(rep(snp,each=2)-1)+7:8 ped2 <- ped1[,c(1:6,snppos)] blockname <- paste("block",i,sep = "") results0 <- HAPHWE0(ped2, header, missing, blockfile, filename, start.ratio.ab, start.rho, iter, dv, blockname) print(results0) results <- c(results, results0) } if (missing(filename)!=T){ sink(filename) print(results) sink() } } } HAPHWE0 <- function(ped, header, missing, blockfile, filename, start.ratio.ab, start.rho, iter, dv, blockname){ if (is.character(ped)) { pedfile <- ped ped <- read.table(pedfile,header=header,as.is=T) ped_org <- ped ped <- ped[which(ped[,3]==0),c(1,7:ncol(ped))] n.loci <- (dim(ped)[2] - 1) / 2 marker_name <- names(ped)[5+2*(1:n.loci)] } else if(is.data.frame(ped)) { ped_org <- ped ped <- ped[which(ped[,3]==0),c(1,7:ncol(ped))] n.loci <- (dim(ped)[2] - 1) / 2 marker_name <- names(ped)[5+2*(1:n.loci)] } else if(is.matrix(ped)) { ped_org <- ped ped <- ped[which(ped[,3]==0),c(1,7:ncol(ped))] n.loci <- (dim(ped)[2] - 1) / 2 marker_name <- NULL if(!is.null(dimnames(ped))) marker_name <- dimnames(ped)[[2]][5+2*(1:n.loci)] } if(is.null(marker_name)){ marker_name <- paste("Marker",rep(1:n.loci,each=2),sep="") marker_name <- paste(marker_name,c("_1","_2"),sep="") colnames(ped) <- c("ID",marker_name) } ped <- as.matrix(ped) ped.bak <- ped peddata <- ped for(chk in 2:ncol(peddata)){ if(missing == 0){ peddata[which(peddata[,chk]==missing),chk] <- 9 peddata[which(peddata[,chk]==2),chk] <- 0 } else peddata[which(peddata[,chk]==2),chk] <- 0 } for(index_m in 1:n.loci){ if(missing == 0){ ped.anl <- peddata[which(peddata[,2+2*(index_m-1)]!=9),] peddata <- ped.anl } else{ ped.anl <- peddata[which(peddata[,2+2*(index_m-1)]!=missing),] peddata <- ped.anl } } ped <- peddata n.ind <- dim(ped)[1] stat.ped <- count.geno( ped,bloci=2 ) ped.c <- stat.ped[[1]] count <- ped.c[,1:2] corr.id <- stat.ped[[2]] if( n.ind != sum(count[,2]) ) stop("Sample size is wrong!") n.ind1 <- nrow(ped.c) n.col <- ncol(ped.c) ped1 <- ped.c[,c(1,3:n.col)] getHAP <- get.HAP(ped1,outcomb=T) HAP1 <- getHAP[[1]] all.comb1 <- getHAP[[2]] n.HAP <- length(HAP1) start.freq <- 1 / n.HAP HAP.freq.start <- cbind(HAP1, start.freq) h1.re <- HAP.freq(ped, ped1, count, corr.id, HAP1, all.comb1, start.ratio.ab=start.ratio.ab ,start.freq=NULL, bloci=2, iter=iter, dv=dv,if.HWE=F,HAPcomb.out=T ) h0.re <- HAP.freq(ped, ped1, count, corr.id, HAP1, all.comb1, start.ratio.ab=1 ,start.freq=NULL, bloci=2, iter=iter, dv=dv,if.HWE=T,HAPcomb.out=T ) LRT.NM <- LH.test(h0.re[[4]],h1.re[[4]]) EM.h1.re <- HAP.freq.EM(HAP.freq.start, all.comb1, count, start.rho=start.rho ,start.freq=NULL, iter=iter, dv=dv,if.HWE=F ) EM.h0.re <- HAP.freq.EM(HAP.freq.start, all.comb1, count, start.rho=0 ,start.freq=NULL, iter=iter, dv=dv,if.HWE=T ) LRT.IM <- LH.test(EM.h0.re[[4]],EM.h1.re[[4]]) BH.h1 <- DH2BH( h1.re[[3]][,1],n.loci ) BH.h1[which(BH.h1==0)] <- 2 CBH.h1 <- apply(BH.h1,1,paste,collapse="") R.h1 <- data.frame(Haplotype=CBH.h1,Frequency=h1.re[[3]][,2]) BH.EM.h1 <- DH2BH( EM.h1.re[[3]][,1],n.loci ) BH.EM.h1[which(BH.EM.h1==0)] <- 2 CBH.EM.h1 <- apply(BH.EM.h1,1,paste,collapse="") R.EM.h1 <- data.frame(Haplotype=CBH.EM.h1,Frequency=EM.h1.re[[3]][,2]) BH.EM.h0 <- DH2BH( EM.h0.re[[3]][,1],n.loci ) BH.EM.h0[which(BH.EM.h0==0)] <- 2 CBH.EM.h0 <- apply(BH.EM.h0,1,paste,collapse="") R.EM.h0 <- data.frame(Haplotype=CBH.EM.h0,Frequency=EM.h0.re[[3]][,2]) results <- list("block.name"=blockname,"Convergence process of likelihood function under EM algorithm" = EM.h0.re[[1]], "Haplotype frequency estimates under EM algorithm" = R.EM.h0, "Convergence process of likelihood function under ECM algorithm" = h1.re[[1]], "Haplotype frequency estimates under ECM algorithm"=R.h1, K=h1.re[[2]], "LRT statistic based on Niu's Model"=LRT.NM[[1]], "P value of LRT based on Niu's Model"=LRT.NM[[2]], "Convergence process of likelihood function under IEM algorithm" = EM.h1.re[[1]], "Haplotype frequency estimates under IEM algorithm" = R.EM.h1, rho=EM.h1.re[[2]], "LRT statistic based on Inbreeding Model"=LRT.IM[[1]], "P value of LRT based on Inbreeding Model"=LRT.IM[[2]]) return(results) } DH2BH <- function( D.Haplotype, nloci){ if(!is.matrix(D.Haplotype)) D.Haplotype <- as.matrix(D.Haplotype) B.HAP <- apply(D.Haplotype,1,decBin,n=nloci) B.Haplotype <- t(B.HAP) return(B.Haplotype) } binDec<-function (vect) { k = 0 for (i in 1:length(vect)) { k = k + vect[i] * 2^(i - 1) } return(k) } decBin<-function (nb, n) { v = nb if (nb >= 2^n) { print("arguments are not valid") } else { bin = vector("numeric", n) i = 1 while (v != 0) { p = v%/%2 if (2 * p == v) { bin[i] = 0 } else { bin[i] = 1 } v = p i = i + 1 } return(bin) } } cal.pair<-function( haplotype,prob,ratio.ab=1,if2D=F ){ if( if2D == 'F' | if2D == 'FALSE') D.HAP <- apply(haplotype,1,binDec) else D.HAP <- haplotype HAP.prob <- cbind( D.HAP,prob ) HAP.prob <- HAP.prob[order(HAP.prob[,1]),] place <- 1:nrow(HAP.prob) cpl <- cbind(place,HAP.prob) result <- apply(cpl,1,cal.comb,HAP.freq=HAP.prob,ratio.ab=ratio.ab) p.comb <- do.call("rbind",result) return(p.comb) } cal.comb <- function(data,HAP.freq,ratio.ab=1){ PL <- data[1] x <- data[2] n.row <- nrow(HAP.freq) ch <- PL:n.row result <- cbind(x,HAP.freq[ch,1],0,HAP.freq[PL,2],HAP.freq[ch,2]) if(PL!=n.row){ ch2 <- 2:nrow(result) result[1,3] <- ratio.ab * result[1,4]^2 result[ch2,3] <- 2 * result[ch2,4] * result[ch2,5] } else result[1,3] <- ratio.ab * result[1,4]^2 colnames(result) <- NULL rownames(result) <- NULL return(result) } EB <- function( HAP.freq, all.HAP, ratio.ab ){ if(is.vector(all.HAP)) all.HAP <- matrix(all.HAP,nrow=1) HAPmatrix <- all.HAP[,1:3] if(is.matrix(HAPmatrix)) HAPmatrix <- HAPmatrix[order(HAPmatrix[,1],HAPmatrix[,2],HAPmatrix[,3]),] else HAPmatrix <- matrix(HAPmatrix,nrow=1) all.PL <- 1:nrow(HAPmatrix) same.PL <- which(HAPmatrix[,2]==HAPmatrix[,3]) diff.PL <- all.PL[-same.PL] Sec.PL <- match(HAPmatrix[,2],HAP.freq[,1]) Thr.PL <- match(HAPmatrix[,3],HAP.freq[,1]) Prob.comb <- HAP.freq[Sec.PL,2] * HAP.freq[Thr.PL,2] HAPmatrix <- cbind( HAPmatrix, Prob.comb ) HAPmatrix[same.PL,4] <- ratio.ab * HAPmatrix[same.PL,4] HAPmatrix[diff.PL,4] <- 2 * HAPmatrix[diff.PL,4] return(HAPmatrix) } Bysum <- function(HAP.freq, geno.comb, same.pl, diff.pl, rho=0){ geno.comb2 <- cbind(geno.comb[,1],0) geno.comb2[same.pl,2] <- rho * HAP.freq[same.pl,1] + (1-rho) * HAP.freq[same.pl,1]^2 geno.comb2[diff.pl,2] <- 2*(1-rho) * HAP.freq[diff.pl,1] * HAP.freq[diff.pl,2] Pr <- aggregate(geno.comb2,by=list(geno.comb2[,1]),sum) Pr <- as.matrix(Pr) colnames(Pr) <- NULL Pr2 <- Pr[,c(1,3)] return(Pr2) } get.HAP <- function(ped,bloci=2,outcomb=F){ nind <- nrow(ped) n.col <- ncol(ped) ped <- ped[,c(1,bloci:n.col)] all.HAP <- apply(ped,1,Gen.HAP) if(is.list(all.HAP)) comb.HAP<- do.call("rbind",all.HAP ) else comb.HAP <- matrix(all.HAP,nrow=nind,byrow=T) if(outcomb == 'F' | outcomb == 'FALSE' ) { HAP <- sort( unique( c(comb.HAP[,2],comb.HAP[,3]) ) ) return(HAP) } else{ HAP <- sort( unique( c(comb.HAP[,2],comb.HAP[,3]) ) ) result <- list( "Haplotype"=HAP, "individual combined Haplotype"=comb.HAP) return( result ) } } Gen.HAP <- function(ind.geno){ len <- length(ind.geno) genematrix <- matrix(ind.geno[2:len],ncol=2,byrow=T) diff.PL <- which( genematrix[,1] != genematrix[,2] ) same.PL <- which( genematrix[,1] == genematrix[,2] ) ndiff <- length(diff.PL) same <- 2 + 2*( same.PL-1 ) diff.p1 <- 2+2*( diff.PL-1 ) diff.p2 <- 3+2*( diff.PL-1 ) k <- 1 HAP <- numeric(0) if( ndiff > 2 ){ judge <- numeric(0) loop <- ndiff %/% 2 for(i in 1:loop){ ncom <- combn(1:ndiff,i) num.comb <- ncol(ncom) for(j in 1:num.comb){ PL1 <- diff.p1 PL2 <- diff.p2 PL1[ ncom[,j] ] <- diff.p2[ ncom[,j] ] PL2[ ncom[,j] ] <- diff.p1[ ncom[,j] ] HAP1 <- binDec(ind.geno[sort(c(PL1,same))]) HAP2 <- binDec(ind.geno[sort(c(PL2,same))]) if( !any(judge == HAP1) ) { cHAP <- c(sort(c(HAP1,HAP2)),k) HAP <- rbind(HAP,cHAP) judge <- c(judge,HAP1,HAP2) k <- k + 1 } } } HAP1 <- binDec(ind.geno[sort(c(diff.p1,same))]) HAP2 <- binDec(ind.geno[sort(c(diff.p2,same))]) cHAP <- c(sort(c(HAP1,HAP2)),k) HAP <- rbind(HAP,cHAP) } else{ all.loci1 <- sort(c(diff.p1,same)) all.loci2 <- sort(c(diff.p2,same)) HAP1 <- binDec(ind.geno[all.loci1]) HAP2 <- binDec(ind.geno[all.loci2]) HAP <- matrix(c(sort(c(HAP1,HAP2)),1),nrow=1,byrow=T) if( ndiff ==2 ) { PL <- diff.p1[1] diff.p1[1] <- diff.p2[1] diff.p2[1] <- PL HAP1 <- binDec(ind.geno[sort(c(diff.p1,same))]) HAP2 <- binDec(ind.geno[sort(c(diff.p2,same))]) cHAP <- c(sort(c(HAP1,HAP2)),2) HAP <- rbind(HAP,cHAP) } } HAP <- cbind(ind.geno[1],HAP) rownames(HAP) <- NULL return(HAP) } root3 <- function(a,b,c,d){ if(a != 0){ fst <- -b/(3*a) sec <- b*c/(6*a^2) - b^3/(27*a^3) - d/(2*a) thr <- c/(3*a) - b^2/(9*a^2) delta <- sec^2 + thr^3 if(delta > 0){ cubic1 <- sec + sqrt(delta) cubic2 <- sec - sqrt(delta) if(cubic1 >= 0) rcubic1 <- cubic1^(1/3) else rcubic1 <- -abs(cubic1)^(1/3) if(cubic2 >= 0) rcubic2 <- cubic2^(1/3) else rcubic2 <- -abs(cubic2)^(1/3) X1 <- fst + rcubic1 + rcubic2 ARe <- fst - 0.5 * rcubic1 - 0.5 * rcubic2 AIm <- (sqrt(3)/2) * rcubic1 - (sqrt(3)/2) * rcubic2 X2 <- complex(re=ARe,im=AIm) X3 <- complex(re=ARe,im=-AIm) } else if(delta == 0){ cubic1 <- sec + sqrt(delta) cubic2 <- sec - sqrt(delta) if(cubic1 >= 0) rcubic1 <- cubic1^(1/3) else rcubic1 <- -abs(cubic1)^(1/3) if(cubic2 >= 0) rcubic2 <- cubic2^(1/3) else rcubic2 <- -abs(cubic2)^(1/3) X1 <- fst + rcubic1 + rcubic2 X2 <- fst - 0.5 * rcubic1 - 0.5 * rcubic2 X3 <- fst - 0.5 * rcubic1 - 0.5 * rcubic2 } else{ fcos <- acos(sec/sqrt((-thr)^3)) X1 <- fst + 2 * sqrt(-thr) * cos(fcos/3) X2 <- fst + 2 * sqrt(-thr) * cos((fcos + 2*pi)/3) X3 <- fst + 2 * sqrt(-thr) * cos((fcos - 2*pi)/3) } result<-data.frame(x1=X1,x2=X2,x3=X3) return(result) } else{ stop("It is not a univariate cubic equation") } } sum.pair<-function( haplotype,prob,if2D=F ){ if( if2D == 'F' | if2D == 'FALSE') D.HAP <- apply(haplotype,1,binDec) else D.HAP <- haplotype HAP.prob <- cbind( D.HAP,prob ) HAP.prob <- HAP.prob[order(HAP.prob[,1]),] place <- 1:nrow(HAP.prob) cpl <- cbind(place,HAP.prob) result <- apply(cpl,1,cal.sum,HAP.freq=HAP.prob) p.comb <- t(result) rownames(p.comb) <- HAP.prob[,1] return(p.comb) } cal.sum <- function(data,HAP.freq){ PL <- data[1] x <- data[2] n.row <- nrow(HAP.freq) ch <- PL:n.row result <- cbind(x,HAP.freq[ch,1],HAP.freq[PL,2],HAP.freq[ch,2]) diff.sum <- 0 same.sum <- 0 if(PL!=n.row){ ch2 <- 2:nrow(result) same.sum <- result[1,3]^2 diff.sum <- result[ch2,3] * result[ch2,4] } else same.sum <- result[1,3]^2 output <- c(same.sum,sum(diff.sum)) return(output) } Ln <- function(HAP.freq, geno.comb, count.geno, ratio.ab=1){ geno.comb2 <- geno.comb[order(geno.comb[,1],geno.comb[,4]),] pair.P <- sum.pair(HAP.freq[,1],HAP.freq[,2],if2D=T) sum.T <- sum(ratio.ab * pair.P[,1],2 * pair.P[,2]) first.PL <- match(geno.comb2[,2],HAP.freq[,1]) second.PL <- match(geno.comb2[,3],HAP.freq[,1]) HAP.prob <- HAP.freq[first.PL,2] * HAP.freq[second.PL,2] same.PL <- which(geno.comb2[,2] == geno.comb2[,3]) diff.PL <- which(geno.comb2[,2] != geno.comb2[,3]) geno.comb2 <- cbind(geno.comb2,HAP.prob) geno.comb2[same.PL,5] <- ratio.ab * geno.comb2[same.PL,5] / sum.T geno.comb2[diff.PL,5] <- 2 * geno.comb2[diff.PL,5] / sum.T colnames(geno.comb2) <- paste("V",1:5,sep="") forchoose <- as.data.frame(geno.comb2) Pr <- aggregate(geno.comb2[,c(1,5)],by=list(geno.comb2[,1]),sum) choosephase <- aggregate(geno.comb2[,c(1,5)],by=list(geno.comb2[,1]),max) choosephase2 <- as.data.frame(choosephase[,2:3]) maxphase <- merge(choosephase2,forchoose,by.x=c("V1","V5"),by.y=c("V1","V5")) maxphase <- maxphase[order(maxphase[,1]),c(1,3,4,2)] maxphase <- as.matrix(maxphase) rownames(maxphase) <- NULL colnames(maxphase) <- c("id","snp1","snp2","prob") LH <- sum(log(Pr[order(Pr[,1]),3])*count.geno[order(count.geno[,1]),2]) result <- list(Likelihood=LH,"combined HAPlotype"=maxphase) return(result) } HAP.freq <- function(ped, ped1, count, corr.id, HAP, all.comb, start.ratio.ab=1 ,start.freq=NULL, bloci=2, iter=1000, dv=1e-8,if.HWE=F,HAPcomb.out=F ){ pro_iter <- numeric(0) n.ind <- nrow(ped) n.HAP <- length(HAP) if( is.null(start.freq) ) start.freq <- 1 / n.HAP else start.freq <- start.freq n.iter <- 1 if(toupper(if.HWE) == 'F' | toupper(if.HWE) == 'FALSE' ) old.ratio <- start.ratio.ab else old.ratio <- 1 old.freq <- start.freq HAP.freq <- cbind(HAP, old.freq) colnames(HAP.freq) <- c("Haplo","Freq") B1 <- 0 B2 <- 0 PL.B1 <- which(all.comb[,2]==all.comb[,3]) len.B1 <- length(PL.B1) if(len.B1 != 0) B1 <- sum( count[match(all.comb[PL.B1,1],count[,1]),2] ) ############# calculate the probability of all the possible haplotype pairs ############# pair.P <- sum.pair( HAP.freq[,1],HAP.freq[,2],if2D=T ) pair.rowname <- row.names(pair.P) ############# ECM algorithm ############################################################# while(n.iter <= iter){ Cy <<- array( 0,dim=c( n.HAP,2 ) ) Cy[,1] <<- HAP.freq[,1] sum.same <- sum( pair.P[,1] ) sum.diff <- sum( pair.P[,2] ) sum.all <- sum(old.ratio * pair.P[,1],2 * pair.P[,2]) all.freq <- EB( HAP.freq,all.comb,ratio.ab=old.ratio ) old.geno.freq <- t( apply(ped1,1,Previous.HAPfreq,all.count=count,all.prob=all.freq) ) old.geno.freq[,2] <- old.geno.freq[,2]/sum.all if( if.HWE == 'F' | if.HWE == 'FALSE' ) new.ratio <- ( 2 * B1 * sum.diff ) / ( ( n.ind - B1 ) * sum.same ) else new.ratio <- 1 HAP.freq.old <- HAP.freq maxpL <- which(HAP.freq[,2]==max(HAP.freq[,2])) maxpL.HAP <- HAP.freq[maxpL[1],1] pair.maxHAP <- which(pair.rowname == maxpL.HAP) for(r in 1:n.HAP){ if(r == maxpL[1]) next cal.HAP <- HAP.freq[r,1] pair.P2 <- sum.pair(HAP.freq[-c(r,maxpL[1]),1],HAP.freq[-c(r,maxpL[1]),2],if2D=T) sum.same3 <- sum(pair.P2[,1]) sum.diff3 <- sum(pair.P2[,2]) A1 <- sum(HAP.freq[-c(r,maxpL[1]),2]) d <- 2 * ( new.ratio - 1 ) A2 <- d * ( sum.same3 + sum.diff3 - A1) + new.ratio cube <- d * ( 2 * n.ind - Cy[r,2] - Cy[maxpL[1],2] ) quad <- d * ( 1 - A1 ) * ( 2 * Cy[r,2] + Cy[maxpL[1],2] - 3 * n.ind ) first <- d * ( 1 - A1 )^2 * ( n.ind - Cy[r,2] ) - A2 * ( Cy[r,2] + Cy[maxpL[1],2] ) const <- ( 1 - A1 ) * A2 * Cy[r,2] if( cube != 0) { result <- root3( cube,quad,first,const ) if(is.complex( result[1,2] )) result <- as.data.frame(result[1,1]) } else { result <- -const / first result <- as.data.frame(result) } result[1,which(abs(result[1,]) <= 1e-8)] <- 0 rp <- which( result[1,] >= 0 & result[1,] < 1 & (result[1,] + A1) <= 1 ) if( length(rp) == 1 ) HAP.freq[r,2] <- result[1,rp] else if(length(rp) > 1){ result2 <- result[1,rp] find.value <- abs(result2 - HAP.freq[r,2]) value.PL <- which(find.value == min(find.value)) HAP.freq[r,2] <- result2[1,value.PL[1]] } } sita.max <- 1 - sum( HAP.freq[-maxpL[1],2]) HAP.freq[maxpL[1],2] <- sita.max Lnb <- sum( log( old.geno.freq[order(old.geno.freq[,1]),2] ) * count[order(count[,1]),2] ) Lna <- Ln(HAP.freq, all.comb, count,new.ratio) if(n.iter == 1) pro_iter <- c(0, Lnb) iteraction <- c(n.iter,Lna[[1]]) pro_iter <- rbind(pro_iter,iteraction) if(n.iter == iter | abs( Lna[[1]] - Lnb ) <= dv) break else{ n.iter <- n.iter + 1 old.ratio <- new.ratio pair.P <- sum.pair(HAP.freq[,1],HAP.freq[,2],if2D=T) } } ############# End iteration ###################################################### if( toupper(HAPcomb.out) != 'F' | toupper(HAPcomb.out) != 'FALSE' ){ maxcomb <- Lna[[2]] all.max <- cbind(corr.id[,1],maxcomb[match(corr.id[,2],maxcomb[,1]),2:3]) colnames(all.max) <- c("id","HAP1","HAP2") } ############# Output the results ################################################# p.out <- which(HAP.freq[,2]<1e-5) if ( length(p.out) != 0) Out.freq <- HAP.freq[-p.out,] else Out.freq <- HAP.freq colnames(Out.freq) <- c("Haplotype","Frequency") rownames(pro_iter) <- NULL if( toupper(HAPcomb.out) == 'F' | toupper(HAPcomb.out) == 'FALSE' ) HAP.result <- list(iter=pro_iter,ratio.ab=new.ratio,"Haplotype.frequency.estimate" = Out.freq, Likelihood = Lna[[1]]) else HAP.result <- list(iter=pro_iter,ratio.ab=new.ratio,"Haplotype.frequency.estimate" = Out.freq, Likelihood = Lna[[1]],maxphase.comb=all.max) return(HAP.result) } HAP.freq.EM <- function(HAP.freq, all.comb, count, start.rho=0 ,start.freq=NULL, iter=1000, dv=1e-8,if.HWE=F ){ pro_iter <- numeric(0) nHAP <- nrow(HAP.freq) all.comb2 <- all.comb[,1:3] if(is.matrix(all.comb2)) all.comb2 <- all.comb2[order(all.comb2[,1],all.comb2[,2],all.comb2[,3]),] else all.comb2 <- matrix(all.comb2,nrow=1) all.PL <- 1:nrow(all.comb2) same.PL <- which(all.comb2[,2]==all.comb2[,3]) diff.PL <- all.PL[-same.PL] Sec.PL <- match(all.comb2[,2],HAP.freq[,1]) Thr.PL <- match(all.comb2[,3],HAP.freq[,1]) Hapfreq <- array(0,dim=c(nrow(all.comb2),2)) Hapfreq[,1] <- HAP.freq[Sec.PL,2] Hapfreq[,2] <- HAP.freq[Thr.PL,2] same.id <- unique(all.comb2[same.PL,1]) diff.id <- unique(all.comb2[diff.PL,1]) same.pl2 <- match(same.id,count[,1]) diff.pl2 <- match(diff.id,count[,1]) n.iter <- 1 if(if.HWE==F) old.rho <- start.rho else old.rho <- 0 count2 <- count[order(count[,1]),] HAP.freq2 <- HAP.freq deno.n <- sum(count2[,2]) Pr.byid <- Bysum(Hapfreq, all.comb2, same.PL, diff.PL, rho=old.rho) Lnb <- sum(log(Pr.byid[,2])*count2[,2]) pro_iter <- c(0, Lnb) while(n.iter <= iter){ if(if.HWE==F){ new.rho <- sum(old.rho * Hapfreq[same.PL,1] /(old.rho * Hapfreq[same.PL,1] + (1 - old.rho) * Hapfreq[same.PL,1]^2)*count[same.pl2,2])/deno.n } else new.rho <- 0 for(i in 1:nHAP){ hap.cal <- HAP.freq2[i,1] zero.hap <- which(HAP.freq[i,2]<1e-100) cal.pl1 <- which(all.comb2[,2]==hap.cal & all.comb2[,3]==hap.cal) cal.pl2 <- which((all.comb2[,2]==hap.cal & all.comb2[,3]!=hap.cal) | (all.comb2[,2]!=hap.cal & all.comb2[,3]==hap.cal)) if(length(cal.pl1)!=0){ cal.id1 <- all.comb2[cal.pl1,1] c.pl1 <- match(cal.id1, count[,1]) c.sum.pl1 <- match(cal.id1,Pr.byid[,1]) E1 <- sum((((old.rho * Hapfreq[cal.pl1,1]+2*(1 - old.rho) * Hapfreq[cal.pl1,1]^2) / Pr.byid[c.sum.pl1,2])*count[c.pl1,2])) } else E1 <- 0 if(length(cal.pl2)!=0){ cal.id2 <- all.comb2[cal.pl2,1] c.pl2 <- match(cal.id2, count[,1]) c.sum.p2 <- match(cal.id2,Pr.byid[,1]) E2 <- sum(2*((1-old.rho) * Hapfreq[cal.pl2,1] * Hapfreq[cal.pl2,2] / Pr.byid[c.sum.p2,2]) * count[c.pl2,2]) } else E2 <- 0 HAP.freq2[i,2] <- E1 + E2 } HAP.freq2[,2] <- HAP.freq2[,2]/sum(HAP.freq2[,2]) Hapfreq[,1] <- HAP.freq2[Sec.PL,2] Hapfreq[,2] <- HAP.freq2[Thr.PL,2] Pr.byid2 <- Bysum(Hapfreq, all.comb2, same.PL, diff.PL, rho=new.rho) Lna <- sum(log(Pr.byid2[,2])*count2[,2]) iteraction <- c(n.iter, Lna) pro_iter <- rbind(pro_iter, iteraction) if(n.iter == iter | abs( Lna - Lnb ) <= dv) break else{ n.iter <- n.iter + 1 old.rho <- new.rho Lnb <- Lna Pr.byid <- Pr.byid2 } } p.out <- which(HAP.freq2[,2]<1e-5) if(length(p.out)!=0) Out.HAP <- HAP.freq2[-p.out,] else Out.HAP <- HAP.freq2 colnames(Out.HAP) <- c("Haplotype","Frequency") rownames(pro_iter) <- NULL results <- list(iter=pro_iter,rho=new.rho,"Haplotype.frequency.estimate" = Out.HAP,"Value of Likelihood Function" = Lna) return(results) } # Likelihood ratio test LH.test <- function( h0.LH, h1.LH ){ stat <- ( h1.LH - h0.LH ) * 2 P.value <- 1 - pchisq( stat,1 ) result <- list( stat=stat,P.value=P.value) return(result) } sort.snps <- function( data,bloci=2 ){ len <- length( data ) bs.snps <- matrix(data[ bloci:len ],ncol=2,byrow=T) af.snps <- apply(bs.snps,1,sort) af.data <- matrix(c(data[1:(bloci-1)],as.vector(af.snps))) return(af.data) } count.geno <- function(ped,bloci=2){ ped1 <- t( apply(ped,1,sort.snps,bloci=bloci) ) n.col <- ncol(ped1) n.loci <- (n.col - bloci +1 )/2 HAP1 <- apply(ped1[,seq(bloci,n.col-1,by=2)],1,binDec) HAP2 <- apply(ped1[,seq((bloci+1),n.col,by=2)],1,binDec) geno <- paste(HAP1,HAP2,sep="/") ped.D <- as.data.frame(cbind(ped1[,1],HAP1,HAP2,geno)) count <- as.vector(table(geno)) geno2 <- sort(unique(geno)) str.geno <- strsplit(geno2,"/") str.geno <- do.call("rbind",str.geno) geno3 <- matrix(as.numeric(str.geno),ncol=2) n.count <- length(count) id <- 1:n.count find.id <- data.frame(geno2=geno2,id=id) if(is.vector(find.id)) find.id <- matrix(find.id,nrow=1) ped2 <- cbind(id,count,geno3) if(is.vector(ped2)) ped2 <- matrix(ped2,nrow=1) new.col <- 2+2*n.loci ped3 <- array(0,dim=c(n.count,new.col)) ped3[,1:2] <- ped2[,1:2] HAP.D1 <- as.matrix(ped2[,3]) HAP.B1 <- t( apply(HAP.D1,1,decBin,n.loci) ) ped3[,seq(3,new.col-1,by=2)] <- HAP.B1 HAP.D2 <- as.matrix(ped2[,4]) HAP.B2 <- t( apply(HAP.D2,1,decBin,n.loci) ) ped3[,seq(4,new.col,by=2)] <- HAP.B2 ped.find <- cbind( ped.D,find.id[match(ped.D[,4],find.id[,1]),2] ) corr <- ped.find[,c(1,5)] corr <- as.matrix(corr) corr <- matrix(as.numeric(corr),ncol=2) colnames(corr) <- c("old.id","new.id") result <- list(ped=ped3,corr=corr) return(result) } Previous.HAPfreq <- function(ind.geno,all.count,all.prob){ ind.id <- ind.geno[1] c.freq <- all.count[ which( all.count[,1]==ind.id ),2 ] ind.prob <- all.prob[which(all.prob[,1]==ind.id),] if(is.vector(ind.prob)) ind.prob <- matrix(ind.prob,nrow=1) Pr.g <- sum( ind.prob[,4] ) ind.geno.freq <- c( ind.id,Pr.g ) sp2 <- which( ind.prob[,2] == ind.prob[,3] ) ind.all.HAP <- sort(unique(c(ind.prob[,2],ind.prob[,3]))) n.ind.HAP <- length( ind.all.HAP ) for(j in 1:n.ind.HAP){ ind.H <- ind.all.HAP[j] Cy.PL <- which(Cy[,1]==ind.H) ind.p <- which( ind.prob[,2] == ind.H | ind.prob[,3] == ind.H ) if(length(ind.p) > 0){ if(ifelse(length(sp2) >0, ind.p == sp2, 0)) {Cy[Cy.PL,2] <<- Cy[Cy.PL,2] + c.freq * 2 * sum( ind.prob[ind.p,4] / Pr.g )} else Cy[Cy.PL,2] <<- Cy[Cy.PL,2] + c.freq * sum( ind.prob[ind.p,4] / Pr.g ) } } return(ind.geno.freq) }