happy.hbrem/DESCRIPTION0000644000261400006360000000164212254616417013770 0ustar00rmottmottPackage: happy.hbrem Version: 2.4 Date: 2012-07-20 Title: Quantitative Trait Locus genetic analysis in Heterogeneous Stocks Author: Richard Mott Maintainer: Richard Mott Depends: R (>= 2.6.0), g.data, multicore Description: happy is an R interface into the HAPPY C package for fine-mapping Quantitative Trait Loci (QTL) in Heterogenous Stocks (HS). An HS is an advanced intercross between (usually eight) founder inbred strains of mice. HS are suitable for fine-mapping QTL. The happy package is an extension of the original C program happy; it uses the C code to compute the probability of descent from each of the founders, at each locus position, but the happy packager allows a much richer range of models to be fit to the data. License: GPL (>=2) URL: http://www.r-project.org, http://www.well.ox.ac.uk/happy Packaged: 2013-12-19 16:21:35 UTC; rmott happy.hbrem/NAMESPACE0000744000261400006360000000012012254601044013456 0ustar00rmottmottexportPattern("^[^\\.]") exportPattern("^[[:alpha:]]+") useDynLib(happy.hbrem) happy.hbrem/R/0000755000261400006360000000000012254616417012460 5ustar00rmottmotthappy.hbrem/R/happy.R0000744000261400006360000016743312254615404013736 0ustar00rmottmott.packageName <- "happy.hbrem_2.4" library(MASS) library(g.data) library(multicore) # C interface to read in .data and .alleles files, perform DP and create a happy object # # a happy object is a list with the following attributes: # happy$strains array of strain names # happy$markers array of marker names # happy$subjects array of subject names # happy$phenotype array of phenotype values # happy$map array of map locations (in cM) pf markers # happy$handle integer handle which maps the R happy object to the corresponding C QTL object happy <- function( datafile, allelesfile, generations=200, phase="unknown", file.format="happy", missing.code="NA", do.dp=TRUE, min.dist=1.0e-5, mapfile=NULL, ancestryfile=NULL, haploid=FALSE ) { gen <- as.numeric(generations)+0 if ( phase=="estimate" ) file.format <- "ped" h <- .Call( "happy", datafile, allelesfile, gen, phase, file.format, missing.code, do.dp=as.integer(do.dp), min.dist=min.dist, haploid=as.integer(haploid), ancestryfile=ancestryfile, PACKAGE="happy.hbrem" ) h$phase <- phase h$haploid <- haploid strain.names <- make.names(h$strains) num.strains <- length(h$strains) h$names.additive <- strain.names diplotype.names <- matrix(kronecker(strain.names, strain.names, paste, sep="."), nrow=num.strains) h$names.full.symmetric <- c( diag(diplotype.names), diplotype.names[upper.tri(diplotype.names, diag=FALSE)]) h$names.full.asymmetric <- c( t(diplotype.names) ) # assumes row major order, ie, (row1, row2, etc), in C object # for backwards compatibility h$nam <- strain.names h$nam2 <- h$names.full.symmetric h$nam3 <- h$names.full.asymmetric h$bp = h$map if ( ! is.null(mapfile)) { map <- read.delim(mapfile) if ( !is.null(map$marker) && ! is.null(map$bp)) { hbp <- data.frame(markers=h$markers,bp=rep(NA,length(h$markers))) h$bp <- map$bp[match(hbp$markers,map$marker)] } else stop( "incorrect column names found in mapfile ", mapfile , "\n") } h$additive = list() nm = length(h$markers)-1 h$additive$genome <- data.frame( marker = I(as.character(h$markers))[1:nm], map = as.numeric(h$map)[1:nm], bp = as.numeric(h$bp)[1:nm], chromosome = I(as.character(h$chromosome))[1:nm]) h$full = list() h$full$genome <- data.frame( marker = I(as.character(h$markers))[1:nm], map = as.numeric(h$map)[1:nm], bp = as.numeric(h$bp)[1:nm], chromosome = I(as.character(h$chromosome))[1:nm]) mother = vector( mode="character", length=length(h$mother)) w = which(h$mother>0) mother[w] = h$subjects[h$mother[w]] w = which(h$mother<=0) mother[w] = NA h$mother = mother father = vector( mode="character", length=length(h$father)) w = which(h$father>0) father[w] = h$subjects[h$father[w]] w = which(h$father<=0) father[w] = NA h$father = father return(h) } happy.matrices <- function( h ) { if ( class(h) == "happy" ) { if (is.null(h$matrices) ) { matrices <- list() for( m in h$markers) { add <- hdesign( h, m, model='additive' ) full <- hdesign( h, m, model='full' ) if ( h$use.pedigrees || h$phase.known ) full.asymmetric <- hdesign( h, m, model="full.asymmetric") else full.asymmetric <- NULL id <- m matrices[[m]] <- list( id=id, additive=add, full=full, full.asymmetric=full.asymmetric ) } h$matrices <- matrices } } return(h) } happy.save <- function( h, file ) { if ( class(h) == "happy" ) { if (is.null(h$matrices) ) h <- h$matrices save(h,file=file,compress=TRUE) } } # C interface to return the design matrix for a marker interval # h is a happy object returned by a previous call to happy # marker is the name of the left-hand marker in the interval, or the integer index of the marker (starting from 1) # mode can be 'additive' or 'full' # if mergematrix is non-null then the columns of the design matrix are merged hdesign <- function( h, marker, model='additive', mergematrix=NULL ) { d<- NULL if ( class(h) == "happy" ) { if ( ! is.null(h$matrices) ) { # data are in R memory if ( !is.integer(marker) ) { # integer marker index m <- match( marker, h$markers ) if ( ! is.na(m) ) return( hdesign( h, m, model=model, mergematrix=mergematrix)) else return(NULL) } else { # marker name obj <- h$matrices[marker] if ( model == 'additive' ) return ( obj[[1]]$additive ) else return ( obj[[1]]$full ) } } else { # data are in C memory handle <- as.numeric(h$handle)+0 if ( h$haploid ) { d <- .Call( "haploid_happydesign", handle, marker, PACKAGE="happy.hbrem") if ( ! is.null(d) ) if ( model == 'additive' ) colnames(d) <- h$nam else colnames(d) <- h$nam2 } else { d <- .Call( "happydesign", handle, marker, model, PACKAGE="happy.hbrem") if ( ! is.null(d) ) { if ( model == 'additive' ) colnames(d) <- h$nam else colnames(d) <- h$nam2 } } } } else if ( class(h) == "happy.genome" ) { # delayed data package loaded.markers <- load.markers( h, c(marker), model=model ) d <- loaded.markers[[1]] } # merge the matrix if required if ( ! is.null(d) && ! is.null( mergematrix ) ) { if ( model == 'additive' ) { d <- d %*% mergematrix$amat colnames(d) <- colnames(mergematrix$amat) } else if (model == "full" ) { d <- d %*% mergematrix$imat colnames(d) <- colnames(mergematrix$imat) } else if (model == "full.asymmetric") { d <- d %*% mergematrix$famat colnames(d) <- colnames(mergematrix$famat) } } return(d) } hnonrecomb<- function( h, marker=NULL, do.mean=TRUE ) { handle <- as.numeric(h$handle)+0 if ( is.null(marker) ) { nm <- length(h$markers)-1 r <- vector("numeric",length=nm) for(m in 1:nm) { x <- .Call( "happynonrecomb", handle, m, PACKAGE="happy.hbrem") r[m] <- 0.5*mean(x) } } else { r <- .Call( "happynonrecomb", handle, marker, PACKAGE="happy.hbrem") r <- 0.5*r if ( do.mean ) r <- mean(r) } return(r) } hprob <- function( h, marker=NULL ) { handle <- as.numeric(h$handle)+0 p <- .Call( "happyprobs", handle, marker, PACKAGE="happy.hbrem") colnames(p) <- h$nam2 return(p) } hprob2 <- function( h, marker=NULL, symmetrize=FALSE ) { handle <- as.numeric(h$handle)+0 if ( symmetrize==TRUE ) symmetrize=1 else symmetrize=0 p <- .Call( "happyprobs2", handle, marker, symmetrize, PACKAGE="happy.hbrem") if ( symmetrize==1 ) colnames(p) <- h$nam2 else colnames(p) <- h$nam3 return(p) } h.sum.prob2 <- function( h, marker=NULL ) { # for Caroline - the sum of squares of the probabilities handle <- as.numeric(h$handle)+0 if ( ! is.null( marker ) ) { p <- .Call( "happyprobs2", handle, marker, PACKAGE="happy.hbrem") p2 <- apply( p*p, 1, sum ) return(p2) } else { nm <- length(h$markers)-1 mat <- matrix( nrow=length(h$subjects), ncol=nm ) for(i in 1:nm ) { p <- .Call( "happyprobs2", handle, i, PACKAGE="happy.hbrem") mat[,i] <- apply( p*p, 1, sum ) } rownames(mat) <- h$subjects colnames(mat) <- h$markers[1:nm] return(mat) } } hgenotype <- function( h, marker=NULL, collapse=FALSE, sep="" ) { if ( class(h) == "happy" ) { handle <- as.numeric(h$handle)+0 g <- .Call( "happygenotype", handle, marker, PACKAGE="happy.hbrem") } else if ( class(h) == "happy.genome" ) { # delayed data package loaded.markers <- load.markers( h, c(marker), model="genotype" ) g <- loaded.markers[[1]] } if ( collapse ) { y <- paste( g[,1], g[,2], sep=sep ) g <- ifelse ( y == "NANA", NA, y ) } else { colnames(g) <- c("allele1", "allele2") } return(g) } # fit a QTL to the markers using the specified mode # happy is a happy object returned by a call to happy # markers is a array of marker names or marker indices # mode is one of 'additive', 'full' or 'partial' # verbose controls the amount of output # return value is a table giving the logP values for each marker tested hfit <- function( h, markers=NULL, model='additive', mergematrix=NULL, covariatematrix=NULL, verbose=FALSE, phenotype=NULL, family='gaussian', permute=0 ) { if ( class(h) == "happy.genome" ) { if ( !is.null( h[[model]] ) ) map <- h[[model]]$map if ( is.null(markers) ) { nm <- length(h[[model]]$markers)-1 markers <- h[[model]]$markers[1:nm] } if ( is.null(phenotype)) { stop( "phenotype must be set\n") } } else { map <- h$map if ( is.null(markers) ) markers <- h$markers[1:length(h$markers)-1] if ( is.null(phenotype) ) phenotype = h$phenotype } if ( model == 'partial' || model == 'full' ) { lp <- matrix( ncol=8, nrow=length(map)-1) colnames(lp) <-c('cM', 'marker', 'additive logP', 'full logP', 'partial logP', 'additive SS', 'full SS', 'partial SS') offset <- 3 width <- 3 idx <- 3 } else { lp <- matrix( ncol=4, nrow=length(map)-1) colnames(lp) <-c('cM', 'marker', 'additive logP', 'additive SS') offset <- 3 width <- 1 idx <- 3 } permdata <- NULL if ( permute > 0 ) { # permutation test offset <- 6 width <- 2 if ( verbose ) cat ("performing permutation anaysis on ", permute, " permutations\n") hf0 <- hfit( h, markers=markers, model=model, mergematrix=mergematrix, covariatematrix=covariatematrix, verbose=FALSE, family=family, permute=0 ) emp <- matrix( ncol=7, nrow=length(map)-1) colnames(emp) <-c('cM', 'marker', 'anova.logP', 'global.pval', 'point.pval', 'global.logP', 'point.logp') maxlogp <- vector(mode='numeric',length=permute) logpk <- vector(mode='numeric',length=length(map)-1) for(k in 1:permute) { shuf <- sample(phenotype) pres <- hfit( h, markers=markers, model=model, mergematrix=mergematrix, covariatematrix=covariatematrix, verbose=FALSE, phenotype=shuf, family=family, permute=0 ) maxlogp[k] <- max( as.numeric(pres$table[,idx]) ) logpk <- logpk + ifelse(pres$table[,idx] > hf0$table[,idx], 1, 0 ) if ( verbose ) cat(k, maxlogp[k], "\n") } maxlogp <- sort( maxlogp, decreasing=TRUE ) logpk <- logpk / permute p01 <- maxlogp[as.integer(permute/100)] p05 <- maxlogp[as.integer(permute/20)] if ( verbose) cat( 'p01 ', p01, ' p05 ', p05 , "\n") mi <- 1 for ( m in hf0$table[,"marker"]) { emp[mi,4] <- NA if ( mi <= nrow(hf0$table)) { logp <- as.numeric(hf0$table[mi,idx]) n <- 0 if ( is.numeric(logp) ) { n <- sum(ifelse(maxlogp>logp,1,0),na.rm=TRUE) emp[mi,"global.pval"] <- n/permute } } mi <- mi+1 } logemp <- ifelse( emp[,'global.pval'] >= 1/permute , -log10(emp[,'global.pval']), log10(permute)) emp[,"cM"] <- hf0$table[,"cM"] emp[,"marker"] <- hf0$table[,"marker"] emp[,"anova.logP"] <-hf0$table[,idx] emp[,"global.logP"] <- logemp emp[,'point.pval'] <- logpk emp[,'point.logp'] <- ifelse( logpk >= 1/permute, -log10(logpk), log10(permute)) permdata <- list( N=permute, p01=p01, p05=p05, permutation.dist=maxlogp, permutation.pval=emp ) hf0$width <- width hf0$offset <- offset hf0$permdata <- permdata return(hf0); } else { i <- 1 maxp <- 0 maxm <- NA maxSS <- NA for( m in markers ) { if ( verbose ) cat( "\n\n****** ", i, "marker interval ", m, "\n\n" ) if ( model == 'partial' || model == 'full' ) { if ( ! is.null(full<- hdesign( h, m, model='full', mergematrix=mergematrix )) ) { additive <- hdesign( h, m, model='additive', mergematrix=mergematrix ) if ( is.null(covariatematrix) ) { cfit <- glmfit( phenotype ~ 1, family=family) ffit <- glmfit( phenotype ~ full , family=family) afit <- glmfit( phenotype ~ additive , family=family) } else { cfit <- glmfit( phenotype ~ covariatematrix , family=family) full <- cbind(covariatematrix, full ) ffit <- glmfit( phenotype ~ full , family=family) additive <- cbind(covariatematrix, additive ) afit <- glmfit( phenotype ~ additive , family=family) } if ( family == "gaussian") { an <- anova( cfit, afit, ffit ); if ( verbose ) print(an) logP <- -log(an[[6]])/log(10) an2 <- anova( cfit, ffit ); logP2 <- -log(an2[[6]])/log(10) } else { an <- anova( cfit, afit, ffit, test="Chisq" ); if ( verbose ) print(an) logP <- -log(an[[5]])/log(10) an2 <- anova( cfit, ffit, test="Chisq" ); logP2 <- -log(an2[[5]])/log(10) } if ( ! is.na(logP[2]) && logP[2] > maxp ) { maxp <- logP[2] maxm <- m maxSS <- an2[[5]][2] } lp[i,1] <- (map[i]+map[i+1])/2 lp[i,2] = m lp[i,3] <- logP[2] lp[i,4] <- logP2[2] lp[i,5] <- logP[3] lp[i,6] <- an[[4]][2] lp[i,7] <- an2[[4]][2] lp[i,8] <- an[[4]][3] i <- i+1 } } else { if ( ! is.null(d<- hdesign( h, m, model='additive', mergematrix=mergematrix )) ) { if ( is.null(covariatematrix) ) { cfit <- glmfit( phenotype ~ 1 , family=family) afit <- glmfit( phenotype ~ d , family=family) } else { cfit <- glmfit( phenotype ~ covariatematrix ) d <- cbind( covariatematrix, d ) afit <- glmfit( phenotype ~ d ) } if ( family=="gaussian") { an <- anova( cfit, afit ) logP <- -log( an[[6]])/log(10) } else { an <- anova( cfit, afit, test="Chisq" ) # print(an) logP <- -log( an[[5]])/log(10) # print(logP) } if ( verbose ) { print( an ) strain.effects( h, afit ) } if ( ! is.na(logP[2]) && logP[2] > maxp ) { maxp <- logP[2] maxm <- m maxSS <- an[[4]][2] } lp[i,1] <- (map[i]+map[i+1])/2 lp[i,2] <- m lp[i,3] <- logP[2] lp[i,4] <- an[[4]][2] i <- i+1 } } } return(list( table=lp, model=model, family=family, test='hfit', offset=offset, width=width, maxp=maxp, maxm=maxm, maxSS=maxSS, permdata=NULL )) } } # Calculate the T-tests for comparing strain effects. # Called from hfit: only suited to the additive model at present strain.effects <- function( h, fit, family='gaussian' ){ if ( class(fit) == "lm" ) { c <- coef(fit) len <- length(c) cov <- vcov(fit) dia <- 1/sqrt(diag(cov)) v <- dia %o% dia corr <- cov * v cat('\nCorrelation Matrix:\n') print (corr) cat('\n') nam <- names(c) nam2 <- c( 'Mean', h$strains ) # the true strain names se <- c() for(m in 1:len ) { strain1 <- nam[m] if ( ! is.na(c[strain1]) ) se <- c( se, sqrt(cov[strain1,strain1])) else se <- c( se, NA ) } df <- df.residual(fit) intercept <- "(Intercept)" cat('\nStrain Main Effects, with standard errors:\n' ) cat(formatC( nam2, width=10, format='s' ), '\n') cat(formatC( c, width=10, digits=4, format='f'), '\n' ) cat(formatC( se, width=10, digits=4, format='f'), '\n' ) e <- matrix( nrow=len*(len+1)/2, ncol=4 ) en <- matrix( nrow=len*(len+1)/2, ncol=2 ) colnames(e) <- c( 'diff', 'se', 'T', 'P' ) colnames(en) <- c( 'strain1', 'strain2' ) k <- 1 for(m in 1:len ) { strain1 <- nam[m] if ( strain1 != intercept && ! is.na(c[strain1]) ) { for(n in 1:m) { strain2 <- nam[n] if ( strain2 != intercept && ! is.na(c[strain2]) ) { if ( n < m ) { d <- c[m]-c[n] v <- cov[strain1,strain1] + cov[strain2,strain2] - 2*cov[strain1,strain2] en[k,1] <- nam2[m] en[k,2] <- nam2[n] e[k,1] <- d if ( v > 1.0e-6 ) { se <- sqrt(v) t <- d/se p <- pt(t,df=df,lower.tail=FALSE) e[k,2] <- se e[k,3] <- t e[k,4] <- p } k <- k+1 } else if ( n == m ) { d <- c[m] -c[intercept] v <- cov[strain1,strain1] + cov[strain2,intercept] - 2*cov[strain1,intercept] en[k,1] <- nam2[m] en[k,2] <- '(Mean)' e[k,1] <- d if ( v > 1.0e-6 ) { se <- sqrt(v) t <- d/se p <- pt(t,df=df,lower.tail=FALSE) e[k,2] <- se e[k,3] <- t e[k,4] <- p } k <- k+1 } } } } } k <- k-1 cat('\nTests of Strain Differences (note that differences may be hard to interpret when strains are indistinguishable)\n\n') cat( formatC('strain1',width=12,format='s'), formatC('strain2',width=12,format='s'), formatC('diff',width=10,format='s'), formatC('se', width=10, format='s'), formatC('T', width=7, format='s'), formatC('P',width=7,format='s'), '\n') for( j in 1:k) { cat(formatC( en[j,'strain1'], width=12,format='s'), formatC( en[j,'strain2'], width=12,format='s'), formatC(e[j,'diff'], width=10, digits=4, format='f'), formatC(e[j,'se'], width=10, digits=4, format='f'), formatC(e[j,'T'], width=7, digits=3, format='f'), formatC(e[j,'P'],digits=4,width=7,format='e'), '\n') } e[1:k,] } NULL } # merge fit a QTL to the markers using the specified mode # happy is a happy object returned by a call to happy # markers is a array of marker names or marker indices # model is one of 'additive', 'full' or 'partial' # covariatematrix is an optional design matrix to include on all the models. # (This can be used to include additional markers or covariates) # verbose controls the amount of output # merge contains the merge object used for merging strains # return value is a table giving the logP values for each marker tested # the test is for merged strains versus unmerged mfit <- function( happy, markers=NULL, model='additive', mergematrix=NULL, covariatematrix=NULL, verbose=TRUE, family='gaussian', variants=NULL ) { map <- happy$map if ( is.null( markers ) ) markers <- happy$markers if ( model == 'partial' || model == 'full' ) { lp <- matrix( ncol=8, nrow=length(markers)) if ( is.null(variants) ) rownames(lp) <- markers else rownames(lp) <- variants colnames(lp) <-c('cM', 'marker', 'full-merged', 'full-unmerged', 'partial', 'full-merged-SS', 'full-unmerged-SS', 'partial-SS' ) } else { lp <- matrix( ncol=8, nrow=length(markers)) if ( is.null(variants) ) rownames(lp) <- markers else rownames(lp) <- variants colnames(lp) <-c('cM', 'marker', 'additive-merged', 'additive-unmerged', 'partial', 'additive-merged-SS', 'additive-unmerged-SS', 'partial-SS' ) } i <- 1 maxp <- 0 maxm <- NA for( m in markers ) { if ( verbose ) print( paste( " ", i, "marker interval ", m ) ) if ( model == 'partial' || model == 'full' ) { if ( ! is.null(full<- hdesign( happy, m, model='full', mergematrix=mergematrix )) ) { fullu <- hdesign( happy, m, model='full', mergematrix=NULL) if ( is.null( covariatematrix ) ) { cfit <- glmfit( happy$phenotype ~ 1 , family=family) ffit <- glmfit( happy$phenotype ~ full , family=family) fufit <- glmfit( happy$phenotype ~ fullu , family=family) } else { cfit <- glmfit( happy$phenotype ~ covariatematrix , family=family) full <- cbind( covariatematrix, full ) ffit <- glmfit( happy$phenotype ~ full , family=family) fullu <- cbind( covariatematrix, fullu ) fufit <- glmfit( happy$phenotype ~ fullu , family=family ) } an <- anova( cfit, ffit, fufit ); if ( verbose ) print(an) logP <- -log(an[[6]])/log(10) if ( ! is.na(logP[2]) && logP[2] > maxp ) { maxp <- logP[2] maxm <- m } an2 <- anova( cfit, fufit ); logP2 <- -log(an2[[6]])/log(10) # p-value for full unmerged lp[i,1] <- (map[i]+map[i+1])/2 lp[i,2] <- m lp[i,3] <- logP[2] # full merged lp[i,4] <- logP2[2] # full unmerged lp[i,5] <- logP[3] # partial test of merged vs unmerged if ( is.na(lp[i,5]) ) lp[i,5] <- 0 lp[i,6] <- an[[4]][2] lp[i,7] <- an2[[4]][2] lp[i,8] <- an[[4]][3] # lp[is.na(lp)] = 0.0 i <- i+1 } } else { if ( ! is.null(d<- hdesign( happy, m, model='additive', mergematrix=mergematrix )) ) { du <- hdesign( happy, m, model='additive', mergematrix=NULL ) if ( is.null( covariatematrix ) ) { cfit <- glmfit( happy$phenotype ~ 1 , family=family) afit <- glmfit( happy$phenotype ~ d , family=family) aufit <- glmfit( happy$phenotype ~ du , family=family) } else { cfit <- glmfit( happy$phenotype ~ covariatematrix , family=family) d <- cbind( covariatematrix, d ) afit <- glmfit( happy$phenotype ~ d , family=family) du <- cbind( covariatematrix, du ) aufit <- glmfit( happy$phenotype ~ du , family=family) } an <- anova( cfit, afit, aufit ) if ( verbose ) print( an ) logP <- -log(an[[6]])/log(10) if ( ! is.na(logP[2]) && logP[2] > maxp ) { maxp <- logP[2] maxm <- m } an2 <- anova( cfit, aufit ); logP2 <- -log(an2[[6]])/log(10) lp[i,1] <- (map[i]+map[i+1])/2 lp[i,2] <- m lp[i,3] <- logP[2] lp[i,4] <- logP2[2] lp[i,5] <- logP[3] if ( is.na(lp[i,5]) ) lp[i,5] <- 0 lp[i,6] <- an[[4]][2] lp[i,7] <- an2[[4]][2] lp[i,8] <- an[[4]][3] i <- i+1 } } } list( table=lp, model=model, test='mfit', offset=3, width=3, maxp=maxp, maxm=maxm ) } # create plots of the result returned by hfit, mfit happyplot <- function ( fit, mode='logP', labels=NULL, xlab='cM', ylab=NULL, main=NULL, t='s', pch=20, ... ) { def.par <- par(no.readonly=TRUE) plot.new() model <- fit$model lp <- na.omit(fit$table) test <- fit$test offset <- fit$offset plots <- fit$width # title and labels if ( ! is.null( fit$permdata ) ) { mode <- 'permutation' main <- 'permutation plot' ylab <- 'permutation logp' lp <- na.omit(fit$permdata$permutation.pval) } else if ( mode == 'logP' ) { ylab <- mode if ( is.null(main) ) main <- 'log probability plot' } else { ylab <- 'SS' if ( is.null(main) ) main <- 'Fitting Sums of Squares plot' offset <- offset + plots } # the y-axis range mx <- 0 rangemax <- offset + plots-1 for( i in offset:rangemax ) { r <- range( as.numeric(lp[,i])) mx <- range(c( mx, r)) } ymax <- mx[2] # work out how much vertical space to allocate to the marker labels, if present if ( ! is.null( labels ) ) { ps <- par('ps') par(ps=8) lwidth <- strwidth( as.character(labels$text), units='inches' ); par(ps=ps) lr <- range(lwidth) H <- lr[2]*1.2 area <- par( 'fin' ) # width and height in inches of figure mai <- par( 'mai' ) # margins in inches lambda <- (area[2]-mai[1]-mai[3]-H)/mx[2] # expansion factor h <- H/lambda mx[2] <- mx[2] + h } colours <- c( "black", "red", "blue", "green", "orange") cnames = colnames(lp ); par(col="black") par(lwd=2) plot( x=lp[,1], y=lp[,offset], ylim=mx,main=main,xlab=xlab,ylab=ylab, t=t, pch=pch, ...) rx = range( as.numeric( lp[,1] ) ) lx <- rx[2]-rx[1] tx <- c( rx[1] + 0.02*(lx) ) ty <- c( 0.95*mx[2] ) text( tx, ty, cnames[offset], adj=c(0)) wd <- strwidth(cnames) buff <- strwidth("spa") if ( rangemax > offset ) for( i in (offset+1):rangemax ) { par(col=colours[i-offset+1]) tx <- c( tx[1] + wd[i-1] + buff[1]) text( tx, ty, cnames[i], adj=c(0)) par(ps=1) lines( x=lp[,1], y=lp[,i], t=t, pch=pch) par(ps=12) } par(col="black") # the labels if ( ! is.null(labels) ) { par(srt=270) par(adj=0) par(ps=8) y <- rep( mx[2]*0.99, length(labels$text) ) text( labels$POSITION, y, as.character(labels$text) ) par(lwd=1) par(col='black') x <- labels$POSITION for( m in x) { lines( x=c( m,m ), y=c(0,ymax) ) } par(srt=0) } par(def.par) NULL } # fit two markers simultaneously, assuming additive model twofit <- function ( happy, marker1, marker2, merge1=NULL, merge2=NULL, model = 'additive', verbose=TRUE, family='gaussian' ) { if ( ! is.null(a1<- hdesign( happy, marker1, mergematrix=merge1, model=model )) && ! is.null(a2<- hdesign( happy, marker2, mergematrix=merge2, model=model )) ) { print ( paste( 'joint test of markers ', marker1, ', ', marker2 ) ) both <- cbind( a1, a2 ) f1 <- glmfit( happy$phenotype ~ a1 , family=family) f2 <- glmfit( happy$phenotype ~ a2 , family=family) b <- glmfit( happy$phenotype ~ both , family=family) cfit <- glmfit( happy$phenotype ~ 1 , family=family) s <- summary(glmfit(f1)) f <- s$fstatistic p<-pf( f[1], f[2], f[3], lower.tail=FALSE, log.p=TRUE ) if ( verbose ) { print(anova( cfit, f1, b )) print(anova( cfit, f2, b )) print(names(s)) print(-p/log(10)) } } } # fit a model to a set of markers conditional upon another marker being in the model condfit <- function( happy, markers, condmarker, merge=NULL, condmerge=NULL, model='additive', condmodel='additive', epistasis=FALSE, verbose=TRUE,family='gaussian' ) { if ( ! is.null( cond <- hdesign( happy, condmarker, model=model, mergematrix=condmerge ))) { cfit <- glmfit( happy$phenotype ~ 1 , family=family) condfit <- glmfit( happy$phenotype ~ cond , family=family) if ( verbose ) print ( paste( 'conditional additive test on marker ', condmarker ) ) for( m in markers ) { additive <- hdesign( happy, m, model=model, mergematrix=merge ) condadditive <- cbind( additive, cond ); cafit <- glmfit( happy$phenotype ~ condadditive , family=family) if ( verbose ) print(anova(cfit,condfit,cafit)) } } } mergedpositionmatrix <- function( h, position, prepmerge, model='additive', verbose=FALSE, design=TRUE ) { # creates the merged design matrix for the specified position # optionally returns the mergematrix instead ind <- match( position, prepmerge$testmarkerdata$POSITION ) # the index of the position retval <- NULL if ( ! is.null(ind) ) { alleles <- prepmerge$testmarkerdata[ind,] # the allele distribution amounst the strains mlist <- mergelist( h$strains, alleles ) # list of lists representation of the allele distribution mergematrix <- mergematrices( h$strains, mergelist=mlist) # the matrices representing the merge if ( design ) { # return the design matrix marker <- prepmerge$interval[ind] retval <- hdesign( h, marker, mergematrix=mergematrix, model=model ) } else # return the mergematrix retval <- mergematrix } else { print(paste('test marker', position, 'not found' )) } retval } # merge strains together. # creates matries used to multiply with hapy design matrices # Subsequent analyses of the happy data can use the merged strains. # happy is a happy object # mergedata is a list of lists of strains eg c( c( "Balbc", "AKR"), c("C57", "DBA") ) # NOTE: all strain names MUST be in happy$strains # NOTE: unlisted strains are dropped mergematrices <- function ( strains, mergelist=NULL, verbose=FALSE ) { if ( is.null( mergelist ) ) { return(NULL) } else { ls <- length(strains) lm <- length(mergelist) merge <- strains group <- c() # the matrix for the additive model amat <- matrix( nrow=ls, ncol=lm, data=0 ) mapping <- 1:ls i <- 1 for ( m in mergelist ) { ind <- match( m, strains, nomatch=NA); merge[ind] = paste( sep='','group',i) amat[ind,i] = 1 p <- paste(as.list(strains[ind]),collapse=",") group <- c( group, p) mapping[ind] = i i <- i+1 } # the matrix for the full model imat <- matrix( nrow=ls*(ls+1)/2, ncol=lm*(lm+1)/2, data=0 ) n <- 0 imap <- matrix(nrow=lm,ncol=lm) for(i in 1:lm) { for(j in 1:i ) { n <- n + 1 imap[i,j] <- n imap[j,i] <- n } } iname <- array( dim=lm*(lm+1)/2 ) n <- 0 # for(i in 1:ls) { # n <- n+1 # k<- imap[mapping[i],mapping[i]] # imat[n,k] <- 1 # iname[k] <- paste(sep="", "group", mapping[i], ",", mapping[i] ) # } # for(i in 2:ls) { # for(j in 1:(i-1) ) { # n <- n+1 # k<- imap[mapping[i],mapping[j]] # imat[n,k] <- 1 # iname[k] <- paste(sep="", "group", mapping[i], ",", mapping[j] ) # # } # } for(i in 1:ls) { for(j in 1:i) { n <- n+1 k<- imap[mapping[i],mapping[j]] imat[n,k] <- 1 iname[k] <- paste(sep="", "group", mapping[i], ",", mapping[i] ) } } # the matrix for the full asymmetric model famat <- matrix( nrow=ls*ls, ncol=lm*lm, data=0 ) n <- 0 famap <- matrix(nrow=lm,ncol=lm) for(i in 1:lm) { for(j in 1:lm ) { n <- n + 1 famap[i,j] <- n } } faname <- array( dim=lm*lm ) n <- 0 for(i in 1:ls) { for(j in 1:ls ) { n <- n+1 k<- famap[mapping[i],mapping[j]] # print( c(i, j, n, k, mapping[i], mapping[j])) famat[n,k] <- 1 faname[k] <- paste(sep="", "group", mapping[i], ",", mapping[j] ) } } ###### names(merge) <- strains colnames(amat) <- group colnames(imat) <- iname colnames(famat) <- faname if ( verbose ) print(paste("strains merged into ", length(group), "groups", paste( group, collapse="| " ))) list(merge=merge, group=group, amat=amat, imat=imat, famat=famat) } } fastmergefit <- function( datafile, allelesfile, markerposfile, testmarkerfile, generations=200, model='additive', verbose=FALSE ) { h <- happy( datafile, allelesfile, generations=generations ) if ( ! is.null( h ) ) { prep <- mergeprepare( h, markerposfile, testmarkerfile ) fit <- mergefit( h, prep, model=model, verbose=verbose ) return(fit) } else { return(NULL) } } mergeprepare <- function( h, markerposfile, testmarkerfile, verbose=FALSE ) { # read in the marker positions from markerposfile # format comprises two columns header 'marker' and 'POSITION' # the file must be sorted by position. Position refers to bp coordinate mergedata <- list() markerpos <- read.table(markerposfile,header=TRUE) if ( is.null(markerpos$POSITION) ) { print (paste(' ERROR - required column POSITION is missing from ', markerposfile )) return (NULL) } if ( is.null(markerpos$marker) ) { print (paste(' ERROR - required column marker is missing from ', markerposfile )) return (NULL) } # determine those markers that are also in the happy object markerpos$marker <- as.character(markerpos$marker) if ( verbose ) print(markerpos) if ( class(h) == "happy" ) mmatch <- markerpos[match( h$markers, markerpos$marker, nomatch=0 ),] else mmatch <- markerpos[match( h$additive$markers, markerpos$marker, nomatch=0 ),] nmarkers <- nrow(mmatch) markers <- mmatch[order(mmatch$POSITION),] if ( verbose ) print(markers) print(paste( nrow(markers), ' skeleton markers' )) # read in a file of markers to test # must contain columns titled 'marker' and 'POSITION' and an optional column "Variant" # together with a column for every strain in happy$strains testmarkerdata<-read.table(testmarkerfile, header=TRUE) if ( is.null(testmarkerdata$Variant) ) testmarkerdata$Variant <- paste( "var", as.character(testmarkerdata$POSITION), sep=".") # check that the required columns are all present required <- c( 'marker', 'POSITION' , make.names(h$strains) ) s <- required %in% names(testmarkerdata) n <- 1 m <- 0 if ( verbose ) print(s) if ( verbose ) print(required) for ( x in s ) { if ( ! x ) { print( paste( 'required column', required[n], 'missing from', testmarkerfile ) ); m <- m+1 } n <- n+1 } if ( m > 0 ) { print( 'mergeprepare halted') return(NULL) } # identify the happy marker interval for each marker to be tested ntest <- nrow(testmarkerdata) interval <- matrix(nrow=ntest,ncol=1) last <- 1 good <- 0 for ( t in 1:ntest ) { i <- last pos <- testmarkerdata$POSITION[t] ok <- FALSE while( ! ok && i < nmarkers ) { if ( pos >= mmatch$POSITION[i] && pos < mmatch$POSITION[i+1] ) { interval[t] <- as.character(mmatch$marker[i]) if ( verbose ) cat(paste(sep=' ', good, as.character(testmarkerdata$marker[t]), interval[t], mmatch$POSITION[i],pos,mmatch$POSITION[i+1], '\n')) last <- i ok <- TRUE good <- good + 1 } i <- i +1 } # out of range markers if ( is.na(interval[t])) { if ( pos <= mmatch$POSITION[1] ){ interval[t] = as.character(mmatch$marker[1]) cat(paste('marker ', testmarkerdata$marker[t], 'out of range, assigned to interval ', interval[1], '\n' )) } else if ( pos >= mmatch$POSITION[nmarkers] ) { interval[t] = as.character(mmatch$marker[nmarkers-1]) cat(paste('marker ', testmarkerdata$marker[t], 'out of range, assigned to interval ', interval[nmarkers-1], '\n' )) } } } print(paste(good, ' test markers placed on skeleton map')) mergedata$markerpos <- markerpos mergedata$interval <- interval mergedata$markers <- markers mergedata$testmergedata <- testmarkerdata mergedata } # compute the strain distribution pattern from a list of alleles for the strains. # results is a string of 0'1 and 1's, in the order dictated by strains. The first character is always 0 sdp <- function( strains, alleles ) { s <- list() n <- 0 for (a in alleles) { for (x in a) { s[x] = n } n <- n+1 } flip <- FALSE if ( s[strains[1]] == 1 ) flip <- TRUE if ( n == 2 && flip ) { for ( y in strains) { if ( s[y] == 1 ) s[y] = 0 else s[y] = 1 } } sdpvalue <- "" for ( y in strains) { sdpvalue <- paste( sdpvalue, s[y], sep="" ) } return(sdpvalue) } condmergefit <- function( h, mergedata, model='additive', covariatematrix=NULL, verbose=FALSE ) { interval <- mergedata$interval testmergedata <- mergedata$testmergedata ntest <- nrow(testmergedata) matrices <- list() strains <- h$strains logPmax <- matrix(nrow=ntest,ncol=6) colnames(logPmax) <- c( "position", "interval", "sdp", "logPself", "logPmax", "logPmaxPosition" ) logP <- list() logPm <- list() logPmatrix <- matrix(nrow=ntest,ncol=ntest) for ( m in 1:ntest ) { if ( !is.null(interval[m]) ) { # this test eliminates markers that could not be placed on the map im <- interval[m] alleles <- testmergedata[m,] # the allele distribution amounst the strains mlist <- mergelist( strains, alleles ) # list of lists representation of the allele distribution sdpvalue <- sdp( strains, mlist ) # Strain Distribution pattern if ( is.null( matrices[[sdpvalue]] ) ) matrices[[sdpvalue]] <- mergematrices( strains, mergelist=mlist) # the matrices representing the merge d <- hdesign( h, im, model=model, mergematrix=matrices[[sdpvalue]] ) ckey <- paste(im,sdpvalue) # cache key if ( is.null(logP[[ckey]]) ) { if ( ! is.null(covariatematrix) ) d <- cbind( d, covariatematrix ) f <- mfit( h, im, model=model, mergematrix=matrices[[sdpvalue]], covariatematrix=covariatematrix, verbose=verbose ) mf <- mergefit( h, mergedata, model=model, covariatematrix=d, verbose=verbose ) mx <- max(as.numeric(mf$table[,3]),na.rm=TRUE) mp <- which.max(as.numeric(mf$table[,3])) logPm[[ckey]] <- ifelse( is.na(mf$table[,3]), 0, mf$table[,3] ) logP[[ckey]] <- c( im, sdpvalue, f$maxp, mx, mp) } logPmatrix[m,] <- logPm[[ckey]] logPmax[m,] <- c( testmergedata$POSITION[m], logP[[ckey]]) cat(paste(logPmax[m,]),"\n") } } rownames(logPmatrix) <- as.character(testmergedata$POSITION) colnames(logPmatrix) <- as.character(testmergedata$POSITION) return( list(logPmax=logPmax, logPmatrix=logPmatrix ) ) } mergefit <- function( h, mergedata, model='additive', covariatematrix=NULL, verbose=FALSE ) { fit <- NULL n <- 1 interval <- mergedata$interval testmergedata <- mergedata$testmergedata ntest <- nrow(testmergedata) cache <- list() f <- NULL saved <- 0 calculated <- 0 strains <- h$strains for ( m in 1:ntest ) { if ( !is.null(interval[m]) ) { # this test eliminates markers that could not be placed on the map im <- interval[m] alleles <- testmergedata[m,] # the allele distribution amounst the strains mlist <- mergelist( strains, alleles ) # list of lists representation of the allele distribution sdpvalue <- sdp( strains, mlist ) # Strain Distribution pattern ckey <- paste(im,sdpvalue) # cache key # cat( "ckey ", ckey, '\n' ) if ( is.null( cache[[ckey]] )) { # test if this fit has been cached mergematrix <- mergematrices( strains, mergelist=mlist) # the matrices representing the merge f <- mfit( h, im, model=model, mergematrix=mergematrix, covariatematrix=covariatematrix, verbose=verbose, variants=testmergedata$Variant[m] ) calculated <- calculated +1 cache[[ckey]] <- f } else { saved <- saved +1 f <- cache[[ckey]] } f$table[1,1] <- alleles$POSITION if ( n == 1 ) { fit <- f } else { fit$table <- rbind( fit$table, f$table[1,] ) } n <- n+1 } } rownames(fit$table) <- as.character(testmergedata$Variant) print(paste(saved, 'fits saved', calculated, 'fits calculated')) fit$testmarkerdata <- testmergedata fit$interval <- interval return(fit) } # create a mergelist structure from a list of alleles associated with each strain mergelist <- function( strains, alleles ) { l <- c() strains <- make.names(strains) for ( s in strains ) { l <- c( l, as.character(alleles[[s]] )) } l <- unique(sort(as.character(l))) mlist <- list() for ( a in l ) { v <- c() for ( s in strains ) { if ( alleles[[s]] == a ) { v <- c( v, s ) } } mlist[[a]] <- v } mlist } mergeplot <- function( fit, mergedata, mode='logP', xlab='bp', ylab=NULL, main=NULL, t='p', pch=20, ... ) { def.par <- par(no.readonly = TRUE)# save default, for resetting... # layout( matrix( c(2,1), nrow=2, ncol=1 ), widths = c( 1), heights=c( 5, 1 ) ) # layout.show(2) labels <- list( text=as.character(mergedata$markers$marker), POSITION=mergedata$markers$POSITION ) happyplot( fit, mode=mode, labels=labels, xlab=xlab, ylab=ylab, main=main, t=t, pch=pch, ... ) par(def.par)#- reset to default } epistasis <- function( h, markers1, markers2=NULL, merge1=NULL, merge2=NULL, model='additive', verbose=FALSE, family='gaussian' ) { if ( is.null( markers2 ) ) { # all pairwise interactions nmarkers = length(markers1) ninteractions <- nmarkers*(nmarkers-1)/2 results <- matrix( nrow=ninteractions, ncol=7 ) colnames(results) <- c( 'marker1', 'marker2', 'main1', 'main2', 'main1+main2', 'main1*main2', 'main1.main2' ) r <- 1 d <- list() length(d) <- nmarkers main <-list() length(main) <- nmarkers logten <- log(10) # precalculate the main effects for ( m in 1:nmarkers ) { d[[m]] <- hdesign( h, markers1[m], mergematrix=merge1, model=model ) f <- glmfit( h$phenotype ~ d[[m]] , family=family) a <- anova(f) main[[m]] <- -log(a[1,5])/logten # its log-P value } for ( m1 in 2:nmarkers ) { ma1 <- markers1[m1] print(ma1) for( m2 in 1:(m1-1) ) { results[r,] <- epistasispair( h, ma1, markers1[m2], merge1=merge1, merge2=merge2, model=model, verbose=verbose, d1=d[[m1]], d2=d[[m2]], main1=main[[m1]], main2=main[[m2]] ) r <- r+1 } } } else { nmarkers1 = length(markers1) nmarkers2 = length(markers2) ninteractions <- nmarkers1*nmarkers2 results <- matrix( nrow=ninteractions, ncol=7 ) colnames(results) <- c( 'marker1', 'marker2', 'main1', 'main2', 'main1+main2', 'main1*main2', 'main1.main2' ) r <- 1 for ( m1 in 1:nmarkers1 ) { for( m2 in 1:nmarkers2 ) { results[r,] <- epistasispair( h, markers1[m1], markers2[m2], merge1=merge1, merge2=merge2, model=model, verbose=verbose ) r <- r+1 } } } results } epistasispair<- function( h, marker1, marker2, merge1=NULL, merge2=NULL, model='additive', verbose=FALSE, d1=NULL, d2=NULL, main1=0, main2=0, family='gaussian' ) { logten <- log(10) # fit the first marker if ( is.null(d1)) { d1 <- hdesign( h, marker1, mergematrix=merge1, model=model ) m1 <- glmfit( h$phenotype ~ d1 , family=family) a1 <- anova(m1) main1 <- -log(a1[1,5])/logten # its log-P value } # fit the second marker if ( is.null(d2)) { d2 <- hdesign( h, marker2, mergematrix=merge2, model=model ) m2 <- glmfit( h$phenotype ~ d2 , family=family) a2 <- anova(m2) main2 <- -log(a2[1,5])/logten } additive <- 0 epistatic <- 0 full <- 0 if ( ! is.null(d1) && ! is.null(d2) ) { const <- glmfit( h$phenotype ~ 1 , family=family) # fit both markers additively d12 <- cbind( d1, d2 ) m12 <- glmfit( h$phenotype ~ d12 , family=family) a12 <- anova(m12) additive <- -log(a12[1,5])/logten # fit both markers epistatically D12 <- matrixSquared( d1, d2 ) M12 <- glmfit( h$phenotype ~ D12 , family=family) A12 <- anova(const,m12,M12) if ( verbose ) print(A12) additive <- -log(A12[2,6])/logten # the log P for the additive main effects epistatic <- -log(A12[3,6])/logten # the log P for the interaction after removing main effects an12 <- anova( M12 ); full <- -log(an12[1,5])/logten # log p-value for full interaction } # print(c( marker1, marker2, main1, main2, additive, full, epistatic )) c( marker1, marker2, main1, main2, additive, full, epistatic ) } matrixSquared <- function( matrix1, matrix2 ) { dim1 <- dim(matrix1) dim2 <- dim(matrix2) I <- NULL if ( dim1[1] == dim2[1] && dim1[2] == dim2[2] ) { I <- matrix( nrow=dim1[1], ncol=dim1[2]*dim2[2]) c12 <- 1 for( c1 in 1:dim1[2] ) for( c2 in 1: dim2[2] ) { I[,c12] <- matrix1[,c1]*matrix2[,c2] c12 <- c12+1 } } I } # support for a seqential fit of marker intervals hfit.sequential<- function ( h, threshold=2, markers=NULL, model='additive', mergematrix=NULL, covariatematrix=NULL, verbose=FALSE, family='gaussian') { if ( is.null(covariatematrix) ) covariatematrix <- matrix(1, nrow=length(h$phenotypes),ncol=1) nullfit <- glmfit( as.formula('h$phenotypes ~ covariatematrix'), family=family ) lastfit <- nullfit cat('\nNull Model\n') print(anova(nullfit)) fit <- hfit( h, markers=markers, model=model, mergematrix=mergematrix, covariatematrix=covariatematrix, verbose=verbose, family=family ); logP <- fit$maxp; m <- fit$maxm kmax <- length(h$markers)-1 k <- 0 intervals <- c() interval <- list() oldformula <- 'h$phenotypes ~ covariatematrix' submatrix <- covariatematrix while ( logP > threshold && k < kmax ) { k <- k+1 intervals <- c(intervals, m) d <- hdesign( h, m, model=model, mergematrix=mergematrix ) interval[[m]] <- d newformula <- paste(oldformula, ' + interval[[\'', m, '\']]', sep="") nextfit <- glmfit( as.formula(newformula), family=family ) cat(paste( "\n", k, " *** intervals", paste(intervals, collapse=","), "logP", logP), "\n\n" ) cat('\nPartial Comparison:\n') an1 <- anova(lastfit,nextfit) print(an1) cat('\nFull Comparison:\n') an2 <- anova(nullfit,nextfit) print(an2) print(summary.lm(nextfit)) lastfit <- nextfit oldformula <- newformula submatrix <- cbind( submatrix, d ) fit <- hfit( h, markers=markers, model=model, mergematrix=mergematrix, covariatematrix=submatrix, verbose=verbose, family=family ); logP <- fit$maxp; m <- fit$maxm } NULL } # support for multiple phenotypes # phen should be a data table of numeric phenotypes pfit <- function( h, phen, markers=NULL, model='additive', mergematrix=NULL, covariatematrix=NULL, verbose=FALSE, family='gaussian') { if ( is.data.frame(phen) ) { pnames <- names(phen) if ( nrow(phen) == length(h$subjects) ) { results <- list(); for ( p in pnames) { cat(p, "\n") results[[p]] <- hfit(h,phenotype=as.numeric(as.character(phen[[p]])),family=family,covariatematrix=covariatematrix) } return(results) } else { print(paste('ERROR - number of rows in phen = ', nrow(phen), ', different from number of subjects ', nrow(h$subjects))) } } else { print('ERROR phen is not a data.frame') } NULL } glmfit <- function( formula=NA, family='gaussian' ) { formula <- as.formula(formula) if ( family == 'gaussian' ) return(lm( formula )) else return(glm( formula, family=family)) } # convert a vector of numbers into gaussian deviates normalise <- function( values=NULL ) { n<- length(values)+1; r <- rank(values)/n; qnorm(r) } ################################### gaussian.iterate <- function( d, params ) { y <- d$y probs <- d$probs # n x p beta <- params$beta sigma2 <- params$sigma2 n <- length(y) yy <- t(array(y, dim=c(n,length(beta)))) pp <- array(beta,dim=c(length(beta),n)) xx <- yy-pp # p x n xxx <- xx*xx # p x n e <- t(exp(-xxx/(2*sigma2)))*probs # n x p w <- e / apply( e,1,sum) # divide by col sums n x p ws <- apply(w,2,sum) beta.new <- drop( y %*% w ) / ws rs <- apply( e, 1, sum ) sigma2.new <- mean (apply( t(w) * xxx, 2, sum ) ) LogL <- -sum(log(rs)) + n* log(2*pi*sigma2) *0.5 dbeta <- ( beta - beta.new )*ws/sigma2 dsigma2 <- 0.5*n/sigma2 * (1 - sigma2.new/sigma2 ) return (list( LogL=LogL, beta=beta.new, sigma2=sigma2.new, dbeta=dbeta, dsigma2=dsigma2 )) } gaussian.init <- function( d ) { dm<- dim(d$probs) df <- dm[2] n <- length(d$y) sigma2 <- var(d$y)*(n-1)/n mu <- mean(d$y) LogL <- gaussian.null( n, sigma2 ) return( list( sigma2=sigma2, beta=array(mu, dim=df), LogL=LogL)) } gaussian.null <- function( n, sigma2 ) { return( 0.5*n*(1+log(2*pi*sigma2))) } gaussian.loop <- function ( d, maxit=100, eps=1.0e-3, df=NULL ) { i <- 0 e <- 2*eps Flast <- 0.0 params.null <- gaussian.init( d ) params <- params.null while( i < maxit && e > eps ) { params.new <- gaussian.iterate( d, params ) i <- i+1 params <- params.new e <- abs(params.new$LogL-Flast); # print(c(i,e,params.new$LogL,params.new$sigma)) Flast <- params.new$LogL } params$it <- i params$eps <- e params$N <- length(d$y) params$Null <- gaussian.null(params$N,params.null$sigma2) params$chi <- 2*( params$Null - params.new$LogL ) if ( is.null(df) ) params$df <- length(params$beta)-1 else params$df <- df print( c(params$chi, params$df , params$Null, params$LogL)) params$Pval <- pchisq( params$chi, params$df , lower.tail=FALSE) params$LogPval <- -log10(params$Pval) if ( params$chi < 0 ) { cat (c("error ", params$chi, params$Null , params$LogL, "\n")) # print( params$sigma) # print( params$beta) } return( params ) } gaussian.fn <- function( p, d=NULL ) { params <- list( beta=p[2:length(p)], sigma2=p[1] ) params.new <- gaussian.iterate( d, params ) res <- params.new$LogL attr(res, "gradient") <- c( params.new$dsigma2, params.new$dbeta ) res } gaussian.gr <- function( p, d=NULL ) { params <- list( beta=p[2:length(p)], sigma2=p[1] ) params.new <- gaussian.iterate( d, params ) c( params.new$dsigma2, params.new$dbeta ) } gfit <- function( h, eps=1.0e-4, shuffle=FALSE, method="optim" ) { y <- h$phenotypes nm <- length(h$markers)-1 table <- matrix(nrow=nm,ncol=7) colnames(table) <- c( "cM", "marker", "LogP", "ChiSq", "Null", "df", "Pval") if ( shuffle ) y <- sample(y) for( m in 1:nm ) { p <- hprob( h, h$markers[m] ) q <- qr(p) df <- q$rank d <- list( y=y, probs=p ) if ( method == "optim" ) { params0 <- gaussian.init( d ) p0 <- c( params0$sigma2, params0$beta ) # res <- nlm( gaussian.fn, p0, print.level=2, check.analyticals=TRUE, d=d) res <- optim( p0, gaussian.fn, gaussian.gr, method="BFGS", d=d) chi <- 2*(params0$LogL - res$value) pval <- pchisq( chi, df , lower.tail=FALSE) LogPval <- -log10(pval) table[m,] <- c( h$map[m], h$markers[m], LogPval, chi, params0$LogL, df, pval ) print(table[m,]) } else { res <- gaussian.loop( d, eps=eps, df=df ) cat( h$markers[m], "chi", res$chi, "df", res$df, "logPval", res$LogPval, "LogL", res$LogL, "Null", res$Null, "\n" ) table[m,] <- c( h$map[m], h$markers[m], res$LogPval, res$chi, res$Null, res$df, res$Pval ) } } return ( list( table=table, offset=3, width=1, model="mixture",test="gfit", method=method, maxm=max(as.numeric(table[,3]),na.rm=TRUE), maxp=which.max(as.numeric(table[,3])) )) } # Genome Cache functions save.genome <- function ( gdir, sdir, prefix, chrs=NULL, file.format="ped", mapfile=NULL, ancestryfile=NULL, generations=50, phase="unknown", haploid=FALSE, mc.cores=1 ) { if ( is.null(chrs) ) chrs <- the.chromosomes() if ( ! file.exists(sdir)) dir.create(sdir) if ( haploid == FALSE ) { full <- paste(sdir, "/full/", sep="") dir.create(full) } additive <- paste(sdir, "/additive/", sep="") dir.create(additive) genotype <- paste(sdir, "/genotype/", sep="") dir.create(genotype) if ( ! require(multicore)) mc.cores = 1 if ( mc.cores <=1 ) { lapply( chrs, save.happy.internal, gdir, prefix, file.format, ancestryfile, generations, mapfile, phase, haploid, additive, full, genotype) } else { mclapply( chrs, save.happy.internal, gdir, prefix, file.format, ancestryfile, generations, mapfile, phase, haploid, additive, full, genotype, mc.cores=mc.cores) } } save.happy.internal <- function( chr, gdir, prefix, file.format, ancestryfile, generations, mapfile, phase, haploid, additive, full, genotype) { h <- happy( paste( gdir, chr, prefix, ".data", sep="" ), paste( gdir, chr, prefix, ".alleles", sep="" ), file.format=file.format, ancestryfile=ancestryfile, generations=generations, do.dp=TRUE, mapfile=mapfile, phase=phase, haploid=haploid ) save.happy( h, chr, dir=additive, model="additive" ) if ( h$haploid == FALSE ) save.happy( h, chr, dir=full, model="full" ) save.happy( h, chr, dir=genotype, model="genotype" ) delete.happy.cobject(h) } delete.happy.cobject <- function(h) { cat("delete.happy() called\n") } the.chromosomes <- function( autosomes=19, use.X=FALSE ) { if ( use.X) return(paste( "chr", c(1:autosomes,"X"), sep="")) else return(paste( "chr", c(1:autosomes), sep="")) } save.happy <- function( h, pkg, dir, model="additive" ) { ddp <- paste( dir, "/", pkg, sep="") g.data.attach(ddp) if ( model == "genotype" ) nm <- length(h$markers) else nm <- length(h$markers) -1 # markers.safe = as.character(h$markers[1:nm]) markers.safe = make.names(h$markers[1:nm]) assign("markers", h$markers[1:nm], 2) assign("markers.safe",markers.safe,2) assign("map", h$map[1:nm], 2 ) assign("chromosome", h$chromosome[1:nm], 2 ) assign("subjects", h$subjects, 2 ); assign("strains", h$strains, 2 ) assign("haploid", h$haploid, 2) print ("saving strains") if ( !is.null(h$bp)) assign("bp", h$bp[1:nm], 2); if ( model == "genotype" ) { for( m in 1:nm ) { assign(markers.safe[m], hgenotype( h, m, collapse=FALSE ), 2) } g.data.save(ddp) } else { for( m in 1:nm ) { assign(markers.safe[m], hdesign( h, m, model=model ), 2) } g.data.save(ddp) } return(ddp) } load.genome <- function (sdir, use.X = TRUE, chrs = the.chromosomes(use.X=use.X), n.chr=NA, models=c("additive", "full", "genotype")) # CHANGED { g <- list() old.subjects <- NULL old.strains <- NULL if ( is.integer(n.chr) ) chrs = paste("chr", 1:n.chr, sep="") for (model in models) { if ( file.exists( paste(sdir, model, sep = "/") )) { pkgs <- paste(sdir, model, chrs, sep = "/") # CHANGED pkgs = pkgs[file.exists(pkgs)] markers <- c() chromosome <- c() map <- c() pkgname <- c() bp <- c() for (p in pkgs) { # chromosome <- c(chromosome, g.data.get("chromosome", p)) chromosome <- c(chromosome, happy.load.data("chromosome", p)) m <- happy.load.data("markers", p) markers <- c(markers, m) map <- c(map, happy.load.data("map", p)) bp <- c(bp, happy.load.data("bp", p)) pkgname <- c(pkgname, rep(p, length(m))) subjects <- happy.load.data("subjects", p) strains <- happy.load.data("strains", p) if ( is.null(old.subjects)) { old.subjects <- subjects } if ( !identical(subjects,old.subjects )) { cat( "ERROR - subject names are inconsistent for chromosome ", chromosome[1] , "\n", subjects, "\n", old.subjects, "\n") stop( "FATAL HAPPY ERROR") } if ( is.null(old.strains)) { old.strains <- strains } if ( ! identical( strains, old.strains) ) { cat( "ERROR - strain names are inconsistent for chromosome ", chromosome[1] , "\n", strains, "\n", old.strains, "\n") stop( "FATAL HAPPY ERROR") } } genome <- data.frame( marker = I(as.character(markers)), map = as.numeric(map), bp = as.numeric(bp), ddp = I(as.character(pkgname)), chromosome = I(as.character(chromosome))) g[[model]] <- list( genome = genome, subjects = subjects, strains = strains, markers = as.character(genome$marker), chromosome = as.character(genome$chromosome), map = genome$map) } } g$subjects <- g$genotype$subjects g$strains <- g$additive$strains g$markers <- g$genotype$markers g$chromosome <- g$genotype$chromosome g$map <- g$genotype$map class(g) <- "happy.genome" return(g) } load.markers <- function( genome, markers, model="additive", include.models=FALSE ) { if ( length(model) == 1 ) model <- rep( model, length(markers)) marker.list <- list() model.list <- list() for(i in 1:length(markers)) { genome.model <- genome[[model[i]]] if ( is.numeric(markers)) rows <- markers else rows <- pmatch( as.character(markers[i]), as.character(genome.model$genome[,1]), nomatch=NA ) if ( length(rows) > 0 ) { r <- rows[1] m <- as.character(genome.model$genome[r,"marker"]) # m.names = make.names(m) pkg <- as.character(genome.model$genome[r,"ddp"]) marker.list[[m]] <- happy.load.data( m, pkg) ### model.list[[m]] <- model[i] } } if ( include.models ) return( list( marker=marker.list, model=model.list )) else return( marker.list ) } happy.load.data <- function (item, dir) # replaces calls to g.data.get, to make things backwards compatible. { env <- new.env() # determine which version of g.data was used to save the data filename.pre2009 <- file.path(dir, "data", paste(item, "RData", sep = ".")) if (file.exists(filename.pre2009)) { load(filename.pre2009, env) return ( get(item, envir = env) ) } # assume 2009 version of g.data was used filename.post2009 <- file.path(dir, paste(gsub("([[:upper:]])", "@\\1", item), "RData", sep = ".") ) if (file.exists(filename.post2009)) { load(filename.post2009, env) return ( get(item, envir = env ) ) } mm = make.names(item) filename.make.names = file.path(dir,paste(gsub("([[:upper:]])", "@\\1", mm), "RData", sep = ".")) if (file.exists(filename.make.names)) { load(filename.make.names, env) return ( get(mm, envir = env ) ) } stop("Could not find data for ", item, " in package ", dir) } happy.hbrem/R/hbrem.R0000644000261400006360000002452412254601044013675 0ustar00rmottmott.packageName <- "happy.hbrem_2.2" hbrem <- function( RX, HaploidInd, Ndip, Nstrain, Nind, Npost=2000, Nbin, Ry ) { brem <- .Call( "hbrem", RX, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin, Ry, PACKAGE="happy.hbrem" ) return(brem) } hbrem.true <- function( RX, HaploidInd, Ndip, Nstrain, Nind, Npost=2000, Nbin, Ry ) { brem.true <- .Call( "hbrem_true", RX, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin, Ry, PACKAGE="happy.hbrem" ) return(brem.true) } hbrem.locus <- function(m, g, model, Ry, cc, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin) { if ( class(g) == "condensed.happy" ) { cum.mark <- 0 chr <- 0 while ( m > cum.mark) { chr = ( chr + 1 ) cum.mark = ( cum.mark + length(g[[model]]$chr[[chr]]) ) } cum.gen <- ( cum.mark - length(g[[model]]$chr[[chr]]) ) chr.mark <- ( m - cum.gen ) d <- g[[model]]$chr[[chr]][chr.mark][[1]][[1]] d.cc <- d[cc,] } else { d <- hdesign(g, m, model=model) d.cc <- d[cc,] } hb <- hbrem(RX=d.cc, HaploidInd=HaploidInd, Ndip=Ndip, Nstrain=Nstrain, Nind=Nind, Npost=Npost, Nbin=Nbin, Ry=Ry) reg.full.lm <- lm(Ry ~ d.cc) reg.null.lm <- lm(Ry ~ 1) Ftest <- anova(reg.null.lm, reg.full.lm) pval <- Ftest$"Pr(>F)"[2] # cat( m, -log10(pval), "\n" ) return( c( -log10(pval), hb[[1]], hb[[2]], hb[[3]], hb[[4]], hb[[5]] )) } hbrem.perm.locus <- function(m, g, model, Ry, cc, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin) { if ( class(g) == "condensed.happy" ) { cum.mark <- 0 chr <- 0 while ( m > cum.mark) { chr = ( chr + 1 ) cum.mark = ( cum.mark + length(g[[model]]$chr[[chr]]) ) } cum.gen <- ( cum.mark - length(g[[model]]$chr[[chr]]) ) chr.mark <- ( m - cum.gen ) d <- g[[model]]$chr[[chr]][chr.mark][[1]][[1]] d.cc <- d[cc,] } else { d <- hdesign(g, m, model=model) d.cc <- d[cc,] } nrow.d <- length(d.cc[,1]) ncol.d <- length(d.cc[1,]) d.best <- matrix(rep(0, nrow.d*ncol.d), nrow.d, ncol.d) best.cc <- c() for ( i in 1:nrow.d ) { best <- which( d.cc[i,] == max(d.cc[i,]) ) if ( length(best) == 1 ) { best.cc[i] = best } else { best.cc[i] = sample(best, 1) } d.best[ i, best.cc[i] ] = 1 } hb <- hbrem.true(RX=d.best, HaploidInd=HaploidInd, Ndip=Ndip, Nstrain=Nstrain, Nind=Nind, Npost=Npost, Nbin=Nbin, Ry=Ry) reg.full.lm <- lm(Ry ~ d.cc) reg.null.lm <- lm(Ry ~ 1) Ftest <- anova(reg.null.lm, reg.full.lm) pval <- Ftest$"Pr(>F)"[2] # cat( m, -log10(pval), "\n" ) return( c( -log10(pval), hb[[1]], hb[[2]], hb[[3]], hb[[4]], hb[[5]] )) } hbrem.merge.locus <- function(m, sdp, g, model, Ry, cc, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin) { d <- hdesign(g, m, model=model) d.cc <- d[cc,] if ( model == "additive" ) { all.0 <- numeric(length(d[,1])) all.1 <- numeric(length(d[,1])) for ( i in 1:Nstrain ) { if ( sdp[i] == 0 ) { all.0 = ( all.0 + d[,i] ) } else if ( sdp[i] == 1 ) { all.1 = ( all.1 + d[,i] ) } else { cat("sdp not 0 or 1\n") } } d.merge <- cbind(all.0, all.1) } else if ( model == "full" ) { sdp.matrix <- matrix(kronecker(sdp, sdp, paste, sep=""), nrow=Nstrain) sdp.vector <- c( diag(sdp.matrix), sdp.matrix[upper.tri(sdp.matrix, diag=FALSE)]) sdp.full <- rep(1,36) sdp.full[sdp.vector == "00"] = 0 sdp.full[sdp.vector == "11"] = 2 dip.0 <- numeric(d[,1]) dip.1 <- numeric(d[,1]) dip.2 <- numeric(d[,1]) for ( i in 1:Ndip ) { if ( sdp.full[i] == 0 ) { dip.0 = ( dip.0 + d[,i] ) } else if ( sdp.full[i] == 1 ) { dip.1 = ( dip.1 + d[,i] ) } else if ( sdp.full[i] == 2 ) { dip.2 = ( dip.2 + d[,i] ) } else { cat("sdp not 0,1 or 2\n") } } d.merge <- cbind(dip.0,dip.1,dip.2) } else { cat("model not specified: must be one of additive, full\n") } hb <- hbrem(RX=d.merge, HaploidInd=HaploidInd, Ndip=3, Nstrain=2, Nind=Nind, Npost=Npost, Nbin=Nbin, Ry=Ry) reg.full.lm <- lm(Ry ~ d.merge) reg.null.lm <- lm(Ry ~ 1) Ftest <- anova(reg.null.lm, reg.full.lm) pval <- Ftest$"Pr(>F)"[2] # cat( m, -log10(pval), "\n" ) return( c( -log10(pval), hb[[1]], hb[[2]], hb[[3]], hb[[4]], hb[[5]] )) } hbrem.region <- function(g, markers, Ry, cc, HaploidInd, Npost, Nbin, Nperm=1000, thres.quick=c(0.5, 0.1, 0.05), thres.precise=c( 0.05, (1/length(markers)) ), thres.method="none", mc.cores=1) { if ( HaploidInd == 1 ) { model = "additive" Ndip = length(g$strains) Nstrain = Ndip Nind = length(Ry) marker.names <- g$additive$genome$marker[markers] } else if ( HaploidInd == 0 ) { model = "full" ns = length(g$strains) Ndip = ns*(ns+1)/2 Nstrain = ns Nind = length(Ry) marker.names <- g$full$genome$marker[markers] } nmark <- length(markers) if ( mc.cores == 1 ) res = t(sapply ( markers, hbrem.locus, g, model, Ry, cc, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin) ) else { res.list=mclapply ( markers, hbrem.locus, g, model, Ry, cc, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin, mc.cores=mc.cores) res = t(do.call( "cbind", res.list ) ) } mark.pars.df <- data.frame(res[,1:46]) names(mark.pars.df) = c( "F.logPval", "Hbar", "sd.Ni", "BIC.qtl", "BIC.null", "BF", "logBF", "DIC.qtl", "DIC.null", "DIC.diff", "pd.qtl", "pd.null", "mode.k", "ga", "gb", "mode.var", "med.k", "med.mu", "med.var", "mean.k", "mean.mu", "mean.var", "hpd.k.50.lower", "hpd.k.50.upper", "hpd.mu.50.lower", "hpd.mu.50.upper", "hpd.var.50.lower", "hpd.var.50.upper", "hpd.k.75.lower", "hpd.k.75.upper", "hpd.mu.75.lower", "hpd.mu.75.upper", "hpd.var.75.lower", "hpd.var.75.upper", "hpd.k.95.lower", "hpd.k.95.upper", "hpd.mu.95.lower", "hpd.mu.95.upper", "hpd.var.95.lower", "hpd.var.95.upper", "hpd.k.99.lower", "hpd.k.99.upper", "hpd.mu.99.lower", "hpd.mu.99.upper", "hpd.var.99.lower", "hpd.var.99.upper") offset = ncol(mark.pars.df) mark.pars.df$Name=as.character(marker.names) if ( class(g) == "happy.genome" ) { idx = match( marker.names, g[[model]]$genome$marker ) mark.pars.df$Chr = g[[model]]$genome$chromosome[idx] mark.pars.df$Bp = g[[model]]$genome$bp[idx] bp2 = mark.pars.df$Bp[2:length(mark.pars.df$Bp)] bidx = which(bp2 < mark.pars.df$Bp[1:(length(mark.pars.df$Bp)-1)])-1 mark.pars.df$CumBp = rep(0,nrow(mark.pars.df)) mark.pars.df$CumBp[bidx+2] = mark.pars.df$Bp[bidx+1] mark.pars.df$CumBp = cumsum(mark.pars.df$CumBp) + mark.pars.df$Bp } cat("max logP = ", max(mark.pars.df$F.logPval), "\n") cat("max mode(k) = ", max(mark.pars.df$mode.k), "\n") hap.means.df <- data.frame( res[,(offset+1):(offset+Ndip)]) hap.sdevs.df <- data.frame( res[,(offset+Ndip+1):(offset+2*Ndip)]) hap.avNis.df <- data.frame( res[,(offset+2*Ndip+1):(offset+3*Ndip)]) strain.means.df <- data.frame( res[,(offset+3*Ndip+1):(offset+3*Ndip+Nstrain)]) if ( HaploidInd == 1 ) { names(hap.means.df) <- g$strains names(hap.sdevs.df) <- g$strains names(hap.avNis.df) <- g$strains names(strain.means.df) <- g$strains } else if ( HaploidInd == 0 ) { strain.names = g$strains num.strains = length(g$strains) diplotype.names <- matrix(kronecker(strain.names, strain.names, paste, sep="."), nrow=num.strains) names.full.symmetric <- c( diag(diplotype.names), diplotype.names[upper.tri(diplotype.names, diag=FALSE)]) names(hap.means.df) <- names.full.symmetric names(hap.sdevs.df) <- names.full.symmetric names(hap.avNis.df) <- names.full.symmetric names(strain.means.df) <- g$strains } permuted.y=NULL if ( is.numeric(Nperm) & Nperm>0 ) { permuted.y = replicate( Nperm, sample(Ry) ) } FlogP.thres=NULL modek.thres=NULL if ( thres.method == "precise" ) { FlogP.region = matrix( rep(0, nmark*Nperm), Nperm, nmark ) modek.region = matrix( rep(0, nmark*Nperm), Nperm, nmark ) for ( i in 1:Nperm ) { cat("perm ", i, "\n") if ( mc.cores == 1 ) res = t(sapply ( markers, hbrem.locus, g, model, permuted.y[,i], cc, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin) ) else { res.list=mclapply ( markers, hbrem.locus, g, model, permuted.y[,i], cc, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin, mc.cores=mc.cores) res = t(do.call( "cbind", res.list ) ) } FlogP.vector <- res[,1] modek.vector <- res[,13] for ( j in 1:nmark ) { FlogP.region[i,j] = FlogP.vector[j] modek.region[i,j] = modek.vector[j] } } for ( j in 1:nmark ) { FlogP.region[,j] <- sort.list(-FlogP.region[,j]) modek.region[,j] <- sort.list(-modek.region[,j]) } FlogP.thres <- FlogP.region[Nperm*thres.precise,] modek.thres <- modek.region[Nperm*thres.precise,] } else if ( thres.method == "quick" ) { FlogP.regionwide.distn <- numeric(Nperm) modek.regionwide.distn <- numeric(Nperm) for ( i in 1:Nperm ) { cat("perm ", i, "\n") if ( mc.cores == 1 ) res = t(sapply ( markers, hbrem.perm.locus, g, model, permuted.y[,i], cc, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin) ) else { res.list=mclapply ( markers, hbrem.perm.locus, g, model, permuted.y[,i], cc, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin, mc.cores=mc.cores) res = t(do.call( "cbind", res.list ) ) } FlogP.vector <- res[,1] modek.vector <- res[,12] max.FlogP <- max(FlogP.vector) max.modek <- max(modek.vector) FlogP.regionwide.distn[i] = max.FlogP modek.regionwide.distn[i] = max.modek } FlogP.sort <- FlogP.regionwide.distn[ sort.list(-FlogP.regionwide.distn) ] modek.sort <- modek.regionwide.distn[ sort.list(-modek.regionwide.distn) ] FlogP.thres <- FlogP.sort[Nperm*thres.quick] modek.thres <- modek.sort[Nperm*thres.quick] } hbrem.region.list <- list(Summary.Parameters=mark.pars.df, Diplo.Means=hap.means.df, Diplo.StDevs=hap.sdevs.df, Diplo.ExpCounts=hap.avNis.df, Strain.Means=strain.means.df, F.logP.thres=FlogP.thres, Mode.k.thres=modek.thres) return(hbrem.region.list) } happy.hbrem/R/zzz.R0000644000261400006360000000004312254604112013422 0ustar00rmottmott.packageName <- "happy.hbrem_2.4" happy.hbrem/man/0000755000261400006360000000000012254616417013032 5ustar00rmottmotthappy.hbrem/man/AAA-happy.Rd0000644000261400006360000004334512254601044015021 0ustar00rmottmott\name{Happy} \alias{introduction} \alias{happy} \alias{happy.matrices} \alias{happy.save} \title{Quantitative Trait Locus analysis in Heterogeneous Stocks} \description{happy is an \R interface into the HAPPY C package for fine-mapping Quantitative Trait Loci (QTL) in mosaci crosses such as Heterogenous Stocks (HS). HAPPY uses a multipoint analysis which offers significant improvements in statistical power to detect QTLs over that achieved by single-marker association. An HS is an advanced intercross between (usually eight) founder inbred strains of mice. HS are suitable for fine-mapping QTL. The happy package is an extension of the original C program happy; it uses the C code to compute the probability of descent from each of the founders, at each locus position, but the happy packager allows a much richer range of models to be fit to the data. happy() is used to initialise input files and perform dynamic programming in C. Model fitting is then performed by subsequent calls to hfit() etc. Input file foramt is described at \url{http://www.well.ox.ac.uk/happy} } \details{ \bold{Biological Background} Most phenotypes of medical importance can be measured quantitatively, and in many cases the genetic contribution is substantial, accounting for 40\% or more of the phenotypic variance. Considerable efforts have been made to isolate the genes responsible for quantitative genetic variation in human populations, but with little success, mostly because genetic loci contributing to quantitative traits (quantitative trait loci, QTL) have only a small effect on the phenotype. Association studies have been proposed as the most appropriate method for finding the genes that influence complex traits. However, family-based studies may not provide the resolution needed for positional cloning, unless they are very large, while environmental or genetic differences between cases and controls may confound population-based association studies. These difficulties have led to the study of animal models of human traits. Studies using experimental crosses between inbred animal strains have been successful in mapping QTLs with effects on a number of different phenotypes, including behaviour, but attempts to fine-map QTLs in animals have often foundered on the discovery that a single QTL of large effect was in fact due to multiple loci of small effect positioned within the same chromosomal region. A further potential difficulty with detecting QTLs between inbred crosses is the significant reduction in genetic heterogeneity compared to the total genetic variation present in animal populations: a QTL segregating in the wild need not be present in the experimental cross. In an attempt to circumvent the difficulties encountered with inbred crosses, we have been using a genetically heterogeneous stock (HS) of mice for which the ancestry is known. The heterogeneous stock was established from an 8 way cross of C57BL, BALB/c, RIII, AKR, DBA/2, I, A and C3H/2 inbred strains. Since its foundation 30 years ago, the stock has been maintained by breeding from 40 pairs and, at the time of this experiment, was in its 60th generation. Thus each chromosome from an HS animal is a fine-grained genetic mosaic of the founder strains, with an average distance between recombinants of 1/60 or 1.7 cM. Theoretically, the HS offers at least a 30 fold increase in resolution for QTL mapping compared to an F2 intercross. The high level of recombination means that fine-mapping is possible using a relatively small number of animals; for QTLs of small to moderate effect, mapping to under 0.5 cM is possible with fewer than 2,000 animals. The large number of founders increases the genetic heterogeneity, and in theory one can map all QTLs that account for progenitor strain genetic differences. Potentially, the use of the HS offers a substantial improvement over current methods for QTL mapping. \bold{Problem Statement and Requirements} \enumerate{ \item HAPPY is designed to map QTL in Heterogeneous Stocks (HS), ie populations founded from known inbred lines, which have interbred over many generations. No pedigree information is required. \item Obviously, phenotypic values for the trait must be known for all individuals. It is preferable that these are normally distributed because HAPPY uses Analysis of Variance F statistics to test for linkage (however, a permutation test can be used instead). \item For each genotyped marker, it is necessary to know the ancestral alleles in the inbred founders (which by definition must be homozygous), and the genotypes from the individuals in the final generation. \item The chromosomal position in centiMorgans of each marker must be known. \item Missing data are accomodated provided these are due to random failures in the genotyping and not selective genotyping based on the trait values (however, it is permissible to selectively genotype all the markers provided the same individuals are genotyped at each locus). } \bold{What HAPPY does} HAPPY's analyis is essentially two stage; ancestral haplotype reconstruction using dynamic programming, followed by QTL testing by linear regression: \itemize{ \item Assume that at a QTL, a pair of chromosomes originating from the progenitor strains, labelled \eqn{s,t} contribute an unknown amount \eqn{T_{st}}{T(st)} to the phenotype. In the special case where the contribution from each chromosome is additive at the locus then \eqn{T_{st} = T_s + T_t}{T(s,t) = T(s)+T(t)},say. \item a test for a QTL is equivalent to testing for differences between the \eqn{T}'s. \item A dynamic-programming algorithm is used to compute the probability \eqn{F_{iLst}}{F(n,s,t)} that a given individual \eqn{i} has the ancestral alleles \eqn{s, t} at locus labelled \eqn{L}, conditional upon all the genotype data for the individual. Then the expected phenotype is \deqn{ y = \sum_{st} T_{st} F_{iLst}}{ y = Sum (st) T(s,t)F(i, L,s,t)}, and the \eqn{T}'s are estimated by a linear regression of the observed phenotypes on these expected values across all individuals, followed by an analysis of variance to test whether the progenitor estimates differ significantly. \item The method's power depends on the ability to distinguish ancestral haplotypes across the interval; clearly the power will be lower if all markers in a region have the same type of non-informative allele distribution, but the markers can share information where there is a mixture. \item All inference is based on regression of the phenotypes on the probabilities of descent from the founder loci, \eqn{F_{nst}}{F(n,s,t)}. } Although the models are presented here in the linear model framework (ie least-squares estimation, with ANOVA F-tests), it is of course straighforward to extend them to \R's generalised linear model framework. Multivariate analysis is also possible. It is straighforward to fit models involving the effects of multiple loci and of covariates. It is easiest to see this by rewriting the problem in standard linear modelling notation. Consider first the case of fitting a QTL at a locus, \eqn{L}. Let \eqn{\bf y}{\bold{y}} be the vector of trait values. Let \eqn{\bf X_L}{\bold{X(L)}} be the design matrix for fitting a QTL at the locus \eqn{L}. Let \eqn{\bf t_L}{\bold{ t(L)}} be the vector of parameters to be estimated at the locus. For an additive QTL, the paramters are the strain effect sizes; for a full interaction model there is a paramter for every possible strain combination. Then the one-QTL model is \deqn{\bf E(y) = X_L t_L}{\bold{E(y) = X(L).t}} There are \eqn{S(S-1)/2} parameters to be estimated in a full model allowing for interactions between the alleles within the locus, and \eqn{S-1} parameters in an additive model. For the full model, the \eqn{i,j}'th element of the design matrix \eqn{\bold{X}} is related to the strain probabilities thus: \deqn{\bold{X_{Lij} = F_{iLst}}}{\bold{X(Lij) = F(iLst)}}, where \deqn{j(s,t) = \min(s + S(t-1), t + S(s-1)} and for the additive model \deqn{\bold{X_{Lij} = \sum_s F_{iLsj}}}{\bold{X(Lij) = \sum(s) F(iLst)}} \bold{ More complex models} To add covariates to the model (for instance sex or age ) we add additional columns \eqn{\bold{C}} to the design matrix: \deqn{ \bf E(y) = \left[ X_L \| C \right] ( t_L \| c ) }{\bold{E(y) = [X(L)|C].[t|c]}} where \eqn{\bf C} is a design matrix representing the covariates of interest, and \eqn{\bf c}{\bold{c}} are the parameters to be estimated.\eqn{\bf (t_L\|c)}{\bold{(t(L)|c)}} represents the vector formed by adjoining the vectors \eqn{t_L}{t(L)} and \eqn{c}. Note that at present \eqn{\bold{C}} must be a numeric matrix: factors must be explicitly converted into columns of dummy variables. Similarly to fit an additional locus \eqn{K} we adjoin the design matrix \eqn{\bold{X_K}}{\bold{X(K)}}, for example: \deqn{ \bf E(y) = \left[ X_L \| X_K \| C \right] ( t_L \| t_K \|c ) }{\bold{E(y) = [X(L)|X(K)|C].[t(L)|t(K)|c]}} (this is essentially \bold{composite interval mapping}). The happy package allows the inclusion of arbitrary covariate matrices, which can include other loci; new loci are then tested to see if they significantly improve the fit conditional upon the presence of the covariates. In this way we can analyse any number of linear combinations of loci and covariates. \bold{Epistasis}, or the interaction between loci, is supported as well. At present the package can test for interactions between unlinked loci, but not linked loci. The test compares the fit between the sum of the additive contributions from each locus and the interaction. This is accomplished as follows: Let \eqn{X_L, X_K}{X(L),X(K)} be the design matrices for the loci \eqn{L,K}. Let \eqn{m_L}{m(L)} be the number of columns in \eqn{X_L}{X(L)} Then form a matrix \eqn{X_{LK}}{X(LK)} whose \eqn{m_Lm_K}{m(L)m(K)} columns are formed by multiplying the elements in each pair of columns in the original matrices. \bold{Merging Strains} An important feature of the happy package is the suite of functions to merge strains together. The models described above (particularly the full interaction models) have the disadvantage that the fits sometimes involve a larger number of parameters, with many degrees of freedom. This is particularly true for full non-additive models and for epistasis. For example in an 8-strain HS, 28 df are required to fir a full model for a single locus. Large numbers of degrees of freedom have two problems: firstly the models may become overspecified, and secondly even if there are plenty of degrees of freedom for the residual error, the power to detect an effect is diluted. A partial solution is to note that since most polymorphisms are diallelic (eg SNPs), it makes sense to group the strains according to their alleles at some polymorphic locus. This corresponds to operating with design matrices in which certain columns are combined by adding their corresponding elements together. A diallelic merge reduces the number of degrees of freedom dramatically: only 3 df (instead of 28df) are required to fit a full model at a locus (and only 1 df instead of 7df for the addtive model), and an epistatic interaction between two merged loci will involve only 3df (additive) or 8df (full). } \references{ Mott R, Talbot CJ, Turri MG, Collins AC, Flint J. A method for fine mapping quantitative trait loci in outbred animal stocks. Proc Natl Acad Sci U S A. 2000 Nov 7;97(23):12649-54. } \usage{ happy( datafile, allelesfile, generations=200, phase="unknown", file.format="happy", missing.code="NA", do.dp=TRUE, min.dist=1.0e-5, mapfile=NULL, ancestryfile=NULL, haploid=FALSE ) happy.matrices( h ) happy.save( h, file ) } \arguments{ \item{datafile}{ name of the text file containing the genotype and phenotype data in HAPPY format} \item{allelesfile}{ name of the text file containing the allele/strain data in HAPPY format} \item{generations}{ the number of breeding generations in the HS} \item{phase}{ If phase=="unknown" then the phase of the genotypes is unknown and no attempt is made to infer it. If phase="estimate" then it is estimated using parental genotype data when available. If phase="known" then it is assumed the phase of the input genotypes is correct i.e. the first and second alleles in each genotype for an individual are on the respectively the first and second chromosomes. Where phase is known this setting should increase power, but it will cause erroneous output if it is set when the data are unphased. If phase="estimate" then file.format="ped" is assumed automatically, because the input data file must be in ped-file format in order to specify parental information. } \item{file.format}{ The format of the genotype file. Either "happy" (the default) or "ped". "happy" files do not contain any pedigree information. They are structured so that one record corresponds to an individual. The first two fields are the subject id (unique) and the phenotype value. The remaining fields are the N genotypes for the subject (where N is the number of markers specified in the alleles file), arranged in 2N fields all separated by spaces. The "ped" file format is similar except that in place of the two columns "id" and "phenotype" in the original "happy" file format there should be six columns "family", "id", "mother", "father", "gender", "phenotype". Note that the "family" field can be constant so long as the id's are all unique. The resultant name of each subject is constructed as "family.id", which must be unique. The genotype data then follow as in "happy" format. Note that if phase="estimate" then file.format="ped" automatically; it is only necessary to use this option when the input file format is "ped" but it is desired not to make any use of pedigree information.} \item{missing.code}{ The code for a missing allele in the input file. Defaults to "NA". Note that old HAPPY files use "ND"} \item{do.dp}{ A switch that turns off the dynamic programming part of happy. By default dynamic programmig is performed. The only reason to turn this off is when only the genotypes are required.} \item{min.dist}{ The minimum genetic distance (in centiMorgans) allowed between adjacent merkers. Markers positioned closer than min.dist in the input file are treated as being min.dist cM apart. This prevents problems with markers at the same position, and which HAPPY cannot process.} \item{mapfile}{ Optional name of text file containing the physical base-pair coordinates of the markers (the alleles file only contains the genetic map in centiMorgans). The file format has three columns named "marker", "chromosome" and "bp". This file is not required unless genome cache objects are to be made (see save.genome()).} \item{haploid}{ A boolean variable indicating if the genomes should be interpreted as haploid, ie. homozygous at every locus. This option is used for the analysis of both truly haploid genomes and for recombinant inbred lines where all genotypes should be homozygotes. Note that the format of the genotype file (the .data file) is unchanged, but only the first allele of each genotype is used in the analysis.The default value for this option is FALSE, i.e. the genomes are assumued to be diploid and heterozygous. } \item{ancestryfile}{ An optional file name that is used to provide subject-specific ancestry information. More Soon...}. \item{h}{ An object of class "happy"} \item{file}{ Name of file in which to save data } } \value{ happy() returns an object of type happy, which should be passed onto model-fitting functions such as hfit(). A happy object 'h' is a list with a number of useful members: \item{strains}{ a character vector containing the names of the founder strains} \item{markers}{ a character vector containing the names of the markers, in map order} \item{map}{ a numeric vector containing the map coordinates in centiMorgans of the markers} \item{subjects}{ a character vector containing the subject names } \item{phenotype}{ a numeric vector containing the subject phenotypes} \item{handle}{a numeric index used internally by the C-code. Do not change.} \item{matrices}{ a list of matrices used in model fitting (only created after a call to happy.matrices()).} \item{use.pedigrees}{ boolean variable indicating whether pedigree information was used to help determine the phase of the genotypes} \item{phase.known}{ boolean variable indicating whether or not the phase of the genotypes is assumed to be known} happy.save() will save a happy object to a file so that it can be re-used in a later session with the load() command. happy.matrices() is not normally called directly - its function is to copy all the dynamic-programming matrices created by a call to happy() from the underlying C memory space into R objects. The object returned is still a happy object, but with an additional component 'matrices'. It can be used in exactly the same way as a normal happy object except that the underlying C memory is no longer used. When a happy object is saved using happy.save() is is first converted by a call to happy.matrices(), and when a happy object is reloaded using load() it uses matrices stored in R memory. Thus these functions are a useful way to save computing time - the dynamic programming step need only be performed once, the data can be persisted to disk and then re-used (e.g. to analyse multiple phenotypes) at a later date. } \seealso{ hfit(), mergefit(), happyplot() } \examples{ \dontrun{h <- happy('HS.data', 'HS.alleles', generations=200)} \dontrun{ happy.save(h,'h.Rdata')} \dontrun{ load('h.Rdata')} } \keyword{models} \author{Richard Mott} happy.hbrem/man/epistasis.Rd0000644000261400006360000000700312254601044015313 0ustar00rmottmott\name{epistasis} \alias{epistasis} \alias{epistasispair} \title{Analysis of Epistasis between Markers} \description{epistasis() will test for a statistical interaction between two sets of markers within the happy framework. The markers should be sufficiently far apart that they are unlinked (in practice 10cM for a 30 generation HS is sufficient). A partial F-test is performed to test if a model allowing for interactions fits better than a model in which each marker's contribution is additive between loci. Note that the effect of each marker within a locus can be either additive or full. Merging of strain is permitted. epistasispair() is the same as epistasis() except that only one pair of markers is tested. } \usage{ epistasis( h, markers1, markers2, merge1=NULL, merge2=NULL, model='additive', verbose=FALSE, family='gaussian' ) epistasispair( h, marker1, marker2, merge1=NULL, merge2=NULL, model='additive', verbose=FALSE, d1=NULL, d2=NULL, main1=0, main2=0, family='gaussian' ) } \arguments{ \item{h}{an object returned by a previous call to happy()} \item{markers1}{an array of marker names or indices} \item{markers2}{an array of marker names or indices}. In epistasis() Every marker listed in marker1 is tested for interaction with every marker in marker2.If marker2 in NULL then every marker in marker1 is tested against every other marker in the array. \item{marker1}{ a single marker name or index} \item{marker2}{ a single marker name or index} In epistasispair() marker1 is tested for interaction with marker2. \item{merge1}{an optional merge object (returned by mergematrices()) determining how the strains should be merged together for the markers listed in marker1} \item{merge2}{an optional merge object (returned by mergematrices()) determining how the strains should be merged together for the markers listed in marker2} \item{model}{the type of model fitted at each locus. Either 'additive' or 'full'} \item{verbose}{switch controlling output to screen} \item{d1}{optional design matrix for the main effect of the first marker (saves computation time)} \item{main1}{optional log-P-value for the main effect of the first marker. NOTE: If d1 is not NULL then main1 \emph{must} be set } \item{d2}{optional design matrix for the main effect of the second marker (saves computation time). } \item{main2}{optional log-P-value for the main effect of the second marker. NOTE: If d2 is not NULL then main2 \emph{must} be set } \item{family}{ The distribution of errors in the data. The default is 'gaussian'. This variable controls the type of model fitting. In the gauusian case a standard linear model is fitted using lm(). Otherwise the data are fitted as a generalised linear model using glm(), when the value of family must be one of the distributions hangled by glm(), such as 'binomial', 'gamma'. See family() for the full range of models.} } \value{ epistasis() returns a matrix with columns named 'marker1', 'marker2', 'main1', 'main2', 'main1+main2', 'main1*main2', 'main1.main2'. marker1 and marker2 are the names of the markers being compared in a given row, the remaining values are the ANOVA log-P-values of the main effects (main1 and main2), the combined additive effect (main1+main2), the addtive plus interaction (main1*main2) and the partial F of the interaction (main1.main2) after allowing for main1+main2. epistasispair() returns a list with the same fields.} \keyword{models} \author{Richard Mott} happy.hbrem/man/gauss.Rd0000644000261400006360000001430412254601044014433 0ustar00rmottmott\name{gfit} \alias{gaussian.iterate} \alias{gaussian.null} \alias{gaussian.loop} \alias{gaussian.init} \alias{gaussian.fn} \alias{gaussian.gr} \alias{gfit} \title{Fit a Gaussian Mixture Model to an object returned by happy()} \description{ gfit() fits a QTL model to a happy() object. The model is a mixture of Gaussians, each with a different mean, and corresponds loosely to the "full" model in hfit(). The difference is that hfit() fits the observed phenotype values to the expected phenotypes under a full model, whereas gfit() uses maximum likelihood to fit the observed phenotype values to a mixture of Gaussians, each with a different mean but common variance. The other functions in this suite are not usually called directly. The statistical model fitted is as follows. Consider first the case of fitting a QTL at a locus, \eqn{L}. Let \eqn{\bf y}{\bold{y}} be the vector of trait values. Let \eqn{\bf X_L}{\bold{X(L)}} be the design matrix for fitting a QTL at the locus \eqn{L}. Let \eqn{\bf t_L}{\bold{ t(L)}} be the vector of parameters to be estimated at the locus. For an additive QTL, the paramters are the strain effect sizes; for a full interaction model there is a paramter for every possible strain combination. Then the one-QTL model is \deqn{\bf E(y) = X_L t_L}{\bold{E(y) = X(L).t}} There are \eqn{S(S-1)/2} parameters to be estimated in a full model allowing for interactions between the alleles within the locus, so the \eqn{i,j}'th element of the design matrix \eqn{\bold{X}} is related to the strain probabilities thus: \deqn{\bold{X_{Lij} = F_{iLst}}}{\bold{X(Lij) = F(iLst)}}, where \deqn{j(s,t) = \min(s + S(t-1), t + S(s-1))} In the function hfit(), the observed phenotypes are regressed directly on the expected trait values. This is not an optimal procedure becuase the data are really a mixture: \deqn{ y_i \tilde \sum_{st} F_{iLst} f( ( y_i - \beta_{Lst} )/2\sigma_L^2) } where \eqn{f(x)} is a standard Gaussian density. The \eqn{\bold{\beta_L}} is a vector of mean trait values for the strain combinations. The parameters \eqn{\bold{\beta_L} , \sigma_L} are estimated by maximum likelihood, and the test for the presence of a QTL at locus \eqn{L} is equivalent to the test that all the \eqn{\beta_{st}=\mu}, when the model collapses to a single Gaussian distribution. The model-fitting is implemented in the function gfit() by an iterative process, rather like a simplified version of EM. Is is slower than hfit(), and generally gives similar results as far as overall QTL detection is concered,m but gives more accurate parameter estimates. The log-likelihood for the data is \deqn{ L = \sum_{i} \log ( \sum_j p_{ij} \frac{\exp(-\frac{(y_i-\beta_j)^2}{2\sigma^2})}{\sqrt{2\pi \sigma^2}}) } \deqn{ = \sum_i \log ( \sum_j p_{ij} \exp(-\frac{(y_i-\beta_j)^2}{2\sigma^2})) - \frac{N \log(2\pi\sigma^2)}{2} } Differentiating wrt to the parameters gives XXXX \deqn{ \frac{\partial L}{\partial \sigma^2} = \sum_i \frac{\sum_j p_{ij} (y_i-\beta_j)^2 \exp(-\frac{(y_i-\beta_j)^2}{2\sigma^2}) }{ 2\sigma^4 \sum_j p_{ij} \exp(-\frac{(y_i-\beta_j)^2 }{2\sigma^2})} - \frac{N}{2\sigma^2} } \deqn{ \frac{\partial L}{\partial \beta_j } = - \sum_i \frac{ p_{ij} \frac{(y_i-\beta_j) }{\sigma^2} \exp( -\frac{(y_i-\beta_j)^2}{2\sigma^2})}{ \sum_j e_{ij}} } \deqn{ = \frac{1}{\sigma^2} \left( - \sum_i \frac{y_i e_{ij} }{\sum_j e_{ij}} + \beta_j \frac{\sum_i e_{ij} }{\sum_j e_{ij}} \right) } \deqn{ \hat{\sigma^2} = \frac{1}{N} \sum_i \sum_j \hat{w}_{ij}(y_i-\hat{\beta}_j)^2 } write \deqn{ w_{ij} = \frac{p_{ij} \exp(-\frac{(y_i-\beta_j)^2}{2\sigma^2}) }{ \sum_j p_{ij} \exp(-\frac{(y_i-\beta_j)^2}{2\sigma^2})}} then the mle satisfies \deqn{ \hat{\beta_j} = \frac{\sum_i \hat{e}_{ij} y_i}{\sum_i \hat{e}_{ij}} } \deqn{ \hat{\sigma^2} = \frac{1}{N} \sum_i \frac{\sum_j p_{ij}(y_i-\beta_j)^2 \exp(-\frac{(y_i-\beta_j)^2}{2\sigma^2})} {\sum_j p_{ij}\exp(-\frac{(y_i-\beta_j)^2}{2\sigma^2})}} and the log-likelihood is \deqn{ \hat{L} = \sum_i(\log \sum_j \hat{e}_{ij}) - \frac{N \log(2\pi\hat{\sigma}^2)}{2} } } \usage{ gfit( h,eps=1.0e-4, shuffle=FALSE, method="optim" ) gaussian.loop( d, maxit=100, eps=1.0e-3, df=NULL ) gaussian.null( n, sigma2 ) gaussian.init( d ) gaussian.iterate( d, params ) gaussian.fn( p, d=NULL ) gaussian.gr( p, d=NULL ) } \arguments{ \item{h}{ an object returned by a previous call to happy()} \item{shuffle}{ boolean indicating whether the shuffle the phenotypes to perform a permutation test} \item{method}{The optimisation algorithm. Default is to use R's "optim" function, which uses derivative information. All other values of this argument will use an EM type iteration.} \item{d}{ a list comprising two elements d, probs} \item{maxit}{ the maximum number of iterations in the ML fitting} \item{eps}{ the terminatation accuracy in the model fitting : the log likelihood must change by less than eps in successive iterations} \item{df}{ the degress of freedom to use. If NULL then this is computed as the rank of the data} \item{n}{ the number of observations with non-missing phenotypes} \item{sigma2}{the variance of the errors in the data} \item{params}{ a list with two components, beta = the group means and sigma = the standard deviation} \item{p}{ vector of paramters. For internal use only} } \value{ gfit() returns a matrix with columns "marker", "LogL", "Null", "Chi", "df", "Pval", "LogPval". Each row of the column describes the fit of the model for thecorresponding marker interval. gaussian.loop() fits the model to a single marker and returns a list with the same elements as in hfit() gaussian.iterate() performs a single iteration of the fitting process and returns a list with the updated LogL, beta, sigma, dbeta and dsigma gaussian.init() intialises the parameters under the Null model, ie to the case where the means are all identical and the variance is the overal variance. gaussian.null() returns the log-likelihood under the Null model gaussian.fn() and gaussian.gr() are the function and gradient required by the optim function. } \examples{ ## An example session: # initialise happy \dontrun{h <- happy('HS.data','HS.alleles')} # fit all the markers \dontrun{f <- gfit(h)} } \seealso{ happy{}, hprob{} } \keyword{models} \author{Richard Mott} happy.hbrem/man/happy-internal.Rd0000744000261400006360000000257212254606275016264 0ustar00rmottmott\name{happy-internal} \alias{happy-internal} \title{Internal Happy Functions} \description{Internal functions for happy. These are not normally called by the user} \alias{comparelist} \alias{matrixSquared} \alias{mfit} \alias{twofit} \alias{condfit} \alias{strain.effects} \alias{glmfit} \alias{sdp} \alias{hprob2} \alias{hnonrecomb} \alias{h.sum.prob2} \alias{delete.happy.cobject} \alias{happy.load.data} \alias{hbrem} \alias{hbrem.locus} \alias{hbrem.region} \alias{hbrem.true} \alias{hbrem.merge.locus} \alias{hbrem.perm.locus} \alias{save.happy.internal} \usage{ matrixSquared( matrix1, matrix2 ) twofit( happy, marker1, marker2, merge1=NULL, merge2=NULL, model = 'additive', verbose=TRUE, family='gaussian' ) mfit( happy, markers, model='additive', mergematrix=NULL, covariatematrix=NULL, verbose=TRUE , family='gaussian', variants=NULL ) condfit( happy, markers, condmarker, merge=NULL, condmerge=NULL, model='additive',condmodel='additive', epistasis=FALSE, verbose=TRUE, family='gaussian' ) strain.effects( h, fit, family='gaussian' ) glmfit( formula=NA, family='gaussian' ) sdp( strains, alleles ) hbrem.perm.locus(m, g, model, Ry, cc, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin) hbrem.true( RX, HaploidInd, Ndip, Nstrain, Nind, Npost=2000, Nbin, Ry ) hbrem.merge.locus(m, sdp, g, model, Ry, cc, HaploidInd, Ndip, Nstrain, Nind, Npost, Nbin) } \author{Richard Mott} \keyword{internal} happy.hbrem/man/happyplot.Rd0000644000261400006360000000523612254601044015335 0ustar00rmottmott\name{happyplot} \alias{happyplot} \alias{mergeplot} \title{Plotting functions for happy model fits} \description{ happyplot() will plot along the genome the log P-value that a QTL is not found in a series of marker intervals. It accepts as input the results of hfit(), mfit() and mergefit(). mergeplot() is a convenience function for calling happlyplot() after a call to mergefit(), with several parameters set.} \usage{ happyplot( fit, mode='logP', labels=NULL, xlab='cM', ylab=NULL, main=NULL, t='s', pch=20, ... ) mergeplot( fit, mergedata, mode='logP', xlab='bp', ylab=NULL, main=NULL, t='p', pch=20, ... ) } \arguments{ \item{fit}{ an object returned by a previous call to hfit(), mfit(), or mergefit() } \item{mode}{ the mode of the plot - either 'logP', when the negative base-10 logarithm of the ANOVA P-value of plotted, or 'SS', when the fitting sums-of-squares is plotted} \item{labels}{ optional matrix detailing marker labels to be drawn on the plot. The labels are written vertically above the plot, with vertical lines extending down into the plot area. labels is a matrix with two named columns 'marker', containing the marker names, and 'POSITION', containing the x-axis positions of the markers.} \item{mergedata}{ (mergeplot() only). an object returned by a previous call to mergeprepare(). This is used to construct labels for plotting} The following options are passed to the plotting routines: \item{xlab}{ the x-axis label } \item{ylab}{ the y-axis label } \item{main}{ the titke of the plot } \item{t}{ the type of plot - either 'p', 'l', 's' or 'S', with the same meanings as in plot() } \item{pch}{ the plotting character code, with the same meaning as in plot() } \item{...}{ arguments passed down to R plotting functions} } \value{ A plot to the current graphics device is produced. For happyplot, if fit$permdata is not NULL (i.e. hfit() was run using the permutation test option) then the plot contains two step functions, labelled global.logp and point.logp The global.logp plot shows the empirical log-pvalue relative the whole region (ie adjusted for the number of markers) while point.logp shows the empirical log-pvalue for each interval. If fit$permdata is NULL then the plots give the ANOVA logP values. If the model used in hfit() is 'additive' then the logP for the additive mocdel vs the null model is plotted; if the model is 'full' then the curves for the full, additive and partial F-test logP values are plotted. } \examples{ \dontrun{ h <- happy( 'HS.data', 'HS.alleles' ) } \dontrun{ fit <- hfit( h, h$markers, model='full' ) } \dontrun{ happyplot( fit ) } } \seealso{ hfit(), mfit(), mergefit() } \keyword{aplot} \author{Richard Mott} happy.hbrem/man/hcache.Rd0000744000261400006360000001461312254615146014540 0ustar00rmottmott\name{cache} \alias{save.genome} \alias{save.happy} \alias{load.genome} \alias{load.markers} \alias{the.chromosomes} \title{Save HAPPY design matrices and genotypes to disk for rapid reloading} \description{ \code{save.genome()} will persist the happy design matrices or genotypes from a series of happy objects to disk as a collection of R delayed data packages (as implemented in the package \code{g.data}). \code{load.genome()} "reloads" the data, although the matrices are not actually loaded into memory until used. \code{load.markers()} loads in a specific set of design matrices or genotypes, as defined by their marker names. These functions are very usefiul when access to a random selection of loci across the genome is required, and when it would be impossible for reasons of space to load many entire HAPPY objects into memory. \code{save.happy()} saves a single happy object as a delayed data package. \code{the.chromosomes()} is a conveniemce funtion that generates a character vector of chromosome names.} \usage{ save.genome( gdir, sdir, prefix, chrs=NULL, file.format="ped", mapfile=NULL,ancestryfile=NULL, generations=50, phase="unknown", haploid=FALSE, mc.cores=1 ) the.chromosomes(autosomes=19, use.X=FALSE) load.genome( sdir, use.X=TRUE, chrs = the.chromosomes(use.X=use.X), n.chr=NA, models=c("additive", "full", "genotype")) load.markers( genome, markers, model="additive", include.models=FALSE ) save.happy( h, pkg, dir, model="additive" ) } \arguments{ \item{gdir}{ Path to the directory containing the genotype (.alleles and either .data or .ped ) input files required to instantiate happy objects. This directory wil1 typically contain a pair of files for each chromosome of the genome of interest}. \item{sdir}{Path to the directory where the data will be saved by \code{save.genome}, and read back by \code{load.genome()}.} \item{prefix}{ Text fragment used to define the file names sought by \code{save.genome()}. An attempt is made to find files in \code{gdir} named like \code{chrN.prefix.*} where N is the chromosome number (1...20, X, Y), as defined in \code{chrs}.} \item{chrs}{ number of autosomes.} \item{n.chr}{ Alternative way of specifying the number of chromosomes. Must be an integer or NA.} \item{autosomes}{Sequence of autosomal chromosome identifiers} \item{use.X}{Logical to determine whether to use X-chromsome data, in load.genome().} \item{models}{ list of strings specifies the types of data to load. "additive" and "full" are design matrices corresponding to the additive and full models, "genotypes" are the raw genotypes.} \item{include.models}{ Boolean indicating the type of return object - if TRUE then an additional list specifiying if the model is additive, full or genotype is returned.} \item{file.format}{Defines the input genotype file format, either "ped" (Ped file format) or "happy" ( HAPPY .data file format).} \item{mapfile}{ Name of a text file containing the physicla (base pair) map for the genome. It contains three columns named "marker", "chromosome" and "bp". Every marker in the .alleles files should be listed in the file.} \item{generations}{ The number of generations since the HS was founded (see happy()).} \item{genome}{ An object returned by \code{load.genome()}.} \item{markers}{ A vector of marker names. These names will be searched for in the \code{genome} object, and if found, their corresponding data retrieved.} \item{haploid}{ A boolean variable indicating if the genomes should be interpreted as haploid, ie. homozygous at every locus. This option is used for the analysis of both truly haploid genomes and for recombinant inbred lines where all genotypes should be homozygotes. Note that the format of the genotype file (the .data file) is unchanged, but only the first allele of each genotype is used in the analysis.The default value for this option is FALSE, i.e. the genomes are assumued to be diploid and heterozygous. } \item{ancestryfile}{ An optional file name that is used to provide subject-specific ancestry information. More Soon...} \item{phase}{ If phase=="unknown" then the phase of the genotypes is unknown and no attempt is made to infer it. If phase="estimate" then it is estimated using parental genotype data when available. If phase="known" then it is assumed the phase of the input genotypes is correct i.e. the first and second alleles in each genotype for an individual are on the respectively the first and second chromosomes. Where phase is known this setting should increase power, but it will cause erroneous output if it is set when the data are unphased. If phase="estimate" then file.format="ped" is assumed automatically, because the input data file must be in ped-file format in order to specify parental information. } \item{h}{ A HAPPY object} \item{pkg}{The name of the R delayed data package to be created} \item{dir}{ Name of directory to create a delayed data package for a single happy object} \item{model}{ One of "additive", "full", "genotype"}. \item{mc.cores}{Split computation across this number of cores} } \value{ \code{save.genome()} returns NULL. \code{load.genome()} returns a list object which contains information about the delayed datapackages loaded, and how the markers are distributed between the packages. The list comprises two components, named "genome" and "subjects". The former is a datatable with columns "marker", "chromosome", "map", "ddp" which acts as a genome-wide lookup-table for each marker. The latter lists the subject names corresponding to the rows in the design matrices or genotypes. NOTE: The software assumes that all the chromosome-specific files used in \code{save.genome()} are consistent. i.e. the same subjects in the same order occur in each chromosome, and that a marker is only present once across the genome. \code{load.markers()} returns a list of data (either matrices or genotype vectors), each datum being named accoring to the relevant marker \code{the.chromosomes()} returns a character vector of chromosome names, like \code{ c( "chr1", "chr2" ..., "chrX", "chrY" )}. } \seealso{ happy(). Note that the function happy.save() differs from save.happy(), in that it saves a single happy object for reloading with \code{load()}; it does not use delayed data loading.} \examples{ } \keyword{models} \author{Richard Mott} happy.hbrem/man/hdesign.Rd0000644000261400006360000000502712254601044014734 0ustar00rmottmott\name{hdesign} \alias{hdesign} \alias{hprob} \alias{hgenotype} \title{Extract design matrix or genotypes for a specific marker interval from a happy object} \description{hdesign() will call C to extract the design matrix to fit a QTL to a marker interval. hprob() will call C to extract a raw probability matrix. hgenotype() will return the raw genotype data for a marker} \usage{ hdesign( h, marker, model='additive', mergematrix=NULL ) hprob( h, marker=NULL ) hgenotype ( h, marker, collapse=FALSE, sep="" ) } \arguments{ \item{h}{ an object returned by a previous call to happy()} \item{marker}{ either a character string giving the name of the marker or the index of the marker in the array h\$markers} \item{model}{either 'additive' (default) or 'full'. The additive design matrix returns an array with S columns, where S is the number of founder strains in the HS. The full design matrix returns a matrix with S(S-1)/2 columns, one for each combination of strains} \item{mergematrix}{an object returned by mergematrices, used to define sets of strains that are to be merged together. This is accomplished by adding the corresponding columns in the original design matrix.} \item{collapse}{ a boolean variable indicating whether to collapse the alleles into a single genotype.} \item{sep}{ the text to be used to separate the alleles if collapsed.} } \value{ hdesign() returns a design matrix \eqn{d_{ij}}{d[i,j]}, in which the \eqn{i}th row corresponds to the subject \eqn{i}, and the \eqn{j}th column to the corresponding strain or combination of strains or merged strains. hprob() returns a matrix \eqn{p_{ix}{p[i,x]}}, in which the \eqn{i} the row corresponds to the subject \eqn{i}, and the \eqn{x=s*S+t} th column contains the probability that the ancestral strains are \eqn{s,t} where \eqn{S} is the total number of strains. hgenotype() returns a \eqn{Nx2} matrix \eqn{g_{ix}{g[i,x]}} in which the \eqn{i} th row corresponds to the subject \eqn{i}, and column 1 contains the first allele and column 2 the second allele at the marker specified, or (if \code{collapse=TRUE}) a vector of genotypes with the alleles pasted together. } \seealso{ happy(), hfit() } \examples{ \dontrun{h <- happy( 'HS.data', 'HS.alleles', generations=200 )} \dontrun{d <- hdesign( h, 1 ) ## the first marker interval} \dontrun{d <- hdesign( h, 'D1MIT264') ## the marker interval with left-hand marker D1MIT264} \dontrun{d <- hdesign( h, 'D1MIT264', model='full' ) ## ditto with full design matrix} } \keyword{models} \author{Richard Mott} happy.hbrem/man/hfit.Rd0000644000261400006360000001706712254601044014254 0ustar00rmottmott\name{hfit} \alias{hfit} \alias{hfit.sequential} \alias{pfit} \alias{normalise} \title{Fit a model to an object returned by happy() } \description{ hfit() fits a QTL model to a happy() object, for a set of markers specified. The model can additive or full (ie allowing for dominance effects). The test is a partial F-test. In the case of the full model two tests are performed: the full against the null, and the full against the additive. hfit.sequential() performs an automated search for multiple QTL, fitting marker intervals in a sequential manner, and testing for a QTL conditional upon the presence of previously identified QTL. This is essentially forward selection of variables in multiple regression, and very similar to composite interval mapping. pfit() is a conveneince function to fit several univariate phenotypes to the same genotype data. normalise() is a convenience function to convert vector of phenotype data into a set of standard Gaussian deviates: the values are first ranked and then the ranks replaced by the corresponding percentiles in a standard Normal distribution. This may be used to help map traits that are strongly non-normal (or use the permute argument in hfit()). } \usage{ hfit( h, markers=NULL, model='additive', mergematrix=NULL, covariatematrix=NULL, verbose=FALSE, phenotype=NULL, family='gaussian', permute=0 ) hfit.sequential( h, threshold=2, markers=NULL, model='additive', mergematrix=NULL, covariatematrix=NULL, verbose=FALSE, family='gaussian') pfit( h, phen, markers=NULL, model='additive', mergematrix=NULL, covariatematrix=NULL, verbose=FALSE, family='gaussian' ) normalise(values) } \arguments{ \item{h}{ an object returned by a previous call to happy()} \item{markers}{ a vector of marker intervals to test. The markers can either be specified by name or by index. Default is NULL, in which case all the markers are fitted (same as setting markers=h$markers) } \item{model}{ specify the type of model to be fit. Either 'additive', where the contrinutions of each allele at the locus are assumed to act additively, or 'full', in which a term for every possible combination of alleles is included. The default 'additive' mimics the behaviour of the original C HAPPY software.} \item{mergematrix}{ specify a mergematrix object ( returned by mergematrices() ) which describes which founder strains are to be merged. This is used to test whether merging strains reduces statistical significance (see mergematrices())} \item{covariatematrix}{Optional additional matrix of covariates to include in the mode. These may be additional markers (terurned by hdesign) or covariates such as sex, age etc. } \item{verbose}{ control whether to print the results of the fits to the screen, or work silently (the default)} \item{threshold}{ the logP threshold used in hfit.sequential to decide whether to include a marker interval in the QTL model. The default is 2, ie a marker interval must have a partial F statistic with P-value <0.01 (=logP 2) to be included.} \item{family}{ the distribution of errors in the data. The default is 'gaussian'. This variable controls the type of model fitting. In the gauusian case a standard linear model is fitted using lm(). Otherwise the data are fitted as a generalised linear model using glm(), when the value of family must be one of the distributions hangled by glm(), such as 'binomial', 'gamma'. See family() for the full range of models.} \item{permute}{The number of permutations to perform. Default is 0, i.e. no permutation testing is done, and statistical significance is assessed by ANOVA (or Analysis of Deviance if family != 'gaussian') If permute>0 then statistical significance is assessed based on permuting the phenotypes between individuals, repeating the model fit, and finding the top-scoring marker interval. The emprical distribution of the max logP values is then used to assess statistical significance. This technique is useful for non-normally distributed phenotypes and for estimating region-wide significance levels Note that permutation testing is very slow.} \item{phen}{a data.frame containing additional phenotypes. Each column should be numeric (only used by pfit())}. \item{phenotype}{ An optional vector containing the phenotype values. Used to override the default phenotype in h$phenotype (hfit() only ). } \item{values}{a numeric vector of phenotype values to transform into normal deviates (normalise() only) } } \value{hfit() returns a list. The following components of the list are of interest: \item{table}{a table with the log-P values of the F statistics. The table contains rows, one per marker interval. The columns are the negative base-10 logarithms of the F-test P-values that there is no QTL in the marker interval. In the case of model='full', the partial F-test that the full model is no better than the additive is also given. In the special case of model='additive' and verbose=TRUE the effects of all estimable strains are compared with a T-test, taking into account the correlations between these estimates. However, it should be noted that estimates of individual strain effects may be hard to interpret when some combinations of strains are indistinguishable, and it is possible for the overall F-statistic to be very significant whilst none of the strains appear to be significant, based on their T-statistics. The F-statistic is a better indicator of the true overall fit of the model. } \item{permdata}{ a list containing the results of the permutation analysis, or NULL if permute=0. The list contains the following elements: \itemize{ \item{N}{ The number of permutations } \item{permutation.dist}{ A vector containing sorted ANOVA logP values from the N permutations. These values can be used to estimate the shape of the null distribution, and plotted e.g. using hist().} \item{permutation.pval}{A data table containing the permutation p-values for each marker interval. The columns in the datatable give the position in cM, the marker name (left-hand marker in the interval), the original ANOVA logp, the permutation pval for this logp, and the log permutation P-value. Bothe global (ie region-wide) and pointwise pvalues are given. The Global pvalue for a marker interval is the fraction of times that the logP for the interval (either additive or full, depending on the model specified) is exceeded by the maximum logP in all intervals for permuted data. The pointwise pvalue is the fraction of permutation logP at the marker interval that exceed the logP for that interval. } } } The object returned by hfit() is suitable for plotting with happyplot() pfit() returns a list of hfit() objects, the n'th being the fit for the n'th column (phenotype) in phen. } \examples{ ## An example session: # initialise happy \dontrun{h <- happy('Hs.data','HS.alleles')} # fit all the markers with an additive model \dontrun{f <- hfit(h)} # plot the results \dontrun{happyplot(f)} # fit a non-additive model \dontrun{ff <- hfit(h, model='full')} # view the results \dontrun{write.table(ff,quote=F)} # plot the results \dontrun{happyplot(ff)} # use noramlised trait values \dontrun{ff <- hfit(h,phenotype=normalise(h$phenotypes))} # permutation test with 1000 permutations \dontrun{ff <- hfit(h, model='full', permute=1000)} } \seealso{ happy{} } \keyword{models} \author{Richard Mott} happy.hbrem/man/mergelist.Rd0000644000261400006360000000206612254601044015306 0ustar00rmottmott\name{mergelist} \alias{mergelist} \title{Create an object descrbing how to merge strains together} \description{ mergelist() is a convenience function which creates a list object suitable for use with mergematrices() } \usage{ mergelist( strains, alleles ) } \arguments{ \item{strains}{ a character vector of strain names} \item{alleles}{a character matrix with one row of strain/allele combinations. There must be a named column in the matrix corresponding to every strain name in strains. The value of the element is the allele for that strain} } \value{ a list of lists of strains describing how the strains are grouped together. For instance \code{mergelist <- list( A=list('AJ', 'BALB', 'AKR'), T=list('RIII','I', 'DBA', 'C57', 'C3H') )} divides the strains into two groups corresponding to the alleles A, T (the allele names are not important). It is essential that the all strain names match all the values in strains. The object should be used as an input parameter to mergematrices() } \seealso{ mergematrices()} \author{Richard Mott} \keyword{models} happy.hbrem/man/mergematrices.Rd0000644000261400006360000000427612254601044016147 0ustar00rmottmott\name{mergematrices} \alias{mergedpositionmatrix} \alias{mergematrices} \title{Construct matrices used to merge together founder strains} \description{mergematrices() creates a list containing two matrices suitable for pre-multiplying with an additive or full happy marker design matrix, in order to produce matrices with certain columns combined. These reduced matrices are used to test whether the specified merge reduces the significance of the fit. This function is not usually called directly but is used by mergfit() and hfit() megedpositionmatrix() will return either the merged design matrix or the mergematrices object corresponding to an object returned by mergeprepare() } \usage{ mergematrices( strains, mergelist=NULL, verbose=FALSE ) mergedpositionmatrix( h, position, prepmerge, model='additive', verbose=FALSE, design=TRUE ) } \arguments{ \item{strains}{character array of strain names} \item{mergelist}{ a list of lists of strains describing how the strains are grouped together. For instance \code{mergelist <- list( A=list('AJ', 'BALB', 'AKR'), T=list('RIII','I', 'DBA', 'C57', 'C3H') )} divides the strains into two groups corresponding to the alleles A, T (the allele names are not important). It is essential that the all strain names match all the values in strains.} \item{verbose}{switch to determine whether to tell what is happening.} \item{h}{ an object returned by a previous call to happy() } \item{position}{ the coordinate of the polymorphism to be tested, ie an entry in prepmerge$testmarkerdata$POSITION} \item{prepmerge}{ an object returned by mergeprepare()} \item{model}{ the type of model to be fitted - 'additive' or 'full'} \item{design}{ switch to make mergepositionmatrix return the mergematrix object rather than the merged design matrix} } \seealso{happy(), mergefit(), hfit(), mergelist()} \value{ mergematrices() and mergepositionmatrix() return an object comprising a list with two elements: \itemize{ \item{amat}{ the matrix to apply to an additive-model design matrix} \item{imat}{ the matrix to apply to a full-model (interaction) design matrix} } } \keyword{models} \author{Richard Mott} happy.hbrem/man/mergeprepare.Rd0000644000261400006360000001340012254601044015763 0ustar00rmottmott\name{mergeprepare} \alias{mergeprepare} \alias{mergefit} \alias{fastmergefit} \alias{condmergefit} \title{ Perform tests to determine whether individual polymorphisms could have given rise to a QTL} \description{ mergeprepare() reads in datafiles descrbing the locations and strain distribution patterns of polymorphisms (SNPs or otherwise) which have not necessarily been genotyped. The following tasks are performed: \enumerate{ \item the polymorphism data are read in from testmarkerfile.For each polymorphism the corresponding sketon marker interval is determined, based on their coordinates. Only those polymorphisms lying inside a skeleton marker interval are retained. \item the coordinates (typically in bp rather than cM) of the genotyped markers are read in from markerposfile. Note that these coordinates are distinct from those in the cM map in h\$map used in happy(). Only those markers listed in markerposfile that are also in h\$markers are retained - the rest are discarded. The retained markers are referred to as 'skeleton' markers as they define a framework of genotype data that can us used to test the significance of other polymorphisms. } mergefit() tests each of the polymorphisms to see if it could be a QTL. It performs the following operations on each polymorphism: \enumerate{ \item The founder strains are merged together based on the strin distribution pattern for that polymorphism. \item The merged data are used to fit a QTL in the corresponding skeleton marker interval \item The unmerged data are used to fir a QTL in the corresponding skeleton marker interval. \item The fits of the merged and unmerged data are compared with a partial F-test. If the unmerged data are significant but the merged data are not then there is evidence to reject the polymorphism as being associated with the trait. } fastmergefit() is a convenience function which perfroms a complete analysis without making a prior call to happy(). condmergefit() performs a conditional analysis in which each variant is fitted conditional upon every other variant being included in turn. This is VERY SLOW. } \usage{ mergeprepare( h, markerposfile, testmarkerfile, verbose=FALSE ) mergefit( h, mergedata, model='additive', covariatematrix=NULL, verbose=FALSE ) fastmergefit( datafile, allelesfile, markerposfile, testmarkerfile, generations=200, model='additive', verbose=FALSE ) condmergefit( h, mergedata, model='additive', covariatematrix=NULL, verbose=FALSE ) } \arguments{ \item{h}{ an object returned by a previous call to happy()} \item{markerposfile}{ the name of a text file containing the names and locations of the genotyped markers. Contains two names columns 'marker' and 'POSITION'} \item{testmarkerfile}{ the name of a text file containign the names, positions and strain/allele distribution patterns for each polymorphism to be tested. Contains two columns 'marker' and 'POSITION' plus an additional named column for each of the strains listed in h\$strains - \emph{the column names and strain names must match exactly}. } \item{verbose}{switch to control the level of ouput sent to the screen} \item{mergedata}{ an object created by a previous call to mergeprepare() } \item{model}{ determine the type of model to be fitted - either 'additive' or 'full'. For the additive model it is assumed that the contribution to the phenotype from each chromosome is additive, ie if the founder strains at the locus being tested are \eqn{s,t} then the expected phenotype will be of the form \eqn{T_s+T_t}{T(s)+T(t)}. For the full model the expected phenotype will be of the form \eqn{T_{st}}{T(s,t)}. Analysis of variance is used to test for differences between the estimated effects \eqn{T_s, T_{st}}{T(s), T(s,t)}. The additive model is a submodel of the full, so for model='full' in addition a partial F-test is performed to test if the full model explains more variance than the additive. } \item{covariatematrix}{ an optional design matrix which can be used to include additional terms in the model, such as other markers (using the matrix returned by hdesign()) and/or other covariates such as sex, age etc } \item{datafile}{the name of a genotype datafile to be passed to happy()} \item{allelesfile}{the name of the corresponding alleles datafile to be passed to happy() } \item{generations}{ the number of generations to be passed to happy() } } \value{ mergeprepare() returns a list with the following named elements: \itemize{ \item{markerpos}{the positions of the markers} \item{interval}{an array. interval[m] contains the index of the genotyped marker interval in which the polymorhism p is located, or NULL if it is outside all genotyped intervals. } \item{markers}{} \item{testmarkerdata}{details about the polymorphisms to be tested} } mergefit() and fastmergefit() return an object, called say 'fit', suitable for plotting using mergeplot(). It contains a named element 'table' containing the log-P values as in hfit(), which can be printed using \code{write.table(fit$table)}. condmergefit() returns a table with columns "position", "interval", "sdp", "logPself", "logPmax", "logPmaxPosition" . } \examples{ ## An example session: # initialise happy \dontrun{h <- happy('Hs.data','HS.alleles')} # prepare the merge files \dontrun{prep <- mergeprepare('markers.positions','testmarkers.txt')} # run the merge fit \dontrun{fit <- mergefit( h, prep )} # alternative, and equivalent, use of fastmergefit(): \dontrun{fit <- fastmergefit( 'Hs.data','HS.alleles', 'markers.positions','testmarkers.txt' )} # plot the results \dontrun{mergeplot( fit, prep )} } \seealso{ happy(), mergeplot() } \keyword{models} \author{Richard Mott} happy.hbrem/src/0000755000261400006360000000000012254616416013045 5ustar00rmottmotthappy.hbrem/src/Rhappy.c0000644000261400006360000015275512254616417014474 0ustar00rmottmott#define _ISOC99_SOURCE #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include"happy.h" #include"readline.h" #include"cmp.h" #include"stats.h" static QTL_DATA *qtldata[100]; static int nqtldata = 0; int entrycmp( const void *a, const void *b ); SEXP happy( SEXP datafile, SEXP allelesfile, SEXP generations, SEXP phase, SEXP file_format, SEXP missing_code, SEXP do_dp, SEXP min_dist, SEXP haploid, SEXP ancestryfile ) { QTL_DATA *q = NULL; ALLELES *a = NULL; FILE *dfp=NULL, *afp=NULL, *anfp=NULL; const char *afilename=NULL, *anfilename=NULL; const char *dfilename=NULL; int gen = 0; double g; int verbose = 0; int use_parents = 0; int ped_format = 0; int phaseKnown = 0; SEXP strains,markers, chromosome, subjects, phenotypes, map, family, mother, father; SEXP ans = R_NilValue; SEXP names, handle, class; char *MissingCode; int i; const char *PhaseStr; const char *File_FormatStr; int Do_dp, Haploid; double MinDist = 1.0e-5; if ( ! isString(datafile) || length(datafile) != 1 ) error( "datafile is not a string"); dfilename = CHAR(STRING_ELT(datafile,0)); if ( ! (dfp = fopen( dfilename, "r" ) ) ) error( "could not open data file" ); if ( ! isString(allelesfile) || length(allelesfile) != 1 ) error( "allelesfile is not a string"); afilename = CHAR(STRING_ELT(allelesfile,0)); if ( ! (afp = fopen( afilename, "r" ) ) ) error( "could not open alleles file" ); if ( isString(ancestryfile) && length(ancestryfile) == 1 ) { anfilename = CHAR(STRING_ELT(ancestryfile,0)); if ( ! (anfp = fopen( anfilename, "r" ) ) ) error( "could not open ancestry file" ); } if ( ! isNumeric(generations) || length(generations) != 1 ) error( "generations is not numeric"); g = REAL(generations)[0]; gen = (int)g; if ( ! isString(phase) || length(phase) != 1 ) error( "phase is not a string"); PhaseStr = CHAR(STRING_ELT(phase,0)); if ( ! isString(file_format) || length(file_format) != 1 ) error( "file_format is not character(1)"); File_FormatStr = CHAR(STRING_ELT(file_format,0)); if ( ! isString(missing_code) || length(missing_code) != 1 ) error( "missing_code is not character(1)"); if ( strlen( CHAR(STRING_ELT(missing_code,0)) ) > 0 ) { MissingCode = (char*)CHAR(STRING_ELT(missing_code,0)); } else { MissingCode = strdup(ND_ALLELE); } if ( ! isNumeric(do_dp) || length(do_dp) != 1 ) error( "do_dp is not numeric(1)"); Do_dp = INTEGER(do_dp)[0]; if ( ! isNumeric(haploid) || length(haploid) != 1 ) error( "haploid is not numeric(1)"); Haploid = INTEGER(haploid)[0]; if ( ! isNumeric(min_dist) || length(min_dist) != 1 ) error( "min_dist is not numeric(1)"); else if ( isNumeric(min_dist) ) MinDist = (double)REAL(min_dist)[0]; Rprintf( "mindist: %g\n", MinDist ); Rprintf( "datafile %s allelesfile %s gen %d\n", dfilename, afilename, gen ); Rprintf( "genotype phase: %s\n", PhaseStr); if ( ! strcmp( File_FormatStr, "ped") ) ped_format = 1; else ped_format = 0; if ( ! strcmp( PhaseStr, "unknown" ) ) use_parents = 0; else if ( ! strcmp( PhaseStr, "estimate" ) ) { use_parents = 1; ped_format = 1; } else if ( ! strcmp( PhaseStr, "known" ) ) { use_parents = 0; phaseKnown = 1; } if ( use_parents ) Rprintf( "using parental genotypes to help determine phase\n"); a = input_allele_frequencies( afp, gen, MissingCode, MinDist, verbose ); q = read_qtl_data( dfp, (char*)dfilename, a, verbose, use_parents, ped_format, MissingCode ); q->an = read_subject_ancestries( anfp, (char*)anfilename, verbose ); q->phase_known = phaseKnown; q->haploid = Haploid; if ( Haploid ) Rprintf( "assuming haploid(inbred) genotypes\n"); if ( q->an ) check_and_apply_ancestry( q ); Rprintf( "datafile %s allelesfile %s gen %d\n", dfilename, afilename, gen ); if ( Do_dp ) { if ( q->haploid ) { /* heterozygosity(q ); */ create_haploid_summed_dp_matrices( q ); } else create_summed_dp_matrices( q ); } PROTECT(strains=allocVector(STRSXP,q->S)); for(i=0;iS;i++) { SET_STRING_ELT(strains,i, mkChar(a->strain_name[i])); } PROTECT(markers=allocVector(STRSXP,q->M)); for(i=0;iM;i++) { SET_STRING_ELT(markers,i, mkChar(a->af[i].marker_name)); } PROTECT(chromosome=allocVector(STRSXP,q->M)); for(i=0;iM;i++) { SET_STRING_ELT(chromosome,i, mkChar(a->af[i].chromosome)); } PROTECT(map=allocVector(REALSXP,q->M)); for(i=0;iM;i++) { REAL(map)[i] = a->af[i].position; } PROTECT(subjects=allocVector(STRSXP,q->N)); for(i=0;iN;i++) { SET_STRING_ELT(subjects,i, mkChar(q->name[i])); } PROTECT(family=allocVector(STRSXP,q->N)); for(i=0;iN;i++) { if ( q->family && q->family[i] ) SET_STRING_ELT(family,i, mkChar(q->family[i])); else SET_STRING_ELT(family,i, mkChar("")); } PROTECT(mother=allocVector(INTSXP,q->N)); if ( q->mother ) { for(i=0;iN;i++) { INTEGER(mother)[i] = q->mother[i]+1; } } else { for(i=0;iN;i++) { INTEGER(mother)[i] = -1; } } PROTECT(father=allocVector(INTSXP,q->N)); if ( q->father ){ for(i=0;iN;i++) { INTEGER(father)[i] = q->father[i]+1; } } else { for(i=0;iN;i++) { INTEGER(father)[i] = -1; } } PROTECT(phenotypes=allocVector(REALSXP,q->N)); for(i=0;iN;i++) { REAL(phenotypes)[i] = q->observed[i]; } PROTECT( ans = allocVector( VECSXP, 10 ) ); PROTECT(names=allocVector(STRSXP, 10 ) ); PROTECT(handle = allocVector(INTSXP,1)); qtldata[nqtldata] = q; INTEGER(handle)[0] = nqtldata++; /* SET_VECTOR_ELT(ans,0,strains); SET_VECTOR_ELT(names,0,mkChar("strains")); SET_VECTOR_ELT(ans,1,markers); SET_VECTOR_ELT(names,1,mkChar("markers")); SET_VECTOR_ELT(ans,2,map); SET_VECTOR_ELT(names,2,mkChar("map")); SET_VECTOR_ELT(ans,3,subjects); SET_VECTOR_ELT(names,3,mkChar("subjects")); SET_VECTOR_ELT(ans,4,phenotypes); SET_VECTOR_ELT(names,4,mkChar("phenotypes")); SET_VECTOR_ELT(ans,5,handle); SET_VECTOR_ELT(names,5,mkChar("handle")); SET_VECTOR_ELT(ans,6,chromosome); SET_VECTOR_ELT(names,6,mkChar("chromosome")); SET_VECTOR_ELT(ans,7,family); SET_VECTOR_ELT(names,7,mkChar("family")); */ SET_VECTOR_ELT(ans,0,strains); SET_STRING_ELT(names,0,mkChar("strains")); SET_VECTOR_ELT(ans,1,markers); SET_STRING_ELT(names,1,mkChar("markers")); SET_VECTOR_ELT(ans,2,map); SET_STRING_ELT(names,2,mkChar("map")); SET_VECTOR_ELT(ans,3,subjects); SET_STRING_ELT(names,3,mkChar("subjects")); SET_VECTOR_ELT(ans,4,phenotypes); SET_STRING_ELT(names,4,mkChar("phenotypes")); SET_VECTOR_ELT(ans,5,handle); SET_STRING_ELT(names,5,mkChar("handle")); SET_VECTOR_ELT(ans,6,chromosome); SET_STRING_ELT(names,6,mkChar("chromosome")); SET_VECTOR_ELT(ans,7,family); SET_STRING_ELT(names,7,mkChar("family")); SET_VECTOR_ELT(ans,8,mother); SET_STRING_ELT(names,8,mkChar("mother")); SET_VECTOR_ELT(ans,9,father); SET_STRING_ELT(names,9,mkChar("father")); setAttrib( ans, R_NamesSymbol, names ); UNPROTECT(12); /* set the class */ PROTECT(class = allocVector(STRSXP, 1)); SET_STRING_ELT(class, 0, mkChar("happy")); classgets(ans, class); UNPROTECT(1); return ans; } SEXP getListElement(SEXP list, char *str) { SEXP elmt = R_NilValue, names = getAttrib(list, R_NamesSymbol); int i; for (i = 0; i < length(list); i++) if(strcmp(CHAR(STRING_ELT(names, i)), str) == 0) { elmt = VECTOR_ELT(list, i); break; } return elmt; } /* copy a QTL_FIT structure */ void qtl_fit_cp( QTL_FIT *fit1, QTL_FIT *fit2, int N, int S ) { int i; fit1->locus = fit2->locus; fit1->rss = fit2->rss; fit1->fss = fit2->fss; fit1->F = fit2->F; fit1->pval = fit2->pval; fit1->mean = fit2->mean; fit1->sigma = fit2->sigma; fit1->left = fit2->left; fit1->right = fit2->right; for(i=0;itrait1[i] = fit2->trait1[i]; fit1->trait2[i] = fit2->trait2[i]; } for(i=0;itrait[i] = fit2->trait[i]; fit1->trait_error[i] = fit2->trait_error[i]; } } QTL_FIT *allocate_qtl_fit( QTL_FIT *fit, int N, int strains ) { if ( fit == NULL ) fit = (QTL_FIT*)calloc(1,sizeof(QTL_FIT)); fit->trait = (double*)calloc(strains,sizeof(double)); fit->trait_error = (double*)calloc(strains,sizeof(double)); fit->trait1 = (int*)calloc(N,sizeof(int)); fit->trait2 = (int*)calloc(N,sizeof(int)); return fit; } QTL_DATA *read_qtl_data( FILE *fp, char *name, ALLELES *a, int verbose, int use_parents, int ped_format, char *missingCode ) { QTL_DATA *q = (QTL_DATA*)calloc(1,sizeof(QTL_DATA)); int max_N = 10000; int m; int bufsize = 100+a->markers*20; char *buffer = (char*)calloc(bufsize,sizeof(char)); char **mother = NULL; char **father= NULL; double NaN = nan("char-sequence"); int *pcount; int nparents=0; q->alleles = a; q->filename = (char*)strdup(name); q->N = 0; q->M = a->markers; q->S = a->strains; q->observed = (double*)calloc(max_N,sizeof(double)); q->genos = (CHROM_PAIR*)calloc(max_N,sizeof(CHROM_PAIR)); q->name = (char**)calloc(max_N,sizeof(char*)); q->family = (char**)calloc(max_N,sizeof(char*)); q->use_parents = use_parents; q->missingCode = (char*)strdup(missingCode); if ( use_parents || ped_format ) { q->sex = (int*)calloc(max_N, sizeof(int)); mother = (char**)calloc(max_N, sizeof(char*)); father = (char**)calloc(max_N, sizeof(char*)); Rprintf( "Reading phenotype and genotype data from ped file %s\n", name ); } else Rprintf( "Reading phenotype and genotype data from data file %s\n", name ); while ( skip_comments( fp, buffer ) != EOF ) { char *str1, *str2; m = 0; int ok = 0; if ( q->N >= max_N ) { max_N *= 2; q->observed = (double*)realloc(q->observed,max_N*sizeof(double)); q->genos = (CHROM_PAIR*)realloc(q->genos,max_N*sizeof(CHROM_PAIR)); q->name = (char**)realloc(q->name,max_N*sizeof(char*)); q->family = (char**)realloc(q->family,max_N*sizeof(char*)); if ( use_parents ) { q->sex = (int*)realloc(q->sex, max_N*sizeof(int)); mother = (char**)realloc(mother, max_N*sizeof(char*)); father = (char**)realloc(father, max_N*sizeof(char*)); } } q->genos[q->N].markers = q->M; q->genos[q->N].chrom1 = (int*)calloc(q->M,sizeof(int)); q->genos[q->N].chrom2 = (int*)calloc(q->M,sizeof(int)); if ( use_parents || ped_format ) { char *family = strtok( buffer, " " ); char *id = strtok( NULL, " " ); char *dad = strtok( NULL, " " ); char *mum = strtok( NULL, " " ); char *sex = strtok( NULL, " " ); char *pheno = strtok( NULL, " " ); if ( family && id && dad && mum && sex && pheno ) { char *endptr; q->family[q->N] = (char*)strdup(family); father[q->N] = (char*)strdup(dad); mother[q->N] = (char*)strdup(mum); q->sex[q->N] = atoi(sex); q->name[q->N] = (char*)strdup(id); /* Rprintf( "name %s\n", q->name[q->N]); */ q->observed[q->N] = strtod(pheno,&endptr); if (endptr == pheno) q->observed[q->N] = NaN; ok = 1; } } else { char *id = strtok( buffer, " " ); char *pheno = strtok( NULL, " " ); char *endptr; q->name[q->N] = (char*)strdup(id); q->observed[q->N] = strtod(pheno,&endptr); if (endptr == pheno) q->observed[q->N] = NaN; ok = 1; } if ( ok ) { if ( verbose >=2 ) Rprintf("individual %s %.5f\n", q->name[q->N], q->observed[q->N] ); while( (str1 = strtok( NULL, " " ) ) && (str2 = strtok( NULL, " " ) ) ) { if ( m >= q->M ) { Rprintf( "ERROR: too many markers on line %d\n", q->N ); error("fatal HAPPY error"); } if ( ! legal_string( str1, a->af[m].allele_name, a->af[m].alleles, &q->genos[q->N].chrom1[m] ) ) { int k; Rprintf( "ERROR: subject %s unknown allele1 %s for marker %d %s - legal values are ", q->name[q->N], str1, m, a->af[m].marker_name ); for(k=0;kaf[m].alleles;k++) Rprintf( " %s", a->af[m].allele_name[k] ); Rprintf( "\n"); if ( ! legal_string( missingCode, a->af[m].allele_name, a->af[m].alleles, &q->genos[q->N].chrom1[m] ) ) { Rprintf( "ERROR: subject %s unknown allele1 %s for marker %d %s - legal values are", q->name[q->N], missingCode, m, a->af[m].marker_name ); error("fatal HAPPY error"); } } if ( strcmp( str1, a->af[m].allele_name[q->genos[q->N].chrom1[m]] ) ) { Rprintf( "ERROR subject %s decoding allele %s %s\n", q->name[q->N], str1, a->af[m].allele_name[q->genos[q->N].chrom1[m]] ); } if ( ! legal_string( str2, a->af[m].allele_name, a->af[m].alleles, &q->genos[q->N].chrom2[m] ) ) { int k; Rprintf( "ERROR: subject %s unknown allele2 %s for marker %d %s - legal values are", q->name[q->N], str2, m, a->af[m].marker_name ); for(k=0;kaf[m].alleles;k++) Rprintf( " %s", a->af[m].allele_name[k] ); Rprintf( "\n"); if ( ! legal_string( missingCode, a->af[m].allele_name, a->af[m].alleles, &q->genos[q->N].chrom2[m] ) ) { Rprintf( "ERROR:subject %s unknown allele2 %s for marker %d %s - legal values are", q->name[q->N], missingCode, m, a->af[m].marker_name ); error("fatal HAPPY error"); } } if ( strcmp( str2, a->af[m].allele_name[q->genos[q->N].chrom2[m]] )) { Rprintf( "ERROR subject %s decoding allele %s %s\n", q->name[q->N], str2, a->af[m].allele_name[q->genos[q->N].chrom2[m]] ); } /*Rprintf( "%d: %s %s\n", m, str1, str2 ); */ a->af[m].allele_freq[q->genos[q->N].chrom1[m]]++; a->af[m].allele_freq[q->genos[q->N].chrom2[m]]++; m++; } } if ( m < q->M ) { Rprintf( "ERROR subject %s does not have enough alleles %d (%d) on line %d\n", q->name[q->N], m, q->M, q->N ); error("fatal HAPPY error"); } q->N++; } if ( verbose>=2 ) { for(m=0;mM;m++) { int al; ALLELE_FREQ *af = &a->af[m]; Rprintf( "marker %s %.3f\n", af->marker_name, af->position ); for(al=0;alalleles;al++) Rprintf( "%10s %5.0f\n", af->allele_name[al], af->allele_freq[al] ); } } for(m=0;mM;m++) { ALLELE_FREQ *af = &a->af[m]; int al; for(al=0;alalleles;al++) af->allele_freq[al] /= (2*q->N); } for(m=0;mM;m++) { ALLELE_FREQ *af = &a->af[m]; int al, s; for(s=0;sS;s++) { double p = 0.0; for(al=0;alalleles;al++) p += af->pr_AtoS[al][s]*af->allele_freq[al]; if ( p > 1.0e-7 ) af->entropy -= p*log(p); } /* printf( "marker %d %s entropy %e\n", m+1, af->marker_name, af->entropy ); */ } q->fit = (QTL_FIT*)calloc(q->M,sizeof(QTL_FIT)); for(m=0;mM;m++) allocate_qtl_fit( &q->fit[m], q->N, q->S ); Rprintf( "Number of individuals: %-5d\n", q->N ); Rprintf( "Number of markers: %-5d\n", q->M ); Rprintf( "Number of strains: %-5d\n", q->S ); Rprintf( "Use Parents: %s\n", q->use_parents ? "yes" : "no" ); #ifdef _USE_HASH_ Rprintf( "status %d\n", use_parents || ped_format ); if ( use_parents || ped_format ) { int i, nparents=0; PARENT_KEY e; PARENT_KEY *f; int both = 0; q->mother = (int*)calloc(q->N, sizeof(int)); q->father = (int*)calloc(q->N, sizeof(int)); pcount = (int*)calloc(q->N, sizeof(int)); if ( hcreate(q->N) == 0 ) error("Could not create hash table"); for(i=0;iN;i++) { e.key = q->name[i]; e.data = (void*)(long)i; hsearch( e, ENTER ); } for(i=0;iN;i++) { e.key = mother[i]; f = hsearch( e, FIND ); if ( f != NULL ) q->mother[i] = (int)((long)(f->data)); else q->mother[i] = -1; e.key = father[i]; f = hsearch( e, FIND ); if ( f != NULL ) q->father[i] = (int)((long)(f->data)); else q->father[i] = -1; if ( q->mother[i] > -1 && q->father[i] > -1 ) { both++; pcount[q->mother[i]]++; pcount[q->father[i]]++; } } for( i=0;iN;i++) nparents += pcount[i]>0 hdestroy(); free(mother); free(father); free(pcount); Rprintf( "Number of subjects with two parents: %-5d\n", both ); Rprintf( "Number of parents in nuclear families: %-5d\n", nparents ); } #else if ( use_parents || ped_format ) { int i; int both = 0; PARENT_KEY *sorted = (PARENT_KEY*)calloc(q->N, sizeof(PARENT_KEY)); PARENT_KEY *m, *f; q->mother = (int*)calloc(q->N, sizeof(int)); q->father = (int*)calloc(q->N, sizeof(int)); pcount = (int*)calloc(q->N, sizeof(int)); for(i=0;iN;i++) { sorted[i].key = q->name[i]; sorted[i].id = i; } qsort( sorted, q->N, sizeof(PARENT_KEY), entrycmp ); for(i=0;iN;i++) { PARENT_KEY mum, dad; mum.key = mother[i]; dad.key = father[i]; m = (PARENT_KEY*)bsearch( &mum, sorted, q->N, sizeof(PARENT_KEY), entrycmp ); f = (PARENT_KEY*)bsearch( &dad, sorted, q->N, sizeof(PARENT_KEY), entrycmp ); if ( m != NULL ) q->mother[i] = m->id; else q->mother[i] = -1; if ( f != NULL ) q->father[i] = f->id; else q->father[i] = -1; if ( q->mother[i] > -1 && q->father[i] > -1 ) { both++; pcount[q->mother[i]]++; pcount[q->father[i]]++; } } for( i=0;iN;i++) nparents += pcount[i]>0; free(mother); free(father); free(sorted); free(pcount); Rprintf( "Number of subjects with two parents: %-5d\n", both ); Rprintf( "Number of parents in nuclear families: %-5d\n", nparents ); } #endif fit_null_qtl_model( q ); free(buffer); return q; } double fit_null_qtl_model( QTL_DATA *q ) { int i, k; QTL_FIT *fit = q->null_model = (QTL_FIT*)calloc(1,sizeof(QTL_FIT)); allocate_qtl_fit( fit, q->N, q->S ); fit->mean = 0.0; for(i=0;iN;i++) { fit->mean += q->observed[i]; fit->sigma += q->observed[i]*q->observed[i]; } fit->mean /= q->N; fit->sigma = ( fit->sigma - q->N*fit->mean*fit->mean )/(q->N-1); fit->rss = 0.0; for(i=0;iN;i++) { double residual = q->observed[i] - fit->mean; fit->rss += residual*residual; } for(k=0;kalleles->strains;k++) fit->trait[k] = fit->trait_error[k] = 0.0; Rprintf("null model mean %e var %e\n", fit->mean, fit->sigma ); return fit->sigma; } ANCESTRY *read_subject_ancestries( FILE *fp, char *filename, int verbose ) { if ( fp ) { int subjects = 0; int strains = 0; char line[256]; int line_no = 0; int i; // int imax = 10000; Rprintf( "Reading subject ancestries from %s\n", filename ); skip_comments(fp, line); line_no++; if ( sscanf( line, "subjects %d strains %d", &subjects, &strains ) == 2 ) { Rprintf( "subjects %d strains %d", subjects, strains ); ANCESTRY *an = (ANCESTRY*)calloc(1, sizeof(ANCESTRY)); an->N = subjects; an->S = strains; skip_comments( fp, line ); line_no++; if ( 0 == strncmp( line, "strain_names", strlen("strain_names") ) ) { int k; char *str=strtok(line," "); an->strain_name = (char**)calloc(strains,sizeof(char*)); for(k=0;kstrain_name[k] = (char*)strdup(str); } else { Rprintf( "ERROR not enough strain names %d/%d\n", k, strains ); error("fatal HAPPY error"); } } } an->subject_name = (char**)calloc(subjects, sizeof(char*)); an->prob = (double**)calloc(subjects,sizeof(double*)); for(i=0;isubject_name[i] = (char*)strdup(str); an->prob[i] = (double*)calloc(strains, sizeof(double)); for(k=0;kprob[i][k] = x; total += x; } else { Rprintf( "ERROR not a probability \"%s\" (token %d) in ancestry file line %d\n", str, k, line_no ); error( "fatal HAPPY error"); } } for(k=0;kprob[i][k] /= total; } return(an); } } return(NULL); } void heterozygosity( QTL_DATA *q ) { int id, m; ALLELES *A = q->alleles; for( id=0;idN;id++) { double het = subject_heterozygosity(q, id); if ( het > 0.0 ) { Rprintf( "subject %20.20s heterozygosity %.4f\n", q->name[id], het ); } } for( m=0;mM;m++) { double het = marker_heterozygosity(q, m); if ( het > 0.0 ) { Rprintf( "marker %20.20s heterozygosity %.4f\n", A->af[m].marker_name, het ); } } } double subject_heterozygosity( QTL_DATA *q, int individual ) { double het = 0.0; int m; for(m=0;mM;m++) { het += q->genos[individual].chrom1[m]!=q->genos[individual].chrom2[m]; } het /= q->M; return(het); } double marker_heterozygosity( QTL_DATA *q, int marker ) { double het = 0.0; int i; for(i=0;iN;i++) { het += q->genos[i].chrom1[marker]!=q->genos[i].chrom2[marker]; } het /= q->N; return(het); } int check_and_apply_ancestry(QTL_DATA *q ) { ANCESTRY *an = q->an; if ( an ) { ALLELES *A = q->alleles; int i; if ( an->S != q->S ) { Rprintf( "ERROR number of strains in ancestry file %d unequal to number of strains in alleles file %d\n", an->S, q->S ); error( "fatal HAPPY error"); } else { int i; int bad = 0; for ( i=0; iS; i++) { if ( strcmp( an->strain_name[i], A->strain_name[i] ) != 0) { bad ++; Rprintf( "ERROR strain at position %d name %s in ancestry differs from %s in alleles\n", i+1, an->strain_name[i], A->strain_name[i] ); } if ( bad > 0 ) error( "fatal HAPPY error"); } Rprintf( "Checked consistency of strain names between ancestry and alleles: OK\n" ); } if ( an->N != q->N ) { Rprintf( "ERROR number of subjects in ancestry file %d unequal to number of subjects in alleles file %d\n", an->N, q->N ); error( "fatal HAPPY error"); } else { int i; int bad = 0; for ( i=0; iN; i++) { if ( strcmp( an->subject_name[i], q->name[i] ) != 0) { bad ++; Rprintf( "ERROR subject at position %d name %s in ancestry differs from %s in data\n", i+1, an->subject_name[i], q->name[i] ); } if ( bad > 0 ) error( "fatal HAPPY error"); } Rprintf( "Checked consistency of subject names between ancestry and data: OK\n" ); } an->pr_AtoS = (double****)calloc(an->N, sizeof(double***)); for ( i=0; iN; i++) { int m; an->pr_AtoS[i] = (double***)calloc(q->M, sizeof(double**)); for(m=0;mM;m++) { ALLELE_FREQ *af = &(A->af[m]); double **pr_AtoS = af->pr_AtoS; int s; int a; an->pr_AtoS[i][m] = (double**)calloc(af->alleles,sizeof(double*)); for(a=0;aalleles;a++) { double total = 1.0e-10; an->pr_AtoS[i][m][a] = (double*)calloc(q->S,sizeof(double)); for(s=0;sS;s++) { total += an->prob[i][s]*pr_AtoS[a][s]; } for(s=0;sS;s++) { an->pr_AtoS[i][m][a][s] = an->prob[i][s]*pr_AtoS[a][s] / total; } } } } return(1); } return(0); } ALLELES *input_allele_frequencies( FILE *fp, int generations, char *missingCode, double MinDist, int verbose ) { /* Example format: [ all probabilities are expressed as Pr(strain|allele) ] [rows are alleles, columns are strains] markers 5 strains 8 strain_name-1 strain-name2 ... strain-name8 marker m1 4 81.3 allele 1 0.5 0.0 0.5 0.0 0.0 0.0 0.0 0.0 allele 2 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 allele 3 0.0 0.0 0.0 0.33 0.33 0.0 0.0 0.33 allele 4 0.0 0.0 0.0 0.0 0.0 0.5 0.5 0.0 marker m2 4 85.5 allele 1 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 allele 2 0.25 0.25 0.0 0.0 0.25 0.25 0.0 0.0 allele 3 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 allele 4 0.0 0.0 0.0 0.0 0.0 0.0 0.5 0.5 marker m3 5 86.0 allele 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 allele 2 0.0 0.0 0.5 0.0 0.5 0.0 0.0 0.0 allele 3 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 allele 4 0.0 0.0 0.0 0.0 0.0 0.0 0.5 0.5 allele 5 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 marker m4 2 87.5 allele 1 0.2 0.2 0.2 0.0 0.0 0.0 0.2 0.2 allele 2 0.0 0.0 0.0 0.33 0.33 0.33 0.0 0.0 marker m5 4 87.8 allele 1 0.0 0.0 0.0 0.5 0.0 0.0 0.0 0.5 allele 2 0.0 0.0 0.0 0.0 0.5 0.5 0.0 0.0 allele 3 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 allele 4 0.5 0.0 0.5 0.0 0.0 0.0 0.5 0.0 Note the marker location in centimorgans can be omitted, in which case it defaults to 1.0 cM */ char line[10000]; char marker_name[256]; char allele_name[256]; int markers=-1; int strains=-1; int line_no = 0; int m, s, a; ALLELES *A=NULL; skip_comments( fp, line ); line_no++; if ( sscanf( line, "markers %d strains %d", &markers, &strains ) == 2 ) { A = (ALLELES*)calloc(1,sizeof(ALLELES)); A->markers = markers; A->strains = strains; A->generations = generations; A->af = (ALLELE_FREQ*)calloc(markers,sizeof(ALLELE_FREQ)); skip_comments( fp, line ); line_no++; if ( 0 == strncmp( line, "strain_names", strlen("strain_names") ) ) { int k; char *str=strtok(line," "); A->strain_name = (char**)calloc(strains,sizeof(char*)); for(k=0;kstrain_name[k] = (char*)strdup(str); } else { Rprintf( "ERROR not enough strain names %d/%d\n", k, strains ); error("fatal HAPPY error"); } } } for(m=0;maf[m]); double total = 0.0; skip_comments( fp, line ); line_no++; strcpy(af->chromosome,"unknown"); if ( (sscanf( line, "marker %s %d %s %lf", marker_name, &(af->alleles), af->chromosome, &(af->position) ) == 4 ) || ( sscanf( line, "marker %s %d %lf", marker_name, &(af->alleles), &(af->position) ) == 3 )) { af->marker_name = (char*)strdup( marker_name ); if ( verbose>=2 ) Rprintf("marker %d %s %.3f %d\n", m, af->marker_name, af->position, af->alleles); af->pr_AtoS = (double**)calloc(af->alleles,sizeof(double*)); for(s=0;salleles;s++) af->pr_AtoS[s] = (double*)calloc(strains,sizeof(double)); af->allele_name = (char**)calloc(af->alleles,sizeof(char*)); af->which_allele = (int*)calloc(strains,sizeof(int)); af->allele_freq = (double*)calloc(af->alleles,sizeof(double)); /* printf( "marker %d %s\n", m+1, marker_name ); */ af->ND = -1; /* index of code for missing value */ for(a=0;aalleles;a++) { skip_comments( fp, line ); line_no++; if ( sscanf( line, "allele %s", allele_name ) == 1 ) { char *str = (char*)strtok( line, " " ); af->allele_name[a] = (char*)strdup(allele_name); str = (char*)strtok( NULL, " " ); if ( ! strcmp(allele_name, missingCode) ) af->ND = a; /* Rprintf( "missing code for %d %d\n", m, af->ND);*/ total = 0.0; for(s=0;swhich_allele[s] = a; total += af->pr_AtoS[a][s] = x; /* prob allele a induces strain s ie Pr(s|a) */ } else { Rprintf( "probability Parse ERROR, line %d token :%s:\n", line_no, str ); error("fatal HAPPY error"); } } for(s=0;spr_AtoS[a][s] /= total; } } else { Rprintf( "allele Parse ERROR, line %d %s\n", line_no, line ); error("fatal HAPPY error"); } } } else { Rprintf( "marker Parse ERROR, line %d %s\n", line_no, line ); error("fatal HAPPY error"); } } A->Pr_ss = (double*)calloc(markers,sizeof(double)); A->Pr_st = (double*)calloc(markers,sizeof(double)); for(m=1;maf[m].chromosome, A->af[m-1].chromosome)) { double d = (A->af[m].position - A->af[m-1].position)/100.0; double lambda = generations*d; /* Poisson parameter */ double q = exp(-lambda); double p = A->af[m].ProbSame = q + (1.0-q)/strains; A->Pr_ss[m-1] = p; A->Pr_st[m-1] = (1.0-p)/(strains-1.0); if ( verbose>=2 ) Rprintf("marker %d %s %.3f %.4f %.4f %.4f \n", m, A->af[m].marker_name, A->af[m].position, p, A->Pr_ss[m-1], A->Pr_st[m-1] ); } else { A->af[m].ProbSame = 1.0/strains; A->Pr_ss[m-1] = 1.0/strains; A->Pr_st[m-1] = 1.0/strains; } } /* compute the prior probabilities that, if a QTL lies between marker m-1 and m, the linkage to the neighbouring markers for the two chromosomes are X,Y where X and Y must take exactly one of the values BOTH, LEFT, UNLINKED, RIGHT */ A->MinDist = MinDist; for(m=0;maf[m]); int s, t; double total; double P[LINKAGE_STATES]; double d = (A->af[m+1].position - A->af[m].position)/100.0; double lambda, elambda, elambda2; if ( d < MinDist ) d = MinDist; lambda = generations*d; elambda = exp(-lambda); elambda2 = elambda*elambda; P[BOTH] = elambda; P[LEFT] = P[RIGHT] = (1-elambda)/lambda-elambda; P[UNLINKED] = 1.0-P[BOTH]-P[LEFT]-P[RIGHT]; af->prior = (double**)calloc(LINKAGE_STATES,sizeof(double*)); for(s=0;sprior[s] = (double*)calloc(LINKAGE_STATES,sizeof(double)); af->prior[BOTH][BOTH] = P[BOTH]*P[BOTH]; af->prior[BOTH][RIGHT] = af->prior[RIGHT][BOTH] = af->prior[BOTH][LEFT] = af->prior[LEFT][BOTH] = P[BOTH]*P[LEFT]; af->prior[BOTH][UNLINKED] = af->prior[UNLINKED][BOTH] = P[BOTH] - af->prior[BOTH][BOTH] - af->prior[BOTH][LEFT] - af->prior[BOTH][RIGHT]; af->prior[LEFT][LEFT] = af->prior[RIGHT][RIGHT] = 0.5*(1-elambda2)/lambda - 2*elambda*(1-elambda)/lambda + elambda2; af->prior[LEFT][RIGHT] = af->prior[RIGHT][LEFT] = P[BOTH]*(1-2*P[LEFT]-P[BOTH]); af->prior[LEFT][UNLINKED] = af->prior[RIGHT][UNLINKED] = af->prior[UNLINKED][LEFT] = af->prior[UNLINKED][RIGHT] = P[LEFT] - af->prior[BOTH][LEFT] - af->prior[LEFT][LEFT] - af->prior[RIGHT][LEFT]; af->prior[UNLINKED][UNLINKED] = P[UNLINKED] - af->prior[UNLINKED][BOTH] - af->prior[UNLINKED][LEFT] - af->prior[UNLINKED][RIGHT]; if ( verbose>=2 ) { Rprintf("priors for %d lambda %.5f d %.5f\n", m, lambda, d); for(s=BOTH;s<=RIGHT;s++) { total = 0.0; Rprintf("%2d ", s); for(t=BOTH;t<=RIGHT;t++) { Rprintf(" %.5f", af->prior[s][t] ); total += af->prior[s][t]; } Rprintf(" %.5f %.5f\n", total, P[s]); } } } } return A; } int KVcmp( const void *a, const void *b ) { const KV *A = (const KV*)a; const KV *B = (const KV*)b; double x = A->key-B->key; if ( x > 0.0 ) return 1; else if ( x < 0.0 ) return -1; else return 0.0; } /* compute the matrix of probabilities that, if the qtl is a distance c from the left end of the curent interval, the pair of linkage states for the two chromosomes takes the value s,t */ void pointwise_interval_mapping_probabilities( QTL_DATA *q, int locus, double c, double **prior ) { ALLELES *A = q->alleles; double d = (A->af[locus+1].position - A->af[locus].position)/100.0; double lambda = A->generations*d; double elambda = exp(-lambda); double elambdac = exp(-lambda*c); double elambdac1 = exp(-lambda*(1-c)); int s, t; double P[LINKAGE_STATES]; P[BOTH] = elambda; P[LEFT] = elambdac-elambda; P[RIGHT] = elambdac1 -elambda; P[UNLINKED] = 1.0-P[BOTH]-P[LEFT]-P[RIGHT]; for(s=0;sS,sizeof(double)); double *RightMargin = (double*)calloc(qtl->S,sizeof(double)); double s1 = 1.0/qtl->S; for(i=0;iN;i++) { int s, t; DP_MATRICES *dp = &(qtl->dp_matrices[i]); double **Left = dp->Left[m]; double **Right = dp->Right[m+1]; dp->NonRecomb[m]=0.0; for(s=0;sS;s++) { LeftMargin[s] = RightMargin[s] = 0.0; for(t=0;tS;t++) { LeftMargin[s] += Left[s][t]; RightMargin[s] += Right[s][t]; } } total = 0.0; for(s=0;sS;s++) { for(t=0;tS;t++) { total += qp[i][s][t].prior = Left[s][t]*Right[s][t]*P[BOTH][BOTH] + Left[s][t]*RightMargin[t]*P[LEFT][BOTH] + LeftMargin[t]*RightMargin[t]*P[UNLINKED][BOTH]*s1 + LeftMargin[t]*Right[s][t]*P[RIGHT][BOTH] + Left[s][t]*RightMargin[s]*P[BOTH][LEFT] + Left[s][t]*P[LEFT][LEFT] + LeftMargin[t]*P[UNLINKED][LEFT]*s1 + LeftMargin[t]*RightMargin[s]*P[RIGHT][LEFT] + LeftMargin[s]*RightMargin[s]*P[BOTH][UNLINKED]*s1 + LeftMargin[s]*P[LEFT][UNLINKED]*s1 + P[UNLINKED][UNLINKED]*s1*s1 + RightMargin[s]*P[RIGHT][UNLINKED]*s1 + LeftMargin[s]*Right[s][t]*P[BOTH][RIGHT] + LeftMargin[s]*RightMargin[t]*P[LEFT][RIGHT] + RightMargin[t]*P[UNLINKED][RIGHT]*s1 + Right[s][t]*P[RIGHT][RIGHT]; dp->NonRecomb[m] += 2*Left[s][t]*Right[s][t]*P[BOTH][BOTH] + Left[s][t]*RightMargin[t]*P[LEFT][BOTH] + LeftMargin[t]*RightMargin[t]*P[UNLINKED][BOTH]*s1 + LeftMargin[t]*Right[s][t]*P[RIGHT][BOTH] + Left[s][t]*RightMargin[s]*P[BOTH][LEFT] + LeftMargin[s]*RightMargin[s]*P[BOTH][UNLINKED]*s1 + LeftMargin[s]*Right[s][t]*P[BOTH][RIGHT]; /* Rprintf("%d %d %lf\n", i, m, dp->NonRecomb[m]);*/ } } /* printf( "new total %d %.8f\n", i, total ); */ for(s=0;sS;s++) for(t=0;tS;t++) { qp[i][s][t].prior /= total; /* if ( !(qp[i][s][t].prior < 1.0) ) qp[i][s][t].prior = s12; */ /* Rprintf( "%d %d %d %lf %d %d\n", i, s, t, qp[i][s][t].prior, qp[i][s][t].prior==NAN, */ } dp->NonRecomb[m] /= total; } free(LeftMargin); free(RightMargin); return qp; } QTL_PRIOR ***allocate_qtl_priors( QTL_DATA *qtl ) { QTL_PRIOR ***qp = (QTL_PRIOR***)calloc(qtl->N,sizeof(QTL_PRIOR**)); int i, k; for(i=0;iN;i++) { qp[i] = (QTL_PRIOR**)calloc(qtl->S,sizeof(QTL_PRIOR*)); for(k=0;kS;k++) qp[i][k] = (QTL_PRIOR*)calloc(qtl->S,sizeof(QTL_PRIOR)); } return qp; } /* create the forward and backward Dynamic Programming matrices for the sum of all possible haplotype reconstructions */ void create_summed_dp_matrices( QTL_DATA *q ) { int i; double *Pr_ss = q->alleles->Pr_ss; double *Pr_st = q->alleles->Pr_st; q->dp_matrices = (DP_MATRICES*)calloc(q->N,sizeof(DP_MATRICES)); for(i=0;iN;i++) { if ( i == 0 || genotype_difference( q, i, i-1 ) ) { q->dp_matrices[i].Left = summed_dp_matrix( q, i, Pr_ss, Pr_st, +1 ); q->dp_matrices[i].Right = summed_dp_matrix( q, i, Pr_ss, Pr_st, -1 ); q->dp_matrices[i].NonRecomb = (double*)calloc( q->M, sizeof(double)); } else { /* printf( "same genotype %d %d\n"); */ q->dp_matrices[i].Left = q->dp_matrices[i-1].Left; q->dp_matrices[i].Right = q->dp_matrices[i-1].Right; q->dp_matrices[i].NonRecomb = q->dp_matrices[i-1].NonRecomb; } } } double ***summed_dp_matrix( QTL_DATA *qtl, int individual, double *Pr_ss, double *Pr_st, int direction ) { double ***X; int start, stop, incr; CHROM_PAIR *genotypes = &(qtl->genos[individual]); ALLELES *A = qtl->alleles; int m, s, t; int offset; double total; int markers = genotypes->markers; int strains = A->strains; double **Pr_tr1; double **Pr_tr2; double root2 = sqrt(2.0); int mum = -1; int dad = -1; CHROM_PAIR *mgenotypes=NULL; CHROM_PAIR *fgenotypes=NULL; int use_parents = 0; /* double dp=0.0; double ndp=1.0e-10; */ if ( qtl->use_parents && (mum=qtl->mother[individual]) > -1 && (dad=qtl->father[individual]) > -1 ) { use_parents = 1; mgenotypes = &(qtl->genos[mum]); fgenotypes = &(qtl->genos[dad]); } /* printf( "dp_matrix %d strains %d markers %d\n", direction, strains, markers ); */ /* Rprintf( "use_parents %d %d individual %d mum %d dad %d\n", use_parents,qtl->use_parents, individual, mum, dad ); */ Pr_tr1 = (double**)calloc(strains,sizeof(double*)); for(s=0;s 0 ) { start = 0; stop = markers-1; incr = +1; offset = 0; } else { start = markers-1; stop = 0; incr = -1; offset = -1; } if ( qtl->phase_known ) { double **a; if ( qtl->an ) a = qtl->an->pr_AtoS[individual][start]; else a = A->af[start].pr_AtoS; for(s=0;schrom1[start]; int g2 = genotypes->chrom2[start]; X[start][s][t] = a[g1][s] * a[g2][t]; } } else { double phaseP = 1.0; int g1 = genotypes->chrom1[start]; int g2 = genotypes->chrom2[start]; double **a; if ( use_parents ) { int m1 = mgenotypes->chrom1[start]; int m2 = mgenotypes->chrom2[start]; int p1 = fgenotypes->chrom1[start]; int p2 = fgenotypes->chrom2[start]; phaseP = phaseProb( g1, g2, m1, m2, p1, p2, A->af[m].ND ); } if ( qtl->an ) a = qtl->an->pr_AtoS[individual][start]; else { ALLELE_FREQ *af = &(A->af[start]); a = af->pr_AtoS; } for(s=0;schrom1[m]; int g2 = genotypes->chrom2[m]; double pr_ss = Pr_ss[m+offset]; double pr_st = Pr_st[m+offset]; double **a; if ( qtl->an ) a = qtl->an->pr_AtoS[individual][m]; else a = A->af[m].pr_AtoS; int m1=0, m2=0, p1=0, p2=0; if ( use_parents ) { m1 = mgenotypes->chrom1[m]; m2 = mgenotypes->chrom2[m]; p1 = fgenotypes->chrom1[m]; p2 = fgenotypes->chrom2[m]; } for(s=0;sphase_known ) { norm1 *= root2; norm2 *= root2; } for(t=0;tphase_known ) { for(s=0;saf[m].ND ); } for(s=0;s=0 && i < q->N && j >= 0 && j < q-> N ) { int m; for(m=0;mM;m++) d += (q->genos[i].chrom1[m] != q->genos[j].chrom1[m]) + (q->genos[i].chrom2[m] != q->genos[j].chrom2[m]); } else { d = -1; } /* printf( "comparing %d %d diff %d\n", i, j, d); */ return d; } int marker_index( const char *name, QTL_DATA *q, const int isIntervalModel ) { ALLELE_FREQ *af = q->alleles->af; int i; int numMarkers = isIntervalModel ? q->M-1 : q->M; for( i=0; i= 0 && id < nqtldata ) { q = qtldata[id]; if ( q == NULL ) error( "no QTL data"); } else { error("attempt to extract locus using invalid handle %d" ,id); } /* * validate marker */ if ( isString(marker) ) { const char *string = CHAR(STRING_ELT(marker,0)); int i = marker_index( string, q, isIntervalModel ); if (NOT_FOUND == i) { error("could not find locus named %s", string); } *locus = i; } else if ( isInteger(marker) || isNumeric(marker) ) { int m = isInteger(marker) ? INTEGER(marker)[0] : (int)REAL(marker)[0]; int upper = (isIntervalModel) ? q->M-1 : q->M; m--; /* to start indexing at 0 */ if ( m >=0 && m < upper ) { *locus = m; } else { error("no such locus %d", m); } } else { error("locus must be specified as a number or a string"); } return q; } SEXP happyprobs ( SEXP handle, SEXP marker ) { int locus = -1; QTL_DATA *q = validateParams( handle, marker, &locus, 0 ); SEXP Matrix = R_NilValue; if ( locus >= 0 && q->dp_matrices != NULL) { ALLELE_FREQ *af = &(q->alleles->af[locus]); QTL_PRIOR ***p; int S2 = q->S*(q->S+1)/2; int i; /* get the prior probabilities of the strain state combinations at the flanking marker */ p = allocate_qtl_priors( q ); compute_qtl_priors( q, p, locus, af->prior ); PROTECT( Matrix = allocMatrix( REALSXP, q->N, S2) ); for(i=0; iN; i++) { int j, k,m=0; for(j=0;jS;j++) { for(k=0;kN*m] = 2*p[i][j][k].prior; } REAL(Matrix)[i+q->N*m] = p[i][j][j].prior; m++; } } UNPROTECT(1); for(i=0;iN;i++) { int s1; for(s1=0;s1S;s1++) free(p[i][s1]); free(p[i]); } free(p); } return Matrix; } SEXP happyprobs2 ( SEXP handle, SEXP marker, SEXP symmetrize ) { /* returns a matrix giving for each row the probability that the corresponding individual is descened form a pair of strains. happyprobs2 differs from happyprobs only in the row of probabilities return is complete - ie due to symmetry it contains the off-diagonal elements twice. this is useful if the probabilities have been computed using parental information to estimate the phase of the haplotypes. */ int locus = -1; QTL_DATA *q = validateParams( handle, marker, &locus, 1 ); SEXP Matrix = R_NilValue; int Symmetrize; if ( ! isNumeric(symmetrize) || length(symmetrize) != 1 ) error( "symmetrize is not numeric(1)"); Symmetrize = (int)REAL(symmetrize)[0]; if ( locus >= 0 && q->dp_matrices != NULL) { ALLELE_FREQ *af = &(q->alleles->af[locus]); QTL_PRIOR ***p; int i; /* get the prior probabilities of the strain state combinations at the flanking marker */ p = allocate_qtl_priors( q ); compute_qtl_priors( q, p, locus, af->prior ); if ( Symmetrize ) { int S2 = q->S*(q->S+1)/2; PROTECT( Matrix = allocMatrix( REALSXP, q->N, S2) ); for(i=0; iN; i++) { int j, k,m=0; for(j=0;jS;j++) { for(k=0;kN*m] = p[i][j][k].prior + p[i][k][j].prior; } REAL(Matrix)[i+q->N*m] = p[i][j][j].prior; m++; } } UNPROTECT(1); } else { int S2 = q->S*q->S; PROTECT( Matrix = allocMatrix( REALSXP, q->N, S2) ); for(i=0; iN; i++) { int j, k,m=0; for(j=0;jS;j++) { for(k=0;kS;k++,m++) { REAL(Matrix)[i+q->N*m] = p[i][j][k].prior; } } } UNPROTECT(1); } for(i=0;iN;i++) { int s1; for(s1=0;s1S;s1++) free(p[i][s1]); free(p[i]); } free(p); } return Matrix; } SEXP happynonrecomb ( SEXP handle, SEXP marker ) { int locus = -1; QTL_DATA *q = validateParams( handle, marker, &locus, 0 ); SEXP nonrecomb = R_NilValue; if ( locus >= 0 ) { ALLELE_FREQ *af = &(q->alleles->af[locus]); QTL_PRIOR ***p; int i; /* get the prior probabilities of the strain state combinations at the flanking marker */ p = allocate_qtl_priors( q ); compute_qtl_priors( q, p, locus, af->prior ); PROTECT( nonrecomb = allocVector( REALSXP, q->N ) ); for(i=0;iN;i++) { REAL(nonrecomb)[i] = q->dp_matrices[i].NonRecomb[locus]; } UNPROTECT(1); for(i=0;iN;i++) { int s1; for(s1=0;s1S;s1++) free(p[i][s1]); free(p[i]); } free(p); } return(nonrecomb); } SEXP happygenotype ( SEXP handle, SEXP marker ) { int locus = -1; QTL_DATA *q = validateParams( handle, marker, &locus, 0 ); SEXP Genotype = R_NilValue; if ( locus >= 0 ) { ALLELE_FREQ *af = &(q->alleles->af[locus]); int i; PROTECT( Genotype = allocMatrix( STRSXP, q->N, 2 ) ); for(i=0;iN;i++) { CHROM_PAIR cp =q->genos[i]; char *g1 = af->allele_name[cp.chrom1[locus]]; char *g2 = af->allele_name[cp.chrom2[locus]]; if ( !strcmp( g1, "NA" ) || ! strcmp( g2, "NA") ) { SET_STRING_ELT(Genotype,i, R_NaString); SET_STRING_ELT(Genotype,i+q->N, R_NaString); } else { SET_STRING_ELT(Genotype,i, mkChar(g1)); SET_STRING_ELT(Genotype, i+q->N, mkChar(g2)); } } UNPROTECT(1); } return Genotype; } SEXP happydesign( SEXP handle, SEXP marker, SEXP model ) { SEXP Design = R_NilValue; char *mod=NULL; int locus = -1; QTL_DATA *q = validateParams( handle, marker, &locus, 1 ); if ( isString(model) ) { mod = (char*)CHAR(STRING_ELT(model,0)); } if ( locus >= 0 && q->dp_matrices != NULL ) { ALLELE_FREQ *af = &(q->alleles->af[locus]); QTL_PRIOR ***p; int i; /* get the prior probabilities of the strain state combinations at the flanking marker */ p = allocate_qtl_priors( q ); compute_qtl_priors( q, p, locus, af->prior ); /* allocate and instantiate the design matrix */ if ( mod == NULL || ! strcmp( mod, "additive") ) { PROTECT( Design = allocMatrix( REALSXP, q->N, q->S ) ); for(i=0; iN; i++) { int j; for(j=0;jS;j++) { REAL(Design)[i+q->N*j] = 0.0; } } for(i=0; iN; i++) { int j, k; for(j=0;jS;j++) { for(k=0;kS;k++) { REAL(Design)[i+q->N*j] += p[i][j][k].prior; REAL(Design)[i+q->N*k] += p[i][j][k].prior; } } } UNPROTECT(1); } else if ( mod != NULL && ! strcmp ( mod, "full" ) ) { int dim = q->S*(q->S+1)/2; PROTECT( Design = allocMatrix( REALSXP, q->N, dim ) ); for(i=0; iN; i++) { int j, k, n=0; for(j=0;jS;j++) REAL(Design)[i+q->N*n++] = p[i][j][j].prior; for(j=0;jS;j++) for(k=0;kN*n] = p[i][j][k].prior + p[i][k][j].prior; } } UNPROTECT(1); } else if ( mod != NULL && ! strcmp ( mod, "full.asymmetric" ) ) { int dim = q->S*q->S; PROTECT( Design = allocMatrix( REALSXP, q->N, dim ) ); for(i=0; iN; i++) { int j, k, n=0; for(j=0;jS;j++) for(k=0;kN*n] = p[i][j][k].prior; } } UNPROTECT(1); } else { warning( "unknown model %s", mod ); } for(i=0;iN;i++) { int s1; for(s1=0;s1S;s1++) free(p[i][s1]); free(p[i]); } free(p); } else { /* warning("Error - locus index %d out of range\n", locus ); */ } return Design; } double phaseProb( int a1, int a2, int m1, int m2, int p1, int p2, int NA ) { double Q12, Q21; double T; if ( a1 == NA || a2 == NA || m1 == NA || m2 == NA || p1 == NA || p2 == NA ) return 0.5; Q12 = (a1==m1)*(a2==p1) + (a1==m2)*(a2==p1) + (a1==m1)*(a2==p2) + (a1==m2)*(a2==p2); Q21 = (a2==m1)*(a1==p1) + (a2==m2)*(a1==p1) + (a2==m1)*(a1==p2) + (a2==m2)*(a1==p2); T = Q12 + Q21; if ( T>0 ) { Q12 /= T; } else Q12 = 0.5; return Q12; } /* HAPLOID GENOMES */ SEXP haploid_happydesign( SEXP handle, SEXP marker ) { SEXP Design = R_NilValue; // char *mod=NULL; int locus = -1; QTL_DATA *q = validateParams( handle, marker, &locus, 1 ); if ( locus >= 0 && q->haploid_dp_matrices != NULL ) { QTL_PRIOR **p; int i; /* get the prior probabilities of the strain state combinations at the flanking marker */ p = allocate_haploid_qtl_priors( q ); compute_haploid_qtl_priors( q, p, locus ); /* allocate and instantiate the design matrix */ PROTECT( Design = allocMatrix( REALSXP, q->N, q->S ) ); for(i=0; iN; i++) { int j; for(j=0;jS;j++) { REAL(Design)[i+q->N*j] = 0.0; } } for(i=0; iN; i++) { int j; for(j=0;jS;j++) { REAL(Design)[i+q->N*j] = p[i][j].prior; } } UNPROTECT(1); for(i=0;iN;i++) { free(p[i]); } free(p); } return Design; } void create_haploid_summed_dp_matrices( QTL_DATA *q ) { int i; double *Pr_ss = q->alleles->Pr_ss; double *Pr_st = q->alleles->Pr_st; q->haploid_dp_matrices = (HAPLOID_DP_MATRICES*)calloc(q->N,sizeof(HAPLOID_DP_MATRICES)); for(i=0;iN;i++) { if ( i == 0 || genotype_difference( q, i, i-1 ) ) { q->haploid_dp_matrices[i].Left = haploid_summed_dp_matrix( q, i, Pr_ss, Pr_st, +1 ); q->haploid_dp_matrices[i].Right = haploid_summed_dp_matrix( q, i, Pr_ss, Pr_st, -1 ); q->haploid_dp_matrices[i].NonRecomb = (double*)calloc( q->M, sizeof(double)); } else { /* printf( "same genotype %d %d\n"); */ q->haploid_dp_matrices[i].Left = q->haploid_dp_matrices[i-1].Left; q->haploid_dp_matrices[i].Right = q->haploid_dp_matrices[i-1].Right; q->haploid_dp_matrices[i].NonRecomb = q->haploid_dp_matrices[i-1].NonRecomb; } } } double **haploid_summed_dp_matrix( QTL_DATA *qtl, int individual, double *Pr_ss, double *Pr_st, int direction ) { double **X, **a; int start, stop, incr; CHROM_PAIR *genotypes = &(qtl->genos[individual]); ALLELES *A = qtl->alleles; int m, s, t; int offset; double total; int markers = genotypes->markers; int strains = A->strains; double **Pr_tr1; Pr_tr1 = (double**)calloc(strains,sizeof(double*)); for(s=0;s 0 ) { start = 0; stop = markers-1; incr = +1; offset = 0; } else { start = markers-1; stop = 0; incr = -1; offset = -1; } if ( qtl->an ) a = qtl->an->pr_AtoS[individual][start]; else a = A->af[start].pr_AtoS; for(s=0;schrom1[start]; X[start][s] = a[g1][s]; } for(m=start+incr;m!=stop;m+=incr) { int g1 = genotypes->chrom1[m]; double pr_ss = Pr_ss[m+offset]; double pr_st = Pr_st[m+offset]; double **a; if ( qtl->an ) a = qtl->an->pr_AtoS[individual][m]; else a = A->af[m].pr_AtoS; for(s=0;salleles; double d = (A->af[locus+1].position - A->af[locus].position)/100.0; double lambda, elambda; double MinDist = qtl->alleles->MinDist; if ( d < MinDist ) d = MinDist; lambda = A->generations*d; elambda = exp(-lambda); P[BOTH] = elambda; P[LEFT] = P[RIGHT] = (1-elambda)/lambda-elambda; P[UNLINKED] = 1.0-P[BOTH]-P[LEFT]-P[RIGHT]; for(i=0;iN;i++) { int s; HAPLOID_DP_MATRICES *dp = &(qtl->haploid_dp_matrices[i]); double *Left = dp->Left[m]; double *Right = dp->Right[m+1]; dp->NonRecomb[m]=0.0; total = 0.0; for(s=0;sS;s++) { total += qp[i][s].prior = Left[s]*Right[s]*P[BOTH] + Left[s]* P[LEFT] + Right[s] * P[RIGHT] + P[UNLINKED]; } for(s=0;sS;s++) qp[i][s].prior /= total; dp->NonRecomb[m] /= total; } return qp; } QTL_PRIOR **allocate_haploid_qtl_priors( QTL_DATA *qtl ) { QTL_PRIOR **qp = (QTL_PRIOR**)calloc(qtl->N,sizeof(QTL_PRIOR*)); int i; for(i=0;iN;i++) { qp[i] = (QTL_PRIOR*)calloc(qtl->S,sizeof(QTL_PRIOR)); } return qp; } int entrycmp( const void *a, const void *b ) { const PARENT_KEY *A = (PARENT_KEY*)a; const PARENT_KEY *B = (PARENT_KEY*)b; return strcmp( (const char*)A->key, (const char*)B->key ); } happy.hbrem/src/cmp.c0000644000261400006360000000436012254616417013774 0ustar00rmottmott/* CMP contains standrd comparison functions for use with qsort */ #define _GNU_SOURCE #include #include #include #include"cmp.h" int icmp( const void *A, const void *B) { const int *a = (const int*)A; const int *b = (const int*)B; return *a-*b; } int Icmp( const void *A, const void *B) { const int **a = (const int**)A; const int **b = (const int**)B; return **a - **b; } int fcmp( const void *A, const void *B) { const float *a = (const float*)A; const float *b = (const float*)B; float x = *a - *b; if ( x > 0.0 ) return 1; else if ( x < 0.0 ) return -1; else return 0; } int Fcmp( const void *A, const void *B) { const float **a = (const float**)A; const float **b = (const float**)B; float x = **a - **b; if ( x > 0.0 ) return 1; else if ( x < 0.0 ) return -1; else return 0; } int Rstrcmp( const void *A, const void *B) { const char *a = (const char*)A; const char *b = (const char*)B; /* string comparison with strings reversed, case insensitive */ int la = strlen(a)-1; int lb = strlen(b)-1; int n; while ( la && lb ) { if (n = ( ((int)a[la--]) - ((int)b[lb--]) ) ) { return n; } } return la-lb; } int Strcmp( const void *A, const void *B) { const char **a = (const char**)A; const char **b = (const char**)B; return strcmp(*a,*b); } /* a version of strcmp which works with null strings */ int nStrcmp( const void *A, const void *B) { const char *a = (const char*)A; const char *b = (const char*)B; if ( a && b ) return strcmp(a,b); else if ( a ) return 1; else if ( b ) return -1; else return 0; } int SStrcmp( const void *A, const void *B) { const char ***a = (const char***)A; const char ***b = (const char***)B; return Strcmp((const void*)*a,(const void*)*b); } int uscmp( const void *A, const void *B) { const unsigned short *a = (const unsigned short*)A; const unsigned short *b = (const unsigned short*)B; return (int)(*a-*b); } int dcmp( const void *A, const void *B) { const double *a = (const double*)A; const double *b = (const double*)B; double x = *a - *b; if ( x > 0.0 ) return 1; else if ( x < 0.0 ) return -1; else return 0; } happy.hbrem/src/cmp.h0000644000261400006360000000116512254616417014001 0ustar00rmottmott/* headers for simple cmp functions, for use with qsort and bsearch */ #ifndef _CMP_H_ #define _CMP_H_ int icmp( const void *, const void *); /* int */ int Icmp( const void *, const void *); /* int** */ int dcmp( const void *, const void *); /* double* */ int fcmp( const void *, const void *); /* float* */ int Fcmp( const void *, const void *); /* float** */ int Rstrcmp( const void *, const void *); /* Reversed strcmp */ int Strcmp( const void *, const void *); /* for char** */ int SStrcmp( const void *, const void *); /* for char *** !!!! */ int uscmp( const void *, const void *); /* unsigned short */ #endif happy.hbrem/src/happy.h0000644000261400006360000002134712254616417014347 0ustar00rmottmott#ifndef _HAPPY_H_ #define _HAPPY_H_ #include typedef enum { BOTH, LEFT, UNLINKED, RIGHT, LINKAGE_STATES } LINKAGESTATES; typedef enum {UNKNOWN, MALE, FEMALE } GENDER; #define NOT_FOUND -1 #define ND_ALLELE "NA" typedef struct { int markers; int *chrom1; int *chrom2; } CHROM_PAIR; typedef struct { int alleles; char *marker_name; char **allele_name; double *allele_freq; /* observed frequency of alleles */ double **pr_AtoS; /* prob of strain s | allele a */ double entropy; char chromosome[20]; /* the chromosome of the marker */ double position; /* estimate of the distance of the QTL from the left-hand end */ double ProbSame; /* prob of an observable recomb between this and the next marker */ double **prior; /* array of probabilities that the pair of QTL states are in */ int *which_allele; int ND; /* index of the allele corresponding to "ND" (missing) */ } ALLELE_FREQ; typedef struct { int strains; char **strain_name; int markers; int generations; double *Pr_ss; /* array of transition probabilities for staying in same state */ double *Pr_st; /* array of transition probabilities for changing state */ ALLELE_FREQ *af; double MinDist; /* minimum distance apart for markers */ } ALLELES; /* DP_MATRICES contains the forward and backward dynamic-programming matrices */ typedef struct { double ***Left; /* forward DP matrix */ double ***Right; /* backward DP matrix */ double *NonRecomb; /* expected number of non-recombinant chromosomes */ } DP_MATRICES; /* HAPLOID_DP_MATRICES contains the forward and backward dynamic-programming matrices for haploid genomes*/ typedef struct { double **Left; /* forward DP matrix */ double **Right; /* backward DP matrix */ double *NonRecomb; /* expected number of non-recombinant chromosomes */ } HAPLOID_DP_MATRICES; /* QTL_FIT contains all the data associated with fitting the QTL */ typedef struct { int locus; /* left-hand marker locus of current interest */ double rss; /* the residual sum of squares */ double fss; /* the fitting sum of squares */ double F; /* the F-ratio */ double pval; /* hte p-value of the F */ double mean; /* the estimated mean trait response */ double *trait; /* array of estimated trait effects for each strain */ double *trait_error; /* array of estimated trait standard errors for each strain */ double sigma; /* estimated residual standard error */ int *trait1; /* array of N predicted trait states for chrom1 */ int *trait2; /* array of N predicted trait states for chrom2 */ int left, right; /* counts of the number of chromosomes allocated left and right */ double **design_matrix; /* alternative expectations of traits for regression */ double *residual; /* residduals from fit */ } QTL_FIT; /* ANCESTRY is a matrix giving the expected proportion of each founder strain to be found in each subject; only used if these fractions are unequal */ typedef struct { int N; int S; char **strain_name; char **subject_name; double **prob; double ****pr_AtoS; /* subject-specific prob of strain s | allele a */ } ANCESTRY; /* QTL_DATA is a portmanteau struct that contains pretty much all the data */ typedef struct { char *filename; /* Name of the data-set */ int N; /* Number of individuals */ int M; /* Number of markers */ int S; /* Number of strains */ char *missingCode; /* missing allele code */ int haploid; /* boolean indicating if data are haploid (== inbred)*/ ALLELES *alleles; /* pointer to ALLELES struct containing the founder strain info */ ANCESTRY *an; /* pointer to ANCESTRY struct containing the subject-specific ancestral strain probabilities (can be null)*/ char **name; /* array of names of individuals */ double *observed; /* array of N observed trait values */ CHROM_PAIR *genos; /* array of N CHROM_PAIR structs containing raw marker genotypes */ CHROM_PAIR *haplos; /* array of N CHROM_PAIR structs containing deduced strain haplotypes */ DP_MATRICES *dp_matrices; /* array of N dynamic-programming matrices for computing the priors */ HAPLOID_DP_MATRICES *haploid_dp_matrices; /* array of N dynamic-programming matrices for computing the priors for haploid genomes*/ QTL_FIT *fit; /* array of QTL_FIT structures, for each marker locus */ QTL_FIT *null_model; /* QTL_FIT struct containing the null model fit */ double drop; /* factor for reducing search space of prior configurations */ int from_marker; int to_marker; int phase_known; /* switch indicating whether the phase of the genotypes is known - ie they are haplotypes */ int use_parents; /* switch indicating whether pedigree data is available */ int *mother; /* array of indices to mother , -1 if not present */ int *father; /* array of indices to father, -1 if not present */ char **family; /* array of family names (null if no family) */ int *sex; /* array of sex indices +1 (male) 0 (unknown) 2 (female) */ } QTL_DATA; typedef struct { double prior, posterior, cum; } QTL_PRIOR; typedef struct { double key, value; } KV; typedef struct { char *key; int id;; } PARENT_KEY; /* function prototypes */ CHROM_PAIR *new_chrom_pair( int markers ); ALLELES *input_allele_frequencies( FILE *fp, int generations, char *missingCode, double MinDist, int verbose ); QTL_PRIOR ***compute_qtl_priors( QTL_DATA *qtl, QTL_PRIOR ***qp, int locus, double **prior ); QTL_PRIOR **compute_haploid_qtl_priors( QTL_DATA *qtl, QTL_PRIOR **qp, int locus ); int qpcmp( const void *A, const void *B ); double fit_null_qtl_model( QTL_DATA *qtl_data ); void allocate_traits( QTL_DATA *q, QTL_PRIOR ***p, QTL_FIT *fit, int mode ); void fit_qtl( QTL_DATA *q, int locus, int verbose, int shuffles ); double fit_linear_additive_model( QTL_DATA *qtl, QTL_FIT *fit, int shuffles, int verbose ); QTL_FIT *allocate_qtl_fit( QTL_FIT *fit, int N, int strains ); void print_qtl_data ( QTL_DATA *q, QTL_FIT *fit, FILE *fp ); void qtl_fit_cp( QTL_FIT *fit1, QTL_FIT *fit2, int N, int S ); QTL_DATA *read_qtl_data( FILE *fp, char *name, ALLELES *a, int verbose, int use_parents, int ped_format, char *missing ); void write_qtl_data( FILE *fp, QTL_DATA *q ); int check_and_apply_ancestry(QTL_DATA *q ); double ***summed_dp_matrix( QTL_DATA *qtl, int individual, double *p1, double *p2, int direction ); double **haploid_summed_dp_matrix( QTL_DATA *qtl, int individual, double *Pr_ss, double *Pr_st, int direction ); void create_summed_dp_matrices( QTL_DATA *q ); void create_haploid_summed_dp_matrices( QTL_DATA *q ); QTL_PRIOR ***allocate_qtl_priors( QTL_DATA *q ); QTL_PRIOR **allocate_haploid_qtl_priors( QTL_DATA *qtl ); int remove_partial_fit( QTL_DATA *q, char *marker, int verbose, int fail ); void permute_data( double *data , int N ); void permute_genotypes( QTL_DATA *q ); void pointwise_mapping( QTL_DATA *q, double step, int verbose ); void pointwise_interval_mapping_probabilities( QTL_DATA *q, int locus, double c, double **prior ); QTL_DATA *resample_qtl_data( QTL_DATA *q, QTL_DATA *r ); void bootstrap_analysis( QTL_DATA *q, int bootstrap, char *bootstart, char *bootstop, int verbose ); void sequential_fit( QTL_DATA *q ); int marker_index( const char *name, QTL_DATA *q, const int isIntervalModel ); int genotype_difference( QTL_DATA *q, int i, int j ); int pdump_prob_data( FILE *fp, int locus, QTL_DATA *q ); double ** additive_design_matrix( QTL_DATA *q, int locus ); SEXP happy( SEXP datafile, SEXP allelesfile, SEXP generations, SEXP phase, SEXP file_format, SEXP missing_code, SEXP do_dp, SEXP min_dist, SEXP haploid, SEXP anfilename ); SEXP getListElement(SEXP list, char *str); QTL_DATA * validateParams ( SEXP handle, SEXP marker, int *locus, const int isIntervalModel); SEXP happyprobs ( SEXP handle, SEXP marker ); SEXP happydesign( SEXP handle, SEXP marker, SEXP model ); double phaseProb( int a1, int a2, int m1, int m2, int p1, int p2, int NA ); SEXP happygenotype ( SEXP handle, SEXP marker ); SEXP happynonrecomb ( SEXP handle, SEXP marker ); ANCESTRY *read_subject_ancestries( FILE *fp, char *filename, int verbose ); void heterozygosity( QTL_DATA *q ); double subject_heterozygosity( QTL_DATA *q, int individual ); double marker_heterozygosity( QTL_DATA *q, int marker ); #endif happy.hbrem/src/hbcore.c0000644000261400006360000001442612254616417014463 0ustar00rmottmott#include #include #include #include "hbcore.h" XMAT* Xdip(double **Xmat, int nrow, int ncol) { int i,j; double new, dcheck, dncol, dnrow, maxH, Hbar; double mu_avNi, sd_avNi, var_avNi; double *Hvec=NULL, *av_Ni=NULL; double **newX=NULL, **cumX=NULL; XMAT *full=NULL; // memory allocation full = (XMAT*)calloc(1,sizeof(XMAT)); newX = (double**)calloc(nrow,sizeof(double*)); cumX = (double**)calloc(nrow,sizeof(double*)); for (i=0; i= 1.00001) ) { Rprintf("individual %i : dcheck = %e ERROR HMM probs do not sum to 1\n",i,dcheck); } Hvec[i] = 0.0; for (j=0; j= 1.00001) ) { Rprintf("individual %i : dcheck = %e ERROR HMM probs do not sum to 1\n",i,dcheck); } Hvec[i] = 0.0; for (j=0; j=0; j--) { // load the shuffle table (after 8 warm-ups) k=(*idum)/IQ1; *idum=IA1*(*idum-k*IQ1)-k*IR1; if (*idum < 0) *idum += IM1; if (j < NTAB) iv[j] = *idum; } iy=iv[0]; } k=(*idum)/IQ1; // start here when not initialising *idum=IA1*(*idum-k*IQ1)-k*IR1; // compute idum = (IA1*idum) % IM1 without overflows, by Schrage's \ method if (*idum < 0) *idum += IM1; k=idum2/IQ2; idum2=IA2*(idum2-k*IQ2)-k*IR2; // compute idum2 = (IA2*idum) % IM2 likewise if (idum2 < 0) idum2 += IM2; j=iy/NDIV; // will be in the range 0..NTAB-1 iy=iv[j]-idum2; // here idum is shuffled, idum and idum2 are combined to generate o\ utput iv[j] = *idum; if (iy < 1) iy += IMM1; if ((temp=AM*iy) > RNMX) return RNMX; else return temp; // because users don't expect endpoint values } happy.hbrem/src/hbcore.h0000644000261400006360000000066112254616417014464 0ustar00rmottmott#include #include // has exit() function #include #include #include typedef struct { double **X, **cumX; double *Hvec, *av_Ni; double Hbar, muNi, sdNi; } XMAT; XMAT* Xdip(double **Xmat, int nrow, int ncol); XMAT* Xhap(double **Xmat, int nrow, int ncol); double NRroundit(double d, int dig); float ran1(long *idum); float ran2(long *idum); void NRsort(int nr, double *arr); happy.hbrem/src/hbrem.c0000644000261400006360000002270612254616417014316 0ustar00rmottmott #define _ISOC99_SOURCE #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include"happy.h" /*#include"cl.h" #include"cmp.h" #include"stats.h" */ #include "hbcore.h" #include "hbrem.h" SEXP hbrem( SEXP RX, SEXP HaploidInd, SEXP Ndip, SEXP Nstrain, SEXP Nind, SEXP Npost, SEXP Nbin, SEXP Ry ) { SEXP Rstats = R_NilValue; SEXP Reffectmeans = R_NilValue; SEXP ReffectSDs = R_NilValue; SEXP ReffectNis = R_NilValue; SEXP Rstrainmeans = R_NilValue; SEXP list = R_NilValue; int haploid_ind,nbin,ncol,nrow,nsim,nstrains; double *y=NULL; double *pntRstats=NULL, *pntReffectmeans=NULL, *pntReffectSDs=NULL; double *pntReffectNis=NULL, *pntRX=NULL, *pntRstrainmeans=NULL; PROTECT(RX = AS_NUMERIC(RX)); pntRX = (double*)NUMERIC_POINTER(RX); PROTECT(HaploidInd = AS_INTEGER(HaploidInd)); haploid_ind = (int)INTEGER_POINTER(HaploidInd)[0]; PROTECT(Ndip = AS_INTEGER(Ndip)); ncol = (int)INTEGER_POINTER(Ndip)[0]; PROTECT(Nstrain = AS_INTEGER(Nstrain)); nstrains = (int)INTEGER_POINTER(Nstrain)[0]; PROTECT(Nind = AS_INTEGER(Nind)); nrow = (int)INTEGER_POINTER(Nind)[0]; PROTECT(Npost = AS_INTEGER(Npost)); nsim = (int)INTEGER_POINTER(Npost)[0]; PROTECT(Nbin = AS_INTEGER(Nbin)); nbin = (int)INTEGER_POINTER(Nbin)[0]; /* Rprintf( "nbin %d npost %d nrow %d ncol %d haploid_ind %d\n", nbin, nsim, nrow, ncol, haploid_ind); */ PROTECT(Ry = AS_NUMERIC(Ry)); y = (double*)NUMERIC_POINTER(Ry); PROTECT(Rstats = NEW_NUMERIC(45)); // allocates storage space pntRstats = (double*)NUMERIC_POINTER(Rstats); PROTECT(Reffectmeans = NEW_NUMERIC(ncol)); // allocates storage space pntReffectmeans = (double*)NUMERIC_POINTER(Reffectmeans); PROTECT(ReffectSDs = NEW_NUMERIC(ncol)); // allocates storage space pntReffectSDs = (double*)NUMERIC_POINTER(ReffectSDs); PROTECT(ReffectNis = NEW_NUMERIC(ncol)); // allocates storage space pntReffectNis = (double*)NUMERIC_POINTER(ReffectNis); PROTECT(Rstrainmeans = NEW_NUMERIC(nstrains)); pntRstrainmeans = (double*)NUMERIC_POINTER(Rstrainmeans); int i,j,inc; long idum=0; float init; double **Xmat=NULL; double *Tcomb=NULL; XMAT *xmat=NULL; POST *SLsamp=NULL; SSTA *SLstats=NULL; Xmat = (double**)calloc(nrow,sizeof(double*)); for (j=0; j #include #include #include #include"readline.h" /* functions for reading in lines (C) Richard Mott,, ICRF */ void uncomment( char *string ) /* truncates string at the first ! */ { while ( *string != '!' && *string != '#' && *string != 0 ) string++; *string = 0; } int read_line( FILE *file, char *string ) { int c; int i=0; if ( file != NULL ) { while((c=getc(file))) { if (!i && c==EOF) return EOF; if (i && c==EOF ) return i; if (c=='\n') return i; string[i] = c; string[++i] = '\0'; } } return EOF; } int next_line( FILE *file ) { int c; if ( file != NULL ) { while((c=getc(file))) { if (feof(file)) return 0; if (c=='\n') return 1; } } return EOF; } int not_blank( char *string ) /* checks whether string is full of white space */ { while ( *string != 0 ) { if ( ! isspace(*string) ) return 1; string++; } return 0; } /* reads in successive lines, truncating comments and skipping blank lines */ int skip_comments( FILE *file, char *string ) { int n = EOF; *string = 0; if ( file ) { while ( ( n = read_line( file, string ) ) != EOF ) { uncomment( string ); if ( not_blank( string ) ) return n; } } return n; } int legal_string( char *string, char **strings, int size, int *value ) /* checks if string is a member os strings, and sets value to the index in the array strings returns 1 on success and 0 on failure */ { int i; if ( string ) for(i=0;i #include #include #include #include #include #include #include #include"stats.h" #include"cmp.h" /* misc statistical procedures */ double rank_lin_regression( double *x, double *y, int from, int to, double *intercept, double *slope, double *sigma, double *t_slope ) { double *rankx = replace_by_ranks( x, from, to ); double *ranky = replace_by_ranks( y, from, to ); double c; double e_slope, e_intercept; c = lin_regression( rankx, ranky, 0, to-from+1, intercept, slope, sigma, t_slope, &e_slope, &e_intercept); free(rankx); free(ranky); return c; } double lin_regression( double *x, double *y, int from, int to, double *intercept, double *slope, double *sigma, double *t_slope, double *stderr_slope, double *stderr_intercept ) { double s_x, s_y, ss_x, ss_y, ss_xy; int k; double N=to-from+1, R; s_x = s_y = ss_x = ss_y = ss_xy = 0.0; for(k=from;k<=to;k++) { s_x += x[k]; ss_x += x[k]*x[k]; s_y += y[k]; ss_y += y[k]*y[k]; ss_xy += y[k]*x[k]; } s_x /= N; s_y /= N; ss_x = ss_x-s_x*s_x*N; ss_y = ss_y-s_y*s_y*N; ss_xy = ss_xy-s_x*s_y*N; *slope = ss_xy/ss_x; *intercept = s_y - *slope * s_x; *sigma = sqrt( (ss_y - *slope *ss_xy)/(N-2) ); *t_slope = *slope*sqrt(ss_x)/(*sigma); *stderr_slope = (*sigma)/sqrt(ss_x); *stderr_intercept = *sigma*sqrt((1.0/N+s_x*s_x/ss_x)); R = ss_xy/sqrt(ss_x*ss_y); return R; /* correlation coefficient */ } double *replace_by_ranks( double *array, int start, int stop ) { int len = stop-start+1; double *rank = (double*)calloc( len, sizeof(double) ); double **ptr = (double**)calloc( len, sizeof(double*) ); int n; for(n=0;n 0.0 ? ans : 2.0-ans; } happy.hbrem/src/stats.h0000644000261400006360000000167112254616417014362 0ustar00rmottmott/* misc statistical procedures */ #ifndef _STATS_H_ #define _STATS_H_ double rank_lin_regression( double *x, double *y, int from, int to, double *intercept, double *slope, double *sigma, double *t_slope ); double lin_regression( double *x, double *y, int from, int to, double *intercept, double *slope, double *sigma, double *t_slope, double *stderr_slope, double *stderr_intercept ); double *replace_by_ranks( double *array, int start, int stop ); double durbin_watson_test( double *x, double *y, int from, int to, double slope, double intercept ); double perm_test( int N, int M, int **table, int *seed, int permutations ); double chi_stat( int N, int M, int **table, int *margin1, int *margin2, int total ); int **reduce_table( int *N, int *M, int **table ); double erfcc( double x ); double normal_tail( double z ); double betai(double a, double b, double x); double betacf (double a, double b, double x); double gammln (double xx); #endif happy.hbrem/src/subrout.c0000744000261400006360000011412012254616417014715 0ustar00rmottmott#include #include #include #include"hbcore.h" #include"hbrem.h" #include"cmp.h" POST* single_locus_jointpostX(XMAT *xmat,double *y,int nsim,int ncol,int nrow,int nbin,long *idum) { FILE *out=NULL; char check; char sampoutname[25]="SLXsamp_R.dat"; int i,sim; double nu,kT; int **NiX=NULL; double *postkT=NULL, *postvar=NULL, *postmu=NULL, *nullvar=NULL, *nullmu=NULL; double *Lnull=NULL, *Lqtl=NULL; double **postT=NULL, **yTbarX=NULL; SIMX *selX=NULL; GRKT *kTdist=NULL; POST *single=NULL; single = (POST*)calloc(1,sizeof(POST)); postkT = (double*)calloc(nsim,sizeof(double)); postvar = (double*)calloc(nsim,sizeof(double)); postmu = (double*)calloc(nsim,sizeof(double)); postT = (double**)calloc(nsim,sizeof(double*)); yTbarX = (double**)calloc(nsim,sizeof(double*)); for (i=0; i= nbin ) { dn = (double)(*trueX).Ni[ indvec[i][j] ]; sum2 = ( sum2 + dn ); sum1 = ( sum1 + dn*(*SLstats).muT[ indvec[i][j] ] ); } } Tcomb[i] = sum1/sum2; } for (i=0; i 0.0 ) { dn = (*xmat).av_Ni[ indvec[i][j] ]; sum2 = ( sum2 + dn ); sum1 = ( sum1 + dn*(*SLstats).muT[ indvec[i][j] ] ); } } Tcomb[i] = sum1/sum2; } for (i=0; i= nbin ) { muT[j] = muT[j] + (*SLsamp).T[i][j]; } } mu_nullmu = mu_nullmu + (*SLsamp).null_mu[i]; mu_nullvar = mu_nullvar + (*SLsamp).null_var[i]; avlik_null = avlik_null + (*SLsamp).lik_null[i]; avlik_qtl = avlik_qtl + (*SLsamp).lik_qtl[i]; } mukT = mukT/dnsim; mumu = mumu/dnsim; muvar = muvar/dnsim; for (j=0; j= nbin ) { muT[j] = muT[j]/dnsim; } } mu_nullmu = mu_nullmu/dnsim; mu_nullvar = mu_nullvar/dnsim; avlik_null = avlik_null/dnsim; avlik_qtl = avlik_qtl/dnsim; modevar = (((*SLsamp).N - 3.0)/((*SLsamp).N + 1.0))*muvar; mode_nullvar = (((*SLsamp).N - 3.0)/((*SLsamp).N + 1.0))*mu_nullvar; varkT = 0.0; for (i=0; i= nbin ) { sdT[j] = sdT[j] + ((*SLsamp).T[i][j] - muT[j])*((*SLsamp).T[i][j] - muT[j]); } } } varkT = varkT/dnsim; for (j=0; j= nbin ) { sdT[j] = sdT[j]/dnsim; sdT[j] = sqrt(sdT[j]); } } ga = ((mukT*mukT*(1.0 - mukT)/varkT) - mukT); gb = ga*((1.0 - mukT)/mukT); modekT = (ga - 1.0)/(ga + gb - 2.0); if (modekT < 0.0) { modekT = 0.0; } else if (modekT > 1.0) { modekT = 1.0; } // plug-ins for pD Lnull = null_lik(trueX,y,mode_nullvar,mu_nullmu,nrow,nbin); Lqtl = qtl_lik(trueX,y,modekT,modevar,mumu,muT,nrow,nbin); pD_qtl = -2.0*(avlik_qtl - Lqtl); pD_null = -2.0*(avlik_null - Lnull); DIC_qtl = ( pD_qtl - 2.0*avlik_qtl ); DIC_null = ( pD_null - 2.0*avlik_null ); DIC_diff = ( DIC_null - DIC_qtl ); // plug-ins for BIC Lfocqtl = qtl_Lfoc(trueX,y,modekT,modevar,mumu,nrow,ncol,nbin); BIC_qtl = ( (3.0*log((*SLsamp).N)) - (2.0*Lfocqtl) ); BIC_null = ( (2.0*log((*SLsamp).N)) - (2.0*Lnull) ); BF = exp(-(BIC_null - BIC_qtl)/2.0); logBF = -log10(BF); count1 = 0; for (i=0; i count1 ) { count1 = kThst[i]; } } // sort posterior sample // NRsort doesn't use 0 index - entries start at 1 qsort( stkT+1, nsim, sizeof(double), dcmp ); qsort( stmu+1, nsim, sizeof(double), dcmp ); qsort( stvar+1, nsim, sizeof(double), dcmp ); // NRsort(nsim,stkT); // NRsort(nsim,stmu); // NRsort(nsim,stvar); // calculate medians index = (int)(0.5*((float)nsim)); cred = stkT[index] + stkT[index + 1]; medkT = cred/2.0; cred = stmu[index] + stmu[index + 1]; medmu = cred/2.0; cred = stvar[index] + stvar[index + 1]; medvar = cred/2.0; // calculate HPD credible intervals kTdiff = 1.0; mudiff = ( stmu[nsim] - stmu[1] ); vardiff = ( stvar[nsim] - stvar[1] ); i = 1; while ( (i+(99*nsim/100)) < nsim ) { bkT = stkT[i]; tkT = stkT[i+(99*nsim/100)]; if ( (tkT - bkT) < kTdiff ) { kTdiff = (tkT - bkT); kTb99 = bkT; kTt99 = tkT; } bmu = stmu[i]; tmu = stmu[i+(99*nsim/100)]; if ( (tmu - bmu) < mudiff ) { mudiff = (tmu - bmu); mub99 = bmu; mut99 = tmu; } bvar = stvar[i]; tvar = stvar[i+(99*nsim/100)]; if ( (tvar - bvar) < vardiff ) { vardiff = (tvar - bvar); varb99 = bvar; vart99 = tvar; } i = i + 1; } kTdiff = 1.0; mudiff = ( stmu[nsim] - stmu[1] ); vardiff = ( stvar[nsim] - stvar[1] ); i = 1; while ( (i+(95*nsim/100)) < nsim ) { bkT = stkT[i]; tkT = stkT[i+(95*nsim/100)]; if ( (tkT - bkT) < kTdiff ) { kTdiff = (tkT - bkT); kTt95 = tkT; kTb95 = bkT; } bmu = stmu[i]; tmu = stmu[i+(95*nsim/100)]; if ( (tmu - bmu) < mudiff ) { mudiff = (tmu - bmu); mub95 = bmu; mut95 = tmu; } bvar = stvar[i]; tvar = stvar[i+(95*nsim/100)]; if ( (tvar - bvar) < vardiff ) { vardiff = (tvar - bvar); varb95 = bvar; vart95 = tvar; } i = i + 1; } kTdiff = 1.0; mudiff = ( stmu[nsim] - stmu[1] ); vardiff = ( stvar[nsim] - stvar[1] ); i = 1; while ( (i+(75*nsim/100)) < nsim ) { bkT = stkT[i]; tkT = stkT[i+(75*nsim/100)]; if ( (tkT - bkT) < kTdiff ) { kTdiff = (tkT - bkT); kTt75 = tkT; kTb75 = bkT; } bmu = stmu[i]; tmu = stmu[i+(75*nsim/100)]; if ( (tmu - bmu) < mudiff ) { mudiff = (tmu - bmu); mub75 = bmu; mut75 = tmu; } bvar = stvar[i]; tvar = stvar[i+(75*nsim/100)]; if ( (tvar - bvar) < vardiff ) { vardiff = (tvar - bvar); varb75 = bvar; vart75 = tvar; } i = i + 1; } kTdiff = 1.0; mudiff = ( stmu[nsim] - stmu[1] ); vardiff = ( stvar[nsim] - stvar[1] ); i = 1; while ( (i+(50*nsim/100)) < nsim ) { bkT = stkT[i]; tkT = stkT[i+(50*nsim/100)]; if ( (tkT - bkT) < kTdiff ) { kTdiff = (tkT - bkT); kTt50 = tkT; kTb50 = bkT; } bmu = stmu[i]; tmu = stmu[i+(50*nsim/100)]; if ( (tmu - bmu) < mudiff ) { mudiff = (tmu - bmu); mub50 = bmu; mut50 = tmu; } bvar = stvar[i]; tvar = stvar[i+(50*nsim/100)]; if ( (tvar - bvar) < vardiff ) { vardiff = (tvar - bvar); varb50 = bvar; vart50 = tvar; } i = i + 1; } HPDlkT[3] = kTb99; HPDukT[3] = kTt99; HPDlkT[2] = kTb95; HPDukT[2] = kTt95; HPDlkT[1] = kTb75; HPDukT[1] = kTt75; HPDlkT[0] = kTb50; HPDukT[0] = kTt50; HPDlmu[3] = mub99; HPDumu[3] = mut99; HPDlmu[2] = mub95; HPDumu[2] = mut95; HPDlmu[1] = mub75; HPDumu[1] = mut75; HPDlmu[0] = mub50; HPDumu[0] = mut50; HPDlvar[3] = varb99; HPDuvar[3] = vart99; HPDlvar[2] = varb95; HPDuvar[2] = vart95; HPDlvar[1] = varb75; HPDuvar[1] = vart75; HPDlvar[0] = varb50; HPDuvar[0] = vart50; free(stkT); free(stmu); free(stvar); (*stats).kThist = kThst; (*stats).HPDlkT = HPDlkT; (*stats).HPDukT = HPDukT; (*stats).modekT = modekT; (*stats).mukT = mukT; (*stats).medkT = medkT; (*stats).ga = ga; (*stats).gb = gb; (*stats).HPDlmu = HPDlmu; (*stats).HPDumu = HPDumu; (*stats).mumu = mumu; (*stats).medmu = medmu; (*stats).HPDlvar = HPDlvar; (*stats).HPDuvar = HPDuvar; (*stats).modevar = modevar; (*stats).muvar = muvar; (*stats).medvar = medvar; (*stats).muT = muT; (*stats).sdT = sdT; (*stats).pD_null = pD_null; (*stats).pD_qtl = pD_qtl; (*stats).DIC_null = DIC_null; (*stats).DIC_qtl = DIC_qtl; (*stats).DIC_diff = DIC_diff; (*stats).BIC_null = BIC_null; (*stats).BIC_qtl = BIC_qtl; (*stats).BF = BF; (*stats).logBF = logBF; return stats; } SSTA* single_locus_sumstatsX(XMAT *xmat, POST *SLsamp, double *y, int nsim, int ncol, int nrow, int nbin) { int i,j,index,count1,nprc,sum; double cred,dnbin,dnrow,dnsim,prc; double mukT,medkT,mumu,medmu,muvar,medvar,modevar,modekT; double BF,BIC_qtl,BIC_null,logBF,DIC_null,DIC_qtl,DIC_diff; double avlik_qtl,avlik_null,Lfocqtl,Lqtl,Lnull,pD_null,pD_qtl; double ga,gb,varkT,mu_nullmu,mu_nullvar,mode_nullvar,av_ysq,av_ybar; double bkT,bmu,bvar,tkT,tmu,tvar,kTdiff,mudiff,vardiff; double kTb99,kTt99,kTb95,kTt95,kTb75,kTt75,kTb50,kTt50; double mub99,mut99,mub95,mut95,mub75,mut75,mub50,mut50; double varb99,vart99,varb95,vart95,varb75,vart75,varb50,vart50; int *kThst=NULL; double *stkT=NULL,*HPDlkT=NULL,*HPDukT=NULL; double *stmu=NULL,*HPDlmu=NULL,*HPDumu=NULL; double *stvar=NULL,*HPDlvar=NULL,*HPDuvar=NULL; double *muT=NULL,*sdT=NULL,*nT=NULL,*av_yT=NULL; SSTA *stats=NULL; nprc = 201; prc = 200.0; stats = (SSTA*)calloc(1,sizeof(SSTA)); HPDlkT = (double*)calloc(4,sizeof(double)); HPDukT = (double*)calloc(4,sizeof(double)); HPDlmu = (double*)calloc(4,sizeof(double)); HPDumu = (double*)calloc(4,sizeof(double)); HPDlvar = (double*)calloc(4,sizeof(double)); HPDuvar = (double*)calloc(4,sizeof(double)); kThst = (int*)calloc(nprc,sizeof(int)); stkT = (double*)calloc((nsim+1),sizeof(double)); stmu = (double*)calloc((nsim+1),sizeof(double)); stvar = (double*)calloc((nsim+1),sizeof(double)); muT = (double*)calloc(ncol,sizeof(double)); sdT = (double*)calloc(ncol,sizeof(double)); nT = (double*)calloc(ncol, sizeof(double)); av_yT = (double*)calloc(ncol,sizeof(double)); dnsim = (double)nsim; nsim = (int)nsim; dnrow = (double)nrow; nrow = (int)nrow; dnbin = (double)nbin; nbin = (int)nbin; mukT = 0.0; mumu = 0.0; muvar = 0.0; mu_nullmu = 0.0; mu_nullvar = 0.0; avlik_null = 0.0; avlik_qtl = 0.0; for (i=0; i= nbin ) { muT[j] = muT[j] + (*SLsamp).T[i][j]; nT[j] = nT[j] + 1.0; } } mu_nullmu = mu_nullmu + (*SLsamp).null_mu[i]; mu_nullvar = mu_nullvar + (*SLsamp).null_var[i]; avlik_null = avlik_null + (*SLsamp).lik_null[i]; avlik_qtl = avlik_qtl + (*SLsamp).lik_qtl[i]; } mukT = mukT/dnsim; mumu = mumu/dnsim; muvar = muvar/dnsim; for(j=0; j 0.0 ) { muT[j] = muT[j]/nT[j]; } } mu_nullmu = mu_nullmu/dnsim; mu_nullvar = mu_nullvar/dnsim; avlik_null = avlik_null/dnsim; avlik_qtl = avlik_qtl/dnsim; modevar = ((dnrow - 3.0)/(dnrow + 1.0))*muvar; mode_nullvar = ((dnrow - 3.0)/(dnrow + 1.0))*mu_nullvar; varkT = 0.0; for (i=0; i= nbin ) { sdT[j] = sdT[j] + ((*SLsamp).T[i][j] - muT[j])*((*SLsamp).T[i][j] - muT[j]); } } } varkT = varkT/dnsim; for (j=0; j 0.0 ) { sdT[j] = sdT[j]/nT[j]; sdT[j] = sqrt(sdT[j]); } } ga = ((mukT*mukT*(1.0 - mukT)/varkT) - mukT); gb = ga*((1.0 - mukT)/mukT); modekT = (ga - 1.0)/(ga + gb - 2.0); if (modekT < 0.0) { modekT = 0.0; } else if (modekT > 1.0) { modekT = 1.0; } // plug-ins for pD av_ysq = 0.0; av_ybar = 0.0; for (i=0; i 0.0 ) { av_yT[j] = av_yT[j]/(*xmat).av_Ni[j]; } } Lnull = null_plug((*xmat).av_Ni,av_ysq,av_ybar,mode_nullvar,mu_nullmu,ncol,nrow); Lqtl = qtl_plug(av_yT,(*xmat).av_Ni,av_ysq,modekT,modevar,mumu,muT,ncol,nrow); pD_qtl = -2.0*(avlik_qtl - Lqtl); pD_null = -2.0*(avlik_null - Lnull); DIC_qtl = ( pD_qtl - 2.0*avlik_qtl ); DIC_null = ( pD_null - 2.0*avlik_null ); DIC_diff = ( DIC_null - DIC_qtl ); // plug-ins for BIC Lfocqtl = qtl_LfocX(av_yT,(*xmat).av_Ni,av_ysq,av_ybar,modekT,modevar,mumu,ncol,nrow); BIC_qtl = ( (3.0*log(dnrow)) - (2.0*Lfocqtl) ); BIC_null = ( (2.0*log(dnrow)) - (2.0*Lnull) ); BF = exp(-(BIC_null - BIC_qtl)/2.0); logBF = -log10(BF); count1 = 0; for (i=0; i count1 ) { count1 = kThst[i]; } } // sort posterior sample // NRsort doesn't use 0 index - entries start at 1 qsort( stkT+1, nsim, sizeof(double), dcmp ); qsort( stmu+1, nsim, sizeof(double), dcmp ); qsort( stvar+1, nsim, sizeof(double), dcmp ); // NRsort(nsim,stkT); // NRsort(nsim,stmu); // NRsort(nsim,stvar); // calculate medians index = (int)(0.5*((float)nsim)); nsim = (int)nsim; cred = stkT[index] + stkT[index + 1]; medkT = cred/2.0; cred = stmu[index] + stmu[index + 1]; medmu = cred/2.0; cred = stvar[index] + stvar[index + 1]; medvar = cred/2.0; // calculate HPD credible intervals kTdiff = 1.0; mudiff = ( stmu[nsim] - stmu[1] ); vardiff = ( stvar[nsim] - stvar[1] ); i = 1; while ( (i+(99*nsim/100)) < nsim ) { bkT = stkT[i]; tkT = stkT[i+(99*nsim/100)]; if ( (tkT - bkT) < kTdiff ) { kTdiff = (tkT - bkT); kTb99 = bkT; kTt99 = tkT; } bmu = stmu[i]; tmu = stmu[i+(99*nsim/100)]; if ( (tmu - bmu) < mudiff ) { mudiff = (tmu - bmu); mub99 = bmu; mut99 = tmu; } bvar = stvar[i]; tvar = stvar[i+(99*nsim/100)]; if ( (tvar - bvar) < vardiff ) { vardiff = (tvar - bvar); varb99 = bvar; vart99 = tvar; } i = i + 1; } kTdiff = 1.0; mudiff = ( stmu[nsim] - stmu[1] ); vardiff = ( stvar[nsim] - stvar[1] ); i = 1; while ( (i+(95*nsim/100)) < nsim ) { bkT = stkT[i]; tkT = stkT[i+(95*nsim/100)]; if ( (tkT - bkT) < kTdiff ) { kTdiff = (tkT - bkT); kTt95 = tkT; kTb95 = bkT; } bmu = stmu[i]; tmu = stmu[i+(95*nsim/100)]; if ( (tmu - bmu) < mudiff ) { mudiff = (tmu - bmu); mub95 = bmu; mut95 = tmu; } bvar = stvar[i]; tvar = stvar[i+(95*nsim/100)]; if ( (tvar - bvar) < vardiff ) { vardiff = (tvar - bvar); varb95 = bvar; vart95 = tvar; } i = i + 1; } kTdiff = 1.0; mudiff = ( stmu[nsim] - stmu[1] ); vardiff = ( stvar[nsim] - stvar[1] ); i = 1; while ( (i+(75*nsim/100)) < nsim ) { bkT = stkT[i]; tkT = stkT[i+(75*nsim/100)]; if ( (tkT - bkT) < kTdiff ) { kTdiff = (tkT - bkT); kTt75 = tkT; kTb75 = bkT; } bmu = stmu[i]; tmu = stmu[i+(75*nsim/100)]; if ( (tmu - bmu) < mudiff ) { mudiff = (tmu - bmu); mub75 = bmu; mut75 = tmu; } bvar = stvar[i]; tvar = stvar[i+(75*nsim/100)]; if ( (tvar - bvar) < vardiff ) { vardiff = (tvar - bvar); varb75 = bvar; vart75 = tvar; } i = i + 1; } kTdiff = 1.0; mudiff = ( stmu[nsim] - stmu[1] ); vardiff = ( stvar[nsim] - stvar[1] ); i = 1; while ( (i+(50*nsim/100)) < nsim ) { bkT = stkT[i]; tkT = stkT[i+(50*nsim/100)]; if ( (tkT - bkT) < kTdiff ) { kTdiff = (tkT - bkT); kTt50 = tkT; kTb50 = bkT; } bmu = stmu[i]; tmu = stmu[i+(50*nsim/100)]; if ( (tmu - bmu) < mudiff ) { mudiff = (tmu - bmu); mub50 = bmu; mut50 = tmu; } bvar = stvar[i]; tvar = stvar[i+(50*nsim/100)]; if ( (tvar - bvar) < vardiff ) { vardiff = (tvar - bvar); varb50 = bvar; vart50 = tvar; } i = i + 1; } HPDlkT[3] = kTb99; HPDukT[3] = kTt99; HPDlkT[2] = kTb95; HPDukT[2] = kTt95; HPDlkT[1] = kTb75; HPDukT[1] = kTt75; HPDlkT[0] = kTb50; HPDukT[0] = kTt50; HPDlmu[3] = mub99; HPDumu[3] = mut99; HPDlmu[2] = mub95; HPDumu[2] = mut95; HPDlmu[1] = mub75; HPDumu[1] = mut75; HPDlmu[0] = mub50; HPDumu[0] = mut50; HPDlvar[3] = varb99; HPDuvar[3] = vart99; HPDlvar[2] = varb95; HPDuvar[2] = vart95; HPDlvar[1] = varb75; HPDuvar[1] = vart75; HPDlvar[0] = varb50; HPDuvar[0] = vart50; free(stkT); free(stmu); free(stvar); free(nT); free(av_yT); (*stats).kThist = kThst; (*stats).HPDlkT = HPDlkT; (*stats).HPDukT = HPDukT; (*stats).modekT = modekT; (*stats).mukT = mukT; (*stats).medkT = medkT; (*stats).ga = ga; (*stats).gb = gb; (*stats).HPDlmu = HPDlmu; (*stats).HPDumu = HPDumu; (*stats).mumu = mumu; (*stats).medmu = medmu; (*stats).HPDlvar = HPDlvar; (*stats).HPDuvar = HPDuvar; (*stats).modevar = modevar; (*stats).muvar = muvar; (*stats).medvar = medvar; (*stats).muT = muT; (*stats).sdT = sdT; (*stats).pD_null = pD_null; (*stats).pD_qtl = pD_qtl; (*stats).DIC_null = DIC_null; (*stats).DIC_qtl = DIC_qtl; (*stats).DIC_diff = DIC_diff; (*stats).BIC_null = BIC_null; (*stats).BIC_qtl = BIC_qtl; (*stats).BF = BF; (*stats).logBF = logBF; return stats; } double qtl_LfocX(double *av_yT, double *avNi, double av_ysq, double av_ybar, double kT, double var, double mu, int ncol, int nrow) { // integrated version of qtl_plug, compared to null_plug int i; double deno,dncol,dnrow,lik,prod1,sum1,sum2,sum3; dnrow = (double)nrow; nrow = (int)nrow; dncol = (double)ncol; ncol = (int)ncol; sum2 = 0.0; prod1 = 0.0; for (i=0; i 0.0 ) { deno = ( 1.0 - kT + kT*avNi[i] ); prod1 = prod1 + log(deno); sum2 = sum2 + ((avNi[i]*avNi[i]*(av_yT[i] - mu)*(av_yT[i] - mu))/deno); } } prod1 = prod1/2.0; sum1 = ( (dnrow*mu*(mu - 2.0*av_ybar)) + av_ysq ); sum3 = (sum1 - (kT*sum2))/(2.0*var*(1.0 - kT)); lik = ( (-dnrow/2.0)*log(2.0*M_PI) + ((dncol - dnrow)/2.0)*log(1.0 - kT) - (dnrow/2.0)*log(var) - prod1 - sum3 ); return lik; } double qtl_plug(double *av_yT, double *avNi, double av_ysq, double kT, double var, double mu, double *T, int ncol, int nrow) { int i; double dnrow,lik,sum2; dnrow = (double)nrow; nrow = (int)nrow; sum2 = 0.0; for (i=0; i 0.0 ) { sum2 = sum2 + ( (avNi[i])*(mu + T[i])*(mu + T[i] - 2.0*av_yT[i]) ); } } sum2 = sum2 + av_ysq; lik = ( ((-dnrow/2.0)*log(2.0*M_PI)) - ((dnrow/2.0)*log(1.0 - kT)) - ((dnrow/2.0)*log(var)) - (sum2/(2.0*var*(1.0 - kT))) ); return lik; } double qtl_lik(SIMX *trueX, double *y, double kT, double var, double mu, double *T, int nrow, int nbin) { int i,m; double Nsamp,lik,sum2; Nsamp = 0.0; sum2 = 0.0; for (i=0; i= nbin ) { sum2 = sum2 + (y[i] - mu - T[m])*(y[i] - mu - T[m]); Nsamp = Nsamp + 1.0; } } lik = ( ((-Nsamp/2.0)*log(2.0*M_PI)) - ((Nsamp/2.0)*log(1.0 - kT)) - ((Nsamp/2.0)*log(var)) - (sum2/(2.0*var*(1.0 - kT))) ); return lik; } double qtl_Lfoc(SIMX *trueX, double *y, double kT, double var, double mu, int nrow, int ncol, int nbin) { int i,m; double deno,dn,Nsamp,lik,prod1,sum1,sum2,sum3,tsamp; double *yTbar=NULL; yTbar = (double*)calloc(ncol,sizeof(double)); Nsamp = 0.0; sum1 = 0.0; for (i=0; i= nbin ) { sum1 = sum1 + ((y[i] - mu)*(y[i] - mu)); yTbar[m] = yTbar[m] + y[i]; Nsamp = Nsamp + 1.0; } } tsamp = 0.0; prod1 = 0.0; sum2 = 0.0; for (i=0; i= nbin ) { dn = (double)(*trueX).Ni[i]; deno = ( 1.0 - kT + kT*dn ); tsamp = tsamp + 1.0; prod1 = prod1 + log(deno); yTbar[i] = yTbar[i]/dn; sum2 = sum2 + ((dn*dn*(yTbar[i] - mu)*(yTbar[i] - mu))/deno); } } prod1 = prod1/2.0; sum3 = (sum1 - (kT*sum2))/(2.0*var*(1.0 - kT)); lik = ( (-Nsamp/2.0)*log(2.0*M_PI) + ((tsamp - Nsamp)/2.0)*log(1.0 - kT) - (Nsamp/2.0)*log(var) - prod1 - sum3 ); free(yTbar); return lik; } double null_plug(double *avNi, double av_ysq, double av_ybar, double var, double mu, int ncol, int nrow) { int i; double dnrow,lik,sum1; dnrow = (double)nrow; nrow = (int)nrow; sum1 = ( (dnrow*mu*(mu - 2.0*av_ybar)) + av_ysq ); lik = ( ((-dnrow/2.0)*log(2.0*M_PI)) - ((dnrow/2.0)*log(var)) - (sum1/(2.0*var)) ); return lik; } double null_lik(SIMX *trueX, double *y, double var, double mu, int nrow, int nbin) { int i,j,m; double lik,sum1,Nsamp; Nsamp = 0.0; sum1 = 0.0; for (i=0; i= nbin ) { sum1 = sum1 + (y[i] - mu)*(y[i] - mu); Nsamp = Nsamp + 1.0; } } lik = ( ((-Nsamp/2.0)*log(2.0*M_PI)) - ((Nsamp/2.0)*log(var)) - (sum1/(2.0*var)) ); return lik; } SIMX* drawX(XMAT *xmat, int ncol, int nrow, long *idum) { int i,j,m; double check,dncol,max,muNi,ran,sdNi,varNi; int *Xvec=NULL, *Ni=NULL; double *Xprob=NULL; SIMX *trueX=NULL; trueX = (SIMX*)calloc(1,sizeof(SIMX)); Xvec = (int*)calloc(nrow,sizeof(int)); Xprob = (double*)calloc(nrow,sizeof(double)); Ni = (int*)calloc(ncol,sizeof(int)); dncol = (double)ncol; ncol = (int)ncol; for (i=0; i (*xmat).cumX[i][m-1] ) { m++; } if (m > ncol) { Rprintf("hbrem drawX ERROR m = %i, ran = %f\n",m,ran); return(NULL); } Xvec[i] = m; Xprob[i] = (*xmat).X[i][m-1]; Ni[Xvec[i] - 1] = Ni[Xvec[i] - 1] + 1; } muNi = 0.0; for (i=0; i= nbin ) { yTbar[m] = yTbar[m] + y[i]; sum1 = sum1 + (y[i]*y[i]); sum6 = sum6 + y[i]; Nsamp = Nsamp + 1.0; } } sum6 = sum6/Nsamp; Ngen = 0.0; for (j=0; j= nbin ) { yTbar[j] = yTbar[j]/((double)(*trueX).Ni[j]); Ngen = Ngen + 1.0; } } kT = 0.0; inc = 1.0/dprc; stmax = -1000000.0; for (i=0; i= nbin ) { dn = (double)(*trueX).Ni[j]; deno = ( 1.0 - kT + (kT*dn) ); sum2 = sum2 + (dn/deno); prod1 = prod1 + log(deno); sum3 = sum3 + (dn*dn*(yTbar[j])*(yTbar[j]))/deno; sum4 = sum4 + (dn*(yTbar[j]))/deno; } } inter = ( sum1 - (kT*sum3) - ((1.0-kT)*(sum4*sum4/sum2)) ); sum2 = -log(sum2)/2.0; prod1 = -prod1/2.0; pow1 = ( Ngen - 1.0 )/2.0; pow2 = ( Nsamp - 1.0 )/2.0; kT_pdf[i] = ( (pow1*log(1-kT)) + sum2 + prod1 - (pow2*log(inter)) ); if (kT_pdf[i] > stmax) { stmax = kT_pdf[i]; } kT = kT + inc; } sum5 = 0.0; stmin = (stmax - 703.0); for (i=0; i (*kTdist).cdf[i] ) { i++; } if ( i > (prc + 1) ) { Rprintf("error in draw of kT\n"); return(0.0); } kT = ((double)i)/dprc; return kT; } double draw_knownvar(GRKT *kTdist, int *Ni, int ncol, double kT, double nu, int nbin) { int j; double chi,scale,var; double deno,dn,sum2,sum3,sum4; if ( kT == 1.0 ) { var = 0.0; } else { chi = rchisq(nu); sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; for (j=0; j= nbin ) { dn = (double)Ni[j]; deno = ( 1.0 - kT + kT*dn ); sum2 = sum2 + (dn/deno); sum3 = sum3 + (dn*dn*((*kTdist).yTbar[j])*((*kTdist).yTbar[j]))/deno; sum4 = sum4 + (dn*((*kTdist).yTbar[j]))/deno; } } // scale = rho squared multiplied by nu=degrees of freedom scale = ( ((*kTdist).sum_ysq)/(1.0 - kT) - (kT/(1.0 - kT))*sum3 - (sum4*sum4/sum2) ); var = scale/chi; } return var; } double draw_nullvar(SIMX *selX, double *y, int nrow, int nbin) { int j,m; double chi,nu,scale,var; double dnrow,sum2,sum3,Nsamp; dnrow = (double)nrow; nrow = (int)nrow; Nsamp = 0.0; sum2 = 0.0; sum3 = 0.0; for (j=0; j= nbin ) { sum2 = sum2 + (y[j]*y[j]); sum3 = sum3 + y[j]; Nsamp = Nsamp + 1.0; } } sum3 = sum3/Nsamp; nu = (Nsamp - 1.0); chi = rchisq(nu); // scale = rho squared multiplied by nu=degrees of freedom scale = ( sum2 - (Nsamp*sum3*sum3) ); var = scale/chi; return var; } double draw_knownmu(GRKT *kTdist, int *Ni, int ncol, double kT, double var, int nbin) { int j; double mu,nmean,nsigmasq,sigma; double deno,dn,sum2,sum4; sum2 = 0.0; sum4 = 0.0; for (j=0; j= nbin ) { dn = (double)Ni[j]; deno = ( 1.0 - kT + kT*dn ); sum2 = sum2 + (dn/deno); sum4 = sum4 + (dn*((*kTdist).yTbar[j]))/deno; } } nmean = sum4/sum2; nsigmasq = var/sum2; sigma = sqrt(nsigmasq); mu = rnorm(0.0, sigma); mu = mu + nmean; return mu; } double draw_nullmu(SIMX *selX, double *y, int nrow, double var, int nbin) { int j,m; double mu,nmean,nsigmasq,sigma; double dnrow,sum2,Nsamp; dnrow = (double)nrow; nrow = (int)nrow; sum2 = 0.0; Nsamp = 0.0; for (j=0; j= nbin ) { sum2 = sum2 + y[j]; Nsamp = Nsamp + 1.0; } } sum2 = sum2/Nsamp; nmean = sum2; nsigmasq = var/Nsamp; sigma = sqrt(nsigmasq); mu = rnorm(0.0, sigma); mu = mu + nmean; return mu; } double draw_knownTi(GRKT *kTdist, int *Ni, double kT, double var, double mu, int nbin, int index) { double dn,deno; double nmean,nsigmasq,sigma,ti; ti = 0.0; if ( (int)Ni[index] >= nbin ) { dn = (double)Ni[index]; deno = ( 1.0 - kT + kT*dn ); nmean = (kT*dn*((*kTdist).yTbar[index] - mu))/deno; nsigmasq = (kT*(1.0 - kT)*var)/deno; sigma = sqrt(nsigmasq); ti = rnorm(0.0, sigma); ti = ti + nmean; } return ti; }