GSA/0000755000175100001440000000000013424630013010676 5ustar hornikusersGSA/NAMESPACE0000644000175100001440000000071313424622723012127 0ustar hornikusersexport(GSA, GSA.correlate, GSA.func, GSA.listsets, GSA.make.features, GSA.plot, GSA.genescores, GSA.xl.plot, GSA.xl.correlate, GSA.xl.summary.genesets, GSA.xl.genescores, GSA, GSA.read.gmt, print.GSA.func, print.GSA, summary.GSA.genesets) S3method(print, GSA) S3method(print, GSA.func) S3method(summary, GSA.genesets) importFrom("graphics", "legend", "plot", "points") importFrom("stats", "cor", "mad", "pt", "qnorm", "quantile", "runif", "var") GSA/R/0000755000175100001440000000000011320430575011103 5ustar hornikusersGSA/R/GSA.R0000644000175100001440000001332110743770151011645 0ustar hornikusersGSA=function(x,y, genesets, genenames, method=c("maxmean","mean","absmean"), resp.type=c("Quantitative","Two class unpaired","Survival","Multiclass", "Two class paired", "tCorr", "taCorr"), censoring.status=NULL,random.seed=NULL, knn.neighbors=10, s0=NULL, s0.perc=NULL,minsize=15,maxsize=500, restand=TRUE,restand.basis=c("catalog","data"), nperms=200, xl.mode=c("regular","firsttime","next20","lasttime"), xl.time=NULL, xl.prevfit=NULL){ # # computes feature set scores for a single set of data this.call=match.call() method <- match.arg(method) resp.type=match.arg(resp.type) xl.mode=match.arg(xl.mode) restand.basis=match.arg(restand.basis) fdr.lo=NULL fdr.hi=NULL pvalues.lo=NULL pvalues.hi=NULL if(!is.null(random.seed)){ set.seed(random.seed) } if(xl.mode=="regular" | xl.mode=="firsttime"){ if(sum(is.na(x))>0){ require(impute) x=impute.knn(x,k=knn.neighbors) } #Error check: make sure that genenames and genesets have reasonable overlap temp=match(unlist(genesets),genenames) if(sum(!is.na(temp))/length(temp) < .05){ stop("Fewer than 5% of genes in the genesets appear in the dataset. 
Make sure that gene identifiers in dataset are Gene symbols") } junk=GSA.func(x,y,genesets=genesets, genenames=genenames, method=method, resp.type=resp.type, censoring.status=censoring.status, s0=s0,s0.perc=s0.perc, minsize=minsize,maxsize=maxsize, restand=restand, restand.basis= restand.basis) r.obs=junk$score stand.info=junk$stand.info gene.scores=junk$gene.scores s0=junk$s0 s0.perc=junk$s0.perc r.star=NULL k=length(genesets) o=unlist(lapply(genesets,length)) gs.ind=(1:k)[o>= minsize &o<= maxsize] catalog=NULL ngenes=rep(NA,length(gs.ind)) gs.mat=matrix(nrow(x)+1,nrow=length(gs.ind),ncol=maxsize) ii=0 for(i in gs.ind){ ii=ii+1 gene.set=match(genesets[[i]],genenames) gene.set=gene.set[!is.na(gene.set)] catalog=c(catalog,gene.set) if(length(gene.set)>0){ gs.mat[ii,1:length(gene.set)]=gene.set ngenes[ii]=length(gene.set) } } catalog.unique=unique(catalog) # initialize for xl.iterations r.star=matrix(0,nrow=length(genesets),ncol=nperms) stand.info.star=matrix(0,nrow=8,ncol=nperms) dimnames(stand.info.star)=list(c("mean.all","mean.abs","sd.all" , "sd.abs" , "mean.pos","sd.pos", "mean.neg", "sd.neg"),NULL) fdr.lo=NULL fdr.hi=NULL GSA.scores=NULL GSA.scores.perm=NULL pvalues.lo=NULL pvalues.hi=NULL first.time=TRUE } if(xl.mode=="next20" | xl.mode=="lasttime"){ # get stuff from prevfit GSA.scores=NULL GSA.scores.perm=NULL x=xl.prevfit$x y=xl.prevfit$y genesets=xl.prevfit$genesets genenames=xl.prevfit$genenames r.obs=xl.prevfit$r.obs r.star=xl.prevfit$r.star stand.info.star=xl.prevfit$stand.info.star gs.mat=xl.prevfit$gs.mat gs.ind=xl.prevfit$gs.ind catalog=xl.prevfit$catalog catalog.unique=xl.prevfit$catalog.unique ngenes=xl.prevfit$ngenes nperms=xl.prevfit$nperms stand.info=xl.prevfit$stand.info gene.scores=xl.prevfit$gene.scores s0=xl.prevfit$s0 s0.perc=xl.prevfit$s0.perc } if(xl.mode=="regular"){ first=1;last=nperms } if(xl.mode=="firsttime"){ first=1;last=1 } if(xl.mode=="next20"){ first=xl.time; last= min(xl.time+9, nperms-1) } if(xl.mode=="lasttime"){ 
first=nperms;last=nperms } for(i in first:last){ if(i%%10==0){ cat(c("perm=",i,paste("/",as.character(nperms)),sep=""),fill=T) } if(resp.type!="Two class paired"){oo=sample(1:length(y))} if(resp.type=="Two class paired"){oo=paired.perm(y)} junk=GSA.func(x[,oo],y,genesets=genesets, genenames=genenames, first.time=FALSE, return.gene.ind=FALSE, gs.mat=gs.mat, ngenes=ngenes,gs.ind=gs.ind, catalog=catalog,catalog.unique=catalog.unique, method=method, resp.type=resp.type, censoring.status=censoring.status, s0=s0, s0.perc=s0.perc,minsize=minsize,maxsize=maxsize, restand=restand) r.star[,i]=junk$score if (restand==TRUE) { stand.info.star[, i] = unlist(junk$stand.info) } else stand.info.star[, i] = NA } if(xl.mode=="regular" | xl.mode=="lasttime"){ k=length(genesets) pvalues.hi=rep(NA,k) pvalues.lo=rep(NA,k) for(i in gs.ind){ pvalues.hi[i]=sum(r.star[i,]>r.obs[i])/nperms pvalues.lo[i]=sum(r.star[i,](length(x$scores)-10) oo1=order(-x$scores[o1]) geneset.names=x$geneset.names if(is.null(geneset.names)){geneset.names=as.character(1:length(x$scores))} mat1=cbind((1:length(x$scores))[o1],x$geneset.names[o1], round(x$scores[o1],4))[oo1,] o2=rank(x$scores)<10 oo2=order(x$scores[o2]) geneset.names=x$geneset.names if(is.null(geneset.names)){geneset.names=1:length(x$scores)} mat2=cbind((1:length(x$scores))[o2],x$geneset.names[o2], round(x$scores[o2],4))[oo2,] print(mat1, quote = FALSE) print("") print("") print(mat2, quote = FALSE) invisible() } GSA/R/GSA.read.gmt.R0000644000175100001440000000236610520705102013337 0ustar hornikusers GSA.read.gmt=function(filename){ # ## Read in and parse a gmt file (gene set file) from the Broad institute # this is tricky, because each lines (geneset) has a variable length # I read the file twice, first to pick up the geneset name and description # in the first two columns, then I read it all in as a long string # The beginning and end of each gene set in the string # is determined by matching # BOTH on geneset name and description (since geneset 
names sometimes # occur as genenames elsewhere in the file) a=scan(filename,what=list("",""),sep="\t", quote=NULL, fill=T, flush=T,multi.line=F) geneset.names=a[1][[1]] geneset.descriptions=a[2][[1]] dd=scan(filename,what="",sep="\t", quote=NULL) nn=length(geneset.names) n=length(dd) ox=rep(NA,nn) ii=1 for(i in 1:nn){ cat(i) while((dd[ii]!=geneset.names[i]) | (dd[ii+1]!=geneset.descriptions[i]) ){ ii=ii+1 } ox[i]=ii ii=ii+1 } genesets=vector("list",nn) for(i in 1:(nn-1)){ cat(i,fill=T) i1=ox[i]+2 i2=ox[i+1]-1 geneset.descriptions[i]=dd[ox[i]+1] genesets[[i]]=dd[i1:i2] } geneset.descriptions[nn]=dd[ox[nn]+1] genesets[[nn]]=dd[(ox[nn]+2):n] out=list(genesets=genesets,geneset.names=geneset.names, geneset.descriptions=geneset.descriptions) class(out)="GSA.genesets" return(out) } GSA/R/GSA.make.features.R0000644000175100001440000000071310475577706014414 0ustar hornikusersGSA.make.features=function(GSA.func.obj, x, genesets, genenames){ np=length(GSA.func.obj$gene.ind) xs=t(scale(t(x),center=GSA.func.obj$mean,scale=GSA.func.obj$sd)) val=matrix(NA,nrow=np,ncol=ncol(x)) for(i in 1:np){ if(!is.null(GSA.func.obj$gene.ind[[i]])){ gene.set=match(genesets[[i]],genenames) gene.set=gene.set[!is.na(gene.set)] geneind=gene.set[GSA.func.obj$gene.ind[[i]]] val[i,]=(colSums(xs[geneind,,drop=F])/length(gene.set)) } } return(val) } GSA/R/GSA.func.R0000644000175100001440000001417711320173540012600 0ustar hornikusersGSA.func=function(x,y, genesets, genenames, geneset.names=NULL, method=c("maxmean","mean","absmean"), resp.type=c("Quantitative","Two class unpaired","Survival","Multiclass", "Two class paired", "tCorr", "taCorr"), censoring.status=NULL, first.time=TRUE, return.gene.ind=TRUE, ngenes=NULL, gs.mat=NULL, gs.ind=NULL, catalog=NULL, catalog.unique=NULL, s0=NULL, s0.perc=NULL, minsize=15,maxsize=500, restand=TRUE, restand.basis=c("catalog","data")){ # # computes gene set scores for a single set of data this.call=match.call() method <- match.arg(method) resp.type <- 
match.arg(resp.type) restand.basis <- match.arg(restand.basis) BIG=10e9 me=rowMeans(x) if(resp.type=="tCorr" | resp.type=="taCorr"){s0=0} if(first.time){ k=length(genesets) o=unlist(lapply(genesets,length)) gs.ind=(1:k)[o>= minsize &o<= maxsize] ngenes=rep(NA,length(gs.ind)) # note that gs.mat is initialized with the value nrow(x)+1; this is # a trick: see note below catalog=NULL gs.mat=matrix(nrow(x)+1,nrow=length(gs.ind),ncol=maxsize) ii=0 for(i in gs.ind){ ii=ii+1 gene.set=match(genesets[[i]],genenames) gene.set=gene.set[!is.na(gene.set)] catalog=c(catalog,gene.set) if(length(gene.set)>0){ gs.mat[ii,1:length(gene.set)]=gene.set ngenes[ii]=length(gene.set) } } catalog.unique=unique(catalog) gs.mat=gs.mat[,1:max(ngenes, na.rm=TRUE)] } # estimate s0 if necessary if(is.null(s0)){ if(!is.null(s0.perc)){ if((s0.perc != -1 & s0.perc < 0) | s0.perc > 100){ stop("Illegal value for s0.perc: must be between 0 and 100, or equal to (-1) (meaning that s0 should be set to zero)") } if(s0.perc== -1){s0=0} } initflag=FALSE if(is.null(s0.perc)){initflag=TRUE} else if(s0.perc>=0){initflag=TRUE} if(initflag){ if(resp.type=="Quantitative"){ init.fit=quantitative.func(x[catalog.unique,],y,s0=0) } if(resp.type=="Two class unpaired"){ init.fit=ttest.func(x[catalog.unique,],y,s0=0) } if(resp.type=="Survival"){ init.fit=cox.func(x[catalog.unique,],y,censoring.status,s0=0) } if(resp.type=="Multiclass"){ init.fit=multiclass.func(x[catalog.unique,],y,s0=0) } if(resp.type=="Two class paired"){ init.fit=paired.ttest.func(x[catalog.unique,],y,s0=0) } } if(is.null(s0.perc)){ s0=est.s0(init.fit$tt,init.fit$sd)$s0.hat s0.perc=100*sum(init.fit$sd0)) sd.pos=sqrt(var(tt[catalog]*(tt[catalog]>0))) mean.neg=-mean(tt[catalog]*(tt[catalog]<0)) sd.neg=sqrt(var(tt[catalog]*(tt[catalog]<0))) } if(restand.basis=="data"){ mean.all = mean(tt) sd.all = sqrt(var(tt)) mean.abs = mean(abs(tt)) sd.abs = sqrt(var(abs(tt))) mean.pos = mean(tt * (tt > 0)) sd.pos = sqrt(var(tt * (tt > 0))) mean.neg = -mean(tt * (tt < 
0)) sd.neg = sqrt(var(tt * (tt < 0))) } } stand.info=list(mean.all=mean.all, mean.abs=mean.abs, sd.all=sd.all, sd.abs=sd.abs, mean.pos=mean.pos, sd.pos=sd.pos, mean.neg=mean.neg, sd.neg=sd.neg ) # Note: this is a trick: I artificially tack a zero on the end of tt, # so that tt2[gs.mat] has the same dim as gs.mat, with zeroes filling # in the end of each row tt2=c(tt,0) ttt=matrix(tt2[gs.mat],nrow=nrow(gs.mat)) if(method=="maxmean"){ #rpos=rowSums(pmax(ttt,0))/ngenes #rneg=-1*rowSums(pmin(ttt,0))/ngenes s2=abs(ttt) rpos=rowSums((ttt+s2)/2)/ngenes rneg=rowSums((s2-ttt)/2)/ngenes rpos[is.na(rpos)]=0 rneg[is.na(rneg)]=0 if(restand ){rpos=(rpos- mean.pos)/(sd.pos)} if(restand & resp.type!= "Multiclass" & resp.type!= "taCorr"){rneg=(rneg- mean.neg)/(sd.neg)} rr=pmax(rpos,rneg) rr[rneg>rpos]=-1*rr[rneg>rpos] } gene.ind=NULL if(method=="mean"){ rr=rowSums(ttt)/ngenes if(restand){rr=(rr-mean.all)/(sd.all/sqrt(ngenes))} } if(method=="absmean"){ rr=rowSums(abs(ttt))/ngenes if(restand){rr=(rr-mean.all)/(sd.all/sqrt(ngenes))} } if(return.gene.ind){ gene.ind=vector("list",length(gs.ind)) ii=0 for(i in gs.ind){ ii=ii+1 gene.set=match(genesets[[i]],genenames) gene.set=gene.set[!is.na(gene.set)] if(rr[ii]>0){ gene.ind[[i]]=(1:length(gene.set))[tt[gene.set]>0]} if(rr[ii]<0){ gene.ind[[i]]=(1:length(gene.set))[tt[gene.set]<0]} }} rrr=rep(NA,length(genesets)) rrr[gs.ind]=rr rrt=-qnorm(1-pt(rrr, df=length(y)-2)) out=list(scores=rrr, norm.scores=rrt, mean=me, sd=s, gene.ind=gene.ind, geneset.names=geneset.names, gene.scores=gene.scores,s0=s0,s0.perc=s0.perc, stand.info=stand.info, method=method,call=this.call) class(out)="GSA.func" out } GSA/R/print.GSA.R0000644000175100001440000000035610475577706013021 0ustar hornikusersprint.GSA <- function(x, ...) 
{ cat("Call:\n") dput(x$call) mat1=x$fdr.lo print("") print("Negative") print(mat1, quote = FALSE) mat2=x$fdr.hi print("") print("") print("Positive") print(mat2, quote = FALSE) invisible() } GSA/R/GSA.xlfuns.R0000644000175100001440000000337410555323173013171 0ustar hornikusers GSA.xl.plot=function(GSA.obj, fac=.10, FDRcut=1){ eps=.001 a=GSA.listsets(GSA.obj, geneset.names=NULL, FDRcut=FDRcut) neg=matrix(as.numeric(as.character(a$negative[,-2])),ncol=4)+eps pos=matrix(as.numeric(as.character(a$positive[,-2])),ncol=4)+eps ymax=max(c(neg[,4],pos[,4])) xmax=max(c(neg[,3],pos[,3])) ymin=min(c(neg[,4],pos[,4])) xmin=min(c(neg[,3],pos[,3])) o1=order(neg[,3]) o2=order(pos[,3]) amount1=runif(nrow(neg),min=-fac*neg[o1,3],max=fac*neg[o1,3]) amount2=runif(nrow(pos),min=-fac*pos[o2,3],max=fac*pos[o2,3]) return(list(xneg=neg[o1,3]+amount1,yneg=neg[o1,4],xpos=pos[o2,3]+amount2,ypos=pos[o2,4])) } GSA.xl.summary.genesets=function(GSA.genesets.obj){ nsets=length(GSA.genesets.obj$genesets) ngenes=unlist(lapply(GSA.genesets.obj$genesets,length)) allgenes=unlist(GSA.genesets.obj$genesets) tt=table(ngenes,dnn=NULL) if(length(tt)>20){ qq=round(quantile(ngenes,(0:20)/20),0) tt=table(cut(ngenes,breaks=unique(qq)),dnn=NULL) } tab.ngenes=matrix(tt, nrow=1) dimnames(tab.ngenes)[[2]]=names(tt) return(list(nsets=nsets, totgenes=sum(ngenes), totunique.genes=length(unique(allgenes)), tab.ngenes=tab.ngenes)) } GSA.xl.correlate=function(GSA.genesets.obj, genenames){ nsets=length(GSA.genesets.obj$genesets) ngenes=unlist(lapply(GSA.genesets.obj$genesets,length)) allgenes=unlist(GSA.genesets.obj$genesets) sets.in.exp=match(unique(allgenes),genenames) exp.in.sets=match(genenames,allgenes) nn=rep(NA,nsets) for(i in 1:nsets){ nn[i]=sum(!is.na(match(GSA.genesets.obj$genesets[[i]],genenames))) } qq=quantile(nn/ngenes, seq(0,1,by=.1)) quant=matrix(qq,nrow=1) dimnames(quant)[[2]]=names(qq) return(list(totgenes=length(genenames),tot.unique.genes=length(unique(genenames)), num.both=sum(!is.na(sets.in.exp)), 
quant=quant)) } GSA/R/GSA.xl.genescores.R0000644000175100001440000000071510502710644014420 0ustar hornikusersGSA.xl.genescores=function(GSA.listsets.obj, genesets, GSA.obj, genenames){ o1=as.numeric(GSA.listsets.obj$pos[,1]) pos=vector("list",length(o1)) ii=0 for(i in o1){ ii=ii+1 pos[[ii]]=GSA.genescores(i,genesets, GSA.obj, genenames) } o2=as.numeric(GSA.listsets.obj$neg[,1]) neg=vector("list",length(o2)) ii=0 for(i in o2){ ii=ii+1 neg[[ii]]=GSA.genescores(i,genesets, GSA.obj, genenames, negfirst=TRUE) } return(list(posi=o1, negi=o2,pos=pos,neg=neg)) } GSA/R/GSA.plot.R0000644000175100001440000000157010500053657012621 0ustar hornikusersGSA.plot=function(GSA.obj, fac=1, FDRcut=1){ eps=.001 a=GSA.listsets(GSA.obj, geneset.names=NULL, FDRcut=FDRcut) neg=matrix(as.numeric(as.character(a$negative[,-2])),ncol=4)+eps pos=matrix(as.numeric(as.character(a$positive[,-2])),ncol=4)+eps ymax=max(c(neg[,4],pos[,4])) xmax=max(c(neg[,3],pos[,3])) ymin=min(c(neg[,4],pos[,4])) xmin=min(c(neg[,3],pos[,3])) o1=order(neg[,3]) o2=order(pos[,3]) plot(jitter(neg[o1,3],factor=fac),jitter(neg[o1,4],factor=fac),xlab="p-value",ylab="False discovery rate", type="n",log="xy", xlim=c(xmin,xmax), ylim=c(ymin,ymax)) points(jitter(neg[o1,3],factor=fac),jitter(neg[o1,4],factor=fac),col=3,cex=.7,type="b") points(jitter(pos[o2,3],factor=fac),jitter(pos[o2,4],factor=fac),col=4, cex=.7,type="b") #axis(3, at = res[,3], lab = paste(round(res[,3],0)), srt = 90, adj = 0,cex=.5) legend(.1,.2,c("Negative","Positive"),col=3:4, pch=c("o","o")) } GSA/R/GSA.morefuns.R0000644000175100001440000001511010701042001013454 0ustar hornikusersquantitative.func <- function(x,y,s0=0){ # regression of x on y my=mean(y) yy <- y-my temp <- x%*%yy mx=rowMeans(x) syy= sum(yy^2) scor <- temp/syy b0hat <- mx-scor*my xhat <- matrix(b0hat,nrow=nrow(x),ncol=ncol(x))+y*matrix(scor,nrow=nrow(x),ncol=ncol(x)) sigma <- sqrt(rowSums((x-xhat)^2)/(ncol(xhat)-2)) sd <- sigma/sqrt(syy) tt <- scor/(sd+s0) return(list(tt=tt, numer=scor, 
sd=sd)) } tCorr.func <- function(x,y,s0=0){ #simple correlation (Fisher trans) corr=cor(t(x),y) scor=.5*log((1+corr)/(1-corr)) sd=rep(1,length(scor)) tt=scor return(list(tt=tt, numer=scor, sd=sd)) } taCorr.func <- function(x,y,s0=0){ #simple abs correlation (Fisher trans) corr=abs(cor(t(x),y)) scor=.5*log((1+corr)/(1-corr)) sd=rep(1,length(scor)) tt=scor return(list(tt=tt, numer=scor, sd=sd)) } ttest.func <- function(x,y,s0=0, sd=NULL){ n1 <- sum(y==1) n2 <- sum(y==2) p <- nrow(x) m1 <- rowMeans(x[,y==1,drop=F]) m2 <- rowMeans(x[,y==2,drop=F]) if(is.null(sd)){ sd <- sqrt( ((n2-1) * varr(x[, y==2], meanx=m2) + (n1-1) * varr(x[, y==1], meanx=m1) )*(1/n1+1/n2)/(n1+n2-2) ) } numer <- m2 - m1 dif.obs <- (numer)/(sd + s0) return(list(tt=dif.obs,numer=numer, sd=sd)) } varr <- function(x, meanx=NULL){ n <- ncol(x) p <- nrow(x) Y <-matrix(1,nrow=n,ncol=1) if(is.null(meanx)){ meanx <- rowMeans(x)} ans<- rep(1, p) xdif <- x - meanx %*% t(Y) ans <- (xdif^2) %*% rep(1/(n - 1), n) ans <- drop(ans) return(ans) } paired.ttest.func <- function(x,y,s0=0, sd=NULL){ nc <- ncol(x)/2 o <- 1:nc o1 <- rep(0,ncol(x)/2);o2 <- o1 for(j in 1:nc){o1[j] <- (1:ncol(x))[y==-o[j]]} for(j in 1:nc){o2[j] <- (1:ncol(x))[y==o[j]]} d <- x[,o2,drop=F]-x[,o1,drop=F] su <- x[,o2,drop=F]+x[,o1,drop=F] if(is.matrix(d)){ m <- rowMeans(d) } if(!is.matrix(d)) {m <- mean(d)} if(is.null(sd)){ if(is.matrix(d)){ sd <- sqrt(varr(d, meanx=m)/nc)} if(!is.matrix(d)){sd <- sqrt(var(d)/nc)} } dif.obs <- m/(sd+s0) return(list(tt=dif.obs, numer=m, sd=sd)) } cox.func <- function(x,y,censoring.status,s0=0){ scor <- coxscor(x,y, censoring.status)$scor sd <- sqrt(coxvar(x,y, censoring.status)) tt <- scor/(sd+s0) return(list(tt=tt, numer=scor, sd=sd)) } coxscor <- function(x, y, ic, offset = rep(0., length(y))) { ## computes cox scor function for rows of nx by n matrix x ## first put everything in time order n <- length(y) nx <- nrow(x) yy <- y + (ic == 0.) 
* (1e-05) otag <- order(yy) y <- y[otag] ic <- ic[otag] x <- x[, otag, drop = F] ##compute unique failure times, d=# of deaths at each failure time, ##dd= expanded version of d to length n, s=sum of covariates at each ## failure time, nn=#obs in each risk set, nno=sum(exp(offset)) at each failure time offset <- offset[otag] a <- coxstuff(x, y, ic, offset = offset) nf <- a$nf fail.times <- a$fail.times s <- a$s d <- a$d dd <- a$dd nn <- a$nn nno <- a$nno w <- rep(0., nx) for(i in (1.:nf)) { w <- w + s[, i] oo<- (1.:n)[y >= fail.times[i]] r<-rowSums(x[, oo, drop = F] * exp(offset[oo])) w<- w - (d[i]/nno[i])*r } return(list(scor = w, coxstuff.obj = a)) } coxvar <- function(x, y, ic, offset = rep(0., length(y)), coxstuff.obj = NULL){ ## computes information elements (var) for cox ## x is nx by n matrix of expression values nx <- nrow(x) n <- length(y) yy <- y + (ic == 0.) * (1e-06) otag <- order(yy) y <- y[otag] ic <- ic[otag] x <- x[, otag, drop = F] offset <- offset[otag] if(is.null(coxstuff.obj)) { coxstuff.obj <- coxstuff(x, y, ic, offset = offset) } nf <- coxstuff.obj$nf fail.times <- coxstuff.obj$fail.times s <- coxstuff.obj$s d <- coxstuff.obj$d dd <- coxstuff.obj$dd nn <- coxstuff.obj$nn nno <- coxstuff.obj$nno x2<- x^2 oo <- (1.:n)[y >= fail.times[1] ] sx<-(1/nno[1])*rowSums(x[, oo] * exp(offset[oo])) s<-(1/nno[1])*rowSums(x2[, oo] * exp(offset[oo])) w <- d[1] * (s - sx * sx) for(i in 2.:nf) { oo <- (1.:n)[y >= fail.times[i-1] & y < fail.times[i] ] sx<-(1/nno[i])*(nno[i-1]*sx-rowSums(x[, oo,drop=F] * exp(offset[oo]))) s<-(1/nno[i])*(nno[i-1]*s-rowSums(x2[, oo,drop=F] * exp(offset[oo]))) w <- w + d[i] * (s - sx * sx) } return(w) } coxstuff<- function(x, y, ic, offset = rep(0., length(y))) { fail.times <- unique(y[ic == 1.]) nf <- length(fail.times) n <- length(y) nn <- rep(0., nf) nno <- rep(0., nf) for(i in 1.:nf) { nn[i] <- sum(y >= fail.times[i]) nno[i] <- sum(exp(offset)[y >= fail.times[i]]) } s <- matrix(0., ncol = nf, nrow = nrow(x)) d <- rep(0., nf) 
##expand d out to a vector of length n for(i in 1.:nf) { o <- (1.:n)[(y == fail.times[i]) & (ic == 1.)] d[i] <- length(o) } oo <- match(y, fail.times) oo[ic==0]<-NA oo[is.na(oo)]<- max(oo[!is.na(oo)])+1 s<-t(rowsum(t(x),oo)) if(ncol(s)> nf){s<-s[,-ncol(s)]} dd <- rep(0., n) for(j in 1.:nf) { dd[(y == fail.times[j]) & (ic == 1.)] <- d[j] } return(list(fail.times=fail.times, s=s, d=d, dd=dd, nf=nf, nn=nn, nno=nno)) } multiclass.func <- function(x,y,s0=0){ ##assumes y is coded 1,2... nn <- table(y) m <- matrix(0,nrow=nrow(x),ncol=length(nn)) v <- m for(j in 1:length(nn)){ m[,j] <- rowMeans(x[,y==j]) v[,j] <- (nn[j]-1)*varr(x[,y==j], meanx=m[,j]) } mbar <- rowMeans(x) mm <- m-matrix(mbar,nrow=length(mbar),ncol=length(nn)) fac <- (sum(nn)/prod(nn)) scor <- sqrt(fac*(apply(matrix(nn,nrow=nrow(m),ncol=ncol(m),byrow=TRUE)*mm*mm,1,sum))) sd <- sqrt(rowSums(v)*(1/sum(nn-1))*sum(1/nn)) tt <- scor/(sd+s0) mm.stand=t(scale(t(mm),center=FALSE,scale=sd)) return(list(tt=tt, numer=scor, sd=sd,stand.contrasts=mm.stand)) } est.s0<-function(tt,sd,s0.perc=seq(0,1, by=.05)){ ## estimate s0 (exchangeability) factor for denominator. 
## returns the actual estimate s0 (not a percentile) br=unique(quantile(sd,seq(0,1,len=101))) nbr=length(br) a<-cut(sd,br,labels=F) a[is.na(a)]<-1 cv.sd<-rep(0,length(s0.perc)) for(j in 1:length(s0.perc)){ w<-quantile(sd,s0.perc[j]) w[j==1]<-0 tt2<-tt*sd/(sd+w) tt2[tt2==Inf]=NA sds<-rep(0,nbr-1) for(i in 1:(nbr-1)){ sds[i]<-mad(tt2[a==i], na.rm=TRUE) } cv.sd[j]<-sqrt(var(sds))/mean(sds) } o=(1:length(s0.perc))[cv.sd==min(cv.sd)] # we don;t allow taking s0.hat to be 0th percentile when min sd is 0 s0.hat=quantile(sd[sd!=0],s0.perc[o]) return(list(s0.perc=s0.perc,cv.sd=cv.sd, s0.hat= s0.hat)) } paired.perm=function(y){ n=max(abs(y)) nn=2*n res=1:nn for(i in 1:n){ o=(1:nn)[abs(y)==i] u=runif(1) if(u>.5){ temp=res[o[1]] res[o[1]]=res[o[2]] res[o[2]]=temp }} return(res) } GSA/R/GSA.listsets.R0000644000175100001440000000601610743771542013526 0ustar hornikusers GSA.listsets= function (GSA.obj, geneset.names = NULL, maxchar = 20, FDRcut = 0.2) { if(is.null(geneset.names)){ geneset.names=rep("xxxxxx", length(GSA.obj$GSA.scores)) } negflag= !(GSA.obj$resp.type=="Multiclass" | GSA.obj$resp.type=="taCorr") r = GSA.obj$GSA.scores rstar = GSA.obj$GSA.scores.perm r[is.na(r)] = 0 rstar[is.na(rstar)] = 0 nperms = ncol(GSA.obj$GSA.scores.perm) np = length(r) geneset.names = substring(geneset.names, 1, maxchar) pvalues.lo = GSA.obj$pvalues.lo pvalues.hi = GSA.obj$pvalues.hi m = sum(!is.na(pvalues.hi)) make.monotone.increasing = function(x) { n = length(x) if (n==0) return(NULL) ## added to prevent error when x is NULL if (n==1) x[1] <- x[1] ## added to prevent error when length of x =1 else { for (i in n:2) { if (x[i - 1] > x[i]) { x[i - 1] = x[i] } } } return(x) } oo = (1:length(r))[!is.na(pvalues.hi)] fdr.lo = fdr.hi = rep(NA, length(r)) for (i in oo) { if(negflag){ fdr.lo[i] = round(m * pvalues.lo[i]/sum(pvalues.lo[!is.na(pvalues.lo)] <= pvalues.lo[i]), 4) } fdr.hi[i] = round(m * pvalues.hi[i]/sum(pvalues.hi[!is.na(pvalues.hi)] <= pvalues.hi[i]), 4) } fdr.lo=pmin(fdr.lo,1) 
fdr.hi=pmax(fdr.hi,0) res1=NULL if(negflag){ oo1 = (1:length(r))[r < 0] res1 = NULL for (i in oo1) { res1 = rbind(res1, c(i, geneset.names[i], round(GSA.obj$GSA.scores[i], 4), pvalues.lo[i], fdr.lo[i])) } if (!is.null(res1)) o1 = order(res1[, 4],decreasing = FALSE) else o1 = NULL res1 = res1[o1, , drop = F] res1[, 5] = make.monotone.increasing(as.numeric(res1[, 5])) } oo2 = (1:length(r))[r > 0] res2 = NULL for (i in oo2) { res2 = rbind(res2, c(i, geneset.names[i], round(GSA.obj$GSA.scores[i], 4), pvalues.hi[i], fdr.hi[i])) } if (!is.null(res2)) o2 = order(res2[, 4], decreasing = FALSE) else o2 = NULL res2 = res2[o2, , drop = F] res2[, 5] = make.monotone.increasing(as.numeric(res2[, 5])) if (length(res1) == 0) { res1 = NULL } if (length(res2) == 0) { res2 = NULL } if ( (length(res1) > 0) & negflag) { dimnames(res1) = list(NULL, c("Gene_set", "Gene_set_name", "Score", "p-value", "FDR")) } if (length(res2) > 0) { dimnames(res2) = list(NULL, c("Gene_set", "Gene_set_name", "Score", "p-value", "FDR")) } nsets.neg=NULL if(negflag){ res1 = res1[as.numeric(res1[, 5]) <= FDRcut,,drop=FALSE ] nsets.neg = nrow(res1) if (is.null(res1)) { nsets.neg = 0 }} res2 = res2[as.numeric(res2[, 5]) <= FDRcut,,drop=FALSE ] nsets.pos = nrow(res2) if (is.null(res2)) { nsets.pos = 0 } return(list(FDRcut = FDRcut, negative = res1, positive = res2, nsets.neg = nsets.neg, nsets.pos = nsets.pos)) } GSA/MD50000644000175100001440000000245213424630013011211 0ustar hornikusersf844ecdc0e2b9171b15e98e10e166eb6 *DESCRIPTION 326ae6f64e49f24337924caff4a92ce7 *INDEX 092b21ac094ff5e3851c33080dffe45b *NAMESPACE 9b2ddc26975fbb4673f6fea56c974cdf *R/GSA.R 9c330ae3d36b62aab20f312ca15ce3ac *R/GSA.correlate.R a130b42fa11d187d9219b85ada32b305 *R/GSA.func.R 3d01ad310bdfc9376a00f28b812543b7 *R/GSA.genescores.R a3d6e3d2dbba61da386e695cf2b1fc4b *R/GSA.listsets.R 212e8016b2e2450827fcf74aa88aca37 *R/GSA.make.features.R f89489225341999403b3437f93673d42 *R/GSA.morefuns.R c712b967331e172b8b45ce9ae740bf39 *R/GSA.plot.R 
30381dafac856a982e31415902714111 *R/GSA.read.gmt.R 06583ca42c9e3f8c37edbf469f314614 *R/GSA.xl.genescores.R 9bbdd3af4e95ff9c857e4095916293d4 *R/GSA.xlfuns.R 41919ec8c136f0b19eac1bdf7a8bd9f9 *R/print.GSA.R 88dc872329261284232a7beee7ddfff1 *R/print.GSA.func.R 6bcb3a41d445b8989b1ffe967630ef3b *R/summary.GSA.genesets.R 0953817f96fced339f891b94f52ef1b2 *man/GSA.Rd 9109392920601e5864d6cf3204ad6a0d *man/GSA.correlate.Rd 98ff04a3d704ea26c7cd47a0ffb239c2 *man/GSA.func.Rd 20703739db06fbb15b5bd55bf99500f6 *man/GSA.genescores.Rd 566e123bf40ffb69eb1cd7483f1517d5 *man/GSA.internal.Rd 10bd811cd05fb5f1af634bdb6a1f70a1 *man/GSA.listsets.Rd 500e59c3f0859bb15d3b5a77eaea442b *man/GSA.make.features.Rd a816845e5069a7cc1928ccd4cae71a85 *man/GSA.plot.Rd 5cec175fbd481dfc8e5e988ba254f45b *man/GSA.read.gmt.Rd GSA/DESCRIPTION0000755000175100001440000000056413424630013012414 0ustar hornikusersPackage: GSA Title: Gene Set Analysis Version: 1.03.1 Author: Brad Efron and R. Tibshirani Description: Gene Set Analysis. Maintainer: Rob Tibshirani Suggests: impute License: LGPL URL: http://www-stat.stanford.edu/~tibs/GSA Packaged: 2019-01-31 16:56:00 UTC; hornik Repository: CRAN Date/Publication: 2019-01-31 17:27:39 UTC NeedsCompilation: no GSA/man/0000755000175100001440000000000011320430575011455 5ustar hornikusersGSA/man/GSA.func.Rd0000644000175100001440000001157311320174445013320 0ustar hornikusers\name{GSA.func} \alias{GSA.func} \title{Gene set analysis without permutations} \description{ Determines the significance of pre-defined sets of genes with respect to an outcome variable, such as a group indicator, quantitative variable or survival time. This is the basic function called by GSA. 
} \usage{ GSA.func(x,y, genesets, genenames,geneset.names=NULL, method=c("maxmean","mean","absmean"), resp.type=c("Quantitative", "Two class unpaired","Survival","Multiclass", "Two class paired", "tCorr", "taCorr" ), censoring.status=NULL, first.time = TRUE, return.gene.ind = TRUE, ngenes = NULL, gs.mat =NULL, gs.ind = NULL, catalog = NULL, catalog.unique =NULL, s0 = NULL, s0.perc = NULL, minsize = 15, maxsize= 500, restand = TRUE, restand.basis=c("catalog","data")) } \arguments{ \item{x}{Data x: p by n matrix of features, one observation per column (missing values allowed)} \item{y}{Vector of response values: 1,2 for two class problem, or 1,2,3 ... for multiclass problem, or real numbers for quantitative or survival problems} \item{genesets}{Gene set collection (a list)} \item{genenames}{Vector of genenames in expression dataset} \item{geneset.names}{Optional vector of gene set names} \item{method}{Method for summarizing a gene set: "maxmean" (default), "mean" or "absmean"} \item{resp.type}{Problem type: "Quantitative" for a continuous response; "Two class unpaired" ; "Survival" for censored survival outcome; "Multiclass" : more than 2 groups; "Two class paired" for paired outcomes, coded -1,1 (first pair), -2,2 (second pair), etc } \item{censoring.status}{Vector of censoring status values for survival problems (1 means death or failure, 0 means censored)} \item{first.time}{internal use} \item{return.gene.ind}{internal use} \item{ngenes}{internal use} \item{gs.mat}{internal use} \item{gs.ind}{internal use} \item{catalog}{internal use} \item{catalog.unique}{internal use} \item{s0}{Exchangeability factor for denominator of test statistic; Default is automatic choice} \item{s0.perc}{Percentile of standard deviation values to use for s0; default is automatic choice; -1 means s0=0 (different from s0.perc=0, which means s0 = zeroth percentile of standard deviation values = min of sd values)} \item{minsize}{Minimum number of genes in genesets to be considered} 
\item{maxsize}{Maximum number of genes in genesets to be considered} \item{restand}{Should restandardization be done? Default TRUE} \item{restand.basis}{What should be used to do the restandardization? The set of genes in the genesets ("catalog", the default) or the genes in the data set ("data")} } \details{Carries out a Gene set analysis, computing the gene set scores. This function does not do any permutations for estimation of false discovery rates. GSA calls this function to estimate FDRs. } \value{ A list with components \item{scores}{Gene set scores for each gene set} \item{norm.scores}{Gene set scores transformed by the inverse Gaussian cdf} \item{mean}{Mean of the expression values for each gene (row of x), taken across samples} \item{sd}{Standard deviation of the expression values for each gene (row of x), taken across samples} \item{gene.ind}{List indicating which genes in each positive gene set had positive individual scores, and similarly for negative gene sets} \item{geneset.names}{Names of the gene sets} \item{nperms}{Number of permutations used} \item{gene.scores}{Individual gene scores (eg t-statistics for two class problem)} \item{s0}{Computed exchangeability factor} \item{s0.perc}{Computed percentile of standard deviation values} \item{stand.info}{Information computed for use in the restandardization process} \item{method}{Method used (from call to GSA.func)} \item{call}{The call to GSA.func} } \references{Efron, B. and Tibshirani, R. On testing the significance of sets of genes. Stanford tech report rep 2006. 
http://www-stat.stanford.edu/~tibs/ftp/GSA.pdf} \author{Robert Tibshirani} \examples{ ######### two class unpaired comparison # y must take values 1,2 set.seed(100) x<-matrix(rnorm(1000*20),ncol=20) dd<-sample(1:1000,size=100) u<-matrix(2*rnorm(100),ncol=10,nrow=100) x[dd,11:20]<-x[dd,11:20]+u y<-c(rep(1,10),rep(2,10)) genenames=paste("g",1:1000,sep="") #create some random gene sets genesets=vector("list",50) for(i in 1:50){ genesets[[i]]=paste("g",sample(1:1000,size=30),sep="") } geneset.names=paste("set",as.character(1:50),sep="") GSA.func.obj<-GSA.func(x,y, genenames=genenames, genesets=genesets, resp.type="Two class unpaired") #to use a "real" gene set collection, we read it in from a gmt file: # # geneset.obj<- GSA.read.gmt("file.gmt") # # where file.gmt is a gene set collection from the GSEA collection or # the website http://www-stat.stanford.edu/~tibs/GSA, or one # that you have created yourself. Then # GSA.func.obj<-GSA.func(x,y, genenames=genenames, genesets=geneset.obj$genesets, resp.type="Two class unpaired") # # } \keyword{univar}% at least one, from doc/KEYWORDS \keyword{survival} \keyword{ts} \keyword{nonparametric} GSA/man/GSA.correlate.Rd0000644000175100001440000000300211320174603014325 0ustar hornikusers\name{GSA.correlate} \alias{GSA.correlate} \title{"Correlates" a gene set collection with a given list of gene names} \description{ "Correlates" a gene set collection with a given list of gene names. Gives info on the overlap between the collection and the list of genes } \usage{ GSA.correlate(GSA.genesets.obj, genenames) } \arguments{ \item{GSA.genesets.obj}{Gene set collection, created for example by GSA.read.gmt} \item{genenames}{Vector of gene names in expression dataset} } \details{ Gives info on the overlap between a gene set collection and the list of gene names. This is for information purposes, to find out, for example, how many genes in the list of genes appear in the gene set collection.} \references{Efron, B. and Tibshirani, R. 
On testing the significance of sets of genes. Stanford tech report rep 2006. http://www-stat.stanford.edu/~tibs/ftp/GSA.pdf } \author{Robert Tibshirani} \examples{ ######### two class unpaired comparison # y must take values 1,2 set.seed(100) x<-matrix(rnorm(1000*20),ncol=20) dd<-sample(1:1000,size=100) u<-matrix(2*rnorm(100),ncol=10,nrow=100) x[dd,11:20]<-x[dd,11:20]+u y<-c(rep(1,10),rep(2,10)) genenames=paste("g",1:1000,sep="") #create some random gene sets genesets=vector("list",50) for(i in 1:50){ genesets[[i]]=paste("g",sample(1:1000,size=30),sep="") } geneset.names=paste("set",as.character(1:50),sep="") GSA.correlate(genesets, genenames) } \keyword{univar}% at least one, from doc/KEYWORDS \keyword{survival} \keyword{ts} \keyword{nonparametric} GSA/man/GSA.plot.Rd0000644000175100001440000000305411320174533013334 0ustar hornikusers\name{GSA.plot} \alias{GSA.plot} \title{Plot the results from a Gene set analysis} \description{ Plots the results from a call to GSA (Gene set analysis) } \usage{ GSA.plot(GSA.obj, fac=1, FDRcut = 1) } \arguments{ \item{GSA.obj}{Object returned by the GSA function.} \item{fac}{Value for jittering points in the plot (passed as "factor" in the call to jitter())} \item{FDRcut}{False discovery rate cutpoint for sets to be plotted. A value of 1 (the default) will cause all sets to be plotted.} } \details{This function makes a plot of the significant gene sets, based on a call to the GSA (Gene set analysis) function. } \references{Efron, B. and Tibshirani, R. On testing the significance of sets of genes. Stanford tech report rep 2006. 
http://www-stat.stanford.edu/~tibs/ftp/GSA.pdf } \author{Robert Tibshirani} \examples{ ######### two class unpaired comparison # y must take values 1,2 set.seed(100) x<-matrix(rnorm(1000*20),ncol=20) dd<-sample(1:1000,size=100) u<-matrix(2*rnorm(100),ncol=10,nrow=100) x[dd,11:20]<-x[dd,11:20]+u y<-c(rep(1,10),rep(2,10)) genenames=paste("g",1:1000,sep="") #create some random gene sets genesets=vector("list",50) for(i in 1:50){ genesets[[i]]=paste("g",sample(1:1000,size=30),sep="") } geneset.names=paste("set",as.character(1:50),sep="") GSA.obj<-GSA(x,y, genenames=genenames, genesets=genesets, resp.type="Two class unpaired", nperms=100) GSA.listsets(GSA.obj, geneset.names=geneset.names,FDRcut=.5) GSA.plot(GSA.obj) } \keyword{univar}% at least one, from doc/KEYWORDS \keyword{survival} \keyword{ts} \keyword{nonparametric} GSA/man/GSA.read.gmt.Rd0000644000175100001440000000317010520507153014055 0ustar hornikusers\name{GSA.read.gmt} \alias{GSA.read.gmt} \title{Read in a gene set collection from a .gmt file} \description{ Read in a gene set collection from a .gmt file } \usage{ GSA.read.gmt(filename) } \arguments{ \item{filename}{The name of a file to read data values from. Should be a tab-separated text file, with one row per gene set. Column 1 has gene set names (identifiers), column 2 has gene set descriptions, remaining columns are gene ids for genes in that geneset}. } \details{This function reads in a geneset collection from a .gmt text file, and creates an R object that can be used as input into GSA. We use UniGene symbols for our gene set names in our .gmt files and expression datasets, to match the two. However the user is free to use other identifiers, as long as the same ones are used in the gene set collections and expression datasets. 
} \value{ A list with components \item{genesets}{List of gene names (identifiers) in each gene set}, \item{geneset.names}{Vector of gene set names (identifiers)}, \item{geneset.descriptions}{Vector of gene set descriptions} } \references{Efron, B. and Tibshirani, R. On testing the significance of sets of genes. Stanford tech report rep 2006. http://www-stat.stanford.edu/~tibs/ftp/GSA.pdf } \author{Robert Tibshirani} \examples{ # read in functional pathways gene set file from Broad institute GSEA website # http://www.broad.mit.edu/gsea/msigdb/msigdb_index.html # You have to register first and then download the file C2.gmt from # their site #GSA.read.gmt("C2.gmt") } \keyword{univar}% at least one, from doc/KEYWORDS \keyword{survival} \keyword{ts} \keyword{nonparametric} GSA/man/GSA.listsets.Rd0000644000175100001440000000451611320174515014234 0ustar hornikusers\name{GSA.listsets} \alias{GSA.listsets} \title{List the results from a Gene set analysis} \description{ List the results from a call to GSA (Gene set analysis) } \usage{ GSA.listsets(GSA.obj, geneset.names = NULL, maxchar = 20, FDRcut = 0.2) } \arguments{ \item{GSA.obj}{ Object returned by GSA function}. \item{geneset.names}{Optional vector of names for the gene sets} \item{maxchar}{Maximum number of characters in printed output} \item{FDRcut}{False discovery rate cutpoint for listed sets. A value of 1 will cause all sets to be listed}. } \details{This function lists the significant gene sets, based on a call to the GSA (Gene set analysis) function. } \value{ A list with components \item{FDRcut}{The false discovery rate threshold used.} \item{negative}{A table of the negative gene sets. "Negative" means that lower expression of most genes in the gene set correlates with higher values of the phenotype y. Eg for two classes coded 1,2, lower expression correlates with class 2. 
For survival data, lower expression correlates with higher risk, i.e. shorter survival (Be careful, this can be confusing!)} \item{positive}{A table of the positive gene sets. "Positive" means that higher expression of most genes in the gene set correlates with higher values of the phenotype y. See "negative" above for more info.} \item{nsets.neg}{Number of negative gene sets} \item{nsets.pos}{Number of positive gene sets} } \references{Efron, B. and Tibshirani, R. On testing the significance of sets of genes. Stanford tech report rep 2006. http://www-stat.stanford.edu/~tibs/ftp/GSA.pdf } \author{Robert Tibshirani} \examples{ ######### two class unpaired comparison # y must take values 1,2 set.seed(100) x<-matrix(rnorm(1000*20),ncol=20) dd<-sample(1:1000,size=100) u<-matrix(2*rnorm(100),ncol=10,nrow=100) x[dd,11:20]<-x[dd,11:20]+u y<-c(rep(1,10),rep(2,10)) genenames=paste("g",1:1000,sep="") #create some random gene sets genesets=vector("list",50) for(i in 1:50){ genesets[[i]]=paste("g",sample(1:1000,size=30),sep="") } geneset.names=paste("set",as.character(1:50),sep="") GSA.obj<-GSA(x,y, genenames=genenames, genesets=genesets, resp.type="Two class unpaired", nperms=100) GSA.listsets(GSA.obj, geneset.names=geneset.names,FDRcut=.5) } \keyword{univar}% at least one, from doc/KEYWORDS \keyword{survival} \keyword{ts} \keyword{nonparametric} GSA/man/GSA.Rd0000644000175100001440000001526111320430561012356 0ustar hornikusers\name{GSA} \alias{GSA} \title{Gene set analysis} \description{ Determines the significance of pre-defined sets of genes with respect to an outcome variable, such as a group indicator, a quantitative variable or a survival time } \usage{ GSA(x,y, genesets, genenames, method=c("maxmean","mean","absmean"), resp.type=c("Quantitative","Two class unpaired","Survival","Multiclass", "Two class paired", "tCorr", "taCorr"), censoring.status=NULL,random.seed=NULL, knn.neighbors=10, s0=NULL, s0.perc=NULL,minsize=15,maxsize=500, 
restand=TRUE,restand.basis=c("catalog","data"), nperms=200, xl.mode=c("regular","firsttime","next20","lasttime"), xl.time=NULL, xl.prevfit=NULL) } \arguments{ \item{x}{Data x: p by n matrix of features (expression values), one observation per column (missing values allowed); y: n-vector of outcome measurements} \item{y}{Vector of response values: 1,2 for two class problem, or 1,2,3 ... for multiclass problem, or real numbers for quantitative or survival problems} \item{genesets}{Gene set collection (a list)} \item{genenames}{Vector of genenames in expression dataset} \item{method}{Method for summarizing a gene set: "maxmean" (default), "mean" or "absmean"} \item{resp.type}{Problem type: "Quantitative" for a continuous parameter; "Two class unpaired" ; "Survival" for censored survival outcome; "Multiclass" : more than 2 groups, coded 1,2,3...; "Two class paired" for paired outcomes, coded -1,1 (first pair), -2,2 (second pair), etc} \item{censoring.status}{Vector of censoring status values for survival problems, 1 means death or failure, 0 means censored} \item{random.seed}{Optional initial seed for random number generator (integer)} \item{knn.neighbors}{Number of nearest neighbors to use for imputation of missing features values} \item{s0}{Exchangeability factor for denominator of test statistic; Default is automatic choice} \item{s0.perc}{Percentile of standard deviation values to use for s0; default is automatic choice; -1 means s0=0 (different from s0.perc=0, meaning s0=zeroeth percentile of standard deviation values= min of sd values)} \item{minsize}{Minimum number of genes in genesets to be considered} \item{maxsize}{Maximum number of genes in genesets to be considered} \item{restand}{Should restandardization be done? Default TRUE}, \item{restand.basis}{What should be used to do the restandardization? 
The set of genes in the genesets ("catalog", the default) or the genes in the data set ("data")} \item{nperms}{Number of permutations used to estimate false discovery rates} \item{xl.mode}{Used by Excel interface} \item{xl.time}{Used by Excel interface} \item{xl.prevfit}{Used by Excel interface} } \details{Carries out a Gene set analysis, as described in the paper by Efron and Tibshirani (2006). It differs from a Gene Set Enrichment Analysis (Subramanian et al 2005) in its use of the "maxmean" statistic: this is the mean of the positive or negative part of gene scores in the gene set, whichever is larger in absolute value. Efron and Tibshirani show that this is often more powerful than the modified KS statistic used in GSEA. GSA also does "restandardization" of the genes (rows), on top of the permutation of columns (done in GSEA). Gene set analysis is applicable to microarray data and other data with a large number of features. This is also the R package that is called by the "official" SAM Excel package v3.0. The format of the response vector y and the calling sequence are illustrated in the examples below. 
A more complete description is given in the SAM manual at http://www-stat.stanford.edu/~tibs/SAM} \value{ A list with components \item{GSA.scores}{Gene set scores for each gene set} \item{GSA.scores.perm}{Matrix of Gene set scores from permutations, one column per permutation} \item{fdr.lo}{Estimated false discovery rates for negative gene sets (negative means lower expression correlates with class 2 in two sample problems, lower expression correlates with increased y for quantitative problems, lower expression correlates with higher risk for survival problems)} \item{fdr.hi}{Estimated false discovery rates for positive gene sets; positive is opposite of negative, as defined above} \item{pvalues.lo}{P-values for negative gene sets} \item{pvalues.hi}{P-values for positive gene sets} \item{stand.info}{Information from restandardization process} \item{stand.info.star}{Information from restandardization process in permutations} \item{ngenes}{Number of genes in union of gene sets} \item{nperms}{Number of permutations used} \item{gene.scores}{Individual gene scores (eg t-statistics for two class problem)} \item{s0}{Computed exchangeability factor} \item{s0.perc}{Computed percentile of standard deviation values. s0= s0.perc percentile of the gene standard deviations} \item{call}{The call to GSA} \item{x}{For internal use} \item{y}{For internal use} \item{genesets}{For internal use} \item{genenames}{For internal use} \item{r.obs}{For internal use} \item{r.star}{For internal use} \item{gs.mat}{For internal use} \item{gs.ind}{For internal use} \item{catalog}{For internal use} \item{catalog.unique}{For internal use} } \references{Efron, B. and Tibshirani, R. On testing the significance of sets of genes. Stanford tech report rep 2006. http://www-stat.stanford.edu/~tibs/ftp/GSA.pdf Subramanian, A. and Tamayo, P. Mootha, V. K. and Mukherjee, S. and Ebert, B. L. and Gillette, M. A. and Paulovich, A. and Pomeroy, S. L. and Golub, T. R. and Lander, E. S. and Mesirov, J. P. 
(2005) A knowledge-based approach for interpreting genome-wide expression profiles. PNAS. 102, pg 15545-15550. } \author{Robert Tibshirani} \examples{ ######### two class unpaired comparison # y must take values 1,2 set.seed(100) x<-matrix(rnorm(1000*20),ncol=20) dd<-sample(1:1000,size=100) u<-matrix(2*rnorm(100),ncol=10,nrow=100) x[dd,11:20]<-x[dd,11:20]+u y<-c(rep(1,10),rep(2,10)) genenames=paste("g",1:1000,sep="") #create some random gene sets genesets=vector("list",50) for(i in 1:50){ genesets[[i]]=paste("g",sample(1:1000,size=30),sep="") } geneset.names=paste("set",as.character(1:50),sep="") GSA.obj<-GSA(x,y, genenames=genenames, genesets=genesets, resp.type="Two class unpaired", nperms=100) GSA.listsets(GSA.obj, geneset.names=geneset.names,FDRcut=.5) #to use "real" gene set collection, we read it in from a gmt file: # # geneset.obj<- GSA.read.gmt("file.gmt") # # where file.gmt is a gene set collection from GSEA collection # or the website http://www-stat.stanford.edu/~tibs/GSA, or one # that you have created yourself. Then # GSA.obj<-GSA(x,y, genenames=genenames, genesets=geneset.obj$genesets, resp.type="Two class unpaired", nperms=100) # # } \keyword{univar}% at least one, from doc/KEYWORDS \keyword{survival} \keyword{ts} \keyword{nonparametric} GSA/man/GSA.internal.Rd0000644000175100001440000000222013424623022014163 0ustar hornikusers\name{GSA-internal} \title{Internal GSA functions} \alias{print.GSA.func} \alias{print.GSA} \alias{summary.GSA.genesets} \alias{ttest.func} \alias{cox.func} \alias{multiclass.func} \alias{quantitative.func} \alias{varr} \alias{coxvar} \alias{coxstuff} \alias{ests0} \alias{GSA.xl.genescores} \alias{GSA.xl.correlate} \alias{GSA.xl.summary} \alias{GSA.xl.plot} \alias{GSA.xl.summary.genesets} \description{Internal GSA functions} \usage{ \S3method{print}{GSA.func}(x, ...) \S3method{print}{GSA}(x, ...) \S3method{summary}{GSA.genesets}(object, ...) 
varr(x, meanx=NULL) coxvar(x, y, ic, offset = rep(0., length(y)), coxstuff.obj = NULL) coxstuff(x, y, ic, offset = rep(0., length(y))) est.s0(tt,sd,s0.perc=seq(0,1, by=.05)) GSA.xl.genescores(GSA.listsets.obj, genesets, GSA.obj, genenames) GSA.xl.plot(GSA.obj, fac=.10, FDRcut=1) GSA.xl.summary.genesets(GSA.genesets.obj) GSA.xl.correlate(GSA.genesets.obj, genenames) ttest.func(x, y, s0 = 0, sd=NULL) cox.func(x, y, censoring.status, s0 = 0) multiclass.func(x, y, s0 = 0) quantitative.func(x, y, s0 = 0) } \author{Robert Tibshirani} \details{ These are not to be called by the user. } \keyword{internal} GSA/man/GSA.make.features.Rd0000644000175100001440000000370311320174641015111 0ustar hornikusers\name{GSA.make.features} \alias{GSA.make.features} \title{Creates features from a GSA analysis that can be used in other procedures} \description{ Creates features from a GSA analysis that can be used in other procedures, for example, sample classification. } \usage{ GSA.make.features(GSA.func.obj, x, genesets, genenames) } \arguments{ \item{GSA.func.obj}{Object returned by GSA.func} \item{x}{Expression dataset from which the features are to be created} \item{genesets}{Gene set collection} \item{genenames}{Vector of gene names in expression dataset} } \details{ Creates features from a GSA analysis that can be used in other procedures, for example, sample classification. For example, suppose the GSA analysis computes a maxmean score for gene set 1 that is positive, based on the mean of the positive part of the scores in that gene set. Call the subset of genes with positive scores "A". Then we compute a new feature for this geneset, for each sample, by computing the mean of the scores for genes in A, setting other gene scores to zero. } \references{Efron, B. and Tibshirani, R. On testing the significance of sets of genes. Stanford tech report rep 2006. 
http://www-stat.stanford.edu/~tibs/ftp/GSA.pdf } \author{Robert Tibshirani} \examples{ ######### two class unpaired comparison # y must take values 1,2 set.seed(100) x<-matrix(rnorm(1000*20),ncol=20) dd<-sample(1:1000,size=100) u<-matrix(2*rnorm(100),ncol=10,nrow=100) x[dd,11:20]<-x[dd,11:20]+u y<-c(rep(1,10),rep(2,10)) genenames=paste("g",1:1000,sep="") #create some random gene sets genesets=vector("list",50) for(i in 1:50){ genesets[[i]]=paste("g",sample(1:1000,size=30),sep="") } geneset.names=paste("set",as.character(1:50),sep="") GSA.func.obj<-GSA.func(x,y, genenames=genenames, genesets=genesets, resp.type="Two class unpaired") GSA.make.features(GSA.func.obj, x, genesets, genenames) } \keyword{univar}% at least one, from doc/KEYWORDS \keyword{survival} \keyword{ts} GSA/man/GSA.genescores.Rd0000644000175100001440000000345411320174470014517 0ustar hornikusers\name{GSA.genescores} \alias{GSA.genescores} \title{Individual gene scores from a gene set analysis} \description{ Compute individual gene scores from a gene set analysis } \usage{ GSA.genescores(geneset.number, genesets, GSA.obj, genenames, negfirst=FALSE) } \arguments{ \item{geneset.number}{Number indicating which gene set is to be examined} \item{genesets}{The gene set collection} \item{GSA.obj}{Object returned by function GSA} \item{genenames}{Vector of gene names for genes in the expression dataset} \item{negfirst}{Should negative genes be listed first? Default FALSE} } \details{ Compute individual gene scores from a gene set analysis. Useful for looking ``inside'' a gene set that has been called significant by GSA.} \value{ A list with components \item{res}{Matrix of gene names and gene scores (eg t-statistics) for each gene in the gene set}, } \references{Efron, B. and Tibshirani, R. On testing the significance of sets of genes. Stanford tech report rep 2006. 
http://www-stat.stanford.edu/~tibs/ftp/GSA.pdf } \author{Robert Tibshirani} \examples{ ######### two class unpaired comparison # y must take values 1,2 set.seed(100) x<-matrix(rnorm(1000*20),ncol=20) dd<-sample(1:1000,size=100) u<-matrix(2*rnorm(100),ncol=10,nrow=100) x[dd,11:20]<-x[dd,11:20]+u y<-c(rep(1,10),rep(2,10)) genenames=paste("g",1:1000,sep="") #create some random gene sets genesets=vector("list",50) for(i in 1:50){ genesets[[i]]=paste("g",sample(1:1000,size=30),sep="") } geneset.names=paste("set",as.character(1:50),sep="") GSA.obj<-GSA(x,y, genenames=genenames, genesets=genesets, resp.type="Two class unpaired", nperms=100) # look at 10th gene set GSA.genescores(10, genesets, GSA.obj, genenames) } \keyword{univar}% at least one, from doc/KEYWORDS \keyword{survival} \keyword{ts} \keyword{nonparametric} GSA/INDEX0000644000175100001440000000114410520471276011500 0ustar hornikusersGSA Gene set analysis GSA.correlate "Correlates" a gene set collection with a given list of genes GSA.func Gene set analysis without permutations GSA.genescores Individual gene scores from a gene set analysis GSA.listsets List the results from a Gene set analysis GSA.make.features Creates features from a GSA analysis that can be used in other procedures GSA.plot Plot the results from a Gene set analysis GSA.read.gmt Read in a gene set collection from a .gmt file