rsgcc/0000755000175100001440000000000012157771333011402 5ustar hornikusersrsgcc/MD50000644000175100001440000000316712157771333011721 0ustar hornikusers855422b8eac6fb05843ef523a4ed839c *DESCRIPTION c0a22e8eaa706f753d402c7f01377ae0 *NAMESPACE fe523cb31b5628e04e83241831ac7d83 *R/getdist.R 8f9326b8dcbd4b569d954fc02e52087d *R/gethclust.R 2f53a69b91a2019f4c862c88bc1a9cf0 *R/rsgcc.R 1919d6fbc78e32f99b5ad1dcbb97f48c *R/rsgccgui.R 35656aac95e8d4b136bdadc7e1687ce2 *R/tsgene.R 458134b713ea41d7f5de661fd6db8653 *R/tsheatmap.R a8b918453d9e692e04792330b6142d75 *data/datalist 6d4c31cf75407ab95f0b0142f8fd6e22 *data/rsgcc.rda f2c3954c7a2cbb50695c3e9346b2be68 *man/adjacencymatrix.Rd c2f794e90f756b1f6099b77305a5ef43 *man/cor.matrix.Rd be1e68cb4e6437a09c93793a82f5e335 *man/cor.pair.Rd 53d0d6cc4af03349dd3a663a87144a98 *man/data.Rd 5b2888f29bd06fd9a297516c1028097e *man/gcc.corfinal.Rd cce573494a73f978d6ce325ce75205dd *man/gcc.dist.Rd 7714d7b1484ecf5c61375f1571e72e2f *man/gcc.hclust.Rd e3caf043c294dea996c3ed1de6923ded *man/gcc.heatmap.Rd 8b416b5e466d9cc0710cc6aa5c918df6 *man/gcc.tsheatmap.Rd 964f9d12ab0835b8b726206939521a45 *man/getsgene.Rd df58633bcc7bbfae6b26f88f0ca532ae *man/onegcc.Rd f35737f70c14da86e9929b81f0c7894e *man/rsgcc-package.Rd 593b33c7eaffc3ba9f04bfaa5106ea71 *man/rsgcc.gui.Rd 0ad5b460be2f125a5eec6128d1ded114 *man/uniqueTissues.Rd 2b9a8f933584d2477637da54c9f116bc *src/Makevars d41d8cd98f00b204e9800998ecf8427e *src/Makevars.win f6b881d90a60f9642463835f734f9288 *src/bridge.c 03dada065beb6bf77671d28d25ea5d3a *src/grid.c 6afc5e93755e5c0c9a256b786c43c0b7 *src/grid.h 064a4cc45185d439b4ce884d081f337d *src/iqsort.h 18124791615b0acf691efec5fbf4170f *src/mi.c c4092eef5bf0781ab14d18c7df719949 *src/mi.h 6972ee8be12a08992c6053492803f4e2 *src/points.c 3969ef81d6ee930ff882df46ee950616 *src/points.h rsgcc/src/0000755000175100001440000000000012157732556012175 5ustar hornikusersrsgcc/src/points.h0000644000175100001440000000232712157732556013666 0ustar hornikusers/* Copyright 2010-2011 Gabriele Sales This file is part of parmigene. knnmi is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License version 3 as published by the Free Software Foundation. knnmi is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with parmigene. If not, see . */ #ifndef POINTS_H #define POINTS_H #include typedef double coord_t; typedef double dist_t; typedef struct { coord_t xmin; coord_t xmax; coord_t ymin; coord_t ymax; } bbox_t; #define coord_sqrt(v) sqrt(v) #define coord_ceil(v) ceil(v) #define coord_read(p, e) strtod(p, e) #define COORD_MIN DBL_MIN #define COORD_MAX DBL_MAX #define DIST_MIN DBL_MIN #define DIST_MAX DBL_MAX #define dist_abs(v) fabs(v) int normalize(coord_t* const cs, const int n); void add_noise(coord_t* const cs, const int n, const double noise, unsigned int* seed); #endif //POINTS_H rsgcc/src/points.c0000644000175100001440000000377712157732556013673 0ustar hornikusers/* Copyright 2010-2011 Gabriele Sales This file is part of parmigene. knnmi is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License version 3 as published by the Free Software Foundation. knnmi is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with parmigene. If not, see . */ #include "points.h" #include #include #ifndef isinf #define isinf(x) \ (sizeof (x) == sizeof (long double) ? isinf_ld (x) \ : sizeof (x) == sizeof (double) ? isinf_d (x) \ : isinf_f (x)) static inline int isinf_f (float x) { return !isnan (x) && isnan (x - x); } static inline int isinf_d (double x) { return !isnan (x) && isnan (x - x); } static inline int isinf_ld (long double x) { return !isnan (x) && isnan (x - x); } #endif int normalize(coord_t* const cs, const int n) { int i; coord_t m, s, l; for (i = 0, m = 0; i < n; i++) m += cs[i]; m /= n; if (isinf(m)) return 0; for (i = 0, s = 0; i < n; i++) s += (cs[i]-m)*(cs[i]-m); s = coord_sqrt(s/(n-1)); if (isinf(s)) return 0; if (s > 0) { for (i = 0, l = COORD_MAX; i < n; i++) { cs[i] = (cs[i]-m)/s; if (cs[i] < l) l = cs[i]; } for (i = 0; i < n; i++) cs[i] -= l; } return 1; } #ifdef WIN32 // This version is in POSIX/C99 for [s]rand() #define rand_r myrand_r #undef RAND_MAX #define RAND_MAX 31767 static int rand_r (unsigned int *seed) { *seed = *seed * 1103515245 + 12345; return((unsigned)(*seed/65536) % 32768); } #endif void add_noise(coord_t* const cs, const int n, const double noise, unsigned int* seed) { int i; for (i = 0; i < n; i++) cs[i] += (1.0*rand_r(seed)/RAND_MAX) * noise; } rsgcc/src/mi.h0000644000175100001440000000316612157732556012761 0ustar hornikusers/* Copyright 2010-2011 Gabriele Sales This file is part of parmigene. knnmi is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License version 3 as published by the Free Software Foundation. knnmi is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with parmigene. If not, see . */ #ifndef MI_H #define MI_H #include "grid.h" #include "points.h" typedef struct { int k; int n; coord_t* psi; coord_t* sxs; int* xiis; coord_t* sys; int* yiis; grid_t grid; } mi_t; int make_mi(mi_t* const m, const int n, const int k); void destroy_mi(mi_t* const m); coord_t mutual_information(mi_t* const m, const coord_t* const xs, const coord_t* const ys); coord_t c_gcc(mi_t* const m, const coord_t* const xs, const coord_t* const ys, const int* const xsix, const int* const ysix ); //coord_t gini_correlation(mi_t* const m, const coord_t* const xs, const coord_t* const ys); coord_t c_pcc(mi_t* const m, const coord_t* const xs, const coord_t* const ys); coord_t c_scc(mi_t* const m, const coord_t* const xs, const coord_t* const ys, const int* const xsix, const int* const ysix ); coord_t c_kcc(mi_t* const m, const coord_t* const xs, const coord_t* const ys ); coord_t c_eudist(mi_t* const m, const coord_t* const xs, const coord_t* const ys); #endif //MI_H rsgcc/src/mi.c0000644000175100001440000002030612157732556012747 0ustar hornikusers/* Copyright 2010-2011 Gabriele Sales This file is part of parmigene. knnmi is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License version 3 as published by the Free Software Foundation. knnmi is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with parmigene. If not, see . */ #include "iqsort.h" #include "mi.h" #include #include static void init_psi(mi_t* const m) { static const coord_t c = 0.5772156649015328606065; m->psi = Calloc(m->n, coord_t); m->psi[0] = -c; int i; for (i = 1; i < m->n; i++) m->psi[i] = m->psi[i-1] + 1.0/i; } static coord_t get_psi(const mi_t* const m, const int i) { return m->psi[i-1]; } #define compare_coords(a, b) ((**a) < (**b)) static void sort_coords(const coord_t* const cs, coord_t* const scs, int* const iis, const int n) { const coord_t* cps[n]; int i; for (i = 0; i < n; i++) cps[i] = (coord_t*)(cs+i); QSORT(const coord_t*, cps, n, compare_coords); for (i = 0; i < n; i++) { scs[i] = *cps[i]; iis[cps[i]-cs] = i; } } static dist_t find_range(const coord_t* const cs, const int center_idx, const int* const kis, const int k) { int i; dist_t md = 0; for (i = 0; i < k; i++) { const dist_t d = dist_abs(cs[center_idx] - cs[kis[i]]); if (d > md) md = d; } return md; } static int region_count(const coord_t* const scs, const int n, const int center_idx, const dist_t range) { const coord_t center = scs[center_idx]; int c = 0; int i = center_idx-1; while (i >= 0 && center - scs[i] <= range) { c++; i--; } i = center_idx+1; while (i < n && scs[i] - center <= range) { c++; i++; } return c; } int make_mi(mi_t* const m, const int n, const int k) { if (n < k) return 0; m->k = k; m->n = n; init_psi(m); m->sxs = Calloc(n, coord_t); m->xiis = Calloc(n, int); m->sys = Calloc(n, coord_t); m->yiis = Calloc(n, int); return 1; } void destroy_mi(mi_t* const m) { Free(m->sxs); Free(m->xiis); Free(m->sys); Free(m->yiis); Free(m->psi); } coord_t mutual_information(mi_t* const m, const coord_t* const xs, const coord_t* const ys) { const coord_t* pxs; const coord_t* pys; make_grid(&m->grid, xs, ys, m->n, m->k); ordered_points(&m->grid, &pxs, &pys); sort_coords(pxs, m->sxs, m->xiis, m->n); sort_coords(pys, m->sys, m->yiis, m->n); int i; coord_t accum = 0; for (i = 0; i < m->n; i++) { int kis[m->k]; search_knn(&m->grid, pxs[i], pys[i], kis); const dist_t mdx = find_range(pxs, i, kis, m->k); const int nx = region_count(m->sxs, m->n, m->xiis[i], mdx); const dist_t mdy = find_range(pys, i, kis, m->k); const int ny = region_count(m->sys, m->n, m->yiis[i], mdy); accum += get_psi(m, nx) + get_psi(m, ny); } destroy_grid(&m->grid); return get_psi(m, m->k) + get_psi(m, m->n) - (1.0/m->k) - (accum/m->n); } //for gini correlation coord_t c_gcc(mi_t* const m, const coord_t* const xs, const coord_t* const ys, const int* const xsix, const int* const ysix ) { make_grid(&m->grid, xs, ys, m->n, m->k); const int len = m->n; int xorder, yorder; coord_t vecx_x[len], vecx_y[len], vecy_y[len], vecy_x[len]; for (int i = 0; i < len; i++ ) { xorder = xsix[i]; vecx_x[xorder-1] = xs[i]; vecy_x[xorder-1] = ys[i]; yorder = ysix[i]; vecy_y[yorder-1] = ys[i]; vecx_y[yorder-1] = xs[i]; } coord_t accumx_x = 0; coord_t accumx_y = 0; coord_t accumy_y = 0; coord_t accumy_x = 0; for (int i = 0; i < len; i++) { //for order x coord_t weight = 2.0*(i+1)- m->n -1.0; accumx_x += weight*vecx_x[i]; accumx_y += weight*vecx_y[i]; accumy_y += weight*vecy_y[i]; accumy_x += weight*vecy_x[i]; }//for i coord_t gccx_y = accumx_y/accumx_x; coord_t gccy_x = accumy_x/accumy_y; coord_t final_gcc = gccx_y*gccx_y > gccy_x*gccy_x ? gccx_y : gccy_x; destroy_grid(&m->grid); return final_gcc; } //for euclidean distance coord_t c_eudist( mi_t* const m, const coord_t* const xs, const coord_t* const ys) { int i; make_grid(&m->grid, xs, ys, m->n, m->k); coord_t sum1 = 0.0; coord_t t_tmp = 0.0; const int len = m->n; for( i = 0; i < len; i++ ) { t_tmp = xs[i] - ys[i]; sum1 += t_tmp*t_tmp; } destroy_grid(&m->grid); if( sum1 == 0.0 ) return(0.0); else return( sqrt(sum1) ); } //for pearson correlation coord_t c_pcc(mi_t* const m, const coord_t* const xs, const coord_t* const ys) { int i; make_grid(&m->grid, xs, ys, m->n, m->k); coord_t meanx = 0.0; coord_t meany = 0.0; coord_t sum1 = 0.0; coord_t sum2 = 0.0; coord_t sum3 = 0.0; const int len = m->n; for( i = 0; i < len; i++ ) { meanx += xs[i]; meany += ys[i]; } meanx = 1.0*meanx/len; meany = 1.0*meany/len; for( i = 0; i < len; i++ ) { sum1 += (xs[i] - meanx)*(ys[i] - meany); sum2 += (xs[i] - meanx)*(xs[i] - meanx); sum3 += (ys[i] - meany)*(ys[i] - meany); } destroy_grid(&m->grid); if( sum2 == 0.0 || sum3 == 0.0 ) return(0.0); else return( 1.0*sum1/(sqrt(sum2)*sqrt(sum3)) ); } coord_t accsum( const int start, const int end) { return( 0.5*((end+1)*end - start*(start-1)) ); } void maskrankforSCC( coord_t * valuevec, coord_t* ixvec, const int num ) { int i, j; int preIndex = 0; int lastIndex = 0; coord_t meanRank; for( i = 1; i < num; i++ ) { if( valuevec[i] != valuevec[i-1] ) { lastIndex = i - 1; if( preIndex < lastIndex ) { //start to mask the rank meanRank = accsum( preIndex + 1, lastIndex + 1 )/(lastIndex - preIndex + 1); for( j = preIndex; j <= lastIndex; j++ ){ ixvec[j] = meanRank; } } //initialize preIndex = i; lastIndex = 0; }//end if }//end for i if( preIndex >= 0 && preIndex < num - 1 ) { lastIndex = num - 1; meanRank = accsum( preIndex + 1, lastIndex + 1 )/(lastIndex - preIndex + 1); for( j = preIndex; j <= lastIndex; j++ ) ixvec[j] = meanRank; } } //for spearman correlation coord_t c_scc(mi_t* const m, const coord_t* const xs, const coord_t* const ys, const int* const xsix, const int* const ysix ) { make_grid(&m->grid, xs, ys, m->n, m->k); const int len = m->n; int i, j; //sort y by the rank of x int xorder, yorder; coord_t vecx_x[len], vecix_x[len], vecy_x[len], vecixy_x[len]; for(i = 0; i < len; i++ ) { xorder = xsix[i]; vecx_x[xorder-1] = xs[i]; vecix_x[xorder-1] = xorder; vecy_x[xorder-1] = ys[i]; vecixy_x[xorder-1] = ysix[i]; } //mask the rank of x maskrankforSCC( vecx_x, vecix_x, len ); /* for( i = 0; i < len; i++ ) { printf("%f, %f, %f, %f\n", vecx_x[i], vecix_x[i], vecy_x[i], vecixy_x[i] ); } printf("\n\n"); */ //sort x by the rank of y coord_t vecy_y[len], vecixy_y[len], vecixx_y[len]; for( i = 0; i < len; i++ ) { yorder = vecixy_x[i]; vecy_y[yorder-1] = vecy_x[i]; vecixy_y[yorder-1] = yorder; vecixx_y[yorder-1] = vecix_x[i]; } //mask the rank of y maskrankforSCC( vecy_y, vecixy_y, len ); /* for( i = 0; i < len; i++ ) { printf("%f, %f, %f\n", vecy_y[i], vecixy_y[i], vecixx_y[i] ); } printf("\n\n"); */ coord_t tmp = 0; for( i = 0; i < len; i++ ) { tmp += (vecixy_y[i] - vecixx_y[i])*(vecixy_y[i] - vecixx_y[i]); } // printf("%d, %f\n", len, 1.0 - 6.0*tmp/(len*len*len - len) ); destroy_grid(&m->grid); return( 1.0 - 6.0*tmp/(len*len*len - len) ); } //for spearman correlation coord_t c_kcc(mi_t* const m, const coord_t* const xs, const coord_t* const ys ) { make_grid(&m->grid, xs, ys, m->n, m->k); const int len = m->n; int i, j, num; num = 0; for( i = 1; i < len; i++ ) { for( j = 0; j < i; j++ ) { num += (xs[i] - xs[j])*(ys[i] - ys[j]) > 0 ? 1 : -1; }//end for j }//end for i destroy_grid(&m->grid); return( 2.0*num/(len*len - len) ); } rsgcc/src/iqsort.h0000644000175100001440000002577212157732556013704 0ustar hornikusers/* $Id: qsort.h,v 1.5 2008-01-28 18:16:49 mjt Exp $ * Adopted from GNU glibc by Mjt. * See stdlib/qsort.c in glibc */ /* Copyright (C) 1991, 1992, 1996, 1997, 1999 Free Software Foundation, Inc. This file is part of the GNU C Library. Written by Douglas C. Schmidt (schmidt@ics.uci.edu). The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ /* in-line qsort implementation. Differs from traditional qsort() routine * in that it is a macro, not a function, and instead of passing an address * of a comparison routine to the function, it is possible to inline * comparison routine, thus speeding up sorting a lot. * * Usage: * #include "iqsort.h" * #define islt(a,b) (strcmp((*a),(*b))<0) * char *arr[]; * int n; * QSORT(char*, arr, n, islt); * * The "prototype" and 4 arguments are: * QSORT(TYPE,BASE,NELT,ISLT) * 1) type of each element, TYPE, * 2) address of the beginning of the array, of type TYPE*, * 3) number of elements in the array, and * 4) comparision routine. * Array pointer and number of elements are referenced only once. * This is similar to a call * qsort(BASE,NELT,sizeof(TYPE),ISLT) * with the difference in last parameter. * Note the islt macro/routine (it receives pointers to two elements): * the only condition of interest is whenever one element is less than * another, no other conditions (greather than, equal to etc) are tested. * So, for example, to define integer sort, use: * #define islt(a,b) ((*a)<(*b)) * QSORT(int, arr, n, islt) * * The macro could be used to implement a sorting function (see examples * below), or to implement the sorting algorithm inline. That is, either * create a sorting function and use it whenever you want to sort something, * or use QSORT() macro directly instead a call to such routine. Note that * the macro expands to quite some code (compiled size of int qsort on x86 * is about 700..800 bytes). * * Using this macro directly it isn't possible to implement traditional * qsort() routine, because the macro assumes sizeof(element) == sizeof(TYPE), * while qsort() allows element size to be different. * * Several ready-to-use examples: * * Sorting array of integers: * void int_qsort(int *arr, unsigned n) { * #define int_lt(a,b) ((*a)<(*b)) * QSORT(int, arr, n, int_lt); * } * * Sorting array of string pointers: * void str_qsort(char *arr[], unsigned n) { * #define str_lt(a,b) (strcmp((*a),(*b)) < 0) * QSORT(char*, arr, n, str_lt); * } * * Sorting array of structures: * * struct elt { * int key; * ... * }; * void elt_qsort(struct elt *arr, unsigned n) { * #define elt_lt(a,b) ((a)->key < (b)->key) * QSORT(struct elt, arr, n, elt_lt); * } * * And so on. */ /* Swap two items pointed to by A and B using temporary buffer t. */ #define _QSORT_SWAP(a, b, t) ((void)((t = *a), (*a = *b), (*b = t))) /* Discontinue quicksort algorithm when partition gets below this size. This particular magic number was chosen to work best on a Sun 4/260. */ #define _QSORT_MAX_THRESH 4 /* Stack node declarations used to store unfulfilled partition obligations * (inlined in QSORT). typedef struct { QSORT_TYPE *_lo, *_hi; } qsort_stack_node; */ /* The next 4 #defines implement a very fast in-line stack abstraction. */ /* The stack needs log (total_elements) entries (we could even subtract log(MAX_THRESH)). Since total_elements has type unsigned, we get as upper bound for log (total_elements): bits per byte (CHAR_BIT) * sizeof(unsigned). */ #define _QSORT_STACK_SIZE (8 * sizeof(unsigned)) #define _QSORT_PUSH(top, low, high) \ (((top->_lo = (low)), (top->_hi = (high)), ++top)) #define _QSORT_POP(low, high, top) \ ((--top, (low = top->_lo), (high = top->_hi))) #define _QSORT_STACK_NOT_EMPTY (_stack < _top) /* Order size using quicksort. This implementation incorporates four optimizations discussed in Sedgewick: 1. Non-recursive, using an explicit stack of pointer that store the next array partition to sort. To save time, this maximum amount of space required to store an array of SIZE_MAX is allocated on the stack. Assuming a 32-bit (64 bit) integer for size_t, this needs only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes). Pretty cheap, actually. 2. Chose the pivot element using a median-of-three decision tree. This reduces the probability of selecting a bad pivot value and eliminates certain extraneous comparisons. 3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving insertion sort to order the MAX_THRESH items within each partition. This is a big win, since insertion sort is faster for small, mostly sorted array segments. 4. The larger of the two sub-partitions is always pushed onto the stack first, with the algorithm then concentrating on the smaller partition. This *guarantees* no more than log (total_elems) stack size is needed (actually O(1) in this case)! */ /* The main code starts here... */ #define QSORT(QSORT_TYPE,QSORT_BASE,QSORT_NELT,QSORT_LT) \ { \ QSORT_TYPE *const _base = (QSORT_BASE); \ const unsigned _elems = (QSORT_NELT); \ QSORT_TYPE _hold; \ \ /* Don't declare two variables of type QSORT_TYPE in a single \ * statement: eg `TYPE a, b;', in case if TYPE is a pointer, \ * expands to `type* a, b;' wich isn't what we want. \ */ \ \ if (_elems > _QSORT_MAX_THRESH) { \ QSORT_TYPE *_lo = _base; \ QSORT_TYPE *_hi = _lo + _elems - 1; \ struct { \ QSORT_TYPE *_hi; QSORT_TYPE *_lo; \ } _stack[_QSORT_STACK_SIZE], *_top = _stack + 1; \ \ while (_QSORT_STACK_NOT_EMPTY) { \ QSORT_TYPE *_left_ptr; QSORT_TYPE *_right_ptr; \ \ /* Select median value from among LO, MID, and HI. Rearrange \ LO and HI so the three values are sorted. This lowers the \ probability of picking a pathological pivot value and \ skips a comparison for both the LEFT_PTR and RIGHT_PTR in \ the while loops. */ \ \ QSORT_TYPE *_mid = _lo + ((_hi - _lo) >> 1); \ \ if (QSORT_LT (_mid, _lo)) \ _QSORT_SWAP (_mid, _lo, _hold); \ if (QSORT_LT (_hi, _mid)) { \ _QSORT_SWAP (_mid, _hi, _hold); \ if (QSORT_LT (_mid, _lo)) \ _QSORT_SWAP (_mid, _lo, _hold); \ } \ \ _left_ptr = _lo + 1; \ _right_ptr = _hi - 1; \ \ /* Here's the famous ``collapse the walls'' section of quicksort. \ Gotta like those tight inner loops! They are the main reason \ that this algorithm runs much faster than others. */ \ do { \ while (QSORT_LT (_left_ptr, _mid)) \ ++_left_ptr; \ \ while (QSORT_LT (_mid, _right_ptr)) \ --_right_ptr; \ \ if (_left_ptr < _right_ptr) { \ _QSORT_SWAP (_left_ptr, _right_ptr, _hold); \ if (_mid == _left_ptr) \ _mid = _right_ptr; \ else if (_mid == _right_ptr) \ _mid = _left_ptr; \ ++_left_ptr; \ --_right_ptr; \ } \ else if (_left_ptr == _right_ptr) { \ ++_left_ptr; \ --_right_ptr; \ break; \ } \ } while (_left_ptr <= _right_ptr); \ \ /* Set up pointers for next iteration. First determine whether \ left and right partitions are below the threshold size. If so, \ ignore one or both. Otherwise, push the larger partition's \ bounds on the stack and continue sorting the smaller one. */ \ \ if (_right_ptr - _lo <= _QSORT_MAX_THRESH) { \ if (_hi - _left_ptr <= _QSORT_MAX_THRESH) \ /* Ignore both small partitions. */ \ _QSORT_POP (_lo, _hi, _top); \ else \ /* Ignore small left partition. */ \ _lo = _left_ptr; \ } \ else if (_hi - _left_ptr <= _QSORT_MAX_THRESH) \ /* Ignore small right partition. */ \ _hi = _right_ptr; \ else if (_right_ptr - _lo > _hi - _left_ptr) { \ /* Push larger left partition indices. */ \ _QSORT_PUSH (_top, _lo, _right_ptr); \ _lo = _left_ptr; \ } \ else { \ /* Push larger right partition indices. */ \ _QSORT_PUSH (_top, _left_ptr, _hi); \ _hi = _right_ptr; \ } \ } \ } \ \ /* Once the BASE array is partially sorted by quicksort the rest \ is completely sorted using insertion sort, since this is efficient \ for partitions below MAX_THRESH size. BASE points to the \ beginning of the array to sort, and END_PTR points at the very \ last element in the array (*not* one beyond it!). */ \ \ { \ QSORT_TYPE *const _end_ptr = _base + _elems - 1; \ QSORT_TYPE *_tmp_ptr = _base; \ register QSORT_TYPE *_run_ptr; \ QSORT_TYPE *_thresh; \ \ _thresh = _base + _QSORT_MAX_THRESH; \ if (_thresh > _end_ptr) \ _thresh = _end_ptr; \ \ /* Find smallest element in first threshold and place it at the \ array's beginning. This is the smallest array element, \ and the operation speeds up insertion sort's inner loop. */ \ \ for (_run_ptr = _tmp_ptr + 1; _run_ptr <= _thresh; ++_run_ptr) \ if (QSORT_LT (_run_ptr, _tmp_ptr)) \ _tmp_ptr = _run_ptr; \ \ if (_tmp_ptr != _base) \ _QSORT_SWAP (_tmp_ptr, _base, _hold); \ \ /* Insertion sort, running from left-hand-side \ * up to right-hand-side. */ \ \ _run_ptr = _base + 1; \ while (++_run_ptr <= _end_ptr) { \ _tmp_ptr = _run_ptr - 1; \ while (QSORT_LT (_run_ptr, _tmp_ptr)) \ --_tmp_ptr; \ \ ++_tmp_ptr; \ if (_tmp_ptr != _run_ptr) { \ QSORT_TYPE *_trav = _run_ptr + 1; \ while (--_trav >= _run_ptr) { \ QSORT_TYPE *_hi; QSORT_TYPE *_lo; \ _hold = *_trav; \ \ for (_hi = _lo = _trav; --_lo >= _tmp_ptr; _hi = _lo) \ *_hi = *_lo; \ *_hi = _hold; \ } \ } \ } \ } \ \ } rsgcc/src/grid.h0000644000175100001440000000274712157732556013305 0ustar hornikusers/* Copyright 2010-2011 Gabriele Sales This file is part of parmigene. knnmi is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License version 3 as published by the Free Software Foundation. knnmi is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with parmigene. If not, see . */ #ifndef GRID_H #define GRID_H #include "points.h" typedef struct { int fill; int size; int* idxs; coord_t *xs, *ys; } cell_t; typedef struct candidate { int idx; dist_t dist; struct candidate* next; } candidate_t; typedef struct { int size; int used; candidate_t* candidates; candidate_t front; dist_t max_dist; } candidates_t; typedef struct { coord_t* xs; coord_t* ys; int k; coord_t xmin, ymin; dist_t size; int cols, lines; cell_t* cells; candidates_t candidates; } grid_t; void make_grid(grid_t* const g, const coord_t* const xs, const coord_t* const ys, const int n, const int k); void destroy_grid(grid_t* const g); void ordered_points(const grid_t* const g, const coord_t** xs, const coord_t** ys); void search_knn(grid_t* const g, const coord_t x, const coord_t y, int* const ris); #endif //GRID_H rsgcc/src/grid.c0000644000175100001440000001632512157732556013275 0ustar hornikusers/* Copyright 2010-2011 Gabriele Sales This file is part of parmigene. knnmi is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License version 3 as published by the Free Software Foundation. knnmi is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with parmigene. If not, see . */ #include #include "grid.h" #include #include /* * The following two functions (distance and bounding_box) really belong to points.c. * Keeping them in a separate translation unit, however, means that they won't be inlined * unless link time optimizations are enabled. * As gcc only recently started to support LTO (and such extension is still considered * somewhat experimental), we manually move these functions here to be sure they are * considered for inlining. */ static dist_t distance(const coord_t x1, const coord_t y1, const coord_t x2, const coord_t y2) { const dist_t dx = dist_abs(x2 - x1); const dist_t dy = dist_abs(y2 - y1); return dx > dy ? dx : dy; } static void bounding_box(const coord_t* const xs, const coord_t* const ys, const int n, bbox_t* const bbox) { bbox->xmin = bbox->ymin = COORD_MAX; bbox->xmax = bbox->ymax = COORD_MIN; int i; for (i = 0; i < n; i++) { if (xs[i] < bbox->xmin) bbox->xmin = xs[i]; if (xs[i] > bbox->xmax) bbox->xmax = xs[i]; if (ys[i] < bbox->ymin) bbox->ymin = ys[i]; if (ys[i] > bbox->ymax) bbox->ymax = ys[i]; } } static int min(const int a, const int b) { return a < b ? a : b; } static int max(const int a, const int b) { return a > b ? a : b; } static dist_t dist_max(const dist_t a, const dist_t b) { return a > b ? a : b; } static int coords_to_idx(const grid_t* const g, const coord_t x, const coord_t y) { int i = (x - g->xmin) / g->size; if (i == g->cols) i--; int j = (y - g->ymin) / g->size; if (j == g->lines) j--; return j*g->cols + i; } static dist_t wall_distance(const grid_t* const g, const coord_t x, const coord_t y, const int ci, const int cj) { dist_t dmin, d; const coord_t wl = g->xmin + ci*g->size; dmin = x - wl; d = wl+g->size - x; if (d < dmin) dmin = d; const coord_t wb = g->ymin + cj*g->size; d = y - wb; if (d < dmin) dmin = d; d = wb+g->size - y; if (d < dmin) dmin = d; return dmin; } static void append_index(cell_t* const cell, const int idx, const int load_factor) { if (cell->fill == cell->size) { cell->size = cell->size == 0 ? load_factor : cell->size*2; cell->idxs = Realloc(cell->idxs, cell->size, int); } cell->idxs[cell->fill++] = idx; } static void fill_cells(grid_t* const g, const coord_t* const xs, const coord_t* const ys, const int n) { const int load_factor = coord_ceil(coord_sqrt(n)); int i; for (i = 0; i < n; i++) { const int cell_idx = coords_to_idx(g, xs[i], ys[i]); append_index(&g->cells[cell_idx], i, load_factor); } int c; coord_t *px, *py; px = g->xs = Calloc(n, coord_t); py = g->ys = Calloc(n, coord_t); for (c = 0; c < g->lines*g->cols; c++) { cell_t* const cell = &g->cells[c]; cell->xs = px; cell->ys = py; for (i = 0; i < cell->fill; i++) { const int idx = cell->idxs[i]; *px++ = xs[idx]; *py++ = ys[idx]; } } for (c = 0; c < g->lines*g->cols; c++) { if (g->cells[c].idxs) Free(g->cells[c].idxs); } } static void reset_candidates(candidates_t* const cs) { cs->used = 0; cs->front.next = NULL; cs->max_dist = DIST_MAX; } static void init_candidates(candidates_t* const cs, const int k) { cs->size = k+1; cs->candidates = (candidate_t*)Calloc(cs->size, candidate_t); reset_candidates(cs); } static void destroy_candidates(candidates_t* const cs) { Free(cs->candidates); } static void insert_candidate(const int idx, const dist_t dist, candidates_t* const cs) { if (dist >= cs->max_dist) return; candidate_t* insert_point = &cs->front; while (insert_point->next && dist < insert_point->next->dist) insert_point = insert_point->next; candidate_t* c; if (cs->used < cs->size) { c = &cs->candidates[cs->used++]; c->idx = idx; c->dist = dist; c->next = insert_point->next; insert_point->next = c; if (cs->used == cs->size) cs->max_dist = cs->front.next->dist; } else { c = cs->front.next; c->idx = idx; c->dist = dist; if (c != insert_point) { cs->front.next = c->next; c->next = insert_point->next; insert_point->next = c; } cs->max_dist = cs->front.next->dist; } } static int candidate_last_dist(candidates_t* cs, dist_t* const d) { if (cs->used != cs->size) return 0; *d = cs->front.next->dist; return 1; } static void extract_candidate_idxs(candidates_t* const cs, const int k, int* const is) { const candidate_t* c = cs->front.next; int i; for (i = 0; i < k; i++) { is[k-i-1] = c->idx; c = c->next; } } void make_grid(grid_t* const g, const coord_t* const xs, const coord_t* const ys, const int n, const int k) { bbox_t bbox; bounding_box(xs, ys, n, &bbox); g->k = k; g->xmin = bbox.xmin; g->ymin = bbox.ymin; const dist_t w = bbox.xmax - bbox.xmin; const dist_t h = bbox.ymax - bbox.ymin; coord_t alpha = 1.23; while (1) { g->size = alpha * dist_max(w/coord_sqrt(n), h/coord_sqrt(n)); g->cols = max(coord_ceil(w/g->size), 1); g->lines = max(coord_ceil(h/g->size), 1); if (w / g->size < g->cols && h / g->size < g->lines) break; alpha += 0.01; } g->cells = Calloc(g->cols*g->lines, cell_t); fill_cells(g, xs, ys, n); init_candidates(&g->candidates, k); } void destroy_grid(grid_t* const g) { destroy_candidates(&g->candidates); Free(g->cells); Free(g->xs); Free(g->ys); } void ordered_points(const grid_t* const g, const coord_t** xs, const coord_t** ys) { *xs = g->xs; *ys = g->ys; } void search_knn(grid_t* const g, const coord_t x, const coord_t y, int* const ris) { const int ci = (x - g->xmin) / g->size; const int cj = (y - g->ymin) / g->size; dist_t dsh = wall_distance(g, x, y, ci, cj); const int lmax = max(max(ci, g->cols-1-ci), max(cj, g->lines-1-cj)); reset_candidates(&g->candidates); int l, j, c; for (l = 0; l <= lmax; l++, dsh += g->size) { const int il = ci-l; const int ih = ci+l; const int jl = cj-l; const int jh = cj+l; for (j = max(0, jl); j <= min(jh, g->lines-1); j++) { int i, istep; if (j == jl || j == jh) { istep = 1; i = max(0, il); } else { istep = ih - il; i = il >= 0 ? il : il+istep; } for (; i <= min(ih, g->cols-1); i += istep) { const cell_t* const cell = &g->cells[j*g->cols + i]; const int base_idx = cell->xs - g->xs; for (c = 0; c < cell->fill; c++) { const dist_t d = distance(x, y, cell->xs[c], cell->ys[c]); insert_candidate(base_idx+c, d, &g->candidates); } } } dist_t dmin; if (candidate_last_dist(&g->candidates, &dmin) && dmin <= dsh) break; } extract_candidate_idxs(&g->candidates, g->k, ris); } rsgcc/src/bridge.c0000644000175100001440000001046412157732556013602 0ustar hornikusers/* Copyright 2010-2011 Gabriele Sales This file is part of parmigene. knnmi is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License version 3 as published by the Free Software Foundation. knnmi is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with parmigene. If not, see . */ #include "mi.h" #include "points.h" #include unsigned int gen_seed(const double* const cs, const int n, const int k) { return n * k * ((int)cs[n/2]*100); } //compute gcc for sub pairs of genes void c_cor_subset(const int* const corIndexp, double* const xs, int* const xsix, const int* const lp, const int* const np, const int* const kp, const double* const noisep, const int* const nt, int* const rowidxp, int* const colidxp, const int* const subrownump, const int* const subcolnump, double* res) { const int corIndex = *corIndexp; const int l = *lp; const int n = *np; const int k = *kp; const double noise = *noisep; const int ntd = *nt; double xnormed[l]; int ntd_new = ntd; const int subrownum = *subrownump; const int subcolnum = *subcolnump; //#pragma omp parallel num_threads(ntd_new), test this #ifdef _OPENMP #pragma omp parallel #endif { int i, j, ii, jj; unsigned int seed = gen_seed(xs, l*n, k); #ifdef _OPENMP #pragma omp for nowait #endif for (i = 0; i < l; i++) { double* const p = xs+(i*n); xnormed[i] = 1.0; } mi_t mi; make_mi(&mi, n, k); #ifdef _OPENMP #pragma omp for schedule(dynamic) #endif for (ii = 0; ii < subrownum; ii++){ i = rowidxp[ii] - 1; for (jj = 0; jj < subcolnum; jj++){ j = colidxp[jj] - 1; if( corIndex == 1 ) res[ii*subcolnum+jj] = c_gcc(&mi, xs+(i*n), xs+(j*n), xsix+(i*n), xsix+(j*n) ); //GCC else if(corIndex == 2 ) res[ii*subcolnum+jj] = c_pcc(&mi, xs+(i*n), xs+(j*n)); //PCC else if(corIndex == 3 ) res[ii*subcolnum+jj] = c_scc(&mi, xs+(i*n), xs+(j*n), xsix+(i*n), xsix+(j*n) ); //SCC else if(corIndex == 4 ) res[ii*subcolnum+jj] = c_kcc(&mi, xs+(i*n), xs+(j*n)); //KCC else res[ii*subcolnum+jj] = c_eudist(&mi, xs+(i*n), xs+(j*n)); //ED } } destroy_mi(&mi); } } //compute gcc for all pairs of genes void c_cor_all(const int* const corIndexp, double* const xs, int* const xsix, const int* const lp, const int* const np, const int* const kp, const double* const noisep, const int* const nt, double* res) { const int corIndex = *corIndexp; const int l = *lp; const int n = *np; const int k = *kp; const double noise = *noisep; const int ntd = *nt; double xnormed[l]; int ntd_new = ntd; //#pragma omp parallel num_threads(ntd_new), test this #ifdef _OPENMP #pragma omp parallel #endif { int i, j; unsigned int seed = gen_seed(xs, l*n, k); #ifdef _OPENMP #pragma omp for nowait #endif for (i = 0; i < l; i++) { double* const p = xs+(i*n); xnormed[i] = 1.0; } #pragma omp for for (i = 0; i < l; i++) { if( corIndex == 5 ) res[i*l+i] = 0.0; else res[i*l+i] = 1.0; //diag for 1.0 } mi_t mi; make_mi(&mi, n, k); #ifdef _OPENMP #pragma omp for schedule(dynamic) #endif for (i = 1; i < l; i++){ for (j = 0; j < i; j++){ if( corIndex == 1 ) res[i*l+j] = res[j*l+i] = c_gcc(&mi, xs+(i*n), xs+(j*n), xsix+(i*n), xsix+(j*n) ); //GCC else if(corIndex == 2 ) res[i*l+j] = res[j*l+i] = c_pcc(&mi, xs+(i*n), xs+(j*n)); //PCC else if(corIndex == 3 ) res[i*l+j] = res[j*l+i] = c_scc(&mi, xs+(i*n), xs+(j*n), xsix+(i*n), xsix+(j*n) ); //SCC else if(corIndex == 4 ) res[i*l+j] = res[j*l+i] = c_kcc(&mi, xs+(i*n), xs+(j*n)); //KCC else res[i*l+j] = res[j*l+i] = c_eudist(&mi, xs+(i*n), xs+(j*n)); //ED } } destroy_mi(&mi); } } rsgcc/src/Makevars.win0000644000175100001440000000000012002453273014434 0ustar hornikusersrsgcc/src/Makevars0000644000175100001440000000011012002455255013644 0ustar hornikusersPKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS) PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) rsgcc/man/0000755000175100001440000000000012156120003012132 5ustar hornikusersrsgcc/man/uniqueTissues.Rd0000644000175100001440000000170512155661345015334 0ustar hornikusers\name{uniqueTissues} \alias{uniqueTissues} \title{ get tissue information } \description{ This function reads the sample names of genes and get unique tissue information for further tissue-specific genes finding and clustering. } \usage{ uniqueTissues(x) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{x}{ a numeric matrix containing gene expression value. The column labels are samples names. For two samples from the same tissue T, their names should be assigned as T.1 and T.2, respectively. } } \value{ A data matrix in which the elements is 0 (the sample not from the tissue) or 1 (the sample from the tissue) } \author{ Chuang Ma, Xiangfeng Wang } \seealso{ \code{\link{getsgene}}, \code{\link{gcc.tsheatmap}}. } \examples{ \dontrun{ data(rsgcc) x <- rnaseq uniqueTissues(x) } } % Add one or more standard keywords, see file 'KEYWORDS' in the % R documentation directory. \keyword{ tissue specific } rsgcc/man/rsgcc.gui.Rd0000644000175100001440000001052611766434716014342 0ustar hornikusers\name{rsgcc.gui} \alias{rsgcc.gui} \title{ graphical user interface (GUI) of rsgcc package } \description{ This function provides a graphical user interface (GUI) to perform the correlation and clustering analysis via a series of mouse actions without command-line based R programmming. The output of clustering informaiton in "CDT" format can be further visulized and analyzed by TreeView program. } \usage{ rsgcc.gui(margins = c(1, 1), labRow = "", labCol = "", lwid = c(0.5, 0.05, 0.01, 0.5, 0.01, 0.05, 0.5), keynote = "FPKM") } %- maybe also 'usage' for other objects documented here. \arguments{ \item{margins}{ a numeric vector containing 2 elements specifying the margins for heat map. See (par(mar=*)). } \item{labRow}{ character strings indicating the lables of rows. Default to rownames(x). } \item{labCol}{ character strings indicating the lables of columns. Default to colnames(x). } \item{lwid}{ column width for visual layout. } \item{keynote}{ a character string indicating the lable of color key. } } \details{ For heat map of ts-genes, rsgcc will run the gcc.tsheatmap function. the lwid could be c(0.5, 0.05, 0.01, 0.5, 0.01, 0.05, 0.5). The 2nd, 4th and 6th elements are column widths of color tissue bar, heat map and color key bar. The 3rd and 5th are the widths of separation for these three figures. The The first and last elements are the widths of "blank region" for displaying the labels of tissue and key bar. If the option "Find and cluster ts-genes" is not selected, rsgcc will call the gcc.heatmap for clustering analysis. In this case, the lwid should be a numeric vector with two elements(e.g.,c(0.65,4)). Here 0.65 is the width of column for row dendrogram. 4 is the width of heat map. } \author{ Chuang Ma, Xiangfeng Wang } \note{ rsgcc.gui is built upon gWidgets package. Make sure "gWidgets" and "gWidgetsRGtk2" package is properly installed. The following is a guide of using the rsgcc GUI. Step 1: Select a gene expression data file and load expression data with read.table or read.csv, which is decided by the program itself according to the file suffix (".txt", ".csv", or nothing). Each row of the table is one gene, and each column is the expression data of one sample. The column names are sample IDs indiciating the tissue informaiton (i.e., "T1.1", "T1.2", "T2"). After the data is alreadly loaded, you can click the option ("Display loaded data") to display the gene expression datain a new window. If the tissue-specific genes are interested for the clustering analysis, please select the option ("Find and cluster ts-genes") and specify the threshold of tissue-specificity score (Default 0.95. The threshold should be smaller than 1.0; the MUST be pressed to confirm the change ). Step 2: Selecte a correlation method. Default: Gini correlation. Step 3: Specify a distance measure. Default: raw correlation (1-coef). Step 4: Choose a cluster method. In current version, rsgcc includes seven cluster methods. More information about these cluster methods can be found at the help page of hclust function. Step 5: Set a integer for the number of CPUs to be used. The snowfall package in R is needed for the paralleled computing to speed up the calculation of correlation coefficients. After you change the number, the MUST be pressed to confirm the change. Step 6: Press the button "Start to run" to perform the correlation and clustering analysis of gene expression data. A heat map will be visualized at the right region of interface if the task is finished. Step 7: Three bars can be slided to adjust colors in heat map. Step 8: After the appropriate colors have already been determined, you can "save correlation and cluster data". The correlations will be output to a file with three columns (gene1, gene2, correlation). The cluster information will be output into three files (the suffix are ".atr", ".gtr" and ".cdt") for visualizing and analyzing by Treeview program. A pdf file will also be generated for heat map. All these files in the same directory of gene expression data file. } \seealso{ \code{\link{gcc.tsheatmap}}, \code{\link{gcc.heatmap}}. } \examples{ \dontrun{ library("gWidgetsRGtk2") library(rsgcc) ## the GUI of rsgcc will show up after the GUI toolkit "gWidgetsRGtk2" is selected. rsgcc.gui() } } \keyword{correlation} \keyword{cluster} rsgcc/man/rsgcc-package.Rd0000644000175100001440000000446412156120003015123 0ustar hornikusers\name{rsgcc-package} \alias{rsgcc-package} \alias{rsgcc} \docType{package} \title{ Gini methodology-based correlation and clustering analysis of microarray and RNA-Seq gene expression data } \description{ This package provides functions for calculating the Gini, the Pearson, the Spearman, the Kendall and Tukey's Biweight correlations, Compared to the other mentioned correlation methods, the GCC may perform better to detect regulatory relationships from gene expression data. In addition, the GCC also has some other advantageous merits, such as independent of distribution forms, more capable of detecting non-linear relationships, more tolerant to outliers and less dependence on sample size. For more information about these correlation methods, please refer to (Ma and Wang, 2012). This package also provides an graphical user interface (GUI) to perform clustering analysis of microarray and RNA-Seq data in a coherent step-by-step manner. } \details{ \tabular{ll}{ Package: \tab rsgcc\cr Type: \tab Package\cr Version: \tab 1.0.6\cr Date: \tab 2013-06-12\cr License: \tab GPL(>=2)\cr } } \note{ 1) The implement of rsgcc requires several R packages developed by other developers(e.g., biwt, cairoDevice, fBasics, snowfall, grDevices, gplots, gWidgets, gWidgetsRGtk2, stringr, ctc). Please make sure that these packages have been successfully installed before loading the rsgcc package. 2) A general method to install a new package on the computer is to use the command: install.package("packagename"). Some other methods can be found at http://math.usask.ca/~longhai/software/installrpkg.html. For the installation of ctc package, please use the biocLite resource with the following commands: source("http://bioconductor.org/biocLite.R") biocLite("ctc") 3) To run the "rsgcc.gui()" function, please do remember to select the GUI toolkit "gWidgetsRGtk2". 4) Bug reports and suggestions/questions can be sent to Chuang Ma (chuangma2006@gmail.com) or Xiangfeng Wang (xwang1@cals.arizona.edu). } \author{ Chuang Ma, Xiangfeng Wang. Maintainer: Chuang Ma } \references{ Chuang Ma, Xiangfeng Wang. Application of the Gini correlation coefficient to infer regulatory relationships in transcriptome analysis. Plant Physiology, 2012, 160(1):192-203. } \keyword{ package } rsgcc/man/onegcc.Rd0000644000175100001440000000166312006300140013661 0ustar hornikusers\name{onegcc} \alias{onegcc} \title{ compute one Gini correlation coefficient } \description{ onegcc calcluates one Gini correlation coefficient with rank information of the first variable. } \usage{ onegcc(x, y) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{x}{ a numeric vector. } \item{y}{ a numeric vector with the same length of x. } } \details{ This is a generic function cacluating correlation with rank information of the first variable and the actual value information of the second variable. } \value{ Gini correlation coefficient (a numeric value ranged from -1.0 to 1.0). } \author{ Chuang Ma, Xiangfeng Wang } \examples{ data(rsgcc) x <- rnaseq[1:10,] #Just use a small subset of RNA-Seq data onegcc(x[1,], x[2,]) # generate one correlaiton for one gene pair onegcc(x[2,], x[1,]) # generate the other correlaiton for the same gene pair } \keyword{ correlation } rsgcc/man/getsgene.Rd0000644000175100001440000000520212156001512014224 0ustar hornikusers\name{getsgene} \alias{getsgene} \title{ identify tissue(or condtion)-specific genes } \description{ This function identifies tissue(or condition)- specific genes by considering the difference between the mean expression value of one tissue and the max expression value of other tissue. } \usage{ getsgene(x, Log = FALSE, Base = 2, AddOne = FALSE, tsThreshold = 0.95, MeanOrMax = "Mean", Fraction = TRUE) } \arguments{ \item{x}{ a numeric matrix containing gene expression value. The column labels are samples names. For two samples from the same tissue T, their names should be assigned as T.1 and T.2, respectively. } \item{Log}{ logical indicating whether the gene expression value would be log-transformed. } \item{Base}{ a numeric value specifying the base of logarithm. } \item{AddOne}{ logical indicating if add one for avoding the problem of log-zero. } \item{tsThreshold}{ a numeric value giving the threshold of tissue specificity score. The tissue specificity score is 1, if the gene is only expressed in one tissue. Otherwise, the tissue specificity socre will be smaller than 1. } \item{MeanOrMax}{ character "Mean" or "Max" indicate the mean or maximal expression value will be calculated for the tissue of interest. } \item{Fraction}{ logical indicating whether the gene expression value would be scaled across tissues. } } \details{ The tissue specificity score is calculated with the formula 1-min(R(1), R(2), ..., R(i),..., R(n)), where R(i) = M(i)/E(i), E(i) is the mean or maximal expression value of tissue i, and M(i) is the maximal expression value of other tissues. If the tissue specificity score higher than tsThreshold, then the gene is considered as tissue specifically expressed. If Fraction is TRUE, the expression values of a gene is scaled accorss the tissues with the formula e(i)/(e(1)+e(2)+...+e(n)). e(i) is the expression value of the consider gene in ith sample. } \value{ A list with following components: \item{csGenes }{a data matrix containing expression vlaues of tissue specific genes.} \item{csScoreMat }{a data matrix with three columns containg the gene index information from x, tissue specificity score and the tissue information with the tissue specificity score.} } \author{ Chuang Ma, Xiangfeng Wang. } \references{ [1] Chuang Ma, Xiangfeng Wang. Machine learning-based differential network analysis of transcriptomic data: a case study of stress-responsive gene expression in Arabidopsis thaliana. 2013 (Submitted). } \examples{ \dontrun{ data(rsgcc) tsRes <- getsgene(rnaseq, tsThreshold = 0.75, MeanOrMax = "Mean", Fraction = TRUE) } } \keyword{ tissue specific } rsgcc/man/gcc.tsheatmap.Rd0000644000175100001440000001532612155672100015162 0ustar hornikusers\name{gcc.tsheatmap} \alias{gcc.tsheatmap} \title{ correlaiton and clustering analysis of tissue-specific genes } \description{ This function performs the correlaiton and clustering analysis of tissue-specific genes with expression data generated from microarray and RNA-Seq experiments. } \usage{ gcc.tsheatmap(x, cpus = 1, ## correlation method method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"), distancemethod = c("Raw", "Abs", "Sqr"), #cluster method clustermethod = c("complete", "average", "median", "centroid", "mcquitty", "single", "ward"), #hcdata by output gcc.tsheatmap rowhcdata = NULL, colhcdata = NULL, keynote = "FPKM", ## dendrogram control symm = FALSE, ## data scaling scale = c("none","row", "column"), na.rm=TRUE, ## image plot revC = identical(Colv, "Rowv"), add.expr, ## mapping data to colors breaks, symbreaks=min(x < 0, na.rm=TRUE) || scale!="none", ## colors colrange = c("yellow", "red"), tissuecol= "heat.colors", ## block sepration colsep = 0.15, rowsep, sepcolor="white", sepwidth=c(0.05,0.05), ## level trace trace=c("none","column","row","both"), tracecol="cyan", hline=median(breaks), vline=median(breaks), linecol=tracecol, ## plot margins margins = c(5, 5), ## plot labels main = NULL, xlab = NULL, ylab = NULL, ## plot layout lmat = NULL, lhei = NULL, lwid = NULL, ## extras ...) } \arguments{ \item{x}{ a data matrix containing numeric variables. Example: rows may correspond to genes and columns to samples. } \item{cpus}{ the number of cpus used for correlaiton calcluation. snowfall package in R needed to be installed in advance. } \item{method}{ a character string that specifies a correlation method to be used for association calculation. } \item{distancemethod}{ a character string specifying the distance method to be used. Currently, three distance methods are available, include: "Raw" (1-cor)", "Abs" (1-|cor|), and "Sqr" (1-|cor|^2). } \item{clustermethod}{ the distance measure to be used. This must be one of "complete", "average", "median", "centroid", "mcquitty", "single", or "ward". } \item{rowhcdata}{ the object of class hc generated from gcc.hclust for rows in x. } \item{colhcdata}{ the object of class hc generated from gcc.hclust for columns in x. } \item{keynote}{ a character string indicating the lable of color key. } \item{symm}{ logical indicating if x should be treated as a symmetrical matrix. } \item{scale}{ a character string specifying if the data values would be centered and scaled by rows or by columns, or none. } \item{na.rm}{ logical indicating whether the Nas should be eliminated. } \item{revC}{ logical indicating if the column order should be reversed for plotting. } \item{add.expr}{ expression that will be evaluated after the call to image. } \item{breaks}{ (optional)Either a integer number specifying the break points to be used, or a numeric vector indicating the splitting points for binning x into colors. } \item{symbreaks}{ Boolean indicating whether breaks should be made symmetric about 0. This option works if the quanbreaks is FALSE. } \item{colrange}{ colors used for the image. It could be a function(i.e., heat.colors) or a vector of colors with at leat two elements (e.g., c("green", "black", "red")). } \item{tissuecol}{ colors for tissues. tissuecol could be a function(i.e., heat.colors) or a vector of colors for different tissues. } \item{colsep}{ (optional) vectors of integers indicating which columns should be seperated from the preceding columans by a narrow space of color sepcor. } \item{rowsep}{ (optional) vectors of integers indicating which rows should be seperated from the preceding rows by a narrow space of color sepcor. } \item{sepcolor}{ (optional) color used to seperate rows or columns. } \item{sepwidth}{ (optional) A numeric vector containing two elements giving the width (colsep) or height (rowsep) for the seperation of columns or rows. } \item{trace}{ character string indicating a solid "trace" lined should be drawn across "rows", or "column", or "both" or "none". } \item{tracecol}{ color for trace } \item{hline}{ vector of values whithin cells where horizontal lines should be drawn with line col. } \item{vline}{ vector of values whithin cells where vertical lines should be drawn with line col. } \item{linecol}{ color for hline and vline. } \item{margins}{ a numeric vector containing 2 elements specifying the margins for column and row names, respectively. See (par(mar=*)). } \item{main}{ main title. defaults to none. } \item{xlab}{ x-axis label. defaults to none. } \item{ylab}{ y-axis label. defaults to none. } \item{lmat}{ position matrix for visual layout. } \item{lhei}{ column height for visual layout. } \item{lwid}{ column width for visual layout. For instance, lwid = c(0.5, 0.05, 0.01, 0.5, 0.01, 0.05, 0.5) } \item{\dots}{ additonal arguments passed on to image. } } \value{ A list with the following components: \item{retval }{a list with components of "rowInd" (row index of heat map from x), "colInd" (column index of heat map from x), "call" (the match call), "carpet" (reordered and scaled 'x' values used generate the main 'carpet'), "rowDendrogram" (row dendrogram), "colDendrogram" (column dendrogram), "breaks" (break points for binning x), "col" (colors used), and "colorTable" (a data frame providing the lower and upper bound and color for each bin).} \item{hcr }{the values returned from gcc.hclust function for clustering individuals (e.g., genes) in row direction} \item{hcc }{the values returned from gcc.hclust function for clustering individuals (e.g., genes) in column direction} } \author{ Chuang Ma, Xiangfeng Wang } \seealso{ \code{\link{gcc.dist}}, \code{\link{cor.matrix}}, \code{\link{gcc.hclust}}, \code{\link{gcc.tsheatmap}}. } \examples{ \dontrun{ data(rsgcc) #get expression matrix of tissue-specific genes tsRes <- getsgene(rnaseq, tsThreshold = 0.75, MeanOrMax = "Max", Fraction = TRUE) #heat map of tissue-specific genes thm <- gcc.tsheatmap(tsRes$tsgene, cpus = 1, method = "GCC", distancemethod = "Raw", clustermethod = "complete") } } \keyword{cluster} \keyword{tissue specific} rsgcc/man/gcc.heatmap.Rd0000644000175100001440000002253212155671725014624 0ustar hornikusers\name{gcc.heatmap} \alias{gcc.heatmap} \title{ heat map } \description{ The heat map is a color imange representing the data in the a matrix. The dendrogram information are usually added to the left side and/or to the top for displaying the clustering information. } \usage{ gcc.heatmap(x, cpus = 1, method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"), distancemethod = c("Raw", "Abs", "Sqr"), clustermethod = c("complete", "average", "median", "centroid", "mcquitty", "single", "ward"), #hcdata output by gcc.tsheatmap function rowhcdata = NULL, colhcdata = NULL, keynote = "FPKM", ## dendrogram control symm = FALSE, Rowv = TRUE, Colv = if (symm) "Rowv" else TRUE, dendrogram = c("both", "row", "column", "none"), ## data scaling scale = c("none", "row", "column"), na.rm = TRUE, revC = identical(Colv, "Rowv"), add.expr, #break points for binning values in x breaks = 16, quanbreaks = TRUE, symbreaks = min(x < 0, na.rm = TRUE) || scale != "none", #colors colrange = c("green", "black", "red"), colsep, rowsep, sepcolor = "white", sepwidth = c(0.05, 0.05), cellnote, notecex = 1, notecol = "cyan", na.color = par("bg"), trace = c("none", "column", "row", "both"), tracecol = "cyan", hline = median(breaks), vline = median(breaks), linecol = tracecol, margins = c(5, 5), ColSideColors, RowSideColors, cexRow = 0.2 + 1/log10(dim(x)[1]), cexCol = 0.2 + 1/log10(dim(x)[2]), labRow = NULL, labCol = NULL, #color key key = TRUE, keysize = 0.65, density.info = c("none", "histogram", "density"), denscol = tracecol, symkey = min(x < 0, na.rm = TRUE) || symbreaks, densadj = 0.25, #image information main = NULL, xlab = NULL, ylab = NULL, lmat = NULL, lhei = NULL, lwid = NULL, ...) } \arguments{ \item{x}{ a data matrix containing numeric variables. Example: rows may correspond to genes and columns to samples. } \item{cpus}{ the number of cpus used for correlaiton calcluation. snowfall package in R needed to be installed in advance. } \item{method}{ a character string that specifies a correlation method to be used for association calculation. } \item{distancemethod}{ a character string specifying the distance method to be used. Currently, three distance methods are available, include: "Raw" (1-cor)", "Abs" (1-|cor|), and "Sqr" (1-|cor|^2). } \item{clustermethod}{ the distance measure to be used. This must be one of "complete", "average", "median", "centroid", "mcquitty", "single", or "ward". } \item{rowhcdata}{ the object of class hc generated from gcc.hclust for rows in x. } \item{colhcdata}{ the object of class hc generated from gcc.hclust for columns in x. } \item{keynote}{ a character string indicating the lable of color key. } \item{symm}{ logical indicating if x should be treated as a symmetrical matrix. } \item{Rowv}{ logical determining if the row dendrogram should be reordered. } \item{Colv}{ logical determining if the columns dendrogram should be reordered. } \item{dendrogram}{ a character string indicating whether to draw the "none", "row", "column", "both" dendrograms. } \item{scale}{ a character string specifying if the data values would be centered and scaled by rows or by columns, or none. } \item{na.rm}{ logical indicating whether the Nas should be eliminated. } \item{revC}{ logical indicating if the column order should be reversed for plotting. } \item{add.expr}{ expression that will be evaluated after the call to image. Can be used to add components to the plot. } \item{breaks}{ (optional)Either a integer number specifying the break points to be used, or a numeric vector indicating the splitting points for binning x into colors. } \item{quanbreaks}{ logical indicating if the splitting points for binning x in quantile scale. For instance, if quanbreaks is TRUE, the breaks would be quantile( unique(c(x)), probs = seq(0, 1, length = breaks), na.rm = TRUE). } \item{symbreaks}{ Boolean indicating whether breaks should be made symmetric about 0. This option works when the quanbreaks is FALSE. } \item{colrange}{ colors used for the image. It could be a function(i.e., heat.colors) or a vector of colors with at leat two elements (e.g., c("green", "black", "red")). } \item{colsep}{ (optional) vectors of integers indicating which columns should be seperated from the preceding columans by a narrow space of color sepcor. } \item{rowsep}{ (optional) vectors of integers indicating which rows should be seperated from the preceding rows by a narrow space of color sepcor. } \item{sepcolor}{ (optional) color used to seperate rows or columns. } \item{sepwidth}{ (optional) A numeric vector containing two elements giving the width (colsep) or height (rowsep) for the seperation of columns or rows. } \item{cellnote}{ (optional)a matrix of character strings which will be placed within each color cell. } \item{notecex}{ (optional)numeric scaling factor for cellnot itmes. } \item{notecol}{ (optional)character string specifying the color of cellnote text. Default to "green". } \item{na.color}{ color to be used for missing value (NA). Defaults to the plot background color. } \item{trace}{ character string indicating a solid "trace" lined should be drawn across "rows", or "column", or "both" or "none". } \item{tracecol}{ color for trace } \item{hline}{ vector of values whithin cells where horizontal lines should be drawn with line col. } \item{vline}{ vector of values whithin cells where vertical lines should be drawn with line col. } \item{linecol}{ color for hline and vline. } \item{margins}{ a numeric vector containing 2 elements specifying the margins for column and row names, respectively. See (par(mar=*)). } \item{ColSideColors}{ (optional)character string of colors for annotating the columns of heat map. } \item{RowSideColors}{ (optional)character string of colors for annotating the rows of heat map. } \item{cexRow}{ cex.axis for the row lables. } \item{cexCol}{ cex.axis for the column lables. } \item{labRow}{ character strings indicating the lables of rows. Default to rownames(x) } \item{labCol}{ character strings indicating the lables of columns. Default to colnames(x) } \item{key}{ logical indicating whether the color key would be draw. } \item{keysize}{ numeric value specifying the size of color key. } \item{density.info}{ character string indicating whether to superimpose a "histogram", a "density" plot, or not plot("none") on the color-key. } \item{denscol}{ character string giving the color for the density display specified by density.info, defaults to the same value as tracecol. } \item{symkey}{ Boolean indicating whether the color key should be made symmetric about 0. Defaults to TRUE if the data includes negative values and to FALSE otherwise. } \item{densadj}{ Numeric scaling value for tuning the kernel width when a density plot is drawn on the color key. Default to 0.25. } \item{main}{ main title. defaults to none. } \item{xlab}{ x-axis label. defaults to none. } \item{ylab}{ y-axis label. defaults to none. } \item{lmat}{ position matrix for visual layout. See details from the help page of heatmap.2. } \item{lhei}{ column height for visual layout. See details from the help page of heatmap.2 } \item{lwid}{ column width for visual layout. See details from the help page of heatmap.2 } \item{\dots}{ additonal arguments passed on to image. } } \details{ This function plots the heat map of microarray and RNA-Seq gene expression data by modifying the scripts of heatmap.2 in R. The main modifications include: (1) designing several distance measures derived from Gini correlation and other correlation methods; (2) providing the option of quanbreaks for RNA-Seq data. } \value{ A list with the following components: \item{retval }{a list with components of "rowInd" (row index of heat map from x), "colInd" (column index of heat map from x), "call" (the match call), "carpet" (reordered and scaled 'x' values used generate the main 'carpet'), "rowDendrogram" (row dendrogram), "colDendrogram" (column dendrogram), "breaks" (break points for binning x), "col" (colors used), and "colorTable" (a data frame providing the lower and upper bound and color for each bin).} \item{hcr }{the values returned from gcc.hclust function for clustering individuals (e.g., genes) in row direction} \item{hcc }{the values returned from gcc.hclust function for clustering individuals (e.g., genes) in column direction} } \author{ Chuang Ma, Xiangfeng Wang } \note{ This function clusters microarray and RNA-Seq gene expression data and plot heatmap by refining heatmap.2 function in gplots package. Therefore, most parameters and output values are defined similarly as those in heatmap.2. } \seealso{ \code{\link{gcc.dist}}, \code{\link{cor.matrix}}, \code{\link{gcc.hclust}}, \code{\link{gcc.tsheatmap}}. } \examples{ \dontrun{ data(rsgcc) x <- rnaseq[1:50,] ghm <- gcc.heatmap(x, cpus = 1, method = "GCC", distancemethod = "Raw", clustermethod = "complete", labRow = "") } } \keyword{cluster} rsgcc/man/gcc.hclust.Rd0000644000175100001440000000552012155671256014504 0ustar hornikusers\name{gcc.hclust} \alias{gcc.hclust} \title{ hierarchical cluster } \description{ Hierarchical cluster analysis of microarrany and RNA-Seq gene expression data with Gini correlation and four other correlation methods. } \usage{ gcc.hclust(x, cpus = 1, method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"), distancemethod = c("Raw", "Abs", "Sqr"), clustermethod = c("complete", "average", "median", "centroid", "mcquitty", "single", "ward")) } \arguments{ \item{x}{a data matrix containing numeric variables, which is the same as the GEMatrix defined in the cor.matrix function.} \item{cpus}{the number of cpus used for computation.} \item{method}{a character string indicating the method to be used to calculate the associations.} \item{distancemethod}{a character string specifying the distance method to be used. Currently, three distance methods are available, include: "Raw" (1-cor)", "Abs" (1-|cor|), and "Sqr" (1-|cor|^2).} \item{clustermethod}{the distance measure to be used. This must be one of "complete", "average", "median", "centroid", "mcquitty", "single", or "ward".} } \details{ This function generate the cluster tree with different distance measures for clustering analysis of microarray and RNA-Seq gene expression data by integrating the hclust function of stats package in R (http://stat.ethz.ch/R-manual/R-devel/library/stats/html/hclust.html). Similar to the hclust, the values output by gccdist can be directly used to plot cluster trees with plot function. } \value{ A list with the following components: \item{hc}{an object describes the tree information produced by the clustering process. This object is also a list with five components: "merge" is a numeric matrix with n-1 rows and 2 columns. n is the number of used individuals (e.g., genes). Row i describes the merging of clusters at step i of the clustering. "order" is a vector giving the order of individuals for tree cluster plotting. "height" is a vector with n-1 numeric values associated with the distance measure for the particular cluster method. "labels" are labels of the individuals being clustered. "method" is the distance measure used for cluster analysis. See details for the description in hclust function of stats package.} \item{dist}{a data matrix containing the distances between different genes.} \item{pairmatrix }{a data matrix including the correlation between different genes.} } \author{ Chuang Ma, Xiangfeng Wang } \examples{ \dontrun{ #obtain gene expression data of 10 genes. data(rsgcc) x <- rnaseq[1:10,] #hierarchical clustering analysis of these 10 genes with GCC method hc <- gcc.hclust(x, cpu=1, method = "GCC", distancemethod = "Raw", clustermethod = "complete") #plot cluster tree plot(hc$hc) } } \keyword{ cluster } rsgcc/man/gcc.dist.Rd0000644000175100001440000000255712155671120014142 0ustar hornikusers\name{gcc.dist} \alias{gcc.dist} \title{ compute distance matrix for hierarchical clustering } \description{ This function computes the distance between the rows of a data matrix with the specified distance method. } \usage{ gcc.dist(x, cpus = 1, method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"), distancemethod = c("Raw", "Abs", "Sqr")) } \arguments{ \item{x}{a data matrix containing numeric variables, which is the same as the "GEMatrix" defined in the cor.matrix function.} \item{cpus}{the number of cpus used for computation.} \item{method}{a character string indicating the method to be used to calculate the associations. } \item{distancemethod}{a character string indicating the distance method to be used. Currently, three distance methods are available, include: "Raw" (1-cor)", "Abs" (1-|cor|), and "Sqr" (1-|cor|^2). } } \value{ A list with the following components: \item{dist }{a data matrix containing the distances between different genes.} \item{pairmatrix }{a data matrix including the correlation between different genes.} %% ... } \author{ Chuang Ma, Xiangfeng Wang } \seealso{ \code{\link{cor.matrix}}, \code{\link{gcc.hclust}}, \code{\link{gcc.tsheatmap}}. } \examples{ data(rsgcc) x <- rnaseq[1:10,] gcc.dist(x, method = "GCC", distancemethod = "Raw", cpus = 1) } \keyword{ cluster } rsgcc/man/gcc.corfinal.Rd0000644000175100001440000000232012155661234014765 0ustar hornikusers\name{gcc.corfinal} \alias{gcc.corfinal} \title{ get the final correlaiton and p-value of Gini method } \description{ Compare two correlations produced by GCC method for a gene pair, and choose one as the final output of GCC method. } \usage{ gcc.corfinal(gcccor) } \arguments{ \item{gcccor}{ a list output by cor.pair function for GCC method. } } \details{ If the p-value is "NA", the correlation with absolute maximum value is selected; otherwise, the correlation with lower p-value is chosen. } \value{ \item{gcc.fcor }{the final correlation of GCC.} \item{gcc.fpavlue }{the final pvalue of correlation.} %% ... } \author{ Chuang Ma, Xiangfeng Wang } \seealso{ \code{\link{onegcc}}, \code{\link{cor.pair}}. } \examples{ \dontrun{ data(rsgcc) x <- rnaseq[1:4,] #compute correlation between 1th and 4th genes #significance level of the computed correlation #is calcuated with 200 permutation tests. corpair <- cor.pair(c(1,4), GEMatrix = x, rowORcol = "row", cormethod = "GCC", pernum = 200, sigmethod = "two.sided") #get the final correlation and p-value of GCC method gcc.corfinal(corpair) } } \keyword{correlation} rsgcc/man/data.Rd0000644000175100001440000000047611766424570013367 0ustar hornikusers\name{data} \alias{data} \alias{rnaseq} \non_function{} \title{ example of RNA-Seq gene expression data } \description{ RNA-Seq profiled gene expression data of 100 genes } \usage{ data(rsgcc) } \examples{ data(rsgcc) x <- rnaseq[1:3,] #The first 3 genes in GEMatrix "rnaseq". } \keyword{ datasets } rsgcc/man/cor.pair.Rd0000644000175100001440000000665412006277625014172 0ustar hornikusers\name{cor.pair} \alias{cor.pair} \alias{gcc.cor.pair} \alias{gcc.corpair} \title{ compute the correlation between two genes } \description{ This function can compute the correlation of a pair of genes with Gini correlation and four other correlation methods. The signficance level (p-value) of computed correlation can be estimated with the permutation test method. } \usage{ cor.pair(idxvec, GEMatrix, rowORcol = c("row", "col"), cormethod = c("GCC", "PCC", "SCC", "KCC", "BiWt"), pernum = 0, sigmethod = c("two.sided", "one.sided")) } \arguments{ \item{idxvec}{ a numer vector containing two elements for the index of genes or samples in GEMatrix (e.g., c(1,2) ). } \item{GEMatrix}{ a data matrix containing numeric variables. Example: rows correspond to genes and columns to samples. This parameter is the same as the "GEMatrix" defined for cor.matrix. } \item{rowORcol}{ a character string ("row" or "col") indicating gene expression data will be extracted by rows or columns for correlation calculation. "row": correlation between two genes. "col": correlaiton between two samples. } \item{cormethod}{ a character string that specifies the correlation method to be used for correlation calculation. } \item{pernum}{ the number of permutation test used for calcluating statistical significance level (i.e., p-value) of correlations. } \item{sigmethod}{ a character string ("two-sided" or "one-sided") specifying the method used to compute p-value for permutation test. } } \value{ A list with the following components: \item{gcc.rankx }{a Gini correlation produced by using the rank information of the first gene (i.e., the first element in idxvec).} \item{gcc.ranky }{a Gini correlation produced by using the rank information of the second gene (i.e., the second element in idxvec).} \item{gcc.rankx.pvalue }{p-value of gcc.rankx.} \item{gcc.ranky.pvalue }{p-value of gcc.ranky.} \item{cor }{the correlation produced by "PCC", "SCC", "KCC" or "BiWt".} \item{pvalue }{the p-value of cor.} } \author{ Chuang Ma, Xiangfeng Wang } \note{ (1) To perform BiWt, the R package "biwt" should be installed in advance. (2) When the cormethod is defined as "GCC", this function will output a list with four numeric elements: gcc.rankx, gcc.ranky, gcc.rankx.pvalue, gcc.ranky.pvalue. Otherwise, it will output a list with two elements (cor and p-value) } \seealso{ \code{\link{onegcc}}, \code{\link{cor.matrix}}, \code{\link{gcc.corfinal}}. } \examples{ data(rsgcc) #load the sample data in rsgcc package x <- rnaseq[1:4,] #construct a GEMatrix with the RNA-Seq data of the first four genes #compute correlation between the 1st and 4th genes corpair <- cor.pair(c(1,4), GEMatrix = x, rowORcol = "row", cormethod = "GCC", pernum = 0, sigmethod = "two.sided") \dontrun{ #compute correlation between the 1st and 4th genes, #the p-value of correlation will be estimated with 2,000 permuation test. corpair <- cor.pair(c(1,4), GEMatrix = x, rowORcol = "row", cormethod = "GCC", pernum = 2000, sigmethod = "two.sided") #compute correlation between the 1st and 4th samples corpair <- cor.pair(c(1,4), GEMatrix = x, rowORcol = "col", cormethod = "GCC", pernum = 0, sigmethod = "two.sided") } } \keyword{ correlation } rsgcc/man/cor.matrix.Rd0000644000175100001440000001332012006302307014511 0ustar hornikusers\name{cor.matrix} \alias{cor.matrix} \alias{gcc.cor.matrix} \alias{gcc.cormatrix} \title{ correlation calculation for a set of genes } \description{ This function provides five correlation methods (GCC, PCC, SCC, KCC and BiWt) to calculates the correlations between a set of genes. } \usage{cor.matrix(GEMatrix, cpus = 1, cormethod = c("GCC", "PCC", "SCC", "KCC", "BiWt"), style = c("all.pairs", "pairs.between", "adjacent.pairs", "one.pair"), var1.id = NA, var2.id = NA, pernum = 0, sigmethod = c("two.sided", "one.sided"), output = c("matrix", "paired")) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{GEMatrix}{ a data matrix containing the gene expression data of a set of genes. Each row of the GEMatrix corresponds to a gene, and each column corresponds to the expression level in a sample. } \item{cpus}{ the number of cpus used for correlation calcluation. } \item{cormethod}{ a character string that specifies a correlation method to be used for correlation calculation. } \item{style}{ a character string that indicates the all or partial genes to be used for correlation calculation. } \item{var1.id}{ a numeric vector specifying the row numbers of genes. } \item{var2.id}{ a numeric vector specifying the row numbers of genes. Suppose the var1.id and var2.id are respectively c(1,2) and c(3,6), then the the correlation of gene pairs (G1,G3) and (G2,G6) will be calcuated. For styles of "pairs.between" and "one.pair", this parameter MUST be pre-defined. For the other styles, this parameter can be automatically defined by the program itself. } \item{pernum}{ the number of permutation test used for calcluating statistical significance level (i.e., p-value) of correlations. } \item{sigmethod}{ a character string ("two-sided" or "one-sided") that specifies the method used to compute p-value for permutation test. } \item{output}{ a character string ("matrix" or "paired") that represents the output format of correlations. Specifiying the "matrix" will output two matrix for correlations and p-values, respectively. Specifiying the "paired" will output only one matrix, in which each row provides the information of gene pair, the correlation and p-value. } } \details{ Given a data matrix (e.g., microarray and RNA-Seq gene expression matrix), calculating correlation with GCC and other correlation methods for partial(or all) individuals (e.g., genes). The statistical significance (i.e., p-value) of each correlation is derived from the permutation test. Parallel computing options are also provided for speeding up the correlation calculation. } \value{ A list with the following components: \item{corMatrix }{correlation of gene pairs shown in matrix form. This data matrix is generated only when the output format "matrix" is specified. } \item{pvalueMatrix }{p-value of correlations shown in matrix form. This data matrix is generated only when the output format "matrix" is specified. } \item{corpvalueMatrix }{correlation and p-values listed in one form. This data matrix is generated only when the output format "paired" is indicated. } } \author{ Chuang Ma, Xiangfeng Wang } \note{ (1) The rsgcc provides the RNA-Seq profiled expression level of 100 genes as a sample data to implement cor.matrix, cor.pari and other functions in the package. After running the command: data(rsgcc), the expression data of these genes will be loaded to the GEMatrix "rnaseq". The user can also load the GEMatrix from the gene expression file, which should be in a textual format of a gene expression matrix. An example of the gene expression file(e.g., "/home/rsgcc/geneExpFile.txt") is shown as follow: sample1 sample2 sample3 sample4 gene1 45 65 77 75 gene2 75 78 83 39 gene3 2 11 10 6 Then the GEMatrix can be obtained by load this gene expression file with the command: x <- as.matrix(read.table("/home/rsgcc/geneExpFile.txt")) (1) var1.id and var2.id should be defined with the numeric vector format for "pairs.between", or "one.pair" styles. (2) To perform BiWt, the R package "biwt" should be installed in advance. (3) To perform the parallel compution, the "snowfall" package in R should be installed in advance. } \seealso{ \code{\link{cor.pair}}, \code{\link{onegcc}}, \code{\link{cor.test}}. } \examples{ \dontrun{ data(rsgcc) #load the sample data in rsgcc package x <- rnaseq[1:4,] #construct a GEMatrix with the RNA-Seq data of the first four genes #run on one CPU for all the possible gene pairs in the GEMatrix "x". #do not cacluate the p-value of computated correlations. cor.matrix(x, cpus = 1, cormethod = "GCC", style = "all.pairs", pernum = 0, sigmethod = "two.sided", output = "matrix") #run on two CPUs, snowfall package should be properly installed. #cacluate the p-value of correlations with the 2000 permutation tests. #output the results in "paired" format. cor.matrix(x, cpus = 2, cormethod = "GCC", style = "all.pairs", pernum = 2000, sigmethod = "two.sided", output = "paired") #calcuate correlation on the pairs between the 1st, 2nd and 3rd genes in the GEMatrix "x". cor.matrix(x, cpus = 1, cormethod = "GCC", style = "pairs.between", var1.id = c(1:3), var2.id = c(1:3), pernum = 2000, sigmethod = "two.sided", output = "matrix") #calcuate correlation on the adjacent genes ((G1,G2), (G2,G3), (G3,G4),...) in the GEMatrix "x". cor.matrix(x, cpus = 1, cormethod = "GCC", style = "adjacent.pairs", pernum = 2000, sigmethod = "two.sided", output = "matrix") } } \keyword{ correlation } rsgcc/man/adjacencymatrix.Rd0000644000175100001440000001047712155662654015626 0ustar hornikusers\name{adjacencymatrix} \alias{adjacencymatrix} \title{ adjacency matrix calculation } \description{ This function generates the adjacency matrix for network re-construction from gene expression data with different methods including Gini correlation (GCC), Pearson correlation (PCC), Spearman correlation (SCC), Kendall correlation (KCC),Tukey's biweight correlation coefficient (BiWt), mutual information (MI), and maximal information-based nonparametric exploration (MINE) statistic methods. Euclidean distance (ED) between two genes can also be calculated. It was implemented these methods in C language and parallel mode, and thus is greatly faster than the cor.matrix function. } \usage{ adjacencymatrix(mat, genes.row = NULL, genes.col = NULL, method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"), k = 3, cpus = 1, saveType = "matrix", backingpath = NULL, backingfile = "adj_mat", descriptorfile = "adj_desc", ... ) } \arguments{ \item{mat}{a data matrix containing gene expression dataset where rows defines for genes and columns for samples.} \item{genes.row}{if genes.row and genes.col are not NULL, a subset of genes will be selected for correlation calcuation and set as the rownames of adjacency matrix. Currently, doesn't work for BiWt and MINE} \item{genes.col}{if genes.row and genes.col are not NULL, a subset of genes will be selected for correlation calcuation and set as the colnames of adjacency matrix.Currently, doesn't work for BiWt and MINE} \item{method}{a method used to calculate the association between a pair of genes.} \item{k}{the number of nearest neighbors to be considered for estimating the mutual information. Must be less than the number of columns of mat, and only work for the mutual information(MI) method.} \item{cpus}{the number of cpus will be used for calcuation.} \item{saveType}{the type (matrix or bigmatrix) specified for the output.} \item{backingpath}{the path used to save big matrix. If it is NULL, current working directory will be used. Works only when the saveType is "bigmatrix".} \item{backingfile}{the file name of big matrix. Works only when the saveType is "bigmatrix".} \item{descriptorfile}{the description file of big matrix. Works only when the saveType is "bigmatrix".} \item{\dots}{Further parameters passed for MINE method. More information can be found in R package minerva.} } \value{ \item{value}{a matrix (or big.matrix) recording the associations between the gene pairs.} } \references{ [1] Ma C and Xiang XF. Application of the Gini correlation coefficient to infer regulatory relationships in transcriptome analysis, Plant Physiology, 2012, 160(1):192-203. [2] Sales G and Romualdi C. Parmigene-a parallel R package for mutual information estimation and gene network reconstruction. Bioinformatics, 2012, 27:1876-1877. [3] Davide Albanese, Michele Filosi, Roberto Visintainer, et al. minerva and minepy: a C engine for the MINE stuite and its R, Python and MATLAB wrappers. Bioinformatics, 2013, 29(3): 407-408. [4] David N. Reshef, Yakir A. Reshef, Hilary K. Finucane, et al. Detecting novel associations in large data sets. Science, 2011, 334(6062): 1518-1524. [5] Johanna Hardin, Aya Mitani, Leanne Hicks and Brian VanKoten. A robust measure of correlation between two genes on a microarray. BMC Bioinformatics, 2007, 8:220. } \note{ 1) The mutural information estimation is based on k-nearest neighbor distance (Sales G and Romualdi C, 2012). Thus the parameter k only works for the mutual information method. 2) Two correlations can be produced by the GCC method by reciprocally using the rank and value information of one gene (or variable). Here the correlation with the maximum absolute values is selected for generating the adjacency matrix. 3) More information about the big.matrix can be found in bigmemory package. } \examples{ \dontrun{ mat = matrix(rnorm(180), nrow = 10) rownames(mat) <- c(1:10) colnames(mat) <- c(1:18) mat #using GCC method to compute the correlation of all gene paris adjacencymatrix( mat, method = "GCC", cpus = 2 ) #using GCC method to compute the correlation of a subset of gene pairs adjacencymatrix( mat = mat, genes.row = c(1:5), genes.col = c(5:8), method = "GCC", cpus = 2 ) #for MI method, k works here. adjacencymatrix( mat, method = "MI", k= 3) } } \keyword{ correlation } rsgcc/data/0000755000175100001440000000000012155670707012314 5ustar hornikusersrsgcc/data/rsgcc.rda0000644000175100001440000001570112157732556014114 0ustar hornikusers‹…[w\TG×F»(vT@A`{Ár½¹ô^„VPA±"6°k4vÅKÔh‚oÔ5j‚=öÞk,à73/Ù½úûöà™;å”ç<çÌM¨g´¤yts###c£†F ŒŒ’62&ÿi`ÔȨùÛ8{¸6G7’ 0522±÷~C ß_µ5ªû‡¨¡÷µ„Tþ¹½yÅäS}ÞÈ€Õ'Nn qM“5gÿÙ "Qງ"`ˆäýU›Ó÷ùñòÁ/ÏÞWºŸ矯y Å?þI~AèøL#øBfü‡­W@z~?]¬¤s^i†¶›yO»xôYêŽÃ~žUh²rnÜó¹Ð¹pò½žÛðó§nÞÄÚ‹ ª™¢ià:ßÈày6†MØYìhÜÒW–îhqò¤šlúø0c&ߟsëÍo¥ñ½¡tO¼=ù—à}ãížs{O/ñH×ykM ±†ì¢y®6/ñÒ[V‚"×;gâÅk0¹žÓèß{߈+à½}ÀEAÁ|>Ñwžv¤†ë;>˜\ã<¼3 ö Úç˜Û×òÏ^ ÎöýKSç¶!½äÛ™ÑczòÏIÆDê6gÀ묗ªZݦvéØ×u$tlÔaþ(úÙv-´z©×ùJÿ^ÏyìVGCl5ʿ͸Ʒ—•=ï?²†x¦© "ÿ·•»×@ò’‚Û>Û,û|råç71»¡ûÌ;Î ·¸;ö™#_%§@qÛ/Äéº/äéT~ü¯Q¨B ,îþq+‹{‚ùóÕŸ>ÿÀ?‡ü­šãÿƾðûeœƒ´2·eÀ1høÔ¸%i¼¿È‘Ìnÿrœwtÿ}Ùˆ]´xvÐ;ùy*ÙÈ3ˆpswÿ8ÄÛñy,2|ŸŸGì±ÉA!’ÍßSŃ¿•)$üÔ5(/ñ2ŒÙÏü€ω|Û½h<8÷‹9>CǽÎ/¶…é ñfßÔØ.[Á1¡ŸÂLfAʫɌ0ðjŸ)ÿ¸{çí{·~j÷át£òç”=׎rº\_ßÚ‡ì¼à=ý,¸ämõggh¾ßºCá¸&1“3ü›s‘o¬· ){{mjlížõ¢ŠO›²ç:ß?Ãç îH¦½üò Én›Šùþ&ôô‘i…eŸ÷×wÄÌd!ÙØ‚ÈÌ÷Ÿ‹:Ú=ZWxêñ>°SŠK4šÈ¿§ iùãÂÇwøçÜâ‰{\w¯Ñl~pn9’ xÂ÷w_ÞÈ}ÊÍJ^ü–ôh‘ßЇ»8³Ps÷gðE´ie £'QóËýÛ8Æÿõ‹KÀºÇV¥tçßëR³}ö»u |ù¢Š%Çî‚ GÍÀ &lC{Ò=Dµ~6—:ÿžô9Vñ(Èpë[ö½¹ØMa×çÁ~CK˧ýf7ÚhOÕîÈ-›²çý§.Ÿ 1÷fz,œØ<»Æ“ûq¿H^ûµ]RêÕÚ{ïtµ À­ƒªáõÅo‡ñó%gêº/L¬\05a ß-1 `è=±{ÜÁNJ€ ²lÑ÷`¾¡_îÇðUü¸‚éí‡/·JºÐ`p;ôƒûáãÕ|ÿ ¥DŒ·aPÙùšW?yóíÆùN*Ÿ,þ¹Ç@¦/˜n‹~¡±3:³·} ø»YYÜÝò–ýJ…ï¹#€<(ÿ@÷'G²8ÙÿY–hµ`{P«)æ€ø ÛÏv{y¯¸jÒªiÁ–GŽšcZ_gê÷EÍ Ü¦ !:ÿΠ†öKUËB !óc§•¿æ‡}$¼Ì[Ë¿«fòäž8Lr4S ‘©dÕˆe|ÿÌIè¿ÁÛÐ?‹|év“êår¥Öª|Çn°HAûï‰x-o‹8„¸êð7…Ù6`á(î{ÇyØ•²ýÔë/ù ˜Ð8:Öì‚1$ X‘NçvpX òSeÄpÿàÇ»DVѯ{óŒdzån¯úhãÿÃZlGÂ×Õ70~SØ»ÊËí¹ªö'9û–þMþàñ쬅þ¯Ÿ¢×Y?à[6m.yž6ß戛)é×]ÈŽÁvA¤s³Úbþ=˜G¢˜l8:gÀ]n¸~\ë7£Çô¿,hãºíªsúT =0^10™4¸—ÍÏÝ?t?Þﱞn3@º–¸åü`WEá.šïOkM¼4=G°NPõθ¨[ÀÆýí£Þ~ë¡¿Ý@—ÉcýÁÌ÷ܶô„½ü¸ e/!‹Åwp_²ªÌéV+haT~d«OgˆûžŠa4ŒêÇ߈gŒ'ðÏÝB]7é-‡¼lÄû´MÅvÇ€·%…ýî0ä±"·’z½=¡t@ ×p\ºò²ðINmþ‹øÁµõù6à9›¸kÕŸPdƒ¸åÌÜ¢/x(+\žEþ Þ·HØ[:Ÿt¬ÙýgO~þøOĽրöbååvÅŽ`V¶ew`ðQ¾ÔZô—س”ætäÛ£ŽR¸ÏåjæÎKí,™ ÚûdšÕ'ëßk„ñ5â:ò»²Á4ÜwûÒÏUY®HŽTdBÏ¿žÛÏzØú3Øï Ý#î­Ü+ ´1øÌmnf|Ãè,4ýiÑÇ“ 㿎k$ w~àü=ùãû&µ'bÊ¿ä<Ù•¶Ú5ñÝeqk~|L"îªÿØîâÊí|»ÏÃè;u}øçäÑ$ОƒñWÈ6:­âîWŸnbþ"BžUËþ^ ÖÕÈû¿uúð¨ª7ÿµbò=°]:hEç¦ßAtëˆeÄ“ þ³îé¤;’Ò•^üün@ÝÈò#ŸâΧÀjhŸßúÈýK”H éVîÅÞ]ç$nòp/Oâ-â÷ÜóËyŸ]oØ@ו‡w¿m— êz~Áϧ:@´&äÁQ˜ž¡÷É|š(€osJ€ ×SrêÁ§øqÅ(6…¨š É?mž©9Câqú§%Ä8Z^½´¡Ì¾¹·K†L‰Þǽw‰)¹iÆû%tWw»þô2Ì®D»Œ(ÄüB£U’ ^@>£‰½ùñPXкlÑ~èm³ëô¸©0PŽ|J›€úsoNÕbý®`|›r”Ù)x^d¼lH2 ö"Ót{£·;Rb¹¿ô[qÕi…«’Ç3¾ .#—œˆÚýÎÐp“îGoqšyñó§Œfx ÞK0¿'¡?¹Ng| ÆD¡ö´ÑkIƒé1zAÓçæ Ä‚o˯ùþ±]XRž ž%@‡q±]ÇJ*0²Õ m×5Ü2L½¹¸Úî§:•M¯öèÁ^85<‚á.¸ÞßC&J‡ÜÑÌ JüøöŽ’U„·Ñ´¸S½]fÐð#´ƒ˜¦ïz¹hPn“m˜¾¹ªWï|WÇL„ ‚¢)Óùq‰&È"†"Îu‹sMßù„_ÁzHÄC†ËÐñ(òµŒPÔ·¢-¥ÔÃ0VIÓßñõöñaMÂì1g ×6žo?ôè %´üsȨ_¿lâ­Í7@マöšOw!éæ}®ñ,ßáÎ^¼ô~‡×0¹=m–C/›ñü<$®ü-<Ö‘Sé ä’üÖaìIúýæ0h  )÷€ÙŒ#²"^ÂanSô¿‰¨¿¨Ës(óù¯'?ZîÉ…P[Œ/º…T,é0£Úû†³üzf›ÇTüpp d¯¡b é.hç!µÈ#²õyŒû„1ö´h öw‡;ïü] yôÃìH<¯û¶#$±þì/úßz³ï!é&…1SwHlüv7È‹‘Ÿƒ qÙ±ÃW~~]G¬yga¾ÇMGÿV|Ä|yìdæ¯ÐÇç’Õ×áâ»1As¯7­Xš‚×**¾t«ŸÀðXŒ³1Í׬gõ[ñìÀNîÞó»¥%y" [Ab‰r³¼éuk÷í®6 y²ÛF†OÜÛ”ñÕO®ƒïT‚^ w€¤†¨sÁvÈ Äü¶’ÄnŒŠÃ|2|/òæp׳Ó+Gôå÷ÅÍcu&ÄÕÚ½D}'Á”.,¿æÛ±<¾Þ‚o‡YÄ\Ë‹¡ûsÂ^›XCD…ÛK’JAHóñ‘¯»Å»”A¹½W@æ´•@r~b–Qú5(ØÅpA°Ÿ^W°.P[•ß›û*§éõ‡5À|3°]{¢ø¶`dêíW–qRG þªŽ1}rëâóË£4ÀAQê5ͶÖsïA¢MA¾ ]÷å`yòäƒ.ŸŸ‚¿‚°GãQ`éJÃh ]B»šËð¦Ó´£3 ºŽÏ\…¹<=CÝx hvŸú"kl½±â¿õ5~½‘sËãOW@ø.Ì›gœDœI@?É´Ó÷GÌ7ã>:»‚oÏíÉp‡ÈIB ‡`)ÂúEÎv<¨óW~¼ÇPšî6y£93FÞÊ·§ßǺœÅG[õÁg|{árŒEsžSœ#zò±–4¬½¼°‘ñ~¾?<çÇÚ#™ ë‚uÿ;hb÷¥Ÿ{iRÞôùýpú‡ÕÅ@{qÊ*uæÓ8' ¤F"?ñPõ¢Ÿ²Þgûx©A÷õ½ ÏJÄ1® Fdz<Ü*)}haŽ–4°ðï÷Laòä÷ñª Æã$þè8.iò¢)XŷǶc ÿ<¸ æIåˆÝê~]õ h:²ú HþÇòJîµf‚é¹K±Öy…ÈÍi4A.ÿI/þÖîi.òÏ^1˜Ÿ§*'~du^ÈX}—@~ùªÉù¶&û¬¯«z~ÎÏÓ§”¤SÚû`w†ÐÉ·ýc0ùðÏñœ^=WX^~ V˜a=eÀÊ^$á‚7‘oÉÓ1_ ÉxÿÞa¼§¬'ó"ŽÝ ’6n¥Ä¶]>Æx\’-ÖQ“sôêWß”W‘#êEýœ¬ú<3z 7öß|/n©~?°«;ÆÍ¸¸8ó“^¼áû#R˜ü¿ûY…qÒ³1%ÐN|»Ë ÆŸùçd:ÞZHÐÖÚ ÔÞœѬ`>h.Š ˆú`¾5½HßýÒò—¯á{ßÜŸuï^sLô‚Ü\ýºcÈp–—Cr„>ïêgó?š æ q@¾d}ã£Ã¶Þ¹àé*]núë­o®«´Â¸2ŠGÓã(!õ âjÀJ}ü ¬ç§|»æ<»ïàŸ£÷c]}JÂRj—ÜÓ•ßúP¸™@GÍ,/Í#PÍÂúAx¬›ôX…ùjj'¡&E¿ _Žõˆ‹±þäïŽõ…ÄÕ~…è7¡¿ÐkX„­a àPÃöËïoÈZäuªJ’ÄE×*‚¾í^€"Å%Èí…ùyL[\7ìa q!Ùxc÷#ÜÛÙXßH4+Q­¿?,|†þê—ˆyÍŒ3z¸Æý3ø÷#‹ƒêö!?¸ÿ§Ës`Ó ëÁññwúÑk­š™`|ãk¬“~,ö»Ÿ ø@Ëò¶PR‹üjPÝv&D„Ñ0d ;‘Ï„ìYK9~·>Þ¶®J ൌïòójŽþ5yã?Ü=ŸˆÏ.ÌÍFýûÕ¾70ÿô“Pš=Rh™+R*Q^ÑÏ;7=F ÍÓ’MƒùÒ’›fÙÕÐ×ë-cz3¹qµ/ïQâ ±ÎýÿúGÀi~k€æÕÐFŽß¤Å}{½b<›»Ótáé½ý;BrWÌ[%îì~[0_f™^]³Þ?Rð~Sö€Ð’^ òâ’çi °èq¥Œd2ü¸?¬Fÿ…÷Á!IìÂlÿ¢ežþ“ûÒžÒ‹3‚õ\Î ß^Þ÷ë?ñ(d«ÓÀ¨4¬?xÔù+÷ ëW]µþGÇz^4¬Ú‹¼ Ú7çr@<õúí5=Ø>V9UºÂ|Ž“ºèt˜ç]Ãû‰a¸×Yž.Øïðøˆ´’]„AðdŒw)c˜¹÷Ží}öpõj¦gèYw?³ÀëH1Ù½ ´¦·Kº"Þ. tQ~|Á¯ãcqBÒq~‹[7ÎöÆŸ~±x+‡Ä)'.Œ™1¡É·êqOõLÇBŽ/Þ%‹q?O{¾1Ò+°nÑb{Û”£ø}<;׺±l@}>!³1þæþ­Ÿ833 ³=q¾N«Ï~¬çï0ã¶­ô‘EpõuHªÀúzöB”ƒØ¸õ~ž`Ó‘<`þ ýý[X,3|ë4þÙ¼bGâ¶ýnõÏòÉëûÎÞ›ñ{–N#F{O/˜ =Æ/~ë8ô$oÐ@û¾ûØ=ÿ^ÔQ¬“ˆ«g43¾ÑTuöÞvìÍR€bÑ6ZH€ÌI,ž×ïó3-@÷kO^†ZùÈoÌJôîk`| ãíÖëYAô¿«ê³ƒ†¹Î0n^[`oÖ¢ã—Ïy͆™%(·ø]÷Â{}úÙvÍr˜ö/æ637yÛ` pfø}’©ßæµ?t8 šNèO¹›1ÏS”cؽBQкl1„´f÷\àùú‰Å™Ýùî¢Hm8T¾à÷“rãW[õ‚¸ ¥–»Xù]ŸÁe \ƒ÷Ø9g1OWaý7×ëÇ SØw9\Mé´NIç#¡Öøy—b¼~÷SF²ü™·Ë*«wÚ“AϺï1Ò!ÞK:SZàÃN¡Üs1ŽX{_+YrÊBCêãsjCÌ“t@ém ;ƒ~1|7òÝþq.Ï"/œ† ®1{Ųýóï»¬Äø2tˆÏDG¬ƒMÜÂê–|;ç‹~<Íå¢yNéÜ@Hq@û“*QZ/¼ÐÕ2À€Þ/Yü®÷¿>ŒBIÁR½¼¢gã ß²~œ÷Ä;±9ûn ÌM‹ÏÛ7†ÞÀõBV›CàºMܹ܆ׯ¶mI¥èÏŠ¹,î æÓì¦×•"A{tK¼géÚƒ~žd="°ŽnÕ0rÑýõrù.ïË ¨¯èˆï]œ—ži¾»˜Ÿ|½Ø0uH&ùÓʈ~àhd”ŠÿÖÒ” ®ÍÔÑ’€yݸ6l,ö·ò ÕÄj$>b…D)’eJ•Ü Q¤I%_F¶qó«Õj…ØY–èí#‰ Ë ™Ze8-YLe¸–H!W¨e†#EJ¥\aØ(—ÈÕjƒF©J)•‰ëw%ao:K¿º+Ò§®%•ªU ÁZ"‘D!2|]M¶jx()Ù«ÚPVR…J¥Ì)“©¿2§H¢4<”L¢–H•†#Uäl†sŠ ‘L ¥L®ˆTNõ%h”É%RÃF‰D%êN­–n^ªT¨$‚³‹åНHž(ÏðDJ‘X*7Ü<‘¼R,8¦D"‘.$+²¯˜®B ‘Z-—Ž$ÊP«û”ËÄRCˆTJ¥ÂpN¹X-R FJÄj‰`N©H,x]LKƒYõ:[F¥geè݆ç¦ë²¿ì68+#C7¼î©y˜N—š(Oôt þ²Ö"ý§©•Wfrö˜¬DÉǵñžš•3B—ù¥ÝȈ&äµÿ$Ä ‡†2rsgcc/data/datalist0000644000175100001440000000001611730021766014032 0ustar hornikusersrsgcc: rnaseq rsgcc/R/0000755000175100001440000000000012155673376011611 5ustar hornikusersrsgcc/R/tsheatmap.R0000644000175100001440000003075312155671553013725 0ustar hornikusers ################################################################################################################# ##This function plots HeatMap with different similarity measures (e.g., 1-CorCoef) for tissue/condition specific genes ##Here method could be GCC (Gini correlation coefficient), ##PCC (Pearson product-moment correlation coefficient), ##SCC (Spearman's rank correlation coefficient), ##KCC (Kendall tau correlation coefficient) ##BiWt(correlation estimates based on Tukey's biweight M-estimator) ##MI (mutual information) ##MINE ##ED ################################################################################################################# gcc.tsheatmap <- function(x, cpus = 1, ## correlation method method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"), distancemethod = c("Raw", "Abs", "Sqr"), #hclustfun = hclust, clustermethod = c("complete", "average", "median", "centroid", "mcquitty", "single", "ward"), #hcdata by output gcc.tsheatmap rowhcdata = NULL, colhcdata = NULL, keynote = "FPKM", ## dendrogram control symm = FALSE, ## data scaling scale = c("none","row", "column"), na.rm=TRUE, ## image plot revC = identical(Colv, "Rowv"), add.expr, ## mapping data to colors breaks, symbreaks=min(x < 0, na.rm=TRUE) || scale!="none", ## colors colrange = c("yellow", "red"), tissuecol= "heat.colors", ## block sepration colsep = 0.15, rowsep, sepcolor="white", sepwidth=c(0.05,0.05), ## level trace trace=c("none","column","row","both"), tracecol="cyan", hline=median(breaks), vline=median(breaks), linecol=tracecol, ## Row/Column Labeling margins = c(5, 5), ## plot labels main = NULL, xlab = NULL, ylab = NULL, ## plot layout lmat = NULL, lhei = NULL, lwid = NULL, ## extras ... ) { cat("Dimension information for clustered GE matrix:", dim(x), "\n") #get axis position for tissue bar getAtpos <- function(genenumVec) { posVec <- rep(0, length(genenumVec)) sum <- 0 for( i in 1:length(genenumVec)) { posVec[i] <- sum + genenumVec[i]/2 sum <- sum + genenumVec[i] } return( posVec/sum(genenumVec) ) } #color key scale01 <- function(x, low = min(x), high = max(x)) { x <- (x - low)/(high - low) x } ################################################################################### dendrogram <- "none" #no dendrogram, as only tissue specific genes are considered Rowv <- "none" #row will be ordered by script showning below Colv <- "none" #col will be ordered by script showning below retval <- list() scale <- if (symm & missing(scale)) {"none"} else { match.arg(scale) } trace <- match.arg(trace) # if (length(col) == 1 && is.character(col)) # col <- get(col, mode = "function") if (!missing(breaks) & (scale != "none")) { warning("Using scale=\"row\" or scale=\"column\" when breaks are", "specified can produce unpredictable results.", "Please consider using only one or the other.") } if (is.null(Rowv) | is.na(Rowv)) { Rowv <- FALSE } if (is.null(Colv) | is.na(Colv)) { Colv <- FALSE } else if (Colv == "Rowv" & !isTRUE(Rowv)) { Colv <- FALSE } di <- dim(x) if(length(di) != 2 ) { stop("x is not matrix?") } if(!is.numeric(x)) { stop("x must be a numeric matrix") } nr <- di[1] nc <- di[2] if (nr <= 1 | nc <= 1) { stop("`x' must have at least 2 rows and 2 columns") } if (!is.numeric(margins) | length(margins) != 2) { stop("`margins' must be a numeric vector of length 2") } ################################################################################### #get cluster information and , ordered geneID if( is.null(rowhcdata) ) { #no hcdata, ok, caclulate it hcr <- gcc.hclust( x, cpus = cpus, method = method, distancemethod = distancemethod, clustermethod = clustermethod) #clustered for rows }else { hcr <- rowhcdata } ddr <- as.dendrogram(hcr$hc) ddr <- reorder(ddr, TRUE) rowInd <- order.dendrogram(ddr) ##for hcc data if( is.null(colhcdata ) ) { #no colhcdata, ok, cacluate it hcc <- gcc.hclust( t(x), cpus = cpus, method = method, distancemethod = distancemethod, clustermethod = clustermethod) #clustered for columns }else { hcc <- colhcdata } ##get ordered sampleID tsMatrix <- uniqueTissues(x) newx <- x[rowInd,] geneTSVec <- rep(0, nrow = nrow(newx)) for( i in 1:nrow(newx)) { meanVec <- rep(0, nrow(tsMatrix)) for( j in 1:nrow(tsMatrix)) { sampleIndex <- which(tsMatrix[j,] > 0) meanVec[j] <- mean(newx[i,sampleIndex]) } geneTSVec[i] <- which(meanVec == max(meanVec))[1] } orderMatrix <- matrix(0, nrow = nrow(tsMatrix), ncol = 4 ) colnames(orderMatrix) <- c("TissueIndex", "startGeneIndex", "EndGeneIndex", "GeneNum" ) rownames(orderMatrix) <- rownames(tsMatrix) for( i in 1:nrow(tsMatrix)) { GeneIndex <- which(geneTSVec == i) if(length(GeneIndex) > 0) { if( length(GeneIndex) != (max(GeneIndex) - min(GeneIndex) + 1) ) { #gene index not continuous, find gene index with max continuous length conlenmatrix <- matrix(0, nrow = length(GeneIndex), ncol = length(GeneIndex)) for( x in 1:length(GeneIndex)-1) { for( y in (x+1):length(GeneIndex) ) { GeneIndexSub <- GeneIndex[x:y] if(length(GeneIndexSub) == max(GeneIndexSub) - min(GeneIndexSub) + 1) { conlenmatrix[x,y] <- length(GeneIndexSub) } }#end for y }#end for x maxconlen <- max(conlenmatrix) if( maxconlen > 0) { maxrow <- apply(conlenmatrix, 1, max) rowIndex <- which(maxrow == maxconlen) colIndex <- which(conlenmatrix[rowIndex[1],] == maxconlen)[1] GeneIndex <- seq(GeneIndex[rowIndex], GeneIndex[colIndex], by=1) }else { GeneIndex <- c(GeneIndex[2]) } } }else { #no ts-gene for this tissue GeneIndex = 0 } orderMatrix[i,1] <- i orderMatrix[i,2] <- min(GeneIndex) orderMatrix[i,3] <- max(GeneIndex) orderMatrix[i,4] <- length(GeneIndex) if( length(GeneIndex) == 1 & GeneIndex[1] == 0 ) orderMatrix[i,4] <- 0 } colInd <- NULL orderMatrix <- orderMatrix[sort(orderMatrix[,2], decreasing=FALSE, index.return = TRUE)$ix,] print(orderMatrix) for( i in 1:nrow(orderMatrix) ) { curTissueIndex <- orderMatrix[i,1] sampleIndex <- which(tsMatrix[curTissueIndex,] > 0) colInd <- c(colInd, sampleIndex) }#end for i newx <- newx[, colInd] #get col for unique tissues TissueCol <- NULL if (missing(tissuecol) ) { TissueCol <- rainbow(nrow(orderMatrix)) }else if( is.function(tissuecol) ) { TissueCol <- tissuecol(nrow(orderMatrix)) }else { if( is.null(names(tissuecol)) ) { stop("Error: tissuecol is a vector parameters whose names are unique tissue names") }else { if( length(tissuecol) != nrow(tsMatrix) | length( which( (sort(rownames(tsMatrix)) == sort(names(tissuecol))) == FALSE)) > 0 ) { cat("Error: the input and detected unique tissue name is not the same, pls change the input ones to detected ones.\n") cat("input tissue names:\n") print( names(tissuecol)) cat("detected tissue namers:\n") print( rownames(tsMatrix)) stop("========") } TissueCol <- rep("NA", nrow(tsMatrix)) for( i in 1:length(TissueCol) ) { TissueCol[i] <- tissuecol[ which( tissuecol == rownames(TissueCol)[i])] } } } RowSideCol <- rep("NA", length(geneTSVec)) for( i in 1:length(RowSideCol) ) { RowSideCol[i] <- TissueCol[geneTSVec[i]] } ################################################################################################# call <- match.call() #add 20130514 retval$rowInd <- rowInd retval$colInd <- colInd retval$call <- match.call() x <- newx x.unscaled <- x if (scale == "row") { retval$rowMeans <- rm <- rowMeans(x, na.rm = na.rm) x <- sweep(x, 1, rm) retval$rowSDs <- sx <- apply(x, 1, sd, na.rm = na.rm) x <- sweep(x, 1, sx, "/") } else if (scale == "column") { retval$colMeans <- rm <- colMeans(x, na.rm = na.rm) x <- sweep(x, 2, rm) retval$colSDs <- sx <- apply(x, 2, sd, na.rm = na.rm) x <- sweep(x, 2, sx, "/") } #for breaks if( missing(breaks) ) { breaks <- quantile( unique(c(x)), probs = seq(0, 1, 0.01), na.rm = TRUE) }else if( is.null(breaks) | length(breaks) < 1) { breaks <- 16 } if (length(breaks) == 1) { if (!symbreaks) { breaks <- seq(min(x, na.rm = na.rm), max(x, na.rm = na.rm), length = breaks) } else { extreme <- max(abs(x), na.rm = TRUE) breaks <- seq(-extreme, extreme, length = breaks) } } nbr <- length(breaks) ncol <- length(breaks) - 1 if (class(colrange) == "function") { col <- colrange(ncol) }else if( is.null(colrange) ) { if( !require(grDevices) ) install.packages("grDevices") require(grDevices) if( !require(grDevices) ) col <- colorRampPalette(c("yellow", "red"))(nbr - 1) #for RNA-Seq }else if( is.vector(colrange) & length(colrange) == 2) { if( !require(grDevices) ) install.packages("grDevices") require(grDevices) cat("color range is:", colrange, "\n" ) col <- colorRampPalette(colrange)(nbr - 1) #for RNA-Seq }else { stop("Error: colrange should be a character vector containing two colors") } min.breaks <- min(breaks) max.breaks <- max(breaks) x[x < min.breaks] <- min.breaks x[x > max.breaks] <- max.breaks ######################################################################### ##below, plot op <- par(no.readonly = TRUE) on.exit(par(op)) if( is.null( lmat) ) { lmat <- t(matrix(rep(c(4, 1, 6, 2, 7, 0, 5), 3), ncol = 3)) lmat[2,6] <- 3 } if( is.null( lwid)) { lwid <- c(0.5, 0.1, 0.01, 3.5, 0.01, 0.1, 0.5) } lhei <- rep(5, 3) if (length(lhei) != nrow(lmat)) stop("lhei must have length = nrow(lmat) = ", nrow(lmat)) if (length(lwid) != ncol(lmat)) stop("lwid must have length = ncol(lmat) =", ncol(lmat)) layout(lmat, widths = lwid, heights = lhei, respect = FALSE) nr = nrow(newx) nc = ncol(newx) #tissue col bar par(mar = c(margins[1], 0, 2, 0.1)) image(rbind(1:nr), col = RowSideCol, xaxt = "n", yaxt = "n") xv <- getAtpos(orderMatrix[which(orderMatrix[,4] > 0),4]) axis(2, at=xv, labels=rownames(orderMatrix[which(orderMatrix[,4] > 0),]), las= HORIZONTAL<-1, cex.axis=1.2, adj = 1, xpd = TRUE ) #heat map image(1:nc, 1:nr, t(newx), xlim = 0.5 + c(0, nc), ylim = 0.5 + c(0, nr), axes = FALSE, xlab = "", ylab = "", col = col, breaks = breaks, ...) min.raw <- min(newx) max.raw <- max(newx) z <- seq(min.raw, max.raw, length = length(col)) image( t(matrix(z, ncol = 1)), col = col, xaxt = "n", yaxt = "n") #remove breaks from orginal function lv <- pretty(breaks) xv <- scale01(as.numeric(lv), min.raw, max.raw) axis(4, at = xv, labels = lv) mtext(side = 4, keynote, line = 3) retval$breaks <- breaks retval$col <- col retval$rownames <- rownames(newx) retval$colnames <- colnames(newx) return(list(retval = retval, hcr = hcr, hcc = hcc) ) } rsgcc/R/tsgene.R0000644000175100001440000000703212144517175013213 0ustar hornikusers################################################################## ##find tissue specific genes if( !require(stringr) ) install.packages("stringr") require(stringr) uniqueTissues <- function(x) { sampleNum <- ncol(x) if( is.null(colnames(x)) ) { #no annoted tissue sample, each sample belongs to different tissue by default tsMatrix <- diag(x = 1, nrow = sampleNum, ncol = sampleNum ) }else { tsMatrix <- matrix(0, nrow = sampleNum, ncol = sampleNum ) colnames(tsMatrix) <- colnames(x) uniTS <- c("") for( i in 1: sampleNum ) { lastdot <- sapply(gregexpr("\\.", colnames(x)[i]), tail, 1) if( lastdot < 0 ) { curTSName <- colnames(x)[i] } else { if( lastdot > 1 ) {curTSName <- str_sub( colnames(x)[i], 1, lastdot - 1) }else { curTSName <- colnames(x)[i] } } curTSIndex <- which(uniTS == curTSName) if( length(curTSIndex) == 0 ) { #not record if( i == 1) uniTS[1] <- curTSName else uniTS <- c(uniTS, curTSName) tsMatrix[length(uniTS), i] <- 1 }else { tsMatrix[curTSIndex, i] <- 1 } }#end for i tsMatrix <- tsMatrix[1:length(uniTS),] rownames(tsMatrix) <- uniTS }#end else tsMatrix } ################################################################################## ##ts gene: t measure, 1- max(nonTissue)/max(Tissue) getsgene <- function(x, Log = FALSE, Base = 2, AddOne = FALSE, tsThreshold = 0.95, MeanOrMax = "Mean", Fraction = TRUE ) { ##remove all zeros if (AddOne) { x <- x + 1 }else { subfun <- function( vec ) { res <- FALSE if( sum(abs(vec)) == 0 ) res <- TRUE res } zeroIndex <- which(apply(x, 1, subfun ) ) if( length(zeroIndex) >= 1 ) x <- x[-zeroIndex, ] } if (Log) { x <- log(x, Base) } onets <- function(vec, tsMatrix, MeanOrMax) { tscorematrix <- matrix(0, nrow = nrow(tsMatrix), ncol = 2) for (i in 1:nrow(tsMatrix)) { sampleIndex <- which(tsMatrix[i, ] > 0) meanvalue <- NULL if( MeanOrMax == "Mean" ) { meanvalue <- mean(vec[sampleIndex]) ##consider the max values in one stress }else { meanvalue <- max(vec[sampleIndex]) ##consider the max values in one stress } tscorematrix[i, 1] <- i if (meanvalue == 0) { meanvalue <- 1.0E-6 } tscorematrix[i, 2] <- 1 - max(vec[-sampleIndex])/meanvalue } tmax = max(tscorematrix[, 2]) tmaxidx = tscorematrix[which(tscorematrix[, 2] == tmax), 1][1] return(list(tmaxidx = tmaxidx, tmax = tmax, tscorematrix = tscorematrix)) } tsMatrix <- uniqueTissues(x) tt <- apply(x, 1, onets, tsMatrix = tsMatrix, MeanOrMax = MeanOrMax) alltsmatrix <- matrix( 0, nrow = nrow(x), ncol = nrow(tsMatrix) ) rownames(alltsmatrix) <- rownames(x) colnames(alltsmatrix) <- rownames(tsMatrix) tscorematrix <- matrix(0, nrow = nrow(x), ncol = 3) rownames(tscorematrix) <- rownames(x) colnames(tscorematrix) <- c("GeneIndex", "tsmaxscore", "tsmaxidx") for (i in 1:nrow(x)) { tscorematrix[i, 1] <- i tscorematrix[i, 2] <- tt[[i]]$tmax tscorematrix[i, 3] <- tt[[i]]$tmaxidx alltsmatrix[i,] <- tt[[i]]$tscorematrix[,2] } tscore <- tscorematrix[which(tscorematrix[, 2] >= tsThreshold), ] tsgene <- x[tscore[, 1], ] if (Fraction) { tsgene <- t(apply(tsgene, 1, function(vec) vec/sum(vec))) } return(list(alltscoremat = alltsmatrix, tscore = tscorematrix, tsgene = tsgene, uniquets = tsMatrix)) } rsgcc/R/rsgccgui.R0000644000175100001440000003501612155671773013546 0ustar hornikusers################################################ ##gui.rsgcc ##Author: Chuang Ma ##Date: 2012-03-02 if( !require(gWidgetsRGtk2) ) install.packages("gWidgetsRGtk2", dependencies=TRUE) if( !require(gWidgets) ) install.packages("gWidgets", dependencies=TRUE) if( !require(fBasics)) install.packages("fBasics") if( !require(gplots)) install.packages("gplots") require(gWidgetsRGtk2) require(gWidgets) require(fBasics) require(gplots) options("guiToolkit"="RGtk2") rsgcc.gui <- function(margins = c(1,1), labRow = "", labCol = "", lwid = c(0.5, 0.05, 0.01, 0.5, 0.01, 0.05, 0.5), keynote = "FPKM" ) { availCor <- c("Gini correlation"="GCC", "Spearman correlation"="SCC", "Kendall correlation"="KCC", "Pearson correlation"="PCC", "Turkey's biweight"="BiWt") availSim <- c("raw correlation(1-coef)" = "Raw", "absolute correlation(1-|coef|)" = "Abs", "squared correlation(1-|coef|^2)" = "Sqr") availCluster <- c("complete linkage" = "complete", "average linkage" = "average", "median linkage" = "median", "centroid linkage" = "centroid", "mcquitty linkage" = "mcquitty", "single linkage" = "single", "ward linkage" = "ward") data_file_dir <- NULL data_matrix <- NULL tsgene_matrix <- NULL hm_dataframe <- NULL CPUNum <- 1 DisplayFlag <- 0 TSGeneClusterFlag <- 0 CorMethodType <- "GCC" SimilarityMethodType <- "Raw" ClusterMethodType <- "complete" OutputGPType <- FALSE AbsCorThreshold <- 0.85 PermutationNum <- 2000 PValueThreshold <- 0.05 DisplayTimes <- 0 rowHC <- NULL colHC <- NULL ColMax <- 16711680 ColMean <- 122 ColMin <- 16776960 tsScore <- 0.95 ##data to color index coverDecToHex <- function(x) { ##tt <- x ##class(tt) <- "hexmode" tt <- .dec.to.hex(x) strlen <- str_length(tt) if( strlen > 6 ) { hh <- str_sub( tt, strlen-5, strlen) }else if( strlen < 6 ){ hh <- paste(str_sub("0000000",1, 6-strlen), tt, sep="") }else { hh <- tt } paste("#", hh, sep="") } ##start to run, update plot updateRun <- function(h,...) { ##data_matrix x <- data_matrix if( is.null(x) ) { stop("Error: no GE data input.") }else { x <- data_matrix } cat("Starting to run...\n") print(SimilarityMethodType) if( TSGeneClusterFlag == 1 ) { tsgene_matrix <<- getsgene(x, tsThreshold = tsScore, Fraction= TRUE)$tsgene hm_dataframe <<- gcc.tsheatmap(tsgene_matrix, cpus = CPUNum, cormethod = CorMethodType, distancemethod = SimilarityMethodType, clustermethod= ClusterMethodType, lwid = lwid, keynote = keynote, margins = margins) }else { hm_dataframe <<- gcc.heatmap(x, cpus = CPUNum, cormethod = CorMethodType, distancemethod = SimilarityMethodType, clustermethod= ClusterMethodType, margins= margins, labRow = labRow, labCol = labCol) } cat("Finish!\n") } #file choose fileChoose <- function(h, ...) { countLines <- function(filename) { cat("Line numer is:", length(readLines(filename)), "\n") } the_data <- gfile(text= "Select a file...", type="open", ..., action = "countLines", filter = list("All files" = list(patterns = c("*")), "text files" = list(patterns = c("*.txt")), "csv files" = list(patterns = c("*.csv"))), handler = function(h,...) { do.call(h$action, list(h$file)) }) data_file_dir <<- the_data cat("Loaded file dir: ", data_file_dir, "\n") lastdot <- sapply(gregexpr("\\.", the_data), tail, 1) if( lastdot < 0 ) { data_matrix <<- as.matrix(read.table(the_data, sep="\t")) } else { curfileName <- str_sub( the_data, lastdot+1, str_length(the_data)) if(curfileName == "csv") { data_matrix <<- as.matrix(read.csv(the_data, sep="\t")) }else { data_matrix <<- as.matrix(read.table(the_data, sep="\t")) } } } #visualize data visData <- function(h, ...) { if( svalue(h$obj) == TRUE ) { cat("Display GE Times: Yes.\n") if( DisplayTimes != 1 ) { if( is.null(data_matrix) ) { stop("Error: no GE data input") } cat("the dim info of data_matrix is:", dim(data_matrix), "\n") if( is.null( rownames(data_matrix)) ) { visx <- data_matrix } else { visx <- matrix(0, nrow = nrow(data_matrix), ncol = ncol(data_matrix)+1 ) colnames(visx) <- c("rownames", colnames(data_matrix)) visx[1:nrow(visx),2:ncol(visx)] <- data_matrix[1:nrow(data_matrix),1:ncol(data_matrix)] visx[,1] <- rownames(data_matrix) } DisplayTimes <<- 1 win <- gwindow("Loaded gene expression data)") gp <- ggroup(container=win) group <- ggroup(horizontal=FALSE, border = TRUE, container=gp) odm <- gtable(visx, multiple = TRUE, container= gp, expand = TRUE) cat("Finish to display GE data: Yes.\n") } }else { #do nothing DisplayTimes <<- 0 } } updateHeatmapMax <- function(h,...) { ColMax <<- svalue(h$obj) cat("Color Index for Heatmap:", ColMax, ColMean, ColMin, "\n") #for TS genes, only two colors if( TSGeneClusterFlag == 1 ) { hmdata <- gcc.tsheatmap(tsgene_matrix, cpus = CPUNum, method = CorMethodType, distancemethod = SimilarityMethodType, clustermethod= ClusterMethodType, rowhcdata = hm_dataframe$hcr, colhcdata = hm_dataframe$hcc, colrange = c(coverDecToHex(ColMin), coverDecToHex(ColMax)), lwid = lwid) }else { hmdata <- gcc.heatmap(data_matrix, cpus = CPUNum, method = CorMethodType, distancemethod = SimilarityMethodType, clustermethod= ClusterMethodType, rowhcdata = hm_dataframe$hcr, colhcdata = hm_dataframe$hcc, colrange = c(coverDecToHex(ColMin), coverDecToHex(ColMean), coverDecToHex(ColMax)), margins= margins, labRow = "") } } ##needed to be modified updateHeatmapMean <- function(h,...) { ColMean <<- svalue(h$obj) cat("Color Index for Heatmap:", ColMax, ColMean, ColMin, "\n") #for TS genes, only two colors if( TSGeneClusterFlag == 0 ) { hmdata <- gcc.heatmap(data_matrix, cpus = CPUNum, method = CorMethodType, distancemethod = SimilarityMethodType, clustermethod= ClusterMethodType, rowhcdata = hm_dataframe$hcr, colhcdata = hm_dataframe$hcc, colrange = c(coverDecToHex(ColMin), coverDecToHex(ColMean), coverDecToHex(ColMax)), margins= margins, labRow = "") }else { #do nothing } } ##needed to be modified updateHeatmapMin <- function(h,...) { ColMin <<- svalue(h$obj) cat("Start to save data.\n") cat("Color Index for Heatmap:", ColMax, ColMean, ColMin, "\n") #for TS genes, only two colors if( TSGeneClusterFlag == 1 ) { hmdata <- gcc.tsheatmap(tsgene_matrix, cpus = CPUNum, method = CorMethodType, distancemethod = SimilarityMethodType, clustermethod= ClusterMethodType, rowhcdata = hm_dataframe$hcr, colhcdata = hm_dataframe$hcc, colrange = c(coverDecToHex(ColMin), coverDecToHex(ColMax)), lwid = lwid) }else { hmdata <- gcc.heatmap(data_matrix, cpus = CPUNum, method = CorMethodType, distancemethod = SimilarityMethodType, clustermethod= ClusterMethodType, rowhcdata = hm_dataframe$hcr, colhcdata = hm_dataframe$hcc, colrange = c(coverDecToHex(ColMin), coverDecToHex(ColMean), coverDecToHex(ColMax)), margins= margins, labRow = "") } } saveData <- function(h,...) { pdf(paste(data_file_dir, "_rsgcc_heatmap.pdf", sep=""), width = 5, height= 5) if( TSGeneClusterFlag == 1 ) { hm_dataframe <<- gcc.tsheatmap(tsgene_matrix, cpus = CPUNum, method = CorMethodType, distancemethod = SimilarityMethodType, clustermethod= ClusterMethodType, rowhcdata = hm_dataframe$hcr, colhcdata = hm_dataframe$hcc, colrange = c(coverDecToHex(ColMin), coverDecToHex(ColMax)), lwid = lwid) }else { hm_dataframe <<- gcc.heatmap(data_matrix, cpus = CPUNum, method = CorMethodType, distancemethod = SimilarityMethodType, clustermethod= ClusterMethodType, rowhcdata = hm_dataframe$hcr, colhcdata = hm_dataframe$hcc, colrange = c(coverDecToHex(ColMin), coverDecToHex(ColMean), coverDecToHex(ColMax)), margins= margins, labRow = "") } dev.off() GEMatrix_Data <- NULL cor_matrix <- NULL if( TSGeneClusterFlag == 1 ) { GEMatrix_Data <- tsgene_matrix cor_matrix <- hm_dataframe$hcr$pairmatrix }else { GEMatrix_Data <- data_matrix cor_matrix <- hm_dataframe$hcr$pairmatrix } cor_file_dir <- paste(data_file_dir, "_rsgcc_correlations.txt", sep="") genenum <- nrow(cor_matrix) cor_pairs <- matrix("NA", nrow = genenum*(genenum-1), ncol = 3) k <- 0 for( i in 1:(genenum-1)) { for( j in (i+1):genenum){ k <- k + 1 cor_pairs[k,1] <- rownames(cor_matrix)[i] cor_pairs[k,2] <- colnames(cor_matrix)[j] cor_pairs[k,3] <- cor_matrix[i,j] }#end for j }#end for i write.table( cor_pairs[1:k,], cor_file_dir, sep="\t", row.names = FALSE, col.names = FALSE, quote = FALSE) if( !require(ctc) ) { biocLite <- NULL rm(biocLite) source("http://bioconductor.org/biocLite.R") biocLite("ctc") } require(ctc) cdt_file_dir <- paste(data_file_dir, "_rsgcc_cluster.cdt", sep="") r2atr( hm_dataframe$hcc$hc, distance = "GCC", file = paste(data_file_dir, "_rsgcc_cluster.atr", sep="")) r2gtr( hm_dataframe$hcr$hc, distance = "GCC", file = paste(data_file_dir, "_rsgcc_cluster.gtr", sep="")) r2cdt(hm_dataframe$hcr$hc, hm_dataframe$hcc$hc, GEMatrix_Data, file = paste(data_file_dir, "_rsgcc_cluster.cdt", sep="")) cat("Success in saving data.\n") } ########################################################################### ## now layout window <- gwindow("rsgcc(correlation and clustering analysis of gene expression data)") BigGroup <- ggroup(container=window) group <- ggroup(horizontal=FALSE, border = TRUE, container=BigGroup) #add a button tmp <- gframe("Step 1: Load gene exp data", container = group) Selectedfiles <- gbutton(c("Click here to load"), handler= fileChoose, container=group) #read and display data tmp <- gcheckbox("Display loaded data", checked = FALSE, handler = visData, container = group) #find and cluster tissue/condition specific genes tmp <- gcheckbox("Find ts-genes for clustering analysis", checked = FALSE, handler = function(h,...) { if( svalue(h$obj) == TRUE ) { TSGeneClusterFlag <<- 1 cat("rsgcc will find and cluster ts-genes.\n") }else { TSGeneClusterFlag <<- 0 } }, container = group) #threshold for tsScore tmp <- gframe("Threshold for tissue specificity score", container=group) tsScoreAdjust <- gedit(text = "0.95", width = 30, coerce.with=as.numeric, horizontal=FALSE, handler=function(h,...){tsScore <<- svalue(h$obj); cat("Threshold for tissue specificity score:", tsScore, "\n")}) add(tmp, tsScoreAdjust) tmp <- gframe("Step 2: Select a correlation method", container=group) Correlations <- gradio(names(availCor), horizontal=FALSE, handler= function(h,...){ CorMethodType <<- availCor[svalue(Correlations)]; cat("Correlation method: ", CorMethodType, "\n")}) add(tmp, Correlations) tmp <- gframe("Step 3: Specify a distance measure", container=group) Similarities <- gradio(names(availSim), horizontal=FALSE, handler=function(h,...){SimilarityMethodType <<- availSim[svalue(Similarities)]; cat("Similarity method: ", SimilarityMethodType, "\n" )}) add(tmp, Similarities) tmp <- gframe("Step 4: Choose a cluster method", container=group) Clusters <- gcombobox(names(availCluster), horizontal=FALSE, handler=function(h,...) { ClusterMethodType <<- availCluster[svalue(Clusters)]; cat("Cluster method:", ClusterMethodType, "\n")} ) add(tmp, Clusters) #number of cpus for computation tmp <- gframe("Step 5: CPUs for correlation calcuation", container=group) cpuAdjust <- gedit(text = "1", width = 30, coerce.with=as.numeric, horizontal=FALSE, handler=function(h,...){CPUNum <<- svalue(h$obj); cat("CPU number:", CPUNum, "\n")}) add(tmp, cpuAdjust) StartRun <- gbutton(c("Start to run"), handler= updateRun, container=group) ######################################################################### ##set color for cluster tmp <- gframe("Adjust colors for heat map", container=group) tmp <- gframe("Color for max GE value", container=group) ColorAdjust1 <- gslider(from=1, to=256^3-1, by=2000, value=16711680, handler=updateHeatmapMax) add(tmp, ColorAdjust1, expand = TRUE) tmp <- gframe("Color for median GE value", container=group) ColorAdjust2 <- gslider(from=1, to=256^3-1, by=2000, value=1, handler=updateHeatmapMean) add(tmp, ColorAdjust2, expand = TRUE) tmp <- gframe("Color for min GE value", container=group) ColorAdjust3 <- gslider(from=1, to=256^3-1, by=2000, value=16776960, handler=updateHeatmapMin) add(tmp, ColorAdjust3, expand = TRUE) ########################################################################## SaveData <- gbutton(c("Save correlations and cluster data"), handler= saveData, container=group) add(BigGroup, ggraphics()) } rsgcc/R/rsgcc.R0000644000175100001440000011731312155672445013037 0ustar hornikusers################################################################################### ##Note: this script contains function for correlation calcuation with snowfall ## package in R for parallell computing. ##Author: Chuang Ma ##Date: 2012-02-16 ################################################################################## if( !require(biwt)) install.packages("biwt") require(biwt) if( !require(parmigene)) install.package("parmigene") library(parmigene) ######################################################################### ##compute Gini correlation by using the rank informaiton of x and the ## value informaiton of y ######################################################################## onegcc <- function(x, y) { #rank function, get y[m] and y(m) with the rank information of x (first line) getrank <- function(datamatrix) { if( dim(datamatrix)[1] != 2 ) { stop("Error: the row num of datamatrix must be 2") } #use the order information of X OrderIndex <- order( datamatrix[1,], decreasing = FALSE ) SortGenePair <- datamatrix[, OrderIndex] Sort2By1 <- SortGenePair[2,] return( list(Sort2By1 = Sort2By1, Sort2By2 = sort(datamatrix[2,], decreasing = FALSE ) ) ) }#end getrank ##compute gcc with weight vector and y[m] and y(m) gcc <- function( weightvec, vectsort, vectselfsort) { Sum1 <- sum(weightvec*vectsort) Sum2 <- sum(weightvec*vectselfsort) if( Sum2 == 0 ) { cat("\n", x, "\n", y, "\n") cat("Warning: the Denominator is ZRRO, the value of one variable is consistent.") gcccor <- 0 }else { gcccor <- Sum1/Sum2 } return(gcccor) } ###################################################### ##for gini correlation ##generate weight vector Length <- length(x) Wt <- t(2*seq(1, Length, by = 1) - Length - 1) ##for gcc.rankx GenePairXY <- t(matrix( c(x,y), ncol = 2)) SortYlist <- getrank(GenePairXY) gcc.rankx <- gcc(Wt, SortYlist$Sort2By1, SortYlist$Sort2By2) return(gcc.rankx) } ########################################################################## ##get final correlation and p-value for GCC ##input gcccor is the output of allcor function for GCC ########################################################################## gcc.corfinal <- function( gcccor ) { ##if gcccor has p-value informaiton, select correlation with p-value if( is.numeric(gcccor$gcc.rankx.pvalue) & is.numeric(gcccor$gcc.ranky.pvalue) ) { fpvalue <- gcccor$gcc.rankx.pvalue fgcc <- gcccor$gcc.rankx if( gcccor$gcc.ranky.pvalue < fpvalue ) { fpvalue <- gcccor$gcc.ranky.pvalue fgcc <- gcccor$gcc.ranky } }else { ##no p-value, we selected the gcc with absolute max vlaue fpvalue <- NA x <- c(gcccor$gcc.rankx, gcccor$gcc.ranky) fgcc <- x[ which( abs(x) == max(abs(x)) ) ] if( length(fgcc) > 1 ) { #check length # cat("Warnning: the correlation coefficients generated by GCC method are the same after abs() operation.", # gcccor$gcc.rankx, gcccor$gcc.ranky, # "In current version of Rgcc, only the first one is selected...") fgcc <- fgcc[1] } } return( list(gcc.fcor = fgcc, gcc.fpvalue = fpvalue)) } cor.pair <- function( idxvec, GEMatrix, rowORcol = c("row", "col"), cormethod = c("GCC", "PCC", "SCC", "KCC", "BiWt"), pernum = 0, sigmethod = c("two.sided", "one.sided") ) { ######################################################################### ##compute Gini correlation by using the rank informaiton of x and the ## value informaiton of y ######################################################################## onegcc <- function(x, y) { #rank function, get y[m] and y(m) with the rank information of x (first line) getrank <- function(datamatrix) { if( dim(datamatrix)[1] != 2 ) { stop("Error: the row num of datamatrix must be 2") } #use the order information of X OrderIndex <- order( datamatrix[1,], decreasing = FALSE ) SortGenePair <- datamatrix[, OrderIndex] Sort2By1 <- SortGenePair[2,] return( list(Sort2By1 = Sort2By1, Sort2By2 = sort(datamatrix[2,], decreasing = FALSE ) ) ) }#end getrank ##compute gcc with weight vector and y[m] and y(m) gcc <- function( weightvec, vectsort, vectselfsort) { Sum1 <- sum(weightvec*vectsort) Sum2 <- sum(weightvec*vectselfsort) if( Sum2 == 0 ) { cat("\n", x, "\n", y, "\n") cat("Warning: the Denominator is ZRRO, the value of one variable is consistent.") gcccor <- 0 }else { gcccor <- Sum1/Sum2 } return(gcccor) } ###################################################### ##for gini correlation ##generate weight vector Length <- length(x) Wt <- t(2*seq(1, Length, by = 1) - Length - 1) ##for gcc.rankx GenePairXY <- t(matrix( c(x,y), ncol = 2)) SortYlist <- getrank(GenePairXY) gcc.rankx <- gcc(Wt, SortYlist$Sort2By1, SortYlist$Sort2By2) return(gcc.rankx) } if(!is.vector(idxvec) | length(idxvec) != 2) { stop("Error: idxvec must be a vector with two elements indicating the indexs(rows) in GEMatrix") } if( class(GEMatrix) != "matrix" ) { stop("Error: GEMatrix should be a numeric data matrix") } if( rowORcol == "row" ) { x1 <- GEMatrix[idxvec[1],] y1 <- GEMatrix[idxvec[2],] }else if(rowORcol == "col") { x1 <- GEMatrix[,idxvec[1]] y1 <- GEMatrix[,idxvec[2]] }else { stop("Error: rowORcol must be \"row\" or \"col\"") } ##function for all considered correlation methods getcor <- function(g1, g2, cormethod ) { if( cormethod == "PCC") { return(cor.test(g1, g2, method="pearson")$estimate) } else if( cormethod == "SCC" ) { return(cor.test(g1, g2, method="spearman")$estimate) } else if( cormethod == "KCC" ) { return(cor.test(g1, g2, method="kendall")$estimate) } else if( cormethod == "BiWt" ){ return(biwt.cor(t(matrix(c(g1,g2), ncol=2)), output="vector")[1]) } else if( cormethod == "GCC" ){ return(list( gcc.rankx = onegcc(g1,g2), gcc.ranky = onegcc(g2,g1) ) ) } }#end getcor ##get pvalue for permutation test getpvalue <- function( percorvec, pernum, realcor, sigmethod ) { pvalue <- length(which(percorvec >= realcor))/pernum if( pvalue == 0 ) pvalue <- 1.0/pernum if( pvalue > 0.5 ) pvalue <- 1.0 - pvalue if( sigmethod == "two.sided" ) { pvalue <- 2*pvalue } return(pvalue) }#end getpvalue #check cormethod if( length(cormethod) > 1 ) { stop("Error: length of cormethod must be of length 1") } #check vector if( !is.vector(x1) || !is.vector(y1) ) { stop("Error: input two vectors for gcc.corpair") } #check numeric if( !is.numeric(x1) || !is.numeric(y1) ) { stop("Error: x should be numeric") } #check NA if( length(which(is.na(x1) == TRUE)) > 0 || length(which(is.na(y1) == TRUE)) > 0 ){ stop("Error: There are Na(s) in x") } #check length if( length(x1) != length(y1) ) { stop("Error: the lengths of each row in x are different.\n") } realcor <- getcor(x1, y1, cormethod) ##check pernum for permutation or output if( pernum <= 0 ) { if( cormethod == "GCC") { return( list(gcc.rankx=realcor$gcc.rankx, gcc.ranky=realcor$gcc.ranky, gcc.rankx.pvalue = NA, gcc.ranky.pvalue = NA) ) }else { return( list(cor = realcor, pvalue = NA)) } }else { ##generate matrix for permuted correlation pGCCMatrix <- matrix(0, nrow = pernum, ncol = 2) colnames(pGCCMatrix) <- c("gcc.rankx", "gcc.ranky") rownames(pGCCMatrix) <- paste("permut", seq(1,pernum, by=1), sep="") GenePairXY <- t(matrix( c(x1, y1), ncol = 2)) pGenePairXY <- GenePairXY Length <- length(x1) for( i in 1:pernum ) { ##get system time for seed and then generate random index curtime <- format(Sys.time(), "%H:%M:%OS4") XXX <- unlist(strsplit(curtime, ":")) curtimeidx <- (as.numeric(XXX[1])*3600 + as.numeric(XXX[2])*60 + as.numeric(XXX[3]))*10000 set.seed( curtimeidx ) TT = sort(runif(Length),index.return=TRUE)$ix pGenePairXY[1,] <- GenePairXY[1,TT] if( cormethod == "GCC" ) { cortmp <- getcor( pGenePairXY[1,], pGenePairXY[2,], cormethod ) pGCCMatrix[i,1] <- cortmp$gcc.rankx pGCCMatrix[i,2] <- cortmp$gcc.ranky }else { pGCCMatrix[i,1] <- getcor( pGenePairXY[1,], pGenePairXY[2,], cormethod ) } }#end for i #compute p-value if( cormethod == "GCC" ) { return( list( gcc.rankx = realcor$gcc.rankx, gcc.ranky = realcor$gcc.ranky, gcc.rankx.pvalue = getpvalue(pGCCMatrix[,1], pernum, realcor$gcc.rankx, sigmethod), gcc.ranky.pvalue = getpvalue(pGCCMatrix[,2], pernum, realcor$gcc.ranky, sigmethod)) ) }else { return( list (cor = realcor, pvalue = getpvalue(pGCCMatrix[,1], pernum, realcor, sigmethod)) ) } } }#end allcor ############################################################################## ##Calcluate correlations for variables in a matrix with parallel computing ##x: numeric matrix ##asym: If ture, output gccx and gccy; or else, output max gcc(or gcc with low p value) ##style: "all.pairs", "pairs.between", "adjacent.pairs", "one.pair" ##var1.id ##var2.id ##pernum ##sigmethod ##Date: 2012-02-16 ############################################################################## cor.matrix <- function( GEMatrix, cpus = 1, cormethod = c("GCC", "PCC", "SCC", "KCC", "BiWt"), style = c("all.pairs", "pairs.between", "adjacent.pairs", "one.pair"), var1.id = NA, var2.id = NA, pernum = 0, sigmethod = c("two.sided", "one.sided"), output = c("matrix", "paired") ) { if( cpus > 1 ) { require(snowfall) } #check cormethod if( length( cormethod ) > 1 ) { cat("Warning: one correlation method should be specified. Default:GCC") } #check style if( length(style) > 1 ) { cat( "Warning: one style should be specified. Default: all.pairs") } #check sigmethod if( pernum > 0 & length( sigmethod ) > 1 ) { sigmethod = "two.sided" } if( pernum == 0 ) { sigmethod <- "two.sided" } #check whether matrix if( !is.matrix( GEMatrix) ) { stop("Error: GEMatrix in cor.matrix function is not matrix") } if( !is.numeric( GEMatrix) ){ stop("Error: GEMatrix is not numeric") } if( length(rownames(GEMatrix)) == 0 ) { #no rownames rownames(GEMatrix) <- seq(1,dim(GEMatrix)[1], by=1) } VariableNum <- nrow(GEMatrix) SampleSize <- ncol(GEMatrix) if( VariableNum <= 1 || SampleSize <= 1 ){ stop("Error:the number of variable is less than 2, or the number of observation is less than 2 ") } if( style == "one.pair" ) { if( length(var1.id) != 1 || length(var2.id) != 1 || is.na(var1.id) == TRUE || is.na(var2.id) == TRUE) { stop("Error: Not define the var1.id or var1.id") } } if( style == "adjacent.pairs") { var1.id <- seq(1, (VariableNum-1), by=1) var2.id <- var1.id + 1 } #for task matrix if( style == "one.pair" || style == "adjacent.pairs") { if( length(var1.id) != length(var2.id) ) { stop("Error: var1.id and var2.id should be vectors with the same length") } taskmatrix <- matrix(c(var1.id, var2.id), ncol = 2) }#end if style if( style == "pairs.between" ) { if( length( which(is.na(var1.id) == TRUE) ) > 0 | length( which(is.na(var1.id) == TRUE) ) > 0 ){ stop("Error: no variable IDs are given") } if( length( which((var1.id != var2.id) == TRUE ) ) > 0 ) { stop("Error: var1.id should be the same with var2.id for the pairs.between style") } if( length(which(is.numeric(var1.id) == FALSE)) > 0 | length(which(is.numeric(var2.id) == FALSE)) > 0 ){ stop("Error:var1.id and var2.id should be numeric vector") } } if( style == "all.pairs" ) { var1.id <- seq(1, dim(GEMatrix)[1], by=1) var2.id <- var1.id } if( style == "pairs.between" || style == "all.pairs") { CurLen <- length(var1.id) taskmatrix <- matrix(0, nrow = length(var1.id)*(length(var1.id)+1)/2, ncol = 2) kk = 0 for( i in 1:length(var1.id)) { j <- 1 while( j <= i ) { kk <- kk + 1 taskmatrix[kk,] <- c(i,j) j <- j + 1 }#end while j }#end for i }#end style ##implement apply if( cpus == 1 | cormethod == "BiWt") { results <- apply(taskmatrix, 1, cor.pair, GEMatrix = GEMatrix, rowORcol = "row", cormethod = cormethod, pernum = pernum, sigmethod = sigmethod) }else { sfInit(parallel=TRUE, cpus=cpus) print(sprintf('%s cpus to be used', sfCpus())) # if( cormethod == "GCC") sfExport('onegcc') # if( cormethod == "BiWt") sfExport('biwt.cor') results <- sfApply(taskmatrix, 1, cor.pair, GEMatrix = GEMatrix, rowORcol = "row", cormethod = cormethod, pernum = pernum, sigmethod = sigmethod) sfStop() } ##get final results if( output == "paired") { kk <- 0 corpvalueMatrix <- matrix(NA, nrow = dim(taskmatrix)[1], ncol = 4) for( i in 1:dim(taskmatrix)[1] ) { if( taskmatrix[i,1] == taskmatrix[i,2] ) next kk <- kk + 1 corpvalueMatrix[kk,1:2] <- rownames(GEMatrix)[taskmatrix[i,]] if( cormethod == "GCC" ) { fGCC <- gcc.corfinal(results[i][[1]]) corpvalueMatrix[kk,3] <- fGCC$gcc.fcor corpvalueMatrix[kk,4] <- fGCC$gcc.fpvalue }else { corpvalueMatrix[kk,3] <- results[i][[1]]$cor corpvalueMatrix[kk,4] <- results[i][[1]]$pvalue } }#end for i return( corpvalueMatrix[1:kk,] ) }else { ##get final results UniqueRow <- sort(unique(taskmatrix[,1])) UniqueCol <- sort(unique(taskmatrix[,2])) corMatrix <- matrix(0, nrow = length(UniqueRow), ncol = length(UniqueCol)) rownames(corMatrix) <- rownames(GEMatrix)[UniqueRow] colnames(corMatrix) <- rownames(GEMatrix)[UniqueCol] pvalueMatrix <- corMatrix pvalueMatrix[] <- NA for( i in 1:dim(taskmatrix)[1] ) { rowidx <- which(UniqueRow == taskmatrix[i,1]) colidx <- which(UniqueCol == taskmatrix[i,2]) if( cormethod == "GCC" ) { fGCC <- gcc.corfinal(results[i][[1]]) corMatrix[rowidx,colidx] <- fGCC$gcc.fcor pvalueMatrix[rowidx,colidx] <- fGCC$gcc.fpvalue }else { corMatrix[rowidx,colidx] <- results[i][[1]]$cor pvalueMatrix[rowidx,colidx] <- results[i][[1]]$pvalue } if( style == "pairs.between" | style == "all.pairs" ) { corMatrix[colidx, rowidx] <- corMatrix[rowidx,colidx] pvalueMatrix[colidx,rowidx] <- pvalueMatrix[rowidx,colidx] } }#end for i return( list(corMatrix = corMatrix, pvalueMatrix = pvalueMatrix) ) }#end else }#end function ############################################################################# ##This function plots HeatMap with different similarity measures (1-CorCoef) ##Here CorCoef could be GCC (Gini correlation coefficient), ##PCC (Pearson product-moment correlation coefficient), ##SCC (Spearman's rank correlation coefficient), ##KCC (Kendall tau correlation coefficient) ##and BiWt(correlation estimates based on Tukey's biweight M-estimator) ##To set other parameters, please ref HeatMap.2 function in gplots package. ############################################################################# gcc.heatmap <- function(x, cpus = 1, ## correlation method method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"), ## similarity method distancemethod = c("Raw", "Abs", "Sqr"), #hclustfun = hclust, clustermethod = c("complete", "average", "median", "centroid", "mcquitty", "single", "ward"), rowhcdata = NULL, colhcdata = NULL, keynote = "FPKM", ## dendrogram control symm = FALSE, Rowv = TRUE, Colv= if(symm)"Rowv" else TRUE, dendrogram = c("both","row","column","none"), ## data scaling scale = c("none","row", "column"), na.rm=TRUE, ## image plot revC = identical(Colv, "Rowv"), add.expr, ## mapping data to colors breaks = 16, quanbreaks = TRUE, symbreaks=min(x < 0, na.rm=TRUE) || scale!="none", ## colors colrange= c("green", "black", "red"), ## block sepration colsep, rowsep, sepcolor="white", sepwidth=c(0.05,0.05), ## cell labeling cellnote, notecex=1.0, notecol="cyan", na.color=par("bg"), ## level trace trace=c("none", "column","row","both"), tracecol="cyan", hline=median(breaks), vline=median(breaks), linecol=tracecol, ## Row/Column Labeling margins = c(5, 5), ColSideColors, RowSideColors, cexRow = 0.2 + 1/log10(dim(x)[1]), cexCol = 0.2 + 1/log10(dim(x)[2]), labRow = NULL, labCol = NULL, ## color key + density info key = TRUE, keysize = 0.65, density.info=c("none","histogram","density"), denscol=tracecol, symkey = min(x < 0, na.rm=TRUE) || symbreaks, densadj = 0.25, ## plot labels main = NULL, xlab = NULL, ylab = NULL, ## plot layout lmat = NULL, lhei = NULL, lwid = NULL, ## extras ... ) { cat("GE matrix start to be clustered:", dim(x), "\n") scale01 <- function(x, low = min(x), high = max(x)) { x <- (x - low)/(high - low); x } retval <- list() scale <- if(symm & missing(scale)) { "none" } else { match.arg(scale) } dendrogram <- match.arg(dendrogram) trace <- match.arg(trace) density.info <- match.arg(density.info) if (length(colrange) == 1 & is.character(colrange)) { colrange <- get(colrange, mode = "function") col <- colrange } if (!missing(breaks) & (scale != "none")) warning("Using scale=\"row\" or scale=\"column\" when breaks are", "specified can produce unpredictable results.", "Please consider using only one or the other.") if (is.null(Rowv) | is.na(Rowv)) { Rowv <- FALSE } if (is.null(Colv) | is.na(Colv)) { Colv <- FALSE }else if (Colv == "Rowv" & !isTRUE(Rowv)) { Colv <- FALSE } if (length(di <- dim(x)) != 2 || !is.numeric(x)) {stop("`x' must be a numeric matrix") } nr <- di[1] nc <- di[2] if(nr <= 1 | nc <= 1) {stop("`x' must have at least 2 rows and 2 columns")} cat("nr = ", nr, "\n") cat("nc = ", nc, "\n") if(!is.numeric(margins) | length(margins) != 2) {stop("`margins' must be a numeric vector of length 2")} if(missing(cellnote)) { cellnote <- matrix("", ncol = ncol(x), nrow = nrow(x)) } if(!inherits(Rowv, "dendrogram")) { if(((!isTRUE(Rowv)) | (is.null(Rowv))) & (dendrogram %in% c("both", "row"))) { if (is.logical(Colv) & (Colv)) { dendrogram <- "column" } else { dedrogram <- "none" } warning("Discrepancy: Rowv is FALSE, while dendrogram is `", dendrogram, "'. Omitting row dendogram.") } } if(!inherits(Colv, "dendrogram")) { if(((!isTRUE(Colv)) | (is.null(Colv))) & (dendrogram %in% c("both", "column"))) { if (is.logical(Rowv) & (Rowv)) {dendrogram <- "row" } else { dendrogram <- "none" } warning("Discrepancy: Colv is FALSE, while dendrogram is `", dendrogram, "'. Omitting column dendogram.") } } if (inherits(Rowv, "dendrogram")) { ddr <- Rowv rowInd <- order.dendrogram(ddr) } else if (is.integer(Rowv)) { if( missing(rowhcdata) | is.null(rowhcdata) ) { hcr <- gcc.hclust( x, cpus = cpus, method = method, distancemethod = distancemethod, clustermethod = clustermethod) }else { hcr <- rowhcdata } ddr <- as.dendrogram(hcr$hc) ddr <- reorder(ddr, Rowv) rowInd <- order.dendrogram(ddr) if (nr != length(rowInd)) {stop("row dendrogram ordering gave index of wrong length") } } else if (isTRUE(Rowv)) { Rowv <- rowMeans(x, na.rm = na.rm) if( missing(rowhcdata) | is.null(rowhcdata) ) { hcr <- gcc.hclust( x, cpus = cpus, method = method, distancemethod = distancemethod, clustermethod = clustermethod) }else { hcr <- rowhcdata } ddr <- as.dendrogram(hcr$hc) ddr <- reorder(ddr, Rowv) rowInd <- order.dendrogram(ddr) if (nr != length(rowInd)) {stop("row dendrogram ordering gave index of wrong length") } } else { rowInd <- nr:1 } if (inherits(Colv, "dendrogram")) { ddc <- Colv colInd <- order.dendrogram(ddc) }else if (identical(Colv, "Rowv")) { if (nr != nc) { stop("Colv = \"Rowv\" but nrow(x) != ncol(x)") } if (exists("ddr")) { ddc <- ddr colInd <- order.dendrogram(ddc) } else colInd <- rowInd } else if (is.integer(Colv)) { aa <- x if( !symm ) aa <- t(x) if( missing(colhcdata) | is.null(colhcdata) ) { hcc <- gcc.hclust( aa, cpus = cpus, method = method, distancemethod = distancemethod, clustermethod = clustermethod) }else { hcc <- colhcdata } ddc <- as.dendrogram(hcc$hc) ddc <- reorder(ddc, Colv) colInd <- order.dendrogram(ddc) if (nc != length(colInd)) { stop("column dendrogram ordering gave index of wrong length") } } else if (isTRUE(Colv)) { Colv <- colMeans(x, na.rm = na.rm) aa <- x if( !symm ) aa <- t(x) if( missing(colhcdata) | is.null(colhcdata) ) { hcc <- gcc.hclust( aa, cpus = cpus, method = method, distancemethod = distancemethod, clustermethod = clustermethod) }else { hcc <- colhcdata } ddc <- as.dendrogram(hcc$hc) ddc <- reorder(ddc, Colv) colInd <- order.dendrogram(ddc) if (nc != length(colInd)) {stop("column dendrogram ordering gave index of wrong length") } } else { colInd <- 1:nc } retval$rowInd <- rowInd retval$colInd <- colInd retval$call <- match.call() x <- x[rowInd, colInd] x.unscaled <- x cellnote <- cellnote[rowInd, colInd] if (is.null(labRow)) { labRow <- if (is.null(rownames(x))) (1:nr)[rowInd] else rownames(x) }else { labRow <- labRow[rowInd] } if (is.null(labCol)) { labCol <- if (is.null(colnames(x))) (1:nc)[colInd] else colnames(x) }else { labCol <- labCol[colInd] } if (scale == "row") { retval$rowMeans <- rm <- rowMeans(x, na.rm = na.rm) x <- sweep(x, 1, rm) retval$rowSDs <- sx <- apply(x, 1, sd, na.rm = na.rm) x <- sweep(x, 1, sx, "/") } else if (scale == "column") { retval$colMeans <- rm <- colMeans(x, na.rm = na.rm) x <- sweep(x, 2, rm) retval$colSDs <- sx <- apply(x, 2, sd, na.rm = na.rm) x <- sweep(x, 2, sx, "/") } ##set breaks for colorkey and heat map if (missing(breaks) | is.null(breaks) | length(breaks) < 1) { if (missing(colrange) || is.function(colrange)) breaks <- 20 else if( is.vector(colrange) == TRUE & length(colrange) > 3) { #pre-define colors breaks <- length(col) + 1 }else { breaks <- 20 } } if (length(breaks) == 1) { #if already if( quanbreaks == TRUE ) { breaks <- quantile( unique(c(x)), probs = seq(0, 1, length = breaks), na.rm = TRUE) }else { if (!symbreaks) breaks <- seq(min(x, na.rm = na.rm), max(x, na.rm = na.rm), length = breaks) else { extreme <- max(abs(x), na.rm = TRUE) breaks <- seq(-extreme, extreme, length = breaks) } } } nbr <- length(breaks) ncol <- length(breaks) - 1 if (class(colrange) == "function") { col <- colrange(ncol) }else if( is.vector(colrange) == TRUE & length(colrange) >= 2 ) { col <- colorRampPalette(colrange)(nbr - 1) #for three col }else{ col <- colorRampPalette(c("green", "black", "red"))(nbr - 1) #for three col } min.breaks <- min(breaks) max.breaks <- max(breaks) x[x < min.breaks] <- min.breaks x[x > max.breaks] <- max.breaks if (missing(lhei) | is.null(lhei)) lhei <- c(keysize, 4) if (missing(lwid) | is.null(lwid)) lwid <- c(keysize, 4) if (missing(lmat) | is.null(lmat)) { lmat <- rbind(4:3, 2:1) if (!missing(ColSideColors)) { if (!is.character(ColSideColors) | length(ColSideColors) != nc) { stop("'ColSideColors' must be a character vector of length ncol(x)") } lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + 1) lhei <- c(lhei[1], 0.2, lhei[2]) } if (!missing(RowSideColors)) { if (!is.character(RowSideColors) | length(RowSideColors) != nr) { stop("'RowSideColors' must be a character vector of length nrow(x)") } lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - 1), 1), lmat[, 2] + 1) lwid <- c(lwid[1], 0.2, lwid[2]) } lmat[is.na(lmat)] <- 0 } if (length(lhei) != nrow(lmat)) stop("lhei must have length = nrow(lmat) = ", nrow(lmat)) if (length(lwid) != ncol(lmat)) stop("lwid must have length = ncol(lmat) =", ncol(lmat)) op <- par(no.readonly = TRUE) on.exit(par(op)) layout(lmat, widths = lwid, heights = lhei, respect = FALSE) if (!missing(RowSideColors)) { par(mar = c(margins[1], 1, 1, 0.5)) image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE) } if (!missing(ColSideColors)) { par(mar = c(0.5, 0, 0, margins[2])) image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE) } par(mar = c(margins[1], 0, 0, margins[2])) x <- t(x) cellnote <- t(cellnote) if (revC) { iy <- nr:1 if (exists("ddr")) { ddr <- rev(ddr) } x <- x[, iy] cellnote <- cellnote[, iy] } else { iy <- 1:nr } image(1:nc, 1:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 + c(0, nr), axes = FALSE, xlab = "" , ylab = "", col = col, breaks = breaks, ...) retval$carpet <- x if (exists("ddr")) retval$rowDendrogram <- ddr if (exists("ddc")) retval$colDendrogram <- ddc retval$breaks <- breaks retval$col <- col if (!invalid(na.color) & any(is.na(x))) { mmat <- ifelse(is.na(x), 1, NA) image(1:nc, 1:nr, mmat, axes = FALSE, xlab = "", ylab = "", col = na.color, add = TRUE) } axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0, cex.axis = cexCol) if (!is.null(xlab)) mtext(xlab, side = 1, line = margins[1] - 1.25) axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0, cex.axis = cexRow) if (!is.null(ylab)) mtext(ylab, side = 4, line = margins[2] - 1.25) if (!missing(add.expr)) eval(substitute(add.expr)) if (!missing(colsep)) for (csep in colsep) rect(xleft = csep + 0.5, ybottom = rep(0, length(csep)), xright = csep + 0.5 + sepwidth[1], ytop = rep(ncol(x) + 1, csep), lty = 1, lwd = 1, col = sepcolor, border = sepcolor) if (!missing(rowsep)) for (rsep in rowsep) rect(xleft = 0, ybottom = (ncol(x) + 1 - rsep) - 0.5, xright = nrow(x) + 1, ytop = (ncol(x) + 1 - rsep) - 0.5 - sepwidth[2], lty = 1, lwd = 1, col = sepcolor, border = sepcolor) min.scale <- min(breaks) max.scale <- max(breaks) x.scaled <- scale01(t(x), min.scale, max.scale) if (trace %in% c("both", "column")) { retval$vline <- vline vline.vals <- scale01(vline, min.scale, max.scale) for (i in colInd) { if (!is.null(vline)) { abline(v = i - 0.5 + vline.vals, col = linecol, lty = 2) } xv <- rep(i, nrow(x.scaled)) + x.scaled[, i] - 0.5 xv <- c(xv[1], xv) yv <- 1:length(xv) - 0.5 lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") } } if (trace %in% c("both", "row")) { retval$hline <- hline hline.vals <- scale01(hline, min.scale, max.scale) for (i in rowInd) { if (!is.null(hline)) { abline(h = i + hline, col = linecol, lty = 2) } yv <- rep(i, ncol(x.scaled)) + x.scaled[i, ] - 0.5 yv <- rev(c(yv[1], yv)) xv <- length(yv):1 - 0.5 lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") } } if (!missing(cellnote)) text(x = c(row(cellnote)), y = c(col(cellnote)), labels = c(cellnote), col = notecol, cex = notecex) par(mar = c(margins[1], 0, 0, 0)) if (dendrogram %in% c("both", "row")) { plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") } else plot.new() par(mar = c(0, 0, if (!is.null(main)) 5 else 0, margins[2])) if (dendrogram %in% c("both", "column")) { plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none") } else plot.new() if (!is.null(main)) title(main, cex.main = 1.5 * op[["cex.main"]]) #for key if (key) { # par(mar = c(5, 4, 2, 1), cex = 0.75) tmpbreaks <- breaks if (symkey) { max.raw <- max(abs(c(x, breaks)), na.rm = TRUE) min.raw <- -max.raw tmpbreaks[1] <- -max(abs(x), na.rm = TRUE) tmpbreaks[length(tmpbreaks)] <- max(abs(x), na.rm = TRUE) } else { min.raw <- min(x, na.rm = TRUE) max.raw <- max(x, na.rm = TRUE) } z <- seq(min.raw, max.raw, length = length(col)) # image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks, xaxt = "n", yaxt = "n") image(z = matrix(z, ncol = 1), col = col, xaxt = "n", yaxt = "n", xlab = "", ylab = "") par(usr = c(0, 1, 0, 1)) lv <- pretty(breaks) xv <- scale01(as.numeric(lv), min.raw, max.raw) axis(1, at = xv, labels = lv) if(scale == "row") { mtext(side = 1, paste(keynote, "(Row Z-Score)", sep=""), line = 2) }else if(scale == "column") {mtext(side = 1, paste(keynote, "(Column Z-Score)", sep=""), line = 2) }else { mtext(side = 1, keynote, line = 2) } if (density.info == "density") { dens <- density(x, adjust = densadj, na.rm = TRUE) omit <- dens$x < min(breaks) | dens$x > max(breaks) dens$x <- dens$x[-omit] dens$y <- dens$y[-omit] dens$x <- scale01(dens$x, min.raw, max.raw) lines(dens$x, dens$y/max(dens$y) * 0.95, col = denscol, lwd = 1) axis(2, at = pretty(dens$y)/max(dens$y) * 0.95, pretty(dens$y)) # title("Color Key\nand Density Plot") par(cex = 0.5) mtext(side = 2, "Density", line = 2) } else if (density.info == "histogram") { h <- hist(x, plot = FALSE, breaks = breaks) hx <- scale01(breaks, min.raw, max.raw) hy <- c(h$counts, h$counts[length(h$counts)]) lines(hx, hy/max(hy) * 0.95, lwd = 1, type = "s", col = denscol) axis(2, at = pretty(hy)/max(hy) * 0.95, pretty(hy)) # title("Color Key\nand Histogram") par(cex = 0.5) mtext(side = 2, "Count", line = 2) } else title("") } else plot.new() retval$colorTable <- data.frame(low = retval$breaks[-length(retval$breaks)], high = retval$breaks[-1], color = retval$col) invisible(retval) return( list(retval = retval, hcr = hcr, hcc = hcc) ) } ##check whether x is a data matrix and k for KNN-based MI esimator. .check.matrix <- function(x, k, name) { if ((!is.matrix(x) && !is.data.frame(x)) || nrow(x) < 2) { stop(paste(name, "must be a multi-row matrix or a data.frame")) } else if (ncol(x) <= k) { stop(paste(name, " has too few columns (must be > ", k, ")", sep="")) } } ##check the variable with multiple elements. .check.variable <- function(var, name, candidates) { if( is.vector(var) ){ vartmp = var[1] } if( length(vartmp) == 0 | is.null(var) == TRUE ) { stop( paste(name, "must have a value with the candicate parameters.")) } if( length( which( candidates == vartmp ) ) == 0 ){ stop( paste(name, "must have a value with the candicate parameters.") ) } vartmp } #correlations for gene pairs of all genes .cor_all <- function(xs, rowidx = NULL, colidx = NULL, corMethod = "GCC", cpus = 1, saveType = "matrix", backingpath = NULL, backingfile = "adj_mat", descriptorfile = "adj_desc" ) { h <- nrow(xs) w <- ncol(xs) k <- 3 noise <- 0.0 subrownum <- 0 subcolnum <- 0 if( (length(rowidx) != h) | (length(colidx) != w) ) { subrownum <- length(rowidx) subcolnum <- length(colidx) } corIndex <- c(1,2,3,4, 5) names(corIndex) <- c("GCC", "PCC", "SCC", "KCC", "ED") curcorIndex <- corIndex[corMethod] res <- NULL xsix <- NULL Rownames <- NULL Colnames <- NULL #all gene paris considered if( (length(rowidx) == h) & (length(colidx) == h) ) { Rownames <- rownames(xs) Colnames <- Rownames if( corMethod == "GCC" || corMethod == "SCC" ) { xsix <- apply( xs, 1, function(x) sort( sort(x,index.return=TRUE)$ix, index.return=TRUE)$ix ) res <- .C("c_cor_all", as.integer(curcorIndex), as.double(t(xs)), as.integer(xsix), as.integer(h), as.integer(w), as.integer(k), as.double(noise), as.integer(cpus), res = double(h*h), PACKAGE = "rsgcc", DUP = TRUE)$res }else if( corMethod == "PCC" || corMethod == "KCC" || corMethod == "ED" ) { res <- .C("c_cor_all", as.integer(curcorIndex), as.double(t(xs)), as.integer(xsix), as.integer(h), as.integer(w), as.integer(k), as.double(noise), as.integer(cpus), res = double(h*h), PACKAGE = "rsgcc", DUP = TRUE)$res }else{ stop("Error: undefined cor method).\n") } }else { #part gene paris considered Rownames <- rownames(xs)[rowidx] Colnames <- rownames(xs)[colidx] if( corMethod == "GCC" || corMethod == "SCC" ) { xsix <- apply( xs, 1, function(x) sort( sort(x,index.return=TRUE)$ix, index.return=TRUE)$ix ) res <- .C("c_cor_subset", as.integer(curcorIndex), as.double(t(xs)), as.integer(xsix), as.integer(h), as.integer(w), as.integer(k), as.double(noise), as.integer(cpus), as.integer(rowidx), as.integer(colidx), as.integer(subrownum), as.integer(subcolnum), res = double(subrownum*subcolnum), PACKAGE = "rsgcc", DUP = TRUE)$res }else if( corMethod == "PCC" || corMethod == "KCC" || corMethod == "ED" ) { res <- .C("c_cor_subset", as.integer(curcorIndex), as.double(t(xs)), as.integer(xsix), as.integer(h), as.integer(w), as.integer(k), as.double(noise), as.integer(cpus), as.integer(rowidx), as.integer(colidx), as.integer(subrownum), as.integer(subcolnum), res = double(subrownum*subcolnum), PACKAGE = "rsgcc", DUP = TRUE)$res }else{ stop("Error: undefined cor method).\n") } } m <- t( matrix(res, nrow=length(Colnames)) ) rownames(m) <- Rownames colnames(m) <- Colnames if( saveType == "bigmatrix" ) { options(bigmemory.allow.dimnames=TRUE) if( is.null(backingpath) ) backingpath <- getwd() m <- as.big.matrix(m, type = "double", separated=FALSE, backingfile= backingfile, backingpath= backingpath, descriptorfile = descriptorfile, shared = TRUE) } m } ############################################################################################# #generate adjacency matrix from a gene expression data #Modified on 2012-11-28. two parameters (genes.row, genes.col) were added for generating a correlation matrix with length(genes.row) X length(genes.col). adjacencymatrix <- function(mat, genes.row = NULL, genes.col = NULL, method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"), k = 3, cpus = 1, saveType = "matrix", backingpath = NULL, backingfile = "adj_mat", descriptorfile = "adj_desc", ... ) { call <- match.call() .check.matrix(mat, k, "xs") method <- .check.variable(method, "method", c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED") ) if (method == "MI" && k < 2) stop("k must be >= 2.") geneNames <- rownames(mat) if( is.null(geneNames) ) geneNames <- c(1:nrow(mat)) ##for BiWt if( method == "BiWt" ) { m <- cor.matrix(mat, cpus = cpus, cormethod= method, style= "all.pairs", pernum= 0, sigmethod= "two.sided", output = "matrix")$corMatrix rownames(m) <- rownames(mat) colnames(m) <- rownames(mat) if( saveType == "bigmatrix" ) { options(bigmemory.allow.dimnames=TRUE) if( is.null(backingpath) ) backingpath <- getwd() m <- as.big.matrix(m, type = "double", separated=FALSE, backingfile= backingfile, backingpath= backingpath, descriptorfile = descriptorfile, shared = TRUE) }#end if bigmatrix return(m) } ##for mine if( method == "MINE" ) { m <- mine(t(mat))$MIC rownames(m) <- rownames(mat) colnames(m) <- rownames(mat) if( saveType == "bigmatrix" ) { options(bigmemory.allow.dimnames=TRUE) if( is.null(backingpath) ) backingpath <- getwd() m <- as.big.matrix(m, type = "double", separated=FALSE, backingfile= backingfile, backingpath= backingpath, descriptorfile = descriptorfile, shared = TRUE) }#end if bigmatrix return(m) } #for other methods Rownames <- geneNames Colnames <- geneNames row.idx <- c(1:nrow(mat)) col.idx <- c(1:nrow(mat)) if( !is.null(genes.row) ) { row.idx <- match( genes.row, geneNames) if( length( which(is.na(row.idx)) ) > 0 ) { stop("Error: some gene names in genes.row are not found in rownames(mat).\n") } Rownames <- rownames(mat)[row.idx] } if( !is.null(genes.col) ) { col.idx <- match( genes.col, geneNames) if( length( which(is.na(col.idx)) ) > 0 ) { stop("Error: some gene names in genes.col are not found in rownames(mat).\n") } Colnames <- rownames(mat)[col.idx] } #for knnmi m <- NULL if( method == "MI" ){ if( is.null(genes.row) | is.null(genes.col) ) { m <- knnmi.cross( mat[row.idx,], mat[col.idx,], k, noise=1e-12 ) }else { m <- knnmi.all(mat, k, noise=1e-12) } rownames(m) <- Rownames colnames(m) <- Colnames }else{ m <-.cor_all(xs = mat, rowidx = row.idx, colidx = col.idx, corMethod = method, cpus = cpus, saveType = saveType, backingpath = backingpath, backingfile = backingfile, descriptorfile = descriptorfile) } m } rsgcc/R/gethclust.R0000644000175100001440000000254412155671320013726 0ustar hornikusers######################################################################### ##Function: compute cluster for microarray and RNASeq gene expression ## data with different dissimilarity methods. ##Author: Chuang Ma ##Date: 2012-02-16 ######################################################################### gcc.hclust <- function(x, cpus = 1, method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"), distancemethod = c("Raw", "Abs", "Sqr"), clustermethod = c("complete", "average", "median", "centroid", "mcquitty", "single", "ward") ) { if( length(method) > 1 ) { stop("Error: only allow one correlation method") } if( length(distancemethod) > 1 ) { print(distancemethod) stop("Error: only allow one distance method") } if( length(clustermethod) > 1 ) { stop("Error: only allow one cluster method") } ##default parameter if( is.null(method) ) method <- "GCC" if( is.null(distancemethod)) distancemethod <- "Raw" if( is.null(clustermethod)) clustermethod <- "complete" ddata <- gcc.dist(x, cpus = cpus, method= method, distancemethod = distancemethod ) hcdata <- hclust(ddata$dist, method = clustermethod) return( list(hc = hcdata, dist = ddata$dist, pairmatrix = ddata$pairmatrix)) } rsgcc/R/getdist.R0000644000175100001440000000243512155673376013403 0ustar hornikusers########################################################################## ##Function: Compute the distance (dissimilarity) between both rows. ##Notice: Here we only consider the distance between rows. ## For distance between columns, run t(x) before devoke the DistFun ########################################################################### gcc.dist <- function(x, cpus = 1, method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"), distancemethod = c("Raw", "Abs", "Sqr")) { if( length(distancemethod) > 1 ) { stop("Error: only allow one distance method") } if( is.null(method)) method = "GCC" if( is.null(distancemethod)) distancemethod = "Raw" AllPairMatrix <- adjacencymatrix( mat = x, method = method, cpus = cpus, saveType = "matrix", backingpath = NULL, backingfile = "adj_mat", descriptorfile = "adj_desc" ) if( distancemethod == "Raw") { ad <- as.dist( 1- AllPairMatrix ) }else if( distancemethod == "Abs") { ad <- as.dist( 1- abs(AllPairMatrix) ) }else if( distancemethod == "Sqr") { ad <- as.dist( 1-AllPairMatrix^2) }else { stop("Error: the distance method should be Raw, Abs, or Sqr") } return( list( dist = ad, pairmatrix = AllPairMatrix)) } rsgcc/NAMESPACE0000644000175100001440000000006012027244730012605 0ustar hornikusersuseDynLib(rsgcc) exportPattern("^[[:alpha:]]+") rsgcc/DESCRIPTION0000644000175100001440000000276712157771333013124 0ustar hornikusersPackage: rsgcc Type: Package Title: Gini methodology-based correlation and clustering analysis of microarray and RNA-Seq gene expression data Version: 1.0.6 Author: Chuang Ma, Xiangfeng Wang Maintainer: Chuang Ma URL: http://www.cmbb.arizona.edu/ Depends: R (>= 2.15.1), biwt, cairoDevice, fBasics, grDevices, gplots, gWidgets, gWidgetsRGtk2, minerva, parmigene, stringr, snowfall Suggests: bigmemory, ctc Description: This package provides functions for calculating associations between two genes with five correlation methods(e.g., the Gini correlation coefficient [GCC], the Pearson's product moment correlation coefficient [PCC], the Kendall tau rank correlation coefficient [KCC], the Spearman's rank correlation coefficient [SCC] and the Tukey's biweight correlation coefficient [BiWt], and three non-correlation methods (e.g., mutual information [MI] and the maximal information-based nonparametric exploration [MINE], and the euclidean distance [ED]). It can also been implemented to perform the correlation and clustering analysis of transcriptomic data profiled by microarray and RNA-Seq technologies. Additionally, this package can be further applied to construct gene co-expression networks (GCNs). LazyLoad: yes License: GPL (>= 2) Date: 2013-06-12 Packaged: 2013-06-18 01:18:38 UTC; wanglab NeedsCompilation: yes Repository: CRAN Date/Publication: 2013-06-18 07:40:43