pax_global_header00006660000000000000000000000064116515471730014524gustar00rootroot0000000000000052 comment=eeb77e97204fd35409abaf3616f3701bb7ffa9d9 RogueNaRok-1.0/000077500000000000000000000000001165154717300134005ustar00rootroot00000000000000RogueNaRok-1.0/Array.c000066400000000000000000000030061165154717300146210ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include "Array.h" void freeArray(Array *array) { free(array->arrayTable); free(array); } RogueNaRok-1.0/Array.h000066400000000000000000000056051165154717300146350ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef ARRAY_H #define ARRAY_H #include typedef struct { void *arrayTable; void *commonAttributes; unsigned int length; } Array; #define CLONE_ARRAY_FLAT(FUNCNAME, TYPE, TYPEATTR) \ Array *FUNCNAME (const Array *array) \ { \ Array *result = CALLOC(1,sizeof(Array)); \ result->length = array->length; \ result->arrayTable = CALLOC(result->length, sizeof(TYPE)); \ memcpy(result->arrayTable, array->arrayTable, array->length * sizeof(TYPE) ); \ \ if( array->commonAttributes ) \ { \ result->commonAttributes = CALLOC(1 , sizeof(TYPEATTR)); \ memcpy(result->commonAttributes, array->commonAttributes, sizeof(TYPEATTR) ) ; \ } \ return result; \ } void freeArray(Array *array); #endif RogueNaRok-1.0/BitVector.c000066400000000000000000000060101165154717300154420ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include "BitVector.h" BitVector *mask32; void initializeMask() { int i; mask32 = CALLOC(MASK_LENGTH, sizeof(BitVector)); mask32[0] = 1; for(i = 1; i < MASK_LENGTH; ++i) mask32[i] = mask32[i-1] << 1; } void printBitVector(BitVector *bv, int length) { int i ; for(i = 0; i < length * 32; ++i) printf("%d", NTH_BIT_IS_SET(bv, i) ? 1 : 0); } void freeBitVectors(unsigned int **v, int n) { int i; for(i = 1; i < n; i++) free(v[i]); } boolean areSameBitVectors(BitVector *a, BitVector *b, int bitVectorLength) { int i; FOR_0_LIMIT(i,bitVectorLength) if(a[i] != b[i]) return FALSE; return TRUE; } BitVector genericBitCount(BitVector* bitVector, int bitVectorLength) { BitVector i, result = 0; for(i = 0; i < bitVectorLength; i++) result += BIT_COUNT(bitVector[i]); return result; } static int iterated_bitcount(BitVector n) { int count=0; while(n) { count += n & 0x1u ; n >>= 1 ; } return count; } void compute_bits_in_16bits(void) { BitVector i; for (i = 0; i < (0x1u<<16); i++) bits_in_16bits[i] = iterated_bitcount(i); return ; } BitVector precomputed16_bitcount (BitVector n) { /* works only for 32-bit int*/ return bits_in_16bits [n & 0xffffu] + bits_in_16bits [(n >> 16) & 0xffffu] ; } BitVector *copyBitVector(BitVector *bitVector, int bitVectorLength) { BitVector *result = CALLOC(bitVectorLength, sizeof(BitVector)); memcpy(result, bitVector,bitVectorLength * sizeof(BitVector)); return result; } RogueNaRok-1.0/BitVector.h000066400000000000000000000051761165154717300154630ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef BITVECTOR_H #define BITVECTOR_H #include #include #include #include "common.h" typedef unsigned int BitVector; #define BIT_COUNT(x) precomputed16_bitcount(x) #define NUMBER_BITS_IN_COMPLEMENT(bipartition) (mxtips - dropRound - bipartition->numberOfBitsSet) #define GET_BITVECTOR_LENGTH(x) (((x) % MASK_LENGTH) ? ((x) / MASK_LENGTH + 1) : ((x) / MASK_LENGTH)) #define FLIP_NTH_BIT(bitVector,n) (bitVector[(n) / MASK_LENGTH] |= mask32[ (n) % MASK_LENGTH ]) #define UNFLIP_NTH_BIT(bitVector,n) (bitVector[(n) / MASK_LENGTH] &= ~mask32[ (n) % MASK_LENGTH ]) #define NTH_BIT_IS_SET(bitVector,n) (bitVector[(n) / MASK_LENGTH] & mask32[(n) % MASK_LENGTH]) #define NTH_BIT_IS_SET_IN_INT(integer,n) (integer & mask32[n]) #define MASK_LENGTH 32 char bits_in_16bits [0x1u << 16]; extern BitVector *mask32; void initializeMask(); BitVector genericBitCount(BitVector* bitVector, int bitVectorLength); BitVector precomputed16_bitcount (BitVector n); void compute_bits_in_16bits(void); void printBitVector(BitVector *bv, int length); void freeBitVectors(BitVector **v, int n); BitVector *copyBitVector(BitVector *bitVector, int bitVectorLength); void printBitVector(BitVector *bv, int length); #endif RogueNaRok-1.0/Dropset.c000066400000000000000000000212271165154717300151700ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include "Dropset.h" unsigned int *randForTaxa = NULL; extern int mxtips, maxDropsetSize, bitVectorLength; extern BitVector *droppedTaxa, *neglectThose, *paddingBits; void initializeRandForTaxa(int mxtips) { int i; randForTaxa = CALLOC(mxtips,sizeof(unsigned int)); FOR_0_LIMIT(i,mxtips) randForTaxa[i] = rand(); } /* is ONLY done for adding OWN elements */ void addEventToDropsetPrime(Dropset *dropset, int a, int b) { List *lIter; lIter = dropset->ownPrimeE; while(lIter) { List *next = lIter->next; MergingEvent *me = lIter->value; assert(NOT me->isComplex); if( me->mergingBipartitions.pair[0] == a || me->mergingBipartitions.pair[1] == b || me->mergingBipartitions.pair[1] == a || me->mergingBipartitions.pair[0] == b) { assert( (me->mergingBipartitions.pair[0] == a) + (me->mergingBipartitions.pair[1] == b) + (me->mergingBipartitions.pair[0] == b) + (me->mergingBipartitions.pair[1] == a) == 2); return; } lIter = next; } MergingEvent *result = CALLOC(1,sizeof(MergingEvent)); result->mergingBipartitions.pair[0] = b; result->mergingBipartitions.pair[1] = a; APPEND(result, dropset->ownPrimeE); } List *addEventToDropsetCombining(List *complexEvents, MergingBipartitions primeEvent) { int a = primeEvent.pair[0], b = primeEvent.pair[1]; List *firstElem = NULL, *secondElem = NULL; /* find list elems that already contain merging events */ List *iter ; iter = complexEvents; FOR_LIST(iter) { int res = isInIndexListSpecial(a,b,((MergingEvent*)iter->value)->mergingBipartitions.many); if(res) { if(NOT firstElem) firstElem = iter; else if(NOT secondElem) secondElem = iter; else break; } } if(firstElem && secondElem) { /* merge bips into first elem */ IndexList *il = ((MergingEvent*)secondElem->value)->mergingBipartitions.many, *persistentIndexList = ((MergingEvent*)firstElem->value)->mergingBipartitions.many; FOR_LIST(il) persistentIndexList = appendToIndexListIfNotThere(il->index, persistentIndexList); ((MergingEvent*)firstElem->value)->mergingBipartitions.many = persistentIndexList; freeIndexList(((MergingEvent*)secondElem->value)->mergingBipartitions.many); free((MergingEvent*)secondElem->value); /* remove second element */ if(complexEvents == secondElem) complexEvents = secondElem->next; else { iter = complexEvents; FOR_LIST(iter) { if(iter->next == secondElem) { iter->next = iter->next->next; break; } } } free(secondElem); } else if(firstElem) { assert( NOT secondElem); MergingEvent *me = firstElem->value; me->mergingBipartitions.many = appendToIndexListIfNotThere(a,me->mergingBipartitions.many); me->mergingBipartitions.many = appendToIndexListIfNotThere(b,me->mergingBipartitions.many); } else { MergingEvent *me = CALLOC(1,sizeof(MergingEvent)); me->isComplex = TRUE; me->mergingBipartitions.many = NULL; APPEND_INT(a,me->mergingBipartitions.many); APPEND_INT(b,me->mergingBipartitions.many); APPEND(me, complexEvents); } return complexEvents; } void freeDropsetDeepInHash(void *value) { freeDropsetDeep(value, TRUE); } void freeDropsetDeep(void *value, boolean extended) { Dropset *dropset = (Dropset*) value; List *iter = NULL; if(dropset->taxaToDrop) freeIndexList(dropset->taxaToDrop); /* free combined events */ if(extended && dropset->complexEvents) { iter = dropset->complexEvents; while(iter) { List *next = iter->next; MergingEvent *me = iter->value; freeIndexList(me->mergingBipartitions.many); free(me); iter = next; } freeListFlat(dropset->complexEvents); } if(extended && dropset->acquiredPrimeE) freeListFlat(dropset->acquiredPrimeE); /* ownPrimeE */ iter = dropset->ownPrimeE; while(iter) { List *next = iter->next; free((MergingEvent*)iter->value); iter = next; } freeListFlat(dropset->ownPrimeE); free(dropset); } void freeDropsetDeepInEnd(void *value) { Dropset *dropset = (Dropset*) value; List *iter = NULL; if(dropset->taxaToDrop) freeIndexList(dropset->taxaToDrop); /* free combined events */ if(dropset->complexEvents) freeListFlat(dropset->complexEvents); if( dropset->acquiredPrimeE) freeListFlat(dropset->acquiredPrimeE); /* ownPrimeE */ iter = dropset->ownPrimeE; while(iter) { List *next = iter->next; free((MergingEvent*)iter->value); iter = next; } freeListFlat(dropset->ownPrimeE); free(dropset); } #ifdef PRINT_VERY_VERBOSE void printMergingEventStruct(MergingEvent **mergingEvents, int mxtips) { int i; MergingEvent *iter_merge; IndexList *iter_index; PR("MERGING EVENT STRUCT:\n"); FOR_0_LIMIT(i,mxtips) { PR("%d:\t",i); iter_merge = mergingEvents[i]; FOR_LIST(iter_merge) { PR("{"); iter_index = iter_merge->mergingBipartitions; FOR_LIST(iter_index) { PR("%d,", iter_index->index); } PR(" : %d} ", (iter_merge->supportGained - iter_merge->supportLost)); } PR( "\n"); } PR("\n"); } #endif unsigned int dropsetHashValue(HashTable *hashTable, void *value) { unsigned int result = 0; Dropset *dropset = (Dropset*)value; IndexList *iter = dropset->taxaToDrop; FOR_LIST(iter) { assert(iter->index < mxtips); result ^= randForTaxa[ iter->index ]; } return result; } boolean dropsetEqual(HashTable *hashtable, void *entryA, void *entryB) { IndexList *aList = ((Dropset*)entryA)->taxaToDrop, *bList = ((Dropset*)entryB)->taxaToDrop; return indexListEqual(aList, bList); } IndexList *convertBitVectorToIndexList(BitVector *bv) { int i; IndexList *result = NULL; FOR_0_LIMIT(i,mxtips) if(NTH_BIT_IS_SET(bv, i)) APPEND_INT(i, result); return result; } boolean elementsEqual(ProfileElem *elemA,ProfileElem *elemB, int a, int b) { return (elemA->id == a && elemB->id == b ) || (elemB->id == a && elemA->id == b) ; } IndexList *getDropset(ProfileElem *elemA, ProfileElem *elemB, boolean complement, BitVector *neglectThose) { int i,j, numBit = 0, localBitCount, differenceByte; IndexList *result = NULL; if(elemA == elemB) return NULL; FOR_0_LIMIT(i,bitVectorLength) { if( complement) differenceByte = ~ ((elemA->bitVector[i] ^ elemB->bitVector[i]) | ( droppedTaxa[i] | paddingBits[i] )); else differenceByte = elemA->bitVector[i] ^ elemB->bitVector[i]; localBitCount = BIT_COUNT(differenceByte); if( (numBit += localBitCount) > maxDropsetSize) { freeIndexList(result); return NULL; } if( NOT localBitCount) continue; FOR_0_LIMIT(j,MASK_LENGTH) { if(NOT localBitCount) break; if(NTH_BIT_IS_SET_IN_INT(differenceByte,j)) { APPEND_INT((( i * MASK_LENGTH) + j),result); localBitCount--; if(NOT NTH_BIT_IS_SET(neglectThose, (i*MASK_LENGTH + j))) { freeIndexList(result); return NULL; } } } } assert(numBit); return result; } RogueNaRok-1.0/Dropset.h000066400000000000000000000054171165154717300152000ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef DROPSET_H #define DROPSET_H #include "common.h" #include "List.h" #include "ProfileElem.h" #include "HashTable.h" typedef union _mergeBips { int pair[2]; IndexList *many; } MergingBipartitions; typedef struct _mergingEvent { MergingBipartitions mergingBipartitions; boolean isComplex; int supportLost; int supportGained; boolean computed; } MergingEvent; typedef struct dropset { IndexList *taxaToDrop; int improvement; List *ownPrimeE; List *acquiredPrimeE; List *complexEvents; } Dropset; boolean dropsetEqual(HashTable *hashtable, void *entryA, void *entryB); unsigned int dropsetHashValue(HashTable *hashTable, void *value); void removeMergingEvent(Dropset *dropset, List *toBeRemoved); List *freeMergingEventReturnNext(List *elem); void removeDropsetAndRelated(HashTable *mergingHash, Dropset *dropset); void addEventToDropsetPrime(Dropset *dropset, int a, int b); List *addEventToDropsetCombining(List *complexEvents, MergingBipartitions primeEvent); void freeDropsetDeepInHash(void *value); void freeDropsetDeepInEnd(void *value); void addEventToDropsetForCombining(Dropset *dropset, IndexList *mergingBips); void initializeRandForTaxa(int mxtips); void freeDropsetDeep(void *values, boolean freeCombinedM); IndexList *getDropset(ProfileElem *elemA, ProfileElem *elemB, boolean complement, BitVector *neglectThose); #endif RogueNaRok-1.0/HashTable.c000066400000000000000000000173671165154717300154150ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include "HashTable.h" #ifdef PARALLEL #include #endif HashTable *createHashTable(unsigned int size, void *commonAttr, unsigned int (*hashFunction)(HashTable *hash_table, void *value), boolean (*equalFunction)(HashTable *hash_table, void *entryA, void *entryB)) { static const unsigned int initTable[] = {64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648U}; HashTable *hashTable = CALLOC(1, sizeof(HashTable)); unsigned int tableSize, i, #ifdef DEBUG maxSize = (unsigned int)-1, #endif primeTableLength = sizeof(initTable)/sizeof(initTable[0]); hashTable->hashFunction = hashFunction; hashTable->equalFunction = equalFunction; hashTable->commonAttributes = commonAttr; #ifdef DEBUG assert(size <= maxSize); #endif for(i = 0; initTable[i] < size && i < primeTableLength; ++i); assert(i < primeTableLength); tableSize = initTable[i]; #ifdef PARALLEL hashTable->cntLock = CALLOC(1,sizeof(pthread_mutex_t)); pthread_mutex_init(hashTable->cntLock, (pthread_mutexattr_t *)NULL); hashTable->lockPerSlot = (pthread_mutex_t**)CALLOC(tableSize,sizeof(pthread_mutex_t*)); FOR_0_LIMIT(i, tableSize) { hashTable->lockPerSlot[i] = (pthread_mutex_t*) CALLOC(1,sizeof(pthread_mutex_t)); pthread_mutex_init(hashTable->lockPerSlot[i], (pthread_mutexattr_t *)NULL); } #endif hashTable->table = CALLOC(tableSize, sizeof(HashElem*)); hashTable->tableSize = tableSize; hashTable->entryCount = 0; return hashTable; } /* remove without destroying value of hashelem */ boolean removeElementFromHash(HashTable *hashtable, void *value) { unsigned int hashValue = hashtable->hashFunction(hashtable, value), position = hashValue % hashtable->tableSize; HashElem *elem = hashtable->table[position]; if( NOT elem ) { assert(0); return FALSE; } if( elem->fullKey == hashValue && hashtable->equalFunction(hashtable, elem->value, value)) { hashtable->table[position] = elem->next ; free(elem); hashtable->entryCount-- ; return TRUE; } while(elem->next) { if(elem->next->fullKey == hashValue && hashtable->equalFunction(hashtable, elem->next->value, value)) { void *nextOne = elem->next->next; free(elem->next); elem->next = nextOne; hashtable->entryCount-- ; return TRUE; } elem = elem->next; } return FALSE; } void *searchHashTableWithInt(HashTable *hashtable, unsigned int hashValue) { unsigned int position = hashValue % hashtable->tableSize; HashElem *elem; for(elem = hashtable->table[position]; elem; elem = elem->next) if(elem->fullKey == hashValue) return elem->value; return NULL; } void *searchHashTable(HashTable *hashtable, void *value, unsigned int hashValue) { unsigned int position = hashValue % hashtable->tableSize; HashElem *elem; for(elem = hashtable->table[position]; elem; elem = elem->next) if(elem->fullKey == hashValue && hashtable->equalFunction(hashtable, elem->value, value)) return elem->value; return NULL; } void insertIntoHashTable(HashTable *hashTable, void *value, unsigned int index) { /* just copied this */ HashElem *hashElem = CALLOC(1, sizeof(HashElem)); hashElem->fullKey = index; index = hashElem->fullKey % hashTable->tableSize; hashElem->value = value; hashElem->next = hashTable->table[index]; hashTable->table[index] = hashElem; #ifdef PARALLEL pthread_mutex_lock(hashTable->cntLock); hashTable->entryCount++; pthread_mutex_unlock(hashTable->cntLock); #else hashTable->entryCount++; #endif } void destroyHashTable(HashTable *hashTable, void (*freeValue)(void *value)) { unsigned int i; HashElem *elemA, *elemB, **table = hashTable->table; for(i = 0; i < hashTable->tableSize; ++i) { elemA = table[i]; while(elemA != NULL) { elemB = elemA; elemA = elemA->next; if(freeValue) freeValue(elemB->value); free(elemB); } } #ifdef PARALLEL pthread_mutex_destroy(hashTable->cntLock); free(hashTable->cntLock); FOR_0_LIMIT(i,hashTable->tableSize) { pthread_mutex_destroy(hashTable->lockPerSlot[i]); free(hashTable->lockPerSlot[i]); } free(hashTable->lockPerSlot); #endif free(hashTable->commonAttributes); free(hashTable->table); free(hashTable); } void updateEntryCount(HashTable *hashTable) { unsigned int i, result = 0; for(i = 0; i < hashTable->tableSize; ++i) { HashElem *elem = ((HashElem**)hashTable->table)[i]; while(elem) { result++; elem = elem->next; } } hashTable->entryCount = result; } HashTableIterator *createHashTableIterator(HashTable *hashTable) { unsigned int i; HashTableIterator *hashTableIterator = CALLOC(1, sizeof(HashTableIterator)); hashTableIterator->hashTable = hashTable; hashTableIterator->hashElem = NULL; hashTableIterator->index = hashTable->tableSize; if( NOT hashTable->entryCount) return hashTableIterator; FOR_0_LIMIT(i,hashTable->tableSize) { if(hashTable->table[i]) { hashTableIterator->hashElem = hashTable->table[i]; hashTableIterator->index = i; break; } } return hashTableIterator; } boolean hashTableIteratorNext(HashTableIterator *hashTableIterator) { unsigned int i, tableSize = hashTableIterator->hashTable->tableSize; HashElem *next = hashTableIterator->hashElem->next; if(next) { hashTableIterator->hashElem = next; return TRUE; } i = hashTableIterator->index + 1; if(i >= tableSize) { hashTableIterator->index = i; return FALSE; } while( NOT (next = hashTableIterator->hashTable->table[i])) { if( ++i >= tableSize) { hashTableIterator->index = i; return FALSE; } } hashTableIterator->index = i; hashTableIterator->hashElem = next; return next != NULL ; } void *getCurrentValueFromHashTableIterator(HashTableIterator *hashTableIterator) { return ((hashTableIterator->hashElem) ? hashTableIterator->hashElem->value : NULL); } RogueNaRok-1.0/HashTable.h000066400000000000000000000063441165154717300154130ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef HASHTABLE_H #define HASHTABLE_H #include #include #include #include "common.h" typedef struct hash_el { unsigned int fullKey; void *value; struct hash_el *next; } HashElem; typedef struct hash_table { unsigned int tableSize; unsigned int entryCount; void *commonAttributes; unsigned int (*hashFunction)(struct hash_table *h, void *value); boolean (*equalFunction)(struct hash_table *hashtable, void *entrA, void *entryB); HashElem **table; #ifdef PARALLEL pthread_mutex_t **lockPerSlot; pthread_mutex_t *cntLock; #endif } HashTable; typedef struct { HashTable *hashTable; HashElem *hashElem; unsigned int index; } HashTableIterator; #define FOR_HASH(htIter, hashtable) boolean hasNext = TRUE; for(htIter = createHashTableIterator(hashtable); htIter && hasNext ; hasNext = hashTableIteratorNext(htIter)) #define FOR_HASH_2(htIter, hashtable) hasNext = TRUE; for(htIter = createHashTableIterator(hashtable); hasNext ; hasNext = hashTableIteratorNext(htIter)) HashTable *createHashTable(unsigned int size, void *commonAttr, unsigned int (*hashFunction)(HashTable *hash_table, void *value), boolean (*equalFunction)(HashTable *hash_table, void *entryA, void *entryB)); HashTableIterator *createHashTableIterator(HashTable *hashTable); void *getCurrentValueFromHashTableIterator(HashTableIterator *hashTableIterator); boolean hashTableIteratorNext(HashTableIterator *hashTableIterator); void *searchHashTableWithInt(HashTable *hashtable, unsigned int hashValue); void *searchHashTable(HashTable *hashtable, void *value, unsigned int hashValue); void insertIntoHashTable(HashTable *hashTable, void *value, unsigned int index); boolean removeElementFromHash(HashTable *hashtable, void *value); void destroyHashTable(HashTable *hashTable, void (*freeValue)(void *value)); #endif RogueNaRok-1.0/List.c000066400000000000000000000201001165154717300144500ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include "List.h" boolean isInIndexList(int index, IndexList *list) { IndexList *iter = list; FOR_LIST(iter) if(iter->index == index) return TRUE; return FALSE; } int isInIndexListSpecial(int a, int b, IndexList *list) { FOR_LIST(list) { if(list->index == a) return 1 ; if(list->index == b) return 2; } return 0; } IndexList *setMinusOf(IndexList *list, IndexList *subtract) { IndexList *iter = list, *result = NULL; FOR_LIST(iter) { IndexList *iter2 = subtract; boolean isThere = FALSE; FOR_LIST(iter2) isThere |= iter->index == iter2->index; if(NOT isThere) APPEND_INT(iter->index, result); } freeIndexList(list); return result; } #ifdef NEW_SUBSET boolean isSubsetOfReverseOrdered(IndexList *subset, IndexList *set) { IndexList *subsetIter = subset; boolean found = FALSE; FOR_LIST(subsetIter) { IndexList *setIter = set; int ind = subsetIter->index; FOR_LIST(setIter) { if(ind > setIter->index) return FALSE; else if((found = (ind == setIter->index))) break; } if(NOT found) return FALSE; } return TRUE; } #else boolean isSubsetOf(IndexList *subset, IndexList *set) { boolean result = TRUE; FOR_LIST(subset) { result &= isInIndexList(subset->index, set); if( NOT result) return result; } return result; } #endif IndexList *appendToIndexListIfNotThere(int elem, IndexList *list) { IndexList *iter = list; boolean contains = FALSE; FOR_LIST(iter) { contains |= elem == iter->index; if(contains) break; } if( NOT contains) return appendToIndexList(elem, list); else return list; } IndexList *appendToIndexListIfNotThere2(int elem, IndexList *list) { IndexList *iter = list; boolean contains = FALSE; FOR_LIST(iter) { contains |= elem == iter->index; if(contains) break; } if( NOT contains) { PR("FAIL: it was not there\n"); return appendToIndexList(elem, list); } else return list; } boolean haveIntersection2(IndexList *listA, IndexList *listB) { FOR_LIST(listA) { IndexList *iter = listB; FOR_LIST(iter) if(listA->index == iter->index) return TRUE; } return FALSE; } boolean haveIntersection(IndexList *listA, IndexList *listB) { FOR_LIST(listA) { IndexList *iter = listB; FOR_LIST(iter) if(listA->index == iter->index) return TRUE; } return FALSE; } void freeIndexList(IndexList *list) { IndexList *iter; for(iter = list; iter;) { list = iter->next; free(iter); iter = list; } } boolean indexListContainsIndexListUnordered(IndexList *list, IndexList *subList) { FOR_LIST(subList) { boolean contained = FALSE; IndexList *iter = list; FOR_LIST(iter) if(iter->index == subList->index) { contained = TRUE; break; } if( NOT contained) return FALSE; } return TRUE; } List *joinLists(List* a, List*b) { List *iter; if( NOT a) return b; for(iter = a; iter ; iter = iter->next) { if( NOT iter->next) { iter->next = b; return a; } } assert(0); return NULL; } int getMax_indexList(IndexList *list) { IndexList *iter = list; int result = 0; FOR_LIST(iter) if(result < iter->index) result = iter->index; return result; } int lengthIndexList(IndexList *list) { IndexList *iter = list; int result = 0; FOR_LIST(iter) result++; return result; } void printIndexList(IndexList *list) { PR(">"); FOR_LIST(list) PR("%d,", list->index); PR("<"); } void printIndexListToFile(FILE *file, IndexList *list) { boolean isFirst = TRUE; FOR_LIST(list) if(isFirst) { fprintf(file, "%d", list->index); isFirst = FALSE; } else fprintf(file, ",%d", list->index); } IndexList *findFirstCommonElem(IndexList *listA, IndexList *listB) { IndexList *iterA = listA, *iterB = listB; FOR_LIST(iterA) { iterB = listB; FOR_LIST(iterB) if(iterA->index == iterB->index) return iterA; } return NULL; } IndexList *concatenateIndexList(IndexList *listA, IndexList *listB) { IndexList *iterA = listA; if( NOT iterA) return listB; FOR_LIST(iterA) if( NOT iterA->next) { iterA->next = listB; return listA; } return NULL; } IndexList *doubleAppendToIndexList(int valueA, int valueB, IndexList *list) { IndexList *listElemB = CALLOC(1, sizeof(IndexList)), *listElemA = CALLOC(1, sizeof(IndexList)); listElemA->index = valueA; listElemB->index = valueB; listElemA->next = list; listElemB->next = listElemA; return listElemB; } IndexList* appendToIndexList(int value, IndexList *list) { IndexList *listElem = CALLOC(1, sizeof(IndexList)); listElem->index = value; listElem->next = list; return listElem; } boolean indexListEqual(IndexList *aList, IndexList *bList) { IndexList *iterA = aList; FOR_LIST(iterA) { boolean indexFound = FALSE; IndexList *iterB = bList; FOR_LIST(iterB) { indexFound = iterA->index == iterB->index; if(indexFound) break; } if( NOT indexFound) return FALSE; } return lengthIndexList(aList) == lengthIndexList(bList); } int lengthOfList(List *list) { int length = 0 ; List *iter = list; FOR_LIST(iter) length++; return length; } List* appendToList(void *value, List *list) { List *listElem = CALLOC(1, sizeof(List)); listElem->value = value; listElem->next = list; return listElem; } List *concatenateLists(List *listA, List *listB) { List *iter = listA ; if( NOT iter) return listB; FOR_LIST(iter) { if( NOT iter->next) { iter->next = listB; return listA; } } return NULL; } void freeList(List* list) { List *iter; for(iter = list; iter;) { list = iter->next; free(iter->value); free(iter); iter = list; } } void freeListFlat(List* list) { List *iter; for(iter = list; iter;) { list = iter->next; free(iter); iter = list; } } boolean elemIsInIndexList(int index, IndexList *list) { FOR_LIST(list) { if(index == list->index) return TRUE; } return FALSE; } IndexList *joinIndexListsNonRedundant(IndexList *listA, IndexList *listB) { IndexList *result = NULL; FOR_LIST(listA) APPEND_INT(listA->index, result); freeIndexList(listA); FOR_LIST(listB) if( NOT elemIsInIndexList(listB->index, result)) APPEND_INT(listB->index, result); freeIndexList(listB); return result; } RogueNaRok-1.0/List.h000066400000000000000000000067111165154717300144710ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef LIST_H #define LIST_H #include #include "common.h" typedef struct List_{ void *value; struct List_ *next; } List; typedef struct _indexList { struct _indexList *next; int index; } IndexList; void freeList(List* list); IndexList *doubleAppendToIndexList(int valueA, int valueB, IndexList *list); boolean isSubsetOf(IndexList *subset, IndexList *set); List* appendToList(void *value, List *list); IndexList *appendToIndexListIfNotThere(int elem, IndexList *list); IndexList *appendToIndexListIfNotThere2(int elem, IndexList *list); void freeIndexList(IndexList *list); IndexList* appendToIndexList(int value, IndexList *list) ; boolean indexListEqual(IndexList *aList, IndexList *bList); boolean isInIndexList(int index, IndexList *list); int isInIndexListSpecial(int a, int b, IndexList *list); void freeIndexList(IndexList *list); IndexList* appendToIndexList(int value, IndexList *list) ; void freeIndexList(IndexList *list); IndexList *concatenateIndexList(IndexList *listA, IndexList *listB) ; List *concatenateLists(List *listA, List *listB); void printIndexList(IndexList *list); IndexList *findFirstCommonElem(IndexList *listA, IndexList *listB); boolean elemIsInIndexList(int index, IndexList *list); List *joinLists(List* a, List*b) ; IndexList *joinIndexListsNonRedundant(IndexList *listA, IndexList *listB); void freeListFlat(List* list); boolean indexListContainsIndexListUnordered(IndexList *list, IndexList *subList); boolean haveIntersection(IndexList *listA, IndexList *listB); int length_indexList(IndexList *list); IndexList *setMinusOf(IndexList *list, IndexList *subtract); int lengthIndexList(IndexList *list); void printIndexListToFile(FILE *file, IndexList *list); int lengthOfList(List *list); boolean isSubsetOfReverseOrdered(IndexList *subset, IndexList *set); #define FOR_LIST(iter) for(;iter;iter = iter->next) #define APPEND(elem,list) list = appendToList(elem,list) #define APPEND_INT(elem,list) list = appendToIndexList(elem,list) #define CONCAT_LISTS(lista,listb) lista = concatenateLists(lista,listb) #endif RogueNaRok-1.0/Makefile000066400000000000000000000030551165154717300150430ustar00rootroot00000000000000CC = gcc CFLAGS = -Wall -D_GNU_SOURCE # -DNDEBUG LFLAGS = -lm ifeq ($(mode), debug) CFLAGS += -g else CFLAGS += -O3 # -march=native ifeq ($(mode), profile) CFLAGS += -pg -g endif endif ifeq ($(mode), parallel) CFLAGS += -DPARALLEL -DPORTABLE_PTHREADS LFLAGS += -pthread endif ifeq ($(mode), parallelDebug) CFLAGS += -DPARALLEL -g LFLAGS += -pthread endif RM = rm -fr TARGETS = RogueNaRok rnr-prune rnr-lsi rnr-tii rnr-mast all : $(TARGETS) rnr-objs = common.o RogueNaRok.o Tree.o BitVector.o HashTable.o List.o Array.o Dropset.o ProfileElem.o legacy.o newFunctions.o parallel.o Node.o lsi-objs = rnr-lsi.o common.o Tree.o BitVector.o HashTable.o legacy.o newFunctions.o List.o tii-objs = rnr-tii.o common.o BitVector.o Tree.o HashTable.o List.o legacy.o newFunctions.o mast-objs = rnr-mast.o common.o List.o Tree.o BitVector.o HashTable.o legacy.o newFunctions.o prune-objs = rnr-prune.o common.o Tree.o BitVector.o HashTable.o legacy.o newFunctions.o List.o rnr-lsi: $(lsi-objs) $(CC) $(LFLAGS) -o $@ $^ $(CFLAGS) rnr-tii: $(tii-objs) $(CC) $(LFLAGS) -o $@ $^ $(CFLAGS) rnr-mast: $(mast-objs) $(CC) $(LFLAGS) -o $@ $^ $(CFLAGS) rnr-prune: $(prune-objs) $(CC) $(LFLAGS) -o $@ $^ $(CFLAGS) ifeq ($(mode),parallel) RogueNaRok: $(rnr-objs) $(CC) $(LFLAGS) -o $@-parallel $^ $(CFLAGS) else RogueNaRok: $(rnr-objs) $(CC) $(LFLAGS) -o $@ $^ $(CFLAGS) endif %.o : %.c $(DEPS) $(CC) -c -o $@ $< $(CFLAGS) clean : $(RM) $(rnr-objs) $(lsi-objs) $(tii-objs) $(mast-objs) $(prune-objs) $(TARGETS) $(TESTS) $(rnr-test-objs) RogueNaRok-parallel RogueNaRok-1.0/Node.c000066400000000000000000000044271165154717300144400ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include "Node.h" IndexList *findAnIndependentComponent(HashTable *allNodes, Node *thisNode) { if(thisNode->visited) return NULL; IndexList *iter = thisNode->edges; thisNode->visited = TRUE; IndexList *result = NULL; APPEND_INT(thisNode->id, result); FOR_LIST(iter) { Node *found = searchHashTableWithInt(allNodes, iter->index); if( NOT found->visited) { IndexList *list = findAnIndependentComponent(allNodes, found); result = concatenateIndexList(list, result); } } return result; } void freeNode(void *value) { Node *node = value; freeIndexList(node->edges); free(node); } boolean nodeEqual(HashTable *hashTable, void *entryA, void *entryB) { return ((Node*)entryA)->id == ((Node*)entryB)->id; } unsigned int nodeHashValue(HashTable *hashTable, void *value) { return ((Node*)value)->id; } RogueNaRok-1.0/Node.h000066400000000000000000000035141165154717300144410ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef NODE_H #define NODE_H #include "common.h" #include "List.h" #include "HashTable.h" typedef struct _node { int id; boolean visited; IndexList *edges; } Node; boolean nodeEqual(HashTable *hashTable, void *entryA, void *entryB); unsigned int nodeHashValue(HashTable *hashTable, void *value); void freeNode(void *value); IndexList *findAnIndependentComponent(HashTable *allNodes, Node *thisNode); #endif RogueNaRok-1.0/ProfileElem.c000066400000000000000000000112351165154717300157510ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include "ProfileElem.h" CLONE_ARRAY_FLAT(cloneProfileArrayFlat, ProfileElem*, ProfileElemAttr*) void addElemToArray(ProfileElem *elem, Array *array) { GET_PROFILE_ELEM(array, array->length) = elem; array->length++; } void freeProfileElem(ProfileElem *elem) { free(elem->treeVector); free(elem->bitVector); free(elem); } Array* profileToArray(HashTable *profile, boolean updateFrequencyCount, boolean assignIds) { HashTableIterator* hashTableIterator = createHashTableIterator(profile); Array *result = CALLOC(1, sizeof(Array)); unsigned int count = 0; /* remember to always copy s.t. free() runs w/o problems */ ProfileElemAttr *profileElemAttr; result->commonAttributes = CALLOC(1, sizeof(ProfileElemAttr)); result->commonAttributes = memcpy(result->commonAttributes, profile->commonAttributes, sizeof(ProfileElemAttr)); profileElemAttr = result->commonAttributes; result->length = profile->entryCount; result->arrayTable = CALLOC(profile->entryCount, sizeof(ProfileElem*)); if( NOT hashTableIterator) return result; do { ProfileElem *profileElem = getCurrentValueFromHashTableIterator(hashTableIterator); assert(profileElem); if(updateFrequencyCount) profileElem->treeVectorSupport = genericBitCount(profileElem->treeVector, profileElemAttr->treeVectorLength); if(assignIds) profileElem->id = count; ((ProfileElem**)result->arrayTable)[count] = profileElem; assert(profileElem->bitVector && profileElem->treeVector); count++; } while(hashTableIteratorNext(hashTableIterator)); assert(count == profile->entryCount); free(hashTableIterator); return result; } int sortBipProfile(const void *a, const void *b) { if(*((ProfileElem**)a) == NULL) return 1; if(*((ProfileElem**)b) == NULL) return -1; int as = (*(ProfileElem**)a)->numberOfBitsSet, bs = (*(ProfileElem**)b)->numberOfBitsSet; if(as == bs) return 0; return as < bs ? -1 : 1; } int sortById(const void *a, const void *b) { int as = (*(ProfileElem**)a)->id, bs = (*(ProfileElem**)b)->id; if(as == bs) return 0; return as < bs ? -1 : 1; } int sortBySupport(const void *a, const void *b) { unsigned int as = (*(ProfileElem**)a)->treeVectorSupport, bs = (*(ProfileElem**)b)->treeVectorSupport; if(as == bs) return 0; return as > bs ? -1 : 1; } /* what is the index in the (ordered) profile of the first element to have at least i bits set? */ int *createNumBitIndex(Array *bipartitionProfile, int mxtips) { int *result = CALLOC(mxtips, sizeof(int)); memset(result, -1, mxtips * sizeof(int)); qsort(bipartitionProfile->arrayTable, bipartitionProfile->length, sizeof(ProfileElem**), sortBipProfile); int i, max = 0, current = 0; FOR_0_LIMIT(i,bipartitionProfile->length) { ProfileElem *elem = GET_PROFILE_ELEM(bipartitionProfile, i); if(NOT elem ) break; if(elem->numberOfBitsSet != current) { current = elem->numberOfBitsSet; result[current] = i; max = i; } } current = max; for(i = mxtips-1; i >= 0; --i) { if(result[i] == -1) result[i] = current; else current = result[i]; } return result; } RogueNaRok-1.0/ProfileElem.h000066400000000000000000000047511165154717300157630ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef PROFILEELEM_H #define PROFILEELEM_H #include #include #include "Array.h" #include "HashTable.h" #include "common.h" #include "BitVector.h" typedef struct { BitVector bitVectorLength; BitVector treeVectorLength; BitVector *randForTaxa; /* random numbers to hash the vectors */ BitVector lastByte; /* the padding bits */ } ProfileElemAttr; typedef struct profile_elem { BitVector *bitVector; BitVector *treeVector; int treeVectorSupport; boolean isInMLTree; BitVector id; int numberOfBitsSet; } ProfileElem; #define GET_PROFILE_ELEM(array,index) (((ProfileElem**)array->arrayTable)[(index)]) #define GET_DROPSET_ELEM(array,index) (((Dropset**)array->arrayTable)[(index)]) int *createNumBitIndex(Array *bipartitionProfile, int mxtips); int sortById(const void *a, const void *b); int sortBySupport(const void *a, const void *b); int sortBipProfile(const void *a, const void *b); Array *cloneProfileArrayFlat(const Array *array); void addElemToArray(ProfileElem *elem, Array *array); void freeProfileElem(ProfileElem *elem); #endif RogueNaRok-1.0/README000066400000000000000000000010621165154717300142570ustar00rootroot00000000000000RogueNaRok is an algorithm for the identification of rogue taxa in a tree set. How do I install it? Download the code and run the "make" command. For a parallel version of the RogueNaRok algorithm, use "make mode=parallel". Note that, the parallel version requires the pthreads-library. Running one of the programs without arguments will trigger the help message. Just follow the instructions. More info is available on https://github.com/aberer/RogueNaRok/wiki/RogueNaRok . Also, use this site for reporting bugs, ask questions of how to employ RogueNaRok. RogueNaRok-1.0/RogueNaRok.c000066400000000000000000001653521165154717300155740ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include #include #include #include #include #include "Tree.h" #include "sharedVariables.h" #include "Dropset.h" #include "legacy.h" #include "newFunctions.h" #include "Node.h" #ifdef PARALLEL #include "parallel.h" #include #endif #define PROG_NAME "RogueNaRok" #define PROG_VERSION "1.0" #define PROG_RELEASE_DATE "2011-10-25" /* #define PRINT_VERY_VERBOSE */ /* #define MYDEBUG */ #define PRINT_DROPSETS #define PRINT_TIME /* try to produce minimal dropsets */ /* #define MIN_DROPSETS */ #define VANILLA_CONSENSUS_OPT 0 #define ML_TREE_OPT 1 #define MRE_CONSENSUS_OPT 2 #define HASH_TABLE_SIZE_CONST 100 extern unsigned int *randForTaxa; int bitVectorLength, treeVectorLength, maxDropsetSize = 1, rogueMode = 0, dropRound = 0, taxaDropped = 0, thresh, numberOfTrees, bestLastTime, *cumScores, cumScore = 0, bestCumEver = 0, numBips, mxtips; Dropset **dropsetPerRound; boolean computeSupport = TRUE; BitVector *droppedTaxa, *neglectThose, *paddingBits; double labelPenalty = 0., timeInc; #ifdef MYDEBUG void debug_dropsetConsistencyCheck(HashTable *mergingHash) { HashTableIterator *htIter; FOR_HASH(htIter, mergingHash) { Dropset *ds = getCurrentValueFromHashTableIterator(htIter); if( NOT ds) break; HashTableIterator *htIter2; boolean hasNext2 = TRUE; for(htIter2 = createHashTableIterator(mergingHash); htIter2 && hasNext2 ; hasNext2 = hashTableIteratorNext(htIter2)) { Dropset *ds2 = getCurrentValueFromHashTableIterator(htIter2); if(ds == ds2) continue; if(indexListEqual(ds->taxaToDrop, ds2->taxaToDrop)) { PR("duplicate dropset: "); printIndexList(ds->taxaToDrop); PR(" and "); printIndexList(ds2->taxaToDrop); PR("\n"); exit(-1); } } free(htIter2); } free(htIter); } #endif boolean isCompatible(ProfileElem* elemA, ProfileElem* elemB, BitVector *droppedTaxa) { unsigned int i; unsigned int *A = elemA->bitVector, *C = elemB->bitVector; FOR_0_LIMIT(i,bitVectorLength) if(A[i] & C[i] & ~ (droppedTaxa[i] | paddingBits[i]) ) break; if(i == bitVectorLength) return TRUE; FOR_0_LIMIT(i,bitVectorLength) if( ( A[i] & ~C[i] ) & ~ (droppedTaxa[i] | paddingBits[i]) ) break; if(i == bitVectorLength) return TRUE; FOR_0_LIMIT(i,bitVectorLength) if( ( ~A[i] & C[i] ) & ~ (droppedTaxa[i] | paddingBits[i]) ) break; if(i == bitVectorLength) return TRUE; else return FALSE; } #ifdef MYDEBUG /* ensures, no merging event occurs twice per dropset */ void debug_mergingHashSanityCheck(HashTable *mergingHash, int totalNumberOfBips) { HashTableIterator *htIter; FOR_HASH(htIter, mergingHash) { Dropset *ds = getCurrentValueFromHashTableIterator(htIter); if(NOT ds) break; BitVector *bv = CALLOC(totalNumberOfBips, sizeof(BitVector)); List *meIter = ds->primeEvents; FOR_LIST(meIter) { MergingEvent *me = meIter->value; if(me->isComplex) { IndexList *il = me->mergingBipartitions.many; FOR_LIST(il) { assert(NOT NTH_BIT_IS_SET(bv, il->index)); FLIP_NTH_BIT(bv, il->index); } } else { assert(NOT NTH_BIT_IS_SET(bv, me->mergingBipartitions.pair[0])); assert(NOT NTH_BIT_IS_SET(bv, me->mergingBipartitions.pair[1])); FLIP_NTH_BIT(bv, me->mergingBipartitions.pair[0]); FLIP_NTH_BIT(bv, me->mergingBipartitions.pair[1]); } } free(bv); } free(htIter); } #endif boolean myBitVectorEqual(ProfileElem *elemA, ProfileElem *elemB) { boolean normalEqual = TRUE, complement = TRUE; int i ; FOR_0_LIMIT(i,bitVectorLength) { normalEqual = normalEqual && ( elemA->bitVector[i] == elemB->bitVector[i]); complement = complement && (elemA->bitVector[i] == ~(elemB->bitVector[i] | droppedTaxa[i] | paddingBits[i])); } return normalEqual || complement; } boolean canMergeWithComplement(ProfileElem *elem) { return mxtips - taxaDropped - 2 * elem->numberOfBitsSet <= 2 * maxDropsetSize; } boolean bitVectorEqual(ProfileElem *elemA, ProfileElem *elemB) { boolean normalEqual = TRUE, complement = TRUE; int i ; FOR_0_LIMIT(i,bitVectorLength) { normalEqual = normalEqual && ( elemA->bitVector[i] == elemB->bitVector[i]); complement = complement && (elemA->bitVector[i] == ~(elemB->bitVector[i] | droppedTaxa[i] | paddingBits[i])); } return normalEqual || complement; } boolean mergedBipVanishes(MergingEvent *me, Array *bipartitionsById, IndexList *taxaToDrop) { int vanBits = 0; IndexList *iter = taxaToDrop; ProfileElem *elem = me->isComplex ? GET_PROFILE_ELEM(bipartitionsById, (me->mergingBipartitions).many->index) : GET_PROFILE_ELEM(bipartitionsById, (me->mergingBipartitions).pair[0]); FOR_LIST(iter) if(NTH_BIT_IS_SET(elem->bitVector,iter->index)) vanBits++; return elem->numberOfBitsSet - vanBits < 2; } /* insert dropset. If it is a multi-taxa dropset, gather all merging events of sub-dropsets */ Dropset *insertOrFindDropset(HashTable *hashtable, Dropset *dropset, unsigned int hashValue) { void *result = searchHashTable(hashtable, dropset, hashValue); if(result) { freeDropsetDeep(dropset, TRUE); return result; } else { insertIntoHashTable(hashtable, dropset, hashValue); return dropset; } } boolean checkForMergerAndAddEvent(boolean complement, ProfileElem *elemA, ProfileElem *elemB, HashTable *mergingHash) { IndexList *dropsetTaxa = getDropset(elemA,elemB,complement, neglectThose); if(dropsetTaxa) { Dropset *dropset, *tmp = CALLOC(1,sizeof(Dropset)); tmp->taxaToDrop = dropsetTaxa; unsigned int hashValue = 0; IndexList *iter = dropsetTaxa; FOR_LIST(iter) { assert(iter->index < mxtips); hashValue ^= randForTaxa[ iter->index ]; } #ifdef PARALLEL int position = hashValue % mergingHash->tableSize; pthread_mutex_lock(mergingHash->lockPerSlot[position]); #endif dropset = insertOrFindDropset(mergingHash, tmp, hashValue); addEventToDropsetPrime(dropset, elemA->id, elemB->id); #ifdef PARALLEL pthread_mutex_unlock(mergingHash->lockPerSlot[position]); #endif return TRUE; } else return FALSE; } /* can i trust you tiny function? (practical relevant -> 0) */ /* boolean bothDropsetsRelevant(ProfileElem *elemA) */ boolean bothDropsetsRelevant(int numBits) { return numBits <= maxDropsetSize && numBits >= mxtips - taxaDropped - maxDropsetSize; } int cleanup_applyOneMergerEvent(MergingEvent *mergingEvent, Array *bipartitionsById, BitVector *mergingBipartitions) { int j; ProfileElem *resultBip, *elem; resultBip = mergingEvent->isComplex ? GET_PROFILE_ELEM(bipartitionsById, mergingEvent->mergingBipartitions.many->index) : GET_PROFILE_ELEM(bipartitionsById, mergingEvent->mergingBipartitions.pair[0]); if(mergingEvent->isComplex) { IndexList *iterBip = mergingEvent->mergingBipartitions.many->next; FOR_LIST(iterBip) { elem = GET_PROFILE_ELEM(bipartitionsById, iterBip->index); FLIP_NTH_BIT(mergingBipartitions, elem->id); resultBip->isInMLTree |= elem->isInMLTree; FOR_0_LIMIT(j,treeVectorLength) resultBip->treeVector[j] |= elem->treeVector[j]; } freeIndexList(mergingEvent->mergingBipartitions.many); free(mergingEvent); } else { elem = GET_PROFILE_ELEM(bipartitionsById,mergingEvent->mergingBipartitions.pair[1]); FLIP_NTH_BIT(mergingBipartitions, elem->id); resultBip->isInMLTree |= elem->isInMLTree; FOR_0_LIMIT(j,treeVectorLength) resultBip->treeVector[j] |= elem->treeVector[j]; } resultBip->treeVectorSupport = genericBitCount(resultBip->treeVector, treeVectorLength); return resultBip->id; } int getSupportOfMRETreeHelper(Array *bipartitionProfile, Dropset *dropset) { int result = 0, i,j; BitVector *taxaDroppedHere = copyBitVector(droppedTaxa, bitVectorLength); if(dropset) { IndexList *iter = dropset->taxaToDrop; FOR_LIST(iter) FLIP_NTH_BIT(taxaDroppedHere, iter->index); } qsort(bipartitionProfile->arrayTable, bipartitionProfile->length, sizeof(ProfileElem**), sortBySupport); Array *mreBips = CALLOC(1,sizeof(Array)); mreBips->arrayTable = CALLOC((mxtips-3), sizeof(ProfileElem*)); mreBips->length = 0; #ifdef MYDEBUG for(i = 1; i < bipartitionProfile->length; i ++) assert(GET_PROFILE_ELEM(bipartitionProfile,i-1)->treeVectorSupport >= GET_PROFILE_ELEM(bipartitionProfile,i)->treeVectorSupport); #endif FOR_0_LIMIT(i, bipartitionProfile->length) if(GET_PROFILE_ELEM(bipartitionProfile,i)->treeVectorSupport > thresh) addElemToArray(GET_PROFILE_ELEM(bipartitionProfile,i), mreBips); else break; for(; i < bipartitionProfile->length && mreBips->length < mxtips-3; ++i) { ProfileElem *elemA = GET_PROFILE_ELEM(bipartitionProfile, i); boolean compatibleP = TRUE; FOR_0_LIMIT(j,mreBips->length) { ProfileElem *elemB = GET_PROFILE_ELEM(mreBips,j); compatibleP &= isCompatible(elemA, elemB, taxaDroppedHere); if( NOT compatibleP) break; } if(compatibleP) addElemToArray(GET_PROFILE_ELEM(bipartitionProfile,i), mreBips); } if(computeSupport) FOR_0_LIMIT(i,mreBips->length) result += GET_PROFILE_ELEM(mreBips,i)->treeVectorSupport; else result = mreBips->length; free(mreBips->arrayTable); free(mreBips); free(bipartitionProfile->arrayTable); free(bipartitionProfile); return result; } void getSupportGainedThreshold(MergingEvent *me, Array *bipartitionsById) { int i; me->supportGained = 0; BitVector *tmp; boolean isInMLTree = FALSE; if(me->isComplex) { IndexList *iI = me->mergingBipartitions.many; int bestPossible = 0; FOR_LIST(iI) { ProfileElem *elem = GET_PROFILE_ELEM(bipartitionsById, iI->index); bestPossible += elem->treeVectorSupport; isInMLTree |= elem->isInMLTree; } if( rogueMode == VANILLA_CONSENSUS_OPT && bestPossible < thresh) return ; if( rogueMode == ML_TREE_OPT && NOT isInMLTree) return ; tmp = CALLOC(treeVectorLength, sizeof(BitVector)); /* create new bip vector */ iI = me->mergingBipartitions.many; FOR_LIST(iI) { ProfileElem *elem = GET_PROFILE_ELEM(bipartitionsById, iI->index); FOR_0_LIMIT(i, treeVectorLength) tmp[i] |= elem->treeVector[i]; } } else { ProfileElem *elemA = GET_PROFILE_ELEM(bipartitionsById, me->mergingBipartitions.pair[0]), *elemB = GET_PROFILE_ELEM(bipartitionsById, me->mergingBipartitions.pair[1]); if(rogueMode == VANILLA_CONSENSUS_OPT && elemA->treeVectorSupport + elemB->treeVectorSupport < thresh) return; isInMLTree = elemA->isInMLTree || elemB->isInMLTree; if(rogueMode == ML_TREE_OPT && NOT isInMLTree) return; tmp = CALLOC(treeVectorLength, sizeof(BitVector)); FOR_0_LIMIT(i,treeVectorLength) tmp[i] = elemA->treeVector[i] | elemB->treeVector[i]; } int newSup = genericBitCount(tmp, treeVectorLength); switch (rogueMode) { case MRE_CONSENSUS_OPT: { me->supportGained = computeSupport ? newSup : 1; break; } case VANILLA_CONSENSUS_OPT: { if(rogueMode == VANILLA_CONSENSUS_OPT && newSup > thresh) me->supportGained = computeSupport ? newSup : 1 ; break; } case ML_TREE_OPT: { if(isInMLTree ) me->supportGained = computeSupport ? newSup : 1 ; break; } default : assert(0); } free(tmp); } int getSupportOfMRETree(Array *bipartitionsById, Dropset *dropset) { List *mergingEvents = NULL; if(dropset) { if(maxDropsetSize == 1) mergingEvents = dropset->ownPrimeE; else { List *iter = dropset->acquiredPrimeE ; FOR_LIST(iter) APPEND(iter->value, mergingEvents); iter = dropset->complexEvents; FOR_LIST(iter) APPEND(iter->value, mergingEvents); } } int i; /* initial case */ if( NOT dropset) { Array *array = cloneProfileArrayFlat(bipartitionsById); int tmp = getSupportOfMRETreeHelper( array, dropset); return tmp; } Array *emergedBips = CALLOC(1,sizeof(Array)), *finalArray = CALLOC(1,sizeof(Array)), *tmpArray = cloneProfileArrayFlat(bipartitionsById); emergedBips->arrayTable = CALLOC(lengthOfList(mergingEvents), sizeof(ProfileElem*)); emergedBips->length = 0; finalArray->arrayTable = CALLOC(tmpArray->length, sizeof(ProfileElem*)); finalArray->length = 0; /* kill merging bips from array */ List *meIter = mergingEvents ; FOR_LIST(meIter) { MergingEvent *me = meIter->value; if(me->isComplex) { IndexList *iter = me->mergingBipartitions.many; FOR_LIST(iter) GET_PROFILE_ELEM(tmpArray, iter->index) = NULL; /* create emerged bips in other array */ ProfileElem *elem = CALLOC(1,sizeof(ProfileElem)); getSupportGainedThreshold(me,bipartitionsById); elem->treeVectorSupport = me->supportGained; elem->bitVector = GET_PROFILE_ELEM(bipartitionsById, me->mergingBipartitions.many->index)->bitVector; GET_PROFILE_ELEM(emergedBips, emergedBips->length) = elem; emergedBips->length++; } else { int a = me->mergingBipartitions.pair[0], b = me->mergingBipartitions.pair[1]; GET_PROFILE_ELEM(tmpArray, a) = NULL; GET_PROFILE_ELEM(tmpArray, b) = NULL; /* create emerged bips in other array */ ProfileElem *elem = CALLOC(1,sizeof(ProfileElem)); getSupportGainedThreshold(me,bipartitionsById); elem->treeVectorSupport = me->supportGained; elem->bitVector = GET_PROFILE_ELEM(bipartitionsById, a)->bitVector; GET_PROFILE_ELEM(emergedBips, emergedBips->length) = elem; emergedBips->length++; } } /* kill vanishing bips from array */ FOR_0_LIMIT(i, tmpArray->length) { if( GET_PROFILE_ELEM(tmpArray,i) ) { ProfileElem *elem = GET_PROFILE_ELEM(tmpArray,i); int remainingBits = elem->numberOfBitsSet; IndexList *iter = dropset->taxaToDrop; FOR_LIST(iter) if(NTH_BIT_IS_SET(elem->bitVector, iter->index)) remainingBits--; if(remainingBits > 1) addElemToArray(elem, finalArray); } } FOR_0_LIMIT(i, emergedBips->length) addElemToArray(GET_PROFILE_ELEM(emergedBips, i), finalArray); free(tmpArray->arrayTable); free(tmpArray); int result = getSupportOfMRETreeHelper(finalArray, dropset); if(maxDropsetSize > 1 ) freeListFlat(mergingEvents); FOR_0_LIMIT(i,emergedBips->length) free(GET_PROFILE_ELEM(emergedBips,i)); free(emergedBips->arrayTable); free(emergedBips); return result; } boolean bipartitionVanishesP(ProfileElem *elem, Dropset *dropset) { IndexList *iter = dropset->taxaToDrop; int result = elem->numberOfBitsSet; FOR_LIST(iter) if(NTH_BIT_IS_SET(elem->bitVector, iter->index)) result--; return result < 2; } void removeMergedBipartitions(Array *bipartitionsById, Array *bipartitionProfile, BitVector *mergingBipartitions) { int i; FOR_0_LIMIT(i,bipartitionProfile->length) { ProfileElem *elem = GET_PROFILE_ELEM(bipartitionProfile,i); if( NOT elem ) continue; if(NTH_BIT_IS_SET(mergingBipartitions,elem->id)) { GET_PROFILE_ELEM(bipartitionProfile, i) = NULL; GET_PROFILE_ELEM(bipartitionsById, elem->id) = NULL; freeProfileElem(elem); #ifdef PRINT_VERY_VERBOSE PR("CLEAN UP: removing %d from bip profile because of merger\n", elem->id); #endif } } } boolean eventMustBeRecomputed(MergingEvent *meIter, BitVector *mergingBipartitions, BitVector *newCandidates) { boolean mustBeRecomputed = FALSE; IndexList *mergingBipsIter = meIter->mergingBipartitions.many; FOR_LIST(mergingBipsIter) mustBeRecomputed |= NTH_BIT_IS_SET(mergingBipartitions,mergingBipsIter->index) | NTH_BIT_IS_SET(newCandidates, mergingBipsIter->index); return mustBeRecomputed; } boolean checkValidityOfEvent(BitVector *obsoleteBips, List *elem) { MergingEvent *me = elem->value; boolean killP = FALSE; if(me->isComplex) { IndexList *iter = me->mergingBipartitions.many; FOR_LIST(iter) killP |= NTH_BIT_IS_SET(obsoleteBips, iter->index); if(killP) freeIndexList(me->mergingBipartitions.many); } else killP = NTH_BIT_IS_SET(obsoleteBips, me->mergingBipartitions.pair[0]) || NTH_BIT_IS_SET(obsoleteBips, me->mergingBipartitions.pair[1]) ; if(killP) { free(me); return FALSE; } else return TRUE; } #ifdef LATER void printDropset(Dropset *dropset) { IndexList *iter; iter = dropset->taxaToDrop; boolean isFirst = TRUE; FOR_LIST(iter) { PR(isFirst ? "%d" : ",%d", iter->index); isFirst = FALSE; } PR(" [%d] : ", dropset->improvement); List *lIter = dropset->primeEvents; FOR_LIST(lIter) { MergingEvent *me = lIter->value; PR("{ %d,%d },", me->mergingBipartitions.pair[0], me->mergingBipartitions.pair[1]); } PR("\n"); if(dropset->combinedEvents) { PR("\t`->"); lIter = dropset->combinedEvents; FOR_LIST(lIter) { if(((MergingEvent*)lIter->value)->isComplex) { isFirst = TRUE; PR("{ "); iter = ((MergingEvent*)lIter->value)->mergingBipartitions.many; FOR_LIST(iter) { PR(isFirst ? "%d" : ",%d" ,iter->index); isFirst = FALSE; } PR(" },"); } else { MergingBipartitions mb = ((MergingEvent*)lIter->value)->mergingBipartitions ; PR("[ %d,%d ],", mb.pair[0], mb.pair[1]); } } PR("\n"); } } #endif #ifdef LATER void printMergingHash(HashTable *mergingHash) { if(mergingHash->entryCount < 1) { PR("Nothing in mergingHash.\n"); return; } HashTableIterator *htIter; FOR_HASH(htIter, mergingHash) { Dropset *dropset = getCurrentValueFromHashTableIterator(htIter); printDropset(dropset); } } #endif #ifdef MYDEBUG void debug_assureCleanStructure(HashTable *hashtable, BitVector *mergingBipartitions) { HashTableIterator *htIter; FOR_HASH(htIter, hashtable) { Dropset *ds = (Dropset*)getCurrentValueFromHashTableIterator(htIter); if( NOT ds) break; List *meIter = ds->primeEvents; FOR_LIST(meIter) { MergingEvent *me = meIter->value; if(me->isComplex) { IndexList *iter = me->mergingBipartitions.many; FOR_LIST(iter) if(NTH_BIT_IS_SET(mergingBipartitions, iter->index)) { PR("%d from merging bipartitions still present. \n", iter->index); printMergingHash(hashtable); exit(-1); } } else { if(NTH_BIT_IS_SET(mergingBipartitions, me->mergingBipartitions.pair[0]) ) { PR("%d from merging bipartitions still present. \n", me->mergingBipartitions.pair[0]); printMergingHash(hashtable); exit(-1); } if(NTH_BIT_IS_SET(mergingBipartitions, me->mergingBipartitions.pair[1])) { PR("%d from merging bipartitions still present. \n", me->mergingBipartitions.pair[1]); printMergingHash(hashtable); exit(-1); } } } } free(htIter); } #endif void cleanup_mergingEvents(HashTable *mergingHash, BitVector *mergingBipartitions, BitVector *candidateBips, int length) { HashTableIterator *htIter; int i; FOR_0_LIMIT(i,GET_BITVECTOR_LENGTH(length)) mergingBipartitions[i] |= candidateBips[i]; if( NOT mergingHash->entryCount) return; FOR_HASH(htIter, mergingHash) { Dropset *dropset = getCurrentValueFromHashTableIterator(htIter); /* always remove combined events */ List *iter = dropset->complexEvents; FOR_LIST(iter) { MergingEvent *me = iter->value; assert(me); if(me && me->isComplex) { freeIndexList(me->mergingBipartitions.many); free(me); } } freeListFlat(dropset->complexEvents); /* always remove acquired elems */ freeListFlat(dropset->acquiredPrimeE); } free(htIter); /* reduce own elements */ FOR_HASH_2(htIter, mergingHash) { Dropset *dropset = getCurrentValueFromHashTableIterator(htIter); assert(dropset); /* prime events */ List *iter = dropset->ownPrimeE, *start = NULL; while(iter) { List *next = iter->next; if( checkValidityOfEvent(mergingBipartitions, iter) ) APPEND(iter->value, start); free(iter); iter = next; } dropset->ownPrimeE = start; } free(htIter); #ifdef MYDEBUG debug_assureCleanStructure(mergingHash, mergingBipartitions); #endif free(mergingBipartitions); } /* • inverses the bit vector, if more than half of the remaining taxa bits is set. This is better anyway, is the bit vector do not get that physically heavy (assuming that a 1 weighs more than a 0) • assuming, we already know the numbers of bits set • assuming, bits are unflipped, if the taxon was dropped */ void unifyBipartitionRepresentation(Array *bipartitionArray, BitVector *droppedTaxa) { int i,j, bvLen = GET_BITVECTOR_LENGTH(mxtips), remainingTaxa = mxtips - genericBitCount(droppedTaxa, bvLen); #ifdef PRINT_VERY_VERBOSE PR("remaining taxa: %d\n", remainingTaxa); PR("inverting bit vectors: "); #endif FOR_0_LIMIT(i,bipartitionArray->length) { ProfileElem *elem = GET_PROFILE_ELEM(bipartitionArray,i); if( elem && elem->numberOfBitsSet > remainingTaxa / 2) { #ifdef PRINT_VERY_VERBOSE PR("%d (%d bits set), ", elem->id, elem->numberOfBitsSet); #endif FOR_0_LIMIT(j,bvLen) elem->bitVector[j] = ~(elem->bitVector[j] | paddingBits[j] | droppedTaxa[j]); elem->numberOfBitsSet = remainingTaxa - elem->numberOfBitsSet; } } #ifdef PRINT_VERY_VERBOSE PR("\n"); #endif } void printBipartitionProfile(Array *bipartitionProfile) { int i; FOR_0_LIMIT(i,bipartitionProfile->length) { ProfileElem *elem = GET_PROFILE_ELEM(bipartitionProfile, i); if(elem) { PR("%d (%d):\t\t", elem->id, elem->numberOfBitsSet); printBitVector(elem->bitVector, GET_BITVECTOR_LENGTH(mxtips)); } else break; PR("\n"); } } int getNumberOfBipsPresent(Array *bipartitionArray) { int result = 0; int i; FOR_0_LIMIT(i, bipartitionArray->length) { if(GET_PROFILE_ELEM(bipartitionArray,i)) result++; } return result; } int getInitScore(Array *bipartitionProfile) { int score = 0 , i; if(rogueMode == MRE_CONSENSUS_OPT) return getSupportOfMRETree(bipartitionProfile, NULL); FOR_0_LIMIT(i,bipartitionProfile->length) { ProfileElem *elem = GET_PROFILE_ELEM(bipartitionProfile,i); switch(rogueMode) { case VANILLA_CONSENSUS_OPT: if(elem->treeVectorSupport > thresh) score += computeSupport ? elem->treeVectorSupport : 1 ; break; case ML_TREE_OPT: if(elem->isInMLTree) score += computeSupport ? elem->treeVectorSupport : 1; break; case MRE_CONSENSUS_OPT: default: assert(0); } } return score; } void printDropsetImprovement(Dropset *dropset, All *tr, int cumScore) { #ifndef PRINT_DROPSETS return ; #endif IndexList *iter = dropset->taxaToDrop; boolean isFirst = TRUE; FOR_LIST(iter) { PR( isFirst ? ">%d" : ",%d", iter->index); isFirst = FALSE; } isFirst = TRUE; PR("\t"); iter = dropset->taxaToDrop; FOR_LIST(iter) { PR(isFirst ? "%s" : ",%s" , tr->nameList[iter->index+1]); isFirst = FALSE; } PR("\t"); PR("%f\t%f\n", (double)dropset->improvement /(computeSupport ? (double)numberOfTrees : 1) , (double)cumScore / (double)( (computeSupport ? numberOfTrees : 1) * (mxtips-3))); } void fprintRogueNames(All *tr, FILE *file, IndexList *list) { boolean isFirst = TRUE; FOR_LIST(list) { if(isFirst) { fprintf(file, "%s", tr->nameList[list->index+1]); isFirst = FALSE; } else fprintf(file, ",%s", tr->nameList[list->index+1]); } } void printRogueInformationToFile( All *tr, FILE *rogueOutput, int bestCumEver, int *cumScores, Dropset **dropsetInRound) { int i=1,j; boolean reached = bestCumEver == cumScores[0]; while ( NOT reached) { fprintf(rogueOutput, "%d\t", i); printIndexListToFile(rogueOutput, dropsetInRound[i]->taxaToDrop); fprintf(rogueOutput, "\t"); fprintRogueNames(tr, rogueOutput, dropsetInRound[i]->taxaToDrop); fprintf(rogueOutput, "\t%f\t%f\n", (double)(cumScores[i] - cumScores[i-1] )/ (double)tr->numberOfTrees,(double)cumScores[i] / (double)((computeSupport ? numberOfTrees : 1 ) * (mxtips-3)) ); reached = bestCumEver == cumScores[i]; ++i; } FOR_0_LIMIT(j,mxtips) if(NOT NTH_BIT_IS_SET(neglectThose,j)) { fprintf(rogueOutput, "%d\t%d\t%s\t%s\t%s\n", i, j, tr->nameList[j+1], "NA", "NA"); i++; } } void findCandidatesForBip(HashTable *mergingHash, ProfileElem *elemA, boolean firstMerge, Array *bipartitionsById, Array *bipartitionProfile, int* indexByNumberBits) { ProfileElem *elemB; int indexInBitSortedArray; boolean compMerge = canMergeWithComplement(elemA); if(firstMerge) { if(NOT compMerge && maxDropsetSize == 1) indexInBitSortedArray = indexByNumberBits[elemA->numberOfBitsSet +1]; else indexInBitSortedArray = indexByNumberBits[elemA->numberOfBitsSet]; } else indexInBitSortedArray = elemA->numberOfBitsSet - maxDropsetSize < 0 ? indexByNumberBits[0] : indexByNumberBits[elemA->numberOfBitsSet-maxDropsetSize]; for( ; indexInBitSortedArray < bipartitionProfile->length && (elemB = GET_PROFILE_ELEM(bipartitionProfile,indexInBitSortedArray)) && elemB->numberOfBitsSet - elemA->numberOfBitsSet <= maxDropsetSize ; indexInBitSortedArray++) { if( maxDropsetSize == 1 && NOT compMerge && elemA->numberOfBitsSet == elemB->numberOfBitsSet) continue; boolean foundOne = FALSE; if(compMerge) foundOne = checkForMergerAndAddEvent(TRUE,elemA, elemB, mergingHash); if(NOT foundOne || bothDropsetsRelevant(elemA->numberOfBitsSet)) checkForMergerAndAddEvent(FALSE, elemA, elemB, mergingHash); } } /* HashTable * */ void createOrUpdateMergingHash(All *tr, HashTable *mergingHash, Array *bipartitionProfile, Array *bipartitionsById, BitVector *candidateBips, boolean firstMerge, int *indexByNumberBits) { int i; FOR_0_LIMIT(i,bipartitionProfile->length) if(NTH_BIT_IS_SET(candidateBips, i)) findCandidatesForBip(mergingHash, GET_PROFILE_ELEM(bipartitionsById, i), firstMerge, bipartitionsById, bipartitionProfile, indexByNumberBits); free(candidateBips); } void combineEventsForOneDropset(Array *allDropsets, Dropset *refDropset, Array *bipartitionsById) { List *allEventsUncombined = NULL; refDropset->acquiredPrimeE = NULL; refDropset->complexEvents = NULL; int eventCntr = 0; if(NOT refDropset->taxaToDrop->next) { List *iter = refDropset->ownPrimeE; FOR_LIST(iter) APPEND(iter->value, refDropset->acquiredPrimeE); return; } /* gather all events */ int i; FOR_0_LIMIT(i,allDropsets->length) { Dropset *currentDropset = GET_DROPSET_ELEM(allDropsets, i); if( isSubsetOf(currentDropset->taxaToDrop, refDropset->taxaToDrop) ) { List *iter = currentDropset->ownPrimeE; FOR_LIST(iter) { APPEND(iter->value, allEventsUncombined); eventCntr++; } } } /* transform the edges into nodes */ HashTable *allNodes = createHashTable(eventCntr * 10, NULL, nodeHashValue, nodeEqual); Node *found; List *iter = allEventsUncombined; FOR_LIST(iter) { MergingEvent *me = (MergingEvent*)iter->value; int a = me->mergingBipartitions.pair[0]; int b = me->mergingBipartitions.pair[1]; if( ( found = searchHashTableWithInt(allNodes, a) ) ) APPEND_INT(b,found->edges); else { Node *node = CALLOC(1,sizeof(Node)); node->id = a; APPEND_INT(b,node->edges); insertIntoHashTable(allNodes, node, a); } if( ( found = searchHashTableWithInt(allNodes, b) ) ) APPEND_INT(a,found->edges); else { Node *node = CALLOC(1,sizeof(Node)); node->id = b; APPEND_INT(a,node->edges); insertIntoHashTable(allNodes, node, b); } } iter = allEventsUncombined; FOR_LIST(iter) { MergingEvent *me = (MergingEvent*)iter->value; int a = me->mergingBipartitions.pair[0], b = me->mergingBipartitions.pair[1]; Node *foundA = searchHashTableWithInt(allNodes,a), *foundB = searchHashTableWithInt(allNodes,b); if(NOT foundA->edges->next && NOT foundB->edges->next) { assert(foundA->edges->index == foundB->id ); assert(foundB->edges->index == foundA->id ); APPEND(me, refDropset->acquiredPrimeE); } else { IndexList *component = findAnIndependentComponent(allNodes,foundA); if( component) { MergingEvent *complexMe = CALLOC(1,sizeof(MergingEvent)); complexMe->mergingBipartitions.many = component; complexMe->isComplex = TRUE; APPEND(complexMe,refDropset->complexEvents); } } } destroyHashTable(allNodes, freeNode); freeListFlat(allEventsUncombined); } HashTable *combineMergerEvents(HashTable *mergingHash, Array *bipartitionsById) { /* hash to array */ Array *allDropsets = CALLOC(1,sizeof(Array)); allDropsets->arrayTable = CALLOC(mergingHash->entryCount, sizeof(Dropset**)); HashTableIterator *htIter; int cnt = 0; FOR_HASH(htIter, mergingHash) { GET_DROPSET_ELEM(allDropsets, cnt) = getCurrentValueFromHashTableIterator(htIter); cnt++; } free(htIter); assert(cnt == mergingHash->entryCount); allDropsets->length = cnt; #ifdef PARALLEL globalPArgs->allDropsets = allDropsets; globalPArgs->bipartitionsById =bipartitionsById; numberOfJobs = allDropsets->length; masterBarrier(THREAD_COMBINE_EVENTS, globalPArgs); #else int i; FOR_0_LIMIT(i,allDropsets->length) combineEventsForOneDropset(allDropsets, GET_DROPSET_ELEM(allDropsets,i), bipartitionsById); #endif free(allDropsets->arrayTable); free(allDropsets); return mergingHash; } void getLostSupportThreshold(MergingEvent *me, Array *bipartitionsById) { ProfileElem *elemA, *elemB ; me->supportLost = 0; if(me->isComplex) { IndexList *iI = me->mergingBipartitions.many; FOR_LIST(iI) { elemA = GET_PROFILE_ELEM(bipartitionsById, iI->index); switch (rogueMode) { case VANILLA_CONSENSUS_OPT : { if(elemA->treeVectorSupport > thresh) me->supportLost += computeSupport ? elemA->treeVectorSupport : 1; break ; } case ML_TREE_OPT: { if(elemA->isInMLTree) me->supportLost += computeSupport ? elemA->treeVectorSupport : 1 ; break; } default : assert(0); } } } else { elemA = GET_PROFILE_ELEM(bipartitionsById, me->mergingBipartitions.pair[0]); elemB = GET_PROFILE_ELEM(bipartitionsById, me->mergingBipartitions.pair[1]); switch(rogueMode) { case MRE_CONSENSUS_OPT: case VANILLA_CONSENSUS_OPT: { if(elemA->treeVectorSupport > thresh) me->supportLost += computeSupport ? elemA->treeVectorSupport : 1 ; if(elemB->treeVectorSupport > thresh) me->supportLost += computeSupport ? elemB->treeVectorSupport : 1; break; } case ML_TREE_OPT: { if(elemA->isInMLTree) me->supportLost += computeSupport ? elemA->treeVectorSupport : 1 ; if(elemB->isInMLTree) me->supportLost += computeSupport ? elemB->treeVectorSupport : 1 ; } } } } void evaluateDropset(HashTable *mergingHash, Dropset *dropset,Array *bipartitionsById, List *consensusBipsCanVanish ) { int result = 0; List *allElems = NULL, *elemsToCheck = NULL; if(maxDropsetSize == 1) elemsToCheck = dropset->ownPrimeE ; else { List *otherIter = dropset->acquiredPrimeE; FOR_LIST(otherIter) APPEND(otherIter->value, elemsToCheck); otherIter = dropset->complexEvents; FOR_LIST(otherIter) APPEND(otherIter->value, elemsToCheck); allElems = elemsToCheck; } BitVector *bipsSeen = CALLOC(GET_BITVECTOR_LENGTH(bipartitionsById->length), sizeof(BitVector)); FOR_LIST(elemsToCheck) { MergingEvent *me = (MergingEvent*)elemsToCheck->value; if(NOT me->computed) { getLostSupportThreshold(me, bipartitionsById); getSupportGainedThreshold(me, bipartitionsById); me->computed = TRUE; } result -= me->supportLost; if( me->supportGained && NOT mergedBipVanishes(me, bipartitionsById, dropset->taxaToDrop) ) result += me->supportGained; if(me->isComplex) { IndexList *iI = me->mergingBipartitions.many ; FOR_LIST(iI) { assert(NOT NTH_BIT_IS_SET(bipsSeen, iI->index)); if(NTH_BIT_IS_SET(bipsSeen, iI->index)) { PR("problem:"); printIndexList(me->mergingBipartitions.many); PR("at "); printIndexList(dropset->taxaToDrop); PR("\n"); exit(0); } FLIP_NTH_BIT(bipsSeen, iI->index); } } else { assert( NOT NTH_BIT_IS_SET(bipsSeen, me->mergingBipartitions.pair[0])); assert( NOT NTH_BIT_IS_SET(bipsSeen, me->mergingBipartitions.pair[1])); FLIP_NTH_BIT(bipsSeen,me->mergingBipartitions.pair[0]); FLIP_NTH_BIT(bipsSeen,me->mergingBipartitions.pair[1]); } } freeListFlat(allElems); /* handle vanishing bip */ List *iter = consensusBipsCanVanish; FOR_LIST(iter) { ProfileElem *elem = iter->value; switch(rogueMode) { case VANILLA_CONSENSUS_OPT : { if(elem->treeVectorSupport > thresh && NOT NTH_BIT_IS_SET(bipsSeen, elem->id) && bipartitionVanishesP(elem,dropset)) result -= computeSupport ? elem->treeVectorSupport : 1; break; } case ML_TREE_OPT: { if(elem->isInMLTree && NOT NTH_BIT_IS_SET(bipsSeen, elem->id) && bipartitionVanishesP(elem,dropset)) result -= computeSupport ? elem->treeVectorSupport : 1; break; } default: assert(0); } } free(bipsSeen); dropset->improvement = result; } List *getConsensusBipsCanVanish(Array *bipartitionProfile) { List *consensusBipsCanVanish = NULL; if(rogueMode == VANILLA_CONSENSUS_OPT || rogueMode == MRE_CONSENSUS_OPT) { int i; FOR_0_LIMIT(i,bipartitionProfile->length) { ProfileElem *elem = GET_PROFILE_ELEM(bipartitionProfile, i); if(NOT elem) break; if(elem->numberOfBitsSet - maxDropsetSize > 1 ) break; if(elem->treeVectorSupport > thresh) APPEND(elem,consensusBipsCanVanish); } } else if(ML_TREE_OPT) { int i; FOR_0_LIMIT(i,bipartitionProfile->length) { ProfileElem *elem = GET_PROFILE_ELEM(bipartitionProfile, i); if(NOT elem) break; if(elem->isInMLTree) APPEND(elem,consensusBipsCanVanish); } } return consensusBipsCanVanish; } Dropset *evaluateEvents(HashTable *mergingHash, Array *bipartitionsById, Array *bipartitionProfile) { Dropset *result = NULL; int i ; List *consensusBipsCanVanish = getConsensusBipsCanVanish(bipartitionProfile); if( NOT mergingHash->entryCount) return NULL ; /* gather dropsets in array */ Array *allDropsets = CALLOC(1,sizeof(Array)) ; allDropsets->length = mergingHash->entryCount; allDropsets->arrayTable = CALLOC(mergingHash->entryCount, sizeof(Dropset*)); int cnt = 0; HashTableIterator *htIter; FOR_HASH(htIter, mergingHash) { GET_DROPSET_ELEM(allDropsets,cnt) = getCurrentValueFromHashTableIterator(htIter); cnt++; } free(htIter); assert(cnt == mergingHash->entryCount); /* compute MRE stuff */ if(rogueMode == MRE_CONSENSUS_OPT) { #ifdef PARALLEL numberOfJobs = allDropsets->length; globalPArgs->bipartitionsById = bipartitionsById; globalPArgs->allDropsets = allDropsets; masterBarrier(THREAD_MRE, globalPArgs); #else FOR_0_LIMIT(i,allDropsets->length) { Dropset *dropset = GET_DROPSET_ELEM(allDropsets, i); dropset->improvement = getSupportOfMRETree(bipartitionsById, dropset) - cumScore; } #endif } /* evaluate dropsets */ if(rogueMode != MRE_CONSENSUS_OPT) { #ifdef PARALLEL numberOfJobs = allDropsets->length; globalPArgs->mergingHash = mergingHash; globalPArgs->allDropsets = allDropsets; globalPArgs->bipartitionsById = bipartitionsById; globalPArgs->consensusBipsCanVanish = consensusBipsCanVanish; masterBarrier(THREAD_EVALUATE_EVENTS, globalPArgs); #else FOR_0_LIMIT(i, allDropsets->length) { Dropset *dropset = GET_DROPSET_ELEM(allDropsets, i); evaluateDropset(mergingHash, dropset, bipartitionsById, consensusBipsCanVanish); } #endif } FOR_0_LIMIT(i,allDropsets->length) { Dropset *dropset = GET_DROPSET_ELEM(allDropsets, i); if(NOT result) result = dropset; else { int drSize = lengthIndexList(dropset->taxaToDrop), resSize = lengthIndexList(result->taxaToDrop); double oldQuality = labelPenalty == 0.0 ? result->improvement * drSize : (double)(result->improvement / (double)(computeSupport ? numberOfTrees : 1.0)) - (double)resSize; double newQuality = labelPenalty == 0.0 ? dropset->improvement * resSize : (double)(dropset->improvement / (double)(computeSupport ? numberOfTrees : 1.0)) - (double)drSize; if( (newQuality > oldQuality) ) result = dropset; } } freeListFlat(consensusBipsCanVanish); free(allDropsets->arrayTable); free(allDropsets); if(labelPenalty == 0.0 && result->improvement > 0) return result; else if(labelPenalty != 0.0 && (result->improvement / (computeSupport ? numberOfTrees : 1.0) - lengthIndexList(result->taxaToDrop)) > 0.0 ) return result; else return NULL; } void cleanup_updateNumBitsAndCleanArrays(Array *bipartitionProfile, Array *bipartitionsById, BitVector *mergingBipartitions, BitVector *newCandidates, Dropset *dropset) { int profileIndex; FOR_0_LIMIT(profileIndex,bipartitionProfile->length) { ProfileElem *elem = GET_PROFILE_ELEM(bipartitionProfile,profileIndex); if( NOT elem ) continue; /* check if number of bits has changed */ if(NOT NTH_BIT_IS_SET(mergingBipartitions,elem->id)) { if( mxtips - taxaDropped - 2 * elem->numberOfBitsSet <= 2 * maxDropsetSize ) FLIP_NTH_BIT(newCandidates, elem->id); IndexList *iter = dropset->taxaToDrop; boolean taxonDroppedP = FALSE; FOR_LIST(iter) { if(NTH_BIT_IS_SET(elem->bitVector, iter->index)) { taxonDroppedP = TRUE; UNFLIP_NTH_BIT(elem->bitVector, iter->index); elem->numberOfBitsSet--; } } if(taxonDroppedP) { if(elem->numberOfBitsSet < 2) { UNFLIP_NTH_BIT(newCandidates, elem->id); FLIP_NTH_BIT(mergingBipartitions, elem->id); } else FLIP_NTH_BIT(newCandidates, elem->id); } } /* bip has been merged or vanished */ if(NTH_BIT_IS_SET(mergingBipartitions,elem->id)) { assert(NOT NTH_BIT_IS_SET(newCandidates, elem->id)); GET_PROFILE_ELEM(bipartitionProfile, profileIndex) = NULL; GET_PROFILE_ELEM(bipartitionsById, elem->id) = NULL; freeProfileElem(elem); } } } BitVector *cleanup_applyAllMergerEvents(Array *bipartitionsById, Dropset *bestDropset, BitVector *mergingBipartitions) { BitVector *candidateBips = CALLOC(GET_BITVECTOR_LENGTH(bipartitionsById->length), sizeof(BitVector)) ; if( bestDropset) { #ifdef PRINT_VERY_VERBOSE printDropset(bestDropset); #endif List *iter = NULL ; if(maxDropsetSize == 1) iter = bestDropset->ownPrimeE; else iter = bestDropset->acquiredPrimeE; FOR_LIST(iter) { int newBipId = cleanup_applyOneMergerEvent((MergingEvent*)iter->value, bipartitionsById, mergingBipartitions); FLIP_NTH_BIT(candidateBips, newBipId); } if(maxDropsetSize > 1 ) { iter = bestDropset->complexEvents; FOR_LIST(iter) { int newBipId = cleanup_applyOneMergerEvent((MergingEvent*)iter->value, bipartitionsById, mergingBipartitions); FLIP_NTH_BIT(candidateBips, newBipId); } } } return candidateBips; } void cleanup_rehashDropsets(HashTable *mergingHash, Dropset *bestDropset) { if(maxDropsetSize == 1) return; IndexList *taxaToDrop = bestDropset->taxaToDrop; List *allDropsets = NULL; HashTableIterator *htIter; FOR_HASH(htIter, mergingHash) { Dropset *dropset = getCurrentValueFromHashTableIterator(htIter); allDropsets = appendToList(dropset, allDropsets); } free(htIter); List *iter = allDropsets; FOR_LIST(iter) { Dropset *dropset = (Dropset*)iter->value; if( NOT dropset) break; if(NOT dropset->ownPrimeE || isSubsetOf(dropset->taxaToDrop, taxaToDrop) ) { removeElementFromHash(mergingHash, dropset); freeDropsetDeep(dropset, FALSE); } else if(haveIntersection(dropset->taxaToDrop, taxaToDrop)) /* needs reinsert */ { removeElementFromHash(mergingHash, dropset); #ifdef MYDEBUG int length = lengthIndexList(dropset->taxaToDrop); #endif dropset->taxaToDrop = setMinusOf(dropset->taxaToDrop, taxaToDrop); #ifdef MYDEBUG assert(length > lengthIndexList(dropset->taxaToDrop)); #endif unsigned int hv = mergingHash->hashFunction(mergingHash, dropset); Dropset *found = searchHashTable(mergingHash, dropset, hv); if( NOT found) insertIntoHashTable(mergingHash,dropset,hv); else /* reuse the merging events */ { List *iter, *next; for(iter = dropset->ownPrimeE; iter; iter = next) { /* TODO potential error: double check, if this stuff did not already occur would be great */ next = iter->next; iter->next = found->ownPrimeE; found->ownPrimeE = iter; } freeIndexList(dropset->taxaToDrop); free(dropset); } } } freeListFlat(allDropsets); } BitVector *cleanup(All *tr, HashTable *mergingHash, Dropset *bestDropset, BitVector *candidateBips, Array *bipartitionProfile, Array *bipartitionsById) { IndexList *ilIter; BitVector *bipsToVanish = CALLOC(GET_BITVECTOR_LENGTH(bipartitionsById->length), sizeof(BitVector)); /* apply merging events for best dropset */ candidateBips = cleanup_applyAllMergerEvents(bipartitionsById, bestDropset, bipsToVanish); if(NOT bestDropset) { free(bipsToVanish); return candidateBips; } /* add to list of dropped taxa */ ilIter = bestDropset->taxaToDrop; FOR_LIST(ilIter) FLIP_NTH_BIT(droppedTaxa,ilIter->index); /* remove merging bipartitions from arrays (not candidates) */ cleanup_updateNumBitsAndCleanArrays(bipartitionProfile, bipartitionsById, bipsToVanish,candidateBips,bestDropset ); removeElementFromHash(mergingHash, bestDropset); cleanup_mergingEvents(mergingHash, bipsToVanish, candidateBips, bipartitionProfile->length); cleanup_rehashDropsets(mergingHash, bestDropset); #ifdef PRINT_VERY_VERBOSE int i; PR("CLEAN UP: need to recompute bipartitions "); FOR_0_LIMIT(i, bipartitionProfile->length) if(NTH_BIT_IS_SET(candidateBips, i)) PR("%d,", i); PR("\n"); #endif #ifdef MYDEBUG debug_dropsetConsistencyCheck(mergingHash); #endif #ifdef PRINT_TIME PR("[%f] executed the merging events \n", updateTime(&timeInc)); #endif #ifdef PRINT_VERY_VERBOSE PR("bips present %d (id) %d (profile)\n", getNumberOfBipsPresent(bipartitionsById), getNumberOfBipsPresent(bipartitionProfile)); #endif cumScore += bestDropset->improvement; if(cumScore > bestCumEver) bestCumEver = cumScore; bestLastTime += bestDropset->improvement; dropsetPerRound[dropRound+1] = bestDropset; cumScores[dropRound+1] = cumScore; printDropsetImprovement(bestDropset, tr, cumScore); return candidateBips; } void doomRogues(All *tr, char *bootStrapFileName, char *dontDropFile, char *treeFile, boolean mreOptimisation, int rawThresh) { double startingTime = gettime(); timeInc = gettime(); int *indexByNumberBits, i; FILE *bootstrapTreesFile = getNumberOfTrees(tr, bootStrapFileName), *rogueOutput = getOutputFileFromString("droppedRogues"); BitVector *candidateBips; HashTable *mergingHash = NULL; numberOfTrees = tr->numberOfTrees; if(strlen(treeFile)) { rogueMode = ML_TREE_OPT; if(mreOptimisation) { PR("ERROR: Please choose either support in the MRE consensus tree OR the bipartitions in the ML tree for optimization.\n"); exit(-1); } PR("mode: optimization of support of ML tree bipartitions in the bootstrap tree set.\n"); } else if(mreOptimisation) { rogueMode = MRE_CONSENSUS_OPT; thresh = tr->numberOfTrees * 0.5; PR("mode: optimization on MRE consensus tree. \n"); } else { rogueMode = VANILLA_CONSENSUS_OPT; thresh = tr->numberOfTrees * rawThresh / 100; if(thresh == tr->numberOfTrees) thresh--; PR("mode: optimization on consensus tree. Bipartition is part of consensus, if it occurs in more than %d trees\n", thresh); } FILE *bestTree = (rogueMode == ML_TREE_OPT) ? myfopen(treeFile,"r") : NULL; mxtips = tr->mxtips; tr->bitVectorLength = GET_BITVECTOR_LENGTH(mxtips); Array *bipartitionProfile = getOriginalBipArray(tr, bestTree, bootstrapTreesFile); if(maxDropsetSize >= mxtips - 3) { PR("\nMaximum dropset size (%d) too large. If we prune %d taxa, then there \n\ will be no bipartitions left and thus such a pruned tree set can never \n\ have a higher information content (in terms of RBIC) than the original \n\ tree.\n", maxDropsetSize, mxtips-3); exit(-1); } dropsetPerRound = CALLOC(mxtips, sizeof(Dropset*)); Dropset *bestDropset = NULL; neglectThose = neglectThoseTaxa(tr, dontDropFile); initializeRandForTaxa(mxtips); treeVectorLength = GET_BITVECTOR_LENGTH(tr->numberOfTrees); bitVectorLength = GET_BITVECTOR_LENGTH(tr->mxtips); droppedTaxa = CALLOC(bitVectorLength, sizeof(BitVector)); paddingBits = CALLOC(GET_BITVECTOR_LENGTH(mxtips), sizeof(BitVector)); for(i = mxtips; i < GET_BITVECTOR_LENGTH(mxtips) * MASK_LENGTH; ++i) FLIP_NTH_BIT(paddingBits,i); FOR_0_LIMIT(i,bipartitionProfile->length) { ProfileElem *elem = ((ProfileElem**)bipartitionProfile->arrayTable)[i]; elem->numberOfBitsSet = genericBitCount(elem->bitVector, bitVectorLength); } Array *bipartitionsById = CALLOC(1,sizeof(Array)); bipartitionsById->arrayTable = CALLOC(bipartitionProfile->length, sizeof(ProfileElem*)); bipartitionsById->length = bipartitionProfile->length; FOR_0_LIMIT(i,bipartitionsById->length) GET_PROFILE_ELEM(bipartitionsById,i) = GET_PROFILE_ELEM(bipartitionProfile, i); qsort(bipartitionsById->arrayTable, bipartitionsById->length, sizeof(ProfileElem**), sortById); numBips = bipartitionProfile->length; cumScore = getInitScore(bipartitionProfile); cumScores = CALLOC(mxtips-3, sizeof(int)); cumScores[0] = cumScore; bestCumEver = cumScore; bestLastTime = cumScore; fprintf(rogueOutput, "num\ttaxNum\ttaxon\trawImprovement\tRBIC\n"); fprintf(rogueOutput, "%d\tNA\tNA\t%d\t%f\n", 0, 0, (double)cumScore /(numberOfTrees * (mxtips-3)) ); PR("[%f] initialisation done (initScore = %f, numBip=%d)\n", updateTime(&timeInc), (double)cumScore / (double)((tr->mxtips-3) * (computeSupport ? tr->numberOfTrees : 1 ) ), bipartitionsById->length); boolean firstMerge= TRUE; candidateBips = CALLOC(GET_BITVECTOR_LENGTH(bipartitionProfile->length),sizeof(BitVector)); FOR_0_LIMIT(i,bipartitionProfile->length) FLIP_NTH_BIT(candidateBips,i); mergingHash = createHashTable(tr->mxtips * maxDropsetSize * HASH_TABLE_SIZE_CONST, NULL, dropsetHashValue, dropsetEqual); #ifdef PARALLEL globalPArgs = CALLOC(1,sizeof(parallelArguments)); startThreads(); #endif /* main loop */ do { #ifdef PRINT_VERY_VERBOSE PR("ROUND %d ================================================================================================================================================================================================================\n",dropRound); PR("dropped vector is: "); printBitVector(droppedTaxa, GET_BITVECTOR_LENGTH(mxtips)); PR("\n"); #endif /***********/ /* prepare */ /***********/ bestDropset = NULL; unifyBipartitionRepresentation(bipartitionProfile,droppedTaxa); indexByNumberBits = createNumBitIndex(bipartitionProfile, mxtips); #ifdef PRINT_TIME PR("[%f] sorting bipartition profile\n", updateTime(&timeInc)); #endif #ifdef PRINT_VERY_VERBOSE printBipartitionProfile(bipartitionProfile); #endif /***********************************/ /* create / update merging events */ /***********************************/ #ifdef PARALLEL numberOfJobs = bipartitionProfile->length; globalPArgs->mergingHash = mergingHash; globalPArgs->candidateBips = candidateBips; globalPArgs->bipartitionsById = bipartitionsById; globalPArgs->bipartitionProfile = bipartitionProfile ; globalPArgs->indexByNumberBits = indexByNumberBits; globalPArgs->firstMerge = firstMerge; masterBarrier(THREAD_GET_EVENTS, globalPArgs); free(candidateBips); #else createOrUpdateMergingHash(tr, mergingHash, bipartitionProfile, bipartitionsById, candidateBips, firstMerge, indexByNumberBits ); #endif firstMerge = FALSE; #ifdef MYDEBUG debug_dropsetConsistencyCheck(mergingHash); debug_mergingHashSanityCheck(mergingHash, bipartitionProfile->length); #endif #ifdef PRINT_TIME PR("[%f] computed / updated events\n", updateTime(&timeInc)); #endif /* clear */ /******************/ /* combine events */ /******************/ if(maxDropsetSize > 1) mergingHash = combineMergerEvents(mergingHash, bipartitionsById); #ifdef PRINT_TIME PR("[%f] combined events\n", updateTime(&timeInc)); #endif #ifdef PRINT_VERY_VERBOSE if(mergingHash->entryCount > 0) printMergingHash(mergingHash); #endif /**********************/ /* evaluate dropsets */ /**********************/ bestDropset = evaluateEvents(mergingHash, bipartitionsById, bipartitionProfile); free(indexByNumberBits); #ifdef PRINT_TIME PR("[%f] calculated per dropset improvement\n", updateTime(&timeInc)); #endif /*****************/ /* cleanup */ /*****************/ candidateBips = cleanup(tr, mergingHash, bestDropset, candidateBips, bipartitionProfile, bipartitionsById); #ifdef MYDEBUG int l,m; FOR_0_LIMIT(l,bipartitionProfile->length) { ProfileElem *elemA = GET_PROFILE_ELEM(bipartitionProfile,l); if(NOT elemA ) continue; for(m = l+1; m < bipartitionProfile->length; ++m) { ProfileElem *elemB = GET_PROFILE_ELEM(bipartitionProfile,m); if( NOT elemB) continue; if(elemA->numberOfBitsSet == elemB->numberOfBitsSet && myBitVectorEqual(elemA,elemB)) { PR("%d and %d are equal!\n", elemA->id, elemB->id); printBitVector(elemA->bitVector, bitVectorLength); PR("\n"); printBitVector(elemB->bitVector, bitVectorLength); PR("\n"); /* assert(0); */ exit(-1); } } } #endif #ifdef PRINT_VERY_VERBOSE PR("dropped vector is: "); printBitVector(droppedTaxa, GET_BITVECTOR_LENGTH(mxtips)); PR("\n"); #endif if(bestDropset) taxaDropped += lengthIndexList(bestDropset->taxaToDrop); dropRound++; } while(bestDropset); /* print out result */ printRogueInformationToFile(tr, rogueOutput, bestCumEver,cumScores, dropsetPerRound); PR("total time elapsed: %f\n", updateTime(&startingTime)); /* free everything */ FOR_0_LIMIT(i, bipartitionProfile->length) { ProfileElem *elem = GET_PROFILE_ELEM(bipartitionProfile,i); if(elem) freeProfileElem(elem); } free(((ProfileElemAttr*)bipartitionProfile->commonAttributes)); freeArray(bipartitionProfile); freeArray(bipartitionsById); destroyHashTable(mergingHash, freeDropsetDeepInHash); fclose(rogueOutput); for(i= 0 ; i < dropRound + 1; ++i) { Dropset *theDropset = dropsetPerRound[i]; if(theDropset) freeDropsetDeepInEnd(theDropset); } free(dropsetPerRound); free(neglectThose); free(cumScores); free(paddingBits); free(randForTaxa); free(droppedTaxa); free(candidateBips); } void printHelpFile() { printVersionInfo(FALSE); printf("This program implements the RogueNaRok algorithm for rogue taxon identification.\n\nSYNTAX: ./%s -i -n [-x ] [-c ] [-b] [-s ] [-w ] [-h]\n", programName); printf("\n\tOBLIGATORY:\n"); printf("-i \n\tA collection of bootstrap trees.\n"); printf("-n \n\tAn identifier for this run.\n"); printf("\n\tOPTIONAL:\n"); printf("-t \n\tIf a single best-known tree (such as an ML or MP\n\t\ tree) is provided, RogueNaRok optimizes the bootstrap support in this\n\t\ best-known tree (still drawn from the bootstrap trees). The threshold\n\t\ parameter is ignored.\n"); printf("-x \n\ttaxa in this file (one taxon per line) will not be\n\t\ considered for pruning.\n"); printf("-c \n\t A threshold or mode for the consensus tree that is\n\t\ optimized. Specify a value between 50 (majority rule consensus) and\n\t\ 100 (strict consensus) or MR (for the extended majority rule\n\t\ consensus). Note that rogue taxa identified with respect to different\n\t\ thresholds can vary substantially. DEFAULT: MR consensus\n"); printf("-b\n\tInstead of trying to maximize the support in the consensus tree,\n\t\ the RogueNaRok will try to maximize the number of bipartition in the\n\t\ final tree by pruning taxa. DEFAULT: off\n"); printf("-L \n\ta weight factor to penalize for dropset size. \n\t\ Factor=1 is Pattengale's criterion. The higher the value, the more \n\t\ conservative the algorithm is in pruning taxa. DEFAULT: 0.0 (=RBIC)\n"); printf("-s \n\tmaximum size of dropset per iteration. If\n\t\ dropsetSize == n, then RogueNaRok will test in each iteration which\n\t\ tuple of n taxa increases optimality criterion the most and prunes\n\t\ taxa accordingly. This improves the result, but runtimes will\n\t\ increase at least linearly. DEFAULT: 1\n"); printf("-w \n\tA working directory where output files are created.\n"); printf("-T \n\tExecute RogueNaRok in parallel with threads. You need to compile the program for parallel execution first.\n"); printf("-h\n\tThis help file.\n"); printf("\nMINIMAL EXAMPLE:\n./%s -i -n run1\n", programName); } int main(int argc, char *argv[]) { int c, threshold = 50; char *excludeFile = "", *bootTrees = "", *treeFile = ""; boolean mreOptimisation = FALSE; if(sizeof(int) != 4) { printf("I am sorry, RogueNaRok currently does not support your computer architecture. The code assumes that an integer (type int) consists of 4 bytes.\n"); assert(sizeof(int) == 4); } programName = PROG_NAME; programVersion = PROG_VERSION; programReleaseDate = PROG_RELEASE_DATE; while ((c = getopt (argc, argv, "i:t:n:x:whc:s:bT:L:")) != -1) switch (c) { case 'i': bootTrees = optarg; break; case 'T': { #ifndef PARALLEL printf("\n\nFor running RogueNaRok in parallel, please compile with \n\n"); exit(-1); #else numberOfThreads = wrapStrToL(optarg); #endif break; } case 'b': computeSupport = FALSE; break; case 'n': strcpy(run_id, optarg); break; case 't': treeFile = optarg; break; case 's': maxDropsetSize = wrapStrToL(optarg); break; case 'x': excludeFile = optarg; break; case 'w': strcpy(workdir, optarg) ; break; case 'L': labelPenalty = wrapStrToDouble(optarg); break; case 'c': { if( NOT strcmp(optarg, "MRE")) { mreOptimisation = TRUE; threshold = 50; } else threshold = wrapStrToL(optarg); break; } case 'h': default: { printHelpFile(); abort (); } } /* initialize fast bit counting */ compute_bits_in_16bits(); initializeMask(); #ifdef PARALLEL if(NOT numberOfThreads) { printf("\n\nPlease specify the number of threads for parallel execution with -T\n\n"); exit(-1); } if(numberOfThreads == 1) { printf("\n\nCalling parallel version of RogueNaRok with 1 thread is deprecated.\n\ Please compile a sequential version of RogueNaRok instead.\n\n"); exit(-1); } #endif if( NOT strcmp(treeFile, "")) rogueMode = ML_TREE_OPT; if( NOT strcmp(bootTrees, "")) { printf("ERROR: Please specify a file containing bootstrap trees via -i.\n"); printHelpFile(); exit(-1); } if( NOT strcmp(run_id, "")) { printf("ERROR: Please specify a run-id via -n\n"); printHelpFile(); exit(-1); } if(threshold < 50) { printf("ERROR: Only accepting threshold values between 50 (MR) and 100 (strict).\n"); exit(-1); } if(threshold != 50 && strcmp(treeFile, "") ) { printf("ERROR: threshold option -c not available in combination with best-known tree.\n"); exit(-1); } All *tr = CALLOC(1,sizeof(All)); setupInfoFile(); if (NOT setupTree(tr, bootTrees)) { PR("Something went wrong during tree initialisation. Sorry.\n"); exit(-1); } doomRogues(tr, bootTrees, excludeFile, treeFile, mreOptimisation, threshold); freeTree(tr); free(mask32); free(infoFileName); return 0; } RogueNaRok-1.0/Tree.c000066400000000000000000001016701165154717300144500ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include "Tree.h" static int treeGetCh (FILE *fp) ; static void insertHashBootstop(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int treeNumber, int treeVectorLength, unsigned int position); static void treeEchoContext (FILE *fp1, FILE *fp2, int n); static double getBranchLength(All *tr, int perGene, nodeptr p); boolean isTip(int number, int maxTips); void getxnode (nodeptr p); static void insertHashAll(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int treeNumber, unsigned int position); static void insertHash(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int bipNumber, unsigned int position); static int countHash(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, unsigned int position); static unsigned int KISS32(void) { static unsigned int x = 123456789, y = 362436069, z = 21288629, w = 14921776, c = 0; unsigned int t; x += 545925293; y ^= (y<<13); y ^= (y>>17); y ^= (y<<5); t = z + w + c; z = w; c = (t>>31); w = t & 2147483647; return (x+y+w); } stringHashtable *initStringHashTable(unsigned int n) { static const unsigned int initTable[] = {53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593, 49157, 98317, 196613, 393241, 786433, 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319, 201326611, 402653189, 805306457, 1610612741}; stringHashtable *h = (stringHashtable*)malloc(sizeof(stringHashtable)); unsigned int tableSize, i, primeTableLength = sizeof(initTable)/sizeof(initTable[0]); i = 0; while(initTable[i] < n && i < primeTableLength) i++; assert(i < primeTableLength); tableSize = initTable[i]; h->table = (stringEntry**)calloc(tableSize, sizeof(stringEntry*)); h->tableSize = tableSize; return h; } static unsigned int hashString(char *p, unsigned int tableSize) { unsigned int h = 0; for(; *p; p++) h = 31 * h + *p; return (h % tableSize); } void addword(char *s, stringHashtable *h, int nodeNumber) { unsigned int position = hashString(s, h->tableSize); stringEntry *p = h->table[position]; for(; p!= NULL; p = p->next) { if(strcmp(s, p->word) == 0) return; } p = (stringEntry *)malloc(sizeof(stringEntry)); assert(p); p->nodeNumber = nodeNumber; p->word = (char *)malloc((strlen(s) + 1) * sizeof(char)); strcpy(p->word, s); /* free(s); */ p->next = h->table[position]; h->table[position] = p; } int getNumberOfTaxa(All *tr, char *bootStrapFile) { FILE *f = myfopen(bootStrapFile, "rb"); char **nameList, buffer[nmlngth + 2]; int i = 0, c, taxaSize = 1024, taxaCount = 0; nameList = (char**)malloc(sizeof(char*) * taxaSize); while((c = fgetc(f)) != ';') { if(c == '(' || c == ',') { c = fgetc(f); if(c == '(' || c == ',') ungetc(c, f); else { i = 0; do { buffer[i++] = c; c = fgetc(f); } while(c != ':' && c != ')' && c != ','); buffer[i] = '\0'; for(i = 0; i < taxaCount; i++) { if(strcmp(buffer, nameList[i]) == 0) { printf("A taxon labelled by %s appears twice in the first tree of tree collection %s, exiting ...\n", buffer, bootStrapFile); exit(-1); } } if(taxaCount == taxaSize) { taxaSize *= 2; nameList = (char **)realloc(nameList, sizeof(char*) * taxaSize); } nameList[taxaCount] = (char*)malloc(sizeof(char) * (strlen(buffer) + 1)); strcpy(nameList[taxaCount], buffer); taxaCount++; ungetc(c, f); } } } printf("Found a total of %d taxa in first tree of tree collection %s\n", taxaCount, bootStrapFile); printf("Expecting all remaining trees in collection to have the same taxon set\n\n"); tr->nameList = (char **)malloc(sizeof(char *) * (taxaCount + 1)); for(i = 1; i <= taxaCount; i++) tr->nameList[i] = nameList[i - 1]; free(nameList); tr->nameHash = initStringHashTable(10 * taxaCount); for(i = 1; i <= taxaCount; i++) addword(tr->nameList[i], tr->nameHash, i); fclose(f); return taxaCount; } boolean setupTree (All *tr, char *bootstrapFile) { nodeptr p0, p, q; int i, j, k, tips, inter; tips = getNumberOfTaxa(tr, bootstrapFile); tr->mxtips = tips; tips = tr->mxtips; inter = tr->mxtips - 1; tr->numberOfTrees = -1; if (NOT(p0 = (nodeptr) malloc((tips + 3*inter) * sizeof(node)))) { printf("ERROR: Unable to obtain sufficient tree memory\n"); return FALSE; } if (NOT(tr->nodep = (nodeptr *) malloc((2*tr->mxtips) * sizeof(nodeptr)))) { printf("ERROR: Unable to obtain sufficient tree memory, too\n"); return FALSE; } tr->nodep[0] = (node *) NULL; /* Use as 1-based array */ for (i = 1; i <= tips; i++) { p = p0++; p->hash = KISS32(); /* hast table stuff */ p->x = 0; p->number = i; p->next = p; p->back = (node *)NULL; p->bInf = (branchInfo *)NULL; for(k = 0; k < NUM_BRANCHES; k++) { p->xs[k] = 0; p->backs[k] = (nodeptr)NULL; } for(k = 0; k < VECTOR_LENGTH; k++) p->isPresent[k] = 0; tr->nodep[i] = p; } for (i = tips + 1; i <= tips + inter; i++) { q = (node *) NULL; for (j = 1; j <= 3; j++) { p = p0++; if(j == 1) p->x = 1; else p->x = 0; p->number = i; p->next = q; p->bInf = (branchInfo *)NULL; p->back = (node *) NULL; p->hash = 0; if(j == 1) for(k = 0; k < NUM_BRANCHES; k++) { p->xs[k] = 1; p->backs[k] = (nodeptr)NULL; } else for(k = 0; k < NUM_BRANCHES; k++) { p->xs[k] = 0; p->backs[k] = (nodeptr)NULL; } for(k = 0; k < VECTOR_LENGTH; k++) p->isPresent[k] = 0; q = p; } p->next->next->next = p; tr->nodep[i] = p; } tr->start = (node *) NULL; tr->ntips = 0; tr->nextnode = 0; for(i = 0; i < tr->numBranches; i++) tr->partitionSmoothed[i] = FALSE; return TRUE; } nodeptr findAnyTip(nodeptr p, int numsp) { return isTip(p->number, numsp) ? p : findAnyTip(p->next->back, numsp); } void hookup (nodeptr p, nodeptr q, double *z, int numBranches) { int i; p->back = q; q->back = p; for(i = 0; i < numBranches; i++) p->z[i] = q->z[i] = z[i]; } static void newviewBipartitions(BitVector **bitVectors, nodeptr p, int numsp, unsigned int vectorLength) { if(isTip(p->number, numsp)) return; { nodeptr q = p->next->back, r = p->next->next->back; unsigned int *vector = bitVectors[p->number], *left = bitVectors[q->number], *right = bitVectors[r->number]; unsigned int i; while(NOT p->x) { if(NOT p->x) getxnode(p); } p->hash = q->hash ^ r->hash; if(isTip(q->number, numsp) && isTip(r->number, numsp)) { for(i = 0; i < vectorLength; i++) vector[i] = left[i] | right[i]; } else { if(isTip(q->number, numsp) || isTip(r->number, numsp)) { if(isTip(r->number, numsp)) { nodeptr tmp = r; r = q; q = tmp; } while(NOT r->x) { if(NOT r->x) newviewBipartitions(bitVectors, r, numsp, vectorLength); } for(i = 0; i < vectorLength; i++) vector[i] = left[i] | right[i]; } else { while((NOT r->x) || (NOT q->x)) { if(NOT q->x) newviewBipartitions(bitVectors, q, numsp, vectorLength); if(NOT r->x) newviewBipartitions(bitVectors, r, numsp, vectorLength); } for(i = 0; i < vectorLength; i++) vector[i] = left[i] | right[i]; } } } } void bitVectorInitravSpecial(unsigned int **bitVectors, nodeptr p, int numsp, unsigned int vectorLength, hashtable *h, int treeNumber, int function, branchInfo *bInf, int *countBranches, int treeVectorLength, boolean traverseOnly, boolean computeWRF) { if(isTip(p->number, numsp)) return; else { nodeptr q = p->next; do { bitVectorInitravSpecial(bitVectors, q->back, numsp, vectorLength, h, treeNumber, function, bInf, countBranches, treeVectorLength, traverseOnly, computeWRF); q = q->next; } while(q != p); newviewBipartitions(bitVectors, p, numsp, vectorLength); assert(p->x); if(traverseOnly) { if(NOT(isTip(p->back->number, numsp))) *countBranches = *countBranches + 1; return; } if(NOT(isTip(p->back->number, numsp))) { unsigned int *toInsert = bitVectors[p->number]; unsigned int position = p->hash % h->tableSize; switch(function) { case BIPARTITIONS_ALL: insertHashAll(toInsert, h, vectorLength, treeNumber, position); *countBranches = *countBranches + 1; break; case GET_BIPARTITIONS_BEST: insertHash(toInsert, h, vectorLength, *countBranches, position); p->bInf = &bInf[*countBranches]; p->back->bInf = &bInf[*countBranches]; p->bInf->support = 0; p->bInf->oP = p; p->bInf->oQ = p->back; *countBranches = *countBranches + 1; break; case DRAW_BIPARTITIONS_BEST: { int found = countHash(toInsert, h, vectorLength, position); if(found >= 0) bInf[found].support = bInf[found].support + 1; *countBranches = *countBranches + 1; } break; case BIPARTITIONS_BOOTSTOP: insertHashBootstop(toInsert, h, vectorLength, treeNumber, treeVectorLength, position); *countBranches = *countBranches + 1; break; default: assert(0); } } } } void hookupDefault (nodeptr p, nodeptr q, int numBranches) { int i; p->back = q; q->back = p; for(i = 0; i < numBranches; i++) p->z[i] = q->z[i] = defaultz; } static boolean treeLabelEnd (int ch) { switch (ch) { case EOF: case '\0': case '\t': case '\n': case '\r': case ' ': case ':': case ',': case '(': case ')': case ';': return TRUE; default: break; } return FALSE; } static boolean treeGetLabel (FILE *fp, char *lblPtr, int maxlen) { int ch; boolean done, quoted, lblfound; if (--maxlen < 0) lblPtr = (char *) NULL; else if (lblPtr == NULL) maxlen = 0; ch = getc(fp); done = treeLabelEnd(ch); lblfound = NOT done; quoted = (ch == '\''); if (quoted && NOT done) { ch = getc(fp); done = (ch == EOF); } while (NOT done) { if (quoted) { if (ch == '\'') { ch = getc(fp); if (ch != '\'') break; } } else if (treeLabelEnd(ch)) break; if (--maxlen >= 0) *lblPtr++ = ch; ch = getc(fp); if (ch == EOF) break; } if (ch != EOF) (void) ungetc(ch, fp); if (lblPtr != NULL) *lblPtr = '\0'; return lblfound; } static boolean treeFlushLabel (FILE *fp) { return treeGetLabel(fp, (char *) NULL, (int) 0); } int lookupWord(char *s, stringHashtable *h) { unsigned int position = hashString(s, h->tableSize); stringEntry *p = h->table[position]; for(; p!= NULL; p = p->next) { if(strcmp(s, p->word) == 0) return p->nodeNumber; } return -1; } int treeFindTipByLabelString(char *str, All *tr) { int lookup = lookupWord(str, tr->nameHash); if(lookup > 0) { /* assert(! tr->nodep[lookup]->back); */ return lookup; } else { printf("ERROR: Cannot find tree species: %s\n", str); return 0; } } int treeFindTipName(FILE *fp, All *tr) { char str[nmlngth+2]; int n; if(treeGetLabel(fp, str, nmlngth+2)) n = treeFindTipByLabelString(str, tr); else n = 0; return n; } static boolean treeProcessLength (FILE *fp, double *dptr) { int ch; if ((ch = treeGetCh(fp)) == EOF) return FALSE; /* Skip comments */ (void) ungetc(ch, fp); if (fscanf(fp, "%lf", dptr) != 1) { printf("ERROR: treeProcessLength: Problem reading branch length\n"); treeEchoContext(fp, stdout, 40); printf("\n"); return FALSE; } return TRUE; } static int treeFlushLen (FILE *fp) { double dummy; int ch; ch = treeGetCh(fp); if (ch == ':') { ch = treeGetCh(fp); ungetc(ch, fp); if(NOT treeProcessLength(fp, & dummy)) return 0; return 1; } if (ch != EOF) (void) ungetc(ch, fp); return 1; } boolean isTip(int number, int maxTips) { assert(number > 0); if(number <= maxTips) return TRUE; else return FALSE; } static char *Tree2StringREC(char *treestr, All *tr, nodeptr p, boolean printBranchLengths, boolean printNames, boolean printLikelihood, boolean rellTree, boolean finalPrint, int perGene, boolean branchLabelSupport, boolean printSHSupport) { char *nameptr; if(isTip(p->number, tr->mxtips)) { if(printNames) { nameptr = tr->nameList[p->number]; sprintf(treestr, "%s", nameptr); } else sprintf(treestr, "%d", p->number); while (*treestr) treestr++; } else { *treestr++ = '('; treestr = Tree2StringREC(treestr, tr, p->next->back, printBranchLengths, printNames, printLikelihood, rellTree, finalPrint, perGene, branchLabelSupport, printSHSupport); *treestr++ = ','; treestr = Tree2StringREC(treestr, tr, p->next->next->back, printBranchLengths, printNames, printLikelihood, rellTree, finalPrint, perGene, branchLabelSupport, printSHSupport); if(p == tr->start->back) { *treestr++ = ','; treestr = Tree2StringREC(treestr, tr, p->back, printBranchLengths, printNames, printLikelihood, rellTree, finalPrint, perGene, branchLabelSupport, printSHSupport); } *treestr++ = ')'; } if(p == tr->start->back) { if(printBranchLengths && NOT rellTree) sprintf(treestr, ":0.0;\n"); else sprintf(treestr, ";\n"); } else { if(rellTree || branchLabelSupport || printSHSupport) { if(( NOT isTip(p->number, tr->mxtips)) && ( NOT isTip(p->back->number, tr->mxtips))) { assert(p->bInf != (branchInfo *)NULL); if(rellTree) sprintf(treestr, "%d:%8.20f", p->bInf->support, p->z[0]); if(branchLabelSupport) sprintf(treestr, ":%8.20f[%d]", p->z[0], p->bInf->support); if(printSHSupport) sprintf(treestr, ":%8.20f[%d]", getBranchLength(tr, perGene, p), p->bInf->support); } else { if(rellTree || branchLabelSupport) sprintf(treestr, ":%8.20f", p->z[0]); if(printSHSupport) sprintf(treestr, ":%8.20f", getBranchLength(tr, perGene, p)); } } else { if(printBranchLengths) /* sprintf(treestr, ":%8.20f", getBranchLength(tr, perGene, p)); */ sprintf(treestr, ":%8.20f", p->z[0]); else sprintf(treestr, "%s", "\0"); } } while (*treestr) treestr++; return treestr; } static double getBranchLength(All *tr, int perGene, nodeptr p) { double z = 0.0, x = 0.0; assert(perGene != NO_BRANCHES); assert(tr->fracchange != -1.0); z = p->z[0]; if (z < zmin) z = zmin; x = -log(z) * tr->fracchange; return x; /* here! */ } static nodeptr uprootTree (All *tr, nodeptr p, boolean readBranchLengths, boolean readConstraint) { nodeptr q, r, s, start; int n, i; for(i = tr->mxtips + 1; i < 2 * tr->mxtips - 1; i++) assert(i == tr->nodep[i]->number); if(isTip(p->number, tr->mxtips) || p->back) { printf("ERROR: Unable to uproot tree.\n"); printf(" Inappropriate node marked for removal.\n"); assert(0); } assert(p->back == (nodeptr)NULL); tr->nextnode = tr->nextnode - 1; assert(tr->nextnode < 2 * tr->mxtips); n = tr->nextnode; assert(tr->nodep[tr->nextnode]); if (n != tr->mxtips + tr->ntips - 1) { printf("ERROR: Unable to uproot tree. Inconsistent\n"); printf(" number of tips and nodes for rooted tree.\n"); assert(0); } q = p->next->back; /* remove p from tree */ r = p->next->next->back; assert(p->back == (nodeptr)NULL); if(readBranchLengths) { double b[NUM_BRANCHES]; int i; for(i = 0; i < tr->numBranches; i++) b[i] = (r->z[i] + q->z[i]); hookup (q, r, b, tr->numBranches); } else hookupDefault(q, r, tr->numBranches); if(readConstraint && tr->grouped) { if(tr->constraintVector[p->number] != 0) { printf("Root node to remove should have top-level grouping of 0\n"); assert(0); } } assert(NOT(isTip(r->number, tr->mxtips) && isTip(q->number, tr->mxtips))); assert(p->number > tr->mxtips); if(tr->ntips > 2 && p->number != n) { q = tr->nodep[n]; /* transfer last node's conections to p */ r = q->next; s = q->next->next; if(readConstraint && tr->grouped) tr->constraintVector[p->number] = tr->constraintVector[q->number]; hookup(p, q->back, q->z, tr->numBranches); /* move connections to p */ hookup(p->next, r->back, r->z, tr->numBranches); hookup(p->next->next, s->back, s->z, tr->numBranches); q->back = q->next->back = q->next->next->back = (nodeptr) NULL; } else p->back = p->next->back = p->next->next->back = (nodeptr) NULL; assert(tr->ntips > 2); start = findAnyTip(tr->nodep[tr->mxtips + 1], tr->mxtips); assert(isTip(start->number, tr->mxtips)); tr->rooted = FALSE; return start; } char *Tree2String(char *treestr, All *tr, nodeptr p, boolean printBranchLengths, boolean printNames, boolean printLikelihood, boolean rellTree, boolean finalPrint, int perGene, boolean branchLabelSupport, boolean printSHSupport) { Tree2StringREC(treestr, tr, p, printBranchLengths, printNames, printLikelihood, rellTree, finalPrint, perGene, branchLabelSupport, printSHSupport); while (*treestr) treestr++; return treestr; } boolean whitechar (int ch) { return (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r'); } static void treeEchoContext (FILE *fp1, FILE *fp2, int n) { /* treeEchoContext */ int ch; boolean waswhite; waswhite = TRUE; while (n > 0 && ((ch = getc(fp1)) != EOF)) { if (whitechar(ch)) { ch = waswhite ? '\0' : ' '; waswhite = TRUE; } else { waswhite = FALSE; } if (ch > '\0') {putc(ch, fp2); n--;} } } int treeFinishCom (FILE *fp, char **strp) { int ch; while ((ch = getc(fp)) != EOF && ch != ']') { if (strp != NULL) *(*strp)++ = ch; /* save character */ if (ch == '[') { /* nested comment; find its end */ if ((ch = treeFinishCom(fp, strp)) == EOF) break; if (strp != NULL) *(*strp)++ = ch; /* save closing ] */ } } if (strp != NULL) **strp = '\0'; /* terminate string */ return ch; } static int treeGetCh (FILE *fp) { int ch; while ((ch = getc(fp)) != EOF) { if (whitechar(ch)) ; else if (ch == '[') { /* comment; find its end */ if ((ch = treeFinishCom(fp, (char **) NULL)) == EOF) break; } else break; } return ch; } static boolean treeNeedCh (FILE *fp, int c1, char *where) { int c2; if ((c2 = treeGetCh(fp)) == c1) return TRUE; printf("ERROR: Expecting '%c' %s tree; found:", c1, where); if (c2 == EOF) { printf("End-of-File"); } else { ungetc(c2, fp); treeEchoContext(fp, stdout, 40); } putchar('\n'); if(c1 == ':') printf("RAxML may be expecting to read a tree that contains branch lengths\n"); return FALSE; } static boolean addElementLen (FILE *fp, All *tr, nodeptr p, boolean readBranchLengths, boolean readNodeLabels, int *lcount) { nodeptr q; int n, ch, fres; if ((ch = treeGetCh(fp)) == '(') { n = (tr->nextnode)++; if (n > 2*(tr->mxtips) - 2) { if (tr->rooted || n > 2*(tr->mxtips) - 1) { printf("ERROR: Too many internal nodes. Is tree rooted?\n"); printf(" Deepest splitting should be a trifurcation.\n"); return FALSE; } else { assert(NOT readNodeLabels); tr->rooted = TRUE; } } q = tr->nodep[n]; if (NOT addElementLen(fp, tr, q->next, readBranchLengths, readNodeLabels, lcount)) return FALSE; if (NOT treeNeedCh(fp, ',', "in")) return FALSE; if (NOT addElementLen(fp, tr, q->next->next, readBranchLengths, readNodeLabels, lcount)) return FALSE; if (NOT treeNeedCh(fp, ')', "in")) return FALSE; if(readNodeLabels) { char label[64]; int support; if(treeGetLabel (fp, label, 10)) { int val = sscanf(label, "%d", &support); assert(val == 1); if(val != 1 ) PR("error\n"); /*printf("LABEL %s Number %d\n", label, support);*/ p->support = q->support = support; /*printf("%d %d %d %d\n", p->support, q->support, p->number, q->number);*/ assert(p->number > tr->mxtips && q->number > tr->mxtips); *lcount = *lcount + 1; } } else (void) treeFlushLabel(fp); } else { ungetc(ch, fp); if ((n = treeFindTipName(fp, tr)) <= 0) return FALSE; q = tr->nodep[n]; if (tr->start->number > n) tr->start = q; (tr->ntips)++; } if(readBranchLengths) { double branch; if (NOT treeNeedCh(fp, ':', "in")) return FALSE; if (NOT treeProcessLength(fp, &branch)) return FALSE; /* printf("Branch %8.20f %d\n", branch, tr->numBranches); */ hookup(p, q, &branch, tr->numBranches); /* PR(">>> %d\n", tr->numBranches); */ } else { fres = treeFlushLen(fp); if(NOT fres) return FALSE; hookupDefault(p, q, tr->numBranches); } return TRUE; } int getTreeStringLength(char *fileName) { int i = 0; char cc; FILE *f = myfopen(fileName,"r"); while((cc=getc(f))!='\n') i++; fclose(f); return i; } int treeReadLen (FILE *fp, All *tr, boolean readBranches, boolean readNodeLabels, boolean topologyOnly, boolean completeTree) { nodeptr p; int i, ch, lcount = 0; for (i = 1; i <= tr->mxtips; i++) { tr->nodep[i]->back = (nodeptr) NULL; if(topologyOnly) tr->nodep[i]->support = -1; } for(i = tr->mxtips + 1; i < 2 * tr->mxtips; i++) { tr->nodep[i]->back = (nodeptr)NULL; tr->nodep[i]->next->back = (nodeptr)NULL; tr->nodep[i]->next->next->back = (nodeptr)NULL; tr->nodep[i]->number = i; tr->nodep[i]->next->number = i; tr->nodep[i]->next->next->number = i; if(topologyOnly) { tr->nodep[i]->support = -2; tr->nodep[i]->next->support = -2; tr->nodep[i]->next->next->support = -2; } } if(topologyOnly) tr->start = tr->nodep[tr->mxtips]; else tr->start = tr->nodep[1]; tr->ntips = 0; tr->nextnode = tr->mxtips + 1; for(i = 0; i < tr->numBranches; i++) tr->partitionSmoothed[i] = FALSE; tr->rooted = FALSE; p = tr->nodep[(tr->nextnode)++]; while((ch = treeGetCh(fp)) != '('); if(NOT topologyOnly) assert(readBranches == FALSE && readNodeLabels == FALSE); if (NOT addElementLen(fp, tr, p, readBranches, readNodeLabels, &lcount)) assert(0); if (NOT treeNeedCh(fp, ',', "in")) assert(0); if (NOT addElementLen(fp, tr, p->next, readBranches, readNodeLabels, &lcount)) assert(0); if (NOT tr->rooted) { if ((ch = treeGetCh(fp)) == ',') { if (NOT addElementLen(fp, tr, p->next->next, readBranches, readNodeLabels, &lcount)) assert(0); } else { /* A rooted format */ tr->rooted = TRUE; if (ch != EOF) (void) ungetc(ch, fp); } } else { p->next->next->back = (nodeptr) NULL; } if (NOT treeNeedCh(fp, ')', "in")) assert(0); if(topologyOnly) assert(NOT(tr->rooted && readNodeLabels)); (void) treeFlushLabel(fp); if (NOT treeFlushLen(fp)) assert(0); if (NOT treeNeedCh(fp, ';', "at end of")) assert(0); if (tr->rooted) { assert(NOT readNodeLabels); p->next->next->back = (nodeptr) NULL; tr->start = uprootTree(tr, p->next->next, FALSE, FALSE); if (NOT tr->start) { printf("FATAL ERROR UPROOTING TREE\n"); assert(0); } } else tr->start = findAnyTip(p, tr->mxtips); if(NOT topologyOnly) { if(tr->ntips < tr->mxtips) { if(completeTree) { printBothOpen("Hello this is your friendly RAxML tree parsing routine\n"); printBothOpen("The RAxML option you are uisng requires to read in only complete trees\n"); printBothOpen("with %d taxa, there is at least one tree with %d taxa though ... exiting\n", tr->mxtips, tr->ntips); exit(-1); } } } return lcount; } void getxnode (nodeptr p) { nodeptr s; if ((s = p->next)->x || (s = s->next)->x) { p->x = s->x; s->x = 0; } assert(p->x); } entry *initEntry(void) { entry *e = (entry*)CALLOC(1,sizeof(entry)); e->bitVector = (unsigned int*)NULL; e->treeVector = (unsigned int*)NULL; e->supportVector = (int*)NULL; e->bipNumber = 0; e->bipNumber2 = 0; e->supportFromTreeset[0] = 0; e->supportFromTreeset[1] = 0; e->next = (entry*)NULL; return e; } static void insertHashAll(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int treeNumber, unsigned int position) { if(h->table[position] != NULL) { entry *e = h->table[position]; do { unsigned int i; for(i = 0; i < vectorLength; i++) if(bitVector[i] != e->bitVector[i]) break; if(i == vectorLength) { if(treeNumber == 0) e->bipNumber = e->bipNumber + 1; else e->bipNumber2 = e->bipNumber2 + 1; return; } e = e->next; } while(e != (entry*)NULL); e = initEntry(); e->bitVector = (unsigned int*)CALLOC(vectorLength, sizeof(unsigned int)); /* e->bitVector = (unsigned int*)malloc_aligned(vectorLength * sizeof(unsigned int)); */ memset(e->bitVector, 0, vectorLength * sizeof(unsigned int)); memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength); if(treeNumber == 0) e->bipNumber = 1; else e->bipNumber2 = 1; e->next = h->table[position]; h->table[position] = e; } else { entry *e = initEntry(); e->bitVector = (unsigned int*)CALLOC(vectorLength, sizeof(unsigned int)); /* e->bitVector = (unsigned int*)malloc_aligned(vectorLength * sizeof(unsigned int)); */ memset(e->bitVector, 0, vectorLength * sizeof(unsigned int)); memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength); if(treeNumber == 0) e->bipNumber = 1; else e->bipNumber2 = 1; h->table[position] = e; } h->entryCount = h->entryCount + 1; } static void insertHash(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int bipNumber, unsigned int position) { entry *e = initEntry(); e->bipNumber = bipNumber; /*e->bitVector = (unsigned int*)CALLOC(vectorLength, sizeof(unsigned int)); */ e->bitVector = (unsigned int*)CALLOC(vectorLength , sizeof(unsigned int)); memset(e->bitVector, 0, vectorLength * sizeof(unsigned int)); memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength); if(h->table[position] != NULL) { e->next = h->table[position]; h->table[position] = e; } else h->table[position] = e; h->entryCount = h->entryCount + 1; } static int countHash(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, unsigned int position) { if(h->table[position] == NULL) return -1; { entry *e = h->table[position]; do { unsigned int i; for(i = 0; i < vectorLength; i++) if(bitVector[i] != e->bitVector[i]) goto NEXT; return (e->bipNumber); NEXT: e = e->next; } while(e != (entry*)NULL); return -1; } } static void insertHashBootstop(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int treeNumber, int treeVectorLength, unsigned int position) { if(h->table[position] != NULL) { entry *e = h->table[position]; do { unsigned int i; for(i = 0; i < vectorLength; i++) if(bitVector[i] != e->bitVector[i]) break; if(i == vectorLength) { e->treeVector[treeNumber / MASK_LENGTH] |= mask32[treeNumber % MASK_LENGTH]; return; } e = e->next; } while(e != (entry*)NULL); e = initEntry(); e->bipNumber = h->entryCount; /*e->bitVector = (unsigned int*)CALLOC(vectorLength, sizeof(unsigned int));*/ e->bitVector = (unsigned int*)CALLOC(vectorLength, sizeof(unsigned int)); memset(e->bitVector, 0, vectorLength * sizeof(unsigned int)); e->treeVector = (unsigned int*)CALLOC(treeVectorLength, sizeof(unsigned int)); e->treeVector[treeNumber / MASK_LENGTH] |= mask32[treeNumber % MASK_LENGTH]; memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength); e->next = h->table[position]; h->table[position] = e; } else { entry *e = initEntry(); e->bipNumber = h->entryCount; e->bitVector = (unsigned int*)CALLOC(vectorLength , sizeof(unsigned int)); memset(e->bitVector, 0, vectorLength * sizeof(unsigned int)); e->treeVector = (unsigned int*)CALLOC(treeVectorLength, sizeof(unsigned int)); e->treeVector[treeNumber / MASK_LENGTH] |= mask32[treeNumber % MASK_LENGTH]; memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength); h->table[position] = e; } h->entryCount = h->entryCount + 1; } FILE *getNumberOfTrees(All *tr, char *fileName) { FILE *f = myfopen(fileName, "r"); int trees = 0, ch; while((ch = fgetc(f)) != EOF) if(ch == ';') trees++; assert(trees > 0); tr->numberOfTrees = trees; rewind(f); return f; } /* INTERFACE TO OUTSIDE WOLRD */ void readBestTree(All *tr, FILE *file) { treeReadLen(file, tr, TRUE, FALSE, TRUE, TRUE); } void readBootstrapTree(All *tr, FILE *file) { treeReadLen(file, tr, FALSE, FALSE, TRUE, TRUE); } char *writeTreeToString(All *tr, boolean printBranchLengths) { Tree2String(tr->tree_string, tr, tr->start->back, /* */ printBranchLengths, TRUE, FALSE, FALSE, TRUE, 0, FALSE, FALSE); return tr->tree_string; } void freeTree(All *tr) { int i; for(i = 1; i <= tr->mxtips; ++i) free(tr->nameList[i]); free(tr->nameList); for(i = 0; i < tr->nameHash->tableSize; ++i) { stringEntry *elem = tr->nameHash->table[i]; while(elem) { stringEntry *iter = elem->next; free(elem->word); free(elem); elem = iter; } } free(tr->nameHash->table); free(tr->nameHash); /* FOR_0_LIMIT(i,((tr->mxtips-1) )) */ /* free(tr->nodep[i]); */ free(tr->nodep); free(tr); } /* END OF INTERFACE */ RogueNaRok-1.0/Tree.h000066400000000000000000000041771165154717300144610ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef TREES_H #define TREES_H #include #include #include #include #include "common.h" #include "legacy.h" extern unsigned int *mask32; boolean isTip(int number, int maxTips); char *writeTreeToString(All *tr, boolean printBranchLengths); void readTree(char *fileName); boolean setupTree (All *tr, char *bootstrapTrees); void readBestTree(All *tr, FILE *file); void readBootstrapTree(All *tr, FILE *file); void hookupDefault (nodeptr p, nodeptr q, int numBranches); nodeptr findAnyTip(nodeptr p, int numsp); int treeFindTipByLabelString(char *str, All *tr); int getTreeStringLength(char *fileName); FILE *getNumberOfTrees(All *tr, char *fileName); void freeTree(All *tr); #endif RogueNaRok-1.0/common.c000066400000000000000000000117561165154717300150460ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include "common.h" #define ALLOW_OVERWRITE_INFO_FILE TRUE extern void printHelpFile(); /* GLOBAL VARIABLES =( */ char *infoFileName = "", *programName = "", *programVersion = "", *programReleaseDate ="", run_id[128] = "", workdir[1024] = ""; double updateTime(double* time) { double tmp = *time; return (*time = gettime()) - tmp; } int wrapStrToL(char *string) { int errno = 0; char *p; int result = strtol(string, &p, 10); if (errno != 0 || *p != 0 || p == string) { printf("Something went wrong parsing a number."); exit(-1); } return result; } double wrapStrToDouble(char *string) { int errno = 0; char *p; double result = strtod(string, &p); if (errno != 0 || *p != 0 || p == string) { printf("Something went wrong parsing a number."); exit(-1); } return result; } int filexists(char *filename) { FILE *fp; int res; fp = fopen(filename,"rb"); if(fp) { res = 1; fclose(fp); } else res = 0; return res; } void printVersionInfo(boolean toInfoFile) { if(toInfoFile) PR("\nThis is %s version %s released by Andre J. Aberer in %s.\n\n", programName, programVersion, programReleaseDate); else printf("\nThis is %s version %s released by Andre J. Aberer in %s.\n\n", programName, programVersion, programReleaseDate); } FILE *myfopen(const char *path, const char *mode) { FILE *fp = fopen(path, mode); if(strcmp(mode,"r") == 0 || strcmp(mode,"rb") == 0) { if(fp) return fp; else { if(processID == 0) printf("The file %s you want to open for reading does not exist, exiting ...\n", path); assert(0); exit(-1); return (FILE *)NULL; } } else { if(fp) return fp; else { if(processID == 0) printf("The file %s RAxML wants to open for writing or appending can not be opened [mode: %s], exiting ...\n", path, mode); exit(-1); return (FILE *)NULL; } } } void setupInfoFile() { char *result = CALLOC(1024, sizeof(char)); strcpy(result, workdir); strcat(result, programName); strcat(result, "_info"); strcat(result, "."); strcat(result, run_id); if( NOT ALLOW_OVERWRITE_INFO_FILE && filexists(result)) { printf("The run-id >%s< you specified already exists. Aborting in order to protect the output of this previous run.\n", run_id); exit(-1); } FILE *tmp = myfopen(result, "w"); fclose(tmp); infoFileName = result; printVersionInfo(TRUE); } char *lowerTheString(char *string) { int i, stringLength = strlen(string); char *result = CALLOC(stringLength, sizeof(char)); FOR_0_LIMIT(i,stringLength) result[i] = tolower(string[i]); return result; } FILE *getOutputFileFromString(char *fileName) { char result[1024]; strcpy(result, workdir); strcat(result, programName); strcat(result, "_"); strcat(result, fileName); strcat(result, "."); strcat(result, run_id); return myfopen(result, "w"); } double gettime(void) { #ifdef WIN32 time_t tp; struct tm localtm; tp = time(NULL); localtm = *localtime(&tp); return 60.0*localtm.tm_min + localtm.tm_sec; #else struct timeval ttime; gettimeofday(&ttime , NULL); return ttime.tv_sec + ttime.tv_usec * 0.000001; #endif } void printBothOpen(const char* format, ... ) { FILE *f = myfopen(infoFileName, "ab"); va_list args; va_start(args, format); vfprintf(f, format, args ); va_end(args); va_start(args, format); vprintf(format, args ); va_end(args); fclose(f); } RogueNaRok-1.0/common.h000066400000000000000000000056131165154717300150460ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef COMMON_H #define COMMON_H #include #include #include #include #include #include #include #ifdef WIN32 #include #endif #ifndef WIN32 #include #include #include #include #include #endif typedef char boolean; /* GENERAL */ #define NOT ! #define TRUE 1 #define FALSE 0 #define CALLOC(num, size) calloc(num, size) #define ABS(x) (((x)<0) ? (-(x)) : (x)) #define FOR_0_LIMIT(iter,limit) for(iter=0;iter < (limit); iter++) #define FOR_N_LIMIT(iter,n,limit) for(iter=(n); iter < (limit); iter++) #define PR printBothOpen #define SWAP(a, b) (((a) ^= (b)), ((b) ^= (a)), ((a) ^= (b))) #define USE_UPPER_TRIANGLE_LSI(matrix,a,b,c) (matrix[a][b][(c)-(b)]) #define USE_UPPER_TRIANGLE_TII(matrix,x,y) (matrix[(x)][(y-x)]) /* assumes that x < y */ #define MIN(a,b) (((a) > (b)) ? (b) : (a)) #define MAX(a,b) (((a) < (b)) ? (b) : (a)) #define GET_FROM_UPPER_TRIANGLE(matrix,a,b,c) ((btableSize; i++) { if(h->table[i] != NULL) { entry *e = h->table[i]; entry *previous; do { previous = e; e = e->next; if(previous->bitVector) free(previous->bitVector); if(previous->treeVector) free(previous->treeVector); if(previous->supportVector) free(previous->supportVector); free(previous); entryCount++; } while(e != NULL); } } assert(entryCount == h->entryCount); free(h->table); } hashtable *initHashTable(unsigned int n) { static const unsigned int initTable[] = {64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648U}; hashtable *h = (hashtable*)CALLOC(1,sizeof(hashtable)); unsigned int tableSize, i, #ifndef NDEBUG maxSize = (unsigned int)-1, #endif primeTableLength = sizeof(initTable)/sizeof(initTable[0]); #ifndef NDEBUG assert(n <= maxSize); #endif i = 0; while(initTable[i] < n && i < primeTableLength) i++; assert(i < primeTableLength); tableSize = initTable[i]; /* printf("Hash table init with size %u\n", tableSize); */ h->table = (entry**)CALLOC(tableSize, sizeof(entry*)); h->tableSize = tableSize; h->entryCount = 0; return h; } BitVector **initBitVector(All *tr, BitVector *vectorLength) { BitVector **bitVectors = (BitVector **)CALLOC(2 * tr->mxtips, sizeof(BitVector*)); int i; if(tr->mxtips % MASK_LENGTH == 0) *vectorLength = tr->mxtips / MASK_LENGTH; else *vectorLength = 1 + (tr->mxtips / MASK_LENGTH); for(i = 1; i <= tr->mxtips; i++) { bitVectors[i] = (BitVector *)CALLOC(*vectorLength, sizeof(BitVector)); bitVectors[i][(i - 1) / MASK_LENGTH] |= mask32[(i - 1) % MASK_LENGTH]; } for(i = tr->mxtips + 1; i < 2 * tr->mxtips; i++) bitVectors[i] = (BitVector *)CALLOC(*vectorLength, sizeof(BitVector)); return bitVectors; } ProfileElem *addProfileElem(entry *helem, int vectorLength, int treeVectorLength, int numberOfTrees) { ProfileElem *result = CALLOC(1,sizeof(ProfileElem)); result->isInMLTree = FALSE; result->bitVector = CALLOC(vectorLength, sizeof(BitVector)); result->treeVector = CALLOC(treeVectorLength, sizeof(BitVector)); result->bitVector = memcpy(result->bitVector, helem->bitVector, vectorLength * sizeof(BitVector)); result->treeVector = memcpy(result->treeVector, helem->treeVector, treeVectorLength * sizeof(BitVector)); if(NTH_BIT_IS_SET(result->treeVector, numberOfTrees)) { result->isInMLTree = TRUE; UNFLIP_NTH_BIT(result->treeVector, numberOfTrees); } result->treeVectorSupport = genericBitCount(result->treeVector, treeVectorLength); return result; } RogueNaRok-1.0/legacy.h000066400000000000000000000101211165154717300150100ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef LEGACY_H #define LEGACY_H #include "common.h" #include "BitVector.h" #include "List.h" #include "Array.h" #include "ProfileElem.h" #define NUM_BRANCHES 128 typedef struct ent { unsigned int *bitVector; unsigned int *treeVector; unsigned int amountTips; int *supportVector; unsigned int bipNumber; unsigned int bipNumber2; unsigned int supportFromTreeset[2]; struct ent *next; } entry; typedef struct { unsigned int *vector; int support; struct noderec *oP; struct noderec *oQ; } branchInfo; typedef struct noderec { unsigned int isPresent[NUM_BRANCHES / MASK_LENGTH]; struct noderec *backs[NUM_BRANCHES]; char xs[NUM_BRANCHES]; branchInfo *bInf; double z[NUM_BRANCHES]; struct noderec *next; struct noderec *back; unsigned int hash; int support; int number; char x; double **insertionLWs; } node, *nodeptr; typedef struct stringEnt { int nodeNumber; char *word; struct stringEnt *next; } stringEntry ; typedef struct { unsigned int tableSize; stringEntry **table; } stringHashtable; typedef struct _All { nodeptr start; int mxtips; int numberOfTrees; int bitVectorLength; nodeptr *nodep; int ntips; int nextnode; int numBranches; boolean partitionSmoothed[NUM_BRANCHES]; boolean rooted; stringHashtable *nameHash; boolean grouped; int *constraintVector; char **nameList; char *tree_string; int treeStringLength; double fracchange; } All; typedef struct { unsigned int tableSize; entry **table; unsigned int entryCount; } hashtable; #define FC_INIT 20 #define zmin 1.0E-15 /* max branch prop. to -log(zmin) (= 34) */ #define NO_BRANCHES -1 #define BIPARTITIONS_ALL 0 #define GET_BIPARTITIONS_BEST 1 #define DRAW_BIPARTITIONS_BEST 2 #define BIPARTITIONS_BOOTSTOP 3 #define BIPARTITIONS_RF 4 #define defaultz 0.9 /* value of z assigned as starting point */ #define nmlngth 1024 /* number of characters in species name */ #define VECTOR_LENGTH (NUM_BRANCHES / MASK_LENGTH) void bitVectorInitravSpecial(unsigned int **bitVectors, nodeptr p, int numsp, unsigned int vectorLength, hashtable *h, int treeNumber, int function, branchInfo *bInf, int *countBranches, int treeVectorLength, boolean traverseOnly, boolean computeWRF); hashtable *initHashTable(unsigned int n); void freeHashTable(hashtable *h); ProfileElem *addProfileElem(entry *helem, int vectorLength, int treeVectorLength, int numberOfTrees) ; BitVector *neglectThoseTaxa(All *tr, char *toDrop); void pruneTaxon(All *tr, unsigned int k); BitVector **initBitVector(All *tr, BitVector *vectorLength); #endif RogueNaRok-1.0/newFunctions.c000066400000000000000000000127451165154717300162370ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include "newFunctions.h" IndexList *parseToDrop(All *tr, FILE *toDrop) { IndexList *result = NULL; char line[1024]; while(fgets(line, 1024, toDrop) != NULL) { char bla[1024]; sscanf(line, "%s\n", bla); PR("will drop %d\n", treeFindTipByLabelString(bla, tr)); result = appendToIndexList(treeFindTipByLabelString(bla, tr), result); } return result; } void pruneTaxon(All *tr, unsigned int k) { assert(k > 0 && k <= ((unsigned int)(tr->mxtips))); nodeptr p = tr->nodep[k], q = p->back, q1 = q->next->back, q2 = q->next->next->back; hookupDefault(q1, q2, tr->numBranches); tr->start = findAnyTip(q1, tr->mxtips); assert(p != tr->start && q != tr->start); } BitVector *neglectThoseTaxa(All *tr, char *toDrop) { int i = 0; BitVector *result = CALLOC(tr->bitVectorLength, sizeof(BitVector)); for(i = 0; i < tr->mxtips; i++) FLIP_NTH_BIT(result,i); if(strlen(toDrop) == 0) return result; FILE *toDropFile = myfopen(toDrop,"r"); IndexList *iter, *toConvert = parseToDrop(tr, toDropFile); for(iter = toConvert; iter; iter = iter->next) { UNFLIP_NTH_BIT(result, (iter->index -1) ); assert( NOT NTH_BIT_IS_SET(result, iter->index -1 )); } freeIndexList(toConvert); return result; } Array *getOriginalBipArray(All *tr, FILE *bestTree, FILE *treeFile) { Array *result = CALLOC(1, sizeof(Array)); int i,j,bCount = 0, treeVectorLength = GET_BITVECTOR_LENGTH((tr->numberOfTrees+1)); unsigned int vectorLength = 0, **setBitVectors = initBitVector(tr, &vectorLength); hashtable *setHtable = initHashTable(tr->mxtips * FC_INIT * 10); nodeptr commonStart = NULL; BitVector lastByte = 0; for(i = tr->mxtips; i < MASK_LENGTH * vectorLength; ++i) lastByte |= mask32[i % MASK_LENGTH]; unsigned int *randForTaxa = CALLOC(tr->mxtips, sizeof(unsigned int)); for(i = 0; i < tr->mxtips; ++i) randForTaxa[i] = rand(); rewind(treeFile); if(bestTree) rewind(bestTree); /* get bipartitions of bootstrap set */ for( i = 1; i <= tr->numberOfTrees; ++i) { readBootstrapTree(tr, treeFile); if( NOT commonStart) commonStart = tr->start; bCount = 0; bitVectorInitravSpecial(setBitVectors, commonStart->back, tr->mxtips, vectorLength, setHtable, (i - 1), BIPARTITIONS_BOOTSTOP, (branchInfo *)NULL, &bCount, treeVectorLength, FALSE, FALSE); } if(bestTree) { readBestTree(tr,bestTree); bCount = 0; bitVectorInitravSpecial(setBitVectors, commonStart->back, tr->mxtips, vectorLength, setHtable, tr->numberOfTrees, BIPARTITIONS_BOOTSTOP, (branchInfo *)NULL, &bCount, treeVectorLength, FALSE, FALSE); assert(bCount == tr->mxtips - 3); } result->commonAttributes = CALLOC(1,sizeof(ProfileElemAttr)); ((ProfileElemAttr*)result->commonAttributes)->bitVectorLength = vectorLength; ((ProfileElemAttr*)result->commonAttributes)->treeVectorLength = treeVectorLength; ((ProfileElemAttr*)result->commonAttributes)->lastByte = lastByte; ((ProfileElemAttr*)result->commonAttributes)->randForTaxa = randForTaxa; result->length = setHtable->entryCount; result->arrayTable = CALLOC(result->length, sizeof(ProfileElem*)); j = 0; for(i = 0; i < setHtable->tableSize; ++i) { entry *elem = setHtable->table[i]; while(elem) { ((ProfileElem**)result->arrayTable)[j] = addProfileElem(elem, vectorLength, treeVectorLength, tr->numberOfTrees); j++; elem = elem->next; } } assert(j == setHtable->entryCount); int cnt= 0; for(i = 0; i < result->length; ++i) if(((ProfileElem**)result->arrayTable)[i]->isInMLTree) cnt++; if(bestTree) assert(cnt == tr->mxtips - 3); freeHashTable(setHtable); freeBitVectors(setBitVectors, 2 * tr->mxtips); free(setBitVectors); free(setHtable); free(randForTaxa); /* TEST */ for(i = 0; i < result->length; ++i ) ((ProfileElem**)result->arrayTable)[i]->id = i; /* END */ return result; } RogueNaRok-1.0/newFunctions.h000066400000000000000000000034301165154717300162330ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef NEW_FUNCTIONS_H #define NEW_FUNCTIONS_H #include "List.h" #include "common.h" #include "legacy.h" #include "Tree.h" #include "ProfileElem.h" IndexList *parseToDrop(All *tr, FILE *toDrop); void pruneTaxon(All *tr, unsigned int k); BitVector *neglectThoseTaxa(All *tr, char *toDrop); Array *getOriginalBipArray(All *tr, FILE *bestTree, FILE *treeFile) ; #endif RogueNaRok-1.0/parallel.c000066400000000000000000000171051165154717300153440ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifdef PARALLEL #include #include #include "common.h" #include "ProfileElem.h" #include "Dropset.h" #include "parallel.h" void findCandidatesForBip(HashTable *mergingHash, ProfileElem *elemA, boolean firstMerge, Array *bipartitionsById, Array *bipartitionProfile, int* indexByNumberBits); void combineEventsForOneDropset(Array *allDropsets, Dropset *refDropset, Array *bipartitionsById); int getSupportOfMRETree(Array *bipartitionsById, Dropset *dropset); void evaluateDropset(HashTable *mergingHash, Dropset *dropset,Array *bipartitionsById, List *consensusBipsCanVanish ); extern int cumScore; #ifndef PORTABLE_PTHREADS void pinToCore(int tid) { cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET( tid, &cpuset); if(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0) { printBothOpen("\n\nThere was a problem finding a physical core for thread number %d to run on.\n", tid); printBothOpen("Probably this happend because you are trying to run more threads than you have cores available,\n"); printBothOpen("which is a thing you should never ever do again, good bye .... \n\n"); assert(0); } } #endif void execFunction(parallelArguments *pArgs, int tid, int n) { int currentJob = threadJob >> 16; switch(currentJob) { case THREAD_GET_EVENTS: { HashTable *mergingHash = pArgs->mergingHash; BitVector *candidateBips = pArgs->candidateBips; Array *bipartitionsById = pArgs->bipartitionsById; Array *bipartitionProfile = pArgs->bipartitionProfile; int *indexByNumberBits = pArgs->indexByNumberBits; boolean firstMerge = pArgs->firstMerge ; boolean done = FALSE ; while (NOT done ) { int jobId = bipartitionProfile->length; ProfileElem *jobElem = NULL ; pthread_mutex_lock(&mutex); do { jobId = bipartitionProfile->length - numberOfJobs; numberOfJobs--; } while(numberOfJobs > 0 && NOT NTH_BIT_IS_SET(candidateBips, jobId)); done = (numberOfJobs <= 0); pthread_mutex_unlock(&mutex); if( bipartitionProfile->length > jobId && (jobElem = GET_PROFILE_ELEM(bipartitionsById, jobId)) ) { assert(jobElem); findCandidatesForBip(mergingHash, jobElem, firstMerge , bipartitionsById, bipartitionProfile, indexByNumberBits); } } break; } case THREAD_COMBINE_EVENTS: { Array *allDropsets = globalPArgs->allDropsets; Array *bipartitionsById = globalPArgs->bipartitionsById; int jobId = allDropsets->length; boolean done = FALSE ; while (NOT done ) { pthread_mutex_lock(&mutex); jobId = allDropsets->length - numberOfJobs; numberOfJobs--; done = (numberOfJobs <= 0); pthread_mutex_unlock(&mutex); if( allDropsets->length > jobId ) combineEventsForOneDropset(allDropsets, GET_DROPSET_ELEM(allDropsets,jobId), bipartitionsById); } break; } case THREAD_MRE: { Array *allDropsets = globalPArgs->allDropsets, *bipartitionsById = globalPArgs->bipartitionsById ; int jobId = allDropsets->length; boolean done = FALSE ; while (NOT done ) { pthread_mutex_lock(&mutex); jobId = allDropsets->length - numberOfJobs; numberOfJobs--; done = (numberOfJobs <= 0); pthread_mutex_unlock(&mutex); if( allDropsets->length > jobId ) { Dropset *dropset = GET_DROPSET_ELEM(allDropsets,jobId); int newSup = getSupportOfMRETree(bipartitionsById, dropset); dropset->improvement = newSup - cumScore; } } break; } case THREAD_EVALUATE_EVENTS: { boolean done = FALSE; int jobId = globalPArgs->allDropsets->length; while (NOT done ) { pthread_mutex_lock(&mutex); jobId = globalPArgs->allDropsets->length - numberOfJobs; numberOfJobs--; done = (numberOfJobs <= 0); pthread_mutex_unlock(&mutex); if(globalPArgs->allDropsets->length > jobId) { Dropset *dropset = GET_DROPSET_ELEM(globalPArgs->allDropsets, jobId); evaluateDropset(globalPArgs->mergingHash, dropset, globalPArgs->bipartitionsById, globalPArgs->consensusBipsCanVanish); } } break; } default: printf("Job %d\n", currentJob); assert(0); } } void *workerThreadWait(void *tData) { threadData *td = (threadData*)tData; parallelArguments *pArgs = td->pArgs; int myCycle = 0; const int n = numberOfThreads, tid = td->threadNumber; #ifndef PORTABLE_PTHREADS pinToCore(tid); #endif printf("This is worker thread number: %d\n", tid); while(1) { while (myCycle == threadJob) ; myCycle = threadJob; execFunction(pArgs, tid, n); barrierBuffer[tid] = 1; } return (void*)NULL; } void startThreads() { pthread_t *threads; pthread_attr_t attr; int rc, t; threadData *tData; jobCycle = 0; threadJob = 0; #ifndef PORTABLE_PTHREADS pinToCore(0); #endif printf("\nThis is the master thread\n"); pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); pthread_mutex_init(&mutex , (pthread_mutexattr_t *)NULL); threads = (pthread_t *)CALLOC(numberOfThreads , sizeof(pthread_t)); tData = (threadData *)CALLOC(numberOfThreads , sizeof(threadData)); barrierBuffer = (volatile char *)CALLOC(numberOfThreads, sizeof(volatile char)); for(t = 0; t < numberOfThreads; t++) barrierBuffer[t] = 0; for(t = 1; t < numberOfThreads; t++) { tData[t].pArgs = globalPArgs; tData[t].threadNumber = t; rc = pthread_create(&threads[t], &attr, workerThreadWait, (void *)(&tData[t])); if(rc) { printf("ERROR; return code from pthread_create() is %d\n", rc); exit(-1); } } } void masterBarrier(int jobType, parallelArguments *pArgs) { const int n = numberOfThreads; int i, sum; jobCycle = NOT jobCycle; threadJob = (jobType << 16) + jobCycle; execFunction(pArgs, 0, n); do { for(i = 1, sum = 1; i < n; i++) sum += barrierBuffer[i]; } while(sum < n); for(i = 1; i < n; i++) barrierBuffer[i] = 0; } #else #endif RogueNaRok-1.0/parallel.h000066400000000000000000000045451165154717300153550ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef PARALLEL_H #define PARALLEL_H #include "HashTable.h" extern volatile int numberOfThreads; extern volatile int numberOfJobs; extern volatile int jobCycle; extern volatile int threadJob; extern volatile char *barrierBuffer; extern pthread_mutex_t mutex; #define THREAD_GET_EVENTS 1 #define THREAD_COMBINE_EVENTS 2 #define THREAD_MRE 3 #define THREAD_EVALUATE_EVENTS 4 typedef struct _parArgs { HashTable *mergingHash; BitVector *candidateBips; Array *bipartitionsById; Array *bipartitionProfile; int *indexByNumberBits; boolean firstMerge; Array *allDropsets; List *consensusBipsCanVanish; } parallelArguments ; parallelArguments *globalPArgs; typedef struct { parallelArguments *pArgs; int threadNumber; } threadData; void masterBarrier(int jobType, parallelArguments *pArgs); void startThreads(); void *workerThreadWait(void *tData); void execFunction(parallelArguments *pArgs, int tid, int n); #endif RogueNaRok-1.0/rnr-lsi.c000066400000000000000000000241661165154717300151430ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include #include #include #include #include #include #include #include #include "common.h" #include "sharedVariables.h" #include "newFunctions.h" #include "legacy.h" #include "Tree.h" #include "BitVector.h" #define PROG_NAME "RnR-lsi" #define PROG_VERSION "1.0" #define PROG_RELEASE_DATE "2011-10-25" #define FLIP_NTH_BIT(bitVector,n) (bitVector[(n) / MASK_LENGTH] |= mask32[ (n) % MASK_LENGTH ]) #define NTH_BIT_IS_SET(bitVector,n) (bitVector[(n) / MASK_LENGTH] & mask32[(n) % MASK_LENGTH]) #define NUMBER_TO_PRUNE(x) ((x) / 2) #define SHORT_UNSIGNED_MAX 65535 typedef unsigned short int smallNumber; extern void freeIndexList(IndexList *list); typedef struct _scorePerTaxon { int taxonId; double mrConsensus; double strictConsensus; double bestTreeScore; double leafStab; double taxonInstability; } ScorePerTaxon; void printQuadruples(All *tr, smallNumber ***quadruples) { int i,j,k; for(i = 0; i < tr->mxtips; ++i) { for(j = 0; j < tr->mxtips; ++j) { for(k = 0; k < tr->mxtips; ++k) { if(quadruples[i][j][k]) printf("%s|%s,%s\t%d\n", tr->nameList[i+1], tr->nameList[j+1], tr->nameList[k+1], quadruples[i][j][k]); } } } } IndexList *extractQuadruplesRecursively(All *tr, nodeptr node, smallNumber ***quadruples, BitVector *neglectThose, boolean isStart) { IndexList *leftList, *rightList, *iterA, *iterB, *iterC; if( NOT isTip(node->number, tr->mxtips) || isStart) { nodeptr leftNode = (isStart) ? node->back->next->back : node->next->back, rightNode = (isStart) ? node->back->next->next->back : node->next->next->back; leftList = extractQuadruplesRecursively(tr, leftNode, quadruples, neglectThose, FALSE); rightList = extractQuadruplesRecursively(tr, rightNode, quadruples, neglectThose,FALSE); } else { IndexList *elem = CALLOC(1,sizeof(IndexList)); elem->index = node->number; elem->next = NULL; return elem; } /* insert the information */ for(iterA = leftList; iterA; iterA = iterA->next) { for(iterB = rightList; iterB; iterB = iterB->next) { for(iterC = iterB->next; iterC; iterC = iterC->next) { int indexB = iterB->index, indexC = iterC->index; if(indexB > indexC) SWAP(indexB,indexC); USE_UPPER_TRIANGLE_LSI(quadruples, (iterA->index-1), (indexB-1), (indexC-1))++; } } } for(iterA = rightList; iterA; iterA = iterA->next) { for(iterB = leftList; iterB; iterB = iterB->next) { for(iterC = iterB->next; iterC; iterC = iterC->next) { int indexB = iterB->index, indexC = iterC->index; if(indexB > indexC) SWAP(indexB,indexC); USE_UPPER_TRIANGLE_LSI(quadruples, (iterA->index-1), (indexB-1), (indexC-1))++; } } } for(iterA = leftList; iterA; iterA = iterA->next) if( NOT iterA->next) { iterA->next = rightList; break; } return leftList; } void freeQuads(All *tr, smallNumber ***quads) { int i,j; for(i = 0; i < tr->mxtips; ++i) { for(j = 0; j < tr->mxtips; ++j) free(quads[i][j]); free(quads[i]); } free(quads); } smallNumber ***initQuads(All *tr) { int i, j; smallNumber ***result = CALLOC(tr->mxtips, sizeof(smallNumber**)); for(i = 0; i < tr->mxtips; ++i) { result[i] = CALLOC(tr->mxtips, sizeof(smallNumber*)); for(j = 0 ; j < tr->mxtips; ++j) result[i][j] = CALLOC(tr->mxtips - j, sizeof(smallNumber)); } return result; } int intcmp(const void *a, const void *b) { return *(int *)b - *(int *)a; } void calculateLeafStability(All *tr, char *bootstrapFileName, char *excludeFileName) { FILE *outf = getOutputFileFromString("leafStabilityIndices"), *bootstrapFile = getNumberOfTrees(tr,bootstrapFileName); BitVector *neglectThose = neglectThoseTaxa(tr, excludeFileName); int i, j, k, l; if(tr->numberOfTrees >= SHORT_UNSIGNED_MAX) { PR("Sorry, %s is not capable of handling more than %d trees. You may want to adjust the code, if you have sufficient memory at disposal.\n"); exit(-1); } fprintf(outf,"taxon\tlsDif\tlsEnt\tlsMax\n"); PR("taxon\tlsDif\tlsEnt\tlsMax\n"); /* calculate leaf stability for n-th taxon */ for(i = 0; i < tr->mxtips; i++) { if( NOT NTH_BIT_IS_SET(neglectThose,i)) { PR("%s\tNA\tNA\tNA\n", tr->nameList[i+1]); fprintf(outf,"%s\tNA\tNA\tNA\n", tr->nameList[i+1]); continue; } smallNumber ***quadruples = initQuads(tr); double lsDif = 0.0, lsEnt = 0.0, lsMax = 0.0; rewind(bootstrapFile); for(j = 1; j <= tr->numberOfTrees; ++j) { int k; readBootstrapTree(tr,bootstrapFile); FOR_0_LIMIT(k,tr->mxtips) if( NOT NTH_BIT_IS_SET(neglectThose, k)) pruneTaxon(tr,k+1); extractQuadruplesRecursively(tr, tr->nodep[i+1], quadruples, neglectThose, TRUE); } /* calculate leaf stability of this leaf */ int sum = 0; double cnt = 0.0; for(j = 0; j < tr->mxtips; ++j) { if(NOT NTH_BIT_IS_SET(neglectThose, j)) continue; for(k = j+1; k < tr->mxtips; ++k) { if(NOT NTH_BIT_IS_SET(neglectThose, k)) continue; for(l = k+1; l < tr->mxtips; ++l) { if(NOT NTH_BIT_IS_SET(neglectThose, l)) continue; int rels[3] = { GET_FROM_UPPER_TRIANGLE(quadruples,j,k,l), GET_FROM_UPPER_TRIANGLE(quadruples,k,j,l), GET_FROM_UPPER_TRIANGLE(quadruples,l,j,k) }; sum = rels[0] + rels[1] + rels[2]; qsort(rels, 3, sizeof(int), intcmp); assert( sum == 0 || sum == tr->numberOfTrees); assert(rels[0] >= 0 && rels[1] >= 0 && rels[0] >= rels[1]); assert( (NTH_BIT_IS_SET(neglectThose, j) && NTH_BIT_IS_SET(neglectThose, k) && NTH_BIT_IS_SET(neglectThose, l) && NTH_BIT_IS_SET(neglectThose, i)) ); if(sum) { lsDif += (double)(rels[0] - rels[1]) / (double)tr->numberOfTrees; lsMax += (double) rels[0] / (double) tr->numberOfTrees ; /* PR(">> %f\n", ((double) rels[0] / (double) tr->numberOfTrees )); */ double tmp = (double)rels[0] / (double)tr->numberOfTrees; lsEnt -= tmp * log(tmp); if(rels[1] > 0) { double tmp = (double) rels[1] / (double) tr->numberOfTrees; lsEnt -= tmp * log(tmp); } if(rels[2] > 0) { double tmp = (double) rels[2] / (double) tr->numberOfTrees; lsEnt -= tmp * log(tmp); } cnt++; } } } } lsDif /= cnt; /* normalize between 0 and 1 */ lsEnt /= cnt; lsEnt /= 3. * log(1./3.) * 1./3.; lsEnt = lsEnt + 1; /* also normalize */ lsMax /= cnt ; lsMax = (lsMax - (1. / 3.)) * 3. / 2. ; PR("%s\t%f\t%f\t%f\n", tr->nameList[i+1], lsDif, lsEnt, lsMax); fprintf(outf, "%s\t%f\t%f\t%f\n", tr->nameList[i+1], lsDif, lsEnt, lsMax); freeQuads(tr,quadruples); } } void printHelpFile() { printVersionInfo(FALSE); printf("This program computes three flavors of leaf stability index for each taxon.\n\nSYNTAX: ./%s -i -n [-w ] [-h]\n", lowerTheString(programName)); printf("\n\tOBLIGATORY:\n"); printf("-i \n\tA collection of bootstrap trees.\n"); printf("-n \n\tAn identifier for this run.\n"); printf("\n\tOPTIONAL:\n"); printf("-x \n\tPrune the taxa in the file first (one taxon per line), before computing the lsi.\n"); printf("-w \n\tA working directory where output files are created.\n"); printf("-h\n\tThis help file.\n"); } int main(int argc, char *argv[]) { programName = PROG_NAME; programVersion = PROG_VERSION; programReleaseDate = PROG_RELEASE_DATE; int c; char *excludeFile = "", *bootTrees = ""; while ((c = getopt (argc, argv, "hi:n:w:m:x:")) != -1) { switch(c) { case 'i': bootTrees = optarg; break; case 'n': strcpy(run_id, optarg); break; case 'w': strcpy(workdir, optarg); break; case 'x': excludeFile = optarg; break; case 'h': default: { printHelpFile(); abort (); } } } if( NOT strcmp(bootTrees, "")) { printf("Please specify a file containing bootstrap trees via -i.\n"); printHelpFile(FALSE); exit(-1); } if( NOT strcmp(run_id, "")) { printf("Please specify a run-id via -n\n"); printHelpFile(FALSE); exit(-1); } compute_bits_in_16bits(); initializeMask(); All *tr = CALLOC(1,sizeof(All)); setupInfoFile(); if (NOT setupTree(tr, bootTrees)) { PR("Something went wrong during tree initialisation. Sorry.\n"); exit(-1); } calculateLeafStability(tr, bootTrees, excludeFile); return 0; } RogueNaRok-1.0/rnr-mast.c000066400000000000000000000576161165154717300153260ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #include #include #include #include #include #include "common.h" #include "List.h" #include "Tree.h" #include "BitVector.h" #include "sharedVariables.h" #include "legacy.h" #define PROG_NAME "RnR-mast" #define PROG_VERSION "1.0" #define PROG_RELEASE_DATE "2011-10-25" /* NOTICE: as the numbering of the taxa is 1-based, throughout this file all array indices (that use the taxon number as access code) are also 1-based. This means that we waste a bit of memory, but hopefully avoids index wars... */ /* SWITCHES */ /* #define PRINT_VERY_VERBOSE */ #define FAST_BV_COMPARISON boolean areSameBitVectors(BitVector *a, BitVector *b, int bitVectorLength); typedef struct _BitVectorList { struct _BitVectorList *next; BitVector *taxaInMast; } MastList; IndexList *traverseForTriples(All *tr, nodeptr node, boolean isStart, BitVector ***rootedTriples, BitVector **bitVectors, int bitVectorLength) { int i; IndexList *leftList, *rightList, *iterA, *iterB; BitVector *leftBv, *rightBv; if( NOT isTip(node->number, tr->mxtips) || isStart) { nodeptr leftNode = (isStart) ? node->back->next->back : node->next->back, rightNode = (isStart) ? node->back->next->next->back : node->next->next->back; leftBv = bitVectors[leftNode->number]; rightBv = bitVectors[rightNode->number]; leftList = traverseForTriples(tr, leftNode, FALSE, rootedTriples, bitVectors, bitVectorLength); rightList = traverseForTriples(tr, rightNode, FALSE, rootedTriples, bitVectors, bitVectorLength); } else { IndexList *elem = malloc(sizeof(IndexList)); elem->index = node->number; elem->next = NULL; FLIP_NTH_BIT(bitVectors[node->number], node->number); assert(NTH_BIT_IS_SET(bitVectors[node->number], node->number)); return elem; } /* enter entries in the triple structure */ for(iterA = leftList; iterA; iterA = iterA->next) { int indexA = iterA->index; assert(NTH_BIT_IS_SET(leftBv, indexA)); for(iterB = rightList; iterB; iterB = iterB->next) { int indexB = iterB->index; assert(NTH_BIT_IS_SET(rightBv, indexB)); for(i = 0; i < bitVectorLength; ++i) { rootedTriples[indexA][indexB][i] |= leftBv[i]; rootedTriples[indexB][indexA][i] |= rightBv[i]; } UNFLIP_NTH_BIT(rootedTriples[indexA][indexB], indexA); UNFLIP_NTH_BIT(rootedTriples[indexB][indexA], indexB); } } /* prepare bitvector and list for this root */ for(i = 0; i < bitVectorLength; ++i) bitVectors[node->number][i] = (leftBv[i] | rightBv[i]); for(iterA = leftList; iterA; iterA = iterA->next) if( NOT iterA->next) { iterA->next = rightList; for(iterA = leftList; iterA; iterA = iterA->next) assert(NTH_BIT_IS_SET(bitVectors[node->number], iterA->index)); break; } /* insert the triples containing the root -- TODO we could omit that, as the root always must be part of the MAST */ i = 0; if(isStart) { for(iterA = leftList; iterA; iterA = iterA->next) { for(iterB = leftList; iterB; iterB = iterB->next) if(iterA->index != iterB->index) FLIP_NTH_BIT(rootedTriples[iterA->index][node->number], iterB->index); i++; } /* assert(i == tr->mxtips-1); */ } return leftList; } BitVector*** initializeTriplesStructure(All *tr, int bitVectorLength) { int i, j; BitVector ***result = malloc((tr->mxtips + 1) * sizeof(BitVector**)); for(i = 0; i < tr->mxtips+1; ++i) { result[i] = malloc((tr->mxtips+1) * sizeof(BitVector*)); for(j = 0; j < tr->mxtips+1; ++j) result[i][j] = calloc(bitVectorLength, sizeof(BitVector)); } return result; } void freeTriplesStructure(All *tr, BitVector ***triples) { int i,j; for(i = 0; i < tr->mxtips+1; ++i) { for(j = 0; j < tr->mxtips+1;++j) free(triples[i][j]); free(triples[i]); } free(triples); } BitVector*** getIntersectionOfRootedTriples(FILE *bootstrapFile, All *tr, int startingNodeIndex, BitVector *neglectThose) { int i,j,k,treeNum=0, bitVectorLength = GET_BITVECTOR_LENGTH((tr->mxtips+1)); IndexList *iter; BitVector **bitVectors = malloc((2 * tr->mxtips - 1) * sizeof(BitVector*)), ***result = initializeTriplesStructure(tr, bitVectorLength), ***theseTriples = initializeTriplesStructure(tr, bitVectorLength); for(i = 0; i < 2 * tr->mxtips - 1; ++i) bitVectors[i] = malloc(bitVectorLength * sizeof(BitVector)); rewind(bootstrapFile); while(treeNum++ < tr->numberOfTrees) { readBootstrapTree(tr,bootstrapFile); FOR_0_LIMIT(i, tr->mxtips) if( NOT NTH_BIT_IS_SET(neglectThose, i)) pruneTaxon(tr, i+1); /* reset bit-vectors */ FOR_0_LIMIT(i, (tr->mxtips+1)) memset(bitVectors[i], 0, sizeof(BitVector) * bitVectorLength); /* reset triples */ for(i = 0; i < tr->mxtips+1; ++i) for(j = 0; j < tr->mxtips+1; ++j) memset(theseTriples[i][j], 0, sizeof(BitVector) * bitVectorLength); /* get all triples in the current tree and clean up */ IndexList *il = traverseForTriples(tr, tr->nodep[startingNodeIndex], TRUE, theseTriples, bitVectors, bitVectorLength); for(iter = il; iter; ) { il = iter->next; free(iter); iter = il; } /* intersect the triples */ for(i = 0; i < tr->mxtips+1; ++i) for(j = 0; j < tr->mxtips+1; ++j) for(k = 0; k < bitVectorLength; ++k) if(treeNum == 1) result[i][j][k] |= theseTriples[i][j][k]; else result[i][j][k] &= theseTriples[i][j][k]; } freeTriplesStructure(tr, theseTriples); for(i = 0; i < 2 * tr->mxtips - 1; ++i) free(bitVectors[i]); free(bitVectors); return result; } typedef struct _agreementMatrix { int score; IndexList* valuesA; IndexList* valuesB; } AgreementMatrix; /* we mean ax|b by that */ IndexList* findBestXinAxB(All *tr, int A, int B, AgreementMatrix **agreementMatrix, BitVector ***triples, int *resultScore) { int i = 0, maximum = 0; BitVector *currentBv = triples[A][B]; IndexList *result = NULL; while(i < tr->mxtips) { #ifdef FAST_BV_COMPARISON if( ((i & 31) == 0) && NOT currentBv[i]) /* NOTE portability issue!!! */ { /* assert((i % 32) == i & (MASK_LENGTH - 1)); */ i += MASK_LENGTH; continue; } #endif if(NTH_BIT_IS_SET(currentBv, i)) { int score = ( A < i ) ? agreementMatrix[A][i].score : agreementMatrix[i][A].score; assert(score); if(score > maximum) maximum = score; } ++i; } i = 0; while(i < tr->mxtips) { #ifdef FAST_BV_COMPARISON if( ((i & 31) == 0) && NOT currentBv[i]) /* NOTE portability issue!!! */ { /* assert((i % 32) == i & (MASK_LENGTH - 1)); */ i += MASK_LENGTH; continue; } #endif if(NTH_BIT_IS_SET(currentBv,i)) { int score = ( A < i ) ? agreementMatrix[A][i].score : agreementMatrix[i][A].score; assert(score); if(score == maximum) result = appendToIndexList(i,result); } i++; } *resultScore = maximum ? maximum : 1; return result; } IndexList *traverseForMastTable(All *tr, nodeptr node, BitVector ***triples, AgreementMatrix **agreementMatrix, boolean isStart) { if(isTip(node->number, tr->mxtips) && NOT isStart) { IndexList *elem = malloc(sizeof(IndexList)); elem->next = NULL; elem->index = node->number; return elem; } else { nodeptr leftNode = (isStart) ? node->back->next->back : node->next->back, rightNode = (isStart) ? node->back->next->next->back : node->next->next->back; IndexList *nodesOnLeft = traverseForMastTable(tr, leftNode, triples, agreementMatrix, FALSE), *nodesOnRight = traverseForMastTable(tr, rightNode, triples, agreementMatrix, FALSE), *iterA, *iterB; for(iterA = nodesOnLeft; iterA; iterA = iterA->next) { int indexA = iterA->index; for(iterB = nodesOnRight; iterB; iterB = iterB->next) { int indexB = iterB->index; int scoreA = 0, scoreB = 0; IndexList *xs = findBestXinAxB(tr, indexA, indexB, agreementMatrix, triples, &scoreA), *ys = findBestXinAxB(tr, indexB, indexA, agreementMatrix, triples, &scoreB); #ifdef PRINT_VERY_VERBOSE printf("%s,x|%s => x = %s\t%s,y|%s => y = %s \n", tr->nameList[indexA], tr->nameList[indexB], (x) ? tr->nameList[x] : "0", tr->nameList[indexB], tr->nameList[indexA], (y) ? tr->nameList[y] : "0"); #endif AgreementMatrix *currentElem = (indexA > indexB) ? &(agreementMatrix[indexB][indexA]) : &(agreementMatrix[indexA][indexB]); currentElem->score = scoreA + scoreB; #ifdef PRINT_VERY_VERBOSE printf("MAST(%s,%s) = %d\n", tr->nameList[indexA], tr->nameList[indexB], currentElem->score); #endif currentElem->valuesA = (indexA > indexB) ? ys : xs; currentElem->valuesB = (indexA > indexB) ? xs : ys; } } /* join the lists */ for(iterA = nodesOnLeft; iterA; iterA = iterA->next) if( NOT iterA->next) { iterA->next = nodesOnRight; break; } if(isStart) { int indexA = node->number; for(iterA = nodesOnLeft; iterA; iterA = iterA->next) { int indexB = iterA->index; int scoreA = 0, scoreB = 0; IndexList *xs = findBestXinAxB(tr, indexA, indexB, agreementMatrix, triples, &scoreA); IndexList *ys = findBestXinAxB(tr, indexB, indexA, agreementMatrix, triples, &scoreB); AgreementMatrix *currentElem = (indexA > indexB) ? &(agreementMatrix[indexB][indexA]) : &(agreementMatrix[indexA][indexB]); currentElem->score = scoreA + scoreB; currentElem->valuesA = (indexA > indexB) ? ys : xs; currentElem->valuesB = (indexA > indexB) ? xs : ys; } } return nodesOnLeft; } } void printAgreementTable(All *tr, AgreementMatrix **matrix) { int i,j; printf("\n\n "); for(i = 1; i <= tr->mxtips; ++i) { printf("%s,", tr->nameList[i]); } printf("\n"); for(i = 1; i <= tr->mxtips; ++i) { printf("%s,", tr->nameList[i]); for(j = 1; j <= tr->mxtips; ++j) { if( NOT matrix[i][j].score) printf(" ,"); else printf("%d,", matrix[i][j].score); } printf("\n"); } printf("\n"); } List *filterDuplicates(List *masts, int bitVectorLength) { List *iter, *result = NULL; int i = 0,j, numElem = 0; boolean *isDuplicate; BitVector **bitVectors; for(iter = masts; iter; iter = iter->next) numElem++; bitVectors = calloc(numElem, sizeof(BitVector*)); i = 0; for(iter = masts; iter;) { bitVectors[i] = (BitVector*)iter->value; i++; masts = iter->next; free(iter); iter = masts; } isDuplicate = calloc(numElem, sizeof(boolean)); for(i = 0; i < numElem; i++) { if(isDuplicate[i]) continue; for(j = i+1; j < numElem; ++j) { if(isDuplicate[j]) continue; isDuplicate[j] = areSameBitVectors(bitVectors[i], bitVectors[j], bitVectorLength); } } for(i = 0; i < numElem; ++i) if(isDuplicate[i]) free(bitVectors[i]); else result = appendToList(bitVectors[i], result); free(isDuplicate); free(bitVectors); return result; } List *backtrace(All *tr, AgreementMatrix **matrix, int cntI, int cntJ, int bitVectorLength, boolean allMasts) { List *iIter; IndexList *iter; int i; List *result = NULL, *fromLeft = NULL, *fromRight = NULL; if(cntI > cntJ) SWAP(cntI,cntJ); AgreementMatrix *matrixElem = &(matrix[cntI][cntJ]); /* int a = 0, b = 0; */ for(iter = matrixElem->valuesA; iter; iter = iter->next) { List *tmp = backtrace(tr, matrix, cntI,iter->index, bitVectorLength, allMasts); fromLeft = joinLists(fromLeft, tmp); if( NOT allMasts) break; /* a++; */ } for(iter = matrixElem->valuesB; iter; iter = iter->next) { List *tmp = backtrace(tr, matrix, cntJ,iter->index, bitVectorLength, allMasts); fromRight = joinLists(fromRight, tmp); if( NOT allMasts) break; /* b++; */ } /* printf("%d\t%d\n", a,b); */ if( NOT (fromLeft || fromRight)) result = appendToList(calloc(bitVectorLength, sizeof(BitVector)),result); else if( NOT (fromLeft && fromRight)) result = fromLeft ? fromLeft : fromRight; else { List *iterA, *iterB; for(iterA = fromLeft; iterA; iterA = iterA->next) { for(iterB = fromRight; iterB; iterB = iterB->next) { BitVector *new = calloc(bitVectorLength, sizeof(BitVector)); for(i = 0; i < bitVectorLength; ++i) new[i] = ((BitVector*)iterA->value)[i] | ((BitVector*)iterB->value)[i]; result = appendToList(new, result); } } freeList(fromRight); freeList(fromLeft); if(allMasts) result = filterDuplicates(result, bitVectorLength); } /* set the bits of the current entry */ for(iIter = result; iIter; iIter = iIter->next) { FLIP_NTH_BIT(((BitVector*)iIter->value), cntI); FLIP_NTH_BIT(((BitVector*)iIter->value), cntJ); } return result; } List *backtraceMasts(All *tr, AgreementMatrix **matrix, boolean allMasts) { int bitVectorLength = GET_BITVECTOR_LENGTH(tr->mxtips), maxScore = 0, i, j; List *result = NULL; /* find maximum in agreement matrix */ for(i = 1; i <= tr->mxtips; ++i) for(j = 1; j <= tr->mxtips; ++j) if(maxScore < matrix[i][j].score) maxScore = matrix[i][j].score; assert(maxScore); for(i = 1; i <= tr->mxtips; ++i) for(j = 1; j <= tr->mxtips; ++j) if(matrix[i][j].score == maxScore) { List *elems = backtrace(tr, matrix, i,j, bitVectorLength, allMasts); result = joinLists(result, elems); } return result; } AgreementMatrix** computeAgreementTable(All *tr, BitVector ***triples, int startingNodeIndex) { int i; IndexList *iter; AgreementMatrix **agreementMatrix = malloc((tr->mxtips+1) * sizeof(AgreementMatrix*)); for(i = 1; i <= tr->mxtips; ++i) agreementMatrix[i] = calloc(tr->mxtips+1, sizeof(AgreementMatrix)); IndexList *list = traverseForMastTable(tr, tr->nodep[startingNodeIndex], triples, agreementMatrix, TRUE); for(iter = list; iter;) { list = iter->next; free(iter); iter = list; } #ifdef PRINT_VERY_VERBOSE printAgreementTable(tr, agreementMatrix); #endif return agreementMatrix; } void printBipartition(All *tr, BitVector *bv, int contentLength) { int i; for(i = 0; i < contentLength;++i) if(NTH_BIT_IS_SET(bv, i)) printf("%s,", tr->nameList[i+1]); } void verifyMasts(All *tr, FILE *bootstrapFile, BitVector *taxaToKeep) { int bCount = 0, treeVectorLength = GET_BITVECTOR_LENGTH(tr->numberOfTrees), i, j; unsigned int vectorLength = 0; hashtable *htable = initHashTable(tr->mxtips * FC_INIT * 10); BitVector **bitVectors = initBitVector(tr, &vectorLength); entry *e; nodeptr commonStart = NULL; rewind(bootstrapFile); for(i = 1; i <= tr->numberOfTrees; ++i) { /* treeReadLen(bootstrapFile, tr, FALSE, FALSE, TRUE, adef, TRUE); */ readBootstrapTree(tr, bootstrapFile); /* prune taxa */ for(j = 1; j <= tr->mxtips; ++j) if( NOT (NTH_BIT_IS_SET(taxaToKeep, j)) ) pruneTaxon(tr, j+1); if(i == 1 ) commonStart = tr->start; bitVectorInitravSpecial(bitVectors, commonStart->back, tr->mxtips, vectorLength, htable, (i - 1), BIPARTITIONS_BOOTSTOP, (branchInfo *)NULL, &bCount, treeVectorLength, FALSE, FALSE); } for(i = 0; i < htable->tableSize; ++i) for(e = htable->table[i]; e; e = e->next) { if(genericBitCount(e->treeVector, treeVectorLength) != tr->numberOfTrees) { printf("error %d/%d\nbv: ", genericBitCount(e->treeVector, treeVectorLength), tr->numberOfTrees); printBipartition(tr,e->bitVector, tr->mxtips); printf("\nrepresented in: "); printBitVector(e->treeVector, tr->numberOfTrees); printf("\n"); } assert(genericBitCount(e->treeVector, treeVectorLength) == tr->numberOfTrees); } freeBitVectors(bitVectors, 2 * tr->mxtips); freeHashTable(htable); } int countTips(nodeptr p, int numsp) { if(isTip(p->number, numsp)) return 1; { nodeptr q; int tips = 0; q = p->next; while(q != p) { tips += countTips(q->back, numsp); q = q->next; } return tips; } } void printMastToFile(All *tr, FILE *bootstrapFile, BitVector *mast, FILE *result) { int droppedTaxaNum = 0, tips, i; rewind(bootstrapFile); readBootstrapTree(tr,bootstrapFile); for(i = 1; i <= tr->mxtips; ++i) if( NOT NTH_BIT_IS_SET(mast,i) ) { pruneTaxon(tr,i+1); droppedTaxaNum++; } tips = countTips(tr->start, tr->mxtips) + countTips(tr->start->back, tr->mxtips); assert((unsigned)tips == ((unsigned)tr->mxtips - droppedTaxaNum)); char *tmp = writeTreeToString(tr, FALSE); fprintf(result, "%s", tmp); printBothOpen("MAST tree: %s", tmp); } void printMastsToFile(All *tr, FILE *bootstrapFile, List *masts) { FILE *result = getOutputFileFromString("MaximumAgreementSubtree"); List *iter; for(iter = masts; iter ; iter = iter->next) printMastToFile( tr, bootstrapFile, iter->value, result); fclose(result); } void freeAgreementMatrix(All *tr, AgreementMatrix **agm) { int i,j; for(i = 1; i <= tr->mxtips; ++i) { for(j = 1; j <= tr->mxtips; ++j) { freeIndexList(agm[i][j].valuesA); freeIndexList(agm[i][j].valuesB); } free(agm[i]); } free(agm); } void freeAmatList(All *tr,List *aml) { List *iter; for(iter = aml; iter;) { aml = iter->next; freeAgreementMatrix(tr, iter->value); free(iter); iter = aml; } } void calculateMast(char *bootStrapFileName, All *tr, char *excludeFileName, boolean allMasts) { int bitVectorLength = GET_BITVECTOR_LENGTH((tr->mxtips+1)), mast = 0, i,j,k; FILE *bootstrapFile = getNumberOfTrees(tr, bootStrapFileName); BitVector *taxaToNeglect = neglectThoseTaxa(tr, excludeFileName); List *iter, *amastList = NULL; for(i = 1; i <= tr->mxtips; ++i) { int currentMast = 0; printBothOpen("rooting %d/%d\t%s\n", i, tr->mxtips, tr->nameList[i]); BitVector ***commonRootedTriples = getIntersectionOfRootedTriples(bootstrapFile, tr, i, taxaToNeglect); AgreementMatrix **amat = computeAgreementTable(tr, commonRootedTriples, i); for(j = 1; j <= tr->mxtips; ++j) for(k = 1 ; k <= tr->mxtips; ++k) if(currentMast < amat[j][k].score) currentMast = amat[j][k].score; if(currentMast > mast) { mast = currentMast; freeAmatList(tr,amastList); amastList = NULL; amastList = appendToList(amat, amastList); } else if( allMasts && currentMast == mast ) amastList = appendToList(amat, amastList); else freeAgreementMatrix(tr, amat); freeTriplesStructure(tr, commonRootedTriples); } List *accMasts = NULL; for(iter = amastList; iter; iter = iter->next) { List *Mast = backtraceMasts(tr, iter->value, allMasts); accMasts = joinLists(Mast, accMasts); } accMasts = filterDuplicates(accMasts, bitVectorLength); /* this works, commenting it out, as it only costs additional time */ for(iter = accMasts; iter; iter = iter->next) verifyMasts(tr, bootstrapFile, ((BitVector*)iter->value)); int cnt = 0; for(iter = accMasts; iter; iter = iter->next) cnt++; printBothOpen("number of alternative MASTs (this is not the number of all possible MASTs, if you did not use \"ALL_MAST\"): %d\n", cnt); cnt = genericBitCount((BitVector*)accMasts->value, GET_BITVECTOR_LENGTH(tr->mxtips)); printBothOpen("MAST size is: %d\n", cnt); /* print */ printMastsToFile(tr,bootstrapFile, accMasts); freeList(accMasts); freeAmatList(tr, amastList); fclose(bootstrapFile); } void printRootedTriples(BitVector ***rootedTriples, All *tr) { int i, j, k; for(i = 1; i <= tr->mxtips; ++i) for(j = 1; j <= tr->mxtips;++j) for(k = 1; k <= tr->mxtips; ++k) if(NTH_BIT_IS_SET(rootedTriples[i][j], k)) printf("%s,%s|%s\n", tr->nameList[i], tr->nameList[k], tr->nameList[j]); } void printHelpFile() { printVersionInfo(FALSE); printf("This program computes maximum agreement trees for unrooted input sets.\n\nSYNTAX: ./%s -i -n [-w ] [-h] [-a] [-x ]\n", lowerTheString(programName)); printf("\nOBLIGATORY:\n"); printf("-i \n\tA collection of bootstrap trees.\n"); printf("-n \n\tAn identifier for this run.\n"); printf("\nOPTIONAL\n"); printf("-a\n\tCompute all possible MAST trees. Without this flag, you will\n\t\ only get a few MASTs that are easy to compute. As there may be an\n\t\ exponential number of MASTs, use this option with care.\n"); printf("-w \n\tA working directory where output files are created.\n"); printf("-x \n\tExclude the taxa in this file (one taxon per line)\n\t\ prior to computing the MAST. If you compute all MASTs anyway, this\n\t\ option is option will not be useful. However, you can use this option\n\t\ to speed up things.\n"); printf("-h\n\tThis help file.\n"); } int main(int argc, char *argv[]) { programName = PROG_NAME; programVersion = PROG_VERSION; programReleaseDate = PROG_RELEASE_DATE; int c; boolean computeAllMasts = FALSE; char *excludeFile = "", *bootTrees = ""; while ((c = getopt (argc, argv, "hi:n:aw:x:")) != -1) { switch(c) { case 'i': bootTrees = optarg; break; case 'n': strcpy(run_id, optarg); break; case 'a': computeAllMasts = TRUE; break; case 'x': excludeFile = optarg; break; case 'w': strcpy(workdir, optarg); break; case 'h': default: { printHelpFile(); abort (); } } } if( NOT strcmp(bootTrees, "")) { printf("Please specify a file containing bootstrap trees via -i.\n"); printHelpFile(); exit(-1); } if( NOT strcmp(run_id, "")) { printf("Please specify a run-id via -n\n"); printHelpFile(); exit(-1); } /* maybe? */ compute_bits_in_16bits(); initializeMask(); All *tr = CALLOC(1,sizeof(All)); setupInfoFile(); if (NOT setupTree(tr, bootTrees)) { PR("Something went wrong during tree initialisation. Sorry.\n"); exit(-1); } tr->tree_string = CALLOC(getTreeStringLength(bootTrees), sizeof(char)); calculateMast(bootTrees, tr, excludeFile, computeAllMasts); return 0; } RogueNaRok-1.0/rnr-prune.c000066400000000000000000000141731165154717300155020ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef WIN32 #include #endif #include #include #include #include #include #include #include #include "common.h" #include "Tree.h" #include "List.h" #include "BitVector.h" #include "sharedVariables.h" #include "newFunctions.h" #define PROG_NAME "RnR-prune" #define PROG_VERSION "1.0" #define PROG_RELEASE_DATE "2011-10-25" extern char run_id[128]; extern char *workdir; extern double masterTime; extern int tid; extern int NumberOfThreads; extern volatile int NumberOfJobs; void pruneTaxaFromTreeset(char *bootstrapFileName, char *bestTreeFile, char *toDropFileName, All *tr) { int i; IndexList *iter; FILE *toDrop = myfopen(toDropFileName, "r"); /* drop taxa from bootstrap trees */ if( strcmp(bootstrapFileName, "")) { tr->tree_string = CALLOC(10 * getTreeStringLength(bootstrapFileName), sizeof(char)); if (NOT setupTree(tr, bootstrapFileName)) { PR("Something went wrong during tree initialisation. Sorry.\n"); exit(-1); } FILE *TMP = getNumberOfTrees(tr, bootstrapFileName); fclose(TMP); } /* drop taxa from best-known tree */ if( strcmp(bestTreeFile, "")) { tr->tree_string = CALLOC(10 * getTreeStringLength(bestTreeFile), sizeof(char)); if (NOT setupTree(tr, bestTreeFile)) { PR("Something went wrong during tree initialisation. Sorry.\n"); exit(-1); } FILE *tmp = getNumberOfTrees(tr, bestTreeFile); fclose(tmp); } IndexList *indicesToDrop = parseToDrop(tr, toDrop); if( strcmp(bootstrapFileName, "")) { FILE *bootstrapFile = getNumberOfTrees(tr, bootstrapFileName), *outf = getOutputFileFromString("prunedBootstraps"); FOR_0_LIMIT(i,tr->numberOfTrees) { readBootstrapTree(tr, bootstrapFile); iter = indicesToDrop; FOR_LIST(iter) pruneTaxon(tr, iter->index); char *tmp = writeTreeToString(tr , FALSE); fprintf(outf, "%s", tmp); } fclose(outf); } if( strcmp(bestTreeFile, "")) { FILE *bestTree = myfopen(bestTreeFile, "r"), *outf = getOutputFileFromString("prunedBestTree"); readBestTree(tr, bestTree); iter = indicesToDrop; FOR_LIST(iter) pruneTaxon(tr, iter->index); char *tmp = writeTreeToString(tr, TRUE); fprintf(outf, "%s", tmp); } freeIndexList(indicesToDrop); exit(EXIT_SUCCESS); } static void printHelpFile() { printVersionInfo(FALSE); printf("This program prunes a list of taxa from a bootstrap tree or a single tree with branch lengths (such as a best-known ML/MP-tree).\n\nSYNTAX: ./%s [-i | -t ] -x -n [-w ] [-h]\n", lowerTheString(programName)); printf("\n\tOBLIGATORY:\n"); printf("-x \n\tA list of taxa (one taxon per line) to prune from either the bootstrap trees or the single best-known tree.\n"); printf("-i \n\tA collection of bootstrap trees.\n"); printf("-t \n\tA single tree with branch lengths. Use either this flag or the -i flag.\n"); printf("-n \n\tAn identifier for this run.\n"); printf("\n\tOPTIONAL:\n"); printf("-w \n\tA working directory where output files are created.\n"); printf("-h\n\tThis help file.\n"); } int main(int argc, char *argv[]) { int c; char *bestTreeFileName = "", *excludeFileName = "", *bootTreesFileName = ""; programName = PROG_NAME; programVersion = PROG_VERSION; programReleaseDate = PROG_RELEASE_DATE; while((c = getopt(argc,argv, "hi:t:x:n:")) != -1) { switch(c) { case 'i': bootTreesFileName = optarg; break; case 'w': strcpy(workdir,optarg); break; case 'n': strcpy(run_id,optarg); break; case 't': bestTreeFileName = optarg; break; case 'x': excludeFileName = optarg; break; case 'h': default: { printHelpFile(); abort(); } } } if( NOT (strcmp(bootTreesFileName, "") || strcmp(bestTreeFileName, ""))) { printf("Please specify either a set of bootstrap trees (-i) or a single best-known tree (-t) from which you want to prune taxa.\n"); printHelpFile(); exit(-1); } if( NOT strcmp(excludeFileName, "") ) { printf("Please specify a file containing taxa to prune (one taxon a line) via -x\n"); printHelpFile(); exit(-1); } if ( NOT strcmp(run_id, "")) { printf("Please specify a run id via -n.\n"); printHelpFile(); exit(-1); } compute_bits_in_16bits(); initializeMask(); setupInfoFile(); All *tr = CALLOC(1,sizeof(All)); tr->numBranches = 1; pruneTaxaFromTreeset(bootTreesFileName, bestTreeFileName, excludeFileName, tr); return 0; } RogueNaRok-1.0/rnr-tii.c000066400000000000000000000214551165154717300151370ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifndef WIN32 #include #endif #include #include #include #include #include #include #include #include "common.h" #include "BitVector.h" #include "Tree.h" #include "List.h" #include "sharedVariables.h" #define PROG_NAME "RnR-tii" #define PROG_VERSION "1.0" #define PROG_RELEASE_DATE "2011-10-25" typedef unsigned short int nodeDistance_t; double tiiZ = 2.; typedef struct _distanceElem { nodeptr node; nodeDistance_t distance; } DistanceElem; List *gatherDistances(All *tr, nodeptr node, nodeDistance_t **resultBranchDistances, boolean isStart) { List *distancesFromLeft, *distancesFromRight, *iterA, *iterB, *end = NULL; if( NOT isTip(node->number, tr->mxtips)) { /* go down to the tips (DFS) */ distancesFromLeft = gatherDistances(tr, node->next->back, resultBranchDistances, FALSE); distancesFromRight = gatherDistances(tr, node->next->next->back, resultBranchDistances, FALSE); } else if(isStart) { distancesFromLeft = gatherDistances(tr, node->back->next->back, resultBranchDistances, FALSE); distancesFromRight = gatherDistances(tr, node->back->next->next->back, resultBranchDistances, FALSE); } else { /* recursion ends */ List *distanceList = CALLOC(1,sizeof(List)); distanceList->value = CALLOC(1,sizeof(DistanceElem)); ((DistanceElem*)distanceList->value)->node = node; ((DistanceElem*)distanceList->value)->distance = 1 ; return distanceList; } /* compute distances of branches from the right to branches from the left side */ iterA = distancesFromLeft; FOR_LIST(iterA) { for(iterB = distancesFromRight; iterB; iterB = iterB->next) { int a = ((DistanceElem*)iterA->value)->node->number - 1, b = ((DistanceElem*)iterB->value)->node->number - 1; if(b < a) SWAP(a,b); assert(a != b ); USE_UPPER_TRIANGLE_TII(resultBranchDistances,a,b) = ((DistanceElem*)iterA->value)->distance + ((DistanceElem*)iterB->value)->distance; } if( NOT iterA->next) end = iterA; } assert(end); end->next = distancesFromRight; /* adding own value */ for(iterA = distancesFromLeft; iterA; iterA = iterA->next) ((DistanceElem*)iterA->value)->distance += 1; if(isStart) { int a = node->number - 1; for(iterA = distancesFromLeft; iterA; iterA = iterA->next) { int b = ((DistanceElem*)iterA->value)->node->number -1; if(b < a) SWAP(a,b); assert(b != a); USE_UPPER_TRIANGLE_TII(resultBranchDistances,a,b) = ((DistanceElem*)iterA->value)->distance; } /* for(i = 0; i < tr->mxtips; ++i) */ /* for(j = i+1; j < tr->mxtips; ++j) */ /* printf("%d\t%d\t%d\n", i,j,USE_UPPER_TRIANGLE(resultBranchDistances,i,j)); */ } return distancesFromLeft; } double getOneTaxonomicInstability(All *tr, int i, nodeDistance_t ***distances, unsigned int *remainingTaxa) { int j,k,l; double taxInstab = 0.0; FOR_0_LIMIT(j,tr->mxtips) { if(j == i || NOT NTH_BIT_IS_SET(remainingTaxa, j)) continue; FOR_0_LIMIT(k, tr->numberOfTrees) { for(l = k+1; l < tr->numberOfTrees; ++l) { double d1 = (i < j) ? (double)(USE_UPPER_TRIANGLE_TII(distances[k], i, j)) : (double)(USE_UPPER_TRIANGLE_TII(distances[k], j, i)), d2 = (i < j) ? (double)(USE_UPPER_TRIANGLE_TII(distances[l], i, j)) : (double)(USE_UPPER_TRIANGLE_TII(distances[l], j, i)), diff = (d1 > d2) ? (d1 - d2) : (d2 - d1), sum = d1 + d2; assert( d1 > 0 && d2 > 0); taxInstab += diff / pow(sum, tiiZ); /* NOTE: could also be another exponent, but the mesquite guys use this one (emphasises close relationship) */ } } } return taxInstab; } void getTaxonomicInstability(All *tr, char *treesFileName, char *excludeFile) { FILE *outf = getOutputFileFromString("taxonomicInstabilityIndex"), *treesFile = getNumberOfTrees(tr, treesFileName); BitVector *neglectThose = neglectThoseTaxa(tr, excludeFile); int i,j,k; nodeDistance_t ***distances = CALLOC(tr->numberOfTrees, sizeof(nodeDistance_t**)); List *iter; rewind(treesFile); FOR_0_LIMIT(i,tr->numberOfTrees) { distances[i] = CALLOC(tr->mxtips, sizeof(nodeDistance_t*)); for(j = 0; j < tr->mxtips; ++j) distances[i][j] = CALLOC(tr->mxtips - j, sizeof(nodeDistance_t)); } /* calculate distances */ FOR_0_LIMIT(i,tr->numberOfTrees) { readBootstrapTree(tr, treesFile); FOR_0_LIMIT(j,tr->mxtips) if( NOT NTH_BIT_IS_SET(neglectThose, j)) pruneTaxon(tr,j+1); /* assert that we get the right labels */ iter = gatherDistances(tr, tr->nodep[1], distances[i], TRUE); freeList(iter); FOR_0_LIMIT(j,tr->mxtips) for(k = j+1; k < tr->mxtips; ++k) { if( NOT USE_UPPER_TRIANGLE_TII(distances[i],j,k)) printf("%s\t%s\t%d\t%d\n", tr->nameList[j+1], tr->nameList[k+1], j,k); assert(USE_UPPER_TRIANGLE_TII(distances[i],j,k)); } } /* calculate taxonomic instability */ FOR_0_LIMIT(i,tr->mxtips) { if(NTH_BIT_IS_SET(neglectThose,i)) { double tii = getOneTaxonomicInstability(tr, i, distances, neglectThose); PR("%s\t%f\n", tr->nameList[i+1], tii); fprintf(outf, "%s\t%f\n", tr->nameList[i+1], tii); } else { PR("%s\tNA\n", tr->nameList[i+1]); fprintf(outf, "%s\tNA\n", tr->nameList[i+1]); } } fclose(outf); } void printHelpFile() { printVersionInfo(FALSE); printf("This program computes the taxonomic instability index.\n\nSYNTAX: ./%s -i -n [-w ] [-h] [-x ]\n", lowerTheString(programName)); printf("\nOBLIGATORY:\n"); printf("-i \n\tA collection of bootstrap trees.\n"); printf("-n \n\tAn identifier for this run.\n"); printf("\nOPTIONAL\n"); printf("-z \n\tThe exponent used in the TII formula. Use small values to emphasize close relationships and vice versa. DEFAULT: 2\n"); printf("-w \n\tA working directory where output files are created.\n"); printf("-x \n\tExclude the taxa in this file (one taxon per line) prior to computing the TII.\n"); printf("-h\n\tThis help file.\n"); } int main(int argc, char *argv[]) { programName = PROG_NAME; programVersion = PROG_VERSION; programReleaseDate = PROG_RELEASE_DATE; int c; char *excludeFile = "", *bootTrees = ""; while ((c = getopt (argc, argv, "hi:n:x:w:z:")) != -1) { switch(c) { case 'i': bootTrees = optarg; break; case 'n': strcpy(run_id, optarg); break; case 'x': excludeFile = optarg; break; case 'w': strcpy(workdir, optarg); break; case 'z': tiiZ = wrapStrToL(optarg); break; case 'h': default: { printHelpFile(); abort (); } } } if( NOT strcmp(bootTrees, "")) { printf("Please specify a file containing bootstrap trees via -i.\n"); printHelpFile(); exit(-1); } if( NOT strcmp(run_id, "")) { printf("Please specify a run-id via -n\n"); printHelpFile(); exit(-1); } compute_bits_in_16bits(); initializeMask(); All *tr = CALLOC(1,sizeof(All)); setupInfoFile(); if (NOT setupTree(tr, bootTrees)) { PR("Something went wrong during tree initialisation. Sorry.\n"); exit(-1); } getTaxonomicInstability(tr, bootTrees, excludeFile); return 0; } RogueNaRok-1.0/sharedVariables.h000066400000000000000000000033231165154717300166510ustar00rootroot00000000000000/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees. * * Moreover, the program collection comes with efficient implementations of * * the unrooted leaf stability by Thorley and Wilkinson * * the taxonomic instability index by Maddinson and Maddison * * a maximum agreement subtree implementation (MAST) for unrooted trees * * a tool for pruning taxa from a tree collection. * * Copyright October 2011 by Andre J. Aberer * * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis. * * This program is free software; you may redistribute it and/or * modify its under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * For any other inquiries send an Email to Andre J. Aberer * andre.aberer at googlemail.com * * When publishing work that is based on the results from RogueNaRok, please cite: * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011. * */ #ifdef PARALLEL volatile int numberOfThreads = 0; volatile int numberOfJobs; volatile int jobCycle; volatile int threadJob; volatile char *barrierBuffer; pthread_mutex_t mutex; #endif extern char *infoFileName, run_id[128], *workdir, *programName, *programVersion, *programReleaseDate;