wham/0000755001532600153260000000000012054751660011046 5ustar yinanyinanwham/hash.h0000644001532600153260000002236612054750341012146 0ustar yinanyinan#ifndef _HASH_H_ #define _HASH_H_ /** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: hash.h 166 2012-11-26 20:28:17Z yinan $ */ #include #include #include #include "lib.h" #include "sequence.h" #include "hitset.h" #include "embedhash.h" #define L2_CACHE_SIZE (6 * 1024 * 1024) #define HASH_COLLISION_MASK(x) ((x) << 31) #define HASH_EMPTY (0x7fffffff) #define HASH_NOT_FOUND (HASH_EMPTY) #define HASH_IS_EMPTY(x) ((x) == HASH_EMPTY) #define HASH_IS_COLLISION(x) ((x) >> 31) #define HASH_SET_EMPTY(x) ((x) = HASH_EMPTY) #define HASH_SET_COLLISION(x) ((x) |= 0x80000000) #define HASH_SET_NON_COLLISION(x) ((x) &= 0x7fffffff) #define HASH_GET_OFFSET(x) ((x) & 0x7fffffff) #define HASH_SET_OFFSET(x, y) ((x) = ((x) & 0x80000000) | (y)) #define HASH_IS_END(x) ((x) >> 31) #define HASH_SET_END(x) ((x) |= 0x80000000) #define HASH_CLEAR_END(x) ((x) &= 0x7fffffff) #define HASH_OVERFLOW_INIT (0xffffffff) #define HASH_FUNCTION_64(x, numBucket) ((*(x+2)) % (numBucket)) #define HASH_FUNCTION_128(x, numBucket) ((*(x+1) + *(x+2)) % (numBucket)) #define HASH_FUNCTION_192(x, numBucket) ((*(x) + *(x+1) + *(x+2)) % (numBucket)) #define HASH_FUNCTION_256(x, numBucket) ((*(x) + *(x+1) + *(x+2) + 
*(x+3)) % (numBucket)) #define HASH_FUNCTION_384(x, numBucket) ((*(x) + *(x+1) + *(x+2) + *(x+3) + *(x+4) + *(x+5)) % (numBucket)) #define HASH_XOR_FUNCTION_384(x, numBucket) ((*(x) ^ *(x+1) ^ *(x+2) ^ *(x+3) ^ *(x+4) ^ *(x+5)) % (numBucket)) #ifdef DEBUG_HASH_PRINT #define HASH_DEBUG(x) x #else #define HASH_DEBUG(x) #endif static inline uint32 JenkinsHash(int64 * x, uint32 numBucket) { uint32 hash = 0; uint32 * p = (uint32 *) x; hash += p[11]; hash += (hash << 10); hash ^= (hash >> 6); hash += p[10]; hash += (hash << 10); hash ^= (hash >> 6); hash += p[9]; hash += (hash << 10); hash ^= (hash >> 6); hash += (hash << 3); hash ^= (hash >> 11); hash += (hash << 15); return (hash + *(x) + *(x + 1) + *(x + 2) + *(x + 3) + *(x + 4)) % numBucket; } static inline uint32 murmurHash(int64 * x, uint32 numBucket) { const unsigned int m = 0x5bd1e995; const int r = 24; uint32 * p = (uint32 *) x; uint32 h = 322 ^ 3; uint32 k; uint32 sum = 0; k = p[11]; k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; k = p[10]; k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; k = p[9]; k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; if (p[8] != 0) { k = p[8]; k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; } if (p[7] != 0) { k = p[7]; k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; } if (p[6] != 0) { k = p[6]; k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; } h ^= h >> 13; h *= m; h ^= h >> 15; return (h + *(x) + *(x + 1) + *(x + 2)) % numBucket; } //#define HASH_FUNCTION(x, numBucket, words, bucket) bucket = HASH_FUNCTION_384(x, numBucket) //#define HASH_FUNCTION(x, numBucket, words, bucket) bucket = JenkinsHash(x, numBucket) #define HASH_FUNCTION(x, numBucket, words, bucket) bucket = murmurHash(x, numBucket) #define COMPRESS_TABLE_SIZE 2147483648LLU #define STAT_DISTRIBUTION_NUM 10 #define HITLIST_BUCKET_NUM 631 #define HITLIST_BUCKET_LENGTH 100 /* class HitPositionList { public: uint32 counters[HITLIST_BUCKET_NUM]; uint32 buckets[HITLIST_BUCKET_LENGTH][HITLIST_BUCKET_NUM]; int npos; public: HitPositionList() { 
npos = 0; for (int i = 0; i < HITLIST_BUCKET_NUM; i++) counters[i] = 0; } inline bool insert(uint32 pos) { int bucketId = pos % HITLIST_BUCKET_NUM; uint32 * list = buckets[bucketId]; for (int i = 0; i < counters[bucketId]; i++) { if (buckets[i][bucketId] == pos) return true; } if (counters[bucketId] < HITLIST_BUCKET_LENGTH) { buckets[counters[bucketId]++][bucketId] = pos; npos++; } else { printf("warning: hit list hash list is full.\n"); } return false; } }; */ class HitPositionList { public: int64 array[HITLIST_BUCKET_NUM / 64 + 1]; HitPositionList () { memset(array, 0, sizeof(int64) * (HITLIST_BUCKET_NUM / 64 + 1)); } inline bool insert(uint32 pos) { int bucketId = pos % HITLIST_BUCKET_NUM; int offset = 63 - (bucketId & 0x3f); int64 * data = &array[bucketId >> 6]; // printf("{%d %d %u} ", bucketId, 63 - offset, *data); int64 exist = (*data >> offset) & 1ULL; *data = *data | (1ULL << offset); // printf("{%d %d %u} ", bucketId, 63 - offset, *data); return (bool)exist; } }; class HitPositionArray { public: uint32 num; uint32 array[1000]; }; class EmbedHashTable; class HashTable { private: int indexID; bool compressedTable; int length; /* the length of query sequence (characters) */ int lenSeq; /* the length of query sequence (bits) */ int lenPartition; int lenKey; int lenRest; int nMismatch; /* the number of allowed errors */ int nSubstitute; /* the number of allowed substitutions */ int nInsert; /* the number of allowed insertions */ int nDelete; /* the number of allowed deletions */ int nPartition; // int nLookup; uint32 numBucket; /* the number of buckets */ uint32 numEntry; /* the number of entries */ uint32 numOverflowEntry;/* the number of overflow entries (stored in the overflow list) */ uint32 numCollision; /* the number of buckets with collisions */ uint32 numEmpty; /* the number of empty buckets */ uint32 * buckets; /* bucket array */ uint32 * overflowPool; /* overflow pool array */ CompactSequence * sequence; /* the reference sequence */ unsigned char * 
emptyBits; /* the bitmap for empty buckets (only used in building phase) */ unsigned char * collisionBits; /* the bitmap for collision buckets (only used in building phase) */ unsigned char * overflowBits; uint32 maxScan; int nMaxError; int nMaxGap; int maxQual; int maxRepeat; int widthKeySpan; int keyPartitions[10]; const static uint32 nHistogram = 28; uint32 histogram[nHistogram]; char seq1[256]; char seq2[256]; char align1[256]; char align2[256]; uint32 statSeqProbe; uint32 statProbe; uint32 statEmpty; uint32 statCollision; bool bUseEmbedTables; int64 headMask[WORDS_PER_READ]; int64 embedHeadMask[WORDS_PER_READ]; int embedShreshold; int numEmbedTables; int numLongLists; int numEmbedTablesPerList; int * embedTableSizes; uint32 * embedTableBucketIds; EmbedHashTable * embedTables; unsigned char * embedBits; public: HashTable(); ~HashTable(); void init(CompactSequence * seq, int len, unsigned int nBucket, int numMismatch, int numInsert, int numDelete, int nPartition, int maxRepeat, bool useEmbedTables, int index); int preProcessInit(); int preProcessEnd(); int buildInit(); void preProcessInsert(int64 * key); /*inline*/ void insert(int64 * key, unsigned int offset); /*inline*/ int buildEmbedTable(); unsigned int lookupEmbedTable(int64 * orgkey, uint32 bucketId, int keyOffset, char * quals, strand s, int rid, HitSet * hits, bool noGap); void preinsertEmbedTableEntry(int64 * query, int embedId, int keyId); void insertEmbedTableEntry(int64 * query, uint32 seqOffset, int embedId, int keyId); unsigned int lookup(int64 * orgkey, int64 * key, int keyOffset, char * quals, strand s, int rid, HitSet * hits, bool noGap = false); bool lookup(int64 * key, uint32 offset); bool longOverflowList(uint32 bucketId, uint32 maxlen); int removeRepeat(uint32 num); void checkRepeat(uint32 num); int sortList(); int check(int num); int bulkLookup(int64 * keys, int num); int save(char * path); int load(char * path, int index, CompactSequence * seq); int remove(); uint32 
printOverflowList(uint32 bucketId, uint32 keyOffset, int64 * key); void setScanThreshold(double r); void resetStat() { statProbe = 0; statSeqProbe = 0; statEmpty = 0; statCollision = 0; } int getMaxScan() { return maxScan; } bool useEmbedTables() { return bUseEmbedTables; } void printStat() { printf("Average scan length: %.2f\n", (double) statSeqProbe / statProbe); printf("Avergae lookups: %.2f\n", statProbe / 3000000.0); printf("Scan: %u Same: %u \n", statSeqProbe, statEmpty); } void setErrorModel(int maxerr, int maxgap, int maxqual) { nMaxError = maxerr; nMaxGap = maxgap; maxQual = maxqual; } void setLookupInfo(int keySpan, int * partitions) { widthKeySpan = keySpan; memcpy(keyPartitions, partitions, sizeof(int) * nPartition); } private: // void outputAlignment(int64 * orgkey, int64 * entrykey, uint32 offset, bool forward); // void outputSequence(int64 * key, int len, char * str); unsigned int nextPrime(unsigned int num); }; #endif wham/main.cpp0000644001532600153260000005165012054641424012501 0ustar yinanyinan/** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ /* $Id: main.cpp 165 2012-11-26 10:23:16Z yinan $ */ #include #include #include #include #include #include #ifndef WIN32 #include #endif #include #include #include "error.h" #include "aligner.h" #include "sequence.h" #include "pair.h" #include "short.h" #include "edit_distance.h" #include "perfcounters.h" #include "rdtsc.h" #include "util.h" using namespace std; extern int ELOG_LEVEL; char pgversion[] = "0.1.5"; char * pgcommand; Aligner * aligner; char outputpath[MAX_LENGTH_PATH] = ""; int maxerr = 0; int maxgap = 0; int nThread; int k = 1, m = 0; bool sorted = false, strata = false; PerfCounters perfcounter; unsigned long long perfctr[4]; int minins = 0, maxins = 250; char * alignFileName = NULL, *unalignFileName = NULL; bool concatenate = true; AlignRes * threadres; extern int64 statHashLookup; extern int64 statHashLookupEntry; extern int64 statEmbedHashLookup; extern int64 statEmbedHashLookupEntry; typedef struct ThreadInfo { int id; AlignInfo info; char outputpath[MAX_LENGTH_PATH]; } ThreadInfo; void printStatInfos() { elog(DEBUG1, "Hash Lookups: %llu\n", statHashLookup); elog(DEBUG1, "Hash Lookup Entries: %llu\n", statHashLookupEntry); elog(DEBUG1, "Embed Hash Lookups: %llu\n", statEmbedHashLookup); elog(DEBUG1, "Embed Hash Lookup Entries: %llu\n", statEmbedHashLookupEntry); } /* * skip a line in a text file */ int skipLine(FILE * file) { char c; int i; i = 0; while (1) { c = fgetc(file); i++; if (c == 10 || c == 13 || c == EOF ) break; } return i; } /* * split a comma list */ char ** commaList(char * str, int & num) { int i; char * pch; char ** tok; num = 1; pch = strchr(str, ','); while (pch != NULL) { num++; pch = strchr(pch + 1, ','); } tok = new char *[num]; i = 0; pch = strtok(str, ","); while (pch != NULL) { tok[i] = new char[strlen(pch) + 1];strcpy(tok[i], pch); i++; pch = strtok (NULL, ","); } return tok; } /* * convert string parameter to integer */ int getArguVal(char * str) { char * c; int base; c = strchr(str, 'k'); if (c == NULL ) c = 
strchr(str, 'K'); if (c == NULL ) c = strchr(str, 'm'); if (c == NULL ) c = strchr(str, 'M'); if (c == NULL ) c = strchr(str, 'g'); if (c == NULL ) c = strchr(str, 'G'); if (c == NULL ) return atoi(str); if (*c == 'k' || *c == 'K') base = 1024; else if (*c == 'm' || *c == 'M') base = 1024 * 1024; else if (*c = 'g' || *c == 'G') base = 1024 * 1024 * 1024; else base = 1; *c = '\0'; return base * atoi(str); } void getVersion(char * buf) { ifstream versionfile("VERSION", ifstream::in); versionfile.getline(buf, 16); versionfile.close(); } int printversion() { printf("WHAM Version %s\n", pgversion); return SUCCESS; } int printhelp() { printf("Usage:\n"); printf( " wham [options]* { | -1 -2 } \n"); printf( " comma-separated list of files containing unpaired reads\n"); printf( " comma-separated list of files containing upstream mates\n"); printf( " comma-separated list of files containing downstream mates\n"); printf( " write wham data to files with this dir/basename\n"); printf(" file to write alignments to\n"); printf("Input options:\n"); printf(" -l use first bases in each read\n"); printf("Alignment options:\n"); printf( " -v specify the max number of errors in a reported alignment.\n"); printf( " -g/--gap specify the max number of gaps in a reported alignment.\n"); printf(" -e/--maqerr max sum of mismatch quals across alignment\n"); printf( " --nofw/--norc do not align to forward/reverse-complement ref strand\n"); printf( " --nofr/--norf do not align to mate1/mate2 strand: fw/rev, rev/fw.\n"); // printf(" --noff/--norr do not align to mate1/mate2 strand: fw/fw, rev/rev.\n"); printf( " -I/--minins minimum insert size for paired-end alignment (default: 0).\n"); printf( " -X/--maxins maximum insert size for paired-end alignment (default: 250).\n"); printf("Reporting options:\n"); printf( " -k report up to valid alignemtns per read (default: 1).\n"); printf(" -a/--all report all valid alignments per read.\n"); printf( " --best reprot valid alignments in a sorted order of 
quality.\n"); printf( " -m discard reads with more than valid alignmetns.\n"); printf("Output options:\n"); printf(" -S/--sam write alignment in SAM format\n"); printf(" --al wirte aligned reads/pairs to file(s) \n"); printf( " --un write unaligned reads/pairs to file(s) \n"); printf("Performance options:\n"); printf(" -t specify the number of threads\n"); printf( " --nocat do not concatenate results from various threads\n"); printf( " --step specify the number of indexes that fit into memory.\n"); printf("Other options:\n"); // printf(" --pipeline load and lookup hash indexes one by one\n"); printf(" --version print version information\n"); printf(" -h/--help print this usage message\n"); return SUCCESS; } /* * concatenate all command options */ char * getCommand(int argc, char * argv[]) { int i, len = 0; char * str; for (i = 0; i < argc; i++) len += strlen(argv[i]) + 1; str = new char[len]; for (i = 0; i < argc; i++) { if (i > 0) strcat(str, " "); strcat(str, argv[i]); } return str; } /* * merge output files generated by various threads */ void mergeFiles(string fname) { int i; string command; char buf[10]; command = "cat "; for (i = 0; i < nThread; i++) { sprintf(buf, ".t%d ", i); command += fname + buf; } command += "> " + fname; system(command.c_str()); for (i = 0; i < nThread; i++) { sprintf(buf, ".t%d", i); command = "rm -f " + fname + buf; system(command.c_str()); } } /* * merge alignments, aligned reads, unaligned reads generated by various threads */ void merge(char * outputpath, char * alignFileName, char * unalignFileName, bool paired) { string path; // merge output files if (outputpath[0] != '\0') { path.assign(outputpath); mergeFiles(path); } //merge aligned read files if (alignFileName != NULL) { path.assign(alignFileName); if (paired) { mergeFiles(path + "_1"); mergeFiles(path + "_2"); } else { mergeFiles(path); } } //merge unaligned read files if (unalignFileName != NULL) { path.assign(unalignFileName); if (paired) { mergeFiles(path + "_1"); 
mergeFiles(path + "_2"); } else { mergeFiles(path); } } } /* * thread procedure to align reads */ void * alignThreadProc(void * value) { AlignRes res; ThreadInfo * info = (ThreadInfo *) value; // cpu_set_t mask; // CPU_ZERO(&mask); // CPU_SET(info->id, &mask); // if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) // elog(ERROR, "set affinity error\n"); res = aligner->align(&(info->info), info->outputpath); info->info.reader1->flush(); if (info->info.reader2) info->info.reader2->flush(); threadres[info->id] = res; return NULL; } /* * load indexes, align reads, and output results. Using multithreading if necessary */ int align(char * basepath, AlignInfo * info, char * filename, int nThread) { int i, j, ret; double t; pthread_attr_t attr; ShortRead ** partitions1, **partitions2; pthread_t * threadpool; Timer timer; AlignRes res; ThreadInfo * infos; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); /* make sure the index can fit into memory */ elog(INFO, "loading WHAM indexes...\n"); /* load index header */ aligner = new Aligner(basepath); if (aligner->getReadLength() != info->reader1->getReadLength()) { elog(ERROR, "Unmatched read length (index: %d, reads: %d)\n", aligner->getReadLength(), info->reader1->getReadLength()); return ERR_PARA; } if (maxgap > 0 && aligner->allowGap() == false) { elog(ERROR, "The index does not support indel.\n"); return ERR_PARA; } aligner->printInfo(); //!!!this function should be modified aligner->setErrorModel(maxerr, maxgap, info->maxQual); /* load hash tables */ timer.start(); ret = aligner->loadHashtables(basepath); if (ret != SUCCESS) { if (ret == ERR_MEM) elog(ERROR, "No enough memory\n"); return ret; } elog(INFO, "loading time: %.0f sec\n", timer.stop()); // aligner->setScanThreshold(info->scanThreshold); /* elog(INFO, "testing WHAM indexes..."); ret = aligner->check(1000); if (ret != SUCCESS) { elog(ERROR, "failed to pass the tests. 
\n"); return ret; } elog(INFO, "pass.\n"); */ elog(INFO, "align reads using indexes...\n"); if (nThread == 1) { /* quick path for single thread */ #ifdef PERFCOUNT unsigned long long timer1; perfcounter.init(); startTimer(&timer1); perfcounter.threadinit(); perfcounter.writeCounters(&perfctr[0], &perfctr[1]); #endif info->showBar = true; timer.start(); /* single-thread execution */ res = aligner->align(info, filename); t = timer.stop(); #ifdef PERFCOUNT perfcounter.writeCounters(&perfctr[2], &perfctr[3]); stopTimer(&timer1); printf("Cycles: %lld, PerfCnt 1: %lld, PerfCnt 2: %lld\n", timer1, perfctr[2]-perfctr[0], perfctr[3]-perfctr[1]); #endif } else { threadpool = new pthread_t[nThread]; infos = new ThreadInfo[nThread]; threadres = new AlignRes[nThread]; bool pair = (info->reader2 != NULL); /* multithreading execution */ partitions1 = info->reader1->split(nThread); if (pair) partitions2 = info->reader2->split(nThread); for (int i = 0; i < nThread; i++) { infos[i].id = i; infos[i].info = *info; infos[i].info.reader1 = partitions1[i]; if (pair) infos[i].info.reader2 = partitions2[i]; else infos[i].info.reader2 = NULL; //only the first thread shows the progress bar if (i == 0) infos[i].info.showBar = true; else infos[i].info.showBar = false; if (outputpath[0] == '\0') infos[i].outputpath[0] = '\0'; else sprintf(infos[i].outputpath, "%s.t%d", outputpath, i); assert( !pthread_create(&threadpool[i], &attr, alignThreadProc, (void *)&infos[i])); } pthread_attr_destroy(&attr); timer.start(); for (int i = 0; i < nThread; i++) assert(!pthread_join(threadpool[i], NULL)); t = timer.stop(); elog(INFO, "\n"); res.nRead = res.nValidRead = res.nValidAlignment = 0; for (int i = 0; i < nThread; i++) { res.nValidAlignment += threadres[i].nValidAlignment; res.nValidRead += threadres[i].nValidRead; res.nRead += threadres[i].nRead; elog(INFO, "Thread %d: Valid Reads: %u/%u, Valid Alignments: %d\n", i, threadres[i].nValidRead, threadres[i].nRead, threadres[i].nValidAlignment); } if 
(concatenate) { elog(INFO, "collect results...\n"); merge(filename, alignFileName, unalignFileName, pair); } delete[] infos; delete[] threadpool; delete[] threadres; } /* remove hash tables */ ret = aligner->removeHashTables(); if (ret != SUCCESS ) return ret; PairAligner::printTimePairAlign(); elog(INFO, "\n***************************\n"); elog(INFO, "Total Align Time: %.6f sec\n", t); float ratio = (float) res.nValidRead * 100 / res.nRead; elog(INFO, "Valid Reads: %u/%u=%.2f%%\n", res.nValidRead, res.nRead, ratio); elog(INFO, "Valid Alignments: %u\n", res.nValidAlignment); elog(INFO, "***************************\n\n"); aligner->printStat(); return SUCCESS; } int main(int argc, char* argv[]) { int i, j; int numKeys; char ** m1_files = NULL, **m2_files = NULL; int m1_nfile, m2_nfile; int nSeq = 1; bool m1_forward = true, m1_backward = true; bool m2_forward = true, m2_backward = true; bool paired = false; int outputMode = MODE_NORMAL; int ret; CompactSequence * sequence; char basepath[256] = ""; pthread_t * threadpool; ShortRead * reader1 = NULL, *reader2 = NULL; int * threadid; int maxlen = 0; int maxqual = 255; int maxMate = 100; double scanThreshold = 0.001; bool mateMatch = true; bool pairStrand[2][2]; srand((unsigned int) time(NULL)); nThread = 1; pairStrand[FORWARD][FORWARD] = false; pairStrand[FORWARD][BACKWARD] = true; pairStrand[BACKWARD][FORWARD] = true; pairStrand[BACKWARD][BACKWARD] = false; j = 0; for (i = 1; i < argc; i++) { if (strcmp(argv[i], "-l") == 0) maxlen = atoi(argv[++i]); else if (strcmp(argv[i], "-k") == 0) k = atoi(argv[++i]); else if (strcmp(argv[i], "-m") == 0) m = atoi(argv[++i]); else if (strcmp(argv[i], "-a") == 0 || strcmp(argv[i], "--all") == 0) m = k = 0; else if (strcmp(argv[i], "-e") == 0 || strcmp(argv[i], "--maqerr") == 0) maxqual = atoi(argv[++i]); else if (strcmp(argv[i], "-v") == 0) maxerr = atoi(argv[++i]); else if (strcmp(argv[i], "-g") == 0 || strcmp(argv[i], "--gap") == 0) maxgap = atoi(argv[++i]); else if 
(strcmp(argv[i], "-s") == 0) scanThreshold = atof(argv[++i]); else if (strcmp(argv[i], "-t") == 0) nThread = atoi(argv[++i]); else if (strcmp(argv[i], "--best") == 0) sorted = true; else if (strcmp(argv[i], "--hit") == 0) maxMate = atoi(argv[++i]); else if (strcmp(argv[i], "--al") == 0) { alignFileName = new char[256]; strcpy(alignFileName, argv[++i]); } else if (strcmp(argv[i], "--un") == 0) { unalignFileName = new char[256]; strcpy(unalignFileName, argv[++i]); } else if (strcmp(argv[i], "--nofw") == 0) m1_forward = false; else if (strcmp(argv[i], "--norc") == 0) m1_backward = false; else if (strcmp(argv[i], "--nocat") == 0) concatenate = false; else if (strcmp(argv[i], "-S") == 0 || strcmp(argv[i], "--sam") == 0) outputMode = MODE_SAM; else if (strcmp(argv[i], "--nofr") == 0) { pairStrand[FORWARD][BACKWARD] = false; } else if (strcmp(argv[i], "--norf") == 0) { pairStrand[BACKWARD][FORWARD] = false; } else if (strcmp(argv[i], "--noff") == 0) { pairStrand[FORWARD][FORWARD] = false; } else if (strcmp(argv[i], "--norr") == 0) { pairStrand[BACKWARD][BACKWARD] = false; } else if (strcmp(argv[i], "--nomate") == 0) { mateMatch = false; } else if (strcmp(argv[i], "--info") == 0) { i++; if (strcmp(argv[i], "ERROR") == 0) ELOG_LEVEL = ERROR; else if (strcmp(argv[i], "WARNING") == 0) ELOG_LEVEL = WARNING; else if (strcmp(argv[i], "INFO") == 0) ELOG_LEVEL = INFO; else if (strcmp(argv[i], "DEBUG1") == 0) ELOG_LEVEL = DEBUG1; } else if (strcmp(argv[i], "-I") == 0 || strcmp(argv[i], "--minins") == 0) minins = atoi(argv[++i]); else if (strcmp(argv[i], "-X") == 0 || strcmp(argv[i], "--maxins") == 0) maxins = atoi(argv[++i]); else if (strcmp(argv[i], "--version") == 0) { printversion(); return SUCCESS; } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { printhelp(); return SUCCESS; } else if (strcmp(argv[i], "-1") == 0) { j = 1; paired = true; m1_files = commaList(argv[++i], m1_nfile); if (strcmp(argv[++i], "-2") == 0) m2_files = commaList(argv[++i], 
m2_nfile); else { elog(ERROR, "Specify files for mate2 of paired-end reads\n"); printf("See usage message by specifying -h/--help.\n"); return ERR_PARA; } } else { if (argv[i][0] == '-') { printf("Invalid option %s.\n", argv[i]); printf("See usage message by specifying -h/--help.\n"); return ERR_PARA; } if (j == 0) { paired = false; m1_files = commaList(argv[i], m1_nfile); } else if (j == 1) strcpy(basepath, argv[i]); else if (j == 2) strcpy(outputpath, argv[i]); else { printf("Invalid option %s.\n", argv[i]); printf("See usage message by specifying -h/--help.\n"); return ERR_PARA; } j++; } } if (basepath[0] == '\0') { elog(ERROR, "specify the base name.\n"); printf("See usage message by specifying -h/--help.\n"); exit(1); } // if (outputpath[0] == '\0') { // elog(ERROR, "specify the output file name.\n"); // printf("See usage message by specifying -h/--help.\n"); // exit(1); // } if (m1_files == NULL) { elog(ERROR, "specify the read files.\n"); printf("See usage message by specifying -h/--help.\n"); return ERR_PARA; } if (maxgap > MAX_GAP) { elog(ERROR, "WHAM supports up to %d gaps.\n", MAX_GAP); return ERR_PARA; } if (!pairStrand[FORWARD][FORWARD] && !pairStrand[FORWARD][BACKWARD]) m1_forward = false; if (!pairStrand[FORWARD][FORWARD] && !pairStrand[BACKWARD][FORWARD]) m2_forward = false; if (!pairStrand[BACKWARD][FORWARD] && !pairStrand[BACKWARD][BACKWARD]) m1_backward = false; if (!pairStrand[FORWARD][BACKWARD] && !pairStrand[BACKWARD][BACKWARD]) m2_backward = false; /* catenate the command line */ pgcommand = getCommand(argc, argv); elog(INFO, "loading short reads...\n"); if (!paired) { /* single-end read */ reader1 = new ShortRead(); reader1->init(m1_files, m1_nfile, m1_forward, m1_backward, alignFileName, unalignFileName); ret = reader1->load(maxlen); if (ret != SUCCESS) { elog(ERROR, "failed to load the short reads.\n"); return ret; } } else { char * alFileName1 = NULL, *alFileName2 = NULL; char * unFileName1 = NULL, *unFileName2 = NULL; if (alignFileName 
!= NULL) { alFileName1 = new char[256]; alFileName2 = new char[256]; sprintf(alFileName1, "%s_1", alignFileName); sprintf(alFileName2, "%s_2", alignFileName); } if (unalignFileName != NULL) { unFileName1 = new char[256]; unFileName2 = new char[256]; sprintf(unFileName1, "%s_1", unalignFileName); sprintf(unFileName2, "%s_2", unalignFileName); } /* paired-end read */ reader1 = new ShortRead(); reader1->init(m1_files, m1_nfile, m1_forward, m1_backward, alFileName1, unFileName1); ret = reader1->load(maxlen); if (ret != SUCCESS) { elog(ERROR, "failed to load the short reads.\n"); return ret; } reader2 = new ShortRead(); reader2->init(m2_files, m2_nfile, m2_forward, m2_backward, alFileName2, unFileName2); ret = reader2->load(maxlen); if (ret != SUCCESS) { elog(ERROR, "failed to load the short reads.\n"); return ret; } /* check mate1 and mate2 */ if (reader1->getNumReads() != reader2->getNumReads()) { elog(ERROR, "the numbers of reads in mate files does not match.\n"); return ret; } if (reader1->getReadLength() != reader2->getReadLength()) { elog(ERROR, "the lengthes of reads in mate files does not match\n"); return ret; } } if (reader1->getNumReads() == 0) { elog(INFO, "Empty read files.\n"); return SUCCESS; } AlignInfo info; info.reader1 = reader1; info.reader2 = reader2; info.minins = minins; info.maxins = maxins; info.sorted = sorted; info.strata = strata; info.maxHit = k; info.maxMatch = m; info.maxQual = maxqual; info.maxGap = maxgap; info.scanThreshold = scanThreshold; info.mateMatch = mateMatch; info.maxMate = maxMate; info.outputFormat = outputMode; info.concatenate = concatenate; memcpy(info.pairStrand, pairStrand, 4 * sizeof(bool)); ret = align(basepath, &info, outputpath, nThread); if (ret != SUCCESS) { elog(ERROR, "failed to align read files.\n"); return ret; } if (outputpath[0] != '\0') elog(INFO, "see %s for all valid alignments.\n", outputpath); printStatInfos(); return SUCCESS; } wham/unittest.cpp0000644001532600153260000001033112003705361013416 0ustar 
yinanyinan#include #include #include #include "bitread.h" #include "sequence.h" #include "error.h" void genRandomSequence(char * str, int len) { for (int i = 0; i < len; i++) { switch (rand() % 4) { case 0: str[i] = 'A'; break; case 1: str[i] = 'C'; break; case 2: str[i] = 'G'; break; case 3: str[i] = 'T'; break; } } } bool testExtract() { int len, offset; int i, j; char str[1024], str1[256]; char * fname[1]; int64 space[16]; int64 * key = &space[8]; int ret; fname[1] = new char[128]; strcpy(fname[1], "./tmp_test.fa"); genRandomSequence(str, 1000); FILE * file = fopen(fname[1], "w"); for (i = 0; i < 20; i++) { for (j = 0; j < 50; j++) { fprintf(file, "%c", str[i * 50 + j]); } fprintf(file, "\n"); } fclose(file); CompactSequence * sequence = new CompactSequence(true); sequence->build(fname, 1, 36, 2); if (ret != SUCCESS) { printf("Failed to load reference sequences.\n"); return false; } int64 * seq = sequence->getSequence(); printf("Testing extract... "); for (len = 36; len <= 36; len++) { offset = rand() % 500; BitRead::extract(seq, key, offset * BITS_PER_BASE, len * BITS_PER_BASE); CompactSequence::decompose(str1, len, key); for (i = 0; i < len; i++) { if (str[i + offset] != str1[i]) { printf("Failed at %d bps\n", len); return false; } } } printf("Passed\n"); return true; } bool testRemoveHead() { int len; int head, i; char str1[256], str2[256]; int64 space1[16], space2[16]; int64 mask[WORDS_PER_READ]; int64 * key1, *key2; key1 = &space1[8]; key2 = &space2[8]; printf("Testing remove head... 
"); for (len = 36; len <= 128; len++) { genRandomSequence(str1, len); CompactSequence::compose(str1, len, key1); for (head = len - 10; head < len; head++) { BitRead::genHeadMask(mask, head * BITS_PER_BASE); BitRead::removeHead(key1, key2, mask); CompactSequence::decompose(str2, head, key2); /* printf("\n"); for (i = 0; i < len; i++) printf("%c", str1[i]); printf("\n"); for (i = 0; i < len - head; i++) printf(" "); for (i = 0; i < head; i++) printf("%c", str2[i]); printf("\n"); */ //check for (i = 0; i < head; i++) { if (str1[i + len - head] != str2[i]) { printf("Failed at %d bps, %d head\n", len, head); return false; } } } } printf("Passed\n"); return true; } bool testRemoveInterval() { int len; int p, offset, i; char str1[256], str2[256]; int64 space1[16], space2[16]; int64 * key1, *key2; key1 = &space1[8]; key2 = &space2[8]; printf("Testing remove interval... "); for (len = 36; len <= 128; len++) { genRandomSequence(str1, len); CompactSequence::compose(str1, len, key1); for (p = 2; p <= 8; p++) { for (offset = 0; offset < len / p * p; offset += len / p) { BitRead::removeInterval(key1, key2, offset * BITS_PER_BASE , len / p * BITS_PER_BASE); CompactSequence::decompose(str2, len - len / p, key2); /* for (i = 0; i < len; i++) printf("%c", str1[i]); printf("\n"); for (i = 0; i < len - offset - len/p; i++) printf("%c", str2[i]); for (i = 0; i < len/p; i++) printf(" "); for (i = len - offset - len/p; i < len - len/p; i++) printf("%c", str2[i]); printf("\n\n"); */ //check for (i = 0; i < len - offset - len / p; i++) { if (str1[i] != str2[i]) { printf("Failed at %d bps, %d partition, %d offset\n", len, p, offset); return false; } } for (i = len - offset - len / p; i < len - len / p; i++) { if (str1[i + len / p] != str2[i]) { printf("Failed at %d bps, %d partition, %d offset\n", len, p, offset); return false; } } } } } printf("Passed\n"); return true; } int main() { int i = 1; int j = 2; // testExtract(); testRemoveHead(); testRemoveInterval(); return 0; } 
wham/builder.cpp0000644001532600153260000001733312054641424013203 0ustar yinanyinan/** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: builder.cpp 165 2012-11-26 10:23:16Z yinan $ */ #include #include #include #include #include #ifndef WIN32 #include #endif #include "error.h" #include "aligner.h" #include "sequence.h" #include "short.h" #include "edit_distance.h" #define BUILD_MODE_PIPELINE 1 #define BUILD_MODE_ALL 2 extern int ELOG_LEVEL; char pgversion[] = "0.1.5"; char * pgcommand; char ** commaList(char * str, int & num) { int i; char * pch; char ** tok; num = 1; pch = strchr(str, ','); while (pch != NULL) { num++; pch = strchr(pch + 1, ','); } tok = new char *[num]; i = 0; pch = strtok(str, ","); while (pch != NULL) { tok[i] = new char[strlen(pch) + 1];strcpy(tok[i], pch); i++; pch = strtok (NULL, ","); } return tok; } char * getCommand(int argc, char * argv[]) { int i, len = 0; char * str; for (i = 0; i < argc; i++) len += strlen(argv[i]) + 1; str = new char[len]; for (i = 0; i < argc; i++) { if (i > 0) strcat(str, " "); strcat(str, argv[i]); } return str; } int getArguVal(char * str) { char * c; int base; c = strchr(str, 'k'); if (c == NULL ) c = strchr(str, 'K'); if (c == NULL ) c = strchr(str, 'm'); if (c == NULL ) c = strchr(str, 'M'); if (c == NULL ) c = strchr(str, 'g'); if (c == NULL ) c = 
strchr(str, 'G'); if (c == NULL ) return atoi(str); if (*c == 'k' || *c == 'K') base = 1024; else if (*c == 'm' || *c == 'M') base = 1024 * 1024; else if (*c = 'g' || *c == 'G') base = 1024 * 1024 * 1024; else base = 1; *c = '\0'; return base * atoi(str); } int printversion() { printf("WHAM Version %s\n", pgversion); return SUCCESS; } int printhelp() { printf("Usage:\n"); printf(" wham-build [options]* -l \n"); printf(" -l specify the length of short reads\n"); printf(" comma-separated list of fiels with ref sequences\n"); printf(" write wham data to files with this dir/basename\n"); printf("Options:\n"); printf(" -v report hits with <=v errors (0-5), ignore qualities\n"); printf(" -p specify the number of fragments for alignments\n"); printf(" -m discard subsequences appearing more than times (default: 100).\n"); printf(" -b specify the number of buckets\n"); printf(" -a find all valid matches (need much more memory)\n"); // printf(" --pipeline build and save hash tables one by one\n"); printf(" --mask keep masked characters in the sequences (default: on)\n"); printf(" --unmask discard masked characters in the sequences. 
Masks are treated as Ns\n"); printf(" --version print version information\n"); printf(" -h/--help print this usage message\n"); return SUCCESS; } int main(int argc, char* argv[]) { int i, j; int len, nPartition, nMismatch, nBucket; bool isEmbedHashTable; char ** fname = NULL; int nSeq = 1; int ret; int build_mode = BUILD_MODE_ALL; int skipMask = false; int maxRepeat = 100; CompactSequence * sequence; char basepath[128] = ""; srand((unsigned int) time(NULL)); len = 0; nPartition = 0; nMismatch = 2; nBucket = 0; isEmbedHashTable = false; for (i = 1, j = 0; i < argc; i++) { if (strcmp(argv[i], "-l") == 0) len = getArguVal(argv[++i]); else if (strcmp(argv[i], "-p") == 0) nPartition = getArguVal(argv[++i]); else if (strcmp(argv[i], "-v") == 0) nMismatch = getArguVal(argv[++i]); else if (strcmp(argv[i], "-m") == 0) maxRepeat = getArguVal(argv[++i]); else if (strcmp(argv[i], "-a") == 0) isEmbedHashTable = true; else if (strcmp(argv[i], "-b") == 0) nBucket = getArguVal(argv[++i]); else if (strcmp(argv[i], "--mask") == 0) skipMask = false; else if (strcmp(argv[i], "--unmask") == 0) skipMask = true; // else if (strcmp(argv[i], "--pipeline") == 0) // build_mode = BUILD_MODE_PIPELINE; else if (strcmp(argv[i], "--version") == 0) { printversion(); return SUCCESS; } else if (strcmp(argv[i], "--info") == 0) { i++; if (strcmp(argv[i], "ERROR") == 0) ELOG_LEVEL = ERROR; else if (strcmp(argv[i], "WARNING") == 0) ELOG_LEVEL = WARNING; else if (strcmp(argv[i], "INFO") == 0) ELOG_LEVEL = INFO; else if (strcmp(argv[i], "DEBUG1") == 0) ELOG_LEVEL = DEBUG1; } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { printhelp(); return SUCCESS; } else { if (argv[i][0] == '-') { printf("Invalid option %s.\n", argv[i]); printhelp(); return ERR_PARA; } if (j == 0) fname = commaList(argv[i], nSeq); else if (j == 1) strcpy(basepath, argv[i]); else { printf("Invalid option %s.\n", argv[i]); printhelp(); return ERR_PARA; } j++; } } if (len == 0) { elog(ERROR, "specify the length of 
reads using -l\n"); return ERR_PARA; } if (basepath[0] == '\0') { elog(ERROR, "specify the base name.\n"); return ERR_PARA; } if (fname == NULL) { elog(ERROR, "specify the read files.\n"); return ERR_PARA; } if (len > 128) { elog(ERROR, "WHAM supports read up to 128bps.\n"); return ERR_PARA; } if (nMismatch > 5) { elog(ERROR, "specify the number of errors in range (0,5)\n"); return ERR_PARA; } // if (nPartition == 0) // nPartition = nMismatch + 1; if (nPartition > 0 && nPartition <= nMismatch) { elog(ERROR, "Specify a greater value of nPartition (>nMismatch) using -p\n"); return ERR_PARA; } /* save the command line */ pgcommand = getCommand(argc, argv); /* if (skipMask) { maxRepeat = 0; } */ elog(INFO, "length: %d, #mismatch: %d, #partition: %d\n", len, nMismatch, nPartition); elog(INFO, "loading reference sequences...\n"); sequence = new CompactSequence(skipMask); ret = sequence->build(fname, nSeq, len, nMismatch); if (ret != SUCCESS) { elog(ERROR, "failed to load reference sequences.\n"); return ret; } int nChar = sequence->getNum(); elog(INFO, "saving the reference sequences...\n"); if (sequence->save(basepath) != SUCCESS) { elog(ERROR, "failed to save reference sequences.\n"); return ret; } Aligner * aligner = new Aligner(); ret = aligner->init(sequence, len, nPartition, nBucket, nMismatch, 0, 0, maxRepeat, isEmbedHashTable); if (ret != SUCCESS) { elog(ERROR, "failed to initialize the aligner.\n"); return ret; } elog(INFO, "building WHAM index...\n"); ret = aligner->build(basepath); if (ret != SUCCESS) { if (ret == ERR_MEM) elog(ERROR, "No enough memory.\n"); else elog(ERROR, "failed to build WHAM index.\n"); return ret; } elog(INFO, "Complete.\n"); return SUCCESS; } wham/short.h0000644001532600153260000000737112003705361012355 0ustar yinanyinan#ifndef _SHORT_READ_H_ #define _SHORT_READ_H_ /** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or 
modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: short.h 157 2012-07-25 05:58:09Z yinan $ */ #include #include #include "sequence.h" #include "bitread.h" #include "error.h" typedef int strand; #define FORWARD 0 #define BACKWARD 1 class ShortRead { private: char ** fnames; /* file names */ int nFile; /* the number of files */ bool forward; /* forward scan */ bool backward; /* backward scan */ int length; /* length of short read (characters) */ int nRead; //query sequence int nReadPerAlign; int64 * reads; //qname bool storeName; int lenName; char * names; //qualities bool storeQual; int lenQual; char * quals; //output char alFileName[256]; char unFileName[256]; FILE * alfile; FILE * unfile; int sidx[2]; /* mapping strand to offset in array */ char code2Base[8]; public: ShortRead(); ShortRead(ShortRead * read); ~ShortRead(); int init(char ** files, int numFile, bool fw, bool bw, char * alignFileName, char * unalignFileName); int load(int maxlen); ShortRead ** split(int num); inline int getReadLength() { return length; } inline int64 * getRead(int id, strand s = 0) { return &reads[id * WORDS_PER_READ * nReadPerAlign + sidx[s] * WORDS_PER_READ]; } inline int getNumReads() { return nRead; } inline char * getRefName(int i) { return fnames[i]; } inline char * getReadName(int i) { return &names[i * lenName]; } inline char * getQual(int i) { return &quals[i * lenQual]; } inline bool isForward() { return forward; } inline bool isBackward() { return backward; } inline char * 
getAlignFileName() { if (alfile) return alFileName; else return NULL; } inline char * getUnalignFileName() { if (unfile) return unFileName; else return NULL; } inline void printAlign(int i) { if (alfile != NULL ) printRead(i, alfile); } inline void printUnalign(int i) { if (unfile != NULL ) printRead(i, unfile); } void flush() { if (alfile != NULL) { if (fflush(alfile) != 0) elog(ERROR, "file error when flush align file.\n"); if (fclose(alfile) != 0) elog(ERROR, "file error when close align file.\n"); } if (unfile != NULL) { if (fflush(unfile) != 0) elog(ERROR, "file error when flush unalign file.\n"); if (fclose(unfile) != 0) elog(ERROR, "file error when close unalign file.\n"); } } int getReadNameLength(char * str) { char * p; p = strchr(str, ' '); if (p == NULL ) return strlen(str) + 1; else return p - str + 1; } void extractReadName(char * str1, char * str2) { char * p; p = strchr(str2, ' '); if (p != NULL ) *p = '\0'; strcpy(str1, str2); } private: int readLine(FILE * file, char * str, int maxLength); int allocate(); int preProcess(int maxlen); void printRead(int id, FILE * file); }; #endif wham/model.h0000644001532600153260000000313112003705361012304 0ustar yinanyinan#ifndef _MODEL_H_ #define _MODEL_H_ /** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/

/* $Id: model.h 157 2012-07-25 05:58:09Z yinan $ */

/*
 * AlignerModel
 * static helpers for sizing the index (number of partitions, hash
 * tables and memory). Only the declarations are visible in this
 * header; see the implementation file for the actual model.
 */
class AlignerModel {
public:
	static int estimateNumPartition(unsigned int nEntry, int length, int nError, bool memory);
	static unsigned int getNumHashtableFitMemory(unsigned int nEntry, int length, int nMismatch, int nPartition);
	static unsigned int estimateIndexSpace(unsigned int nEntry, int length, int nError, int nPartition);
	static bool isFitMemory(unsigned int nEntry, int length, int nError, int nPartitiion);
private:
	static int computeNumLookup(int nError, int nPartition);
	static int computeNumIndex(int nError, int nPartition);
	static unsigned int getFreeMemory();
	static unsigned int estimateHashtableSpace(unsigned int nEntry, int length, int nError, int nPartition);
};

#endif
wham/edit_distance.cpp0000644001532600153260000002132612003705361014344 0ustar  yinanyinan#include
#include "edit_distance.h"
using namespace std;

/* dynamic programming matrix and traceback matrix shared by all calls */
int ** F;
char ** traceback;

/* Description of the edit_distance algorithm
 * Given q_seq and db_seq it returns an alignment between the two sequences.
 *
 * We have a (q_len + 1) x (db_len + 1) edit distance matrix.
 * Row 0 is initialized with 0.
 * Column 0 is initialized with the row number.
 *
 * [i][j] = min ( [i-1][j] + 1 , [i][j-1] + 1 , [i-1][j-1] + {0 if db[j-1] == query[i-1], 1 otherwise} )
 * This is very similar to the Needleman-Wunsch global alignment algorithm.
 *
 * A traceback matrix of the same size is also created. Depending on which of the three
 * candidates is the minimum, the traceback cell is set accordingly.
 *
 * Once we have the traceback, we look at the last row and start building the alignment
 * from the column with the lowest score in that row.
 *
 * For example, if the last row is 7,6,6,5,4,3,2,4,5 then we start building the alignment
 * from the 2. The reason is that db_len is larger than q_len (to leave room for gaps), so
 * a better score somewhere in the last row means the last few characters of db_seq can
 * be ignored.
 *
 * From that starting location we follow the traceback matrix and build the alignment.
 *
 * The alignment is therefore built in reverse order and has to be reversed afterwards.
* * * */ void edit_distance_init() { int i; int L1 = 99; int L2 = 99; // Dynamic programming matrix F = (int **) malloc(sizeof(int *) * (L2 + 1)); for (i = 0; i <= L2; i++) F[i] = (int *) malloc(sizeof(int) * (L1 + 1)); // Traceback matrix traceback = (char **) malloc(sizeof(char *) * (L2 + 1)); for (i = 0; i <= L2; i++) traceback[i] = (char *) malloc(sizeof(char) * (L1 + 1)); } scoreinfo edit_distance(char *q_seq, char *db_seq, int q_len, int db_len, char *align_query, char *align_db, int prm) { int gap_penalty = obj_score_mat.gap_penalty; /* gap penalty */ // can not assume that q_seq and db_seq are null terminated int L1 = db_len; //strlen(db_seq); int L2 = q_len; //strlen(q_seq); int i; #if EDIT_DEBUG printf("db_len : %d , query_len : %d \n",L1,L2); #endif //printf("inside nw() %s %s ",db_seq,q_seq); // Initialize traceback and F matrix (fill in first row and column) matrix_init(F, traceback, L1, L2, gap_penalty); // Create alignment scoreinfo ret = edit_distance_align(F, traceback, q_seq, db_seq, q_len, db_len, align_query, align_db, gap_penalty); //print_score(ret); #if EDIT_DEBUG cout << "Length after alignment: " << ret.align_len << endl; #endif if (prm) { printf("\nEdit Distance matrix:\n\n"); print_matrix(F, db_seq, q_seq, db_len, q_len); printf("\nTraceback matrix: \n\n"); print_traceback(traceback, db_seq, q_seq, db_len, q_len); printf("\n"); } //for( int i = 0; i <= L2; i++ ) delete F[ i ]; //delete [] F; //for( int i = 0; i <= L2; i++ ) delete traceback[ i ]; //delete [] traceback; return ret; } void matrix_init(int ** F, char ** traceback, int L1, int L2, int d) { F[0][0] = 0; traceback[0][0] = 'n'; int i = 0, j = 0; // initialize 1st row to 0 for (j = 1; j <= L1; j++) { F[0][j] = 0; traceback[0][j] = '-'; } for (i = 1; i <= L2; i++) { F[i][0] = i; traceback[i][0] = '|'; } } /* * Logic of edit_distance Alignment. * given two string q_seq and db_seq. we perform edit_distance based alignment. 
* * matrix F is populated using dynamic programming appraoch. * * * * */ scoreinfo edit_distance_align( // Needleman-Wunsch algorithm int ** F, char ** traceback, char * q_seq, char * db_seq, int q_len, int db_len, char *query_align, char *db_align, int gap_penalty // Gap penalty ) { //char * db_align = tmp_seq_1_al; //char * query_align = tmp_seq_2_al; int k = 0, x = 0, y = 0; int fU, fD, fL; char ptr; //, nuc ; int i = 0, j = 0; scoreinfo ret = { 0, 0, 0, 0, 0, 0, 0 }; int L1 = db_len; int L2 = q_len; int s; for (i = 1; i <= L2; i++) { for (j = 1; j <= L1; j++) { //nuc = seq_1[ j-1 ] ; if (db_seq[j - 1] == q_seq[i - 1]) { s = 0; } else { s = 1; } fU = F[i - 1][j] + 1; fD = F[i - 1][j - 1] + s; fL = F[i][j - 1] + 1; F[i][j] = min(fU, fD, fL, &ptr); traceback[i][j] = ptr; } } i--; j--; // instead of starting look back from [qlen][dlen] start from minimum in [qlen] row. int loc_j = j; ret.score = F[i][j]; for (int jj = j; jj >= 0; jj--) { if (ret.score > F[i][jj]) { //TODO : we need to check if we should move when min score is equal to score on left . moving when equal might put more gap. 
ret.score = F[i][jj]; loc_j = jj; } } #ifdef EDIT_DEBUG printf("Best score %d found at [%d][%d]",ret.score,i,loc_j); #endif j = loc_j; //print_traceback(traceback,seq_1,seq_2); int kk = 0; while (i > 0 || j > 0) { //int tmp_db = strlen(db_align),tmp_q = strlen(query_align),tmp_i=i,tmp_j = j; char tmp_trace = traceback[i][j]; switch (traceback[i][j]) { case '|': db_align[k] = '-'; query_align[k] = q_seq[i - 1]; // db_align is initialized to 0 so direct assignment can be made here i--; ret.dgap++; break; case '\\': db_align[k] = db_seq[j - 1]; query_align[k] = q_seq[i - 1]; if (db_seq[j - 1] != q_seq[i - 1]) ret.mm++; else ret.n_iden++; i--; j--; break; case '-': db_align[k] = db_seq[j - 1]; query_align[k] = '-'; ret.qgap++; j--; } k++; } ret.align_len = k; //print_score(ret); // check if gap occur at any end of query_align ( qstring ) and if it occurs then correct the score and qgap and accordingly while (query_align[0] == '-') { // gap at beginning ( or at the end of actual alignment currently its reversed ) ret.qgap--; int l1 = ret.align_len; // move alignment to right its equivalent to saying query_align = query_align+1; for (int i = 0; i < l1; i++) { query_align[i] = query_align[i + 1]; db_align[i] = db_align[i + 1]; } ret.align_len--; } //print_score(ret); while (query_align[ret.align_len - 1] == '-') { ret.pos++; // string is reversed righnow to pos should increase when its at the end ret.qgap--; //ret.score -= obj_score_mat.gap_penalty; query_align[ret.align_len - 1] = '\0'; db_align[ret.align_len - 1] = '\0'; ret.align_len--; } //print_score(ret); reverse_str(db_align, ret.align_len); reverse_str(query_align, ret.align_len); return ret; } int min(int fu, int fd, int fl, char * ptr) { int min = 0; if (fd <= fu && fd <= fl) { min = fd; *ptr = '\\'; } else if (fu < fl) { min = fu; *ptr = '|'; } else { min = fl; *ptr = '-'; } return min; } void print_matrix(int ** F, char * seq_1, char * seq_2, int L1, int L2) { int i, j; printf(" "); for (j = 0; j < L1; j++) { 
printf("%c ", seq_1[j]); } printf("\n"); for (i = 0; i <= L2; i++) { if (i > 0) { printf("%c ", seq_2[i - 1]); } for (j = 0; j <= L1; j++) { //cout.width( 3 ); printf("%d ", F[i][j]); } printf("\n"); } } void print_traceback(char ** traceback, char * seq_1, char * seq_2, int L1, int L2) { char line[100] = ""; int i, j; printf(" "); for (j = 0; j < L1; j++) { char tmp[4] = ""; sprintf(tmp, "%c ", seq_1[j]); strcat(line, tmp); } printf(" %s \n", line); line[0] = '\0'; for (i = 0; i <= L2; i++) { char tmp[4] = ""; if (i > 0) { sprintf(tmp, "%c ", seq_2[i - 1]); strcat(line, tmp); } for (j = 0; j <= L1; j++) { sprintf(tmp, "%c ", traceback[i][j]); strcat(line, tmp); } printf("%s \n", line); line[0] = '\0'; } } void print_al(char * db_align, char * query_align) { printf("DB:\t%s\n", db_align); printf("Q:\t%s\n", query_align); } void initialize_str(char *str, int len) { int i; for (i = 0; i < len; i++) { str[i] = '\0'; } } void reverse_str(char *str, int len) { char tmp; //int len= strlen(str),i; for (int i = 0; i < len / 2; i++) { tmp = str[i]; str[i] = str[len - 1 - i]; str[len - 1 - i] = tmp; } } void print_score(scoreinfo s) { printf( " Score: %d, n_iden: %d,Pos: %d Mismatch: %d, dgap: %d, s.qgap: %d, align_len %d\n", s.score, s.n_iden, s.pos, s.mm, s.dgap, s.qgap, s.align_len); } /* * */ #ifdef EXE int main(int argc,char *argv[]) { char *query = argv[1]; char *db = argv[2]; if(argc != 4 ) { printf("%d , usage : ./a.out query db prm",argc); return -1; } int prm = atoi(argv[3]); int q_len = strlen(query); int db_len = strlen(db); char query_align[MAX],db_align[MAX]; initialize_str(query_align,MAX); initialize_str(db_align,MAX); scoreinfo score = edit_distance(query,db,q_len,db_len,query_align,db_align,prm); print_score(score); print_al(db_align,query_align); } #endif wham/error.h0000644001532600153260000000243712003705361012345 0ustar yinanyinan#ifndef _ERROR_H_ #define _ERROR_H_ /** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University 
of Wisconsin
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/* $Id: error.h 157 2012-07-25 05:58:09Z yinan $ */

/*
 * return codes. SUCCESS is 0; every ERR_* value is non-zero, so callers
 * test results with "!= SUCCESS".
 */
#define SUCCESS 0
#define ERR_FILE 500
#define ERR_PARA 501
#define ERR_MEM 502
#define ERR_CHECK 503
#define ERR_SEQ 504
#define ERR_LONGSEQ 505
#define ERR_READ_FORMAT 506
#define ERR_INDEX 507

#define MSG_HITSETFULL 600

/*
 * macro for message level.
 * These are the values passed as the first argument of elog() and
 * stored in the global ELOG_LEVEL.
 */
#define ERROR 0
#define WARNING 1
#define INFO 2
#define DEBUG1 3
#define DEBUG2 4
#define DEBUG3 5

/* log a printf-style message at the given message level. */
void elog(int level, char * format, ...);

#endif
wham/writer.cpp0000644001532600153260000004511312003705361013061 0ustar  yinanyinan/**
 * WHAM - high-throughput sequence aligner
 * Copyright (C) 2011 WHAM Group, University of Wisconsin
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/ /* $Id: writer.cpp 157 2012-07-25 05:58:09Z yinan $ */ #include "writer.h" extern char pgversion[]; extern char * pgcommand; /* * A = 0, C = 1, G = 2, T = 2, N = 7. * If the value of A, C, G, T is changed, modify this array. */ const char code2Gene[8] = { 'A', 'C', 'G', 'T', 'N', 'N', 'N', 'N' }; void SimpleWriter::writeValidAlignment(int readId, HitSet * set) { int h; uint32 sid = 0, soffset = 0; char qseq[128], rseq[128]; char field[128]; char * refName; int64 * reference, *query; uint32 offset; char s; for (h = 0; h < set->getNumHits(); h++) { // query = set->getQuerySeq(h); query = reader1->getRead(readId, set->getStrand(h)); reference = set->getReferenceSeq(h); offset = set->getOffset(h); sequence->itree->lookup(offset, sid, soffset); // sprintf(str, "%16u ", offset); // shortread = str + 17; // change = str + 17 + length; refName = sequence->getSeqName(sid); // sprintf(str, "chr%u %10u ", sid, soffset); if (set->getStrand(h) == FORWARD ) s = '+'; else s = '-'; CompactSequence::decompose(qseq, length, query); CompactSequence::decompose(rseq, set->getErrorVector(h).len, reference); // uncompressSequence(query, length, shortread); fprintf(file, "%c\t%s\t%d\t%s\t", s, refName, soffset, qseq); field[0] = '\0'; if (set->getNumMismatch(h) > 0) writeField(field, qseq, rseq, set->getStrand(h), set->getErrorVector(h)); // writeField(query, reference, set->getStrand(h)); fprintf(file, "%s\n", field); } return; } void SimpleWriter::writeField(char * str, char * query, char * reference, strand s, ErrorVector error) { int i, j, e; int type; int len; len = strlen(query); if (s == FORWARD) { e = 0; for (i = 0, j = 0; i < len; i++, j++) { if (query[i] == reference[j]) continue; type = GET_ERROR(error.vec, e); switch (type) { case ERROR_VECTOR_MIS: sprintf(str, "%2d:%c>%c,", i, reference[j], query[i]); str += 7; break; case ERROR_VECTOR_INS: sprintf(str, "%2d:_>%c,", i, query[i]); str += 7; j--; break; case ERROR_VECTOR_DEL: sprintf(str, "%2d:%c>_,", i, reference[j]); str 
+= 7; i--; break; default: break; } e++; } } else { e = error.num - 1; for (i = len - 1, j = error.len - 1; i >= 0; i--, j--) { if (query[i] == reference[j]) continue; type = GET_ERROR(error.vec, e); switch (type) { case ERROR_VECTOR_MIS: sprintf(str, "%2d:%c>%c,", len - 1 - i, reference[j], query[i]); str += 7; break; case ERROR_VECTOR_INS: sprintf(str, "%2d:_>%c,", len - 1 - i, query[i]); str += 7; j++; break; case ERROR_VECTOR_DEL: sprintf(str, "%2d:%c>_,", len - 1 - i, reference[j]); str += 7; i++; break; default: break; } e--; } } //remove the last comma str[strlen(str) - 1] = '\0'; } void SimpleWriter::writeField(int64 * query, int64 * reference, strand s) { int i, j = 0, k; int64 code1, code2; uint32 sid = 0, soffset = 0; bool firstMismatch = true; if (s == FORWARD) { k = (4 * BITS_PER_LONGWORD - length * BITS_PER_BASE) / BITS_PER_LONGWORD; j = BITS_PER_LONGWORD - (length * BITS_PER_BASE) & BITS_LONGWORD_MASK; for (i = 0; i < length; i++) { if (j + BITS_PER_BASE > BITS_PER_LONGWORD) { code1 = ((query[k] << (j + BITS_PER_BASE - BITS_PER_LONGWORD)) | (query[k + 1] >> (BITS_PER_LONGWORD + BITS_PER_LONGWORD - j - BITS_PER_BASE))) & 0x7; code2 = ((reference[k] << (j + BITS_PER_BASE - BITS_PER_LONGWORD)) | (reference[k + 1] >> (BITS_PER_LONGWORD + BITS_PER_LONGWORD - j - BITS_PER_BASE))) & 0x7; j = j + BITS_PER_BASE - BITS_PER_LONGWORD; k++; } else { code1 = (query[k] >> (BITS_PER_LONGWORD - BITS_PER_BASE - j)) & 0x7; code2 = (reference[k] >> (BITS_PER_LONGWORD - BITS_PER_BASE - j)) & 0x7; j += BITS_PER_BASE; } // shortread[i] = code2Gene[code1]; if (code1 != code2) { if (!firstMismatch) fprintf(file, ","); firstMismatch = false; fprintf(file, "%2d:%c>%c", i, code2Gene[code2], code2Gene[code1]); } } } else { k = 3; j = 0; for (i = length - 1; i >= 0; i--) { if (j + BITS_PER_BASE > BITS_PER_LONGWORD) { if (j < BITS_PER_LONGWORD) { code1 = ((query[k] >> j) | (query[k - 1] << (BITS_PER_LONGWORD - j))) & 0x7; code2 = ((reference[k] >> j) | (reference[k - 1] << 
(BITS_PER_LONGWORD - j))) & 0x7; } else { code1 = query[k - 1] & 0x7; code2 = reference[k - 1] & 0x7; } j = j + BITS_PER_BASE - BITS_PER_LONGWORD; k--; } else { code1 = (query[k] >> j) & 0x7; code2 = (reference[k] >> j) & 0x7; j += BITS_PER_BASE; } // shortread[i] = code2Gene[code1]; if (code1 != code2) { if (!firstMismatch) fprintf(file, ","); firstMismatch = false; fprintf(file, "%2d:%c>%c", length - 1 - i, code2Gene[code2], code2Gene[code1]); } } } } void SimplePairWriter::writeValidAlignment(int readId, HitSet * set) { int h; uint32 sid = 0, soffset = 0; char shortread[128]; char * refName; int64 * reference, *query; uint32 offset; char s; if (!set->isProperMatch()) return; for (h = 0; h < set->getNumHits(); h++) { /* currently skip all partial match */ if (set->getNumMismatch(h) < 0 || set->getNumMismatch(h + 1) < 0) continue; if (h % 2 == 0) query = reader1->getRead(readId, set->getStrand(h)); else query = reader2->getRead(readId, set->getStrand(h)); reference = set->getReferenceSeq(h); offset = set->getOffset(h); sequence->itree->lookup(offset, sid, soffset); refName = sequence->getSeqName(sid); if (set->getStrand(h) == FORWARD ) s = '+'; else s = '-'; CompactSequence::decompose(shortread, length, query); fprintf(file, "%c\t%s\t%d\t%s\t", s, refName, soffset, shortread); if (set->getNumMismatch(h) > 0) writeField(query, reference, set->getStrand(h)); fprintf(file, "\n"); } return; } void SamWriter::writeValidAlignment(int readId, HitSet * set) { int i, j, k, h, lastDiff; int64 code1, code2; uint32 sid = 0, soffset = 0; int64 * reference, *query; uint32 offset; char * refName, *qual; int flag, mapq = 0; double p, sump; char qseq[128], rseq[128], tag[128]; char cigar[128]; char shortread[256]; //QNAME char * queryName = reader1->getReadName(readId); for (h = 0; h < set->getNumHits(); h++) { // query = set->getQuerySeq(h); query = reader1->getRead(readId, set->getStrand(h)); reference = set->getReferenceSeq(h); offset = set->getOffset(h); 
sequence->itree->lookup(offset, sid, soffset); //FLAG flag = 0; if (set->getStrand(h) == BACKWARD ) flag |= SAM_FLAG_QUERY_STRAND; if (h > 0) flag |= SAM_FLAG_NOT_PRIMARY; //RNAME refName = sequence->getSeqName(sid); //POS(soffset) soffset++; //MAPQ mapq = set->getQual(h); //SEQ CompactSequence::decompose(qseq, length, query); CompactSequence::decompose(rseq, set->getErrorVector(h).len, reference); //CIGAR(length-M) getCIGAR(cigar, qseq, rseq, set->getErrorVector(h)); //MRNM(*) //MPOS(0) //ISIZE(0) //QUAL qual = reader1->getQual(readId); fprintf(file, "%s\t%d\t%s\t%d\t%d\t%s\t*\t0\t0\t%s\t%s", queryName, flag, refName, soffset, mapq, cigar, qseq, qual); writeOptionalField(qseq, rseq, set->getErrorVector(h)); } } void SamWriter::writeInvalidAlignment(int readId, HitSet * set) { char queryStr[128]; char * qualStr; int flag; int64 * query; //QNAME char * name = reader1->getReadName(readId); //FLAG flag = SAM_FLAG_UNMAPPED; //QUERY query = reader1->getRead(readId, FORWARD); CompactSequence::decompose(queryStr, length, query); //QUAL qualStr = reader1->getQual(readId); fprintf(file, "%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", name, flag, queryStr, qualStr); } void SamWriter::writeOptionalField(char * query, char * reference, ErrorVector error) { char mdstr[256]; //MD field getMDfield(mdstr, query, reference, error); fprintf(file, "\tMD:Z:%s", mdstr); //NM field fprintf(file, "\tNM:i:%d\n", error.num); } void SamWriter::getCIGAR(char * str, char * query, char * reference, ErrorVector error) { int i, j, e; int lastErrPos; int type; //quick path for non-indel match if (error.gap == 0) { sprintf(str, "%dM", (int) strlen(query)); return; } str[0] = '\0'; lastErrPos = 0; e = 0; for (i = 0, j = 0; i < strlen(query); i++, j++) { if (query[i] == reference[j]) continue; type = GET_ERROR(error.vec, e); switch (type) { case ERROR_VECTOR_MIS: //nothing to do break; case ERROR_VECTOR_INS: if (j - lastErrPos > 0) sprintf(str, "%dM", j - lastErrPos); strcat(str, "1I"); str += strlen(str); 
j--; lastErrPos = j + 1; break; case ERROR_VECTOR_DEL: if (j - lastErrPos > 0) sprintf(str, "%dM", j - lastErrPos); strcat(str, "1D"); str += strlen(str); i--; lastErrPos = j + 1; break; default: break; } e++; } if (j > lastErrPos) sprintf(str, "%dM", j - lastErrPos); } void SamWriter::getMDfield(char * str, char * query, char * reference, ErrorVector error) { int i, j, e; int lastErrPos; int type; //quick path for exact match if (error.num == 0) { sprintf(str, "%d", (int) strlen(query)); return; } str[0] = '\0'; lastErrPos = 0; e = 0; for (i = 0, j = 0; i < strlen(query); i++, j++) { if (query[i] == reference[j]) continue; type = GET_ERROR(error.vec, e); switch (type) { case ERROR_VECTOR_MIS: sprintf(str, "%d", j - lastErrPos); str += strlen(str); str[0] = reference[j]; str[1] = '\0'; str++; lastErrPos = j + 1; break; case ERROR_VECTOR_INS: //nothing to do j--; break; case ERROR_VECTOR_DEL: sprintf(str, "%d", j - lastErrPos); str += strlen(str); str[0] = '^'; str[1] = reference[j]; str[2] = '\0'; str += 2; i--; lastErrPos = j + 1; break; default: break; } e++; } sprintf(str, "%d", j - lastErrPos); } void SamWriter::writeHead() { int i; /* for pair-end reads, we don't guarantee the coordinate order */ fprintf(file, "@HD\tVN:1.3\tSO:queryname\n"); for (i = 0; i < sequence->getNumSeq(); i++) { fprintf(file, "@SQ\tSN:%s\tLN:%d\n", sequence->getSeqName(i), sequence->getSeqLen(i)); } fprintf(file, "@PG\tID:WHAM\tVN:%s\tCL:\"%s\"\n", pgversion, pgcommand); } void RawWriter::writeValidAlignment(int readId, HitSet * set) { int nHit, ret; /* In RAW formate, we have to output all hits even the number of * hits exceeds -m . Otherwise (discard all hits), the merged * results may be wrong. * we use getNumAllHits() instead of getNumHIts(). 
*/ nHit = set->getNumAllHits(); ret = fwrite(set->getHits(), sizeof(Hit), nHit, file); if (ret != nHit) { elog(ERROR, "failed to write RAW align data.\n"); } } /** * SamPairWriter::writeValidAlignment() * write a mapped alignment in SAM format for a pair-end read. */ void SamPairWriter::writeValidAlignment(int readId, HitSet * set) { int h; uint32 sid1 = 0, soffset1 = 0; uint32 sid2 = 0, soffset2 = 0; int64 * reference1, *reference2; int64 * query1, *query2; char * qualStr1, *qualStr2; uint32 offset1, offset2; char * refName1, *refName2; char * mateRefName1, *mateRefName2; char equalStr[] = "="; int flag1, flag2, mapq1 = 0, mapq2 = 0; int tlen1 = 0, tlen2 = 0; char queryStr1[128], queryStr2[128], rcQualStr1[128], rcQualStr2[128]; char refStr1[128], refStr2[128]; char cigar1[16], cigar2[16]; bool mate1Unmatch, mate2Unmatch, properMatch; //QNAME char * queryName1 = reader1->getReadName(readId); char * queryName2 = reader2->getReadName(readId); for (h = 0; h < set->getNumHits(); h += 2) { mate1Unmatch = (set->getNumMismatch(h) < 0); mate2Unmatch = (set->getNumMismatch(h + 1) < 0); properMatch = set->isProperMatch(); // query1 = set->getQuerySeq(h); query1 = reader1->getRead(readId, set->getStrand(h)); reference1 = set->getReferenceSeq(h); offset1 = set->getOffset(h); // query2 = set->getQuerySeq(h + 1); query2 = reader2->getRead(readId, set->getStrand(h + 1)); reference2 = set->getReferenceSeq(h + 1); offset2 = set->getOffset(h + 1); //FLAG flag1 = SAM_FLAG_PAIRED; if (set->getStrand(h) == BACKWARD ) flag1 |= SAM_FLAG_QUERY_STRAND; if (set->getStrand(h + 1) == BACKWARD ) flag1 |= SAM_FLAG_MATE_STRAND; flag1 |= SAM_FLAG_FIRST_IN_PAIR; if (properMatch) flag1 |= SAM_FLAG_MAPPED_PAIRED; if (mate1Unmatch) flag1 |= SAM_FLAG_UNMAPPED; else if (mate2Unmatch) flag1 |= SAM_FLAG_MATE_UNMAPPED; if (h > 0) flag1 |= SAM_FLAG_NOT_PRIMARY; flag2 = SAM_FLAG_PAIRED; if (set->getStrand(h + 1) == BACKWARD ) flag2 |= SAM_FLAG_QUERY_STRAND; if (set->getStrand(h) == BACKWARD ) flag2 |= 
SAM_FLAG_MATE_STRAND; flag2 |= SAM_FLAG_SECOND_IN_PAIR; if (properMatch) flag2 |= SAM_FLAG_MAPPED_PAIRED; if (mate1Unmatch) flag2 |= SAM_FLAG_MATE_UNMAPPED; else if (mate2Unmatch) flag2 |= SAM_FLAG_UNMAPPED; if (h > 0) flag2 |= SAM_FLAG_NOT_PRIMARY; //POS(soffset) sequence->itree->lookup(offset1, sid1, soffset1); soffset1++; /* 1-based coordination */ sequence->itree->lookup(offset2, sid2, soffset2); soffset2++; /* 1-based coordination */ //RNAME refName1 = sequence->getSeqName(sid1); if (sid1 == sid2) { refName2 = refName1; mateRefName1 = equalStr; mateRefName2 = equalStr; } else { refName2 = sequence->getSeqName(sid2); mateRefName1 = refName2; mateRefName2 = refName1; } //MAPQ mapq1 = set->getQual(h); mapq2 = set->getQual(h + 1); //SEQ CompactSequence::decompose(queryStr1, length, query1); CompactSequence::decompose(queryStr2, length, query2); CompactSequence::decompose(refStr1, set->getErrorVector(h).len, reference1); CompactSequence::decompose(refStr2, set->getErrorVector(h + 1).len, reference2); // uncompressSequence(query1, length, queryStr1); // uncompressSequence(query2, length, queryStr2); //CIGAR(length-M) if (mate1Unmatch) strcpy(cigar1, "*"); else getCIGAR(cigar1, queryStr1, refStr1, set->getErrorVector(h)); // sprintf(cigar1, "%dM", length); if (mate2Unmatch) strcpy(cigar2, "*"); else getCIGAR(cigar2, queryStr2, refStr2, set->getErrorVector(h + 1)); // sprintf(cigar2, "%dM", length); //MRNM(*) //MPOS(0) //ISIZE(0) if (!mate1Unmatch && !mate2Unmatch & sid1 == sid2) { if (set->getStrand(h) == set->getStrand(h + 1)) { tlen1 = soffset2 - soffset1; tlen2 = soffset1 - soffset2; } else if (set->getStrand(h) == FORWARD) { tlen1 = soffset2 - soffset1 + length; tlen2 = soffset1 - soffset2 - length; } else { tlen1 = soffset2 - soffset1 - length; tlen2 = soffset1 - soffset2 + length; } } //QUAL quals1 quals2 qualStr1 = reader1->getQual(readId); if (set->getStrand(h) == BACKWARD ) qualStr1 = reverseSequence(rcQualStr1, qualStr1); qualStr2 = 
reader2->getQual(readId); if (set->getStrand(h + 1) == BACKWARD ) qualStr2 = reverseSequence(rcQualStr2, qualStr2); /* check whether mate1 is matched */ fprintf(file, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", queryName1, flag1, refName1, soffset1, mapq1, cigar1, mateRefName1, soffset2, tlen1, queryStr1, qualStr1); if (!mate1Unmatch) writeOptionalField(queryStr1, refStr1, set->getErrorVector(h)); else fprintf(file, "\n"); /* check whether mate2 is matched */ fprintf(file, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", queryName2, flag2, refName2, soffset2, mapq2, cigar2, mateRefName2, soffset1, tlen2, queryStr2, qualStr2); if (!mate2Unmatch) writeOptionalField(queryStr2, refStr2, set->getErrorVector(h + 1)); else fprintf(file, "\n"); } } /** * SamPairWriter::writeValidAlignment() * write a unmapped alignment in SAM format for a pair-end read. */ void SamPairWriter::writeInvalidAlignment(int readId, HitSet * set) { int flag1, flag2; char queryStr1[128], queryStr2[128]; char * qualStr1, *qualStr2; //QNAME char * queryName1 = reader1->getReadName(readId); char * queryName2 = reader2->getReadName(readId); flag1 = SAM_FLAG_PAIRED | SAM_FLAG_UNMAPPED | SAM_FLAG_MATE_UNMAPPED | SAM_FLAG_FIRST_IN_PAIR; flag2 = SAM_FLAG_PAIRED | SAM_FLAG_UNMAPPED | SAM_FLAG_MATE_UNMAPPED | SAM_FLAG_SECOND_IN_PAIR; //SEQ query1 query2 CompactSequence::decompose(queryStr1, length, reader1->getRead(readId)); CompactSequence::decompose(queryStr2, length, reader2->getRead(readId)); //QUAL quals1 quals2 qualStr1 = reader1->getQual(readId); qualStr2 = reader2->getQual(readId); fprintf(file, "%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", queryName1, flag1, queryStr1, qualStr1); fprintf(file, "%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", queryName2, flag2, queryStr2, qualStr2); return; } char * Writer::reverseSequence(char * str2, char * str1) { int i; int len = strlen(str1); for (i = 0; i < len; i++) str2[len - i - 1] = str1[i]; str2[len] = '\0'; return str2; } 
wham/short.cpp0000644001532600153260000003252512003705361012707 0ustar yinanyinan/** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: short.cpp 157 2012-07-25 05:58:09Z yinan $ */ #include #include #include #include #include "lib.h" #include "error.h" #include "short.h" ShortRead::ShortRead() { memset(this, 0, sizeof(ShortRead)); } ShortRead::ShortRead(ShortRead * read) { memcpy(this, read, sizeof(ShortRead)); } ShortRead::~ShortRead() { } /* * ShortRead::readLine * read a line from the specified file. */ int ShortRead::readLine(FILE * file, char * str, int maxLength) { char c; int num = 0; if (file == NULL ) return 0; while (1) { c = fgetc(file); if (c == EOF || c == '\n' || num == maxLength - 1) break; str[num++] = c; } str[num] = '\0'; return num; } /* * ShortRead::init * store the names and the number of files that contain * the short reads. 
*/ int ShortRead::init(char ** files, int numFile, bool fw, bool bw, char * alignFileName, char * unalignFileName) { int i; if (numFile <= 0) return ERR_PARA; forward = fw; backward = bw; nReadPerAlign = 0; if (forward) { sidx[FORWARD] = nReadPerAlign; nReadPerAlign++; } if (backward) { sidx[BACKWARD] = nReadPerAlign; nReadPerAlign++; } if (nReadPerAlign <= 0) return ERR_PARA; length = 0; nFile = numFile; fnames = new char *[numFile]; for (i = 0; i < numFile; i++) { fnames[i] = new char[256]; if (strlen(files[i]) >= 256) return ERR_PARA; strcpy(fnames[i], files[i]); } nRead = 0; storeName = true; storeQual = true; lenName = 0; lenQual = 0; if (alignFileName != NULL) { strcpy(alFileName, alignFileName); alfile = fopen(alignFileName, "w"); if (alfile == NULL) { return ERR_FILE; } } if (unalignFileName != NULL) { strcpy(unFileName, unalignFileName); unfile = fopen(unalignFileName, "w"); if (unfile == NULL) { return ERR_FILE; } } code2Base[BASE_A] = 'A'; code2Base[BASE_C] = 'C'; code2Base[BASE_G] = 'G'; code2Base[BASE_T] = 'T'; code2Base[BASE_N] = 'N'; return SUCCESS; } /* * ShortRead::allocate() * allocate space for query sequence, name, and qual. */ int ShortRead::allocate() { /* allocate the space for the short reads */ reads = (int64 *) malloc( (int64) nRead * nReadPerAlign * WORDS_PER_READ * sizeof(int64)); if (reads == NULL) { elog( ERROR, "failed to allocate space for compressed reads. [%lldMB]\n", (int64) nRead * nReadPerAlign * WORDS_PER_READ * sizeof(int64) / 1024 / 1024); return ERR_MEM; } /* allocate the qname space */ if (storeName) { names = (char *) malloc((int64) nRead * lenName * sizeof(char)); if (names == NULL) { elog(ERROR, "failed to allocate space for read names.[%lldMB]\n", (int64) nRead * lenName * sizeof(char) / 1024 / 1024); return ERR_MEM; } } /* allocate the qual space */ if (storeQual) { quals = (char *) malloc((int64) nRead * lenQual * sizeof(char)); if (quals == NULL) { elog(ERROR, "failed to allocate space for qual scores. 
[%lldMB]\n", (int64) nRead * lenQual * sizeof(char) / 1024 / 1024); return ERR_MEM; } } return SUCCESS; } /* * ShortRead::preProcess * count the number of short reads in the specified files. */ int ShortRead::preProcess(int maxLen) { int i; int len; FILE * file; char str[256]; if (fnames == 0) { elog(ERROR, "no specified file name of short reads.\n"); return ERR_PARA; } if (nReadPerAlign <= 0) return ERR_PARA; nRead = 0; for (i = 0; i < nFile; i++) { file = fopen(fnames[i], "r"); if (file == NULL) { elog(ERROR, "failed to open file %s.\n", fnames[i]); return ERR_FILE; } int lineid = 0; while (len = readLine(file, str, 256)) { lineid++; if (str[0] == '\0') continue; switch (lineid % 4) { case 1: if (str[0] == '@') { /* HERE CAN BE IMPROVED */ /* remove first '@', add '\0' in the end */ len = getReadNameLength(str); if (len > lenName) lenName = len; } else { elog(ERROR, "illegal character at line %d of file %s.\n", lineid, fnames[i]); return ERR_READ_FORMAT; } break; case 2: /* initialize the read length */ if (length == 0) { length = len; lenQual = len + 1; } if (length != len) { elog(ERROR, "Read at line %d of file %s has illegal read length.\n", lineid, fnames[i]); return ERR_READ_FORMAT; } if (maxLen > len) { elog( ERROR, "Read at line %d of file %s is shorter than the specified length.\n", lineid, fnames[i]); return ERR_PARA; } break; case 3: if (str[0] == '+') { // readLine(file, str, 256); } break; case 0: //query sequence nRead++; /* initialize the read length */ if (length == 0) { length = len; lenQual = len + 1; } /* the read file contain shorts with varied length */ if (length != len) { elog(ERROR, "Read at line %d of file %s has illegal read length.\n", lineid, fnames[i]); return ERR_READ_FORMAT; } if (maxLen > len) { elog( ERROR, "Read at line %d of file %s is shorter than the specified length.\n", lineid, fnames[i]); return ERR_PARA; } break; } } if (fclose(file) != 0) { elog(ERROR, "failed to close file %s.\n", fnames[i]); return ERR_FILE; } } if 
(maxLen > 0) { length = maxLen; lenQual = maxLen + 1; } return SUCCESS; } /* * ShortRead::load * This function is used to load the short reads in the specified * files. The short reads are loaded into a 64-bit integer array, * using 4 64-bit integer to represent a short read. The short read * can be loaded in forward or backward order. */ int ShortRead::load(int maxLen) { int i, j, k, curRead; int ret; int forward_offset, backward_offset, offset; int64 forward_word, backward_word, code; int64 reverse[8] = { 3, 2, 1, 0, 4, 5, 6, 7 }; char str[256]; FILE * file; /* statistics */ ret = preProcess(maxLen); if (ret != SUCCESS ) return ret; /* allocate the space for reads. */ ret = allocate(); if (ret != SUCCESS) { elog(ERROR, "failed to allocate space for reads.\n"); return ret; } i = 0; curRead = 0; offset = (WORDS_PER_READ * BITS_PER_LONGWORD - length * BITS_PER_BASE) / BITS_PER_LONGWORD; for (k = 0; k < nFile; k++) { file = fopen(fnames[k], "r"); if (file == NULL ) return ERR_FILE; int lineid = 0; while (readLine(file, str, 256)) { lineid++; if (str[0] == '\0') continue; if (str[0] == '+') { /* read the score line */ assert(curRead < nRead); readLine(file, str, 256); lineid++; if (storeQual) { strncpy(&quals[curRead * lenQual], str, length); quals[curRead * lenQual + length] = '\0'; } curRead++; } else if (str[0] == '@') { /* copy the query sequence name */ assert(curRead < nRead); if (storeName) extractReadName(&names[curRead * lenName], &str[1]); // strcpy(&names[curRead * lenName], &str[1]); /* store the query sequence using 3 bits to represent each base */ readLine(file, str, 256); lineid++; /* cut the sequence if necessary */ str[length] = '\0'; /* initialize the values */ reads[i] = 0; reads[i + 1] = 0; reads[i + 2] = 0; reads[i + 3] = 0; if (nReadPerAlign > 1) { reads[i + 6] = 0; reads[i + 7] = 0; reads[i + 8] = 0; reads[i + 9] = 0; } /* initialize the current word in forward/backward format */ forward_word = 0; backward_word = 0; /* initialize the begining 
offset in forward/backward format */ backward_offset = 0; forward_offset = (WORDS_PER_READ * BITS_PER_LONGWORD - length * BITS_PER_BASE) % BITS_PER_LONGWORD; /* * scan the sequence and generate the compact representation * in forward or/and backward format. */ for (j = 0; j < length; j++) { if (str[j] == 'A') code = BASE_A; else if (str[j] == 'C') code = BASE_C; else if (str[j] == 'G') code = BASE_G; else if (str[j] == 'T') code = BASE_T; else if (str[j] == 'N') code = BASE_N; else elog(ERROR, "ERROR: unknown character in short read files.\n"); if (forward) { /* forward format */ if (forward_offset + BITS_PER_BASE >= BITS_PER_LONGWORD) { /* on the boundary of 64-bit word */ reads[i + offset + j * BITS_PER_BASE / BITS_PER_LONGWORD] = (forward_word << (BITS_PER_LONGWORD - forward_offset)) | (code >> (forward_offset + BITS_PER_BASE - BITS_PER_LONGWORD)); forward_offset = forward_offset + BITS_PER_BASE - BITS_PER_LONGWORD; forward_word = ~((~code) | (-1LL << forward_offset)); } else { forward_word = (forward_word << BITS_PER_BASE) | code; forward_offset += BITS_PER_BASE; } } if (backward) { /* backward format */ if (backward_offset + BITS_PER_BASE >= BITS_PER_LONGWORD) { /* on the boundary of 64-bit word */ reads[i + nReadPerAlign * WORDS_PER_READ - 1 - j * BITS_PER_BASE / BITS_PER_LONGWORD] = backward_word | (reverse[code] << backward_offset); backward_offset = backward_offset + BITS_PER_BASE - BITS_PER_LONGWORD; backward_word = reverse[code] >> (BITS_PER_BASE - backward_offset); } else { backward_word = backward_word | (reverse[code] << (backward_offset % BITS_PER_LONGWORD)); backward_offset += BITS_PER_BASE; } } } if (backward) reads[i + nReadPerAlign * WORDS_PER_READ - 1 - j * BITS_PER_BASE / BITS_PER_LONGWORD] = backward_word; // outputSegment(reads + i, length); // outputSegment(reads + i + WORDS_PER_READ, length); /* jump to the begining 64-bit integer of next short read */ i += WORDS_PER_READ * nReadPerAlign; } else { elog(ERROR, "illegal character at line %d 
of file %s\n", lineid, fnames[k]); return ERR_READ_FORMAT; } } if (fclose(file) != 0) return ERR_FILE; } return SUCCESS; } ShortRead ** ShortRead::split(int nPartition) { int i; int szPartition, curPartition; unsigned int curRead = 0; ShortRead ** partitions; szPartition = (int) ceil((double) nRead / (double) nPartition); partitions = new ShortRead *[nPartition]; for (i = 0; i < nPartition; i++) { partitions[i] = new ShortRead(this); if (szPartition <= nRead - curRead) curPartition = szPartition; else curPartition = nRead - curRead; partitions[i]->nRead = curPartition; partitions[i]->reads = &reads[curRead * WORDS_PER_READ * nReadPerAlign]; if (storeName && names != NULL) { partitions[i]->names = &names[curRead * lenName]; } if (storeQual && quals != NULL) { partitions[i]->quals = &quals[curRead * lenQual]; } if (alfile != NULL) { sprintf(partitions[i]->alFileName, "%s.t%d", alFileName, i); partitions[i]->alfile = fopen(partitions[i]->alFileName, "w"); assert(partitions[i]->alfile != NULL); } if (unfile != NULL) { sprintf(partitions[i]->unFileName, "%s.t%d", unFileName, i); partitions[i]->unfile = fopen(partitions[i]->unFileName, "w"); assert(partitions[i]->unfile != NULL); } curRead += curPartition; } return partitions; } void ShortRead::printRead(int id, FILE * file) { char readstr[256]; assert(file != NULL); CompactSequence::decompose(readstr, length, getRead(id)); fprintf(file, "@%s\n", getReadName(id)); fprintf(file, "%s\n", readstr); fprintf(file, "+%s\n", getReadName(id)); fprintf(file, "%s\n", getQual(id)); } wham/aligner.h0000644001532600153260000001276112054627014012642 0ustar yinanyinan#ifndef _ALIGNER_H_ #define _ALIGNER_H_ /** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) 
any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/* $Id: aligner.h 164 2012-11-26 08:53:32Z yinan $ */

/* NOTE(review): the name of the system header below was lost in text
 * extraction; restore it from the upstream source. */
#include
#include "hash.h"
#include "sequence.h"
#include "short.h"

/* default index id for save() / load(): operate on all indexes */
#define ALIGNER_ALL_INDEX -1

/*
 * Parameters of one alignment run, passed to Aligner::align().
 * NOTE(review): the per-field meanings are not visible in this header;
 * confirm against the command-line parsing code before relying on them.
 */
typedef struct AlignInfo {
	ShortRead * reader1;
	ShortRead * reader2;
	int maxHit;
	int maxMatch;
	int maxQual;
	int maxGap;
	double scanThreshold;
	bool pairStrand[2][2];
	bool sorted;
	bool strata;
	unsigned int minins;
	unsigned int maxins;
	bool mateMatch;
	int outputFormat;
	bool concatenate;
	int maxMate;
	bool showBar;
} AlignInfo;

/* Counters returned by the alignment entry points. */
typedef struct AlignRes {
	int nRead;
	int nValidRead;
	int nValidAlignment;
} AlignRes;

/*
 * Aligner owns the hash indexes built over a reference sequence and the
 * lookup tables that map each partitioning of a query to an index.
 */
class Aligner {
private:
	int length; /* the length of query sequence */
	int words; /* the number of 64-bit word to store the query sequence */
	int nPartition; /* the number of fragments */
	int nError; /* the number of allowed errors */
	int nSubstitute;/* the number of allowed substitutions */
	int nInsert; /* the number of allowed insertions */
	int nDelete; /* the number of allowed deletions */
	int maxRepeat; /* the maximum number of repeats */
	bool embedHashTable; /* whether use the embed hash tables */

	/*
	 * This figure is used to demonstrate how the query sequence
	 * is split into uniform-sized fragments.
	 * |<------------------- query sequence -------------------->|
	 * | head  | fragment f-1 |......| fragment 1   | fragment 0   |
	 * |lenRest|                  lenPartitions                    |
	 * |       | lenPartition |......| lenPartition | lenPartition |
	 * |                         lenKey                            |
	 */
	int lenKey; /* the length of the query sequence (bits) */
	int lenPartition; /* the length of each fragment (bits) */
	int lenPartitions; /* the length of all fragments (bits) */
	int lenRest; /* the length of the head (bits) */

	uint32 numEntry;

	int nHashTable; /* the number of hash indexes */
	int nLookup; /* the number of lookups for each alignment */
	HashTable * hashTables; /* hash indexes */
	int * lookupIndex; /* lookup Index array. store the group id of all concatenations */
	int * lookupOffset; /* lookup Offset array. store the distance before the leftmost indexed fragment */

	// HitSet hits; /* the result pool. store all matched results of an alignment without duplications */
	CompactSequence * sequence; /* the reference sequence */

	/*
	 * these variables are only used in alignment phase
	 */
	char * indexpath;
	int outputMode; /* normal or SAM */
	int nMaxError; /* the maximum number of errors in alignments that wham reports */
	int maxQual;

	int64 headMask[WORDS_PER_READ];

public:
	Aligner();
	Aligner(char * path);

	int init(CompactSequence * sequence, int length, int numPartition,
			unsigned int numBucket, int numMismatch, int numInsert,
			int numDelete, int repeat, bool embedHashTable);
	int build(char * path);
	uint32 align(int64 * key, char * quals, strand s, int rid, HitSet * hits,
			bool skipFirst = false, bool noGap = false);
	uint32 alignFirst(int64 * key, char * quals, strand s, int rid,
			HitSet * hits, bool noGap = false);
	int check(int num);
	AlignRes align(AlignInfo * info, char * path);
	void sortList();
	int save(char * path, int indexID = ALIGNER_ALL_INDEX);
	int saveHead(char * path);
	int saveIndex(char * path, int indexID);
	int load(char * path, int indexID = ALIGNER_ALL_INDEX);
	int loadHead(char * path);
	int loadHashtables(char * path);
	int removeHashTables();
	void printInfo();
	int valid(int length, int numPartition, int numMismatch, int numInsert,
			int numDelete);

	/* the read length the index was built for */
	int getReadLength() {
		return length;
	}

	/* the number of hash indexes */
	int getNumIndex() {
		return nHashTable;
	}

	/* the number of errors the index was built for */
	int getNumError() {
		return nError;
	}

	/* true when the partition count is exactly nError + 1 */
	bool allowGap() {
		return nPartition == nError + 1;
	}

	/*
	 * Set the error limits used during alignment. A maxerr of 0 selects
	 * the error count the index was built with; maxgap is applied to
	 * both insertions and deletions.
	 */
	void setErrorModel(int maxerr, int maxgap, int maxqual) {
		/* NOTE(review): this assignment is a no-op; init() uses MAX_INT as
		 * the "no limit" value for maxQual, so MAX_INT may have been
		 * intended here - confirm before changing. */
		if (maxqual == 0)
			maxqual = 0;
		if (maxerr == 0)
			maxerr = nError;
		nMaxError = maxerr;
		maxQual = maxqual;
		nInsert = nDelete = maxgap;
		// for (int i = 0; i < nHashTable; i++)
		// hashTables[i].setErrorModel(maxerr, maxqual);
	}

	/* forward the scan threshold to every hash index */
	void setScanThreshold(double r) {
		for (int i = 0; i < nHashTable; i++)
			hashTables[i].setScanThreshold(r);
	}

	void printStat();

private:
	int preProcessHashTables();
	int buildHashTables(char * path);
	AlignRes alignSingleEnd(AlignInfo * info, char * path);
	AlignRes alignPairEnd(AlignInfo * info, char * path);
	AlignRes merge(AlignInfo * info, int num, char * path);
	AlignRes mergeSingleEnd(AlignInfo * info, int num, char * path);
	AlignRes mergePairEnd(AlignInfo * info, int num, char * path);
	int computeNumIndex();
	int computeNumLookup();
	int initLookupArray();

	friend class CompactSequence;
};

#endif
wham/hitset.cpp0000644001532600153260000001100612003705361013037 0ustar yinanyinan#include "hitset.h"
#include "sequence.h"
/**
 * WHAM - high-throughput sequence aligner
 * Copyright (C) 2011 WHAM Group, University of Wisconsin
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.
If not, see . */ /* $Id: hitset.cpp 157 2012-07-25 05:58:09Z yinan $ */ int HitPairSet::add(Hit * hit1, Hit * hit2) { int i; /* if there is one different pair, there are multiple alignments for read */ if (nHit >= 2) unique = false; if (nHit >= MAX_NUM_HITS) { // elog(WARNING, "WARNING: result set is full. Some reads may be discarded.\n"); return MSG_HITSETFULL; } if ((maxMatch == MAX_INT && nHit >= maxHit && !sorted) || nHit > maxMatch) return MSG_HITSETFULL; /* duplication has been removed in each hitset.*/ int nError = 0; if (hit1->error.num > 0) nError += hit1->error.num; if (hit2->error.num > 0) nError += hit2->error.num; if (nError < MAX_NUM_COUNT) { counts[nError]++; psum += p[nError]; } i = nHit; if (sorted) { for (; i > 0; i -= 2) { if (hits[i - 2].error.num + hits[i - 1].error.num <= hit1->error.num + hit2->error.num || (hits[i - 2].error.num + hits[i - 1].error.num == hit1->error.num + hit2->error.num && hit2[i - 2].qual + hits[i - 1].qual <= hit1->qual + hit2->qual)) break; hits[i] = hits[i - 2]; hits[i + 1] = hits[i - 1]; } } hits[i] = *hit1; hits[i + 1] = *hit2; hits[i].qual = hits[i + 1].qual = UNASSIGNED_QUAL; if (nHit < maxHit || (maxMatch != MAX_INT && nHit <= maxMatch)) nHit += 2; // if ((maxMatch == MAX_INT && nHit >= maxHit && !sorted) || nHit > maxMatch) // return MSG_HITSETFULL; return SUCCESS; } /* * HitPairSet::build() * check if the pairs of matched mate1 and mate2 satisfy * the constrain on the distance between mate1 and mate2. 
*/ int HitPairSet::build(HitSet * set1, HitSet * set2, uint32 minins, uint32 maxins, bool pairStrand[][2], bool reportMateMatch) { int i, j; unsigned int pos1, pos2; strand s1, s2; int ret; bool pairMatch = false; reset(); properMatch = true; /* join the two hitset and get the pair-end alignment */ for (i = 0; i < set1->nHit; i++) { for (j = 0; j < set2->nHit; j++) { pos1 = set1->hits[i].pos; pos2 = set2->hits[j].pos; s1 = set1->hits[i].strand; s2 = set2->hits[j].strand; /* mate 2 is upstream, mate 1 is downstream */ if (pos1 > pos2 && pos1 - pos2 + length >= minins && pos1 - pos2 + length <= maxins && pairStrand[s1][s2]) { ret = add(&set1->hits[i], &set2->hits[j]); } /*mate 1 is upstream, mate 2 is downstream */ if (pos1 <= pos2 && pos2 - pos1 + length >= minins && pos2 - pos1 + length <= maxins && pairStrand[s1][s2]) { ret = add(&set1->hits[i], &set2->hits[j]); } if (ret == MSG_HITSETFULL ) return nHit; } } if (nHit > 0) return nHit; properMatch = false; /* if no proper matched, find matched mates */ if (set1->nHit > 0 && set2->nHit > 0) { for (i = 0; i < set1->nHit; i++) { for (j = 0; j < set2->nHit; j++) { ret = add(&set1->hits[i], &set2->hits[j]); if (ret == MSG_HITSETFULL ) return nHit; } } } if (nHit > 0) return nHit; /* if no matched pair, add partial matched pair */ if (nHit == 0 && reportMateMatch) { Hit hit; /* find matching in mate1 */ for (i = 0; i < set1->nHit; i++) { hit = set1->hits[i]; hit.error.num = -1; hit.strand = FORWARD; ret = add(&set1->hits[i], &hit); if (ret == MSG_HITSETFULL ) return nHit; } /* find matching in mate2 */ for (i = 0; i < set2->nHit; i++) { hit = set2->hits[i]; hit.error.num = -1; hit.strand = FORWARD; ret = add(&hit, &set2->hits[i]); if (ret == MSG_HITSETFULL ) return nHit; } } return nHit; } wham/aligner.cpp0000644001532600153260000020437512054750341013201 0ustar yinanyinan/** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can 
redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: aligner.cpp 166 2012-11-26 20:28:17Z yinan $ */ #include #include #include #include #include "aligner.h" #include "error.h" #include "model.h" #include "writer.h" #include "bitread.h" #include "util.h" #include "perfcounters.h" #include "rdtsc.h" #define NO_PARTITION -1 Aligner::Aligner() { memset(this, 0, sizeof(Aligner)); } Aligner::Aligner(char * path) { loadHead(path); /* set the load path */ indexpath = path; } void Aligner::printInfo() { elog(DEBUG1, " ---- Index Info ----\n"); elog(DEBUG1, " read length: %d\n", length); elog(DEBUG1, " num of errors: %d\n", nError); elog(DEBUG1, " num of fragments: %d\n", nPartition); } /* * Aligner::computeNumIndex() * Given the number of errors and the number of partitions, * compute the number of required hash tables, according to the * formula C(nPartition-1, nMismatch). */ int Aligner::computeNumIndex() { int i; int64 x = 1; for (i = nPartition - nError; i <= nPartition - 1; i++) x *= i; for (i = 1; i <= nError; i++) x /= i; return (int) x; } /* * Aligner::computeNumLookup() * Given the number of errors and the number of partitions, * compute the number of lookups for each alignment, according * to the formula C(nPartition, nMismatch). 
*/ int Aligner::computeNumLookup() { int i; int64 x = 1; for (i = nPartition - nError + 1; i <= nPartition; i++) x *= i; for (i = 1; i <= nError; i++) x /= i; return (int) x; } /* * Aligner::initLookupArray() * compute the number of hash tables and lookups, and initialize * the lookupIndex and lookupOffset arrays. * LookupIndex stores the partitioning group (hash table) ID for * each parititioning. * LookupOffset stores the offset of the first indexed fragments * in each partitioning. */ int Aligner::initLookupArray() { int iid = 0, i, j, m, k, l, i4, i5; unsigned int x; unsigned int * indexBits; /* allocate the array for lookup infomations */ lookupIndex = new int[nLookup]; if (lookupIndex == NULL ) return ERR_MEM; lookupOffset = new int[nLookup]; if (lookupOffset == NULL ) return ERR_MEM; indexBits = new unsigned int[nHashTable]; if (indexBits == NULL ) return ERR_MEM; /* * we initialize lookupOffset and lookupIndex based on bitwise * techniques. A partitioning is represented by a binary string. * An indexed fragment is represented by 1, whereas a non-indexed * fragment is represented by 0. For example, 1010 represents four * fragments. The first and third fragments are indexed fragments. */ switch (nError) { case 1: /* * enumerate the non-indexed fragment position. Binary * representations for all partitionings are stored in * indexBits. */ for (i = 0, iid = 0; i < (nPartition - 1); i++) { indexBits[iid] = (0xffffffff >> (32 - nPartition)) & ~(1 << i); iid++; } for (i = 0, iid = 0; i < (nPartition); i++) { /* * generate a binary representation for a partitioning * group. */ x = (0xffffffff >> (32 - nPartition)) & ~(1 << i); /* * find the partitions that matches x by sliding the * partitions. 
*/ for (k = 0; k < nHashTable; k++) { for (l = 0; l < nPartition; l++) { if (indexBits[k] == (x << l)) { /* update element values */ lookupIndex[iid] = k; lookupOffset[iid] = l * lenPartition; break; } } } iid++; } break; case 2: /* * enumerate the non-indexed fragment positions. Binary * representations for all partitionings are stored in * indexBits. */ for (i = 0, iid = 0; i < (nPartition - 2); i++) { for (j = i; j < (nPartition - 2); j++) { indexBits[iid] = (0xffffffff >> (32 - nPartition)) & ~(1 << i) & ~(1 << (j + 1)); iid++; } } for (i = 0, iid = 0; i < (nPartition - 1); i++) { for (j = i; j < (nPartition - 1); j++) { /* * generate a binary representation for a partitioning * group. */ x = (0xffffffff >> (32 - nPartition)) & ~(1 << i) & ~(1 << (j + 1)); /* * find the partitions that matches x by sliding the * partitions. */ for (k = 0; k < nHashTable; k++) { for (l = 0; l < nPartition; l++) { if (indexBits[k] == (x << l)) { /* update element values */ lookupIndex[iid] = k; lookupOffset[iid] = l * lenPartition; break; } } } iid++; } } break; case 3: /* * enumerate the non-indexed fragment positions. Binary * representations for all partitionings are stored in * indexBits. */ for (i = 0, iid = 0; i < (nPartition - 3); i++) { for (j = i; j < (nPartition - 3); j++) { for (m = j; m < (nPartition - 3); m++) { indexBits[iid] = (0xffffffff >> (32 - nPartition)) & ~(1 << i) & ~(1 << (j + 1)) & ~(1 << (m + 2)); iid++; } } } for (i = 0, iid = 0; i < (nPartition - 2); i++) { for (j = i; j < (nPartition - 2); j++) { for (m = j; m < (nPartition - 2); m++) { /* * generate a binary representation for a partitioning * group. */ x = (0xffffffff >> (32 - nPartition)) & ~(1 << i) & ~(1 << (j + 1)) & ~(1 << (m + 2)); /* * find the partitions that matches x by sliding the * partitions. 
*/ for (k = 0; k < nHashTable; k++) { for (l = 0; l < nPartition; l++) { if (indexBits[k] == (x << l)) { /* update element values */ lookupIndex[iid] = k; lookupOffset[iid] = l * lenPartition; break; } } } iid++; } } } break; case 4: /* * enumerate the non-indexed fragment positions. Binary * representations for all partitionings are stored in * indexBits. */ for (i = 0, iid = 0; i < (nPartition - 4); i++) { for (j = i; j < (nPartition - 4); j++) { for (m = j; m < (nPartition - 4); m++) { for (i4 = m; i4 < (nPartition - 4); i4++) { indexBits[iid] = (0xffffffff >> (32 - nPartition)) & ~(1 << i) & ~(1 << (j + 1)) & ~(1 << (m + 2)) & ~(1 << (i4 + 3)); iid++; } } } } for (i = 0, iid = 0; i < (nPartition - 3); i++) { for (j = i; j < (nPartition - 3); j++) { for (m = j; m < (nPartition - 3); m++) { for (i4 = m; i4 < (nPartition - 3); i4++) { /* * generate a binary representation for a partitioning * group. */ x = (0xffffffff >> (32 - nPartition)) & ~(1 << i) & ~(1 << (j + 1)) & ~(1 << (m + 2)) & ~(1 << (i4 + 3)); /* * find the partitions that matches x by sliding the * partitions. */ for (k = 0; k < nHashTable; k++) { for (l = 0; l < nPartition; l++) { if (indexBits[k] == (x << l)) { /* update element values */ lookupIndex[iid] = k; lookupOffset[iid] = l * lenPartition; break; } } } iid++; } } } } break; case 5: /* * enumerate the non-indexed fragment positions. Binary * representations for all partitionings are stored in * indexBits. 
*/ for (i = 0, iid = 0; i < (nPartition - 5); i++) { for (j = i; j < (nPartition - 5); j++) { for (m = j; m < (nPartition - 5); m++) { for (i4 = m; i4 < (nPartition - 5); i4++) { for (i5 = i4; i5 < (nPartition - 5); i5++) { indexBits[iid] = (0xffffffff >> (32 - nPartition)) & ~(1 << i) & ~(1 << (j + 1)) & ~(1 << (m + 2)) & ~(1 << (i4 + 3)) & ~(1 << (i5 + 4)); iid++; } } } } } for (i = 0, iid = 0; i < (nPartition - 4); i++) { for (j = i; j < (nPartition - 4); j++) { for (m = j; m < (nPartition - 4); m++) { for (i4 = m; i4 < (nPartition - 4); i4++) { for (i5 = i4; i5 < (nPartition - 4); i5++) { /* * generate a binary representation for a partitioning * group. */ x = (0xffffffff >> (32 - nPartition)) & ~(1 << i) & ~(1 << (j + 1)) & ~(1 << (m + 2)) & ~(1 << (i4 + 3)) & ~(1 << (i5 + 4)); /* * find the partitions that matches x by sliding the * partitions. */ for (k = 0; k < nHashTable; k++) { for (l = 0; l < nPartition; l++) { if (indexBits[k] == (x << l)) { /* update element values */ lookupIndex[iid] = k; lookupOffset[iid] = l * lenPartition; break; } } } iid++; } } } } } break; } /* initialize infos for embed table lookups */ int keySpan; int keyPartitions[10]; memset(keyPartitions, 0, sizeof(int) * 10); for (i = 0; i < nHashTable; i++) { for (j = 0; j < nPartition; j++) { if ((indexBits[i] >> j) & 0x1 == 1) { keySpan = nPartition - j; break; } } for (j = nPartition - 1, k = 0; j >= 0; j--) { if ((indexBits[i] >> j) & 0x1 == 1) { keyPartitions[k++] = j * lenPartition; } } hashTables[i].setLookupInfo(keySpan, keyPartitions); } delete[] indexBits; return SUCCESS; } /* * Aligner::init() * initialize the aligner structure. 
*/ int Aligner::init(CompactSequence * seq, int len, int numPartition, unsigned int numBucket, int numError, int numInsert, int numDelete, int repeat, bool embed) { if (numError > 5) { printf("nError = %d is not supported in this version.\n", numError); return ERR_PARA; } if (numPartition > 0 && numPartition <= numError) { printf( "The number of partitions should be larger than the number of mismatches.\n"); return ERR_PARA; } /* initialize the genome sequence */ sequence = seq; numEntry = sequence->getNum(); /* choose number of partition based on a cost model */ if (numPartition == 0) { numPartition = AlignerModel::estimateNumPartition(numEntry, len, numError, false); elog(INFO, "optimizer chooses %d fragments\n", numPartition); } /* check the index size */ if (!AlignerModel::isFitMemory(numEntry, len, numError, numPartition)) { int nPartition; nPartition = AlignerModel::estimateNumPartition(numEntry, len, numError, true); if (nPartition == 0) { elog( ERROR, "insufficient memory to build any WHAM index. 
Increase memory size or use smaller reference sequence.\n"); return ERR_MEM; } } /* update the parameters */ length = len; nPartition = numPartition; nError = numError; nInsert = numInsert; nDelete = numDelete; maxRepeat = repeat; embedHashTable = embed; nMaxError = nError; maxQual = MAX_INT; words = NUM_LONGWORD(length * BITS_PER_BASE); /* initialize partitioning info */ lenKey = length * BITS_PER_BASE; lenPartition = (length - nInsert) / nPartition * BITS_PER_BASE; lenPartitions = lenPartition * nPartition; lenRest = lenKey - lenPartitions; /* initialize head mask */ BitRead::genHeadMask(headMask, lenPartitions); /* * compute the number of hashtables, and the number of * lookups */ nHashTable = computeNumIndex(); nLookup = computeNumLookup(); hashTables = new HashTable[nHashTable]; if (hashTables == NULL ) return ERR_MEM; for (int i = 0; i < nHashTable; i++) { hashTables[i].init(seq, len, numBucket, nError, nInsert, nDelete, nPartition, maxRepeat, embed, i); } /* initialize lookup info */ int ret = initLookupArray(); if (ret != SUCCESS) return ret; return SUCCESS; } /* * Aligner::valid * this function is used to check if the aligner is compatible * with the specificied parameters */ int Aligner::valid(int len, int numPartition, int numError, int numInsert, int numDelete) { if (len != length) return ERR_PARA; if (numPartition != nPartition) return ERR_PARA; if (numError != nError) return ERR_PARA; if (numInsert != nInsert) return ERR_PARA; if (numDelete != nDelete) return ERR_PARA; return SUCCESS; } /* * Alinger::preProcessHashTables() * This function is used to collect infomation about the sequence for * building the hash table. If indexID is not specified (indexID == * ALIGNER_ALL_INDEX), we process on all indexes. Otherwise, we only * process the specified index. * * To save the space, the collectted infomation is stored in the hash * buckets that is allocated in this function. 
In particular, we scan * the genome sequence to: * 1) count the number of entries in each hash bucket. The number is * stored in corresponding bucket. * 2) identify empty hash buckets. The values of these buckets are set * to be HASH_EMPTY. * 3) identify collision in hash buckets. If more than one entry is hashed * into a bucket, the MSB of the bucket is set to be 1. */ int Aligner::preProcessHashTables() { unsigned int i; int i1, i2, i3, i4, i5; unsigned int num; int ret, pid; int64 tspace[96]; int64 * key = &tspace[8], *subkey1 = &tspace[24], *subkey2 = &tspace[40], *subkey3 = &tspace[56], *subkey4 = &tspace[72], *subkey5 = &tspace[88]; int64 * seq; /* clear all bits in key space */ memset(tspace, 0, sizeof(int64) * 16 * nError); /* * allocate hash buckets and bitmaps for specified * hash tables. The hash buckets are temporarily used * to store statistics information. */ for (i = 0; i < nHashTable; i++) { ret = hashTables[i].preProcessInit(); if (ret != SUCCESS ) return ret; } num = numEntry - length + nError * lenPartition / BITS_PER_BASE; seq = sequence->getSequence(); ProgressBar bar(num, PROGRESS_BAR_WIDTH); for (i = 0; i <= num; i++) { /* update the progress bar.*/ bar.update(i); /* * extract a portion starting from k-th character in the * sequence, store it to a right-aligned integer array. * The offset is plused by lenRest to discard the leftmost * lenRest bits that are not used for lookup. The portion * has lenPartitions bits. */ // sequence->get(key, i * BITS_PER_BASE + lenRest, lenPartitions); BitRead::extract(seq, key, i * BITS_PER_BASE_LL + lenRest, lenPartitions); switch (nError) { case 0: /* exact match. No partitioning. */ hashTables[0].preProcessInsert(key); break; case 1: /* * generate all partitioning on the query sequence by * enumerating the unindexed fragment. */ pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 1); i1 += lenPartition) { /* * extract the indexed segments by removing the intervals * from the sequence. 
The interval is a segment starting * from i1 with the length of lenPartition. */ BitRead::removeInterval(key, subkey1, i1, lenPartition); /* insert the segment into specified hash table. */ hashTables[pid].preProcessInsert(subkey1); pid++; } break; case 2: /* * generate all partitioning on the query sequence by * enumerating two unindexed fragments. */ pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 2); i1 += lenPartition) { /* * extract the intermediate sequence by removing the first intervals * from the sequence. The first interval is a segment starting * from i1 with the length of lenPartition. */ BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 2); i2 += lenPartition) { /* * extract the indexed fragments by removing the second intervals * from the intermediate sequence. The second interval is a segment * starting from i2 with the length of lenPartition. */ BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); /* insert the segment into specified hash table. */ hashTables[pid].preProcessInsert(subkey2); pid++; } } break; case 3: /* * generate all partitioning on the query sequence by * enumerating three unindexed fragments. */ pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 3); i1 += lenPartition) { /* * extract the intermediate sequence by removing the first intervals * from the sequence. The first interval is a segment starting * from i1 with the length of lenPartition. */ BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 3); i2 += lenPartition) { /* * extract the intermediate sequence by removing the second intervals * from the intermediate sequence. The second interval is a segment * starting from i2 with the length of lenPartition. 
*/ BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); for (i3 = i2; i3 < lenPartition * (nPartition - 3); i3 += lenPartition) { /* * extract the indexed fragments by removing the third intervals * from the intermediate sequence. The second interval is a segment * starting from i3 with the length of lenPartition. */ BitRead::removeInterval(subkey2, subkey3, i3, lenPartition); /* insert the segment into specified hash table. */ hashTables[pid].preProcessInsert(subkey3); pid++; } } } break; case 4: pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 4); i1 += lenPartition) { BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 4); i2 += lenPartition) { BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); for (i3 = i2; i3 < lenPartition * (nPartition - 4); i3 += lenPartition) { BitRead::removeInterval(subkey2, subkey3, i3, lenPartition); for (i4 = i3; i4 < lenPartition * (nPartition - 4); i4 += lenPartition) { BitRead::removeInterval(subkey3, subkey4, i4, lenPartition); /* insert the segment into specified hash table. */ hashTables[pid].preProcessInsert(subkey4); pid++; } } } } break; case 5: pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 5); i1 += lenPartition) { BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 5); i2 += lenPartition) { BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); for (i3 = i2; i3 < lenPartition * (nPartition - 5); i3 += lenPartition) { BitRead::removeInterval(subkey2, subkey3, i3, lenPartition); for (i4 = i3; i4 < lenPartition * (nPartition - 5); i4 += lenPartition) { BitRead::removeInterval(subkey3, subkey4, i4, lenPartition); for (i5 = i4; i5 < lenPartition * (nPartition - 5); i5 += lenPartition) { BitRead::removeInterval(subkey4, subkey5, i5, lenPartition); /* insert the segment into specified hash table. 
*/ hashTables[pid].preProcessInsert(subkey5); pid++; } } } } } break; } } /* * collect information and store the results in specificed * hash buckets. */ for (i = 0; i < nHashTable; i++) { ret = hashTables[i].preProcessEnd(); if (ret != SUCCESS ) return ret; } return SUCCESS; } int Aligner::build(char * path) { int ret; unsigned int i, j; Timer timer; double t; timer.start(); ret = buildHashTables(path); if (ret != SUCCESS ) return ret; t = timer.stop(); elog(INFO, "building time: %.6f sec\n", t); // ret = removeHashTables(); // if (ret != SUCCESS) // return ret; /* check the test result */ /* elog(INFO, "testing WHAM indexes..."); ret = check(1000); if (ret != SUCCESS) { elog(ERROR, "failed\n"); return ret; } elog(INFO, "pass\n"); */ /* * write aligner head. The index has been tested. * The saved index is always correct. */ if (path != NULL) { ret = saveHead(path); if (ret != SUCCESS ) return ret; } return SUCCESS; } /* * Alinger::buildHashTables() * build the hash table. If indexID is not specified (indexID == * ALIGNER_ALL_INDEX), we perform build all indexes. Otherwise, * we only build the specified index. * (1) invoke preProcess to collect statistics informations. * (2) allocate the overflow array according to the number of * collision entries. * (3) scan the genome sequence to insert entries into specified * index(es). */ int Aligner::buildHashTables(char * path) { unsigned int i, i1, i2, i3, i4, i5; unsigned int num; int ret, pid; int64 tspace[96]; int64 * key = &tspace[8], *subkey1 = &tspace[24], *subkey2 = &tspace[40], *subkey3 = &tspace[56], *subkey4 = &tspace[72], *subkey5 = &tspace[88]; int64 * seq; /* clear all bits in key space */ memset(tspace, 0, sizeof(int64) * 16 * nError); elog(INFO, "preprocessing...\n"); /* * invoke preProcess to get the statistics info, and allocate * hash buckets and bitmaps. */ ret = preProcessHashTables(); if (ret != SUCCESS ) return ret; elog(INFO, "building...\n"); /* * allocate overflow pool for specified hash tables. 
*/ for (i = 0; i < nHashTable; i++) { ret = hashTables[i].buildInit(); if (ret != SUCCESS ) return ret; } num = numEntry - length + nError * lenPartition / BITS_PER_BASE; seq = sequence->getSequence(); ProgressBar bar(num, PROGRESS_BAR_WIDTH); for (i = num; i >= 0; i--) { /* update the progress bar.*/ bar.update(num - i); /* * extract a portion starting from k-th character in the * sequence, store it to a right-aligned integer array. * The offset is plused by lenRest to discard the leftmost * lenRest bits that are not used for lookup. The portion * has lenPartitions bits. */ BitRead::extract(seq, key, i * BITS_PER_BASE_LL + lenRest, lenPartitions); switch (nError) { case 0: /* exact match. No partitioning. */ hashTables[0].insert(key, i); break; case 1: /* * generate all partitioning on the query sequence by * enumerating the unindexed fragment. */ pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 1); i1 += lenPartition) { /* * extract the indexed segments by removing the intervals * from the sequence. The interval is a segment starting * from i1 with the length of lenPartition. */ BitRead::removeInterval(key, subkey1, i1, lenPartition); /* insert the segment into specified hash table. */ hashTables[pid].insert(subkey1, i); pid++; } break; case 2: /* * generate all partitioning on the query sequence by * enumerating two unindexed fragments. */ pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 2); i1 += lenPartition) { /* * extract the intermediate sequence by removing the first intervals * from the sequence. The first interval is a segment starting * from i1 with the length of lenPartition. */ BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 2); i2 += lenPartition) { /* * extract the indexed fragments by removing the second intervals * from the intermediate sequence. The second interval is a segment * starting from i2 with the length of lenPartition. 
*/ BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); /* insert the segment into specified hash table. */ hashTables[pid].insert(subkey2, i); pid++; } } break; case 3: /* * generate all partitioning on the query sequence by * enumerating three unindexed fragments. */ pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 3); i1 += lenPartition) { /* * extract the intermediate sequence by removing the first intervals * from the sequence. The first interval is a segment starting * from i1 with the length of lenPartition. */ BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 3); i2 += lenPartition) { /* * extract the intermediate sequence by removing the second intervals * from the intermediate sequence. The second interval is a segment * starting from i2 with the length of lenPartition. */ BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); for (i3 = i2; i3 < lenPartition * (nPartition - 3); i3 += lenPartition) { /* * extract the indexed fragments by removing the third intervals * from the intermediate sequence. The second interval is a segment * starting from i3 with the length of lenPartition. */ BitRead::removeInterval(subkey2, subkey3, i3, lenPartition); /* insert the segment into specified hash table. */ hashTables[pid].insert(subkey3, i); pid++; } } } break; case 4: pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 4); i1 += lenPartition) { BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 4); i2 += lenPartition) { BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); for (i3 = i2; i3 < lenPartition * (nPartition - 4); i3 += lenPartition) { BitRead::removeInterval(subkey2, subkey3, i3, lenPartition); for (i4 = i3; i4 < lenPartition * (nPartition - 4); i4 += lenPartition) { BitRead::removeInterval(subkey3, subkey4, i4, lenPartition); /* insert the segment into specified hash table. 
*/ hashTables[pid].insert(subkey4, i); pid++; } } } } break; case 5: pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 5); i1 += lenPartition) { BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 5); i2 += lenPartition) { BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); for (i3 = i2; i3 < lenPartition * (nPartition - 5); i3 += lenPartition) { BitRead::removeInterval(subkey2, subkey3, i3, lenPartition); for (i4 = i3; i4 < lenPartition * (nPartition - 5); i4 += lenPartition) { BitRead::removeInterval(subkey3, subkey4, i4, lenPartition); for (i5 = i4; i5 < lenPartition * (nPartition - 5); i5 += lenPartition) { BitRead::removeInterval(subkey4, subkey5, i5, lenPartition); /* insert the segment into specified hash table. */ hashTables[pid].insert(subkey5, i); pid++; } } } } } break; } if (i == 0) break; } if (embedHashTable) { elog(INFO, "building embeded tables...\n"); for (i = 0; i < nHashTable; i++) { int ret = hashTables[i].buildEmbedTable(); if (ret != SUCCESS) return ret; } } /* elog(INFO, "sorting repeats...\n"); for (i = 0; i < nHashTable; i++) { ret = hashTables[i].sortList(); if (ret != SUCCESS) { elog(ERROR, "failed to remove repeats in hash table %d.\n", i); return ret; } } */ elog(INFO, "saving...\n"); if (path != NULL) { for (i = 0; i < nHashTable; i++) { ret = hashTables[i].save(path); if (ret != SUCCESS) { elog(ERROR, "failed to save hash table %d.\n", i); return ret; } } } return SUCCESS; } void Aligner::sortList() { int ret; elog(INFO, "sorting repeats...\n"); for (int i = 0; i < nHashTable; i++) { ret = hashTables[i].sortList(); if (ret != SUCCESS) { elog(ERROR, "failed to remove repeats in hash table %d.\n", i); return; } } } unsigned int Aligner::alignFirst(int64 * orgkey, char * quals, strand s, int rid, HitSet * hits, bool noGap) { uint32 offset; int64 tspace[96]; int64 * key = &tspace[8], *subkey1 = &tspace[24], *subkey2 = &tspace[40], *subkey3 = &tspace[56], *subkey4 = 
&tspace[72], *subkey5 = &tspace[88]; int nHit = 0; int ret; /* clear all bits in key space */ memset(tspace, 0, sizeof(int64) * 16 * nError); /* * The leftmost $lenPartitions$ bits are used to lookups on hash * tables. The bits are extracted and stored in $key$.. */ BitRead::removeHead(orgkey, key, headMask); switch (nError) { case 0: ret = hashTables[0].lookup(orgkey, key, 0, quals, s, rid, hits, noGap); break; case 1: BitRead::removeInterval(key, subkey1, 0, lenPartition); ret = hashTables[lookupIndex[0]].lookup(orgkey, subkey1, lookupOffset[0], quals, s, rid, hits, noGap); break; case 2: BitRead::removeInterval(key, subkey1, 0, lenPartition); BitRead::removeInterval(subkey1, subkey2, 0, lenPartition); ret = hashTables[lookupIndex[0]].lookup(orgkey, subkey2, lookupOffset[0], quals, s, rid, hits, noGap); break; case 3: BitRead::removeInterval(key, subkey1, 0, lenPartition); BitRead::removeInterval(subkey1, subkey2, 0, lenPartition); BitRead::removeInterval(subkey2, subkey3, 0, lenPartition); ret = hashTables[lookupIndex[0]].lookup(orgkey, subkey3, lookupOffset[0], quals, s, rid, hits, noGap); break; case 4: BitRead::removeInterval(key, subkey1, 0, lenPartition); BitRead::removeInterval(subkey1, subkey2, 0, lenPartition); BitRead::removeInterval(subkey2, subkey3, 0, lenPartition); BitRead::removeInterval(subkey3, subkey4, 0, lenPartition); ret = hashTables[lookupIndex[0]].lookup(orgkey, subkey4, lookupOffset[0], quals, s, rid, hits, noGap); break; case 5: BitRead::removeInterval(key, subkey1, 0, lenPartition); BitRead::removeInterval(subkey1, subkey2, 0, lenPartition); BitRead::removeInterval(subkey2, subkey3, 0, lenPartition); BitRead::removeInterval(subkey3, subkey4, 0, lenPartition); BitRead::removeInterval(subkey4, subkey5, 0, lenPartition); ret = hashTables[lookupIndex[0]].lookup(orgkey, subkey5, lookupOffset[0], quals, s, rid, hits, noGap); break; } if (ret == MSG_HITSETFULL) { HASH_DEBUG(printf("\n")); HASH_DEBUG(getchar()); return ret; } return SUCCESS; } 
/* * Aligner::align * This function is used to perform an alignment on a query sequence. * The query sequence is storen in orgkey. If indexID is not specified * (indexID == ALIGNER_ALL_INDEX), we perform lookups on all indexes. * Otherwise, we only search the specified index. The successful * alingments are appended to the file. */ unsigned int Aligner::align(int64 * orgkey, char * quals, strand s, int rid, HitSet * hits, bool skipFirst, bool noGap) { int i1, i2, i3, i4, i5; int pid; uint32 offset; int64 tspace[96]; int64 * key = &tspace[8], *subkey1 = &tspace[24], *subkey2 = &tspace[40], *subkey3 = &tspace[56], *subkey4 = &tspace[72], *subkey5 = &tspace[88]; int nHit = 0; int ret; // HitPositionList list; /* clear all bits in key space */ memset(tspace, 0, sizeof(int64) * 16 * nError); /* * The leftmost $lenPartitions$ bits are used to lookups on hash * tables. The bits are extracted and stored in $key$.. */ BitRead::removeHead(orgkey, key, headMask); nHit = 0; offset = HASH_NOT_FOUND; switch (nError) { case 0: /* exact match. No partitioning. */ ret = hashTables[0].lookup(orgkey, key, 0, quals, s, rid, hits, noGap); if (ret == MSG_HITSETFULL ) return ret; break; case 1: /* * generate all partitioning on the query sequence by * enumerating the unindexed fragment. */ pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition); i1 += lenPartition) { if (skipFirst && pid == 0) { pid++; continue; } /* * extract the indexed segments by removing the intervals * from the sequence. The interval is a segment starting * from i1 with the length of lenPartition. */ BitRead::removeInterval(key, subkey1, i1, lenPartition); /* * search specified hash table to find potential matched * segment based on the partitioning. */ ret = hashTables[lookupIndex[pid]].lookup(orgkey, subkey1, lookupOffset[pid], quals, s, rid, hits, noGap); if (ret == MSG_HITSETFULL ) return ret; pid++; } break; case 2: /* * generate all partitioning on the query sequence by * enumerating two unindexed fragments. 
*/ pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 1); i1 += lenPartition) { /* * extract the intermediate sequence by removing the first intervals * from the sequence. The first interval is a segment starting * from i1 with the length of lenPartition. */ BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 1); i2 += lenPartition) { if (skipFirst && pid == 0) { pid++; continue; } /* * extract the indexed fragments by removing the second intervals * from the intermediate sequence. The second interval is a segment * starting from i2 with the length of lenPartition. */ BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); /* * search specified hash table to find potential matched * segment based on the partitioning. */ ret = hashTables[lookupIndex[pid]].lookup(orgkey, subkey2, lookupOffset[pid], quals, s, rid, hits, noGap); if (ret == MSG_HITSETFULL) { HASH_DEBUG(printf("\n")); HASH_DEBUG(getchar()); return ret; } pid++; } } break; case 3: /* * generate all partitioning on the query sequence by * enumerating three unindexed fragments. */ pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 2); i1 += lenPartition) { /* * extract the intermediate sequence by removing the first intervals * from the sequence. The first interval is a segment starting * from i1 with the length of lenPartition. */ BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 2); i2 += lenPartition) { /* * extract the intermediate sequence by removing the second intervals * from the intermediate sequence. The second interval is a segment * starting from i2 with the length of lenPartition. */ BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); for (i3 = i2; i3 < lenPartition * (nPartition - 2); i3 += lenPartition) { if (skipFirst && pid == 0) { pid++; continue; } /* * extract the indexed fragments by removing the third intervals * from the intermediate sequence. 
The second interval is a segment * starting from i3 with the length of lenPartition. */ BitRead::removeInterval(subkey2, subkey3, i3, lenPartition); /* * search specified hash table to find potential matched * segment based on the partitioning. */ ret = hashTables[lookupIndex[pid]].lookup(orgkey, subkey3, lookupOffset[pid], quals, s, rid, hits, noGap); if (ret == MSG_HITSETFULL ) return ret; pid++; } } } break; case 4: /* * generate all partitioning on the query sequence by * enumerating three unindexed fragments. */ pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 3); i1 += lenPartition) { /* * extract the intermediate sequence by removing the first intervals * from the sequence. The first interval is a segment starting * from i1 with the length of lenPartition. */ BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 3); i2 += lenPartition) { /* * extract the intermediate sequence by removing the second intervals * from the intermediate sequence. The second interval is a segment * starting from i2 with the length of lenPartition. */ BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); for (i3 = i2; i3 < lenPartition * (nPartition - 3); i3 += lenPartition) { /* * extract the intermediate sequence by removing the second intervals * from the intermediate sequence. The second interval is a segment * starting from i2 with the length of lenPartition. */ BitRead::removeInterval(subkey2, subkey3, i3, lenPartition); for (i4 = i3; i4 < lenPartition * (nPartition - 3); i4 += lenPartition) { if (skipFirst && pid == 0) { pid++; continue; } /* * extract the indexed fragments by removing the third intervals * from the intermediate sequence. The second interval is a segment * starting from i3 with the length of lenPartition. */ BitRead::removeInterval(subkey3, subkey4, i4, lenPartition); /* * search specified hash table to find potential matched * segment based on the partitioning. 
*/ ret = hashTables[lookupIndex[pid]].lookup(orgkey, subkey4, lookupOffset[pid], quals, s, rid, hits, noGap); if (ret == MSG_HITSETFULL ) return ret; pid++; } } } } break; case 5: /* * generate all partitioning on the query sequence by * enumerating three unindexed fragments. */ pid = 0; for (i1 = 0; i1 < lenPartition * (nPartition - 4); i1 += lenPartition) { /* * extract the intermediate sequence by removing the first intervals * from the sequence. The first interval is a segment starting * from i1 with the length of lenPartition. */ BitRead::removeInterval(key, subkey1, i1, lenPartition); for (i2 = i1; i2 < lenPartition * (nPartition - 4); i2 += lenPartition) { /* * extract the intermediate sequence by removing the second intervals * from the intermediate sequence. The second interval is a segment * starting from i2 with the length of lenPartition. */ BitRead::removeInterval(subkey1, subkey2, i2, lenPartition); for (i3 = i2; i3 < lenPartition * (nPartition - 4); i3 += lenPartition) { /* * extract the intermediate sequence by removing the second intervals * from the intermediate sequence. The second interval is a segment * starting from i2 with the length of lenPartition. */ BitRead::removeInterval(subkey2, subkey3, i3, lenPartition); for (i4 = i3; i4 < lenPartition * (nPartition - 4); i4 += lenPartition) { /* * extract the intermediate sequence by removing the second intervals * from the intermediate sequence. The second interval is a segment * starting from i2 with the length of lenPartition. */ BitRead::removeInterval(subkey3, subkey4, i4, lenPartition); for (i5 = i4; i5 < lenPartition * (nPartition - 4); i5 += lenPartition) { if (skipFirst && pid == 0) { pid++; continue; } /* * extract the indexed fragments by removing the third intervals * from the intermediate sequence. The second interval is a segment * starting from i3 with the length of lenPartition. 
*/ BitRead::removeInterval(subkey4, subkey5, i5, lenPartition); /* * search specified hash table to find potential matched * segment based on the partitioning. */ ret = hashTables[lookupIndex[pid]].lookup(orgkey, subkey5, lookupOffset[pid], quals, s, rid, hits, noGap); if (ret == MSG_HITSETFULL ) return ret; pid++; } } } } } break; } HASH_DEBUG(printf("\n")); HASH_DEBUG(getchar()); /* ret = hashTables[0].lookupHitList( orgkey, subkey2, lookupOffset[pid], quals, s, rid, hits, &list); if (ret == MSG_HITSETFULL) return ret; */ return SUCCESS; } /* * Alinger::check * This function is used to examine the errors in the alinger. * We randomly choose some positions from the compact sequence. * Beginning from these positions, we extract subsequences and * manually modify some chracters. The modified subsequences are * searched in the aligner and check if we can find those choose * positions. */ int Aligner::check(int num) { unsigned int i, j, sid, pos = 0, word; int64 k1[16], k2[16], code; int64 * key = &k1[8], *key2 = &k2[8]; int64 * seqVector; HitSet * hits; int ret; memset(k1, 0, 16 * sizeof(int64)); memset(k2, 0, 16 * sizeof(int64)); hits = new HitSet(0, 0, 0, false, false); hits->init(sequence, length); seqVector = sequence->getSequence(); for (i = 0; i < num; i++) { /* choose a random position in the compact sequence */ sid = RAND() % (numEntry - length + 1); /* get the subsequence beginning from the position */ BitRead::extract(seqVector, key, sid * BITS_PER_BASE_LL , length * BITS_PER_BASE); BitRead::copy(key, key2); /* * modify nError characters in the subsequence. Note that * the modified character may be the same as the original * character. Thus, we generate a sequence that can be * aligned to the compact sequence within numError errors. 
*/ for (j = 0; j < nError; j++) { pos = RAND() % length; word = 3 - (pos * BITS_PER_BASE / BITS_PER_LONGWORD); /* align the position to the boundary of characters */ pos = (pos * BITS_PER_BASE) % BITS_PER_LONGWORD; if (pos + BITS_PER_BASE > BITS_PER_LONGWORD ) continue; /* generate a random character */ code = RAND() % 4; /* modify the character */ key2[word] = (key2[word] & ~((int64) 7 << pos)) | (code << pos); } /* align the modified subsequence */ hits->reset(); align(key2, NULL, FORWARD, i, hits); if (hits->getNumHits() <= 0) { return ERR_CHECK; } } delete hits; return SUCCESS; } AlignRes Aligner::align(AlignInfo * info, char * filename) { AlignRes ret; if (info->reader2 == NULL ) ret = alignSingleEnd(info, filename); else ret = alignPairEnd(info, filename); printStat(); return ret; } /* * Aligner::align * This function is used to align all query sequences in the short * read structure in batch. */ AlignRes Aligner::alignSingleEnd(AlignInfo * info, char * filename) { unsigned int i; FILE * file = NULL, *alfile = NULL, *unfile = NULL; int64 * query; char * quals = NULL; int strand; HitSet * hits; bool isQual = false; float ratio; Writer * writer = NULL; AlignRes res; ShortRead * read = info->reader1; int maxHit = info->maxHit, maxMatch = info->maxMatch, maxQual = info->maxQual; bool sorted = info->sorted, strata = info->strata; isQual = sorted || (maxQual <= 255); hits = new HitSet(maxHit, maxMatch, maxQual, sorted, false); hits->init(sequence, length); if (filename[0] != '\0') { if (info->outputFormat == MODE_NORMAL ) writer = new SimpleWriter(sequence, read, filename); else if (info->outputFormat == MODE_SAM ) writer = new SamWriter(sequence, read, filename); } //for (i = 0; i < nHashTable; i++) // hashTables[i].resetStat(); if (writer != NULL ) writer->writeHead(); res.nRead = read->getNumReads(); res.nValidRead = 0; res.nValidAlignment = 0; for (i = 0; i < nHashTable; i++) { elog(DEBUG1, "Max Scan: %d\n", hashTables[i].getMaxScan()); } ProgressBar 
bar(res.nRead - 1, PROGRESS_BAR_WIDTH); for (i = 0; i < res.nRead; i++) { /* update the progress bar.*/ if (info->showBar) bar.update(i); hits->reset(); /* forward case */ if (read->isForward()) { HASH_DEBUG(printf("forward (first fragment):\n")); query = read->getRead(i, FORWARD); if (isQual) quals = read->getQual(i); alignFirst(query, quals, FORWARD, i, hits, true); } /* backward case */ if (read->isBackward() && !hits->isFull()) { HASH_DEBUG(printf("backward (first fragment):\n")); query = read->getRead(i, BACKWARD); if (isQual) quals = read->getQual(i); alignFirst(query, quals, BACKWARD, i, hits, true); } /* forward case */ if (read->isForward() && !hits->isFull()) { HASH_DEBUG(printf("forward:\n")); query = read->getRead(i, FORWARD); if (isQual) quals = read->getQual(i); align(query, quals, FORWARD, i, hits, true, true); } /* backward case */ if (read->isBackward() && !hits->isFull()) { HASH_DEBUG(printf("backward:\n")); query = read->getRead(i, BACKWARD); if (isQual) quals = read->getQual(i); align(query, quals, BACKWARD, i, hits, true, true); } if (info->maxGap > 0) { if (read->isForward() && !hits->isFull()) { HASH_DEBUG(printf("forward:\n")); query = read->getRead(i, FORWARD); if (isQual) quals = read->getQual(i); align(query, quals, FORWARD, i, hits, false, false); } if (read->isBackward() && !hits->isFull()) { HASH_DEBUG(printf("backward:\n")); query = read->getRead(i, BACKWARD); if (isQual) quals = read->getQual(i); align(query, quals, BACKWARD, i, hits, false, false); } } hits->verifyQual(); if (hits->getNumHits() > 0) { res.nValidRead++; res.nValidAlignment += hits->getNumHits(); read->printAlign(i); } else { read->printUnalign(i); } //write may chagne the nubmer of hits in the set if (writer != NULL) { writer->writeAlignment(i, hits); } } //for (i = 0; i < nHashTable; i++) // printf("%u probes, %u empty, %u collision, %u seq probe.\n", hashTables[i].getStatProbe(), // hashTables[i].getStatEmpty(), hashTables[i].getStatCollision(), 
hashTables[i].getStatSeqProbe()); delete writer; delete hits; return res; } AlignRes Aligner::alignPairEnd(AlignInfo * info, char * filename) { unsigned int i; bool success; unsigned int nSuccessRead = 0; FILE * file = NULL; int64 * query; char * quals = NULL; HitPairSet * hitpair; HitSet * set1, *set2; bool isQual = false; Writer * writer = NULL; char mateFilename[256]; AlignRes res; ShortRead * read1 = info->reader1, *read2 = info->reader2; unsigned int minins = info->minins, maxins = info->maxins; int maxHit = info->maxHit, maxMatch = info->maxMatch, maxQual = info->maxQual; bool sorted = info->sorted, strata = info->strata; isQual = sorted || (maxQual <= 255); set1 = new HitSet(info->maxMate, 0, maxQual, false, false); set2 = new HitSet(info->maxMate, 0, maxQual, false, false); hitpair = new HitPairSet(maxHit, maxMatch, maxQual, sorted, false); set1->init(sequence, length); set2->init(sequence, length); hitpair->init(sequence, length); if (filename[0] != '\0') { if (info->outputFormat == MODE_NORMAL ) writer = new SimplePairWriter(sequence, read1, read2, filename); else if (info->outputFormat == MODE_SAM ) writer = new SamPairWriter(sequence, read1, read2, filename); } if (writer != NULL ) writer->writeHead(); //for (i = 0; i < nHashTable; i++) // hashTables[i].resetStat(); res.nRead = read1->getNumReads(); res.nValidRead = 0; res.nValidAlignment = 0; ProgressBar bar(res.nRead - 1, PROGRESS_BAR_WIDTH); for (i = 0; i < res.nRead; i++) { /* update the progress bar.*/ if (info->showBar) bar.update(i); set1->reset(); /* forward case for mate1 */ if (read1->isForward()) { query = read1->getRead(i, FORWARD); if (isQual) quals = read1->getQual(i); align(query, quals, FORWARD, i, set1); } /* backward case for mate1*/ if (read1->isBackward()) { query = read1->getRead(i, BACKWARD); if (isQual) quals = read1->getQual(i); align(query, quals, BACKWARD, i, set1); } set2->reset(); /* forward case for mate2*/ if (read2->isForward()) { query = read2->getRead(i, FORWARD); if 
(isQual) quals = read2->getQual(i); align(query, quals, FORWARD, i, set2); } /* backward case for mate2*/ if (read2->isBackward()) { query = read2->getRead(i, BACKWARD); if (isQual) quals = read2->getQual(i); align(query, quals, BACKWARD, i, set2); } hitpair->build(set1, set2, minins, maxins, info->pairStrand, info->mateMatch); hitpair->verifyQual(); if (hitpair->isProperMatch() && hitpair->getNumHits() > 0) { res.nValidRead++; res.nValidAlignment += hitpair->getNumHits() / 2; read1->printAlign(i); read2->printAlign(i); } else { read1->printUnalign(i); read2->printUnalign(i); } if (writer != NULL) { writer->writeAlignment(i, hitpair); } } //for (i = 0; i < nHashTable; i++) // printf("%u probes, %u empty, %u collision, %u seq probe.\n", hashTables[i].getStatProbe(), // hashTables[i].getStatEmpty(), hashTables[i].getStatCollision(), hashTables[i].getStatSeqProbe()); delete set1; delete set2; delete hitpair; delete writer; return res; } AlignRes Aligner::merge(AlignInfo * info, int step, char * filename) { if (info->reader2 == NULL ) return mergeSingleEnd(info, step, filename); else return mergePairEnd(info, step, filename); } /* * Aligner::mergeSingleEnd() * This function is used to merge the hits in various files into a single * hit file for single-end alignments. The input hit file is in RAW format, * and is generated by search a partition of indexes. The output hit file * is in a format specified by the users. 
*/
/*
 * info     - alignment options; reader1 supplies the reads.
 * step     - the raw files read are "<filename>.p<k>" for
 *            k = 0, step, 2*step, ... below nHashTable.
 * filename - output file name and prefix of the raw input files.
 *
 * Returns the read/alignment counters. If a raw file cannot be opened
 * (or its name does not fit in the path buffer) the error is logged,
 * everything allocated here is released, and the counters gathered so
 * far are returned.
 */
AlignRes Aligner::mergeSingleEnd(AlignInfo * info, int step, char * filename) {
	int i;
	int curid;
	char fname[MAX_LENGTH_PATH];
	Writer * writer = NULL;
	AlignRes res;

	if (info->outputFormat == MODE_NORMAL)
		writer = new SimpleWriter(sequence, info->reader1, filename);
	else if (info->outputFormat == MODE_SAM)
		writer = new SamWriter(sequence, info->reader1, filename);
	if (writer != NULL)
		writer->writeHead();

	HitSet * set = new HitSet(info->maxHit, info->maxMatch, info->maxQual,
			info->sorted, false);
	set->init(sequence, length);

	const int num = (nHashTable + step - 1) / step;
	Hit * hits = new Hit[num];

	res.nRead = info->reader1->getNumReads();
	res.nValidRead = 0;
	res.nValidAlignment = 0;

	RawReader * readers = new RawReader[num];
	bool opened = true;
	for (i = 0; i < num; i++) {
		/* bounded formatting: a truncated name is treated as an open failure */
		int n = snprintf(fname, sizeof(fname), "%s.p%d", filename, i * step);
		if (n < 0 || (size_t) n >= sizeof(fname)
				|| readers[i].init(fname, res.nRead) == false) {
			opened = false;
			break;
		}
		readers[i].next(&hits[i]);
	}
	if (!opened) {
		elog(ERROR, "ERROR: open raw align files.\n");
		delete writer;
		delete set;
		delete[] hits;
		delete[] readers;
		return res;
	}

	/*
	 * the merge algorithm is efficient when most reads
	 * have valid alignments
	 */
	for (curid = 0; curid < res.nRead; curid++) {
		/* empty the hit set */
		set->reset();

		/* drain, from every raw file, the hits that belong to this read */
		for (i = 0; i < num; i++) {
			while (hits[i].id == curid) {
				set->add(hits[i].query, hits[i].reference, hits[i].pos,
						hits[i].strand, &hits[i].error, hits[i].qual,
						hits[i].id);
				readers[i].next(&hits[i]);
			}
		}

		set->verifyQual();

		if (set->getNumHits() > 0) {
			res.nValidRead++;
			res.nValidAlignment += set->getNumHits();
			info->reader1->printAlign(curid);
		} else
			info->reader1->printUnalign(curid);

		if (writer != NULL) {
			writer->writeAlignment(curid, set);
		}
	}

	delete writer;
	delete set;
	delete[] hits;
	delete[] readers;

	/*
	 * remove all intermediate hit files. ::remove() is used instead of
	 * running "rm -f" through the shell, so paths with spaces or shell
	 * metacharacters are handled; a missing file is ignored as before.
	 */
	for (i = 0; i < num; i++) {
		int n = snprintf(fname, sizeof(fname), "%s.p%d", filename, i * step);
		if (n >= 0 && (size_t) n < sizeof(fname))
			(void) ::remove(fname);
	}

	return res;
}

/*
 * Aligner::mergePairEnd()
 * This function is used to merge the hits in various files into a single
 * hit file for paired-end alignments.
The input hit file is in RAW format, * and is generated by search a partition of indexes. The output hit file * is in a format specified by the users. */ AlignRes Aligner::mergePairEnd(AlignInfo * info, int step, char * filename) { int i, ret; Hit * hits1, *hits2; HitSet * set1, *set2; HitPairSet * hitpair; int num; char fname[MAX_LENGTH_PATH]; int curid; Writer * writer = NULL; RawReader * readers1, *readers2; AlignRes res; if (info->outputFormat == MODE_NORMAL ) writer = new SimplePairWriter(sequence, info->reader1, info->reader2, filename); else if (info->outputFormat == MODE_SAM ) writer = new SamPairWriter(sequence, info->reader1, info->reader2, filename); if (writer != NULL ) writer->writeHead(); set1 = new HitSet(0, 0, 0, false, false); set2 = new HitSet(0, 0, 0, false, false); hitpair = new HitPairSet(info->maxHit, info->maxMatch, info->maxQual, info->sorted, false); set1->init(sequence, length); set2->init(sequence, length); hitpair->init(sequence, length); num = (nHashTable + step - 1) / step; hits1 = new Hit[num]; hits2 = new Hit[num]; res.nRead = info->reader1->getNumReads(); res.nValidRead = 0; res.nValidAlignment = 0; readers1 = new RawReader[num]; readers2 = new RawReader[num]; for (i = 0; i < num; i++) { /* initialize the reader for mate1 */ sprintf(fname, "%s.m1.p%d", filename, i * step); if (readers1[i].init(fname, res.nRead) == false) { elog(ERROR, "ERROR: open raw align files.\n"); return res; } readers1[i].next(&hits1[i]); /* initialize the reader for mate2 */ sprintf(fname, "%s.m2.p%d", filename, i * step); if (readers2[i].init(fname, res.nRead) == false) { elog(ERROR, "ERROR: open raw align files.\n"); return res; } readers2[i].next(&hits2[i]); } /* * the merge algorithms is effcient when most reads * have valid alignments */ for (curid = 0; curid < res.nRead; curid++) { /* merge hits for mate 1 */ /* empty the hit set */ set1->reset(); for (i = 0; i < num; i++) { while (hits1[i].id == curid) { set1->add(hits1[i].query, hits1[i].reference, 
hits1[i].pos, hits1[i].strand, &hits1[i].error, hits1[i].qual, hits1[i].id); readers1[i].next(&hits1[i]); } } /* merge hits for mate 2 */ /* empty the hit set */ set2->reset(); for (i = 0; i < num; i++) { while (hits2[i].id == curid) { set2->add(hits2[i].query, hits2[i].reference, hits2[i].pos, hits2[i].strand, &hits2[i].error, hits2[i].qual, hits2[i].id); readers2[i].next(&hits2[i]); } } hitpair->build(set1, set2, info->minins, info->maxins, info->pairStrand, info->mateMatch); hitpair->verifyQual(); if (hitpair->isProperMatch() && hitpair->getNumHits() > 0) { res.nValidRead++; res.nValidAlignment += hitpair->getNumHits() / 2; info->reader1->printAlign(curid); info->reader2->printAlign(curid); } else { info->reader1->printUnalign(curid); info->reader2->printUnalign(curid); } if (writer != NULL) { writer->writeAlignment(curid, hitpair); } } delete writer; delete set1; delete set2; delete hitpair; delete[] hits1; delete[] hits2; delete[] readers1; delete[] readers2; /* remove all intermediate hit files */ char command[MAX_LENGTH_PATH]; for (i = 0; i < num; i++) { sprintf(command, "rm -f %s.m1.p%d", filename, i * step); system(command); sprintf(command, "rm -f %s.m2.p%d", filename, i * step); system(command); } return res; } void Aligner::printStat() { #ifdef DEBUG_STAT for (int i = 0; i < nHashTable; i++) { printf("Hash table %d: \n", i); hashTables[i].printStat(); } #endif } /* * Aligner::save * This function is used to save the in-memory index on disk. If indexID * is not specified (indexID == ALIGNER_ALL_INDEX), we save all hash * tables. Otherwise, we only save the specified index. The Aligner * structure is stored in file head.whm in the specified data path. Hash * tables are separately stored in h$ID$.whm. 
*/
/*
 * path    - prefix of the index files; at most 240 characters.
 * indexID - ALIGNER_ALL_INDEX or the id of one hash table.
 *
 * Returns SUCCESS, ERR_PARA for a bad argument or an unopenable head
 * file, ERR_FILE for a write/flush/close failure, or the error code of
 * HashTable::save(). The head file is closed on every error path.
 */
int Aligner::save(char * path, int indexID) {
	int ret;
	char fname[MAX_LENGTH_PATH];
	FILE * file;

	if (strlen(path) > 240)
		return ERR_PARA;
	/* a specific id must address an existing table */
	if (indexID != ALIGNER_ALL_INDEX && (indexID < 0 || indexID >= nHashTable))
		return ERR_PARA;

	snprintf(fname, sizeof(fname), "%s.head.whm", path);
	file = fopen(fname, "wb");
	if (file == NULL) {
		elog(DEBUG1, "ERROR: open data file to write aligner structure.\n");
		return ERR_PARA;
	}

	/*
	 * The object is written as a raw memory image followed by the two
	 * lookup arrays (nLookup ints each).
	 */
	if (fwrite(this, sizeof(Aligner), 1, file) != 1
			|| fwrite(lookupIndex, sizeof(int), nLookup, file) != (size_t) nLookup
			|| fwrite(lookupOffset, sizeof(int), nLookup, file) != (size_t) nLookup) {
		elog(DEBUG1, "ERROR: write aligner structure.\n");
		fclose(file);
		return ERR_FILE;
	}

	if (fflush(file) != 0) {
		elog(DEBUG1, "ERROR: flush the aligner structure.\n");
		fclose(file);
		return ERR_FILE;
	}

	if (fclose(file) != 0) {
		elog(DEBUG1, "ERROR: close data file for aligner structure.\n");
		return ERR_FILE;
	}

	if (indexID == ALIGNER_ALL_INDEX) {
		for (int i = 0; i < nHashTable; i++) {
			ret = hashTables[i].save(path);
			if (ret != SUCCESS) {
				elog(DEBUG1, "ERROR: write hash table %d.\n", i);
				return ret;
			}
		}
	} else {
		ret = hashTables[indexID].save(path);
		if (ret != SUCCESS) {
			elog(DEBUG1, "ERROR: write hash table %d.\n", indexID);
			return ret;
		}
	}

	return SUCCESS;
}

/*
 * Aligner::load
 * This function is used to load the on-disk index into memory. If indexID
 * is not specified (indexID == ALIGNER_ALL_INDEX), we load all hash
 * tables. Otherwise, we only load the specified index. The Aligner
 * structure is loaded from the file head.whm in the specified data path. Hash
 * tables are separately loaded from files h$ID$.whm.
*/ int Aligner::load(char * path, int indexID) { int ret; char fname[256]; FILE * file; if (strlen(path) > 240) return ERR_PARA; if (indexID != ALIGNER_ALL_INDEX && indexID >= nHashTable) return ERR_PARA; sprintf(fname, "%s.head.whm", path); file = fopen(fname, "rb"); if (file == NULL) { elog(DEBUG1, "ERROR: head file does not exist.\n"); return ERR_PARA; } ret = fread(this, sizeof(Aligner), 1, file); if (ret != 1) { elog(DEBUG1, "ERROR: read head data file.\n"); return ERR_FILE; } lookupIndex = new int[nLookup]; if (lookupIndex == NULL ) return ERR_MEM; ret = fread(lookupIndex, sizeof(int), nLookup, file); if (ret != nLookup) { elog(DEBUG1, "ERROR: read head data file.\n"); return ERR_FILE; } lookupOffset = new int[nLookup]; if (lookupIndex == NULL ) return ERR_MEM; ret = fread(lookupOffset, sizeof(int), nLookup, file); if (ret != nLookup) { elog(DEBUG1, "ERROR: read head data file.\n"); return ERR_FILE; } ret = fclose(file); if (ret != 0) { elog(DEBUG1, "ERROR: close the head data file.\n"); return ERR_FILE; } /* load the compact sequence */ sequence = new CompactSequence(); ret = sequence->load(path); if (ret != SUCCESS) { elog(ERROR, "failed to load reference sequences.\n"); return ret; } /* verify the sequence matches with the aligner */ ret = sequence->valid(length, nError); if (ret != SUCCESS) { elog(ERROR, "ERROR: unmismatched sequence/index.\n"); return ERR_INDEX; } /* load hash table one by one */ hashTables = new HashTable[nHashTable]; if (hashTables == NULL ) return ERR_MEM; if (indexID == ALIGNER_ALL_INDEX) { for (int i = 0; i < nHashTable; i++) { ret = hashTables[i].load(path, i, sequence); if (ret != SUCCESS) { elog(DEBUG1, "ERROR: read hash table %d data file.\n", i); return ret; } } } else { ret = hashTables[indexID].load(path, indexID, sequence); if (ret != SUCCESS) { elog(DEBUG1, "ERROR: read hash table %d data file.\n", indexID); return ret; } } return SUCCESS; } int Aligner::saveHead(char * path) { int ret; char fname[256]; FILE * file; if 
(strlen(path) > 240) return ERR_PARA; sprintf(fname, "%s.head.whm", path); file = fopen(fname, "wb"); if (file == NULL) { elog(DEBUG1, "ERROR: open data file to write aligner structure.\n"); return ERR_PARA; } ret = fwrite(this, sizeof(Aligner), 1, file); if (ret != 1) { elog(DEBUG1, "ERROR: write aligner structure.\n"); return ERR_FILE; } ret = fwrite(lookupIndex, sizeof(int), nLookup, file); if (ret != nLookup) { elog(DEBUG1, "ERROR: write aligner structure.\n"); return ERR_FILE; } ret = fwrite(lookupOffset, sizeof(int), nLookup, file); if (ret != nLookup) { elog(DEBUG1, "ERROR: write aligner structure.\n"); return ERR_FILE; } ret = fflush(file); if (ret != 0) { elog(DEBUG1, "ERROR: flush the aligner structure.\n"); return ERR_FILE; } ret = fclose(file); if (ret != 0) { elog(DEBUG1, "ERROR: close data file for aligner structure.\n"); return ERR_FILE; } return SUCCESS; } int Aligner::saveIndex(char * path, int indexID) { int ret; if (strlen(path) > 240) return ERR_PARA; if (indexID < 0 || indexID >= nHashTable) return ERR_PARA; ret = hashTables[indexID].save(path); if (ret != SUCCESS) { elog(DEBUG1, "ERROR: write hash table %d.\n", indexID); return ret; } return SUCCESS; } int Aligner::loadHead(char * path) { int ret; char fname[256]; FILE * file; if (strlen(path) > 240) return ERR_PARA; sprintf(fname, "%s.head.whm", path); file = fopen(fname, "rb"); if (file == NULL) { elog(DEBUG1, "ERROR: head file does not exist.\n"); return ERR_PARA; } ret = fread(this, sizeof(Aligner), 1, file); if (ret != 1) { elog(DEBUG1, "ERROR: read head data file.\n"); return ERR_FILE; } // sequence = seq; lookupIndex = new int[nLookup]; if (lookupIndex == NULL ) return ERR_MEM; ret = fread(lookupIndex, sizeof(int), nLookup, file); if (ret != nLookup) { elog(DEBUG1, "ERROR: read head data file.\n"); return ERR_FILE; } lookupOffset = new int[nLookup]; if (lookupIndex == NULL ) return ERR_MEM; ret = fread(lookupOffset, sizeof(int), nLookup, file); if (ret != nLookup) { elog(DEBUG1, "ERROR: 
read head data file.\n"); return ERR_FILE; } ret = fclose(file); if (ret != 0) { elog(DEBUG1, "ERROR: close the head data file.\n"); return ERR_FILE; } hashTables = new HashTable[nHashTable]; if (hashTables == NULL ) return ERR_MEM; /* load the compact sequence */ sequence = new CompactSequence(); ret = sequence->load(path); if (ret != SUCCESS) { elog(ERROR, "failed to load reference sequences.\n"); return ret; } /* verify the sequence matches with the aligner */ ret = sequence->valid(length, nError); if (ret != SUCCESS) { elog(ERROR, "ERROR: unmismatched sequence/index.\n"); return ERR_INDEX; } return SUCCESS; } int Aligner::loadHashtables(char * path) { int i; int ret; if (strlen(path) > 240) return ERR_PARA; for (i = 0; i < nHashTable; i++) { ret = hashTables[i].load(path, i, sequence); if (ret != SUCCESS) { elog(ERROR, "ERROR: read hash table %d data file.\n", i); return ret; } hashTables[i].setErrorModel(nMaxError, nInsert, maxQual); } return SUCCESS; } /* * Aligner::remove * free the space of specified index */ int Aligner::removeHashTables() { int i; int ret; for (i = 0; i < nHashTable; i++) { ret = hashTables[i].remove(); if (ret != SUCCESS) { return ret; } } return SUCCESS; } wham/hash.cpp0000644001532600153260000015570212054751052012502 0ustar yinanyinan/** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ /* $Id: hash.cpp 167 2012-11-26 20:33:46Z yinan $ */ #include #include #include #include #include #include #include "hash.h" #include "bitread.h" #include "error.h" #include "pair.h" #include "edit_distance.h" #include "util.h" #include "rdtsc.h" #include "interval.h" //#define NUM_BUCKET 1350000011 #define NUM_BUCKET 1500000001 //#define NUM_BUCKET 2000000011 #define HASH_OVERFLOW_LIST_SCAN_BOUND 64 #define BITWISE_ALIGNMENT //#define DEBUG_PRINT_LIST unsigned long long timePairAlign = 0; unsigned long long numPairAlign = 0; unsigned long long numPairAlignFilter = 0; unsigned long long numPairAlignDBA = 0; unsigned long long statHashLookupEntry = 0; unsigned long long statHashLookup = 0; int64 lookuptspace[96]; int64 lookuptspace2[16]; typedef struct EntryCounter { int64 key[6]; uint32 num; uint32 offset; } EntryCounter; int compareEntryCounter(const void * a, const void * b) { EntryCounter * p1 = (EntryCounter *) a; EntryCounter * p2 = (EntryCounter *) b; return p1->num - p2->num; } HashTable::HashTable() { memset(this, 0, sizeof(HashTable)); memset(lookuptspace, 0, sizeof(int64) * 96); memset(lookuptspace2, 0, sizeof(int64) * 16); } HashTable::~HashTable() { delete[] buckets; delete[] overflowPool; } /* * HashTable::init() * initialize the private variables */ void HashTable::init(CompactSequence * seq, int len, unsigned int nBucket, int numError, int numInsert, int numDelete, int numPartition, int maxRepeat, bool embed, int index) { sequence = seq; indexID = index; length = len; lenSeq = length * BITS_PER_BASE; nMismatch = numError; nInsert = numInsert; nDelete = numDelete; nMaxError = nMismatch; nPartition = numPartition; // nLookup = numLookup; nMaxGap = 0; maxQual = MAX_INT; this->maxRepeat = maxRepeat; embedTables = NULL; lenPartition = length / nPartition * BITS_PER_BASE; lenKey = length / nPartition * (nPartition - nMismatch) * BITS_PER_BASE; lenRest = length * BITS_PER_BASE - lenPartition * nPartition; BitRead::genHeadMask(headMask, lenPartition * 
nPartition); BitRead::genHeadMask(embedHeadMask, (lenSeq - lenKey - lenRest) / BITS_PER_BASE / (nMismatch + 1) * (nMismatch + 1) * BITS_PER_BASE); if (nBucket == 0) { double nEntry, nSpace; nEntry = (double) seq->getNum(); nSpace = pow(8.0, length / nPartition * (nPartition - nMismatch)); numBucket = nEntry < nSpace ? (int) nEntry : (int) nSpace; } else numBucket = nBucket; numBucket = nextPrime(numBucket); numEmpty = numBucket; /* * if the sequence size is greater than 2^31, we have to * use a normal hash table, otherwise, we use a compressed * hash table to speedup the searches. */ if (seq->getNum() < COMPRESS_TABLE_SIZE) compressedTable = true; else compressedTable = false; bUseEmbedTables = embed; // compressedTable = false; resetStat(); } /* * HashTable::preProcessInit() * Allocate and initialize the hash bucket array and bitmap arrays * for collision bits and empty bits. */ int HashTable::preProcessInit() { /* allocate hash buckets. */ buckets = (unsigned int *) malloc((int64) numBucket * sizeof(unsigned int)); if (buckets == NULL ) return ERR_MEM; /* * allocate bitmap arrays to identify empty buckets * and collision buckets. */ emptyBits = (unsigned char *) malloc((int64) numBucket / BITS_PER_BYTE + 1); if (emptyBits == NULL ) return ERR_MEM; collisionBits = (unsigned char *) malloc( (int64) numBucket / BITS_PER_BYTE + 1); if (collisionBits == NULL ) return ERR_MEM; /* Initialization */ memset(buckets, 0, numBucket * sizeof(unsigned int)); memset(emptyBits, 0, numBucket / BITS_PER_BYTE + 1); memset(collisionBits, 0, numBucket / BITS_PER_BYTE + 1); /* embed table bis */ numEmbedTables = 0; embedBits = (unsigned char *) malloc((int64) numBucket / BITS_PER_BYTE + 1); if (embedBits == NULL ) return ERR_MEM; memset(embedBits, 0, numBucket / BITS_PER_BYTE + 1); return SUCCESS; } /* * HashTable::preProcessEnd() * apply the empty bits and collision bits to the hash buckets. * For empty buckets, the bucket values are set to be HASH_EMPTY. 
* For the buckets with collisions, the most significant bits in * the buckets are set to be 1. */ int HashTable::preProcessEnd() { uint32 i; uint32 tmp, sum = 0; uint32 collision; const double ln2 = log(2); if (buckets == NULL) { elog(DEBUG1, "ERROR: unallocated bucket array in hash table.\n"); return ERR_PARA; } if (emptyBits == NULL || collisionBits == NULL) { elog(DEBUG1, "ERROR: unallocated bitmap in hash table.\n"); return ERR_PARA; } if (numOverflowEntry < 0) return ERR_PARA; /* allicate overflow pool and bitmaps */ overflowPool = (unsigned int *) malloc( (int64) numOverflowEntry * sizeof(unsigned int)); if (overflowPool == NULL ) return ERR_MEM; memset(overflowPool, 0, numOverflowEntry * sizeof(unsigned int)); if (!compressedTable) { overflowBits = (unsigned char *) malloc( (int64) numOverflowEntry / BITS_PER_BYTE + 1); if (overflowBits == NULL ) return ERR_MEM; memset(overflowBits, 0, (numOverflowEntry / BITS_PER_BYTE + 1) * sizeof(char)); } for (i = 0; i < numBucket; i++) { tmp = buckets[i]; /* update histogram */ int h = 0; if (tmp > 0) h = (int) ceil(log(tmp) / ln2); if (h >= nHistogram) histogram[nHistogram - 1]++; else histogram[h]++; } maxScan = 64; if (useEmbedTables()) { embedShreshold = 64; setScanThreshold(0.001); maxScan = embedShreshold; numEmbedTables = 0; for (int i = nHistogram - 1; i >= 0; i--) { if ((0x1 << i) == embedShreshold) break; numEmbedTables += histogram[i]; } embedTableSizes = (int *) malloc(sizeof(int) * numEmbedTables); embedTableBucketIds = (uint32 *) malloc(sizeof(uint32) * numEmbedTables); } /* * scan the hash buckets to appy the empty bits and * collision bits. */ int embedId = 0; for (i = 0; i < numBucket; i++) { /* * The current value of the bucket is the number of collision * entries in each bucket. We accumulate this value to compute * the position of the last entries in the overflow array for each bucket, * and store the position into the bucket. 
For the non-collision * buckets, the values will be updated to the position of segment * in the function insert. */ tmp = buckets[i]; sum += tmp; buckets[i] = sum; if (buckets[i] > 0) { if (compressedTable) { HASH_SET_END(overflowPool[buckets[i] - 1]); } else { BITMAP_SET(overflowBits[(buckets[i] - 1) / BITS_PER_BYTE], (buckets[i] - 1) % BITS_PER_BYTE); } } if (useEmbedTables() && tmp > embedShreshold) { /* set embed bit */ BITMAP_SET(embedBits[i/BITS_PER_BYTE], i % BITS_PER_BYTE); assert(embedId < numEmbedTables); embedTableSizes[embedId] = tmp; embedTableBucketIds[embedId] = i; embedId++; } /* set the values for empty buckets */ if (!BITMAP_IS(emptyBits[i / BITS_PER_BYTE], i % BITS_PER_BYTE)) buckets[i] = HASH_EMPTY; else { if (compressedTable) { /* apply the collision bit to the most significant bit of the bucket */ collision = BITMAP_IS(collisionBits[i / BITS_PER_BYTE], i % BITS_PER_BYTE); buckets[i] |= HASH_COLLISION_MASK(collision); } } } /* free the bitmap arrays */ free(emptyBits); emptyBits = NULL; if (compressedTable) { free(collisionBits); collisionBits = NULL; } elog( DEBUG1, " numBucket| numEmpty| Collision| numEntry| Col Rat| Emp Rat|Avg List|Avg Miss\n"); elog( DEBUG1, "%11u %11u %11u %11u %8.2f %8.2f %8.2f %8.2f\n", numBucket, numEmpty, numCollision, numEntry, (double) (numCollision) / (numBucket - numEmpty), (double) (numEmpty) / (numBucket), (double) (numOverflowEntry) / numCollision, (double) (numOverflowEntry + numBucket - numEmpty - numCollision) / (numBucket - numEmpty)); return SUCCESS; } void HashTable::setScanThreshold(double r) { int64 sum = 0; int64 total = numBucket; int64 top = total - total * r; // uint32 top = total - total / 1000; elog(DEBUG1, "Scan threshold: %f\n", r); elog(DEBUG1, "Hash List Length Histogram:\n"); elog(DEBUG1, "Empty: %d\n", numEmpty); for (int i = 0; i < nHistogram - 1; i++) { sum += histogram[i]; if (sum <= top) maxScan = 0x1 << (i + 1); elog(DEBUG1, "List length <= %d: %d (%.2f%%)\n", (0x1 << i), histogram[i], 
histogram[i] * 100.0 / total); } sum += histogram[nHistogram - 1]; if (sum <= top) maxScan = 0x1 << nHistogram; elog(DEBUG1, "List length for the rest: %d (%.2f%%)\n", histogram[nHistogram - 1], histogram[nHistogram - 1] * 100.0 / total); elog(DEBUG1, "Choose maximum of embed threshold: %d\n", maxScan); } /* * HashTable::preProcessInsert() * update the statistics infos for the hash tables. In particular, * we update the empty bitmap array and collision bitmap array, * and update the bucket value to be the number of collision entries * hashed into the bucket. */ void HashTable::preProcessInsert(int64 * key) { uint32 bucketID; numEntry++; /* compute the hash value */ HASH_FUNCTION(key, numBucket, words, bucketID); if (!BITMAP_IS(emptyBits[bucketID/BITS_PER_BYTE], bucketID % BITS_PER_BYTE)) { /* set the empty bit*/ BITMAP_SET(emptyBits[bucketID/BITS_PER_BYTE], bucketID % BITS_PER_BYTE); numEmpty--; } else { if (!BITMAP_IS(collisionBits[bucketID/BITS_PER_BYTE], bucketID % BITS_PER_BYTE)) { /* The two collsision entries will be added into the overflow array */ BITMAP_SET(collisionBits[bucketID/BITS_PER_BYTE], bucketID % BITS_PER_BYTE); numCollision++; numOverflowEntry += 2; buckets[bucketID] += 2; } else { /* The collsision entry will be added into the overflow array */ numOverflowEntry++; buckets[bucketID]++; } } } /* * HashTable::buildInit() * allocate and initialize the overflow pool. */ int HashTable::buildInit() { return SUCCESS; } /* * HashTable::insert() * insert an segment(entry) into the hash table. If the collision * bit is 0, the position of the segment is directly stored in the * hash bucket. Otherwise, the position of the segment is stored in * the end of the overflow list of the bucket. 
*/ void HashTable::insert(int64 * key, unsigned int offset) { uint32 curOverflowEntry; uint32 bucketId; uint32 seqOffset; bool collision; uint32 counter = 0; int64 * seqVector; int64 tspace[16]; int64 * target = &tspace[8]; bool isBloomFilter; uint32 * bloomFilter; uint32 bloomFilterNum; seqVector = sequence->getSequence(); /* compute the hash value */ HASH_FUNCTION(key, numBucket, words, bucketId); if (compressedTable) collision = HASH_IS_COLLISION(buckets[bucketId]); else collision = BITMAP_IS(collisionBits[bucketId / BITS_PER_BYTE], bucketId % BITS_PER_BYTE); if (!collision) { /* store the position into the bucket */ if (compressedTable) buckets[bucketId] = HASH_GET_OFFSET(offset); else buckets[bucketId] = offset; } else { buckets[bucketId]--; /* get the overflow list position */ if (compressedTable) curOverflowEntry = HASH_GET_OFFSET(buckets[bucketId]); else curOverflowEntry = buckets[bucketId]; /* isBloomFilter = BLOOM_FILTER_EMPLOY(overflowPool[curOverflowEntry]); if (isBloomFilter) { bloomFilterNum = BLOOM_FILTER_GET_NUM(overflowPool[curOverflowEntry]); bloomFilter = &overflowPool[curOverflowEntry + 1]; curOverflowEntry += 1 + BLOOM_FILTER_BYTES(bloomFilterNum); } */ /* * append the position of the new segment to the end of * overflow list. 
*/ if (compressedTable) overflowPool[curOverflowEntry] = HASH_SET_OFFSET(overflowPool[curOverflowEntry], offset); else overflowPool[curOverflowEntry] = offset; } } int HashTable::buildEmbedTable() { int64 ttspace[80]; int64 * query = &ttspace[8], *rest = &ttspace[24], *rest1 = &ttspace[40], *rest2 = &ttspace[56], *noheadkey = &ttspace[72]; int64 * seqVector = sequence->getSequence(); uint32 seqOffset, entryOffset, startOffset; memset(ttspace, 0, sizeof(int64) * 80); /* create space for embed tables */ numEmbedTablesPerList = (nPartition - widthKeySpan + 1) * (nMismatch + 1); numLongLists = numEmbedTables; numEmbedTables *= numEmbedTablesPerList; elog(DEBUG1, "Number of long lists: %d\n", numLongLists); elog(DEBUG1, "Number of embed tables per list: %d\n", numEmbedTablesPerList); elog(DEBUG1, "Size: %lu bytes\n", sizeof(EmbedHashTable)); elog(DEBUG1, "Size: %llu\n", (int64) numEmbedTables * sizeof(EmbedHashTable)); embedTables = new EmbedHashTable[numEmbedTables]; if (embedTables == NULL) return ERR_MEM; for (int i = 0; i < numEmbedTables; i++) { int size = embedTableSizes[i / numEmbedTablesPerList]; embedTables[i].init(sequence, length, size, nMismatch, nPartition); int ret = embedTables[i].preProcessInit(); if (ret != SUCCESS) return ret; } for (int l = 0; l < numLongLists; l++) { unsigned int bucketId = embedTableBucketIds[l]; if (compressedTable) entryOffset = HASH_GET_OFFSET(buckets[bucketId]); else entryOffset = buckets[bucketId]; startOffset = entryOffset; /* scan the overflow list */ while (entryOffset < numOverflowEntry) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]); else seqOffset = overflowPool[entryOffset]; if (seqOffset != HASH_EMPTY) { for (int i = 0; i < nPartition - widthKeySpan + 1; i++) { BitRead::extract(seqVector, query, seqOffset * BITS_PER_BASE_LL - i * lenPartition, lenSeq); preinsertEmbedTableEntry(query, l, i); } } if (compressedTable) { if 
(HASH_IS_END(overflowPool[entryOffset])) break; } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE)) break; } entryOffset++; } } for (int i = 0; i < numEmbedTables; i++) { int ret = embedTables[i].preProcessEnd(); if (ret != SUCCESS) return ret; } for (int l = 0; l < numLongLists; l++) { unsigned int bucketId = embedTableBucketIds[l]; if (compressedTable) entryOffset = HASH_GET_OFFSET(buckets[bucketId]); else entryOffset = buckets[bucketId]; startOffset = entryOffset; /* scan the overflow list */ while (entryOffset < numOverflowEntry) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]); else seqOffset = overflowPool[entryOffset]; if (seqOffset != HASH_EMPTY) { for (int i = 0; i < nPartition - widthKeySpan + 1; i++) { BitRead::extract(seqVector, query, seqOffset * BITS_PER_BASE_LL - i * lenPartition, lenSeq); insertEmbedTableEntry(query, seqOffset, l, i); } } if (compressedTable) { if (HASH_IS_END(overflowPool[entryOffset])) break; } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE)) break; } entryOffset++; } //update the bucket value buckets[bucketId] = l; } return SUCCESS; } void HashTable::preinsertEmbedTableEntry(int64 * query, int embedId, int keyId) { int64 tspace[112]; int64 * noheadkey = &tspace[8], *rest = &tspace[24]; int64 * subkeys[6] = {query, &tspace[40], &tspace[56], &tspace[72], &tspace[88], &tspace[104]}; int64 * subkey1 = &tspace[8], *subkey2 = &tspace[56], *subkey3 = &tspace[72], *subkey4 = &tspace[88], *subkey5 = &tspace[104]; memset(tspace, 0, sizeof(int64) * (nMismatch + 2) * 16); // BitRead::removeHead(query, noheadkey, headMask); for (int i = 0; i < nPartition - nMismatch; i++) { assert(i < 6); assert(keyPartitions[i] - keyId * lenPartition >= 0); BitRead::removeInterval(subkeys[i], subkeys[i+1], keyPartitions[i] - keyId * lenPartition, lenPartition); } // 
BitRead::removeInterval(query, noheadkey, // lenPartition * (nPartition - 1 - keyId), lenPartition); BitRead::removeHead(subkeys[nPartition - nMismatch], rest, embedHeadMask); int newNumPartition = nMismatch + 1; int newLenPartition = (lenSeq - lenRest - lenKey) / BITS_PER_BASE / newNumPartition * BITS_PER_BASE; int pid = 0; switch(nMismatch) { case 0: /* insert the segment into specified hash table. */ embedTables[embedId * numEmbedTablesPerList + keyId * newNumPartition + pid].preProcessInsert( rest); break; case 1: for (int i1 = 0; i1 < newLenPartition * (newNumPartition); i1 += newLenPartition) { BitRead::removeInterval(rest, subkey1, i1, newLenPartition); /* insert the segment into specified hash table. */ embedTables[embedId * numEmbedTablesPerList + keyId * newNumPartition + pid].preProcessInsert( subkey1); pid++; } break; case 2: for (int i1 = 0; i1 < newLenPartition * (newNumPartition - 1); i1 += newLenPartition) { BitRead::removeInterval(rest, subkey1, i1, newLenPartition); for (int i2 = i1; i2 < newLenPartition * (newNumPartition - 1); i2 += newLenPartition) { BitRead::removeInterval(subkey1, subkey2, i2, newLenPartition); /* insert the segment into specified hash table. */ embedTables[embedId * numEmbedTablesPerList + keyId * newNumPartition + pid].preProcessInsert( subkey2); pid++; } } break; case 3: for (int i1 = 0; i1 < newLenPartition * (newNumPartition - 2); i1 += newLenPartition) { BitRead::removeInterval(rest, subkey1, i1, newLenPartition); for (int i2 = i1; i2 < newLenPartition * (newNumPartition - 2); i2 += newLenPartition) { BitRead::removeInterval(subkey1, subkey2, i2, newLenPartition); for (int i3 = i2; i3 < newLenPartition * (newNumPartition - 2); i3 += newLenPartition) { BitRead::removeInterval(subkey2, subkey3, i3, newLenPartition); /* insert the segment into specified hash table. 
*/ embedTables[embedId * numEmbedTablesPerList + keyId * newNumPartition + pid].preProcessInsert( subkey3); pid++; } } } break; default: assert(0); } } void HashTable::insertEmbedTableEntry(int64 * query, uint32 seqOffset, int embedId, int keyId) { int64 tspace[112]; int64 * noheadkey = &tspace[8], *rest = &tspace[24]; int64 * subkeys[6] = {query, &tspace[40], &tspace[56], &tspace[72], &tspace[88], &tspace[104]}; int64 * subkey1 = &tspace[8], *subkey2 = &tspace[56], *subkey3 = &tspace[72], *subkey4 = &tspace[88], *subkey5 = &tspace[104]; memset(tspace, 0, sizeof(int64) * (nMismatch + 2) * 16); // BitRead::removeHead(query, noheadkey, headMask); for (int i = 0; i < nPartition - nMismatch; i++) { assert(i < 6); assert(keyPartitions[i] - keyId * lenPartition >= 0); BitRead::removeInterval(subkeys[i], subkeys[i+1], keyPartitions[i] - keyId * lenPartition, lenPartition); } // BitRead::removeInterval(query, noheadkey, // lenPartition * (nPartition - 1 - keyId), lenPartition); BitRead::removeHead(subkeys[nPartition - nMismatch], rest, embedHeadMask); memset(tspace + 40, 0, sizeof(int64) * (nPartition - nMismatch) * 16); int newNumPartition = nMismatch + 1; int newLenPartition = (lenSeq - lenRest - lenKey) / BITS_PER_BASE / newNumPartition * BITS_PER_BASE; int pid = 0; switch(nMismatch) { case 0: /* insert the segment into specified hash table. */ embedTables[embedId * numEmbedTablesPerList + keyId * newNumPartition + pid].insert( rest, seqOffset); break; case 1: for (int i1 = 0; i1 < newLenPartition * (newNumPartition); i1 += newLenPartition) { BitRead::removeInterval(rest, subkey1, i1, newLenPartition); /* insert the segment into specified hash table. 
*/ embedTables[embedId * numEmbedTablesPerList + keyId * newNumPartition + pid].insert( subkey1, seqOffset); pid++; } break; case 2: for (int i1 = 0; i1 < newLenPartition * (newNumPartition - 1); i1 += newLenPartition) { BitRead::removeInterval(rest, subkey1, i1, newLenPartition); for (int i2 = i1; i2 < newLenPartition * (newNumPartition - 1); i2 += newLenPartition) { BitRead::removeInterval(subkey1, subkey2, i2, newLenPartition); /* insert the segment into specified hash table. */ embedTables[embedId * numEmbedTablesPerList + keyId * newNumPartition + pid].insert( subkey2, seqOffset); pid++; } } break; case 3: for (int i1 = 0; i1 < newLenPartition * (newNumPartition - 2); i1 += newLenPartition) { BitRead::removeInterval(rest, subkey1, i1, newLenPartition); for (int i2 = i1; i2 < newLenPartition * (newNumPartition - 2); i2 += newLenPartition) { BitRead::removeInterval(subkey1, subkey2, i2, newLenPartition); for (int i3 = i2; i3 < newLenPartition * (newNumPartition - 2); i3 += newLenPartition) { BitRead::removeInterval(subkey2, subkey3, i3, newLenPartition); /* insert the segment into specified hash table. */ embedTables[embedId * numEmbedTablesPerList + keyId * newNumPartition + pid].insert( subkey3, seqOffset); pid++; } } } break; default: assert(0); } } /* * HashTable::lookup() * search the segment(key) on the hash table, find the potential * matched portions in the genome sequence. Call function * pairAligner::pairAlign to perform pairwise alignment between * query sequence and the potential matched portions. 
*/ unsigned int HashTable::lookup(int64 * orgkey, int64 * key, int keyOffset, char * quals, strand s, int rid, HitSet * hits, bool noGap) { int num; uint32 bucketId; uint32 seqOffset, entryOffset, startOffset; int64 tspace1[16], tspace2[16]; int64 * diff = &tspace1[8], *target = &tspace2[8]; int64 * seqVector; bool collision; uint32 sid, soffset; int ret; ErrorVector error; int nScanEntry = 0; int rett = SUCCESS; int maxGap = 0; if (!noGap) maxGap = nMaxGap; #ifdef DEBUG_STAT statProbe++; #endif seqVector = sequence->getSequence(); /* compute the hash value */ HASH_FUNCTION(key, numBucket, words, bucketId); if (compressedTable) collision = HASH_IS_COLLISION(buckets[bucketId]); else collision = BITMAP_IS(collisionBits[bucketId / BITS_PER_BYTE], bucketId % BITS_PER_BYTE); //if we don't use embed hash tables if (useEmbedTables()) { bool embed = BITMAP_IS(embedBits[bucketId / BITS_PER_BYTE], bucketId % BITS_PER_BYTE); if (embed == true) { lookupEmbedTable(orgkey, bucketId, keyOffset, quals, s, rid, hits, noGap); // printf("embed list\n"); return rett; } } statHashLookup++; if (!collision) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(buckets[bucketId]); else seqOffset = buckets[bucketId]; if (seqOffset != HASH_EMPTY) { statHashLookupEntry++; HASH_DEBUG(printf(" %u", seqOffset - keyOffset / BITS_PER_BASE)); #ifdef DEBUG_HASH_PRINT BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL + lenRest, lenKey); if (BitRead::compare(target, key)) printf("*"); #endif /* get the potential matched portion */ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL - maxGap * BITS_PER_BASE - keyOffset, lenSeq + maxGap * 2 * BITS_PER_BASE); nScanEntry++; /** * perform the pairwise alignment under the constraint * on the number of errors. 
*/ error = PairAligner::pairAlign(orgkey, target, length, nMaxError, maxGap); if (error.num <= nMaxError) { seqOffset = seqOffset - maxGap + error.offset - keyOffset / BITS_PER_BASE; if (maxGap != 0) { /** * if supports indel, re-extract the matched portion * with proper offset and length **/ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL, error.len * BITS_PER_BASE); } ret = hits->add(orgkey, target, seqOffset, s, &error, error.qual, rid); if (ret == MSG_HITSETFULL) { HASH_DEBUG(printf(" HIT")); rett = ret; } } } // else // stat_empty++; } else { // stat_collision++; /* get the position of the overflow list */ if (compressedTable) entryOffset = HASH_GET_OFFSET(buckets[bucketId]); else entryOffset = buckets[bucketId]; startOffset = entryOffset; /* scan the overflow list */ while (entryOffset < numOverflowEntry) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]); else seqOffset = overflowPool[entryOffset]; if (seqOffset != HASH_EMPTY) { statHashLookupEntry++; HASH_DEBUG(printf(" %u", seqOffset - keyOffset / BITS_PER_BASE)); #ifdef DEBUG_HASH_PRINT BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL + lenRest, lenKey); if (BitRead::compare(target, key)) printf("*"); #endif /* get the potential matched portion */ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL - maxGap * BITS_PER_BASE - keyOffset, lenSeq + maxGap * 2 * BITS_PER_BASE); nScanEntry++; /** * perform the pairwise alignment under the constraint * on the number of errors. 
*/ error = PairAligner::pairAlign(orgkey, target, length, nMaxError, maxGap); if (error.num <= nMaxError) { seqOffset = seqOffset - maxGap + error.offset - keyOffset / BITS_PER_BASE; if (maxGap != 0) { /** * if supports indel, re-extract the matched portion * with proper offset and length **/ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL, error.len * BITS_PER_BASE); } ret = hits->add(orgkey, target, seqOffset, s, &error, error.qual, rid); if (ret == MSG_HITSETFULL) { HASH_DEBUG(printf(" HIT")); rett = ret; break; } } } // if (entryOffset > startOffset + HASH_OVERFLOW_LIST_SCAN_BOUND) if (entryOffset > startOffset + maxScan * 2) break; if (compressedTable) { if (HASH_IS_END(overflowPool[entryOffset])) break; } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE)) break; } entryOffset++; } } HASH_DEBUG(printf("\n")); /* if (nScanEntry >= 100) { if (hits->getNumHits() > 0) printf("Pos: %u, Error: %d\n", seqOffset, error.num); statSeqProbe += nScanEntry > 10000? 
10000: nScanEntry; statEmpty += printOverflowList(bucketId, keyOffset, orgkey); getchar(); } */ #ifdef DEBUG_STAT // statSeqProbe += nScanEntry; int i; for (i = 0; i <= STAT_DISTRIBUTION_NUM - 1; i++) { if (nScanEntry <= statDistributionRange[i]) { statDistribution[i]++; break; } } if (i >= STAT_DISTRIBUTION_NUM - 1) statDistribution[STAT_DISTRIBUTION_NUM - 1]++; #endif return rett; } unsigned int HashTable::lookupEmbedTable(int64 * orgkey, uint32 bucketId, int keyOffset, char * quals, strand s, int rid, HitSet * hits, bool noGap) { int64 tspace[112]; int64 * noheadkey = &tspace[8], *rest = &tspace[24]; int64 * subkeys[6] = {orgkey, &tspace[40], &tspace[56], &tspace[72], &tspace[88], &tspace[104]}; int64 * subkey1 = &tspace[8], *subkey2 = &tspace[56], *subkey3 = &tspace[72], *subkey4 = &tspace[88], *subkey5 = &tspace[104]; assert(useEmbedTables() == true); memset(tspace, 0, sizeof(int64) * (nMismatch + 2) * 16); int embedId = buckets[bucketId]; int offset = keyOffset / lenPartition; int64 * seqVector = sequence->getSequence(); // BitRead::removeHead(query, noheadkey, headMask); for (int i = 0; i < nPartition - nMismatch; i++) { assert(i < 6); assert(keyPartitions[i] - keyOffset >= 0); BitRead::removeInterval(subkeys[i], subkeys[i+1], keyPartitions[i] - keyOffset, lenPartition); } // BitRead::removeInterval(query, noheadkey, // lenPartition * (nPartition - 1 - keyId), lenPartition); BitRead::removeHead(subkeys[nPartition - nMismatch], rest, embedHeadMask); int newNumPartition = nMismatch + 1; int newLenPartition = (lenSeq - lenRest - lenKey) / BITS_PER_BASE / newNumPartition * BITS_PER_BASE; int pid = 0, ret; switch (nMismatch) { case 0: ret = embedTables[embedId * numEmbedTablesPerList + offset * newNumPartition + pid].lookup(orgkey, rest, keyOffset, quals, s, rid, hits, noGap); if (ret == MSG_HITSETFULL) return ret; break; case 1: for (int i1 = 0; i1 < newLenPartition * (newNumPartition); i1 += newLenPartition) { BitRead::removeInterval(rest, subkey1, i1, 
newLenPartition); /* insert the segment into specified hash table. */ ret = embedTables[embedId * numEmbedTablesPerList + offset * newNumPartition + pid].lookup(orgkey, subkey1, keyOffset, quals, s, rid, hits, noGap); if (ret == MSG_HITSETFULL) return ret; pid++; } break; case 2: for (int i1 = 0; i1 < newLenPartition * (newNumPartition - 1); i1 += newLenPartition) { BitRead::removeInterval(rest, subkey1, i1, newLenPartition); for (int i2 = i1; i2 < newLenPartition * (newNumPartition - 1); i2 += newLenPartition) { BitRead::removeInterval(subkey1, subkey2, i2, newLenPartition); /* insert the segment into specified hash table. */ ret = embedTables[embedId * numEmbedTablesPerList + offset * newNumPartition + pid].lookup(orgkey, subkey2, keyOffset, quals, s, rid, hits, noGap); if (ret == MSG_HITSETFULL) return ret; pid++; } } break; case 3: for (int i1 = 0; i1 < newLenPartition * (newNumPartition - 2); i1 += newLenPartition) { BitRead::removeInterval(rest, subkey1, i1, newLenPartition); for (int i2 = i1; i2 < newLenPartition * (newNumPartition - 2); i2 += newLenPartition) { BitRead::removeInterval(subkey1, subkey2, i2, newLenPartition); for (int i3 = i2; i3 < newLenPartition * (newNumPartition - 2); i3 += newLenPartition) { BitRead::removeInterval(subkey2, subkey3, i3, newLenPartition); /* insert the segment into specified hash table. 
*/ ret = embedTables[embedId * numEmbedTablesPerList + offset * newNumPartition + pid].lookup(orgkey, subkey3, keyOffset, quals, s, rid, hits, noGap); if (ret == MSG_HITSETFULL) return ret; pid++; } } } break; default: assert(0); } } bool HashTable::lookup(int64 * key, uint32 offset) { int num; uint32 bucketId; uint32 seqOffset, entryOffset; int64 tspace1[16], tspace2[16]; int64 * diff = &tspace1[8], *target = &tspace2[8]; int64 * seqVector; bool collision; uint32 sid, soffset; seqVector = sequence->getSequence(); /* compute the hash value */ HASH_FUNCTION(key, numBucket, words, bucketId); if (compressedTable) collision = HASH_IS_COLLISION(buckets[bucketId]); else collision = BITMAP_IS(collisionBits[bucketId / BITS_PER_BYTE], bucketId % BITS_PER_BYTE); if (!collision) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(buckets[bucketId]); else seqOffset = buckets[bucketId]; if (seqOffset != HASH_EMPTY && seqOffset == offset) return true; } else { /* get the position of the overflow list */ if (compressedTable) entryOffset = HASH_GET_OFFSET(buckets[bucketId]); else entryOffset = buckets[bucketId]; /* scan the overflow list */ while (entryOffset < numOverflowEntry) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]); else seqOffset = overflowPool[entryOffset]; if (seqOffset != HASH_EMPTY && seqOffset == offset) return true; #ifdef DEBUG_PRINT_LIST printf("%u ", HASH_GET_OFFSET(overflowPool[entryOffset])); #endif if (compressedTable) { if (HASH_IS_END(overflowPool[entryOffset])) break; } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE)) break; } entryOffset++; } #ifdef DEBUG_PRINT_LIST printf("\n"); #endif } return false; } typedef struct SeqHit { char str[128]; uint32 pos; } SeqHit; int compareSeqHit(const void * a, const void * b) { SeqHit * p1 = (SeqHit *) a; SeqHit * p2 = (SeqHit *) b; return 
strcmp(p1->str, p2->str); } uint32 HashTable::printOverflowList(uint32 bucketId, uint32 keyOffset, int64 * key) { uint32 seqOffset, entryOffset, startOffset; int64 tspace1[16], tspace2[16]; int64 * diff = &tspace1[8], *target = &tspace2[8]; int64 * seqVector; bool collision; SeqHit set[10000]; int nhit = 0; int64 statEntry = 0; int64 statSameEntry = 0; char keystr[128]; CompactSequence::decompose(keystr, length, key); printf("\nkey %s\n", keystr); seqVector = sequence->getSequence(); if (compressedTable) collision = HASH_IS_COLLISION(buckets[bucketId]); else collision = BITMAP_IS(collisionBits[bucketId / BITS_PER_BYTE], bucketId % BITS_PER_BYTE); if (!collision) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(buckets[bucketId]); else seqOffset = buckets[bucketId]; printf("%10u ", seqOffset); if (seqOffset != HASH_EMPTY) { /* get the potential matched portion */ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL - keyOffset, lenSeq); CompactSequence::decompose(set[0].str, length, target); printf("%s", set[0].str); } printf("\n"); } else { /* get the position of the overflow list */ if (compressedTable) entryOffset = HASH_GET_OFFSET(buckets[bucketId]); else entryOffset = buckets[bucketId]; printf("\n "); for (int i = 0; i < keyOffset / BITS_PER_BASE; i++) printf("-"); for (int i = 0; i < lenKey / BITS_PER_BASE; i++) printf("+"); for (int i = 0; i < (lenSeq - keyOffset - lenKey) / BITS_PER_BASE; i++) printf("-"); printf("\n"); startOffset = entryOffset; /* scan the overflow list */ while (entryOffset < numOverflowEntry) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]); else seqOffset = overflowPool[entryOffset]; // printf("%10u ", seqOffset); if (seqOffset != HASH_EMPTY) { if (nhit < 10000) { /* get the potential matched portion */ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL - keyOffset, lenSeq); 
CompactSequence::decompose(set[nhit].str, length, target); set[nhit].pos = seqOffset; nhit++; // printf("%s", str); } } // printf("\n"); if (entryOffset > startOffset + HASH_OVERFLOW_LIST_SCAN_BOUND ) break; if (compressedTable) { if (HASH_IS_END(overflowPool[entryOffset])) break; } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE)) break; } entryOffset++; } qsort(set, nhit, sizeof(SeqHit), compareSeqHit); statEntry += nhit; for (int i = 0; i < nhit; i++) if (i > 0 && strcmp(set[i].str, set[i - 1].str) == 0) statSameEntry++; for (int i = 0; i < nhit; i++) { printf("%10u %s", set[i].pos, set[i].str); if (i > 0 && strcmp(set[i].str, set[i - 1].str) != 0) { int ndiff = 0; for (int j = 0; j < strlen(set[i].str); j++) if (set[i].str[j] != set[i - 1].str[j]) ndiff++; printf(" %d", ndiff); } printf("\n"); int diff = 0; for (int j = 0; j < strlen(set[i].str); j++) if (set[i].str[j] != keystr[j]) { printf("*"); diff++; } else printf(" "); printf(" %2d\n", diff); } } // printf("\n%llu %llu %.2f\n", statSameEntry, statEntry, (double)statSameEntry / statEntry); return statSameEntry; } /* * longOverflowList() * check whether the overflow list of the specified bucket is * longer than maxlen entries. 
*/ bool HashTable::longOverflowList(uint32 bucketId, uint32 maxlen) { bool collision; uint32 entryOffset; uint32 lenlist; if (buckets[bucketId] == HASH_EMPTY ) return false; if (compressedTable) collision = HASH_IS_COLLISION(buckets[bucketId]); else collision = BITMAP_IS(collisionBits[bucketId / BITS_PER_BYTE], bucketId % BITS_PER_BYTE); if (!collision) { lenlist = 1; } else { if (compressedTable) entryOffset = HASH_GET_OFFSET(buckets[bucketId]); else entryOffset = buckets[bucketId]; /* scan the overflow list */ lenlist = 0; while (entryOffset < numOverflowEntry) { lenlist++; if (compressedTable) { if (HASH_IS_END(overflowPool[entryOffset]) || lenlist >= maxlen) break; } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE) || lenlist >= maxlen) break; } } } if (lenlist >= maxlen) return true; else return false; } int HashTable::sortList() { uint32 bucketId; uint32 entryOffset, seqOffset, offset; bool collision; uint32 i, numKey = 0; int64 tspace1[16]; int64 * target = &tspace1[8]; int64 * seqVector; EntryCounter * counters; uint32 * space; uint32 numEntryCounter = 1000000; counters = new EntryCounter[numEntryCounter]; space = new uint32[1000000]; int64 statLongList = 0; int64 statLongListEntry = 0; seqVector = sequence->getSequence(); ProgressBar bar(numBucket - 1, PROGRESS_BAR_WIDTH); for (bucketId = 0; bucketId < numBucket; bucketId++) { /* update the progress bar */ bar.update(bucketId); /* quickpath -- empty bucket */ if (buckets[bucketId] == HASH_EMPTY ) continue; if (compressedTable) collision = HASH_IS_COLLISION(buckets[bucketId]); else collision = BITMAP_IS(collisionBits[bucketId / BITS_PER_BYTE], bucketId % BITS_PER_BYTE); if (!collision) continue; numKey = 0; /* get the position of the overflow list */ if (compressedTable) entryOffset = HASH_GET_OFFSET(buckets[bucketId]); else entryOffset = buckets[bucketId]; offset = entryOffset; /* scan the overflow list, get the number of each key */ while (entryOffset < 
numOverflowEntry) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]); else seqOffset = overflowPool[entryOffset]; if (seqOffset != HASH_EMPTY) { BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL + lenRest, lenKey); for (i = 0; i < numKey; i++) { if (BitRead::compare(counters[i].key, target)) { counters[i].num++; break; } } if (i >= numKey) { if (numKey >= numEntryCounter) { elog(WARNING, "WARNING: increase number of entry counters [%u].\n", numEntryCounter * 10); EntryCounter * tmp = counters; counters = new EntryCounter[numEntryCounter * 10];memcpy (counters, tmp, sizeof(EntryCounter) * numEntryCounter); numEntryCounter *= 10; delete[] tmp; } BitRead::copy(target, counters[numKey].key); counters[numKey].num = 1; numKey++; } } if (compressedTable) { if (HASH_IS_END(overflowPool[entryOffset])) break; } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE)) break; } entryOffset++; } if (entryOffset == numOverflowEntry) entryOffset--; qsort(counters, numKey, sizeof(EntryCounter), compareEntryCounter); int nhit = 0; for (i = 0; i < numKey; i++) { counters[i].offset = nhit; nhit += counters[i].num; } uint32 * pool; if (nhit < 1000000) pool = space; else pool = new uint32[nhit]; if (nhit > 500) { statLongList++; statLongListEntry += nhit; } #ifdef ABCD entryOffset = offset; /* scan the overflow list again, remove all repeat keys */ while (entryOffset < numOverflowEntry) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]); else seqOffset = overflowPool[entryOffset]; if (seqOffset != HASH_EMPTY) { /* get the potential matched portion */ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL + lenRest, lenKey); for (i = 0; i < numKey; i++) { if (BitRead::compare(counters[i].key, target)) break; } #ifdef DEBUG assert(i < numKey); #endif 
pool[counters[i].offset] = seqOffset; counters[i].offset++; } if (compressedTable) { if (HASH_IS_END(overflowPool[entryOffset])) break; } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE)) break; } entryOffset++; } memcpy(&overflowPool[offset], pool, sizeof(uint32) * nhit); if (compressedTable) HASH_SET_END(overflowPool[offset + nhit - 1]); else BITMAP_SET(overflowBits[(offset + nhit - 1)/BITS_PER_BYTE], (offset + nhit - 1) % BITS_PER_BYTE); if (pool != space) delete [] pool; #endif } elog(INFO, "\nNumber of long list: %llu\n", statLongList); elog(INFO, "Number of long list entry: %llu\n", statLongListEntry); delete[] counters; delete[] space; return SUCCESS; } /* * removeRepeat() * This function is used to remove all entries that appear * more than num times. */ int HashTable::removeRepeat(uint32 num) { uint32 bucketId; uint32 entryOffset, seqOffset, offset; uint32 i, numKey = 0; int64 tspace1[16]; int64 * target = &tspace1[8]; int64 * seqVector; EntryCounter * counters; uint32 numCounter; seqVector = sequence->getSequence(); if (num < 2) return SUCCESS; numCounter = 128; counters = new EntryCounter[numCounter]; ProgressBar bar(numBucket - 1, PROGRESS_BAR_WIDTH); for (bucketId = 0; bucketId < numBucket; bucketId++) { /* update the progress bar */ bar.update(bucketId); /* quickpath -- empty bucket */ if (buckets[bucketId] == HASH_EMPTY ) continue; /* if the list contain less than num entries, continue */ if (!longOverflowList(bucketId, num)) continue; numKey = 0; /* get the position of the overflow list */ if (compressedTable) entryOffset = HASH_GET_OFFSET(buckets[bucketId]); else entryOffset = buckets[bucketId]; offset = entryOffset; /* scan the overflow list, get the number of each key */ while (entryOffset < numOverflowEntry) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]); else seqOffset = overflowPool[entryOffset]; if (seqOffset != 
HASH_EMPTY) { BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL , lenKey); for (i = 0; i < numKey; i++) { if (BitRead::compare(counters[i].key, target)) { counters[i].num++; break; } } if (i >= numKey) { if (numKey >= numCounter) { EntryCounter * tmp; tmp = new EntryCounter[numCounter * 2];memcpy (tmp, counters, numCounter * sizeof(EntryCounter)); delete[] counters; counters = tmp; numCounter = numCounter * 2; } BitRead::copy(target, counters[numKey].key); counters[numKey].num = 1; numKey++; } } if (compressedTable) { if (HASH_IS_END(overflowPool[entryOffset])) break; } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE)) break; } entryOffset++; } /* quickpath -- all entries are removed */ for (i = 0; i < numKey; i++) if (counters[i].num < num) break; if (i >= numKey) { buckets[bucketId] = HASH_EMPTY; if (!compressedTable) BITMAP_CLEAR(collisionBits[bucketId/BITS_PER_BYTE], bucketId % BITS_PER_BYTE); continue; } entryOffset = offset; /* scan the overflow list again, remove all repeat keys */ while (entryOffset < numOverflowEntry) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]); else seqOffset = overflowPool[entryOffset]; if (seqOffset != HASH_EMPTY) { /* get the potential matched portion */ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL , lenKey); for (i = 0; i < numKey; i++) { if (BitRead::compare(counters[i].key, target)) break; } #ifdef DEBUG assert(i < numKey); #endif /* move the entry forward */ if (counters[i].num < num) { overflowPool[offset] = overflowPool[entryOffset]; offset++; } } if (compressedTable) { if (HASH_IS_END(overflowPool[entryOffset])) { HASH_SET_END(overflowPool[offset - 1]); break; } } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE)) { BITMAP_SET(overflowBits[(offset - 1)/BITS_PER_BYTE], (offset - 1) % BITS_PER_BYTE); break; } } entryOffset++; 
} } delete[] counters; return SUCCESS; } void HashTable::checkRepeat(uint32 num) { uint32 bucketId; uint32 entryOffset, seqOffset, offset; uint32 i, numKey = 0; int64 tspace1[16]; int64 * target = &tspace1[8]; int64 * seqVector; EntryCounter * counters; uint32 numCounter; seqVector = sequence->getSequence(); if (num < 2) return; numCounter = 128; counters = new EntryCounter[numCounter]; for (bucketId = 0; bucketId < numBucket; bucketId++) { restart: if (buckets[bucketId] == HASH_EMPTY ) continue; if (!longOverflowList(bucketId, num)) continue; numKey = 0; /* get the position of the overflow list */ if (compressedTable) entryOffset = HASH_GET_OFFSET(buckets[bucketId]); else entryOffset = buckets[bucketId]; offset = entryOffset; /* scan the overflow list, get the number of each key */ while (entryOffset < numOverflowEntry) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]); else seqOffset = overflowPool[entryOffset]; if (seqOffset != HASH_EMPTY) { BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL , lenKey); for (i = 0; i < numKey; i++) { if (BitRead::compare(counters[i].key, target)) { counters[i].num++; break; } } if (i >= numKey) { if (numKey >= numCounter) { EntryCounter * tmp; tmp = new EntryCounter[numCounter * 2];memcpy (tmp, counters, numCounter * sizeof(EntryCounter)); delete[] counters; counters = tmp; numCounter = numCounter * 2; } BitRead::copy(target, counters[numKey].key); counters[numKey].num = 1; numKey++; } } if (compressedTable) { if (HASH_IS_END(overflowPool[entryOffset])) break; } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE)) break; } entryOffset++; } for (i = 0; i < numKey; i++) { if (counters[i].num >= num) { printf("%u: (%llu, %llu, %llu, %llu), %u", bucketId, counters[i].key[0], counters[i].key[1], counters[i].key[2], counters[i].key[3], counters[i].num); // goto restart; } } } delete[] counters; } /* * 
HashTable::save()
 * This function is used to save the in-memory hash table on
 * disk, in the file "<path>.i<indexID>.whm".
 *
 * Layout written: the raw HashTable object, the bucket array, the
 * overflow array, the collision/overflow bitmaps (uncompressed tables
 * only), the embed bitmap, then every embedded hash table.
 *
 * Returns SUCCESS, ERR_PARA (bad path / file cannot be created) or
 * ERR_FILE (short write, flush or close failure). The file is closed on
 * every path.
 */
int HashTable::save(char * path) {
    char fname[256];
    FILE * file;
    bool ok;
    int len;

    if (path == NULL || strlen(path) > 240)
        return ERR_PARA;

    /*
     * A 240-character path plus ".i", up to 11 digits, ".whm" and the
     * terminator does not fit into fname, so format with a bound and
     * reject truncated names.
     */
    len = snprintf(fname, sizeof(fname), "%s.i%d.whm", path, indexID);
    if (len < 0 || (size_t) len >= sizeof(fname))
        return ERR_PARA;

    file = fopen(fname, "wb");
    if (file == NULL)
        return ERR_PARA;

    const size_t nBucket = numBucket;
    const size_t nOverflow = numOverflowEntry;
    const size_t nBucketBytes = numBucket / BITS_PER_BYTE + 1;
    const size_t nOverflowBytes = numOverflowEntry / BITS_PER_BYTE + 1;

    ok = fwrite(this, sizeof(HashTable), 1, file) == 1
            && fwrite(buckets, sizeof(uint32), nBucket, file) == nBucket
            && fwrite(overflowPool, sizeof(uint32), nOverflow, file)
                    == nOverflow;

    if (ok && !compressedTable) {
        ok = fwrite(collisionBits, sizeof(unsigned char), nBucketBytes, file)
                        == nBucketBytes
                && fwrite(overflowBits, sizeof(unsigned char),
                        nOverflowBytes, file) == nOverflowBytes;
    }

    if (ok) {
        if (embedBits != NULL) {
            ok = fwrite(embedBits, sizeof(unsigned char), nBucketBytes, file)
                    == nBucketBytes;
        } else if (useEmbedTables()) {
            /* load() expects the embed bitmap whenever embed tables exist */
            ok = false;
        }
        /*
         * Otherwise there is no embed bitmap to write; load() only reads
         * it when useEmbedTables() is true.
         */
    }

    if (ok && useEmbedTables()) {
        /* NOTE(review): the result of EmbedHashTable::save() is not checked,
         * as before -- confirm whether it reports errors. */
        for (int i = 0; i < numEmbedTables; i++)
            embedTables[i].save(file);
    }

    if (ok && fflush(file) != 0)
        ok = false;

    /* always close the stream, even after a failed write */
    if (fclose(file) != 0)
        ok = false;

    return ok ? SUCCESS : ERR_FILE;
}

/*
 * HashTable::load()
 * This function is used to load the on-disk copy of hash table
 * into memory.
*/ int HashTable::load(char * path, int index, CompactSequence * seq) { size_t ret; char fname[256]; FILE * file; if (strlen(path) > 240) return ERR_PARA; sprintf(fname, "%s.i%d.whm", path, index); file = fopen(fname, "rb"); if (file == NULL ) return ERR_PARA; ret = fread(this, sizeof(HashTable), 1, file); if (ret != 1) { elog(ERROR, "failed to load hash table head.\n"); return ERR_FILE; } sequence = seq; /* we can use smaller block to reduct the memory consumption */ buckets = (uint32 *) malloc((int64) numBucket * sizeof(uint32)); ret = fread(buckets, sizeof(uint32), numBucket, file); if (ret != numBucket) { elog(ERROR, "failed to load buckets.\n"); return ERR_FILE; } overflowPool = (uint32 *) malloc((int64) numOverflowEntry * sizeof(uint32)); ret = fread(overflowPool, sizeof(uint32), numOverflowEntry, file); if (ret != numOverflowEntry) { elog(ERROR, "failed to load overflow array.\n"); return ERR_FILE; } if (!compressedTable) { collisionBits = (unsigned char *) malloc( (int64) (numBucket / BITS_PER_BYTE + 1) * sizeof(unsigned char)); ret = fread(collisionBits, sizeof(unsigned char), numBucket / BITS_PER_BYTE + 1, file); if (ret != numBucket / BITS_PER_BYTE + 1) { elog(ERROR, "failed to load collision bits.\n"); return ERR_FILE; } overflowBits = (unsigned char *) malloc( (int64) (numOverflowEntry / BITS_PER_BYTE + 1) * sizeof(unsigned char)); ret = fread(overflowBits, sizeof(unsigned char), numOverflowEntry / BITS_PER_BYTE + 1, file); if (ret != numOverflowEntry / BITS_PER_BYTE + 1) { elog(ERROR, "failed to load overflow bits. 
%d %d\n", ret, numOverflowEntry / BITS_PER_BYTE + 1); return ERR_FILE; } } if (useEmbedTables()) { embedBits = (unsigned char *) malloc( (int64) (numBucket / BITS_PER_BYTE + 1) * sizeof(unsigned char)); if (embedBits == NULL) return ERR_MEM; ret = fread(embedBits, sizeof(unsigned char), numBucket / BITS_PER_BYTE + 1, file); if (ret != numBucket / BITS_PER_BYTE + 1) { elog(ERROR, "failed to load collision bits.\n"); return ERR_FILE; } embedTables = (EmbedHashTable *) malloc( (int64) numEmbedTables * sizeof(EmbedHashTable)); if (embedTables == NULL) return ERR_MEM; for (int i = 0; i < numEmbedTables; i++) { int ret = embedTables[i].load(file, sequence); if ( ret != SUCCESS) return ret; } elog(DEBUG1, "load %d embed hash tables\n", numEmbedTables); } ret = fclose(file); if (ret != 0) { return ERR_FILE; } resetStat(); return SUCCESS; } /* * HashTable::remove * free the space occupied by the hash index. */ int HashTable::remove() { if (buckets) { free(buckets); buckets = NULL; } if (overflowPool) { free(overflowPool); overflowPool = NULL; } if (emptyBits) { free(emptyBits); emptyBits = NULL; } if (collisionBits) { free(collisionBits); collisionBits = NULL; } return SUCCESS; } /* * nextPrime() * return the least prime number that is greater * than the input number. 
*/ unsigned int HashTable::nextPrime(unsigned int num) { unsigned int i, j, x; num = num / 2 * 2 + 1; for (i = num; i < num + 1000; i += 2) { x = (unsigned int) sqrtl(i); for (j = 3; j < x; j += 2) { if (i % j == 0) break; } if (j >= x) return i; } return i; } wham/perfcounters.h0000644001532600153260000000200112003705361013716 0ustar yinanyinan/* * Copyright 2009, Spyros Blanas */ #ifdef __sparc__ #include #endif class PerfCounters { public: void init(); void threadinit(); void destroy(); inline void writeCounters(unsigned long long* counter1, unsigned long long* counter2) { #ifdef PERFCOUNT #if defined(__i386__) || defined(__x86_64__) *counter1 = readpmc(0); *counter2 = readpmc(1); #elif defined(__sparc__) unsigned long long val; __asm__ __volatile__ ( "rd %%pic, %0" : "=r" (val) /* output */ ); *counter1 = val >> 32; *counter2 = val & 0xFFFFFFFFull; #else #error Performance counters not known for this architecture. #endif #endif } private: #if defined(__i386__) || defined(__x86_64__) inline unsigned long long readpmc(unsigned int counterid) { unsigned long hi, lo; __asm__ __volatile__ ("rdpmc" : "=d" (hi), "=a" (lo) : "c" (counterid) ); return (((unsigned long long) hi) << 32) | lo; } #elif defined(__sparc__) cpc_t* cpc; #endif }; wham/util.h0000644001532600153260000000444012003705361012165 0ustar yinanyinan#ifndef _UTIL_H_ #define _UTIL_H_ /** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: util.h 157 2012-07-25 05:58:09Z yinan $ */ #include #include #include #include "error.h" using namespace std; extern int ELOG_LEVEL; #define PROGRESS_BAR_WIDTH 40 class Timer { public: void start() { gettimeofday(&s1, 0); } double stop() { double t; gettimeofday(&s2, 0); t = (s2.tv_sec - s1.tv_sec) + (s2.tv_usec - s1.tv_usec) * 0.000001; return t; } private: struct timeval s1, s2; }; class ProgressBar { public: ProgressBar(long long maxwork, char width) : maxwork(maxwork), width(width), firsttime(true), value(0) { } void update(long long work) { if (ELOG_LEVEL < INFO ) return; if (firsttime) { cout << '['; for (char i = 0; i < width; ++i) { cout << ' '; } cout << "] "; cout << " 0%" << flush; firsttime = false; } long long newvalue = (long long) (work * 100. / maxwork); if (newvalue == value) { if (work == maxwork) cout << endl; return; } value = newvalue; cout << "\b\b\b\b\b\b"; for (char i = 0; i < width; ++i) cout << '\b'; for (char i = 0; i < width * value / 100; ++i) cout << '#'; for (char i = width * value / 100; i < width; ++i) cout << ' '; cout << "] "; cout << setw(3) << value << '%' << flush; if (work == maxwork) cout << endl; } private: long long maxwork; char width; bool firsttime; long long value; }; #endif wham/embedhash.cpp0000644001532600153260000004562612054750341013502 0ustar yinanyinan/** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: hash.cpp 152 2012-07-22 10:52:53Z yinan $ */ #include #include #include #include #include #include #include "hash.h" #include "embedhash.h" #include "bitread.h" #include "error.h" #include "pair.h" #include "edit_distance.h" #include "util.h" #include "rdtsc.h" #define BITWISE_ALIGNMENT //#define DEBUG_PRINT_LIST unsigned long long statEmbedHashLookup = 0; unsigned long long statEmbedHashLookupEntry = 0; EmbedHashTable::EmbedHashTable() { memset(this, 0, sizeof(EmbedHashTable)); } EmbedHashTable::~EmbedHashTable() { delete[] buckets; delete[] overflowPool; } /* * HashTable::init() * initialize the private variables */ void EmbedHashTable::init(CompactSequence * seq, int len, unsigned int nBucket, int numError, int nPartition) { sequence = seq; length = len; lenSeq = length * BITS_PER_BASE; nMismatch = numError; nMaxError = nMismatch; nMaxGap = 0; maxQual = MAX_INT; if (nBucket == 0) { double nEntry, nSpace; nEntry = (double) seq->getNum(); nSpace = pow(8.0, length / nPartition * (nPartition - nMismatch)); numBucket = nEntry < nSpace ? (int) nEntry : (int) nSpace; } else numBucket = nBucket; numBucket = nextPrime(numBucket); numEmpty = numBucket; /* * if the sequence size is greater than 2^31, we have to * use a normal hash table, otherwise, we use a compressed * hash table to speedup the searches. */ if (seq->getNum() < COMPRESS_TABLE_SIZE ) compressedTable = true; else compressedTable = false; } /* * HashTable::preProcessInit() * Allocate and initialize the hash bucket array and bitmap arrays * for collision bits and empty bits. */ int EmbedHashTable::preProcessInit() { /* allocate hash buckets. */ buckets = (unsigned int *) malloc((int64) numBucket * sizeof(unsigned int)); if (buckets == NULL ) return ERR_MEM; /* * allocate bitmap arrays to identify empty buckets * and collision buckets. 
*/ emptyBits = (unsigned char *) malloc((int64) numBucket / BITS_PER_BYTE + 1); if (emptyBits == NULL ) return ERR_MEM; collisionBits = (unsigned char *) malloc( (int64) numBucket / BITS_PER_BYTE + 1); if (collisionBits == NULL ) return ERR_MEM; /* Initialization */ memset(buckets, 0, numBucket * sizeof(unsigned int)); memset(emptyBits, 0, numBucket / BITS_PER_BYTE + 1); memset(collisionBits, 0, numBucket / BITS_PER_BYTE + 1); return SUCCESS; } /* * HashTable::preProcessEnd() * apply the empty bits and collision bits to the hash buckets. * For empty buckets, the bucket values are set to be HASH_EMPTY. * For the buckets with collisions, the most significant bits in * the buckets are set to be 1. */ int EmbedHashTable::preProcessEnd() { uint32 i; uint32 tmp, sum = 0; uint32 collision; const double ln2 = log(2); if (buckets == NULL) { elog(ERROR, "ERROR: unallocated bucket array in hash table.\n"); return ERR_PARA; } if (emptyBits == NULL || collisionBits == NULL) { elog(ERROR, "ERROR: unallocated bitmap in hash table.\n"); return ERR_PARA; } if (numOverflowEntry < 0) return ERR_PARA; /* allicate overflow pool and bitmaps */ overflowPool = (unsigned int *) malloc( (int64) numOverflowEntry * sizeof(unsigned int)); if (overflowPool == NULL ) return ERR_MEM; memset(overflowPool, 0, numOverflowEntry * sizeof(unsigned int)); if (!compressedTable) { overflowBits = (unsigned char *) malloc( (int64) numOverflowEntry / BITS_PER_BYTE + 1); if (overflowBits == NULL ) return ERR_MEM; memset(overflowBits, 0, (numOverflowEntry / BITS_PER_BYTE + 1) * sizeof(char)); } /* * scan the hash buckets to appy the empty bits and * collision bits. */ for (i = 0; i < numBucket; i++) { /* * The current value of the bucket is the number of collision * entries in each bucket. We accumulate this value to compute * the position of the last entries in the overflow array for each bucket, * and store the position into the bucket. 
For the non-collision * buckets, the values will be updated to the position of segment * in the function insert. */ tmp = buckets[i]; sum += tmp; buckets[i] = sum; if (buckets[i] > 0) { if (compressedTable) { HASH_SET_END(overflowPool[buckets[i] - 1]); } else { BITMAP_SET(overflowBits[(buckets[i] - 1) / BITS_PER_BYTE], (buckets[i] - 1) % BITS_PER_BYTE); } } /* update histogram */ /* int h = 0; if (tmp > 0) h = (int)ceil(log(tmp)/ln2); if (h >= nHistogram) histogram[nHistogram - 1]++; else histogram[h]++; */ /* set the values for empty buckets */ if (!BITMAP_IS(emptyBits[i / BITS_PER_BYTE], i % BITS_PER_BYTE)) buckets[i] = HASH_EMPTY; else { if (compressedTable) { /* apply the collision bit to the most significant bit of the bucket */ collision = BITMAP_IS(collisionBits[i / BITS_PER_BYTE], i % BITS_PER_BYTE); buckets[i] |= HASH_COLLISION_MASK(collision); } } } /* free the bitmap arrays */ free(emptyBits); emptyBits = NULL; if (compressedTable) { free(collisionBits); collisionBits = NULL; } // setScanThreshold(0.001); elog( DEBUG2, " numBucket| numEmpty| Collision| numEntry| Col Rat| Emp Rat|Avg List|Avg Miss\n"); elog( DEBUG2, "%11u %11u %11u %11u %8.2f %8.2f %8.2f %8.2f\n", numBucket, numEmpty, numCollision, numEntry, (double) (numCollision) / (numBucket - numEmpty), (double) (numEmpty) / (numBucket), (double) (numOverflowEntry) / numCollision, (double) (numOverflowEntry + numBucket - numEmpty - numCollision) / (numBucket - numEmpty)); return SUCCESS; } void EmbedHashTable::setScanThreshold(double r) { int64 sum = 0; int64 total = numBucket; int64 top = total - total * r; // uint32 top = total - total / 1000; /* elog(INFO, "Scan threshold: %f\n", r); elog(INFO, "Hash List Length Histogram:\n"); elog(INFO, "Empty: %d\n", numEmpty); for (int i = 0; i < nHistogram - 1; i++) { sum += histogram[i]; if (sum <= top) maxScan = 0x1 << (i + 1); elog(INFO, "List length <= %d: %d (%.2f%%)\n", (0x1 << i), histogram[i], histogram[i] * 100.0 / total); } sum += 
histogram[nHistogram - 1]; if (sum <= top) maxScan = 0x1 << nHistogram; elog(INFO, "List length for the rest: %d (%.2f%%)\n", histogram[nHistogram - 1], histogram[nHistogram - 1] * 100.0 / total); elog(INFO, "Choose maximum of scan: %d\n", maxScan); */ } /* * HashTable::preProcessInsert() * update the statistics infos for the hash tables. In particular, * we update the empty bitmap array and collision bitmap array, * and update the bucket value to be the number of collision entries * hashed into the bucket. */ void EmbedHashTable::preProcessInsert(int64 * key) { uint32 bucketID; numEntry++; /* compute the hash value */ HASH_FUNCTION(key, numBucket, words, bucketID); if (!BITMAP_IS(emptyBits[bucketID/BITS_PER_BYTE], bucketID % BITS_PER_BYTE)) { /* set the empty bit*/ BITMAP_SET(emptyBits[bucketID/BITS_PER_BYTE], bucketID % BITS_PER_BYTE); numEmpty--; } else { if (!BITMAP_IS(collisionBits[bucketID/BITS_PER_BYTE], bucketID % BITS_PER_BYTE)) { /* The two collsision entries will be added into the overflow array */ BITMAP_SET(collisionBits[bucketID/BITS_PER_BYTE], bucketID % BITS_PER_BYTE); numCollision++; numOverflowEntry += 2; buckets[bucketID] += 2; } else { /* The collsision entry will be added into the overflow array */ numOverflowEntry++; buckets[bucketID]++; } } } /* * HashTable::buildInit() * allocate and initialize the overflow pool. */ int EmbedHashTable::buildInit() { return SUCCESS; } /* * HashTable::insert() * insert an segment(entry) into the hash table. If the collision * bit is 0, the position of the segment is directly stored in the * hash bucket. Otherwise, the position of the segment is stored in * the end of the overflow list of the bucket. 
*/
void EmbedHashTable::insert(int64 * key, unsigned int offset) {
    uint32 bucketId;
    uint32 curOverflowEntry;
    bool collision;

    /* compute the hash value */
    HASH_FUNCTION(key, numBucket, words, bucketId);

    if (compressedTable)
        collision = HASH_IS_COLLISION(buckets[bucketId]);
    else
        collision = BITMAP_IS(collisionBits[bucketId / BITS_PER_BYTE],
                bucketId % BITS_PER_BYTE);

    if (!collision) {
        /* store the position into the bucket */
        if (compressedTable)
            buckets[bucketId] = HASH_GET_OFFSET(offset);
        else
            buckets[bucketId] = offset;
        return;
    }

    /*
     * The bucket holds the position just past the next free slot of its
     * overflow list; step back to that slot.
     */
    buckets[bucketId]--;

    /* get the overflow list position */
    if (compressedTable)
        curOverflowEntry = HASH_GET_OFFSET(buckets[bucketId]);
    else
        curOverflowEntry = buckets[bucketId];

    /*
     * store the position of the new segment in the overflow list.
     * HASH_SET_OFFSET assigns to its first argument itself and keeps
     * the end-of-list bit of the slot.
     */
    if (compressedTable)
        HASH_SET_OFFSET(overflowPool[curOverflowEntry], offset);
    else
        overflowPool[curOverflowEntry] = offset;
}

/*
 * HashTable::lookup()
 * search the segment(key) on the hash table, find the potential
 * matched portions in the genome sequence. Call function
 * pairAligner::pairAlign to perform pairwise alignment between
 * query sequence and the potential matched portions.
*/ unsigned int EmbedHashTable::lookup(int64 * orgkey, int64 * key, int keyOffset, char * quals, strand s, int rid, HitSet * hits, bool noGap) { int num; uint32 bucketId; uint32 seqOffset, entryOffset, startOffset; int64 tspace1[16], tspace2[16]; int64 * diff = &tspace1[8], *target = &tspace2[8]; int64 * seqVector; bool collision; uint32 sid, soffset; int ret; ErrorVector error; int nScanEntry = 0; int rett = SUCCESS; int maxGap = 0; if (!noGap) maxGap = nMaxGap; seqVector = sequence->getSequence(); /* compute the hash value */ HASH_FUNCTION(key, numBucket, words, bucketId); if (compressedTable) collision = HASH_IS_COLLISION(buckets[bucketId]); else collision = BITMAP_IS(collisionBits[bucketId / BITS_PER_BYTE], bucketId % BITS_PER_BYTE); statEmbedHashLookup++; if (!collision) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(buckets[bucketId]); else seqOffset = buckets[bucketId]; if (seqOffset != HASH_EMPTY) { statEmbedHashLookupEntry++; HASH_DEBUG(printf(" %u", seqOffset - keyOffset / BITS_PER_BASE)); #ifdef DEBUG_HASH_PRINT BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL + lenRest, lenKey); if (BitRead::compare(target, key)) printf("*"); #endif /* get the potential matched portion */ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL - maxGap * BITS_PER_BASE - keyOffset, lenSeq + maxGap * 2 * BITS_PER_BASE); nScanEntry++; /** * perform the pairwise alignment under the constraint * on the number of errors. 
*/ error = PairAligner::pairAlign(orgkey, target, length, nMaxError, maxGap); if (error.num <= nMaxError) { seqOffset = seqOffset - maxGap + error.offset - keyOffset / BITS_PER_BASE; if (maxGap != 0) { /** * if supports indel, re-extract the matched portion * with proper offset and length **/ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL, error.len * BITS_PER_BASE); } ret = hits->add(orgkey, target, seqOffset, s, &error, error.qual, rid); if (ret == MSG_HITSETFULL) { HASH_DEBUG(printf(" HIT")); rett = ret; } } } // else // stat_empty++; } else { // stat_collision++; /* get the position of the overflow list */ if (compressedTable) entryOffset = HASH_GET_OFFSET(buckets[bucketId]); else entryOffset = buckets[bucketId]; startOffset = entryOffset; /* scan the overflow list */ while (entryOffset < numOverflowEntry) { /* get the position of potential matched portion */ if (compressedTable) seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]); else seqOffset = overflowPool[entryOffset]; if (seqOffset != HASH_EMPTY) { statEmbedHashLookupEntry++; HASH_DEBUG(printf(" %u", seqOffset - keyOffset / BITS_PER_BASE)); #ifdef DEBUG_HASH_PRINT BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL + lenRest, lenKey); if (BitRead::compare(target, key)) printf("*"); #endif /* get the potential matched portion */ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL - maxGap * BITS_PER_BASE - keyOffset, lenSeq + maxGap * 2 * BITS_PER_BASE); nScanEntry++; /** * perform the pairwise alignment under the constraint * on the number of errors. 
*/ error = PairAligner::pairAlign(orgkey, target, length, nMaxError, maxGap); if (error.num <= nMaxError) { seqOffset = seqOffset - maxGap + error.offset - keyOffset / BITS_PER_BASE; if (maxGap != 0) { /** * if supports indel, re-extract the matched portion * with proper offset and length **/ BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL, error.len * BITS_PER_BASE); } ret = hits->add(orgkey, target, seqOffset, s, &error, error.qual, rid); if (ret == MSG_HITSETFULL) { HASH_DEBUG(printf(" HIT")); rett = ret; break; } } } // if (entryOffset > startOffset + HASH_OVERFLOW_LIST_SCAN_BOUND) // if (entryOffset > startOffset + 64 * 16) // break; if (compressedTable) { if (HASH_IS_END(overflowPool[entryOffset])) break; } else { if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE)) break; } entryOffset++; } } HASH_DEBUG(printf("\n")); return rett; } /* * HashTable::save() * This function is used to save the in-memory hash table on * disk. */ int EmbedHashTable::save(FILE * file) { size_t ret; ret = fwrite(this, sizeof(EmbedHashTable), 1, file); if (ret != 1) { return ERR_FILE; } ret = fwrite(buckets, sizeof(uint32), numBucket, file); if (ret != numBucket) { return ERR_FILE; } ret = fwrite(overflowPool, sizeof(uint32), numOverflowEntry, file); if (ret != numOverflowEntry) { return ERR_FILE; } if (!compressedTable) { ret = fwrite(collisionBits, sizeof(unsigned char), numBucket / BITS_PER_BYTE + 1, file); if (ret != numBucket / BITS_PER_BYTE + 1) { return ERR_FILE; } ret = fwrite(overflowBits, sizeof(unsigned char), numOverflowEntry / BITS_PER_BYTE + 1, file); if (ret != numOverflowEntry / BITS_PER_BYTE + 1) { return ERR_FILE; } } return SUCCESS; } /* * HashTable::load() * This function is used to load the on-disk copy of hash table * into memory. 
*/
int EmbedHashTable::load(FILE * file, CompactSequence * seq) {
    size_t ret;
    size_t nBytes;

    ret = fread(this, sizeof(EmbedHashTable), 1, file);

    /*
     * The object image contains the pointer values of the process that
     * saved it. They are meaningless here, so reset them before anything
     * can free or dereference them.
     */
    buckets = NULL;
    overflowPool = NULL;
    emptyBits = NULL;
    collisionBits = NULL;
    overflowBits = NULL;

    if (ret != 1) {
        elog(ERROR, "failed to load hash table head.\n");
        return ERR_FILE;
    }

    sequence = seq;

    /* we can use smaller block to reduce the memory consumption */
    buckets = (uint32 *) malloc((int64) numBucket * sizeof(uint32));
    if (buckets == NULL)
        return ERR_MEM;
    ret = fread(buckets, sizeof(uint32), numBucket, file);
    if (ret != numBucket) {
        elog(ERROR, "failed to load buckets.\n");
        return ERR_FILE;
    }

    /*
     * A table without collisions has an empty overflow pool; request at
     * least one byte so that NULL always means "out of memory".
     */
    nBytes = (size_t) numOverflowEntry * sizeof(uint32);
    overflowPool = (uint32 *) malloc(nBytes > 0 ? nBytes : 1);
    if (overflowPool == NULL)
        return ERR_MEM;
    ret = fread(overflowPool, sizeof(uint32), numOverflowEntry, file);
    if (ret != numOverflowEntry) {
        elog(ERROR, "failed to load overflow array.\n");
        return ERR_FILE;
    }

    if (!compressedTable) {
        /* uncompressed tables keep their flag bits in separate bitmaps */
        collisionBits = (unsigned char *) malloc(
                (int64) (numBucket / BITS_PER_BYTE + 1)
                        * sizeof(unsigned char));
        if (collisionBits == NULL)
            return ERR_MEM;
        ret = fread(collisionBits, sizeof(unsigned char),
                numBucket / BITS_PER_BYTE + 1, file);
        if (ret != numBucket / BITS_PER_BYTE + 1) {
            elog(ERROR, "failed to load collision bits.\n");
            return ERR_FILE;
        }

        overflowBits = (unsigned char *) malloc(
                (int64) (numOverflowEntry / BITS_PER_BYTE + 1)
                        * sizeof(unsigned char));
        if (overflowBits == NULL)
            return ERR_MEM;
        ret = fread(overflowBits, sizeof(unsigned char),
                numOverflowEntry / BITS_PER_BYTE + 1, file);
        if (ret != numOverflowEntry / BITS_PER_BYTE + 1) {
            /* size_t must not be passed to %d; print both counts as unsigned long */
            elog(ERROR, "failed to load overflow bits. %lu %lu\n",
                    (unsigned long) ret,
                    (unsigned long) (numOverflowEntry / BITS_PER_BYTE + 1));
            return ERR_FILE;
        }
    }

    return SUCCESS;
}

/*
 * HashTable::remove
 * free the space occupied by the hash index.
*/ int EmbedHashTable::remove() { if (buckets) { free(buckets); buckets = NULL; } if (overflowPool) { free(overflowPool); overflowPool = NULL; } if (emptyBits) { free(emptyBits); emptyBits = NULL; } if (collisionBits) { free(collisionBits); collisionBits = NULL; } return SUCCESS; } /* * nextPrime() * return the least prime number that is greater * than the input number. */ unsigned int EmbedHashTable::nextPrime(unsigned int num) { unsigned int i, j, x; num = num / 2 * 2 + 1; for (i = num; i < num + 1000; i += 2) { x = (unsigned int) sqrtl(i); for (j = 3; j < x; j += 2) { if (i % j == 0) break; } if (j >= x) return i; } return i; } wham/sequence.cpp0000644001532600153260000005643412003705361013365 0ustar yinanyinan/** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: sequence.cpp 157 2012-07-25 05:58:09Z yinan $ */ #include #include #include #include #include #include "error.h" #include "sequence.h" #include "bitread.h" #include "util.h" #include "aligner.h" #include "hash.h" /* * A = 0, C = 1, G = 2, T = 2, N = 7. * If the value of A, C, G, T is changed, modify this array. 
*/ const char code2Gene[8] = { 'A', 'C', 'G', 'T', 'N', 'N', 'N', 'N' }; CompactSequence::CompactSequence() { memset(this, 0, sizeof(CompactSequence)); skipMask = true; } CompactSequence::CompactSequence(bool skip) { memset(this, 0, sizeof(CompactSequence)); skipMask = skip; } /* * CompactSequence::skipLine * skip the next line in the specified file */ int CompactSequence::skipLine(FILE * file) { char c; int i = 0; /* * skip all characters until get a character * with the value of 10, 13, or EOF */ while (1) { c = fgetc(file); i++; if (c == 10 || c == 13 || c == EOF ) break; } return i; } int CompactSequence::getSeqName(FILE * file, char * str) { int i = 0; char * c; /* * skip all characters until get a character * with the value of 10, 13, or EOF */ while (1) { str[i] = fgetc(file); if (str[i] == 10 || str[i] == 13 || str[i] == EOF ) break; i++; } c = strchr(str, ' '); if (c == NULL ) str[i] = '\0'; else *c = '\0'; return i; } void CompactSequence::extractFileName(char * dest, char * src) { int start, end; char * str; start = 0; end = strlen(src) - 1; for (int i = strlen(src) - 1; i >= 0; i--) { //remove path if (src[i] == '\\' || src[i] == '/') { start = i + 1; break; } //remove file extension name if (src[i] == '.') { if (strcmp(&src[i + 1], "fq") == 0 || strcmp(&src[i + 1], "fa") == 0 || strcmp(&src[i + 1], "fastq") == 0 || strcmp(&src[i + 1], "mfa") == 0) end = i - 1; } } strncpy(dest, &src[start], end - start + 1); dest[end - start + 1] = '\0'; } /* * CompactSequence::preProcess * This function is used to collection statistics infomations * for the building phase. The informations include: * 1) the number of effective characters. Effective characters * include all A, C, G adn T characters, and the first numError+1 * unknown characters in each N segment. 
* 2) the number of N Segments
* 3) the number of sequences in the specific file
*
* The three counters are returned through num, numNSegment and
* numFileSeq. Returns SUCCESS, ERR_FILE when the file cannot be opened,
* or the non-zero result of fclose().
*/
int CompactSequence::preProcess(char * fname, uint32 numError, int64 & num,
        int64 & numNSegment, int64 & numFileSeq) {
    FILE * file;
    /* fgetc() returns an int so that EOF differs from every byte value */
    int c;
    unsigned int numContinuousN = 0;
    int ret;

    file = fopen(fname, "rb");
    if (file == NULL) {
        printf("File does not exist.\n");
        return ERR_FILE;
    }

    num = 0;
    numNSegment = 0;
    numFileSeq = 0;

    while ((c = fgetc(file)) != EOF) {
        /* skip the comments */
        if (c == '>') {
            skipLine(file);
            numFileSeq++;
            numContinuousN = 0;
            continue;
        }

        /* skip the line break characters */
        if (c == 10 || c == 13)
            continue;

        /*
         * upper-case bases are always valid; lower-case (masked) bases
         * only when masked regions are not skipped. All other characters
         * are treated as unknown characters.
         */
        bool isUnknownChar = true;
        if (c == 'A' || c == 'C' || c == 'G' || c == 'T')
            isUnknownChar = false;
        else if ((c == 'a' || c == 'c' || c == 'g' || c == 't') && !skipMask)
            isUnknownChar = false;

        if (!isUnknownChar) {
            /*
             * a run of more than (numError + 1) unknown characters
             * that has just ended is counted as an N segment.
             */
            if (numContinuousN > numError + 1)
                numNSegment++;
            num++;
            numContinuousN = 0;
        } else {
            numContinuousN++;
            /*
             * the first numError + 1 unknown characters are counted
             * in the number of effective characters.
             */
            if (numContinuousN <= numError + 1)
                num++;
        }
    }

    ret = fclose(file);
    if (ret != 0)
        return ret;

    return SUCCESS;
}

/*
 * CompactSequence::build
 * This function is used to build the compact sequence. The compact
 * sequence contains all effective characters, each of which is
 * represented by three bits. Effective characters include all A, C,
 * G and T characters, and the first numError+1 unknown characters
 * in each N segment, which is the segment that contains contiguous
 * numError+1 Ns. The interval tree is built to transfer the location
 * in the original sequence and compact sequence.
 * 1) invoke preProcess to collect sequence infos.
 * 2) allocate the sequence space.
 * 3) load effective characters of the original sequences into the
 * compact sequence.
*/ int CompactSequence::build(char ** fname, int numFile, int length, int numError) { unsigned int i, j, ret; int curSeq; FILE * file; char c; unsigned int offsetInCmptSeq, offsetInOrgSeq; unsigned int lenSegment, lenSegmentN; unsigned int step, nextstep; int64 numFileChar, numFileNSeg, numFileSeq, nChar, nNSegment; int64 word, code; len = length; nError = numError; /* * scan all sequences to accumulate the number of effective * characters and N segments. */ nChar = 0; nSeq = 0; nNSegment = 0; elog(INFO, "Preprocessing reference sequences...\n"); for (i = 0; i < numFile; i++) { elog(INFO, "preprocessing %s...\n", fname[i]); ret = preProcess(fname[i], numError, numFileChar, numFileNSeg, numFileSeq); if (ret != SUCCESS ) return ret; /* we add numError+1 Ns between two adjacent sequences */ nChar += numFileChar + (numError + 1) * numFileSeq; /* accumulate the number of sequences in all files */ nSeq += numFileSeq; /* * we use 32-bit entry in hash tables, and the MSB is reservered. * So we support up to 2^31 non-N characters in the reference * sequences. */ if (nChar > MAX_NUM_CHAR) { // elog(ERROR, "#characters: %lld\n", nChar); // elog(ERROR, "ERROR: the number of non-N characters in the reference sequences exceeds the maximum value %d\n", MAX_NUM_CHAR); // return ERR_SEQ; } /* we add a N segment in the begining of each sequence in the file */ nNSegment += numFileNSeg + numFileSeq; } /* we add a N segment in the end of all sequences */ nNSegment++; nChar += numError + 1; elog(INFO, "\n"); /* * we use 32-bit entry in hash tables, and the MSB is reservered. * So we support up to 2^31 non-N characters in the reference * sequences. */ if (nChar > MAX_NUM_CHAR) { elog(ERROR, "#characters: %lld\n", nChar); elog( ERROR, "ERROR: the number of non-N characters in the reference sequences exceeds the maximum value %d\n", MAX_NUM_CHAR); return ERR_SEQ; } numChar = (uint32) nChar; numNSegment = (uint32) nNSegment; /* * allocate the sequence space. 
SEQUENCE_HEAD_WORDS integers * should be left in the begining of the sequence. The left * space is used to avoid the memory overflow when get the * the subsequence starting from the first several characters. */ size = NUM_LONGWORD_BASE(numChar + length) + SEQUENCE_HEAD_WORDS; pool = (int64 *) malloc(size * sizeof(int64)); sequence = pool + SEQUENCE_HEAD_WORDS; memset(pool, 0, SEQUENCE_HEAD_WORDS * sizeof(int64)); /* build an empty interval tree */ itree = new IntervalTree(numNSegment, nError + 1); /* allocate space for seuqence names */ seqNames = new char *[nSeq]; seqNamepool = new char[nSeq * MAX_LENGTH_PATH];for ( i = 0; i < nSeq; i++) seqNames[i] = &seqNamepool[i * MAX_LENGTH_PATH]; /* allocate space for sequence lens */ seqLens = new uint32[nSeq]; word = 0; offsetInCmptSeq = 0; lenSegment = 0; lenSegmentN = 0; curSeq = -1; ProgressBar bar(numChar - 1, PROGRESS_BAR_WIDTH); elog(INFO, "loading reference sequences...\n"); for (i = 0; i < numFile; i++) { /* open the sequence file */ file = fopen(fname[i], "rb"); if (file == NULL) { return ERR_PARA; } while ((c = fgetc(file)) != EOF) { /* update the progress bar*/ bar.update(offsetInCmptSeq); /* the begining of a new sequence */ if (c == '>') { /* * we need to insert numError+1 Ns in the beginning of each sequence. * Otherwise, a substring span over the boundary of two sequences * may introduce wrong alignment. */ code = BASE_N; for (j = 0; j < numError + 1; j++) { if (lenSegment + BITS_PER_BASE >= BITS_PER_LONGWORD) { assert(offsetInCmptSeq <= numChar); assert((offsetInCmptSeq * BITS_PER_BASE) % BITS_PER_LONGWORD != 0); /* append the high bits into the current word. 
*/ sequence[offsetInCmptSeq * BITS_PER_BASE_LL / BITS_PER_LONGWORD] = (word << (BITS_PER_LONGWORD - lenSegment)) | (code >> (lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD)); lenSegment = lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD; /* the current 64-bit word is updated to the low bits */ word = ~((~code) | (-1LL << lenSegment)); } else { /* append the current character into the current word */ lenSegment += BITS_PER_BASE; word = (word << BITS_PER_BASE) | code; } offsetInCmptSeq++; } /* save sequence length */ if (curSeq >= 0) seqLens[curSeq] = offsetInOrgSeq; curSeq++; /* save sequence names */ getSeqName(file, seqNames[curSeq]); /* initialization for each sequence */ offsetInOrgSeq = 0; itree->append(offsetInCmptSeq, 0, curSeq, offsetInOrgSeq); lenSegmentN = 0; } /* skip the line break characters */ else if (c == 10 || c == 13) continue; else { bool isUnknownChar = true; if (c == 'A' || (c == 'a' && !skipMask)) { code = BASE_A; isUnknownChar = false; } else if (c == 'C' || (c == 'c' && !skipMask)) { code = BASE_C; isUnknownChar = false; } else if (c == 'G' || (c == 'g' && !skipMask)) { code = BASE_G; isUnknownChar = false; } else if (c == 'T' || (c == 't' && !skipMask)) { code = BASE_T; isUnknownChar = false; } if (!isUnknownChar) { /* * insert the last N segment into the interval tree. * The insertted point is at the end of the N segment. * We insert the offsets in original sequence and * compact sequence as a pair. */ if (lenSegmentN > numError + 1) { itree->append(offsetInCmptSeq, 0, curSeq, offsetInOrgSeq); } lenSegmentN = 0; } else { code = BASE_N; /* * all other characters are handled similar to the unknown * character 'N'. */ /* update the current N segment length */ lenSegmentN++; /* * if the current N segment exceed numError+1 characters, * discard the character. */ if (lenSegmentN > numError + 1) { offsetInOrgSeq++; continue; } } /* * We use three bits to represent a character. The characters * are packed into 64-bit words. 
Here we check if the current * position is on the boundary of 64-bit word. */ if (lenSegment + BITS_PER_BASE >= BITS_PER_LONGWORD) { assert(offsetInCmptSeq <= numChar); assert((offsetInCmptSeq * BITS_PER_BASE) % BITS_PER_LONGWORD != 0); /* append the high bits into the current word. */ sequence[offsetInCmptSeq * BITS_PER_BASE_LL / BITS_PER_LONGWORD] = (word << (BITS_PER_LONGWORD - lenSegment)) | (code >> (lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD)); lenSegment = lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD; /* the current 64-bit word is updated to the low bits */ word = ~((~code) | (-1LL << lenSegment)); } else { /* append the current character into the current word */ lenSegment += BITS_PER_BASE; word = (word << BITS_PER_BASE) | code; } offsetInCmptSeq++; offsetInOrgSeq++; } } ret = fclose(file); if (ret != 0) return ERR_FILE; } assert(offsetInCmptSeq + numError + 1 == numChar); /* save the last sequence length */ if (curSeq >= 0) seqLens[curSeq] = offsetInOrgSeq; /* add the last N-segment */ code = BASE_N; for (j = offsetInCmptSeq; j < numChar; j++) { /* update the progress bar */ bar.update(j); if (lenSegment + BITS_PER_BASE >= BITS_PER_LONGWORD) { assert(offsetInCmptSeq <= numChar); assert((offsetInCmptSeq * BITS_PER_BASE) % BITS_PER_LONGWORD != 0); /* append the high bits into the current word. 
*/ sequence[offsetInCmptSeq * BITS_PER_BASE_LL / BITS_PER_LONGWORD] = (word << (BITS_PER_LONGWORD - lenSegment)) | (code >> (lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD)); lenSegment = lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD; /* the current 64-bit word is updated to the low bits */ word = ~((~code) | (-1LL << lenSegment)); } else { /* append the current character into the current word */ lenSegment += BITS_PER_BASE; word = (word << BITS_PER_BASE) | code; } offsetInCmptSeq++; } /* flush the current word */ word = word << (BITS_PER_LONGWORD - lenSegment); sequence[offsetInCmptSeq * BITS_PER_BASE_LL / BITS_PER_LONGWORD] = word; itree->flush(offsetInCmptSeq, 0, curSeq, offsetInOrgSeq); elog(DEBUG1, "#characters in reference sequences: %u\n", numChar); return SUCCESS; } int CompactSequence::filter(Aligner * aligner, char ** fname, int numFile, char * path) { unsigned int i, j, ret; FILE * file; char c; unsigned int offsetInCmptSeq; unsigned int lenSegmentN; unsigned int step, nextstep; int64 numFileChar, numFileNSeg, numFileSeq, nChar, nNSegment; int64 space[16]; int nSkipChar = 0; int64 * key = &space[8]; FILE * outfile = fopen(path, "w"); offsetInCmptSeq = 0; lenSegmentN = 0; ProgressBar bar(numChar - 1, PROGRESS_BAR_WIDTH); elog(INFO, "loading reference sequences...\n"); for (i = 0; i < numFile; i++) { /* open the sequence file */ file = fopen(fname[i], "rb"); if (file == NULL) { return ERR_PARA; } while ((c = fgetc(file)) != EOF) { /* update the progress bar*/ bar.update(offsetInCmptSeq); /* the begining of a new sequence */ if (c == '>') { char str[256]; fputc(c, outfile); j = 0; while (1) { str[j] = fgetc(file); if (str[j] == 10 || str[j] == 13 || str[j] == EOF ) break; j++; } str[j] = '\0'; fprintf(outfile, "%s\n", str); offsetInCmptSeq += nError + 1; lenSegmentN = 0; } else if (c == 10 || c == 13) { fputc(c, outfile); continue; } else { nSkipChar--; if (c == 'A' || c == 'a' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'T' || c == 't') { 
lenSegmentN = 0; //search BitRead::extract(sequence, key, offsetInCmptSeq * BITS_PER_BASE_LL, len * BITS_PER_BASE); if (aligner->hashTables[0].lookup(key, offsetInCmptSeq)) { if (nSkipChar > 0) fputc('N', outfile); else fputc(c, outfile); } else { fputc('N', outfile); nSkipChar = len; } } else { fputc(c, outfile); /* update the current N segment length */ lenSegmentN++; /* * if the current N segment exceed numError+1 characters, * discard the character. */ if (lenSegmentN > nError + 1) continue; } offsetInCmptSeq++; } } fputc('\n', outfile); ret = fclose(file); if (ret != 0) return ERR_FILE; } ret = fclose(outfile); if (ret != 0) return ERR_FILE; return SUCCESS; } /** * CompactSequence::compose * compose a character-based sequence into bit-vector format * that using 3 bits to represent a character. */ void CompactSequence::compose(char * str, int length, int64 * words) { int j; int forward_offset, offset; int64 forward_word, code; /* cut the sequence if necessary */ str[length] = '\0'; /* initialize the values */ words[0] = 0; words[1] = 0; offset = (WORDS_PER_READ * BITS_PER_LONGWORD - length * BITS_PER_BASE) / BITS_PER_LONGWORD; /* initialize the current word in forward/backward format */ forward_word = 0; /* initialize the begining offset in forward/backward format */ forward_offset = (WORDS_PER_READ * BITS_PER_LONGWORD - length * BITS_PER_BASE) % BITS_PER_LONGWORD; /* * scan the sequence and generate the compact representation * in forward or/and backward format. 
*/ for (j = 0; j < length; j++) { if (str[j] == 'A') code = BASE_A; else if (str[j] == 'C') code = BASE_C; else if (str[j] == 'G') code = BASE_G; else if (str[j] == 'T') code = BASE_T; else if (str[j] == 'N') code = BASE_N; else elog(ERROR, "ERROR: unknown character in short read files.\n"); /* forward format */ if (forward_offset + BITS_PER_BASE >= BITS_PER_LONGWORD) { /* on the boundary of 64-bit word */ words[offset++] = (forward_word << (BITS_PER_LONGWORD - forward_offset)) | (code >> (forward_offset + BITS_PER_BASE - BITS_PER_LONGWORD)); forward_offset = forward_offset + BITS_PER_BASE - BITS_PER_LONGWORD; forward_word = ~((~code) | (-1LL << forward_offset)); } else { forward_word = (forward_word << BITS_PER_BASE) | code; forward_offset += BITS_PER_BASE; } } } void CompactSequence::decompose(char * str, int length, int64 * words) { int i, j = 0, k; int64 code; k = WORDS_PER_READ - 1; str[length] = '\0'; for (i = length - 1; i >= 0; i--) { if (j + BITS_PER_BASE > BITS_PER_LONGWORD) { if (j < BITS_PER_LONGWORD ) code = ((words[k] >> j) | (words[k - 1] << (BITS_PER_LONGWORD - j))) & 0x7; else code = words[k - 1] & 0x7; str[i] = code2Gene[code]; j = j + BITS_PER_BASE - BITS_PER_LONGWORD; k--; } else { code = (words[k] >> j) & 0x7; str[i] = code2Gene[code]; j += 3; } } } /* * CompactSequence::save * This function is used to save the in-memory sequence to disk. * The CompactSequence structure is stored in the file sequence.whm * in the specified data path. Interval tree is stored in the file * interval.whm. 
*/
int CompactSequence::save(char * path) {
    int ret;
    char fname[MAX_LENGTH_PATH];
    FILE * file;

    if (strlen(path) > 240)
        return ERR_PARA;

    /* bounded formatting: the name must fit into fname completely */
    ret = snprintf(fname, sizeof(fname), "%s.sequence.whm", path);
    if (ret < 0 || (size_t) ret >= sizeof(fname))
        return ERR_PARA;

    file = fopen(fname, "wb");
    if (file == NULL) {
        elog(ERROR, "ERROR:failed to open file: %s\n", fname);
        return ERR_PARA;
    }

    /*
     * write the object image, the packed sequence, the sequence names
     * and the sequence lengths, in this order, and flush them. The file
     * is closed on every failure path.
     */
    if (fwrite(this, sizeof(CompactSequence), 1, file) != 1
            || fwrite(pool, sizeof(int64), size, file) != (size_t) size
            || fwrite(seqNamepool, sizeof(char) * MAX_LENGTH_PATH, nSeq, file)
                    != (size_t) nSeq
            || fwrite(seqLens, sizeof(uint32), nSeq, file) != (size_t) nSeq
            || fflush(file) != 0) {
        elog(ERROR, "ERROR: write head data file.\n");
        fclose(file);
        return ERR_FILE;
    }

    ret = fclose(file);
    if (ret != 0)
        return ERR_FILE;

    /* the interval tree goes into its own file */
    ret = itree->save(path);
    if (ret != SUCCESS)
        return ret;

    return SUCCESS;
}

/*
 * CompactSequence::load
 * This function is used to load the on-disk sequence structure into
 * main memory. The CompactSequence structure is loaded from the file
 * sequence.whm in the specified data path. Interval tree is loaded
 * from the file interval.whm.
*/ int CompactSequence::load(char * path) { int ret; char fname[256]; FILE * file; if (strlen(path) > 240) return ERR_PARA; sprintf(fname, "%s.sequence.whm", path); file = fopen(fname, "rb"); if (file == NULL ) return ERR_PARA; ret = fread(this, sizeof(CompactSequence), 1, file); if (ret != 1) { elog(ERROR, "ERROR: read sequence structure data file.\n"); return ERR_FILE; } pool = (int64 *) malloc(size * sizeof(int64)); sequence = pool + SEQUENCE_HEAD_WORDS; ret = fread(pool, sizeof(int64), size, file); if (ret != size) { elog(ERROR, "ERROR: read sequence data file.\n"); return ERR_FILE; } seqNamepool = (char *) malloc(nSeq * MAX_LENGTH_PATH); ret = fread(seqNamepool, sizeof(char) * MAX_LENGTH_PATH, nSeq, file); if (ret != nSeq) { elog(ERROR, "ERROR: read sequence data file.\n"); return ERR_FILE; } seqNames = (char **) malloc(nSeq * sizeof(char *)); for (int i = 0; i < nSeq; i++) seqNames[i] = &seqNamepool[i * MAX_LENGTH_PATH]; seqLens = (uint32 *) malloc(nSeq * sizeof(uint32)); ret = fread(seqLens, sizeof(uint32), nSeq, file); if (ret != nSeq) { elog(ERROR, "ERROR: read sequence data file.\n"); return ERR_FILE; } ret = fclose(file); if (ret != 0) return ERR_FILE; itree = new IntervalTree; ret = itree->load(path); if (ret != SUCCESS ) return ret; return SUCCESS; } /* * CompactSequence::valid * this function is used to check if the sequence is compatible * with the specificied parameters */ int CompactSequence::valid(int length, int numError) { if (length != len) return ERR_PARA; if (numError != nError) return ERR_PARA; return SUCCESS; } /* int CompactSequence::loadRead(char * path) { int ret; char fname[256]; FILE * file; sprintf(fname, "%s//short.dat", path); file = fopen(fname, "rb"); if (file == NULL) return ; ret = fseek(file, 0, SEEK_END); if (ret != 0) return 0; long int size; size = ftell(file); numRead = size / (sizeof(int64) * 3 + sizeof(uint32)); ret = fseek(file, 0, SEEK_SET); if (ret != 0) return 0; keys = new int64[numRead * 3]; offsets = new 
uint32[numRead]; for (uint32 i = 0; i < numRead; i++) { ret = fread(keys + i * 3, sizeof(int64), 3, file); if (ret != 3) return 0; ret = fread(offsets + i, sizeof(uint32), 1, file); if (ret != 1) return 0; } fclose(file); printf("load reads succesfully.\n"); return 1; } */ wham/pair.h0000644001532600153260000007247112003705361012154 0ustar yinanyinan#ifndef _PAIRWISE_ALIGNER_H_ #define _PAIRWISE_ALIGNER_H_ /** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ /* $Id: pair.h 157 2012-07-25 05:58:09Z yinan $ */ #include "sequence.h" #include "short.h" #include "rdtsc.h" #include #define MAX_GAP 6 typedef struct int128 { int64 high; int64 low; } int128; typedef struct ErrorVector { unsigned char offset :4; //offset of matched target str unsigned char gap :4; signed char num; unsigned char len; //length of matched target str unsigned char qual; unsigned int vec; } ErrorVector; #define ERROR_MAX_ERR 127 #define ERROR_VECTOR_NUL 0 #define ERROR_VECTOR_INS 1 #define ERROR_VECTOR_DEL 2 #define ERROR_VECTOR_MIS 3 #define GET_ERROR(x, i) (((x) >> (32 - ((i) + 1) * 2)) & 0x3) #define SET_ERROR(x, e) (x) = ((x) >> 2) | ((e) << 30) //#define UPDATE_ERROR(x, e) (x) = (((x) & 0x3fffffff) | ((e) << 30)) extern unsigned long long timePairAlign; extern unsigned long long numPairAlign; extern unsigned long long numPairAlignFilter; extern unsigned long long numPairAlignDBA; class PairAligner { public: static inline ErrorVector pairAlign(int64 * key, int64 * target, int length, int nError) { int64 diff[6]; int64 cmp[2]; ErrorVector error; diff[0] = target[0] ^ key[0]; diff[1] = target[1] ^ key[1]; diff[2] = target[2] ^ key[2]; diff[3] = target[3] ^ key[3]; diff[4] = target[4] ^ key[4]; diff[5] = target[5] ^ key[5]; //compact a 64-bit difference int into a 32-bit difference int cmp[0] = (diff[0] | (diff[0] >> 1) | (diff[0] >> 2)) & 0x2492492492492492LL; cmp[0] |= (diff[1] | (diff[1] >> 1) | (diff[1] >> 2) | (diff[0] << 63)) & 0x4924924924924924LL; cmp[0] |= (diff[2] | (diff[2] >> 1) | (diff[2] >> 2) | (diff[1] << 63) | (diff[1] << 62)) & 0x9249249249249249LL; cmp[1] = (diff[3] | (diff[3] >> 1) | (diff[3] >> 2)) & 0x2492492492492492LL; cmp[1] |= (diff[4] | (diff[4] >> 1) | (diff[4] >> 2) | (diff[3] << 63)) & 0x4924924924924924LL; cmp[1] |= (diff[5] | (diff[5] >> 1) | (diff[5] >> 2) | (diff[4] << 63) | (diff[4] << 62)) & 0x9249249249249249LL; #if POPCNT cmp[0] = popcnt(cmp[0]); cmp[1] = popcnt(cmp[1]); #else cmp[0] = BitCount(cmp[0]); 
cmp[1] = BitCount(cmp[1]); #endif error.num = cmp[0] + cmp[1]; error.offset = 0; error.gap = 0; error.len = length; error.vec = -1; //all mismatches return error; } static inline ErrorVector pairAlign(int64 * key, int64 * target, int length, int nError, int nGap) { unsigned long long time; ErrorVector errorVector; startTimer(&time); #ifdef PAIRALIGN_BASIC PairAligner aligner; bool match = aligner.pairAlignEnum(key, target, length, nError, nGap); if (match) errorVector.num = 0; else errorVector.num = nError + 5; errorVector.len = length; errorVector.offset = 0; #else #ifdef PAIRALIGN_NW bool match = pairAlignDP(key, target, length, nError, nGap); if (match) errorVector.num = 0; else errorVector.num = nError + 5; errorVector.len = length; errorVector.offset = 0; #else #ifdef PAIRALIGN_BITVECTOR bool match = pairAlignBitVector(key, target, length, nError, nGap); if (match) errorVector.num = 0; else errorVector.num = nError + 5; errorVector.len = length; errorVector.offset = 0; #else errorVector = pairAlignDBA(key, target, length, nError, nGap); #endif #endif #endif stopTimer(&time); if (nGap != 0) { timePairAlign += time; numPairAlign++; } return errorVector; } static void printTimePairAlign() { if (numPairAlign != 0) { printf("Average pair align time: %llu cycles\n", timePairAlign / numPairAlign); printf("Filter ratio: %llu/%llu = %.2f %%\n", numPairAlignFilter, numPairAlign, (double) numPairAlignFilter / numPairAlign * 100); printf("Filter pass ratio: %llu/%llu = %.2f %%\n", numPairAlignDBA, numPairAlignFilter, (double) numPairAlignDBA / numPairAlignFilter * 100); } } /** * the target sequence should be (nInsertion + nDelete) longer than the key sequence. 
*/ static inline ErrorVector pairAlignDBA(int64 * key, int64 * target, int length, int nError, int nGap) { int i, e; int64 diff[6]; int128 d[10][2 * MAX_GAP + 1];int128 bases[2 * MAX_GAP + 1];ErrorVector vector[2 * MAX_GAP + 1], _vector[2 * MAX_GAP + 1]; int128 filter, mask; int64 num; /* quick path for non-indel pair alignment */ if (nGap == 0) { return pairAlign(key, target, length, nError); } numPairAlignFilter++; filter.low = filter.high = -1; if (length < 64) { mask.high = 0; mask.low = (1ULL << length) - 1; } else { mask.high = (1ULL << (length - 64)) - 1; mask.low = -1; } for (i = 0; i <= 2 * nGap; i++) { //get the bit-difference vector if (i == 0) { diff[0] = key[0] ^ target[0]; diff[1] = key[1] ^ target[1]; diff[2] = key[2] ^ target[2]; diff[3] = key[3] ^ target[3]; diff[4] = key[4] ^ target[4]; diff[5] = key[5] ^ target[5]; } else { int i1 = i * 3; int i2 = 64 - i1; diff[0] = key[0] ^ (target[0] >> i1); diff[1] = key[1] ^ ((target[1] >> i1) | (target[0] << i2)); diff[2] = key[2] ^ ((target[2] >> i1) | (target[1] << i2)); diff[3] = key[3] ^ ((target[3] >> i1) | (target[2] << i2)); diff[4] = key[4] ^ ((target[4] >> i1) | (target[3] << i2)); diff[5] = key[5] ^ ((target[5] >> i1) | (target[4] << i2)); } //use 1 bit to represent a character diff[5] = (diff[5] | (diff[5] >> 1) | (diff[5] >> 2) | (diff[4] << 63) | (diff[4] << 62)) & 0x9249249249249249LL; diff[4] = ((diff[4] >> 2) | (diff[4] >> 3) | (diff[4] >> 4) | (diff[3] << 63)) & 0x9249249249249249LL; diff[3] = ((diff[3] >> 1) | (diff[3] >> 2) | (diff[3] >> 3)) & 0x9249249249249249LL; diff[2] = (diff[2] | (diff[2] >> 1) | (diff[2] >> 2) | (diff[1] << 63) | (diff[1] << 62)) & 0x9249249249249249LL; diff[1] = ((diff[1] >> 2) | (diff[1] >> 3) | (diff[1] >> 4) | (diff[0] << 63)) & 0x9249249249249249LL; diff[0] = ((diff[0] >> 1) | (diff[0] >> 2) | (diff[0] >> 3)) & 0x9249249249249249LL; //push all valid bits to the rightmost diff[0] = BitPushRight(diff[0]); diff[1] = BitPushRight(diff[1]); diff[2] = 
BitPushRight(diff[2]); diff[3] = BitPushRight(diff[3]); diff[4] = BitPushRight(diff[4]); diff[5] = BitPushRight(diff[5]); d[0][2 * nGap - i].high = (diff[0] << 43) | (diff[1] << 22) | diff[2]; d[0][2 * nGap - i].low = (diff[3] << 43) | (diff[4] << 22) | diff[5]; /** * if we clear all errors and the number of indels is * less than nGap: */ if (Bit128EqualZero(d[0][2 * nGap - i], length)) { _vector[i].num = 0; _vector[i].offset = 2 * nGap - i; _vector[i].len = length; return _vector[i]; } bases[2 * nGap - i] = d[0][2 * nGap - i]; vector[2 * nGap - i].len = 0; vector[2 * nGap - i].gap = 0; filter = Bit128And(filter, d[0][2 * nGap - i]); } //check filter #ifndef PAIRALIGN_NO_FILTER filter = Bit128And(filter, mask); num = 0; #if POPCNT num += popcnt(filter.high); num += popcnt(filter.low); #else num += BitCount(filter.high); num += BitCount(filter.low); #endif if (num > nError) { vector[0].num = ERROR_MAX_ERR; return vector[0]; } #endif numPairAlignDBA++; /** * we make a performance evaluation. Mainteining the status arrays * take long time to execute. * To be improved. 
*/ int128 cur; for (e = 1; e <= nError; e++) { /* -nGap, -nGap + 1, ..., -1, 0, 1, ..., nGap - 1, nGap */ for (i = 0; i <= 2 * nGap; i++) { //apply mismatch d[e][i] = Bit128RemoveRightmost1(d[e - 1][i]); _vector[i] = vector[i]; SET_ERROR(_vector[i].vec, ERROR_VECTOR_MIS); //apply deletion if (i < 2 * nGap) { cur = Bit128SmearRightmost1(d[e - 1][i + 1]); cur = Bit128And(bases[i], cur); if (Bit128CompareRightmost1(d[e][i], cur)) { d[e][i] = cur; _vector[i] = vector[i + 1]; _vector[i].len++; _vector[i].gap++; SET_ERROR(_vector[i].vec, ERROR_VECTOR_DEL); } } //apply insert if (i > 0) { /** * this identical to RemoveSmearRightmost1 * cur = Bit128LeftShift1(d[e - 1][i - 1]); * cur = Bit128SmearRightmost1(cur); **/ cur = Bit128RemoveSmearRightmost1(d[e - 1][i - 1]); cur = Bit128And(cur, bases[i]); if (Bit128CompareRightmost1(d[e][i], cur)) { d[e][i] = cur; _vector[i] = vector[i - 1]; _vector[i].len--; _vector[i].gap++; SET_ERROR(_vector[i].vec, ERROR_VECTOR_INS); } } /** * if we clear all errors and the number of indels is * less than nGap: */ if (Bit128EqualZero(d[e][i], length) && _vector[i].gap <= nGap) { _vector[i].num = e; _vector[i].offset = i; _vector[i].len += length; return _vector[i]; } } for (i = 0; i <= 2 * nGap; i++) vector[i] = _vector[i]; } vector[0].num = ERROR_MAX_ERR; return vector[0]; } static inline unsigned char qual(int64 * key, int64 * target, int len, char * quals, ErrorVector * error, strand s) { /* if (error->num <= (int)(len * 0.25 + 0.5)) return 37; else return 25; */ int i, j, e, idx; int type; char keyStr[129], targetStr[129]; double p = 1; unsigned char qual; //the maining is used for future improvement CompactSequence::decompose(keyStr, len, key); CompactSequence::decompose(targetStr, error->len, target); e = 0; for (i = 0, j = 0; i < len; i++, j++) { if (s == FORWARD ) idx = i; else idx = len - i - 1; if (keyStr[i] == targetStr[j]) { p *= 1 - pow((double) 10, (quals[idx] - 33) / -10.0); } else { p *= pow((double) 10, (quals[idx] - 33) / 
-10.0); type = GET_ERROR(error->vec, e); if (type == ERROR_VECTOR_INS) { j--; } else if (type == ERROR_VECTOR_DEL) { i--; } } e++; } qual = (int) (-10 * log10(1 - p) + 0.5); return qual; } /* static inline int qual(int64 * key, int64 * target, int len, char * quals, strand s) { int64 diff[6]; int64 cmp[6]; int64 value; int qual = 0; int i, j, id; diff[0] = target[0] ^ key[0]; diff[1] = target[1] ^ key[1]; diff[2] = target[2] ^ key[2]; diff[3] = target[3] ^ key[3]; diff[4] = target[4] ^ key[4]; diff[5] = target[5] ^ key[5]; //compact a 64-bit difference int into a 32-bit difference int cmp[0] = (diff[0] | (diff[0] >> 1) | (diff[0] >> 2)) & 0x2492492492492492LL; cmp[1] = (diff[1] | (diff[1] >> 1) | (diff[1] >> 2) | (diff[0] << 63)) & 0x4924924924924924LL; cmp[2] = (diff[2] | (diff[2] >> 1) | (diff[2] >> 2) | (diff[1] << 63) | (diff[1] << 62)) & 0x9249249249249249LL; cmp[3] = (diff[3] | (diff[3] >> 1) | (diff[3] >> 2)) & 0x2492492492492492LL; cmp[4] = (diff[4] | (diff[4] >> 1) | (diff[4] >> 2) | (diff[3] << 63)) & 0x4924924924924924LL; cmp[5] = (diff[5] | (diff[5] >> 1) | (diff[5] >> 2) | (diff[4] << 63) | (diff[4] << 62)) & 0x9249249249249249LL; for (i = 0; i < 6; i++) { if (cmp[i] == 0) continue; for (j = 2 - i % 3; j < 64; j += 3) { id = len - 1 - ((6 - i) * 64 - j) / 3; // id = i * (64/3) + j/3 - (128 - len); value = cmp[i] & (1LL << (63 - j)); if (value != 0) { if (s == BACKWARD) id = len - 1 - id; qual += quals[id] - 33; } } } return qual + 33; } */ int64 * g_key; int64 * g_target; int g_nError; int g_nInsert; int g_nDelete; int g_length; int numerror[3]; int maxerror[3]; int errorlist[10]; int128 d[2 * MAX_GAP + 1]; bool pairAlignEnum(int64 * key, int64 * target, int length, int nError, int nGap) { int i; int64 diff[4]; int nInsert = nGap; int nDelete = nGap; /* quick path for non-indel pair alignment */ if (nInsert == 0 && nDelete == 0) { ErrorVector error; error = pairAlign(key, target, length, nError); return error.num <= nError; } nGap = nInsert; for (i = 
0; i <= 2 * nGap; i++) { //get the bit-difference vector if (i == 0) { diff[0] = key[0] ^ target[0]; diff[1] = key[1] ^ target[1]; diff[2] = key[2] ^ target[2]; diff[3] = key[3] ^ target[3]; diff[4] = key[4] ^ target[4]; diff[5] = key[5] ^ target[5]; } else { diff[0] = key[0] ^ (target[0] >> (i * 3)); diff[1] = key[1] ^ ((target[1] >> (i * 3)) | (target[0] << (64 - i * 3))); diff[2] = key[2] ^ ((target[2] >> (i * 3)) | (target[1] << (64 - i * 3))); diff[3] = key[3] ^ ((target[3] >> (i * 3)) | (target[2] << (64 - i * 3))); diff[4] = key[4] ^ ((target[4] >> (i * 3)) | (target[3] << (64 - i * 3))); diff[5] = key[5] ^ ((target[5] >> (i * 3)) | (target[4] << (64 - i * 3))); } //use 1 bit to represent a character diff[5] = (diff[5] | (diff[5] >> 1) | (diff[5] >> 2) | (diff[4] << 63) | (diff[4] << 62)) & 0x9249249249249249LL; diff[4] = ((diff[4] >> 2) | (diff[4] >> 3) | (diff[4] >> 4) | (diff[3] << 63)) & 0x9249249249249249LL; diff[3] = ((diff[3] >> 1) | (diff[3] >> 2) | (diff[3] >> 3)) & 0x9249249249249249LL; diff[2] = (diff[2] | (diff[2] >> 1) | (diff[2] >> 2) | (diff[1] << 63) | (diff[1] << 62)) & 0x9249249249249249LL; diff[1] = ((diff[1] >> 2) | (diff[1] >> 3) | (diff[1] >> 4) | (diff[0] << 63)) & 0x9249249249249249LL; diff[0] = ((diff[0] >> 1) | (diff[0] >> 2) | (diff[0] >> 3)) & 0x9249249249249249LL; //push all valid bits to the rightmost diff[0] = BitPushRight(diff[0]); diff[1] = BitPushRight(diff[1]); diff[2] = BitPushRight(diff[2]); diff[3] = BitPushRight(diff[3]); diff[4] = BitPushRight(diff[4]); diff[5] = BitPushRight(diff[5]); d[2 * nGap - i].high = (diff[0] << 43) | (diff[1] << 22) | diff[2]; d[2 * nGap - i].low = (diff[3] << 43) | (diff[4] << 22) | diff[5]; /** * if we clear all errors and the number of indels is * less than nGap: */ if (Bit128EqualZero(d[2 * nGap - i], length)) { return true; } } g_key = key; g_target = target; g_length = length; g_nError = nError; g_nInsert = nInsert; g_nDelete = nDelete; numerror[0] = numerror[1] = numerror[2] = 0; 
maxerror[0] = nDelete; maxerror[1] = nError; maxerror[2] = nInsert; return errorPermutation(0); } bool errorPermutation(int k) { int offset; int128 m, u; if (k == g_nError) { // for (int i = 0; i < k; i++) // printf("%d ", errorlist[i]); // printf("\n"); offset = 0; u = d[offset]; if (Bit128EqualZero(u, g_length)) { return true; } for (int i = 0; i < k; i++) { if (errorlist[i] == 1) { u = Bit128RemoveRightmost1(u); } else if (errorlist[i] == 0) { //apply deletion if (i > 0) { offset --; m = Bit128SmearRightmost1(u); u = Bit128And(d[offset], m); } else return false; } else if (errorlist[i] == 2) { //apply insert if (i < g_nInsert + g_nDelete) { /** * this identical to RemoveSmearRightmost1 * cur = Bit128LeftShift1(d[e - 1][i - 1]); * cur = Bit128SmearRightmost1(cur); **/ offset ++; m = Bit128RemoveSmearRightmost1(u); u = Bit128And(d[offset], m); } else return false; } /** * if we clear all errors and the number of indels is * less than nGap: */ if (Bit128EqualZero(u, g_length)) { return true; } } return false; } for (int i = 0; i <= 2; i++) { if (numerror[i] >= maxerror[i]) continue; numerror[i]++; errorlist[k] = i; if (errorPermutation(k+1)) return true; numerror[i]--; } return false; } static inline bool pairAlignDP(int64 * key, int64 * target, int length, int nError, int nGap, bool print = false) { int i, j, offset, min, tmp; int c[130][130]; char x[130]; char y[130]; int start, end; bool done, ret; CompactSequence::decompose(y, length, key); CompactSequence::decompose(x, length + nGap * 2, target); // for (i = 0; i <= length; i++) // for (j = 0; j < length + nGap * 2; j++) // c[i][j] = 66; for (i = 0; i <= length; i++) c[i][0] = i; for (i = 0; i <= length + nGap * 2; i++) c[0][i] = 0; for (i = 1; i <= length + nGap * 2; i++) { if (i - nGap -nGap > 1) start = i - nGap - nGap; else start = 1; end = nError + i; done = true; for (j = start; j <= end; j++) { if (j > length) break; min = nError + 1; if (j > start || j == 1) { if (c[j-1][i] + 1 < min) min = c[j-1][i] + 
1; } if (j < end || i == 1) { if (c[j][i-1] + 1 < min) min = c[j][i-1] + 1; } tmp = c[j-1][i-1] + 1; if (x[i - 1] == y[j - 1]) tmp--; if (tmp < min) min = tmp; c[j][i] = min; if (c[j][i] <= nError) done = false; } if (end >= length && c[length][i] <= nError) { ret = true; break; } if (done) { ret = false; break; } } if (print) { printf("\n%s\n%s\n", x, y); printf("current i: %d\n", i); for (i = 0; i <= length; i++) { for (j = 0; j < length + nGap * 2; j++) printf("%2d ", c[i][j]); printf("\n"); } getchar(); } return ret; } static inline bool pairAlignBitVector64(int64 * key, int64 * target, int patlen, int dif, int ngap) { register unsigned long long P, M, X, U, Y; unsigned long long Ebit, One; int i, p, num, base, Cscore; int a; unsigned long long Pc[4]; char buf[130], query[130]; CompactSequence::decompose(query, patlen, key); CompactSequence::decompose(buf, patlen + ngap * 2, target); Pc[0] = Pc[1] = Pc[2] = Pc[3] = 0; One = 1; for (p = 0; p < patlen; p++) { if (query[p] == 'A') Pc[0] |= One; else if (query[p] == 'C') Pc[1] |= One; else if (query[p] == 'G') Pc[2] |= One; else if (query[p] == 'T') Pc[3] |= One; One <<= 1; } One = 1; Ebit = (One << (patlen-1)); P = -1; M = 0; Cscore = patlen; for (i = 0; i < patlen + 2 * ngap; i++) { if (buf[i] == 'A') U = Pc[0]; else if (buf[i] == 'C') U = Pc[1]; else if (buf[i] == 'G') U = Pc[2]; else if (buf[i] == 'T') U = Pc[3]; else U = 0; X = (((U & P) + P) ^ P) | U; U |= M; Y = P; P = M | ~ (X | Y); M = Y & X; if (P & Ebit) Cscore += 1; else if (M & Ebit) Cscore -= 1; Y = P << 1; P = (M << 1) | ~ (U | Y); M = Y & U; if (Cscore <= dif) { return true; } } return false; } static inline bool pairAlignBitVector(int64 * key, int64 * target, int patlen, int dif, int ngap) { int128 P, M, X, U, Y; int128 Ebit, One; int i, p, num, base, Cscore; int a; int128 Pc[4]; char buf[130], query[130]; CompactSequence::decompose(query, patlen, key); CompactSequence::decompose(buf, patlen + ngap * 2, target); Pc[0].high = Pc[0].low = Pc[1].high 
= Pc[1].low = 0; Pc[2].high = Pc[2].low = Pc[3].high = Pc[3].low = 0; One.high = 0; One.low = 1; for (p = 0; p < patlen; p++) { if (query[p] == 'A') Pc[0] = Bit128Or(Pc[0], One); else if (query[p] == 'C') Pc[1] = Bit128Or(Pc[1], One); else if (query[p] == 'G') Pc[2] = Bit128Or(Pc[2], One); else if (query[p] == 'T') Pc[3] = Bit128Or(Pc[3], One); One = Bit128LeftShift1(One); } One.high = 0; One.low = 1; P.high = -1; P.low = -1; M.high = 0; M.low = 0; if (patlen - 1 < 64) { Ebit.high = 0; Ebit.low = 1ULL << (patlen - 1); } else { Ebit.high = 1ULL << (patlen - 1 - 64); Ebit.low = 0; } Cscore = patlen; for (i = 0; i < patlen + 2 * ngap; i++) { if (buf[i] == 'A') U = Pc[0]; else if (buf[i] == 'C') U = Pc[1]; else if (buf[i] == 'G') U = Pc[2]; else if (buf[i] == 'T') U = Pc[3]; else U.high = U.low = 0; X = Bit128Or(Bit128Xor(Bit128Plus(Bit128And(U, P), P), P), U); U = Bit128Or(U, M); Y = P; P = Bit128Or(M, Bit128Neg(Bit128Or(X, Y))); M = Bit128And(Y, X); if (Bit128True(Bit128And(P, Ebit))) Cscore += 1; else if (Bit128True(Bit128And(M, Ebit))) Cscore -= 1; Y = Bit128LeftShift1(P); P = Bit128Or(Bit128LeftShift1(M), Bit128Neg(Bit128Or(U, Y))); M = Bit128And(Y, U); if (Cscore <= dif) { return true; } } return false; } static inline int64 popcnt(int64 v) { int64 ret; #if defined(__i386__) || defined(__x86_64__) __asm__ __volatile__ ("popcnt %0, %1" : "=r"(ret) : "r"(v)); #else #error "POPCNT is only supported in Intel X86/IA-64 architectures" #endif return ret; } static inline int64 BitCount(int64 x) { x = ((x) & 0x5555555555555555LL) + (((x) >> 1) & 0x5555555555555555LL); x = ((x) & 0x3333333333333333LL) + (((x) >> 2) & 0x3333333333333333LL); x = ((x) & 0x0F0F0F0F0F0F0F0FLL) + (((x) >> 4) & 0x0F0F0F0F0F0F0F0FLL); x = ((x) & 0x00FF00FF00FF00FFLL) + (((x) >> 8) & 0x00FF00FF00FF00FFLL); x = ((x) & 0x0000FFFF0000FFFFLL) + (((x) >> 16) & 0x0000FFFF0000FFFFLL); x = ((x) & 0x00000000FFFFFFFFLL) + (((x) >> 32) & 0x00000000FFFFFFFFLL); return x; } /** * push all valid bits in a int64 
to the rightmost of the int64 * 00X00X00X00X00X00X -> 000000000000XXXXXX */ static inline int64 BitPushRight(int64 x) { if (x != 0) { x = (x | (x >> 2)) & 0x30c30c30c30c30c3LL; x = (x | (x >> 4)) & 0xf00f00f00f00f00fLL; x = (x | (x >> 8)) & 0x00ff0000ff0000ffLL; x = (x | (x >> 16) | (x >> 32)) & 0x0000000000ffffffLL; } return x; } static inline int128 Bit128And(int128 x, int128 y) { int128 tmp; tmp.low = x.low & y.low; tmp.high = x.high & y.high; return tmp; } static inline int128 Bit128Or(int128 x, int128 y) { int128 tmp; tmp.low = x.low | y.low; tmp.high = x.high | y.high; return tmp; } static inline int128 Bit128Xor(int128 x, int128 y) { int128 tmp; tmp.low = x.low ^ y.low; tmp.high = x.high ^ y.high; return tmp; } static inline int128 Bit128Neg(int128 x) { int128 tmp; tmp.low = ~x.low; tmp.high = ~x.high; return tmp; } static inline int128 Bit128Plus(int128 x, int128 y) { int128 tmp; tmp.low = x.low + y.low; tmp.high = x.high + x.high; //low overflow if (tmp.low < x.low && tmp.low < y.low) tmp.high++; return tmp; } static inline bool Bit128True(int128 x) { return (x.low != 0) || (x.high != 0); } /** * remove the rightmost 1 in the 128-bit integer */ static inline int128 Bit128RemoveRightmost1(int128 x) { int128 tmp; tmp.low = x.low & (x.low - 1); /** * the following line is identical to the code block: * if (tmp.low == x.low) //the low 64 bits has no 1. * tmp.high = x.high & (x.high - 1); * else * tmp.high = x.high; **/ tmp.high = x.high & (x.high - (tmp.low == x.low)); return tmp; } /** * smear the rightmost 1 to the left in the 128-bit integer */ static inline int128 Bit128SmearRightmost1(int128 x) { int128 tmp; tmp.low = x.low | -x.low; /** * the following line is identical to the code block: * if (x.low == 0) // the low 64 bits have no 1. 
* tmp.high = x.high | - x.high; * else * tmp.high = -1LL; **/ tmp.high = x.high | - x.high | (0LL - (x.low != 0)); return tmp; } /** * remove and smear the rightmost 1 to the left in the * 128-bit integer * NEED TO BE MODIFIED */ static inline int128 Bit128RemoveSmearRightmost1(int128 x) { int128 tmp; tmp.low = x.low ^ -x.low; /** * the following line is identical to the code block: * if (x.low == 0) // the low 64 bits have no 1. * tmp.high = x.high ^ - x.high; * else * tmp.high = -1LL; **/ tmp.high = x.high ^ - x.high | (0LL - (x.low != 0)); return tmp; } /** * left shit the 128-bit integar */ static inline int128 Bit128LeftShift1(int128 x) { int128 tmp; tmp.high = (x.high << 1) | (x.low >> 63); tmp.low = (x.low << 1); return tmp; } /** * right shit the 128-bit integar */ static inline int128 Bit128RightShift1(int128 x) { int128 tmp; tmp.high = (x.high >> 1); tmp.low = (x.high << 63) | (x.low >> 1); return tmp; } static inline bool Bit128EqualZero(int128 x, int len) { int128 tmp; if (len >= 64) { tmp.high = (x.high << (128 - len)) | ( x.low >> (len - 64)); tmp.low = x.low << (128 - len); } else { tmp.high = x.low << (64- len); tmp.low = 0; } return (tmp.high == 0) && (tmp.low == 0); } static inline bool Bit128CompareRightmost1(int128 x, int128 y) { int128 tmp1, tmp2; tmp1 = Bit128SmearRightmost1(x); tmp2 = Bit128SmearRightmost1(y); tmp1.low = ~tmp1.low; tmp1.high = ~tmp1.high; tmp2.low = ~tmp2.low; tmp2.high = ~tmp2.high; if (tmp1.high == tmp2.high) return tmp1.low < tmp2.low; else return tmp1.high < tmp2.high; } }; #endif /* static inline int pairAlign(int64 * orgkey, int64 * entrykey, int length, int nError, int nGap) { int i, e; int64 diff[4]; int128 d[10][10]; int128 offset[10]; for (i = - nGap; i <= nGap; i++) { //get the bit-difference vector if (i == 0) { diff[0] = orgkey[0] ^ (entrykey[0] >> (i * 3)); diff[1] = orgkey[1] ^ (entrykey[1] >> (i * 3)); diff[2] = orgkey[2] ^ (entrykey[2] >> (i * 3)); diff[3] = orgkey[3] ^ (entrykey[3] >> (i * 3)); } else if (i 
< 0) { diff[0] = orgkey[0] ^ (entrykey[0] >> (-i * 3)); diff[1] = orgkey[1] ^ ((entrykey[1] >> (-i * 3)) | (entrykey[0] << (64 + i * 3))); diff[2] = orgkey[2] ^ ((entrykey[2] >> (-i * 3)) | (entrykey[1] << (64 + i * 3))); diff[3] = orgkey[3] ^ ((entrykey[3] >> (-i * 3)) | (entrykey[2] << (64 + i * 3))); } else if (i > 0) { diff[0] = orgkey[0] ^ ((entrykey[0] << (i * 3)) | (entrykey[1] >> (64 - i * 3))); diff[1] = orgkey[1] ^ ((entrykey[1] << (i * 3)) | (entrykey[2] >> (64 - i * 3))); diff[2] = orgkey[2] ^ ((entrykey[2] << (i * 3)) | (entrykey[3] >> (64 - i * 3))); diff[3] = orgkey[3] ^ (entrykey[3] << (i * 3)); } //use 1 bit to represent a character diff[3] = (diff[3] | (diff[3] >> 1) | (diff[3] >> 2) | (diff[2] << 63) | (diff[2] << 62)) & 0x9249249249249249LL; diff[2] = ((diff[2] >> 2) | (diff[2] >> 3) | (diff[2] >> 4) | (diff[1] << 63)) & 0x9249249249249249LL; diff[1] = ((diff[1] >> 1) | (diff[1] >> 2) | (diff[1] >> 3)) & 0x9249249249249249LL; diff[0] = (diff[0] | (diff[0] >> 1) | (diff[0] >> 2)) & 0x9249249249249249LL; //push all valid bits to the rightmost diff[0] = BitPushRight(diff[0]); diff[1] = BitPushRight(diff[1]); diff[2] = BitPushRight(diff[2]); diff[3] = BitPushRight(diff[3]); d[0][nGap + i].high = diff[0]; d[0][nGap + i].low = (diff[1] << 44) | (diff[2] << 22) | diff[3]; offset[nGap + i] = d[0][nGap + i]; } int128 cur; for (e = 1; e <= nError; e++) { // -nGap, -nGap + 1, ..., -1, 0, 1, ..., nGap - 1, nGap for (i = 0; i <= 2 * nGap; i++) { //apply mismatch d[e][i] = Bit128RemoveRightmost1(d[e - 1][i]); //apply deletion if (i < 2 * nGap) { cur = Bit128SmearRightmost1(d[e - 1][i + 1]); cur = Bit128And(offset[i], cur); if (Bit128CompareRightmost1(d[e][i], cur)) d[e][i] = cur; } //apply insert if (i > 0) { cur = Bit128LeftShift1(d[e - 1][i - 1]); cur = Bit128SmearRightmost1(cur); cur = Bit128And(cur, offset[i]); if (Bit128CompareRightmost1(d[e][i], cur)) d[e][i] = cur; } if (Bit128EqualZero(d[e][i], length)) return e; } } return 100; } */ 
wham/VERSION0000644001532600153260000000000612054641424012106 0ustar yinanyinan0.1.5 wham/perfcounters.cpp0000644001532600153260000000371212003705361014263 0ustar yinanyinan/* * Copyright 2009, Spyros Blanas */ #include "perfcounters.h" #if defined(__sparc__) #include using namespace std; #include #include #include "exceptions.h" #endif void PerfCounters::init() { #if defined(__i386__) || defined(__x86_64__) // Nothing to do, `wrmsr` must be called from ring 0. // #elif defined(__sparc__) if ((cpc = cpc_open(CPC_VER_CURRENT)) == NULL) { cout << "perf counters unavailable: " << strerror(errno) << endl; throw PerfCountersException(); } #else #error Performance counters not known for this architecture. #endif } void PerfCounters::destroy() { #if defined(__i386__) || defined(__x86_64__) // Nothing to do, `wrmsr` must be called from ring 0. // #elif defined(__sparc__) cpc_close(cpc); #else #error Performance counters not known for this architecture. #endif } void PerfCounters::threadinit() { #if defined(__i386__) || defined(__x86_64__) // Nothing to do, `wrmsr` must be called from ring 0. 
// #elif defined(__sparc__) char *event0 = NULL, *event1 = NULL; cpc_set_t *set; if ((event0 = getenv("EVENT0")) == NULL) { event0 = "L2_dmiss_ld"; } if ((event1 = getenv("EVENT1")) == NULL) { event1 = "Instr_cnt"; } if ((set = cpc_set_create(cpc)) == NULL) { cout << "could not create set: " << strerror(errno) << endl; throw PerfCountersException(); } if (cpc_set_add_request(cpc, set, event0, 0, CPC_COUNT_USER, 0, NULL) == -1) { cout << "could not add first request: " << strerror(errno) << endl; throw PerfCountersException(); } if (cpc_set_add_request(cpc, set, event1, 0, CPC_COUNT_USER, 0, NULL) == -1) { cout << "could not add second request: " << strerror(errno) << endl; throw PerfCountersException(); } if (cpc_bind_curlwp(cpc, set, 0) == -1) { cout << "cannot bind lwp: " << strerror(errno) << endl; throw PerfCountersException(); } #else #error Performance counters not known for this architecture. #endif } wham/MANUAL0000644001532600153260000001775411766023737011772 0ustar yinanyinanSTEPS: 1.compile. make 2.build index. Before performing alignments, you have to build a new index. The index is stored on disk and will be loaded when performing alignments. The WHAM tarball comes with a sample sequence containing the first 100,000 bases of Human Genome Chromosome 1. As an example, we issue the following command to build a index on the sample sequence. If you see an error “command not found”, add ./ before wham-build. wham-build -l 60 -v 2 --mask sequences/chr1_100k.fa indexes/idx The options -l 60 and -v 2 specify that the index is used for aligning 60bps reads with up to 2 mismatches. This command prints the message “Complete” if the index is built successfully. The directory indexes should contain four new files: idx.head.whm, idx.interval.whm, idx.sequence.whm, and idx.i0.whm. 3.align reads. With the pre-built index, we can use WHAM aligner to align reads. The WHAM tarball includes a sample single-end read file sample.fq and a pair of paired-end read files. 
We show the following examples to demonstrate how to use the WHAM aligner. If you see an error “command not found”, add ./ before wham, i.e. type in “./wham”. Example 1 wham reads/sample.fq indexes/idx output This command aligns all reads in the file reads/sample.fq, and prints out one alignment per read into the file output. WHAM finds three alignments as shown below. The first alignment is on the forward strand (+) and has 1 mismatch (C⇒T). The second alignment is an exact match on the forward strand. The last match is on the reverse strand (-). + chr1 51614 CCAGCACTTTGGGAGGCCGAGGCGGGTGGATCACGAGGTCAGGAGATCGAGACCATCCTG 26:C>T + chr1 83977 GAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA - chr1 17446 CACAGCGTGCACTGTGGGGTCCCAGGCCTCCCGAGCCGAGCCACCCGTCACCCCCTGGCT Example 2 wham -a --best reads/sample.fq indexes/idx output Specifying -a instructs WHAM to report all valid alignments for each read. Option --best results in a best-to-worst order on the reported alignments. WHAM finds 2 alignments for the first read, 6 alignments for the second read, and 1 alignment for the third read. All alignments are printed in a sorted order. + chr1 51614 CCAGCACTTTGGGAGGCCGAGGCGGGTGGATCACGAGGTCAGGAGATCGAGACCATCCTG 26:C>T - chr1 62070 CAGGATGGTCTCGATCTCCTGACCTCGTGATCCACCCGCCTCGGCCTCCCAAAGTGCTGG 22:T>G,41:T>C + chr1 83977 GAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA + chr1 83981 GAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA - chr1 54712 TTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC + chr1 83973 GAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA 0:A>G - chr1 54716 TTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC 1:C>T - chr1 54708 TTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC 56:T>C,58:A>T - chr1 17446 CACAGCGTGCACTGTGGGGTCCCAGGCCTCCCGAGCCGAGCCACCCGTCACCCCCTGGCT Example 3 wham -k 3 -m 5 reads/sample.fq indexes/idx output Specifying -k 3 instructs WHAM to report up to 3 valid alignments per read. 
Specifying -m 5 instructs WHAM to refrain from reporting any alignments for reads that have more than 5 valid alignments. In this case, a total of 3 valid alignments are reported. All alignments of the second read are discarded, because it has 6 valid alignments (see Example 2). + chr1 51614 CCAGCACTTTGGGAGGCCGAGGCGGGTGGATCACGAGGTCAGGAGATCGAGACCATCCTG 26:C>T - chr1 62070 CAGGATGGTCTCGATCTCCTGACCTCGTGATCCACCCGCCTCGGCCTCCCAAAGTGCTGG 22:T>G,41:T>C - chr1 17446 CACAGCGTGCACTGTGGGGTCCCAGGCCTCCCGAGCCGAGCCACCCGTCACCCCCTGGCT Example 4 wham -v 3 reads/sample.fq indexes/idx output Specifying -v 3 instructs WHAM to report alignments with up to 3 mismatches. However, WHAM does not guarantee that it can find all valid alignments with three mismatches. Note that since the index “idx” is built with the option -v 2 (see the command for building the index), all alignments with up to 2 mismatches can be found by WHAM. In this case, WHAM reports one more alignment, with 3 mismatches, for the fifth read. + chr1 51614 CCAGCACTTTGGGAGGCCGAGGCGGGTGGATCACGAGGTCAGGAGATCGAGACCATCCTG 26:C>T + chr1 83977 GAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA - chr1 17446 CACAGCGTGCACTGTGGGGTCCCAGGCCTCCCGAGCCGAGCCACCCGTCACCCCCTGGCT - chr1 17962 GCGGGTGCGTCTATGCAGGCCAGGGTCCTGGGCGCCCGTGAAGATGGAGCCATAGTCCTG 5:T>G,47:C>A,54:G>T Example 5 wham -a -e 60 reads/sample.fq indexes/idx output Specifying -e 60 instructs WHAM to report alignments with up to 2 mismatches. In addition, the sum of the Phred quality scores at all mismatched positions cannot exceed 60. Compared with the results of Example 2, most of the alignments with mismatches are filtered out by the option -e 60. 
+ chr1 51614 CCAGCACTTTGGGAGGCCGAGGCGGGTGGATCACGAGGTCAGGAGATCGAGACCATCCTG 26:C>T + chr1 83977 GAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA + chr1 83981 GAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA - chr1 54712 TTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC - chr1 17446 CACAGCGTGCACTGTGGGGTCCCAGGCCTCCCGAGCCGAGCCACCCGTCACCCCCTGGCT Example 6 wham -1 reads/sample_pair_1.fq -2 reads/sample_pair_2.fq indexes/idx output This command takes paired-end reads from the two files, and outputs valid alignments to the file “output”. Example 7 wham -t 16 reads/sample.fq indexes/idx output Specifying -t 16 instructs WHAM to align reads with 16 concurrent threads. The output is exactly the same as that of Example 1. ------------------------------------------------------- WHAM INDEX BUILDER Usage: wham-build [options]* -l -l specify the length of short reads comma-separated list of files with ref sequences write wham data to files with this dir/basename Options: -v report hits with <=v errors (0-5), ignore qualities -p specify the number of fragments for alignments -m discard substring appearing more than times (default: 100). -b specify the number of buckets --mask keep masked characters in the sequences (default: on) --unmask discard masked characters in the sequences. Masks are treated as Ns --version print version information -h/--help print this usage message WHAM ALIGNER Usage: wham [options]* { | -1 -2 } comma-separated list of files containing unpaired reads comma-separated list of files containing upstream mates comma-separated list of files containing downstream mates write wham data to files with this dir/basename file to write alignments to Input options: -l use first bases in each read Alignment options: -v specify the max number of errors in a reported alignment. -g/--gap specify the max number of gaps in a reported alignment. 
-e/--maqerr max sum of mismatch quals across alignment --nofw/--norc do not align to forward/reverse-complement ref strand --nofr/--norf do not align to mate1/mate2 strand: fw/rev, rev/fw. -I/--minins minimum insert size for paired-end alignment (default: 0). -X/--maxins maximum insert size for paired-end alignment (default: 250). Reporting options: -k report up to valid alignemtns per read (default: 1). -a/--all report all valid alignments per read. --best reprot valid alignments in a sorted order of quality. -m discard reads with more than valid alignmetns. Output options: -S/--sam write alignment in SAM format --al wirte aligned reads/pairs to file(s) --un write unaligned reads/pairs to file(s) Performance options: --step specify the number of indexes that fit into memory. Other options: --version print version information -h/--help print this usage message wham/edit_distance.h0000644001532600153260000000306112003705361014005 0ustar yinanyinan/* * nw.h for program nw. * TODO : integrate gap penalty ( currently can have hard coded ) * return score/pos in int or just Int. */ #include #include #include #include //#define EDIT_DEBUG 1 //#ifdef EXE #define MAX 100 //#endif typedef struct { int gap_penalty; int mm_penalty; int match_reward; int gap_ext_penalty; } score_mat; // all 1 means we are looking for edit distance static score_mat obj_score_mat = { 1, 1, 1, 1 }; typedef struct { int score; // edit_distance int pos; // position of db_seq at which alignment found. 
this happens when we remove leading '-' from query int qgap; // number of gap in query (string2) int dgap; // number of gap in database ( string1) int mm; // number of mismatch in alignment int n_iden; int align_len; } scoreinfo; /* * parameter - * q_seq * db_seq * q_len * db_len * query_align * db_align * prm ( print matrix) */ scoreinfo edit_distance(char *, char *, int, int, char *, char *, int); scoreinfo edit_distance_align(int **, char **, char *, char *, int, int, char *, char *, int); /*typedef struct{ char al_str1[MAX]; char al_str2[MAX]; scoreinfo score; int pos; }align_result; */ void matrix_init(int **, char **, int, int, int); void print_al(char *, char *); void print_matrix(int ** const, char *, char *, int, int); void print_traceback(char ** const, char *, char *, int, int); int min(int, int, int, char *); void initialize_str(char *, int); void reverse_str(char *, int); void print_score(scoreinfo s); void edit_distance_init(); wham/model.cpp0000644001532600153260000001430312054641424012647 0ustar yinanyinan/** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ /* $Id: model.cpp 165 2012-11-26 10:23:16Z yinan $ */ #include #include #include #include #include "model.h" #include "error.h" /* * AlignerModel::computeNumIndex(int nError, int nPartition) * Given the number of errors and the number of partitions, * compute the number of required hash tables, according to the * formula C(nPartition-1, nMismatch). */ int AlignerModel::computeNumIndex(int nError, int nPartition) { int i; unsigned long long x = 1; for (i = nPartition - nError; i <= nPartition - 1; i++) x *= i; for (i = 1; i <= nError; i++) x /= i; return (int) x; } /* * AlignerModel::computeNumLookup(int nError, int nPartition) * Given the number of errors and the number of partitions, * compute the number of lookups for each alignment, according * to the formula C(nPartition, nMismatch). */ int AlignerModel::computeNumLookup(int nError, int nPartition) { int i; unsigned long long x = 1; for (i = nPartition - nError + 1; i <= nPartition; i++) x *= i; for (i = 1; i <= nError; i++) x /= i; return (int) x; } /* * AlignerModel::estimateNumPartition(int nEntry, int length, int nError) * Estimate the optimal number of partitions based on a cost model, given * parameters of the aligner. */ int AlignerModel::estimateNumPartition(unsigned int nEntry, int length, int nError, bool memory) { int p, bestp; unsigned int nIndex, nLookup, nBucket; double nSpace; double costLookup, cost, min; double p0, p1, p2; min = 1000000000; bestp = 0; for (p = nError + 1; p <= nError + 5; p++) { if (memory && estimateIndexSpace(nEntry, length, nError, p) >= getFreeMemory()) continue; nIndex = computeNumIndex(nError, p); nLookup = computeNumLookup(nError, p); nSpace = pow(4.0, length / p * (p - nError)); nBucket = (double) nEntry < nSpace ? 
nEntry : (unsigned int) nSpace; costLookup = 0; /* probability of empty bucket */ p0 = pow(1 - double(1) / nBucket, nEntry); costLookup += p0; /* probability of non-overflow bucket */ p1 = nEntry * (double(1) / nBucket) * pow(1 - double(1) / nBucket, nEntry - 1); costLookup += 2 * p1; /* probability of overflow bucket */ p2 = 1 - p0 - p1; costLookup += p2 * ((nEntry - nEntry * p1) / (nBucket * p2)); cost = costLookup * nLookup; if (cost < min) { min = cost; bestp = p; } } return bestp; } bool AlignerModel::isFitMemory(unsigned int nEntry, int length, int nMismatch, int nPartition) { unsigned int szIndex; unsigned int szMemory; szIndex = estimateIndexSpace(nEntry, length, nMismatch, nPartition); szMemory = getFreeMemory(); elog(DEBUG1, "Estimated index size: %d MB\n", szIndex); elog(DEBUG1, "Free memory size: %d MB\n", szMemory); /* left 200MB free space*/ return (szIndex + 200) < szMemory; } unsigned int AlignerModel::getNumHashtableFitMemory(unsigned int nEntry, int length, int nMismatch, int nPartition) { unsigned int szSeq; unsigned int szIndex; unsigned int szMemory; unsigned int nHashtable; szSeq = nEntry / 32 * 3 * sizeof(int) / 1024 / 1024; nHashtable = computeNumIndex(nMismatch, nPartition); szIndex = estimateHashtableSpace(nEntry, length, nMismatch, nPartition); szMemory = getFreeMemory(); elog(DEBUG1, "Estimated hashtable size: %d MB\n", szIndex); elog(DEBUG1, "Free memory size: %d MB\n", szMemory); if ((szMemory - szSeq - 200) / szIndex < nHashtable) return (szMemory - szSeq - 200) / szIndex; else return nHashtable; } unsigned int AlignerModel::getFreeMemory() { unsigned int memTotal, memFree, memBuffer, memCache; FILE * file; // return 4096; file = fopen("/proc/meminfo", "r"); if (file == NULL ) //on non-linux system return 0; fscanf(file, "MemTotal: %d kB\n", &memTotal); fscanf(file, "MemFree: %d kB\n", &memFree); fscanf(file, "Buffers: %d kB\n", &memBuffer); fscanf(file, "Cached: %d kB\n", &memCache); fclose(file); return memTotal / 1024; // return 
(memFree + memBuffer + memCache)/1024; } unsigned int AlignerModel::estimateIndexSpace(unsigned int nEntry, int length, int nError, int nPartition) { int nHashtable = 0; unsigned int szSeq; unsigned int szHashtable; unsigned int szIndex; szSeq = nEntry / 32 * 3 * sizeof(int) / 1024 / 1024; nHashtable = computeNumIndex(nError, nPartition); szHashtable = estimateHashtableSpace(nEntry, length, nError, nPartition); szIndex = szSeq + szHashtable * nHashtable; if (szIndex == 0) szIndex = 1; return szIndex; } unsigned int AlignerModel::estimateHashtableSpace(unsigned int nEntry, int length, int nError, int nPartition) { unsigned int szHashtable; unsigned int nBucket; double nSpace, p0, p1, p2; nSpace = pow(4.0, length / nPartition * (nPartition - nError)); nBucket = (double) nEntry < nSpace ? nEntry : (unsigned int) nSpace; /* probability of empty bucket */ p0 = pow(1 - double(1) / nBucket, nEntry); /* probability of non-overflow bucket */ p1 = nEntry * (double(1) / nBucket) * pow(1 - double(1) / nBucket, nEntry - 1); /* probability of overflow bucket */ p2 = 1 - p0 - p1; // nSpace = pow(8.0, length / nPartition * (nPartition - nError)); // nBucket = (double)nEntry < nSpace? 
nEntry : (unsigned int)nSpace; szHashtable = (unsigned int) ((nEntry - nEntry * p1 + nBucket) / 1024 / 1024 * sizeof(int)); if (szHashtable == 0) szHashtable = 1; return szHashtable; } wham/reads/0000755001532600153260000000000012054751660012144 5ustar yinanyinanwham/reads/sample_pair_2.fq0000644001532600153260000000155611552457724015226 0ustar yinanyinan@HWI-EAS243:1:9:567:68#0 GTTTCTGGGCTTTTTTTTTAAAAAAACTATTCTAATCTTTTTTTTTATTCTCAATAATTT +HWI-EAS243:1:9:567:68#0 aabaaa`aaaaaa_aaaaa````a`aVaa``a```^``U^^a`VH[^b\^]YYa^^^^^^ @HWI-EAS243:1:30:1016:1182#0 TGGAACTCCCACAGGGAAGGTTCTTATCTTTGCGTGCACAACACAGAGCCCTCTGGGTAG +HWI-EAS243:1:30:1016:1182#0 a_``a^a`a`_`a_`_a__`a___a^`_a^^_a^^^a]_^a^^_a^^_a^^^a\^^a]^^ @HWI-EAS243:1:2:919:1212#0 TCTATACTTTAAGGAACAAAGATATGAAGTTTGCAATGAGAAGGCTGAATCAACATATTT +HWI-EAS243:1:2:919:1212#0 aaaaa`a`a`\``^a`^X^a`a```^^_X\\XZ[^V_X]_YY]]]^]\XR]]ZU]XZXUV @HWI-EAS243:1:8:352:1514#0 GTATGTATCAGGAAAAGACAATACCCTTCAAACTTTGAGAGTTTACATCAGAAAGAAAAC +HWI-EAS243:1:8:352:1514#0 a^aa_`ZGV^[U`^\`^]`__a_`_[Y_^_`^ZUKZ^__^_^_^^XNY[\[[^`^^\`\V @HWI-EAS243:1:4:101:827#0 CATTGTTGAAGATGCCCTCATTCATGGTGTGGGGGTCAGTGAGCTTAATGTGTGTCAAGA +HWI-EAS243:1:4:101:827#0 aaa````a\_\a`aaa`\\a^aS``aXR_aa`_\``aaY]`]aY]`[G[^XXYYE\`X]] wham/reads/sample_pair_1.fq0000644001532600153260000000155611552457724015225 0ustar yinanyinan@HWI-EAS243:1:9:567:68#0 AAAAGTTAACCCATATGGAATGCAATGGAGGAAATCAATGACATATCAGATCTAGAAACT +HWI-EAS243:1:9:567:68#0 aabaaa`aaaaaa_aaaaa````a`aVaa``a```^``U^^a`VH[^b\^]YYa^^^^^^ @HWI-EAS243:1:30:1016:1182#0 CACAGAAGCTCTGCCTGCCTTTGCTGGCCAGCTGGGCTGAGCGGGCCTGGGAATTAAGGC +HWI-EAS243:1:30:1016:1182#0 a_``a^a`a`_`a_`_a__`a___a^`_a^^_a^^^a]_^a^^_a^^_a^^^a\^^a]^^ @HWI-EAS243:1:2:919:1212#0 TCACATCACCGTAGTGGTTTTGTTTTTTGCTCCATGCATGTTTCTCTACGTGTGGCCTTT +HWI-EAS243:1:2:919:1212#0 aaaaa`a`a`\``^a`^X^a`a```^^_X\\XZ[^V_X]_YY]]]^]\XR]]ZU]XZXUV @HWI-EAS243:1:8:352:1514#0 CTTATTCATTCAGAAAACATACTAAGTGCTGGCTCTTTTTCATGTCCTTTATCAAGTTTG +HWI-EAS243:1:8:352:1514#0 
a^aa_`ZGV^[U`^\`^]`__a_`_[Y_^_`^ZUKZ^__^_^_^^XNY[\[[^`^^\`\V @HWI-EAS243:1:4:101:827#0 AACACATTTTCAGTGTTGAATGATAAATTTTGGAATAGTTAACAGATGATAAAAGTGTTG +HWI-EAS243:1:4:101:827#0 aaa````a\_\a`aaa`\\a^aS``aXR_aa`_\``aaY]`]aY]`[G[^XXYYE\`X]] wham/reads/sample.fq0000644001532600153260000000160211540751372013753 0ustar yinanyinan@HWI-EAS243:1:9:567:68#0/1 CCAGCACTTTGGGAGGCCGAGGCGGGTGGATCACGAGGTCAGGAGATCGAGACCATCCTG +HWI-EAS243:1:9:567:68#0/1 aabaaa`aaaaaa_aaaaa````a`aVaa``a```^``U^^a`VH[^b\^]YYa^^^^^^ @HWI-EAS243:1:30:1016:1182#0/1 GAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA +HWI-EAS243:1:30:1016:1182#0/1 a_``a^a`a`_`a_`_a__`a___a^`_a^^_a^^^a]_^a^^_a^^_a^^^a\^^a]^^ @HWI-EAS243:1:2:919:1212#0/1 AGCCAGGGGGTGACGGGTGGCTCGGCTCGGGAGGCCTGGGACCCCACAGTGCACGCTGTG +HWI-EAS243:1:2:919:1212#0/1 aaaaa`a`a`\``^a`^X^a`a```^^_X\\XZ[^V_X]_YY]]]^]\XR]]ZU]XZXUV @HWI-EAS243:1:8:352:1514#0/1 GTAATCCCTGGGCCCTAACTCACTCATCCCAACTATTCACTCACTGCCTTGCCCCACACC +HWI-EAS243:1:8:352:1514#0/1 a^aa_`ZGV^[U`^\`^]`__a_`_[Y_^_`^ZUKZ^__^_^_^^XNY[\[[^`^^\`\V @HWI-EAS243:1:4:101:827#0/1 CAGGACTATGGCTCCATCTTCACGGGCGCCCAGGACCCTGGCCTGCATAGACGCACCCGC +HWI-EAS243:1:4:101:827#0/1 aaa````a\_\a`aaa`\\a^aS``aXR_aa`_\``aaY]`]aY]`[G[^XXYYE\`X]] wham/embedhash.h0000644001532600153260000000562612005144676013147 0ustar yinanyinan#ifndef _EMBEDHASH_H_
#define _EMBEDHASH_H_

/**
 * WHAM - high-throughput sequence aligner
 * Copyright (C) 2011 WHAM Group, University of Wisconsin
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/* $Id: hash.h 152 2012-07-22 10:52:53Z yinan $ */

/*
 * NOTE(review): the header names on the two bare #include lines below are
 * missing in the reviewed copy of this file; restore them from the upstream
 * source (this header uses FILE, so <stdio.h> is needed at least).
 */
#include
#include
#include "lib.h"
#include "sequence.h"
#include "hitset.h"

#define COMPRESS_TABLE_SIZE 2147483648LLU

/*
 * EmbedHashTable
 * A hash table over a reference sequence: a bucket array plus an overflow
 * pool.  The declared operations cover a pre-processing pass
 * (preProcessInit / preProcessInsert / preProcessEnd), building (buildInit /
 * insert / sortList), lookup, and persistence (save / load).  Only the
 * declarations are visible here; see the implementation file for the exact
 * behaviour of each step.
 */
class EmbedHashTable {
private:
	bool compressedTable; /* NOTE(review): presumably related to COMPRESS_TABLE_SIZE -- confirm */
	int length; /* the length of query sequence (characters) */
	int lenSeq; /* the length of query sequence (bits) */
	int nMismatch; /* the number of allowed errors */
	uint32 numBucket; /* the number of buckets */
	uint32 numEntry; /* the number of entries */
	uint32 numOverflowEntry;/* the number of overflow entries (stored in the overflow list) */
	uint32 numCollision; /* the number of buckets with collisions */
	uint32 numEmpty; /* the number of empty buckets */
	uint32 * buckets; /* bucket array */
	uint32 * overflowPool; /* overflow pool array */
	CompactSequence * sequence; /* the reference sequence */

	unsigned char * emptyBits; /* the bitmap for empty buckets (only used in building phase) */
	unsigned char * collisionBits; /* the bitmap for collision buckets (only used in building phase) */
	unsigned char * overflowBits; /* NOTE(review): presumably the bitmap for overflow buckets -- confirm */

	uint32 maxScan; /* value returned by getMaxScan(); NOTE(review): presumably set via setScanThreshold() -- confirm */

	/* error model, set by setErrorModel() */
	int nMaxError;
	int nMaxGap;
	int maxQual;

//	const static uint32 nHistogram = 4;
//	uint32 histogram[nHistogram];
public:
	EmbedHashTable();
	~EmbedHashTable();

	void init(CompactSequence * seq, int len, unsigned int nBucket,
			int numMismatch, int nPartition);

	int preProcessInit();
	int preProcessEnd();
	int buildInit();

	void preProcessInsert(int64 * key);
	/*inline*/
	void insert(int64 * key, unsigned int offset);
	/*inline*/
	unsigned int lookup(int64 * orgkey, int64 * key, int keyOffset,
			char * quals, strand s, int rid, HitSet * hits, bool noGap = false);
	int sortList();
	int check(int num);
	int save(FILE * file);
	int load(FILE * file, CompactSequence * seq);
	int remove();
	unsigned int nextPrime(unsigned int num);

	void setScanThreshold(double r);

	/* Returns maxScan (a uint32) converted to int. */
	int getMaxScan() {
		return maxScan;
	}

	/* Stores the three limits in nMaxError, nMaxGap and maxQual. */
	void setErrorModel(int maxerr, int maxgap, int maxqual) {
		nMaxError = maxerr;
		nMaxGap = maxgap;
		maxQual = maxqual;
	}
};

#endif
wham/COPYING0000644001532600153260000010451311540734433012103 0ustar yinanyinan GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. 
Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. 
Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. 
A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. 
All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 
When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. 
This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . wham/error.cpp0000644001532600153260000000224112003705361012671 0ustar yinanyinan/** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: error.cpp 157 2012-07-25 05:58:09Z yinan $ */ #include "error.h" #include #include //int ELOG_LEVEL = DEBUG1; int ELOG_LEVEL = INFO; void elog(int level, char * format, ...) 
{ va_list args; char *filename = __FILE__; int lineno = __LINE__; if (level <= ELOG_LEVEL) { va_start(args, format); vprintf(format, args); va_end(args); } } wham/lib.h0000644001532600153260000000221312003705361011752 0ustar yinanyinan#ifndef _LIB_H_ #define _LIB_H_ /** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: lib.h 157 2012-07-25 05:58:09Z yinan $ */ #ifndef WIN32 typedef unsigned long long int64; #else typedef unsigned __int64 int64; #endif typedef unsigned int uint32; #ifndef WIN32 #define RAND() (rand()) #else #define RAND() ((rand() << 15) | rand()) #endif #define CACHE_LINE_SIZE 128 #define MAX_RESULT_POOL 16 #endif wham/sorter.cpp0000644001532600153260000000442412003705361013063 0ustar yinanyinan/** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: main.cpp 135 2012-03-23 07:47:27Z yinan $ */ #include #include #include #include #include #include #ifndef WIN32 #include #endif #include #include #include "error.h" #include "aligner.h" #include "rdtsc.h" #include "util.h" using namespace std; char pgversion[] = "0.1.3"; char * pgcommand; int main(int argc, char* argv[]) { char basepath[256] = ""; char newpath[256] = ""; srand((unsigned int) time(NULL)); strcpy(basepath, argv[1]); strcpy(newpath, argv[2]); if (basepath[0] == '\0') { elog(ERROR, "specify the base name.\n"); printf("See usage message by specifying -h/--help.\n"); exit(1); } if (newpath[0] == '\0') { elog(ERROR, "specify the new name.\n"); printf("See usage message by specifying -h/--help.\n"); exit(1); } elog(INFO, "loading WHAM indexes...\n"); /* load index header */ Aligner * aligner = new Aligner(basepath); aligner->printInfo(); /* load hash tables */ int ret = aligner->loadHashtables(basepath); if (ret != SUCCESS) { elog(ERROR, "failed to load indexes.\n"); } aligner->sortList(); /* elog(INFO, "saving indexes...\n"); ret = aligner->saveHead(newpath); if (ret != SUCCESS) { elog(ERROR, "failed to save index head.\n"); return ret; } ret = aligner->saveIndex(newpath, 0); if (ret != SUCCESS) { elog(ERROR, "failed to save index.\n"); return ret; } */ return SUCCESS; } wham/rdtsc.h0000644001532600153260000000222512003705361012326 0ustar yinanyinan#ifndef RDTSC_H #define RDTSC_H /* * Copyright 2007, Spyros Blanas * Distributed under the terms of the GNU General Public License, verion 3. */ /* Programmed by Spyros Blanas for 764/758 joint project, November 2007. */ /* IT CONTAINS HARDCODED VALUES, DOESN'T WORK FOR ANY MACHINE */ #ifdef __cplusplus extern "C" { #endif #if !defined(__i386__) && !defined(__x86_64__) && !defined(__sparc__) //#warning No supported architecture found -- timers will return junk. 
/*
 * curtick - read the CPU's cycle/timestamp counter.
 *
 * x86 (32 and 64 bit): executes rdtsc and combines the two 32-bit halves
 * returned in EAX/EDX. SPARC: reads the %tick register.
 * On any other architecture no counter is read and 0 is returned, so timers
 * built on top of this function report 0 instead of indeterminate values.
 */
static __inline__ unsigned long long curtick() {
	unsigned long long tick = 0;
#if defined(__i386__)
	unsigned long lo, hi;
	/* 0x0f 0x31 is the rdtsc opcode, spelled out for old assemblers */
	__asm__ __volatile__ (".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));
	tick = ((unsigned long long) hi << 32) | lo;
#elif defined(__x86_64__)
	unsigned long lo, hi;
	__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
	tick = ((unsigned long long) hi << 32) | lo;
#elif defined(__sparc__)
	__asm__ __volatile__ ("rd %%tick, %0" : "=r" (tick));
#endif
	return tick;
}

/* startTimer - store the current tick count in *t. */
static __inline__ void startTimer(unsigned long long* t) {
	*t = curtick();
}

/*
 * stopTimer - replace the start value stored in *t by startTimer() with the
 * number of ticks elapsed since then.
 */
static __inline__ void stopTimer(unsigned long long* t) {
	*t = curtick() - *t;
}
*/ /* $Id: writer.h 157 2012-07-25 05:58:09Z yinan $ */ #include #include #include "hitset.h" #include "sequence.h" #include "short.h" #define SAM_FLAG_PAIRED 0x1 #define SAM_FLAG_MAPPED_PAIRED 0x2 #define SAM_FLAG_UNMAPPED 0x4 #define SAM_FLAG_MATE_UNMAPPED 0x8 #define SAM_FLAG_QUERY_STRAND 0x10 #define SAM_FLAG_MATE_STRAND 0x20 #define SAM_FLAG_FIRST_IN_PAIR 0x40 #define SAM_FLAG_SECOND_IN_PAIR 0x80 #define SAM_FLAG_NOT_PRIMARY 0x100 #define SAM_FLAG_FAILED 0x200 #define SAM_FLAG_DUPLICATE 0x400 class Writer { public: Writer(CompactSequence * _sequence, ShortRead * _reader, char * fname) { sequence = _sequence; reader1 = _reader; reader2 = NULL; length = _reader->getReadLength(); file = fopen(fname, "w"); assert(file != NULL); } ~Writer() { fflush(file); fclose(file); } virtual void writeHead() { } virtual void writeAlignment(int readId, HitSet * set) = 0; protected: char * reverseSequence(char * str2, char * str1); protected: FILE * file; CompactSequence * sequence; ShortRead * reader1; ShortRead * reader2; int length; }; /** * writer class for normal format output */ class SimpleWriter: public Writer { public: SimpleWriter(CompactSequence * _sequence, ShortRead * _reader, char * fname) : Writer(_sequence, _reader, fname) { } void writeAlignment(int readId, HitSet * set) { if (set->getNumHits() > 0) writeValidAlignment(readId, set); } protected: void writeField(int64 * query, int64 * reference, strand s); void writeField(char * str, char * query, char * reference, strand s, ErrorVector error); private: void writeValidAlignment(int readId, HitSet * set); }; class SimplePairWriter: public SimpleWriter { public: SimplePairWriter(CompactSequence * _sequence, ShortRead * _reader1, ShortRead * _reader2, char * fname) : SimpleWriter(_sequence, _reader1, fname) { reader2 = _reader2; } void writeAlignment(int readId, HitSet * set) { if (set->getNumHits() > 0) writeValidAlignment(readId, set); } private: void writeValidAlignment(int readId, HitSet * set); }; /** * writer 
class for single-end SAM output */ class SamWriter: public Writer { public: SamWriter(CompactSequence * _sequence, ShortRead * _reader, char * fname) : Writer(_sequence, _reader, fname) { } void writeHead(); void writeAlignment(int readId, HitSet * set) { if (set->getNumHits() == 0) writeInvalidAlignment(readId, set); else writeValidAlignment(readId, set); } void writeOptionalField(char * query, char * reference, ErrorVector error); void getMDfield(char * str, char * query, char * reference, ErrorVector error); void getCIGAR(char * str, char * query, char * reference, ErrorVector error); private: void writeValidAlignment(int readId, HitSet * set); void writeInvalidAlignment(int readId, HitSet * set); }; /** * writer class for paired-end SAM output */ class SamPairWriter: public SamWriter { public: SamPairWriter(CompactSequence * _sequence, ShortRead * _reader1, ShortRead * _reader2, char * fname) : SamWriter(_sequence, _reader1, fname) { reader2 = _reader2; } void writeAlignment(int readId, HitSet * set) { if (set->getNumHits() == 0) writeInvalidAlignment(readId, set); else writeValidAlignment(readId, set); } private: void writeValidAlignment(int readId, HitSet * set); void writeInvalidAlignment(int readId, HitSet * set); }; /** * writer class for raw format. Raw format is used for * storeing the intermediate results in pipeline mode. */ class RawWriter: public Writer { public: RawWriter(CompactSequence * _sequence, ShortRead * _reader, char * fname) : Writer(_sequence, _reader, fname) { } void writeAlignment(int readId, HitSet * set) { /* In RAW formate, we have to output all hits even the number of * hits exceeds -m . Otherwise (discard all hits), the merged * results may be wrong. * we use getNumAllHits() instead of getNumHIts(). 
*/ if (set->getNumAllHits() > 0) writeValidAlignment(readId, set); } private: void writeValidAlignment(int readId, HitSet * set); }; class RawReader { public: RawReader() { file = NULL; } ~RawReader() { if (file != NULL ) fclose(file); } bool init(char * fname, int _maxid) { maxid = _maxid; file = fopen(fname, "r"); if (file == NULL ) return false; return true; } void next(Hit * hit) { int ret = fread(hit, sizeof(Hit), 1, file); if (ret != 1) hit->id = maxid; } private: FILE * file; int maxid; }; #endif wham/bitread.h0000644001532600153260000001764412005720734012637 0ustar yinanyinan#ifndef BITREAD_H_ #define BITREAD_H_ /** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: bitread.h 162 2012-07-31 09:14:04Z yinan $ */ #include "lib.h" #define BITS_PER_BASE 3 #define BITS_PER_BASE_LL 3LL #define BITS_PER_BYTE 8 #define BITS_PER_WORD 32 #define BITS_PER_LONGWORD 64 #define BITS_LONGWORD_SHIFT 6 #define BITS_LONGWORD_MASK 0x000000000000003f #define BITS_TWO_LONGWORD 128 #define BITS_THREE_LONGWORD 192 #define BITS_FOUR_LONGWORD 256 #define BITS_FIVE_LONGWORD 320 #define WORDS_PER_READ 6 class BitRead { public: /* * BitRead::extract * extract a subsequence from the sequence. The subsequence * starts from offset-th character of the sequence and has len * characters. 
* * This implementation is highly optimized for modern CPUs. All IF * statements are implemented by arithmetic operations to eliminate the * branch. It support up to 256-bit sequence (85 bases) that can be * fitted into four 64-bit integers. */ inline static void extract(int64 * a, int64 * b, int64 offset, int len) { int shift = ((offset) + (len)) & BITS_LONGWORD_MASK; int index = ((offset) + (len)) >> BITS_LONGWORD_SHIFT; int64 mask = 0LL - (shift != 0); int cshift = BITS_PER_LONGWORD - shift; b[0] = (mask & (a[index - 5] >> cshift)) | (a[index - 6] << shift); b[1] = (mask & (a[index - 4] >> cshift)) | (a[index - 5] << shift); b[2] = (mask & (a[index - 3] >> cshift)) | (a[index - 4] << shift); b[3] = (mask & (a[index - 2] >> cshift)) | (a[index - 3] << shift); b[4] = (mask & (a[index - 1] >> cshift)) | (a[index - 2] << shift); b[5] = (mask & (a[index] >> cshift)) | (a[index - 1] << shift); shift = BITS_PER_LONGWORD - ((len) & BITS_LONGWORD_MASK); b[0] = (0LL - (len > BITS_FIVE_LONGWORD)) & (b[0] << shift >> shift); cshift = (0LL - (len < BITS_FIVE_LONGWORD)) & shift; b[1] = b[1] << cshift >> cshift; b[1] = (0LL - ((len) > BITS_FOUR_LONGWORD)) & b[1]; cshift = (0LL - (len < BITS_FOUR_LONGWORD)) & shift; b[2] = b[2] << cshift >> cshift; b[2] = (0LL - ((len) > BITS_THREE_LONGWORD)) & b[2]; cshift = (0LL - (len < BITS_THREE_LONGWORD)) & shift; b[3] = b[3] << cshift >> cshift; b[3] = (0LL - ((len) > BITS_TWO_LONGWORD)) & b[3]; cshift = (0LL - (len < BITS_TWO_LONGWORD)) & shift; b[4] = b[4] << cshift >> cshift; b[4] = (0LL - ((len) > BITS_PER_LONGWORD)) & b[4]; cshift = (0LL - (len < BITS_PER_LONGWORD)) & shift; b[5] = b[5] << cshift >> cshift; } inline static void genHeadMask(int64 * mask, int l) { int64 tmp[WORDS_PER_READ]; for (int i = 0; i < WORDS_PER_READ; i++) tmp[i] = -1; removeHead2(tmp, mask, l); } /* * BitRead::removeHead * This function is used to extract rightmost portion of sequence a * into sequence b. The extracted portion has l bits. 
* * This implementation is highly optimized for modern CPUs. All IF * statements are implemented by arithmetic operations to eliminate the * branch. It support up to 256-bit sequence (85 bases) that can be * fitted into four 64-bit integers. */ inline static void removeHead(int64 * a, int64 * b, int64 * mask) { b[0] = a[0] & mask[0]; b[1] = a[1] & mask[1]; b[2] = a[2] & mask[2]; b[3] = a[3] & mask[3]; b[4] = a[4] & mask[4]; b[5] = a[5] & mask[5]; } inline static void removeHead2(int64 * a, int64 * b, int l) { int shift; int lshift = BITS_PER_LONGWORD - ((l) & BITS_LONGWORD_MASK); shift = lshift; b[0] = ((int64) 0 - (l > BITS_FIVE_LONGWORD)) & (a[0] << shift >> shift); shift = ((int64) 0 - (l < BITS_FIVE_LONGWORD)) & lshift; b[1] = a[1] << shift >> shift; shift = lshift; b[1] = ((int64) 0 - ((l) > BITS_FOUR_LONGWORD)) & b[1]; shift = ((int64) 0 - (l < BITS_FOUR_LONGWORD)) & lshift; b[2] = a[2] << shift >> shift; shift = lshift; b[2] = ((int64) 0 - ((l) > BITS_THREE_LONGWORD)) & b[2]; shift = ((int64) 0 - ((l) < BITS_THREE_LONGWORD)) & lshift; b[3] = a[3] << shift >> shift; shift = lshift; b[3] = ((int64) 0 - ((l) > BITS_TWO_LONGWORD)) & b[3]; shift = ((int64) 0 - ((l) < BITS_TWO_LONGWORD)) & lshift; b[4] = a[4] << shift >> shift; shift = lshift; b[4] = ((int64) 0 - ((l) > BITS_PER_LONGWORD)) & b[4]; shift = ((int64) 0 - ((l) < BITS_PER_LONGWORD)) & lshift; b[5] = a[5] << shift >> shift; } /* * BitRead::removeInterval * This function is used to remove an interval segment from sequence a, * and extract others into sequence b. The interval starts from x-th * bit in the sequence and has len bits. * * This implementation is highly optimized for modern CPUs. All IF * statements are implemented by arithmetic operations to eliminate the * branch. It support up to 256-bit sequence (85 bases) that can be * fitted into four 64-bit integers. * * This implementation requires that the sequence a is stored in the * rightmost of seven 64-bit integers. 
All unoccupied bits in the seven * 64-bit integers are set to be 0s. */ inline static void removeInterval(int64 * a, int64 * b, uint32 x, int p) { int index1 = WORDS_PER_READ - 1 - ((x) >> BITS_LONGWORD_SHIFT); int index2 = WORDS_PER_READ - 1 - (((x) + (p)) >> BITS_LONGWORD_SHIFT); int shift = (x) & BITS_LONGWORD_MASK; int shift2 = ((x) + (p)) & BITS_LONGWORD_MASK; int poffset = (p) >> BITS_LONGWORD_SHIFT; int pshift = (p) & BITS_LONGWORD_MASK; int64 mask = ((int64) 0 - (shift != 0)) & ((int64) -1 >> (BITS_PER_LONGWORD - shift)); int64 mask2 = (int64) 0 - (pshift != 0); int index3 = index1 - poffset; int rpshift = BITS_PER_LONGWORD - pshift; b[5] = a[5]; b[4] = a[4]; b[3] = a[3]; b[2] = a[2]; b[1] = a[1]; b[index1] = (a[index1] & mask) | (a[index2] >> shift2 << shift); b[index1] |= ((int64) 0 - (shift2 > shift)) & ((a[index2 - 1] << rpshift)); b[index1 - 1] = (a[index3 - 1] >> pshift) | (mask2 & (a[index3 - 2] << rpshift)); b[index1 - 2] = (a[index3 - 2] >> pshift) | (mask2 & (a[index3 - 3] << rpshift)); b[index1 - 3] = (a[index3 - 3] >> pshift) | (mask2 & (a[index3 - 4] << rpshift)); b[index1 - 4] = (a[index3 - 4] >> pshift) | (mask2 & (a[index3 - 5] << rpshift)); b[index1 - 5] = (a[index3 - 5] >> pshift) | (mask2 & (a[index3 - 6] << rpshift)); } inline static void copy(int64 * a, int64 * b) { b[5] = a[5]; b[4] = a[4]; b[3] = a[3]; b[2] = a[2]; b[1] = a[1]; b[0] = a[0]; } inline static bool compare(int64 * a, int64 * b) { return (a[0] == b[0]) && (a[1] == b[1]) && (a[2] == b[2]) && (a[3] == b[3]) && (a[4] == b[4]) && (a[5] == b[5]); } inline static void leftShift(int64 * a, int64 * b) { b[0] = (a[0] << 3) | (a[1] >> 61); b[1] = (a[1] << 3) | (a[2] >> 61); b[2] = (a[2] << 3) | (a[3] >> 61); b[3] = (a[3] << 3) | (a[4] >> 61); b[4] = (a[4] << 3) | (a[5] >> 61); b[5] = (a[5] << 3); } inline static void rightShift(int64 * a, int64 * b) { b[0] = (a[0] >> 3); b[1] = (a[1] >> 3) | (a[0] << 61); b[2] = (a[2] >> 3) | (a[1] << 61); b[3] = (a[3] >> 3) | (a[2] << 61); b[4] = 
(a[4] >> 3) | (a[3] << 61);
		b[5] = (a[5] >> 3) | (a[4] << 61);
	}
};

#endif /* BITREAD_H_ */
wham/makefile0000644001532600153260000000560612054641424012551 0ustar yinanyinanCC=g++
CFLAGS=-c -O3 -Wno-write-strings
#CFLAGS=-c -g -Wno-write-strings
LDLIBS=-lpthread

#CFLAGS+=-DPOPCNT
#CFLAGS+=-DPERFCOUNT
#CFLAGS+=-DDEBUG_STAT
#CFLAGS+=-DDEBUG_HASH_PRINT
#CFLAGS+=-DPAIRALIGN_NO_FILTER
#CFLAGS+=-DPAIRALIGN_BASIC
#CFLAGS+=-DPAIRALIGN_NW
#CFLAGS+=-DPAIRALIGN_BITVECTOR

all: wham wham-build

# Link rules: $(LDLIBS) comes after the object files because the linker
# resolves symbols left to right; a library named before the objects that
# need it may be skipped. Every object on a link line is also listed as a
# prerequisite so that it is built (and rebuilt) before linking.
wham: makefile hash.o embedhash.o main.o sequence.o aligner.o interval.o edit_distance.o error.o short.o hitset.o perfcounters.o model.o writer.o
	$(CC) -o wham sequence.o hash.o embedhash.o main.o aligner.o interval.o edit_distance.o error.o short.o hitset.o model.o perfcounters.o writer.o $(LDLIBS)

wham-build: makefile hash.o embedhash.o builder.o sequence.o aligner.o interval.o edit_distance.o error.o short.o hitset.o model.o perfcounters.o writer.o
	$(CC) -o wham-build sequence.o hash.o embedhash.o builder.o aligner.o interval.o edit_distance.o error.o short.o hitset.o model.o perfcounters.o writer.o $(LDLIBS)

wham-test: makefile unittest.o hash.o embedhash.o sequence.o aligner.o interval.o edit_distance.o error.o short.o hitset.o model.o perfcounters.o
	$(CC) -o wham-test unittest.o sequence.o hash.o embedhash.o aligner.o interval.o edit_distance.o error.o short.o hitset.o model.o perfcounters.o $(LDLIBS)

unittest.o: makefile unittest.cpp bitread.h
	$(CC) $(CFLAGS) unittest.cpp

interval.o: makefile interval.cpp interval.h lib.h error.h
	$(CC) $(CFLAGS) interval.cpp

edit_distance.o: makefile edit_distance.cpp edit_distance.h error.h
	$(CC) $(CFLAGS) edit_distance.cpp

embedhash.o: makefile embedhash.cpp embedhash.h hash.h edit_distance.h sequence.h lib.h error.h hitset.h pair.h
	$(CC) $(CFLAGS) embedhash.cpp

hash.o: makefile hash.cpp hash.h edit_distance.h sequence.h lib.h error.h hitset.h pair.h
	$(CC) $(CFLAGS) hash.cpp

aligner.o: makefile aligner.cpp aligner.h hash.h sequence.h lib.h error.h short.h hitset.h
	$(CC)
$(CFLAGS) aligner.cpp main.o: makefile main.cpp hash.h aligner.h sequence.h lib.h error.h perfcounters.h rdtsc.h pair.h $(CC) $(CFLAGS) main.cpp sequence.o: makefile sequence.cpp sequence.h lib.h error.h $(CC) $(CFLAGS) sequence.cpp error.o: makefile error.h $(CC) $(CFLAGS) error.cpp short.o: makefile short.cpp short.h sequence.h lib.h error.h $(CC) $(CFLAGS) short.cpp hitset.o: makefile hitset.cpp hitset.h lib.h short.h $(CC) $(CFLAGS) hitset.cpp model.o: makefile model.cpp model.h $(CC) $(CFLAGS) model.cpp writer.o: makefile writer.cpp writer.h sequence.h short.h $(CC) $(CFLAGS) writer.cpp builder.o: makefile builder.cpp hash.h aligner.h sequence.h lib.h error.h short.h $(CC) $(CFLAGS) builder.cpp filter.o: makefile filter.cpp hash.h aligner.h sequence.h lib.h error.h short.h $(CC) $(CFLAGS) filter.cpp sorter.o: makefile sorter.cpp aligner.h $(CC) $(CFLAGS) sorter.cpp perfcounters.o: makefile perfcounters.cpp perfcounters.h $(CC) $(CFLAGS) perfcounters.cpp clean: rm wham wham-build *.o wham/sequences/0000755001532600153260000000000012054751660013041 5ustar yinanyinanwham/sequences/chr1_100k.fa0000644001532600153260000030710311524221165014736 0ustar yinanyinan>chr1 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN taaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccta accctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaac cctaacccaaccctaaccctaaccctaaccctaaccctaaccctaacccc taaccctaaccctaaccctaaccctaacctaaccctaaccctaaccctaa ccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaa ccctaaaccctaaccctaaccctaaccctaaccctaaccccaaccccaac cccaaccccaaccccaaccccaaccctaacccctaaccctaaccctaacc ctaccctaaccctaaccctaaccctaaccctaaccctaacccctaacccc taaccctaaccctaaccctaaccctaaccctaaccctaacccctaaccct aaccctaaccctaaccctcgcggtaccctcagccggcccgcccgcccggg 
tctgacctgaggagaactgtgctccgccttcagagtaccaccgaaatctg tgcagaggacaacgcagctccgccctcgcggtgctctccgggtctgtgct gaggagaacgcaactccgccgttgcaaaggcgcgccgcgccggcgcaggc gcagagaggcgcgccgcgccggcgcaggcgcagagaggcgcgccgcgccg gcgcaggcgcagagaggcgcgccgcgccggcgcaggcgcagagaggcgcg ccgcgccggcgcaggcgcagagaggcgcgccgcgccggcgcaggcgcaga cacatgctagcgcgtcggggtggaggcgtggcgcaggcgcagagaggcgc gccgcgccggcgcaggcgcagagacacatgctaccgcgtccaggggtgga ggcgtggcgcaggcgcagagaggcgcaccgcgccggcgcaggcgcagaga cacatgctagcgcgtccaggggtggaggcgtggcgcaggcgcagagacgc aagcctacgggcgggggttgggggggcgtgtgttgcaggagcaaagtcgc acggcgccgggctggggcggggggagggtggcgccgtgcacgcgcagaaa ctcacgtcacggtggcgcggcgcagagacgggtagaacctcagtaatccg aaaagccgggatcgaccgccccttgcttgcagccgggcactacaggaccc gcttgctcacggtgctgtgccagggcgccccctgctggcgactagggcaa ctgcagggctctcttgcttagagtggtggccagcgccccctgctggcgcc ggggcactgcagggccctcttgcttactgtatagtggtggcacgccgcct gctggcagctagggacattgcagggtcctcttgctcaaggtgtagtggca gcacgcccacctgctggcagctggggacactgccgggccctcttgctCCA ACAGTACTGGCGGATTATAGGGAAACACCCGGAGCATATGCTGTTTGGTC TCAgtagactcctaaatatgggattcctgggtttaaaagtaaaaaataaa tatgtttaatttgtgaactgattaccatcagaattgtactgttctgtatc ccaccagcaatgtctaggaatgcctgtttctccacaaagtgtttactttt ggatttttgccagtctaacaggtgaAGccctggagattcttattagtgat ttgggctggggcctggccatgtgtatttttttaaatttccactgatgatt ttgctgcatggccggtgttgagaatgactgCGCAAATTTGCCGGATTTCC TTTGCTGTTCCTGCATGTAGTTTAAACGAGATTGCCAGCACCGGGTATCA TTCACCATTTTTCTTTTCGTTAACTTGCCGTCAGCCTTTTCTTTGACCTC TTCTTTCTGTTCATGTGTATTTGCTGTCTCTTAGCCCAGACTTCCCGTGT CCTTTCCACCGGGCCTTTGAGAGGTCACAGGGTCTTGATGCTGTGGTCTT CATCTGCAGGTGTCTGACTTCCAGCAACTGCTGGCCTGTGCCAGGGTGCA AGCTGAGCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAG TGGGATGGGCCATTGTTCATCTTCTGGCCCCTGTTGTCTGCATGTAACTT AATACCACAACCAGGCATAGGGGAAAGATTGGAGGAAAGATGAGTGAGAG CATCAACTTCTCTCACAACCTAGGCCAGTAAGTAGTGCTTGTGCTCATCT CCTTGGCTGTGATACGTGGCCGGCCCTCGCTCCAGCAGCTGGACCCCTAC CTGCCGTCTGCTGCCATCGGAGCCCAAAGCCGGGCTGTGACTGCTCAGAC CAGCCGGCTGGAGGGAGGGGCTCAGCAGGTCTGGCTTTGGCCCTGGGAGA GCAGGTGGAAGATCAGGCAGGCCATCGCTGCCACAGAACCCAGTGGATTG 
GCCTAGGTGGGATCTCTGAGCTCAACAAGCCCTCTCTGGGTGGTAGGTGC AGAGACGGGAGGGGCAGAGCCGCAGGCACAGCCAAGAGGGCTGAAGAAAT GGTAGAACGGAGCAGCTGGTGATGTGTGGGCCCACCGGCCCCAGGCTCCT GTCTCCCCCCAGGTGTGTGGTGATGCCAGGCATGCCCTTCCCCAGCATCA GGTCTCCAGAGCTGCAGAAGACGACGGCCGACTTGGATCACACTCTTGTG AGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGAGTGGGAG TGGCGTCGCCCCTAGGGCTCTACGGGGCCGGCGTCTCCTGTCTCCTGGAG AGGCTTCGATGCCCCTCCACACCCTCTTGATCTTCCCTGTGATGTCATCT GGAGCCCTGCTGCTTGCGGTGGCCTATAAAGCCTCCTAGTCTGGCTCCAA GGCCTGGCAGAGTCTTTCCCAGGGAAAGCTACAAGCAGCAAACAGTCTGC ATGGGTCATCCCCTTCACTCCCAGCTCAGAGCCCAGGCCAGGGGCCCCCA AGAAAGGCTCTGGTGGAGAACCTGTGCATGAAGGCTGTCAACCAGTCCAT AGGCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGGAGAAG GGGAGAAGAGGAAAGTGAGGTTGCCTGCCCTGTCTCCTACCTGAGGCTGA GGAAGGAGAAGGGGATGCACTGTTGGGGAGGCAGCTGTAACTCAAAGCCT TAGCCTCTGTTCCCACGAAGGCAGGGCCATCAGGCACCAAAGGGATTCTG CCAGCATAGTGCTCCTGGACCAGTGATACACCCGGCACCCTGTCCTGGAC ACGCTGTTGGCCTGGATCTGAGCCCTGGTGGAGGTCAAAGCCACCTTTGG TTCTGCCATTGCTGCTGTGTGGAAGTTCACTCCTGCCTTTTCCTTTCCCT AGAGCCTCCACCACCCCGAGATCACATTTCTCACTGCCTTTTGTCTGCCC AGTTTCACCAGAAGTAGGCCTCTTCCTGACAGGCAGCTGCACCACTGCCT GGCGCTGTGCCCTTCCTTTGCTCTGCCCGCTGGAGACGGTGTTTGTCATG GGCCTGGTCTGCAGGGATCCTGCTACAAAGGTGAAACCCAGGAGAGTGTG GAGTCCAGAGTGTTGCCAGGACCCAGGCACAGGCATTAGTGCCCGTTGGA GAAAACAGGGGAATCCCGAAGAAATGGTGGGTCCTGGCCATCCGTGAGAT CTTCCCAGGGCAGCTCCCCTCTGTGGAATCCAATCTGTCTTCCATCCTGC GTGGCCGAGGGCCAGGCTTCTCACTGGGCCTCTGCAGGAGGCTGCCATTT GTCCTGCCCACCTTCTTAGAAGCGAGACGGAGCAGACCCATCTGCTACTG CCCTTTCTATAATAACTAAAGTTAGCTGCCCTGGACTATTCACCCCCTAG TCTCAATTTAAGAAGATCCCCATGGCCACAGGGCCCCTGCCTGGGGGCTT GTCACCTCCCCCACCTTCTTCCTGAGTCATTCCTGCAGCCTTGCTCCCTA ACCTGCCCCACAGCCTTGCCTGGATTTCTATCTCCCTGGCTTGGTGCCAG TTCCTCCAAGTCGATGGCACCTCCCTCCCTCTCAACCACTTGAGCAAACT CCAAGACATCTTCTACCCCAACACCAGCAATTGTGCCAAGGGCCATTAGG CTCTCAGCATGACTATTTTTAGAGACCCCGTGTCTGTCACTGAAACCTTT TTTGTGGGAGACTATTCCTCCCATCTGCAACAGCTGCCCCTGCTGACTGC CCTTCTCTCCTCCCTCTCATCCCAGAGAAACAGGTCAGCTGGGAGCTTCT GCCCCCACTGCCTAGGGACCAACAGGGGCAGGAGGCAGTCACTGACCCCG AGACGTTTGCATCCTGCACAGCTAGAGATCCTTTATTAAAAGCACACTGT 
TGGTTTCTGCTCAGTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGC CTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAG CACAGGCAGACAGAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCT TCCGCTCCTTGAAGCTGGTCTCCACACAGTGCTGGTTCCGTCACCCCCTC CCAAGGAAGTAGGTCTGAGCAGCTTGTCCTGGCTGTGTCCATGTCAGAGC AACGGCCCAAGTCTGGGTCTGGGGGGGAAGGTGTCATGGAGCCCCCTACG ATTCCCAGTCGTCCTCGTCCTCCTCTGCCTGTGGCTGCTGCGGTGGCGGC AGAGGAGGGATGGAGTCTGACACGCGGGCAAAGGCTCCTCCGGGCCCCTC ACCAGCCCCAGGTCCTTTCCCAGAGATGCCTGGAGGGAAAAGGCTGAGTG AGGGTGGTTGGTGGGAAACCCTGGTTCCCCCAGCCCCCGGAGACTTAAAT ACAGGAAGAAAAAGGCAGGACAGAATTACAAGGTGCTGGCCCAGGGCGGG CAGCGGCCCTGCCTCCTACCCTTGCGCCTCATGACCAGCTTGTTGAAGAG ATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTGCAACGGGAA AGCCACAGACTGGGGTGAAGAGTTCAGTCACATGCGACCGGTGACTCCCT GTCCCCACCCCCATGACACTCCCCAGCCCTCCAAGGCCACTGTGTTTCCC AGTTAGCTCAGAGCCTCAGTCGATCCCTGACCCAGCACCGGGCACTGATG AGACAGCGGCTGTTTGAGGAGCCACCTCCCAGCCACCTCGGGGCCAGGGC CAGGGTGTGCAGCAccactgtacaatggggaaactggcccagagaggtga ggcagcttgcctggggtcacagagcaaggcaaaagcagcgctgggtacaa gctcaAAACCATAGTGCCCAGGGCACTGCCGCTGCAGGCGCAGGCATCGC ATCACACCAGTGTCTGCGTTCACAGCAGGCATCATCAGTAGCCTCCAGAG GCCTCAGGTCCAGTCTCTAAAAATATCTCAGGAGGCTGCAGTGGCTGACC ATTGCCTTGGACCGCTCTTGGCAGTCGAAGAAGATTCTCCTGTCAGTTTG AGCTGGGTGAGCTTAGAGAGGAAAGCTCCACTATGGCTCCCAAACCAGGA AGGAGCCATAGCCCAGGCAGGAGGGCTGAGGACCTCTGGTGGCGGCCCAG GGCTTCCAGCATGTGCCCTAGGGGAAGCAGGGGCCAGCTGGCAAGAGCAG GGGGTGGGCAGAAAGCACCCGGTGGACTCAGGGCTGGAGGGGAGGAGGCG ATCTTGCCCAAGGCCCTCCGACTGCAAGCTCCAGGGCCCGCTCACCTTGC TCCTGCTCCTTCTGCTGCTGCTTCTCCAGCTTTCGCTCCTTCATGCTGCG CAGCTTGGCCTTGCCGATGCCCCCAGCTTGGCGGATGGACTCTAGCAGAG TGGCCAGCCACCGGAGGGGTCAACCACTTCCCTGGGAGCTCCCTGGACTG GAGCCGGGAGGTGGGGAACAGGGCAAGGAGGAAAGGCTGCTCAGGCAGGG CTGGGGAAGCTTACTGTGTCCAAGAGCCTGCTGGGAGGGAAGTCACCTCC CCTCAAACGAGGAGCCCTGCGCTGGGGAGGCCGGACCTTTGGAGACTGTG TGTGGGGGCCTGGGCACTGACTTCTGCAACCACCTGAGCGCGGGCATCCT GTGTGCAGATACTCCCTGCTTCCTCTCTAGCCCCCACCCTGCAGAGCTGG ACCCCTGAGCTAGCCATGCTCTGACAGTCTCAGTTGCACACACGAGCCAG CAGAGGGGTTTTGTGCCACTTCTGGATGCTAGGGTTACACTGGGAGACAC AGCAGTGAAGCTGAAATGAAAAATGTGTTGCTGTAGTTTGTTATTAGACC 
CCTTCTTTCCATTGGTTTAATTAGGAATGGGGAACCCAGAGCCTCACTTG TTCAGGCTCCCTCTGCCCTAGAAGTGAGAAGTCCAGAGCTCTACAGTTTG AAAACCACTATTTTATGAACCAAGTAGAACAAGATATTTGAAATGGAAAC TATTCAAAAAATTGAGAATTTCTGACCACTTAACAAACCCACAGAAAATC CACCCGAGTGCACTGAGCACGCCAGAAATCAGGTGGCCTCAAAGAGCTGC TCCCACCTGAAGGAGACGCGCTGCTGCTGCTGTCGTCCTGCCTGGCGCCT TGGCCTACAGGGGCCGCGGTTGAGGGTGGGAGTGGGGGTGCACTGGCCAG CACCTCAGGAGCtgggggtggtggtgggggcggtgggggtggtgttagtA CCCCATCTTGTAGGTCTGAAACACAAAGTGTGGGGTGTCTAGGGAAGAAG GTGTGTGACCAGGGAGGTCCCCGGCCCAGCTCCCATCCCAGAACCCAGCT CACCTACCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGCAGTTCT GGAATGGTGCCAGGGGCAGAGGGGGCAATGCCGGGGCCCAGGTCGGCAAT GTACATGAGGTCGTTGGCAATGCCGGGCAGGTCAGGCAGGTAGGATGGAA CATCAATCTCAGGCACCTGGCCCAGGTCTGGCACATAGAAGTAGTTCTCT GGGACCTGCAAGATTAGGCAGGGACATGTGAGAGGTGACAGGGACCTGCA GGGGCAGCCAACAAGACCTTGTGTGCACCTCCCATGGGTGGAATAAGGGG CCCAACAGCCTTGACTGGAGAGGAGCTCTGGCAAGGCCCTGGGCCACTGC ACCTGTCTCCACCTCTGTCCCACCCCTCCCACCTGCTGTTCCAGCTGCTC TCTCTTGCTGATGGACAAGGGGGCATCAAACAGCTTCTCCTCTGTCTCTG CCCCCAGCATCACATGGGTCTTTGTTACAGCACCAGCCAGGGGGTCCAGG AAGACATACTTCTTCTACCTACAGAGGCGACATGGGGGTCAGGCAAGCTG ACACCCGCTGTCCTGAGCCCATGTTCCTCTCCCACATCATCAGGGGCACA GCGTGCACTGTGGGGTCCCAGGCCTCCCGAGCCGAGCCACCCGTCACCCC CTGGCTCCTGGCCTATGTGCTGTACCTGTGTCTGATGCCCTGGGTCCCCA CTAAGCCAGGCCGGGCCTCCCGCCCACACCCCTCGGCCCTGCCCTCTGGC CATACAGGTTCTCGGTGGTGTTGAAGAGCAGCAAGGAGCTGACAGAGCTG ATGTTGCTGGGAAGACCCCCAAGTCCCTCTTCTGCATCGTCCTCGGGCTC CGGCTTGGTGCTCACGCACACAGGAAAGTCCTTCAGCTTCTCCTGAGAGG GCCAGGATGGCCAAGGGATGGTGAATATTTGGTGCTGGGCCTAATCAGCT GCCATCCCATCCCAGTCAGCCTCCTCTGGGGGACAGAACCCTATGGTGGC CCCGGCTCCTCCCCAGTATCCAGTCCTCCTGGTGTGTGACAGGCTATATG CGCGGCCAGCAGACCTGCAGGGCCCGCTCGTCCAGGGGGCGGTGCTTGCT CTGGATCCTGTGGCGGGGGCGTCTCTGCAGGCCAGGGTCCTGGGCGCCCG TGAAGATGGAGCCATATTCCTGCAGGCGCCCTGGAGCAGGGTACTTGGCA CTGGAGAACACCTGTGGACACAGGGACAAGTCTGAGGGGGCCCCAAGAGG CTCAGAGGGCTAGGATTGCTTGGCAGGAGAGGGTGGAGTTGGAAGCCTGG GCGAGAAGAAAGCTCAAGGTACAGGTGGGCAGCAGGGCAGAGACTGGGCA GCCTCAGAGGCACGGGGAAATGGAGGGACTGCCCAGTAGCCTCAGGACAC AGGGGTATGGGGACTACCTTGATGGCCTTCTTGCTGCCCTTGATCTTCTC 
AATCTTGGCCTGGGCCAAGGAGACCTTCTCTCCAATGGCCTGCACCTGGC TCCGGCTCTGCTCTACCTGCTGGGAGATCCTGCCATGGAGAAGATCACAG AGGCTGGGCTGCTCCCCACCCTCTGCACACCTCCTGCTTCTAACAGCAGA GCTGCCAGGCCAGGCCCTCAGGCAAGGGCTCTGAAGTCAGGGTCACCTAC TTGCCAGGGCCGATCTTGGTGCCATCCAGGGGGCCTCTACAAGGATAATC TGACCTGCAGGGTCGAGGAGTTGACGGTGCTGAGTTCCCTGCACTCTCAG TAGGGACAGGCCCTATGCTGCCACCTGTACATGCTATCTGAAGGACAGCC TCCAGGGCACACAGAGGATGGTATTTACACATGCACACATGGCTACTGAT GGGGCAAGCACTTCACAACCCCTCATGATCACGTGCAGCAGACAATGTGG CCTCTGCAGAGGGGGAACGGAGACCGGAGGCTGAGACTGGCAAGGCTGGA CCTGAGTGTCGTCACCTAAATTCAGACGGGGAACTGCCCCTGCACATACT GAACGGCTCACTGAGCAAACCCCGAGTCCCGACCACCGCCTCAGTGTGGT CTAGCTcctcacctgcttccatcctccctggtgcggggtgggcccagtga tatcagctgcctgctgttccccagatgtgccaagtgcattcttgtgtgct tgcatctcatggaacgccatttccccagacatccctgtggctggctccTG ATGCCCGAGGCCCAAGTGTCTGATGCTTTAAGGCACATCACCCCACTCAT GCTTTTCCATGTTCTTTGGCCGCAGCAAGGCCGCTCTCACTGCAAAGTTA ACTCTGATGCGTGTGTAACACAACATCCTCCTCCCAGTCGCCCCTGTAGC TCCCCTACCTCCAAGAGCCCAGCCCTTGCCCACAGGGCCACACTCCACGT GCAGAGCAGCCTCAGCACTCACCGGGCACGAGCGAGCCTGTGTGGTGCGC AGGGATGAGAAGGCAGAGGCGCGACTGGGGTTCATGAGGAAGGGCAGGAG GAGGGTGTGGGATGGTGGAGGGGTTTGAGAAGGCAGAGGCGCGACTGGGG TTCATGAGGAAAGGGAGGGGGAGGATGTGGGATGGTGGAGGGGCTGCAGA CTCTGGGCTAGGGAAAGCTGGGATGTCTCTAAAGGTTGGAATGAATGGCC TAGAATCCGACCCAATAAGCCAAAGCCACTTCCACCAACGTTAGAAGGCC TTGGCCCCCAGAGAGCCAATTTCACAATCCAGAAGTCCCCGTGCCCTAAA GGGTCTGCCCTGATTACTCCTGGCTCCTTGTGTGCAGGGGGCTCAGGCAT GGCAGGGCTGGGAGTACCAGCAGGCACTCAAGCGGCTTAAGTGTTCCATG ACAGACTGGTATGAAGGTGGCCACAATTCAGAAAGAAAAAAGAAGAGCAC CATCTCCTTCCAGTGAGGAAGCGGGACCACCACCCAGCGTGTGCTCCATC TTTTCTGGCTGGGGAGAGGCCTTCATCTGCTGTAAAGGGTCCTCCAGCAC AAGCTGTCTTAATTGACCCTAGTTCCCAGGGCAGCCTCGTTCTGCCTTGG GTGCTGACACGACCTTCGGTAGGTGCATAAGCTCTGCATTCGAGGTCcac aggggcagtgggagggaactgagactggggagggacaaaggctgctctgt cctggtgctcccacaaaggagaagggctgatcactcaaagttgcgaacac caagctcaacaatgagccctggaaaatttctggaatggattattaaacag agagtctgtaagcacttagaaaaggccgcggtgagtcccaggggccagca ctgctcgaaatgtacagcatttctctttgtaacaggattattagcctgct gtgcccggggaaaacatgcagcacagtgcatctcgagtcagcaggatttt 
gacggcttctaacaaaatcttgtagacaagatggagctatgggggttgga ggagagaacatataggaaaaatcagagccaaatgaaccacagccccaaag ggcacagttgaacaatggactgattccagccttgcacggagggatctggc agagtCCATCCAGTTCATTCAACACCTGGTTAGAAAACTGGGGCCAGCAC ACAGGGGAAGGGTAAGCTGGTTTCATGATCGAATCAAGGCTCAGACAATT TTTAAAGGCCAGAGGGTAGACTGCAATCACcaagatgaaatttacaagga acaaatgtgaagcccaacatttaggttttaaaaatcaagcgtataaatac agaaggtggagggaacttgctttagacacagttcaggtgaagaaagacct ggaaacttctgttaactataagctcagtaGGGGCTAAAAGCATGTTAATC GGCATAAAAAGGCAATGAGATCTTAGGGCACACAGCTCCCCGCCCCTCTT CTGCCCTTCATCCTTCTTTCAATCAGCAGGGACCGTGCACTCTCTTGGAG CCACCACAGAAAACAGAGGTGCATCCAGCACCACAGAAAACAGAGCCACC ACAGAAAACAGAGGGTGACTGTCATCCCCTCCAGTCTCTGCACACTCCCA GCTGCAGCAGAGCAGGAGGAGAGAGCACAGCCTGCAATGCTAATTTGCCA GGAGCTCACCTGCCTGCGTCACTGGGCACAGACGCCAGTGAGGCCAGAGG CCGGGCTGTGCTGGGGCCTGAGCCGGGTGGTGGGGAGAGAGTCTCTCCCC TGCCCCTGTCTCTTCCGTGCAGGAGGAGCATGTTTAAGGGGACGGGTTCA AAGCTGGTCACATCCCCACCGAAAAAGCCCATGGACAACGAAAAGCCCAC TAGCTTGTCCAGTGCCACAGGAGGGGCAAGTGGAGGAGGAGAGGTGGCGG TGCTCCCCACTCCACTGCCAGTCGTCACTGGCTCTCCCTTCCCTTCATCC TCGTTCCCTATCTGTCACCATTTCCTGTCGTCGTTTCCTCTGAATGTCTC ACCCTGCCCTCCCTGCTTGCAAGTCCCCTGTCTGTAGCCTCACCCCTGTC GTATCCTGACTACAATAACAGCTTCTGGGTGTCCCTGGCATCCACTCTCT CTCCCTTCTTGTCCCTTCCGTGACGGATGCCTGAGGAACCTTCCCCAAAC TCTTCTGTCCCATCCCTGCCCTGCTCAAAATCCAATCACAGCTCCCTAAC ACGCCTGAATCAACTTGAAGTCCTGTCTTGAGTAATCCGTGGGCCCTAAC TCACTCATCCCAACTCTTCACTCACTGCCCTGCCCCACACCCTGCCAGGG AGCCTCCCGTGGCACCGTGGGGACACAAAGGAACCAGGGCAAAGCTCCCT CAGCCCCATTCAAAGAGGCCTGGCCCACAGGCTCACGGAAAGTCAGCCTC TCATGCCCCGAGAGCTGAGTGCAAGGGAGAGGCAGCGCTGTCTGTGCTTC CCATGCAGAAGCACCCCCCTCCCACCCCTGTGCAGGCCGGCCTTCGCGGC AGACCACCATACACCACGTTCCAAGCCACACTGAGGCCTCCCTCCAAGCC TGCAGCCCCCATTTCCAGACCCTGCCAGGGCAACCTGCATATCCACCTCC CTACCCTGCCCCCCTCTTCCAGGAGTCTGCCCTATGTGGAGTAAGCACgt ggttttcctcttcagcaactatttcctttttactcaagcaatggccccat ttcccttggggaatccatctctctcgcaggcttagtcccagagcttcagg tggggctgcccacagagctcctcagTCTAAGCCAAGTGGTGTGTCATAGT CCCCTGGCCCCATTAATGGATTCTGGGATAGACATGAGGACCAAGCCAGG TGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGACAGCAT 
TCTTTCCTGCTGGACCTGACCCTGTGTCATGTCACCTTGCTACCACGAGA GCATGGCCTGTCTGGGAATGCAGCCAGACCCAAAGAAGCAAACTGACATG GAAGGAAAGCAAAACCAGGCCCTGAGGACATCATTTTAGCCCTTACTCCG AAGGCTGCTCTACTGATTGGTTAATTTTTGCTTAGCTTGGTCTGGGGAGT TCTGACAGGCGTGCCACCAATTCTTACCGATTTCTCTCCACTCTAGACCC TGAGAAGCCCACGCGGTTCATGCTAGCAATTAACAATCAATCTCGCCCTA TGTGTTCCCATTCCAGCCTCTAGGACACAGTGGCAGCCACATAATTGGTA TCTCTTAAGGTCCAGCACGAGGTGGAGCACATGGTGGAGAGACAGATGCA GTGACCTGGAACCCAGGAGTGAGGGAGCCAGGACTCAGGCCCAAGGCTCC TGAGAGGCATCTGGCCCTCCCTGCGCTGTGCCAGCAGCTTGGAGAACCCA CACTCAATGAACGCAGCACTCCACTACCCAGGAAATGCCTTCCTGCCCTC TCCTCATCCCATCCCTGGGCAGGGGACATGCAACTGTCTACAAGGTGCCA AGTACCAGGACAGGAAAGGAAAGACGCCAAAAATCCAGCGCTGCCCTCAG AGAAGGGCAACCACGCAGTCCCCATCTTGGCAAGGAAACACAATTTCCGA GGGAATGGTTTTGGCCTCCATTCTAAGTGCTGGACATGGGGTGGCCATAA TCTGGAGCTGATGGCTCTTAAAGACCTGCATCCTCTTCCCTAGGTGTCCC TCGGGCACATTTAGCACAAAGATAAGCACAAAAGGTGCATCCAGCACTTT GTTACTATTGGTGGCAGGTTTATGAATGGCAACCAAAGGCAGTGTACGGG TCAAGATTATCAACAGGGAagagatagcatttcctgaaggcttcctaggt gccaggcactgttccattcctttgcatgttttgattaatttaatatttaa aataattctaccaggaagctaccattattaccacaacttcacaaatgaga acaccgaggcttagaggggttgggttgcccaaggttacagaggaagaaaa caggggagctggatctgagccaaggcatcaactccaaggtaacccctcag tcacttcactgtgtgtcccctGGTTACTGGGACATTCTTGACAAACTCGG GGCAAGCCGGTGAGTCAGTGGGGGAGGACTTTCAGGAAGAGGTGGGTTCC CAGTTGGTGACAGAAGAGGAGGCTGCAAAGTGAAGGAGCAGGGGCTCCAG GTCTGGCGACAACCAGGGAAGGGACAGGGCAGGGATGGCTTGGACCACGA GAGGCACCTGAGTCAGGCAGTCACATACTTCCCACTGGGGTCTACCATGT GAGGCATGGTGTGGGATCCTGGGAAGGAGACCAAGCCTCATTTCAGTTTG CTTATGGCCAAAGACAGGACCTGTGTACCCGACAACCCCTGGGACCTTTA CCAAAAAAAGAGCAAACACCATTCACTCACTCATGTTAGATAAACACTGA GTGAAGTCACTGGAGCCCAAGGACTGTGCGAGGTCAGCACTGCCAATACA AGAagctgcagccctccagctcgcctccctcaatggccactccgtgctcc agccatgctggcttccttttaggtcctccacctccaggctgtagttcatg tgcttctttctggaatgttcttcccaacctacccactcaaccctcagact ttaccataaatgtcatttcctcacgtctgccttccctgacctgagaccaa gccaggcttcccatgacgagcctcacagtaccccatctCCCCTGAACAGA TGCAGTAATAACCTACATAACCCGGGGCCATGATCTAtggctttgaatcc tggctctgtcactaggccaggtctctcagcccttctgtgcctcagtttcc 
tcatctataaaatgagatgacggcagtgcctgctcatgaagtgtgagtta atgcactcaaatcaatggttgtgcacggtttatatgaatattagtgatta CAAAatattatcaatagaccttgtcacaactgttattgaagaactaatca tctattgcttatttaggtctttctctcctgccagaatgtgcgctccaggt ggagaggtatgttgccttatccgtggctggatatatagagattcccacac tgccttgcacacgagcactgctgggtaaatatttgttggctgcaggaaAA CGTGAAGGAATAGGCCCTCCAATGGGAGGAAAAGCATGAGTTGTGAGAGC AGAGCCACCACAGGAAACCAGGAGGCTAAGTGGGGTGGAAGGGAGTGAGC TCTCGGACTCCCAGGAGTAAAAGCTTCCAAGTTGGGCTCTCACTTCAGCC CCTCCCACACAGGGAAGCCAGATGGGTTCCCCAGGACCGGGATTCCCCAA GGGGGCTGCTCCCAGAGGGTGTGTTGCTGGGATTGCCCAGGACAGGGATG GCCCTCTCATCAGGTGGGGGTGAGTGGCAGCACCCACCTGCTGAAGATGT CTCCAGAGACCTTCTGCAGGTACTGCAGGGCATCCGCCATCTGCTGGACG GCCTCCTCTCGCCGCAGGTCTGGCTGGATGAAGGGCACGGCATAGGTCTG ACCTGCCAGGGAGTGCTGCATCCTCACAGGAGTCATGGTGCCTGTGGGTC GGAGCCGGAGCGTCAGAGCCACCCACGACCACCGGCACGCCCCCACCACA GGGCAGCGTGGTGTTGAGACAACACAGCCCTCATCCCAACTATGCACATA GCTTCAGCCTGCACAGATAGGGGAGTAGGGGACAGAGCATTTGCTGAGAG GCCAGGAGCGCATAGATGGGACTCTGCTGATGCCTGCTGAGTGAATGAGG GAAAGGGCAGGGCCCGGGACTGGGGAATCTGTAGGGTCAATGGAGGAGTT CAGAGAAGGTGCAACATTTCTGACCCCCTACAAGGTGCTTGCTACCTGCC AGGCACCCTTTCCATACCTTGTCTCAGTTCAGCTCCCCACCTTGGATAAA CAAGAAACCTTGGTTGCAGAGGAAAAAAGAGGCTGGAAACAAAGGGGTAG AAATGGGGTAGCAGGGGAGATTGCCTGATCAACTGCCAAATGGTACACAG TTCTGGAAAAGCACAAAAAATGTGCACACACGGGTTCTTCCCACTTTAAC CCCTGAGGAATCTGAGGCCTGCTCCTGAAACAGACTGGGCAGTGGCTAGT GACTCTAGGTATAGGAGTATCCAGCCCTGCTCACCCAGGCTAGAGCTTAG GGGGACAAGAGGAAAGAGGTGCCTGTGGGGGTGGAGGACAGGAAGGAAAA ACACTCCTGGAATTGCAAAGTGAGGGCAGAGTCTATTTATATTGGGTTTA ATTAACTCCTCTCCCTGGTGCCACTAAAGCAGCAATCACACTGCAGACAG CACTGATTTGATTGGCAAGAGATGCACCAGGCAGAATATTAAGGGACCAG GCCCCTATAAATAGGCCTAATCACAGCCCCTCACTGGAAAATGGTAAGGA AGACATTAATCAGGCCTGGCACTGTGCCCTAGACCTGCTCCCCTAGGCAC TACAGTGGGGCCCTTGGTTGCAACACAAGTAGGTAGGGATGGATGAGTGT GGCATGAAGGGCCTAGGAGATTTCACTTGGGTTTAAAATGCTGTGACCTT GAGTAAGTTGCCGTCTCTGAATCTGATCCTTTCGATTTCCCATTCTCCAA ACTGAGAACTAGCACTGCTGAGACGTGGTTATTTCCAATAATAATTTGTA TATTTTACATAACGCACCACACCAACATCTTCACCCAGTTGGAGCCTACT CCTTTGCTCCCGCTGCTGGCTTCCCCAGCCCTCCCTTCTGCCCTCCTCAG 
GCCAGCACTTTTCAGTGAGTTCCTCCTTTGCATACAGGCTTTCCAGATCT GTACTTGCCTTGAATACTCATCAGAGCCCAGGAGTTACTCCTCACCTCCC ACTTATTTTTCCTCCCATCAAATAACTAAAGCATGGCCAGCTGATGCCCA GCCAACTGAGAAACCCAACCCTCTGAGACCAGCACACCCCTTTCAAGCAT GTTCCTCCCTCCCCTTCTTTGTATTTATACTGATGCAAGTTTGCTGGCTG TCCTAacttatttctgtgcctcagttctcccatatgtaagatcacaaagg gggtaaagatgcAAGATATTTCCTGTGCACATCTTCAGATGAATTTCTTG TTAGTGTGTGTGTGTTTGCTCACACATATGCGTGAAAGAAGAGTACATAC ACAGATCTCCTCAAAAAGGAGGCAGCAAGCCCGTTCAAGAATGGGACTGA ATACACCTGATGAGTGGTTTACTTTCTGTCTGcaaacatctactgatcat ctgttaggtgcaggccatgatcacaacaaagacgaataagacactacact agccagggagagtctcaaaaacaactaaactcaaattaaattcattctac tccagtcatgggtacaaagctaaggagtgacaaatccctcttggagttag gggagtcaggaaaaagctcttagcagaatgtgtgcctctcggccgggcgc agcggctcacgcctgtaatcccagcactttgggaggcgaaggcaggcaga tcacctgaggtcgggagttcgagaccagtctgaccaacatggtgaaactc catctctactaaaaatacaaaattagccaggcgtggtggtgcatgcctgt aatccccgctactcgggaggctgaggaaggagaatcacttgaaccaggaa ggtggaggttgcagtgtgccaagatcgcgccatggcactccagcctaggc aacgagggtgaaccaggtccaggaagaaggtgcaaagacagcattccagg taaaagaaacagcttgaacaaaaagtgtgtaggggaaCCGCAAGCGGTCT TGAGTGCTGAGGGTACAATCATCCTTGGGGAAGTACTAGAAGAAAGAATG ATAAACAGAGGCCAGTTTGTTAAAAACACTCAAAATTAAAGCTAGGAGTT TGGACTTGTGGCAGGAATgaaatccttagacctgtgctgtccaatatggt agccaccaggcacatgcagccactgagcacttgaaatgtggatagtctga attgagatgtgccataagtgtaaaatatgcaccaaatttcaaaggctaga aaaaaagaatgtaaaatatcttattattttatattgattacgtgctaaaa taaccatatttgggatatactggattttaaaaatatatcactaatttcat ctgtttctttttacttttAGAAATCACATATGTGACTTAAATATTTCTTT TCTTTTTCTTTCCTCTCACTCAGCGTCCTGTGATTCCAAAGAAATGAGTC TCTGCTGTTTTTGGGCAGCAGATATCCTAGAATGGACTCTGACCTAAGCA TCAAAATTAATCATCATAACGTTATCATTTTATGGCCCCTTCTTCCTATA TCTGGTAGCTTTTAAATGATGACCATGTAGATAATCTTTATTGTCCCTCT TTCAGCAGACGGTATTTTCTTATGCTACAGTATGACTGCTAATAATACCT ACACATGTTAGAACCATTCTGACTCCTCAAGAatctcatttaactcttat tatcagtgaatttatcatcatcccctattttacataaggaaatggggtta gaaagaccaaataacattttttcaacatcaaaacactagcttgagatcaa gcccagacttggatctgtcgtctgaattccaagctttttgttatttattg atatgttttgttgtTTTCATGCAATAATGCAAATCTTAGCCCAAACATTT 
TGTTAGTAGTACCAACTGTAAGTCACCTTATCTTCATACTTTGTCTTTAT GTAAACCTAAATTAGATCTGTTTTTGATACTGAGGGAAAAACAAGGGAAT ctaacactaaccagcccgtagtgtgtggtcaacactttcgttactttagt atacatcaccccaattgtttgtcttcaccacacactttggagttaggtag tagtatctatttttacaaataagaaaacccaggcacaaaggggttgatta gcAATTATCTTTTGAAAAGCCTGTAGTTGCTCATCTGAAGAAGTGACGGA CCACCTCTTATTTAGTGGACAGACAGTAACTAGTTGAGAAGACAGGGGAT TTTGTTGGCGGAAAAAAAAATTTATCAAAAGTCGTCTTCTATCAGGGAGT TTTATGAGAAACCCTAGCTCCTCAGTTCCACAGTGGGTAACTGTAATTCA TTCTAGGTCTGCGATATTTCCTGCCTATCCATTTTGTTAACTCTTCAATG CATTCCACAAATACCTAAGTATTCTTTAATAATGGTGGTTTTTTTTTTTT TTTGCATCTATGAAGTTTTTTCAAATTCTTTTTAAGTGACAAAACTTGTA CATGTGTATCGCTCAATATTTCTAGTCGACAGCACTGCTTTCGAGAATGT AAACCGTGCACTCCCAGGAAAATGCAGACACAGCACGCCTCTTTGGGACC GCGGTTTATACTTTCGAAGTGCTCGGAGCCCTTCCTCCAGACCGTTCTCC CACACCCCGCTCCAGGGTCTCTCCCGGAGTTACAAGCCTCGCTGTAGGCC CCGGGAACCCAACGCGGTGTCAGAGAAGTGGGGTCCCCTACGAGGGACCA GGAGCTCCGGGCGGGCAGCAGCTGCGGAAGAGCCGCGCGAGGCTTCCCAG AACCCGGCAGGGGCGGGAAGACGCAGGAGTGGGGAGGCGGAACCGGGACC CCGCAGAGCCCGGGTCCCTGCGCCCCACAAGCCTTGGCTTCCCTGCTAGG GCCGGGCAAGGCCGGGTGCAGGGCGCGGCTCCAGGGAGGAAGCTCCGGGG CGAGCCCAAGACGCCTCCCGGGCGGTCGGGGCCCAGCGGCGGCGTTCGCA GTGGAGCCGGGCACCGGGCAGCGGCCGCGGAACACCAGCTTGGCGCAGGC TTCTCGGTCAGGAACGGTCCCGGGCCTCCCGCCCGCCTCCCTCCAGCCCC TCCGGGTCCCCTACTTCGCCCCGCCAGGCCCCCACGACCCTACTTCCCGC GGCCCCGGACGCCTCCTCACCTGCGAGCCGCCCTCCCGGAAGCTCCCGCC GCCGCTTCCGCTCTGCCGGAGCCGCTGGGTCCTAGCCCCGCCGCCCCCAG TCCGCCCGCGCCTCCGGGTCCTAACGCCGCCGCTCGCCCTCCACTGCGCC CTCCCCGAGCGCGGCTCCAGGACCCCGTCGACCCGGAGCGCTGTCCTGTC GGGCCGAGTCGCGGGCCTGGGCACGGAACTCACGCTCACTCCGAGCTCCC GACGTGCACACGGCTCCCATGCGTTGTCTTCCGAGCGTCAGGCCGCCCCT ACCCGTGCTTTCTGCTCTGCAGACCCTCTTCCTAGACCTCCGTCCTTTGT CCCATCGCTGCCTTCCCCTCAAGCTCAGGGCCAAGCTGTCCGCCAACCTC GGCTCCTCCGGGCAGCCCTCGCCCGGGGTGCGCCCCGGGGCAGGACCCCC AGCCCACGCCCAGGGCCCGCCCCTGCCCTCCAGCCCTACGCCTTGACCCG CTTTCCTGCGTCTCTCAGCCTACCTGACCTTGTCTTTACCTCTGTGGGCA GCTCCCTTGTGATCTGCTTAGTTCCCACCCCCCTTTAAGAATTCAATAGA Gaagccagacgcaaaactacagatatcgtatgagtccagttttgtgaagt gcctagaatagtcaaaattcacagagacagaagcagtggtcgccaggaat 
ggggaagcaaggcggagttgggcagctcgtgttcaatgggtagagtttca ggctggggtgatggaagggtgctggaaatgagtggtagtgatggcggcac aacagtgtgaatctacttaatcccactgaactgtatgctgaaaaatggtt tagacggtgaattttaggttatgtatgttttaccacaatttttaaaaaGC TAGTGAAAAGCTGGTAAAAAGAAAGAAAAGAGGCTTTTTTAAAAAGTTAA ATATATAAAAAGAGCATCATCAGTCCAAAGTCCAGCAGTTGTCCCTCCTG GAATCCGTTGGCTTGCCTCCGGCATTTTTGGCCCTTGCCTTTtagggttg ccagattaaaagacaggatgcccagctagtttgaattttagataaacaac gaataatttcgtagcataaatatgtcccaagcttagtttgggacatactt atgctaaaaaacattattggttgtttatctgagattcagaattaagcatt ttatattttatttgctgcctctggccaccctaCTCTCTTCCTAACACTCT CTCCCTCTCCCAGTTTTGTCCGCCTTCCCTGCCTCCTCTTCTGGGGGAGT TAGATCGAGTTGTAACAAGAACATGCCACTGTCTCGCTGGCTGCAGCGTG TGGTCCCCTTACCAGAGGTAAAGAAGAGATGGATCTCCACTCAtgttgta gacagaatgtttatgtcctctccaaatgcttatgttgaaaccctaacccc taatgtgatggtatgtggagatgggcctttggtaggtaattacggttaga tgaggtcatggggtggggccctcattatagatctggtaagaaaagagaGC ATTGtctctgtgtctccctctctctctctctctctctctctcatttctct ctatctcatttctctctctctcgctatctcatttttctctctctctcttt ctctcctctgtcttttcccaccaagtgaggatgcgaagagaaggtggctg tctgcaaaccaggaagagagccctcaccgggaacccgtccagctgccacc ttgaacttggacttccaagcctccagaactgtgagggataaatgtatgat tttaaagtcgcccagtgtgtggtattttgttTTGACTAATACAACCTGAA AACATTTTCCCCTCACTCCACCTGAGCAATATCTGAGTGGCTTAAGGTAC TCAGGACACAACAAAGGAGAAATGTCCCATGCACAAGGTGCACCCATGCC TGGGTAAAGCAGCCTGGCACAGAGGGAAGCACACAGGCTCAGggatctgc tattcattctttgtgtgaccctgggcaagccatgaatggagcttcagtca ccccatttgtaatgggatttaattgtgcttgccctgcctccttttgaggg ctgtagagaaaagatgtcaaagtattttgtaatctggctgggcgtggtgg ctcatgcctgtaatcctagcactttggtaggctgacgcgagaggactgct tgagcccaagagtttgagatcagcctgggcaatattgtgagattccatct ctacaaaaataaaataaaatagccagtcatggtgtcacacacctgtagtc ccagctacatgggaggctgaggcgggaggatcacttgagcttgggagatc gaggctgcagtgagctatgattgtaccactgcactccaggctgggcgaca gagagagaccctgtctcagaaaaaaaaaaaaaagtactttgtaatctgta aggtTTATTTCAACACACACAAAAAAAGTGTATATGCTCCACGATGCCTG TGAATATACACACACACCACATCATATACCAAGCCTGGCTGTGTCTTCTC ACAAATGCACTGCTAGGCACCACCCCCAGTTCTAGAATCACACCAGCCAG TTCACCCTCCAGATGGTTCACCCTCAACTTCATAAAAGTTCCCTACCTAA 
TCTACTGACAGGCTCATCCCCGACCTAATTTTAAAGATTTCCTAGGAGCT GCAGTGGGAATCCTGGACCTCAGCCTGGACAAAGAACAGCTGCAGGTCAT TCTCATGTGTGGACACAGAAGCTCTGCCTGCCTTTGCTGGCCAGCTGGGC TGAGCGGGCCTGGGAATTAAGGCTGCAGGGTTGGTCCCAGGCAGTCTTGC TGAAGCTTGCCACATCCCCCAGCCTCCTGGATTTGCCAGGATCCAAGAGC ATGGACTTTAGGAATTCCTGGTGGAGGAGTGAAGAAAATGTGACAGGGTG TCCTAAGCCCCGATCTACAGGAAGAAAACTGGAAATAAGACTGAGGACTT AGTTTAAGATGTTCCTACTCAGCCTCTAGCTTTTGTGCTACAGTTCTGGG AACAGACTCCTCTCTCCTGAAAACCACTTCCCTCCGCAGCATTAGATTTC ACCAAGATGTCTTGCTTGTGGGAAAGACTTCCAAGGATGCCTGGAGAGAG GAGGATGGAAATGTCCTGCTCTCTAAACAGATAGACAGATGCAGCCAGAC AGAAAATAGTTTATCTTGCTGAGGTTTCTAATGTATTTGAAAGAGGCCTG GGTCTAGAAGTCTACCCAGAGGGCTCTGTGTTGTGCACGCAAAGATAAGA ACCTTCCCTGTGGGAGTTCCAGAGCCAGTTTTCATAAACACCCATCGGTG ACTGTGTTCAGAGTGAGTTCACACCATCCTGACCTGCCCTGAGTTAGACC TTACATGGTCTTCCTCCTCTAGGAAGCCTCTGCAGCCCAGGAACCTCCCC TTATCGGAAATGAACAGCATTTGAAGCTTCACCAGACAGACCAGACAGCT TAGCCCTCGTGTTGTGCCATGTGGGTTGTTCTCTGAGAGGcaggagagca tagtggttactaggaagggaaggactttgggactagactgcctcggctgg agtcctctttctgcttcatagccacgtgatcctaggcatgttacctgtgc ctcagttttcactctgtcaatatgtaataactgaatctgtctttgtggtg aggattcagtgagttaacatatttgaagtgcttaaaaATGAGGCTTGtgt ccatagattaatgagtgaatacacaaatggtgatatggacatacagtgga gtattagtcataaaaaggaaggcagagctgatccatggcaccatgtgaca gaacctcaaaagcattaggttaagtggaagaagccagacacaggtcacct attgtgtaattccatttataggaaatatacagaatatgtaaatccgtgga gaaagaaagccgatttccaggggctaaggggaggggagaatgggaagtgg ctgcttcatgggtacaaggtttcattttgagctgatgaaaatgttttgga actacatagagatagtgttggcacaacatggtgaatgtactgaatgccac tgattgttcaatttaaaatggtcaaacttatatgaatttcacctccatta aaaaaaAAAAAAAAGgaccagatgtggttgctcacacccataatcccaac actttggaaAAAGGTGAAAGTTTTTTTTtctttttttttttatatactta agttctagggtacatgtgcataatgtgcaggttggatacatagatatgcg tgtgccatgttggtttgctgcacccatcaacttgtcatttacattaggta tttcttctaatgctatccctcccccagccccccacccactgacaggcccc agtgtatgatgttctctgccccatgtccaagcgttctcattgttcaattc ccacctgtgagtgagaacatgcagtgtttggttttctgtctttgtgatag tttgctcagaatgatggtttccagcttcatccatgtccctgcaaaggaca tgaactcatcctttttaatggctgcatagtatcccatggtatatatgtgc 
cacattctcttaatccagtctgtcattgatggacatttgggttggttcaa agtctttgctattgtgaatactgccacaataaacatacatgtgcatgtgt ctttatagtagcacgatttataatcctttgggtatatacccTAAGACctg ggacgcatttaaagcagtgtgtaaagagacatttatagcactaaatgccc acaagagaCCTCTGCCTGAGAACGTGGGTTTCAGCCTAAGAGTTGTAATA TGTGTGCCCATTCACAGGTGCTGCATCAGAGTCCCAGGTGGGAAGAAGGC AAGCATACACAAAAATGGTAAAAGGCAGAAAGGAGCCCAGTCTCGTTCTT TTTAAGAAGTTTTCCTAAGAATCTCCACCCAGCGACTTGCTCTCACATCT TCTTGGCCAGCACTGGACCACACAACTCCTTCTAGATACAGAGGAGTCCT AGGATTCTATGAGAAAGAAGGGGAGGGTGGGCAAAGGGCAGCCAGCTGTG CAGCATCTGCTGGAGACACCTAACCCTTGGTGGAGGGGTTGTGGTGCTGG gagaaggctttctggacggtgtgacagcagagataaacttaaaggccaag taggagttaccctggtgaagcagggcagggttacaagcattccagcaaca tgaagcagcaGGAGtgttttaattaaaagaaggcagttgctgtaaccaac tataaacaaataaaggcttaaacacaatggaagtttatttctcactaagg gaacatccaaatccatgatactttaagtcagggacccaggttcctcccat ctatggttctgccatcactaatctgggtcttccacaattgccgtgctcct tggaggtgggaagagcaggcggaggacacgtgggaggttttagggacaag cctggaggcagcatgcgtcactcccatgcagagtccattggccaatgctg gctccgatggccacatctcactgcaggggcagctgggaaatacagtctgg ctgtctacccaggaggaagagCAGCCAGTTTCTGCTGCTGATGATCAGGA GGTGGAGAAAATGTTCAGTCAGGCAGGGAGTGGGAATAGACAAGACCACA AGCAGCTTGGTGCCTCTGAAAGGGAGAGGGGTGGAGGGGAGACTAGAGAG GTGGGTAGGAATACTGGATTCCACTGACCACGTGCTGGATGTCACGCTTA GCCCTCCTGCTCTGTGCCGGGTTAGGCACCTGGTGTTTTACGTACATAAT CTCAATTCTGTGAGGGCATCCGACCTGTGGGAAAAGAGCTGTTTGTTTCA AATGCCAGTCCTGCTTcctaacaagtgtttagagcttaatcgtgttcaaa atacatatacaatgtttaatacttacaagaatttggtggggaaaatatta ccatctttcccttttgtgattggagaaaaatgaggctttgaagggtttaa gaacttgcccaaggtcggccaggtgcagtggctcatgtctataatcccaa cactttgggaggctgaggtgggaggatcgcttgaggccaggagttcaaga ccagcctgagcaacatagtgagactttgtctctataaaaaataaataaaT AAATAAAAACAACTTGTCCAAGGTCAGACAGGCAGCCTCTTAGTAAGCAC ACATATCCTCTATATTATACTACCTCTCATGGAGGATCTCCTGTGTTCTA CAAATAGTCTGGACTTGAGCCAGAATGTGTTATAATCCTGGGATCACGGC CAGTGGGCTTAGAAGAAGCCATCTCTTTCTCATGCCAAGATGAGGCTCCC CCAGATTTGCTCAGACTTACCTATAGTCAGCAGCATCGGGGGTCAGGAAA GACTTCACGAAGCCATAAATGCATCCTTCTCGGGGCAGCACCTGGCTCTC CCAGGTGAGAGAGGACTCCATTTTCACAGGCAGGCGTGGGAGCTTCAGCA 
CCCATCTCTGGGCCCAGAATGACCCACTGGAGACCTTACAGCTCTCCTGT CACCCCCAATTCCTGCCCCCTCTGCAGCCTTGGAGGAGAATGGAGCTGAA GGGCCTGCCCTCTGTAGGGTGAGAAAGGGAGGCTAAAGCCTGGTGCCCAC TGCCCTGGCTGCTCCGCATTGCAGGAGCTGCGCCCTTCCTTTCCTGGCAC AGGGTCCACAGCCCCGAAACCCCGTTGTGTGGGAGCTGGGCACAGGGCAG CAGGACTAATCCTTGGAACAGCTCAGGGAGGATTATCCCAGCCACTGTCA GCAGCGGTGCAGCTGGCTCATTCCCATATAGGGGGAGGCCAGAGCCAGGG GCCTGCCACAAGTTGGAAGGCTGGGGAAGGGGAGGCCAGCAGAGGTGTCC TGGCTGTGGGTGGCTCTGAGGGGGCTCTCAGGGGTGGGGCTAAATCTCAG GGGCAGGATTATGTAAATCAAACCAATTCTAGCCACAGATTTAAAGTTTG GAAAAAAAAAAAAACCCAGCCTGGCGGAAAGAATTTAAATTATAAAAACT TAGAAGTATGGAATGTGAAATCATCCTGTAGGTGCTTATTTAACAACGAA ATCATCCCGACACAATGAGCCATATGTGAAAAGTCATCCTTCCCCAACAC ATCCCCCAACAGGCACTCCTCAAGCCTCTCCCACCCAAGTGCTGGCATCC TCCCTGTCCTGCTTCACCTGAGACACCCCTTGTCTCATTAGACATGCAAC TACGGGAGGGGTGACAGGAAGACAAGACACTATTTCCTCAGGCCCAGTTT GGTGTGGGGAGAAAGCCTCCTGATCCTGAAAGCAAGAATTTGACCAGAGC AGAAGTAATCAGTATGCAGATTGATTCTGTGGTATGTTAATGTTTATGCA TAGATTATGAGGACCAGGTGAAAAGTGGGCCAGGGGAGCCAGATGTGTGT GTGAGTCATGGGTGGCTGAGATGAGGACAGGAGGGAAACTGGTTTGGAGG GTGCTGGCGATGGGGTGGGGGTGCCAGGAGGAAGGGAGGCTAGTTGTTTG AATGTCTGCATGAAAAAGCGGACGACAGCGGGGTCTGGGTGAATTCGGGC AACCATTTGGACCGTGGAGAAAACTGCCTGCGTGCGGCTGAGGACCTGCA CTATTAATTTGTTTTTTAGCTAAGGCAAAGATAAATATAAAAACtgatac tccacccagttaccagaaaacatttaggtatgtgtgagacaacttgggta tgtgaacctaccttttcaatgtaaattcagtgaaatctaagtgcagatcc catatttccaataaaaaggtaacatccaaactcagatgtcctatgagtat aaaatacacaaagatcttctggacttagtatgaaaagggatttttttttt gtcaggtacctcactagttatttttaaaataggattgcatgttgaaatga taatcttttggatatattgggttaaataaatttattattaaagttaattt cacttaaaaatgtttaatgtagctactagaaattttaaaattaagcatgt tgctcaccttatgtttctattggacggctctCTCTAGATACAAAGGCTGC CAAGAGGGACCTCACTCTAGCTTCAGGGAGAAGAGAGGAATTAGCAAGGC CAAGCAGAGGCTCCTGAGGGCAGGGCCAAGGGCGGCTTGGTGGGGTGGGG ATGGGATGCACAGAGATAACTCCAACCCTTAAGAAGGTGTTTCCTAGAGC AGGCTGTGACCTGTCAGTTTATATACTGAGGCTTAGGAGCCTCTTGGATG CCCCCAGATCTGCACCCCTGAATTGCCCTGTGCCCCTGCCGTCTTTGTTC CTGTGCTGGCATAGTGGTCTCACCTCCGGCAGtatcaccaccactgggca caagcttctccagcacagcaactgtgtcttatttctccttgtactcccag 
tgttcacaccatgctgcactcacagaagactcttcgttgatattttgtgg acagagagaatGCCTGTGAGAGTGGGCTGAAGTGTGCGTTGGGCTCCAGA GACCTTAAGGAGGGGAGACCAGGTCCTGAGTAAAGTTGAAGGGGAGGGGC TGAGTCCTGCTAGCCAGGAGTCTCATCCCCTGGGGAAGTTCCAGGGACCC CTCAGAAGTGCAAGGGGACGGTGTTAGTGTTAGTCCAGTAACACAGCCCA GAGCCTGCcttccacgtgggtttgacaggagcctcctaactgctcttctg cttccatttttgccccttcagtctattctcaacagggaagccagaggcat ccttaaccatgtcagatcatgtggctcctcagctcaaagccTCATCTCAG AGGAAAGCTCTGGTCCCTTAGAAATGGCCCAAGTGGTGACAGACAGACTC TAAGGtgagcagactgttgctagatatctgggctcggaggactcgccact gctcaaaggcagtgaggattttcgcactagaagctggaggacagggatcc ttgttaggtaggagcagaaagcttagaaaagtggtctcctgcagttacgt ggcaaacacatcatgtaagtgataaattgggtatgcagttgaggagattt ccaagtaaaatgttgaggatgctgcctggtttcttcttactgcttataat atagtgtgagagaagagagataaattgagaaagagactggtttttaaact gttaaaattgaatcaggacttgatgattttgaaaattgtcagtctcccca catggaaaaagatgctgaaattaacaaatggcttctgagcatgtggcata gggtgtaactgtacagtcttttgtgattatgcataaagatcaaaggatgg gagtagcaatgagtcacacagaggtctgttgcaagagattacaagggtgt accatgcagaacctctccaccaaaccttagggcccttgggaagcttcagt gagttaccctgggggccatcttggcaggagctgaaggtagaaaggtagag tttatctctaaaagattcatgggtatggctcttgacaaatcgactatgag ccccaccgaaacccacagaggacaggcaaagggtttgggaaagctgtttc acccacagtgctggcagattggtctgtaggggacagagtgcaaaatgaaa gaagactgtcagagaccccaaactctgctgtcaagaagaaggctgataaa actacttggctgcaaacacgtggatctttcgtgagaaaagaaggatgacc cagaggcagaagcccagaaggcagagccaagagacatggaatcttcccac atcttaaaacctgtttagggaacaccagcatctgtccagctggatttcag aaccaccattccttcatccttcccctgctgcctctttctgaacagcaatg tctcaagctttacccaccattgtgtgttgcatatgtagggggcagatagc ttgtatctttagttttccagatcagaggaacatccaaagaaatctgttct acacctaaacccgatttagatgagattcgggactgtgagcatgaagggat ctcaagaggggtgaatgtgttttgcatgcacaagggacaggagtcttggg gacagaggacaggctgtggtggcagatactaaggtgacccccacaacccc cacctctgccattcacacccttgaataatccccttctctggttgtaagca gaacctgtggcttgcttatgaaggaggcggtatatatgtgattcatgtac tgatcatattgtataagatcactggctggatgcagtggctcgtgcctgta atcccaacactttgggaggctgaggcgggtggatcacctgaggtcaggag ttcgagaccaggctggccaacatggcaaaaccccgcctctactaaaaata 
caaaaattagccaggcatagtggtgcacgcctgtaatcacagctactcaa gaggctgaagcaggagaattgcttgaactcaggaggtggaggtggcagtg agccaagatcgtgccactgcactccagcctcagtgacagagcgagactct gtctcaaaaaataaataaataaaatgttaagatcataacctgtctttctg gggactctctcttgacgcctttgaagaagcaggctgccatgttgcaagct gcctcatggaggggatcagctgcgaggagctaagagccccctccagtcga tgctcaccaggaagctgaggtcttgtgtccagcaccctgcatagaactga atgctgccatgtgagcttggaagcagagccatccacacagctgagcccta gatgagaacccagtgctggctgacaccctgatggcaccttacagaggacc agttaggctgtgccaactcctgacctgcagaagctggggaacactgggtc gtatttgcagctgctggatttgtgggaatttgtcacacagcaatTGGGAG TCACACAGCCTGTGACGCCCCAACAATCCACAcctcctgcatctccctgc cttcacttcctagcacactgccctgactccctctgccgcagccacgctgg ccctctgctgttcttcgaagccaccagggctgcattggctcccagccttt gctctcactgctttctcctcctagagagcccttcctgcatgtatatgttt gactcactcccttgcctccttcagacttgtacttaaaaatctcagtaagc atttccctggctacccttttaaaaattgcaacccacttccatccccatcc ccaacatgccatatttcctttcttctTCttccttcttccttttttttttt ttttttttgacacaggttctctgtcacccagcctggagtgcagtgacatg atctcggctcactgcaacctctgcctcccCAGGCAagaaaaggggaggat gccaataaaggatgcattgatttgtatttactacagtggacatcaagggc acattcttgctgtggccatcaagagactgtataaattctatgacttgtag ttgtcccacttaagaaacaaagaagctgTGCATTTCTTTACTGGTCTAGA GCTGCTCTAGGGCATTTTCTCTACAGCAATTCTAGGTTTCCCCACCTTGT GAGTTTAGCTTTTTCTATATTCAAAGAAAAGTCCTCAGCCAGAGATTCTC AGGAGCTTATAGAACAATCCAAACTCTTGGGAATATTAAGTGGAGAGGGG TACGTGCAAGACACCAACAGCACTAGAAACAGTCCACATCTTTCCATGCG TGGAGGAGTTTATGCTCTATGTGAGTTCACTCCATCATTAATTCTTCAAA CACAAGAGTGTTAAAGGAACAAGAGTTAATGGGTCCTGTCATTACACTTG TTCCCAGGATGACATTCTTCATCTTCCTCTTCTACAACCTGTTCTATATT CCCCTCATGTTTATCCAGTGCTTCTGCTAGTCTAGTTCACTTCCAAAGAC CCATGATTACCATGGCCCTGTCAGGCTGTAATTGCTGCAATTTCCAATTT ACAATTGTCATCATCTATGGTTGATAAAGgtatagcaatatttctatttc ctcatgataatgaaggtcaattacaactgccagtataataacttatttct ttgtctgccaacctacatacacaaggaagccaaaatgacagggagctact aaaactttattcttattggaatgcttactatgtacccagaagaagcattc tccctactccagcagagcttaatgctgtaggtccaggaagctcaaattct ccaagggagttttagtgagaggagccactctcaccctctgcccttggttt acaaacctgtatattctaggacccaatatcttacaatgtccattggttca 
aagtataacatgttaaagcacagagccccaactctgaaaagtaccatccc taaattggcatttagttgcacctttatatccacctttaaaagaaatatct tttaatgttctatcagactgatagattctgtttaatatagtatattatag caccagtggatcatttggttgtatgcatattattgtaccttctctgctac aaaatatattcctttgtcctaaggtgtgttacaaagaacattaggcattc tatgcatctttggatagtttaatggccaagacattgatggcaggagagtc aaagccacaggtggaaaacacatttatcccagtaagaacaaattgctatt cttccactgtagagagggtaaacaatgtgccattacgttgccaattgaat gcctcaatcatgtcaagggctgaacatctatgactgtttctgaaaggtca aacattcaacagaggctgtagctagaactgccttaatgataagagatcat gctgaattacccatgcaaaaccttaatacttgacacttatcactacttta ttcaagagcctattgtgcaagcataagtggctgagTCAGGTTCTCAACTC TGCTCATTAATACTATGCTTGGAGTATACAGTAAGATAAGAAACATAAAT AAGAAGTGTACATTTGTTTcttcctgttttcttctggctattggatcaat tacatcccatcttaagctgacccctgtgtaattaatcaatatccgtttta agcagcaatccatagttgtgcagaaattagaaaactgacccacacagaaa aactAATTGTGAGAACCAATATTATACTAAATTCATTTGACAATTCTCAG CAAAGTGCTGGGTTGATCTCTATTTACGCTTTTCTTAAACACACAAAATA CAAAAGTTAACCCATATGGAATGCAATGGAGGAAATCAATGACATATCAG ATCTAGAAACTAATCAATTAGCAATCAGGAAGGAGTTGTGGTAGGAAGTC TGTGCTGTTGAATGTACACTAATCAATGATTCCTTAAATTATTCACAATA AAAAAAAAGATTAGAATAGTTTTTTTAAAAAAAAAGCCCAGAAACTAATC TAAGTTTTGTCTGGTAATAAAGGTATATTTTCAAAAGAGAGGTAAATAGA TCCACATACTGTGGAGGGAATAAAATACTTTTTGAAAAACAAACAACAAG TTGGATTTTTAGACACATAGAAATTGAATATGTACATTTATAAATATTTT TGGATTGAACTATTTCAAAATTATACCATAAAATAACTTGTAAAAATGTA GGCAAAATGTATATAATTATGGCATGAGGTATGCAACTTTAGGCAAGGAA GCAAAAGCAGAAACCATGAAAAAAGTCTAAATTTTACCATATTGAATTTA AATTTTCAAAAACAAAAATAAAGACAAAGTGGGAAAAATATGTATGCTTC ATGTGTGACAAGCCACTGATACCTATTAAATATGAAGAATATTATAAATC ATATCAATAACCACAACATTCAAGCTGTCAGTTTGAATAGACaatgtaaa tgacaaaactacatactcaacaagataacagcaaaccagcttcgacagca cgttaaaggggtcatacaacataatcgagtagaatttatctctgagatgc aagaatggttcaaaatatggaaaccaataaatgtgatatgccacactaac agaataaaaaataaaaatcatattatcatctcaatagatgcagaaaaagc attaacaaaagtaaacattctttcataataagacatcagataaaacaaat taggaatagaaggaatgtaccgcaacacaataaaggccatatataacaag cccacagctaacatcataatagtaaaatcatcacactggtaaaaaaaatg aaagcttttcctctaaggtcagaaataatataaaggttcccactcttgct 
atttctattccatatcgtactaaaagtcctagccaggacaattagacaaa ataaaaataaaaacacccaaattggaaagatagaagcaaacttttctgtt tacagataacataatcttatatgtagaaaccccttaaaacttcagcaaaa aaaaaaaaaaaactacagagctagtaaattcagtgaagttgcagaataca aaatcaacatacaaaaatcagtagtgtctctatacactaataaggactta acagagaaagaagttaagaaaacaataccactaacaatagaatccaaaaa ataaaatacttaggaataaattttaccaaacatctgtacactaaaaacta taaaacattgaaaaaagaagttgaataagacacatataaatagaaagcta tctcatgttaatagattagaaaaagtaatattgttaagatgtcctcacta cttaaagcaatttatagatctaatgcatttattgcaatctcttcaaaatc ccaaaggtatttttgacagaaataaaaaaaaaattctaaaatatgcatga aaccacaaaagactgtgaatagctaaagcaatcttgagcaagatgaacaa cactggaagcatcacactaccttatttcaaaatctactacaaagctatag tgatcaaagcaacatgatactgtcataaaaacacacagataaacctatgg aatggaataaagagcacagaaataagtccacacatttacattcaattgat tttcaacaacaatgtcaagaagacaatggggaaaagacaatctcttcaat aaatgatgctggaaaaactatatatccacatgcagaagaatgcagttgaa tcctgatttcataccatatgcaaaattcaactggaaatggattaaataca aatttaaaacatgaaatggtataactattagaacaaaacatagaaaatat tcttcctgacattggtttgggccatcatttttctgatatgactctaaaag cacaggcaaaaaaagaaaaaatagacaaatgagactatgccaaattaaaa aatttctaacaacaaaagaaacgatcaatagagtgaaaaagataacctct tgaatgggagaaatatttgcaaactactcatccaaccggggattgatatc cagaatatacaagtaacacaaatatgtcaaaagtaaaataaataaataaa taaataaataaataaattaaataaattatttaaaaatcggcagaggacag gaatagacatttctcaggagacaacatacaaagggccacagatacatcaa aaaatgctcaacatcactatttgtcagggaagtactaattaaaaccaaaa tgagatgtcccctcaaacctgttagaatggctattatcaaaaagatgaaa gatagcaactatcagagaggatgatagaaaagggaacccttgcatcatgt acaaattaaaaatagaactatcacatgatccaagaatcctacttctgggt atatagccaaaggaattgaaatcaatatgtcaaagggatatctgcactcc tatgttattgcagcatgttcacaatggccaagatatagaatcaacctaac tgttcatagacagatgaatggataaatgaaatgtgatatggaaaattatt cagccttaaaaacagtaggaaattctgtcatttgagacaacgtggatgaa cctagaggacattaagctaagtgaaataagctagacacagaaagacaaat attgcatgatctcacttagaatctaaaaaatctgaactcatagaagcaga gaatagtatgatggttactagggttatctggcagggagaggatgaggaaa tgggacattgttaataaaaggaaaaaaattcaattagtaggattacattc aggggacccaatatacgacatgttgactgtaattaataatgtattgtatg 
cttgaaaattgctaatacagtatattgtaaatgttaatatgaggtaatat atgtgttaattaacttgatttattcattcaacaacatacacatatattaa aacatcacactgtattccacaaatatatataatttttgtcaattaaaaaa taaTTTTTAAAAATGAGAAACAAAAAAGCTGACATTTTCAGATTAAAAAA ATTATACAGAAGAATTAATTCATTAAAGTAAAAACAAATGTGGGAAAATG GTTTTTAAATATAATTTAAACCAAATTTAAAATAAGcatataaagactat ggacaaaacaagaaatccaaataaaaaataaacatatgaagaatattcaa actcactttttatcaaagaaatgtaaattttaaaataTAGCATTGCTATT GTGTTTTCATAAATAATAATATATCATGGATGAGCCTGTGAGGAAACAGA CACTCATACTCTGCAAAGCAATGACTAAgataattatgtcagatcatgaa ttacgttaattagcttgatggtggtcactgtttcacgataaatatacata tgtatcaaaacatcacattacacaccataaagatatataacttgttatCA AAAAGAAATATAGCAGttaaaatttaaaatttttaaaaaaCGTCTTTTTG AGGTTCGTACCTCACTTAAGTCACACTGTTCAAAATATTCATGCACTCAT TTCTCTCATTCATGTGTTAATGTACAGGGTACGGGCCACTATAAATTCCT TCAGCAACTGGAAAGGAAACTTTATGTACTGAGTGCTCAGAGTTGTATTA ACTTTTTTTTTTTTTtgagcagcagcaagatttattgtgaagagtgaaag aacaaagcttccacagtgtggaaggggacccgagcggtttgccCAGTTGT ATTAACTTCTAATTCAACACTTTAAGATTCTTAGCATTATTGCAGACAAC ATcagcttcacaagtgtgtgtcctgtgcagttgaacaagatcccacactt aaaaggatcctacactttttttaatgctctgctgtttctgccttgaaatt cttaacaatttttttaaccaaagtcctcacaaattcagtttacattagcc ctgcaatcatgtagacatcctgATTCCAGACAATGTGTCTGGAGGCAGGG TTTACAGGACTTCAAGAACCTTACCTTCTCAACTTTCATCTGCATCTTTA CTCCCAACTATATATGAAGATGATGAAGATAGATATGGATGGTGCTTCTA CCATACCCTCTTCCTCTGCCAAACTTCCTTGATCTAGGATAAggtcagta aacttcttccgtaaaaggccaaaagtaaatattataggctctacaggccc tagagtgtctgtcataactactcaactcttattgtagcataaaaactgtc aacagacaatacagaaacaaatgagtgtgactgggttccagtgaaacttt atttacaaaagatttgtcccatgagtcaaatttaccacctccAGATCTAG AGAAACAGTTTTGAGCCCTTTTATTTTGCTCAACAGTTAAGCATGGCTCC ATGTCCCTTATATTTAGTCAGAACTCGGTATGTTTTAAGGAAAGAATGGT TACACGAAGACATACATTCATTCATTTATACAACACATTTTCAGTGTTGA ATGATAAATTTTGGAATAGTTAACAGATGATAAAAGTGTTGTTTTCAGTC ATCCCTATCCAATGAAGTAAAAAAAAAAGTGTTGAATGGGAAGAAATCAA GAATAGTTATACGAATATCACCATTGCATTAAAGCTCTCTTCCTTGTTTC TAAAAGAATATCTTGACACACATTAAGCTCACTGACCCCCACACCATGAA TGAGGGCATCTTCAACAATGGTGGATGACGTCTTAGTTTCCCTCAACTCA GTTAATCTAAGTAAGCTCATGGTATCACTTTCCTGTCCTAGAGGGAACAT 
ATTTCCTGCATTTTTCTTTTTTTCCTTACTTTCCATCACCAAGTAACTCT TCTGATATTTTTTCTCTTGAGAAAATTAATATGACTCATAGATCTGGTTC CCAAGAGAAATCAATGGAGGCCTGGTTACAAGGATCTAAGAAGCATCAAT GGGTCACTAACATCTAGTGGTACTAATTAACTCTGTTAATCATTGGGAAG AAAATGTATATATACTTTTGTCTTGGAGCTGATTCTACTAGAAAGCAGAA ATCAAAATGATCAGTTTCCCAGTGTCACTACTGCACACCCTGGAACAGAA CAGGTAGGTCAGAAAAACGCTCCCAAAGTTTAGCAATGTCAAGGCAATCT CTCTCTTCTTACATTTCCCTTCAACCTTCTATCTCCTCCACTTTTCTGTT TTCCTCCTATCTCCAATTATTTCAATCCTCAGAGCATTATTCTTACAATC TTAATCACTAAATTATATTACACCCGTTAAAGGAGAGATTTCTAAATGCA TTGACATTTGTACTGTCTCTCTTTGGAGAATTAGTATTATAAGGATCTGT TATCTCTTGTCACCTTCCTTATGTCATATGATATGTCACATTTCCCACTG CGGAGACCAAACATGTTCACATCGTGTGCGTTCCATTTTCCTAATGGAAA GTGGGGGGAAGTGATTTTCTGTCCTCATATAGAGAATGCTGGGGCCATTC CCTCTGTATGCCATATTTGATAAAGCATTTGATAATCTTAGTCAATGCCT GGGCCAAGAATTAAAGGGGTAATTATCAGAATGAAAATGGTTTAATGAAA CTGTGTCTATCAGTTCTGAAAAGGGCCTCTATCACAATGAACTAAGGTAG TTATGAATAGAGCTAAaacttaggcaacaccatcctggacataggaacgg gcaaagatttcatgacaaagacacggaaaccaatcacaacaaaagcaaaa attgagaagtggaatctaataaaacaatagcttctgcacagcaaaagaag ctaccaacaaagtaaacagacaacctacagaatgggagaaaatatttgcc aactgtaagtctgacaaaaatctaatatctggcagctataaggaacttaa atttacaagacaaaaacaaccccattaaaaagtgggcaaagaacatgaat agacactctcaaaagaagatatacatatggttaacaagcatatgaaaaaa aagctcaatatactgagcattagagaaatgcaaatcaaaaccatattgag atatcatctcataccaggcagaatggctattattaaaaagtcaaaaataa cagatatcggtgaggttacagagaaaagggaacacttatacactgttggt gggactgtaaattatttcaaccattgtggaaagcagtatgggatggcgat tcctcaaaaagccaaaaacagaactatcattcaacccagcaattccatta ctgggtatatacccagaagaatataaatcgttctaccataaagacgcatg catgagaatgttcattgcagcactactcacaatagcagagacatggaatc aacttaaatgcccatcagtaacagactggataaagaaagtgtggtacaga tacaccgtggattactatgcagccataaaaaagaacaagatcatgtcttt gacaggaacatggatggagctggaggctactatccttagcaagctaaggc aggaacagaaatccaaataccgcatgttctcacttatgagcgtgagataa atgatgagaacttgtaaacacaaagaaggaaacaacaggcagtggggtct acttgaggacgacgggaagagggagaggagcagaaaagataactactgac taccgggcgctacctgggggatgaaacaatctgtacaacgaacccccagg acatgagtttacctatgtaacaaaccttcacgtgtacccccgaacctaaa 
ataaaagtcaaaaagaaaAAGAAAAAAAGAAAAATCCATGCATATGATAC ATCAGTTAACAAGGCACTGGTGAAATTAATTTTAAGTATTATTGTCTCTT TGTGTTTTTGGTCTCAGAAAAGTTACGATTTCCCTTAGTTCCTTAGGGCA GAGAGAATCTTCAATCACTGAAGTCAGGAGACACACATTCTATCTGATTT TCTACATTATCTGTTTGAAAAGGTTACCCACTTATTAGTGTTAAAGCCAA GATATCCAGCAAGGATAGCAACCAACTCTTAAGGTACTCTCCCTTAGGAG GATTCCTGATTCTTTAATGTTTTCTAAAAAAGCAAAACAAACAAACAAAC AAAACAAAACACTAAATGTTTTCTCTTTCAACTTATTTGAATACACTCTT TTCTCACTGCTCTGAGCATGAATTCAATATTTCAGGGCAAACTAACTGAA TGTTAGAACCAACTCCTGATAAGTCTTGAACAAAAGATAGGATCCTCTAT AAACAGGTTAATCGCCACGACATAGTAGTATTTAGAGTTACTAGTAAGCC TGATGCCACTACACAATTCTAGCTTTTCTCTTTAGGATGATTGTTTCATT CAGTCTTATCTCTTTTAGAAAACATAGGaaaaaattatttaataataaaa tttaattGGCAAAATGAAGGTATGGCTTATAAGAGTGTTTTCCTATTGTT TTCAGTGTAGGACTCACTGTTCTAAATAACTGGGACACCCAAGGATTCTG TAAAATGCCATCCAGTTATCATTTATATTCCCTAACTCAAAATTCATTCA CATGTATTCATTTTTTTCTAAACAAATTAGCATGTAGAATTCTGGTTAAA ATTTGGCATAGAACACCCGGGTATTTTTTCATAATGCACCCAATAACTGT CATTCACTAATTGAGAATGGTGATTTAACAAAGGATAATAAAGTTATGAA ACCAATGCCACAAAACATCTGTCTCTAACTGgtgtgtgtgtgtgtgtgtg tgtgtgtgtgtgtAAGAGGGAGAGAGAGAAAATTTCACTCCCTCCATAAA TCTCACAGTATTCTTTTCTTtttcctttcctttccttgctcttctttctc tcctattgctttcctttcatttccttCTCATAAAAGAAAAATAACAATAT AGAAAATAACAAAATATAGATGGTCAACCTTTTTAATATTAAGGTTACCT AAAATGCCATTATCCAAAGTGGTTCTCTAGAGATGCTGATGTATATACTT ACATATTTTACAGTGTATTCAAATAAAGAGTATATTACATAAGACATATC CTTTTGTAACCAACTTTTGTCATTAACAATTTACTGGACTTGTCAACAAA CCTAAATCTGTATCGTCTATAATGGCTACGTTCATTTTGGTATGAATCTT AATTACCCCTTTCTGCATTATTTAATGATTTTCTCATATGTCACTCTTAA ATGTACTTCTAATTTTTCACTTTACATCACATAATGAATGGATCCAAATA TGTTATGGATAGATATCTTCAAACTTTCTACTTACAAGTAGTGATAATAA CAGATGTTCTCTCTAAAGTGTAGTTGGTATCAATTTTACTGACCTTTAAA AATATCTTAATGGGACAAAGTTCAAATATTTGATGACCAGCTATCGTGAC CTTTATCTCTGTGGCTCTGTGGGCCTGTAGTTTTTACGTGCTTTTAGTGT ATCATGATTAAATATTTTGTTTTAGTAAAGACACCATTATTTCCCAACTT CATATTCAAATTGTCAAAGGTATTAATCCTAGAGCAGAACTCTCAAAAGC ACCAACTCTGATTCCTAACAAAGCATGGAAAAGCCCTCTCTCTGAGTTTC AGATACTCTTTTTTGTGGGGGTTGAGTTTCACTTTATTTAAAGTGAGTCT TAATCCTCCAACAAGTCAACAAGTGATTGGCTGGAATCACACGTATTGGA 
AAACCAGCGGAAGAGTAAGTCTTTGTATTTTATGCTACTGTACCTCTGGG ATTAATTGCTCTTTCCCTCATTGGCCAGTCACTCTTAGTGTGTGATTAAT GCCTGAGACTGTGTGAAGTAAGAGATGGATCAGAggccgggcgcgggggc tcgcgcctgtcatcccagcactttgggaggccgaggcgggcggatcacga ggtcaggagatcgagaccatcctggctaacacggggaaaccccgtctcca ctaaaaatacaaaaagttagccgggcgcggtggcgggcgcctgcggtccc agctgctggggaggccgaggcgggagcatggcgggaaccgggaggcggag cctgcagtgagccgagatggcgccaccgcactccagcctgggcgacccag cgagactccgcctcaaaaaaaaaaaaagaaGATTGATCAGAGAGTACCTC CCCTAAGGGTACATGCAGATAAATACAGTTAAGGCGATTAACATTTCAAA TACGGTGACTGTTTCTTACGTGGACGACGTTGTGTTGAACATGGGTGAGT AAGACTGAAGCAGCCGTAATTACTGCACGATGCGCATGGTAAAGAAGCAC TCCGTTAGGGAAATTATATTCTTTGCCCCTCTAATCCTTCACTCCACCTG CCATATTCCCACATGATTTTTTTCTTTGCTGTTCTTGTCTAATTGttatt aataattaataaataaCTTATGATCTAATTGTTATTAATAATAACTTATC ATCACATGatttattaataaattaataaataacttattatCACCGCATTT CCCCAATTCATTTATCTTTCTTTCATTTTCTCTCTTTGTGTGTTTTCTGT CTTCATATTTCAGCACTTGCCACATATTTCCCACAAAATCATTTATGGTC AAACAACACTTCAACGTGTAGCATTTGTATTTCTCAATTCTTCCTCACTT TCTTCCTTCAGAATACTAAAGCTTCTTCTCTACTGACTGAGTCAATGGCC AATGGATAGAGTAAATAATTCTGCGGTATCTAAATTTGTATTGATTGGAC TTTCAAGCTCTTGGGAGATGCATCTTTTTCTTTTTTGGTTCTTCTCTGTG TTCTACATGGGAATTATCCTGGAAAATCTCTTCATTGTGTTCACAGTAAT TATTGACTCTCATTTAAATTCCCCAGGTACTGCCTACTGGCCAACATTTA TCTTCTTGATCTGGGTCTTCTCCTACAGTTCTGACTTTTTCACTAACTGC AGCATCATTTCTTTTCCAAGATGCATCATACAGATATTTTTCATTTGTGT CATGCGTAAAAATTGAGATGGTGCTGCTCATAACCATGGCATAGAGCAGG TACACTGCCAATCTGTAAGCCTCCCCATTACCTGACCACAATGAACCCCA AAATGTGTGTTTCCTTTGTTGGAGGCATCCTGGATAGTCAGGATAATCCA TGCTGTATCTCAGTTTGTTTTTGCCATAAACTTGCCTTTTTGTGGCCCTA ATAGAGTAGGTAGTTTTCACTGTGATTTTCCTTATGTCATGAAACTTGCT TGTGTAGACACTTACAAACTAGAGGTTGTAGTCACTGCTAACAGTGGGCT TATATCCATAGCTACCTGTTTCTTATTAATAATATCCTATATTTTCATTT CGGTAACCGTCTAGAATCCTTCTTCAGGAGACTTATCTAAAGCATTTGTG TCATGTTAGATCACATCACAGTAGGGATTTTGTTTTTTATGCCATGTATA TTTCTGTATGTGTAGCCTTTGCCTAAAACAACACATGATTAATATTTGTT CATTGTTCCTTTTGCTATCACCCCTGTCTAGGATCTACACATTAAGAAAC AAAGACATGAACGTCTCCATGGAAAGACTGGGAAAATGGATTGCAGGTTC TAGCAGGATGTCATAATAAATGGTGCATATCCAGAGTGCAAGATGATTCA 
GTCTCACCAAGAACACTGAAAGTCACATGGCTACCAGCATTATTGTGATA AGAACTACTATTTTGGGAGATAGTTTAGCAAAGGTGCCATGTAGAAATTG ATTAAGTCAGAGGTATCTTTAACTTGCCACCACAGAGAAGAGATTAATTT CATATACTTCCATTGAGAAGAGAGATAAGAATACAAAACCAAGCTGATTT GCAGGAGTAAACTTGATATTCAAATACTATTTCCTGAATGACATTTTCTG AGACATGCTAATTGTAATTACTTTCAGCTTCAAAACATAATAAATTTATC TCATAGTAAGCATATAGATGGAATAAATAAAATGTGAACTTAGGtaaatt ataaattaataaagtatatttttaaaatttCCATTTTAATTTCTGTTTAA ATTAGAATAAGAAACAAAAACAACTATGTAATACGTGTGCAAAGCCCTGA ACTGAGATTTGACTTTACCTTGAGCTTTGTCAGTTTACGATGCTATTTCA GTTTTGTGCTCAGATTTGAGTGATTGCAGGAAGAGAATAAATTTCTTTAA TGCTGTCAAGACTTTAAATAGATACAGACAGAGCATTTTCACTTTTTCCT ACATCTCTATTATTCTAAAAATGAGAACATTCCAAAAGTCAACCATCCAA GTTTATTCTAAATAGATGTGTAGAAATAACAGTTGTTTCACAGGAGACTA ATCGCCCAAGGATATGTGTTTAGAGGTACTGGTTTCTTAAATAAGGTTTT CTAGTCAGGCAAAAGATTCCCTGGAGCTTATGCATCTGTGGTTGATATTT TGGGATAAGAATAAAGCTAGAAATGGTGAGGCATATTCAATTTCATTGAA GATTTCTGCATTCAAAATAAAAACTCTATTGAAGTTACACATACTTTTTT CATGTATTTGTTTCTACTGCTTTGTAAATTATAACAGCTCAATTAAGAGA AACCGTACCTATGCTATTTTGTCCTGTGATTCTCCAAGAACCTTCCTAAG TTATTCTACTTAATTGCTTTATCACTCATATGAATGGGAATTTCTTCTCT TAATTGCTGCTAATctcccccatcttcaaatactctaccgggcttctgga acaccacagcttcctggctttttctcctacctcctgggcaagtccttccc tgtgtcttttgttgagtgttcctcatctgcttaactaccaatcaacctat tgcccctaatttgatctttggcctgttttcacttagattctatccctacg tatcacccattcccacagctttaatcaccatctaaacactaggggctctc aaaccttgtatttttctttctttctttctttctttctttctttctttctt tctttctttctttctttctttcttcctccttttctttccttttctttctt tcattctttctttcttttttAaggggcagggtctcactatgttgctgagg ctggtctcaaactcctgacctcaagcaatctgtctgcttcagcctcccaa gtagctgagaatacagggacaagccattgcacctgacCctggtactattt cttgagttcctgatccacagatctaacctcctactttcctggatgccaca caagatcttccactcaacaagtctgcaactaaactagccttcctcttttc aaacctactcttctttcagtgttctcagtcacaataatttgtaccaacta gttacctagttgcacaacccaaaatctgggaaaaataatagatttctttc tccatagtacccccaaatcaataaatcatcaagtcttattctaccttcca aagagccttacatatgttcctttattttcatctgtaacaccactattcct gtctaagcctacctatgtcatttttggaagagaatatagtcacctatgcg accttcccacttaaaatcctactatttacgcttcagtaaaagaaaaaaaa 
tttttaatctaagtatgtaattcttttgctgaagacacttcacttgcttc tgtgcccttaaactggtatgttatcatggtatagtaggccatccaagacc tggcttccttcctttttttcagtctcagagaataacatactctttccctg caactccagatccaatttggttttcttttacttgcctggaaactccaaaa tctatcaactctggggctttccactagctaatcattttgtatacaatatt tgtccttcATGTTTTGCCTCTTAACATCTCAGCTTTCAGTTTCATCATTT TACCAGGGAGGCCTCCCAGAACCTGAGTCCAGAAGAGTTCCTTCCATTGT ATATTCCTCTAGCACTACCTATTACCTCTTTTGTAAGACTAACAGCCCTC AAAATTTTTCATTCAGTGATGTCTTCCTCATTGCATTTTAAGTTCAACAT GAGCAGGACTTTGTCGTGTTCACCTCTATCACATCATAAATATAGCAAAC AGTAAAACTATTGCAACATGACTAATGTATTGAACGATGCTTCAGCTTTC TTCTTACGTTCAATCACAGGTCATATGACTAAAGAACTTCCTTTTTAATC TCCTTTTCTATTCTCAATTAATTTCTTCTGCCTGCATCACCTCAAGTCTC TGGGGTGAAATCCACTAATGAATTCCTTTTGCAGCTTAAGCCAATTCCAA TCTTGAGCCAATCTCAGGTGAAGAAGCCTGTAAATTATCACTCTCAGTCC TCTCTTGTACTACTAGGTCTCATGAACTCTTCATTAACAACTCCAGCTTC TCTGTTAGCCCAAAAGCCTTTTGCTGCCTAGAAAACCCATGATTCATGCC TCAGGAAACAGCCTTCAAATCACAACATGTTCTGTATCTGGCTGGCCAAC TCCCTGCAACTTATTTCTGCCTAGATTCTCCCTCATTCATTTCAATACGC TGTTCGGCCTGCTACCCCAGTTTCCCACTTAGAACAATGGCACACAGGAC AGGAGCACATTGGCACATCAGAATGACTTATGTACTGCTCATTGTGTTGC AGAAGAGACCTCTGTGGGGGCAATAGAACAGATTTTCCTCTCACGTCACT GTAGTTGTGGTTTCCCTAAGCACCTACACTGTTTCACCTCATCTTAGGTA GACAATAATCCATGTAACTGACTGTGTATCCTAATTTTAAAAAATATTTC TGCCCACATTATTCTGCAGTTTTTATCTTGCTTACGTATTTTTGGAATGT TACTATTTTTCAAAAATTAATTTGGGATCAACCAACACTTCTTATTCTGC TGCTGTTCTAGAGAAAATCATTTTCCTCATTTCTGAACAAGAGAAAATGA AATACAGCTCTAAACAAATGCCACTGTAAACCAAGGTGGAGCCTTTGCAC TTTCAGGCCACCATGATAACCTGGAGATTAGATTTTTCTGTGTCTTTATA TCAATAATAAAGCCAAGCTTCTCCAGGGGTATCCACTAGGCTTGTCTCAA TGGCTCAATACAGGTCCTTTTGTGAATGATTACCTCACCCTCATGGAAAC ACACTCTTGTTACAGAAACTCAGAATGATTCTATTTTTTCTTTTATATTT GTATATGTTTTTCCAATACCTCTGAAAAAACTGATCCAAAAAAAATACAA ATTTTAATTGTAGCCAGTCAATTCAGGAAGGATAAAGGTCAAAAACTTTC AAAGAAACCTTCAGCCCCAACACACTAAACTTTGGGAGCACAGGTTGGCA TCCAGAGGTAAACATTTGCTATAACTGATAACAGGAGAAGGATCCATTTA TTCACCTGTTATCAATTACAGGCATTGTATTTAAAGATCAGATGTTTTAT ATTTATTTCTTCAAATTTCATTCATGGTGCCATAAGTGAAGGTATCTCTG TCCACCCTGAATATATTTTCACTCCCTCATCTCAGTCATTCCGAACAATT 
CACACACTAAGATTACCCATGCTAAATGGGGATTCTTTTTTACTAGCCAA TGTAGTACCTCAAATCCTTCCTTCCCTCCCCCTATTTCATCAGCAGGCAA TTCTTTTGATACTTTTGTCAAGGGGAAATTGTGTGACTCAGAGATCTAGT CCCCAAGAGAAACTAATAATGGGCTGGGTATTGTCTGTCTCAGCAGCATC AGTGGGTCCCTCTCCTGTGCAGCTAATTAGCTTCCTTTCCAATATGAAGA ATCTTATATATAGCTTTGTCTTTGGGGTATTACATAAATGAAGATTAAGC TATCTGAATTTCTCCTTCTCCTAAAAATGCACATCCTATGACTGAAAAGA CAGGTAAAAGAGATGCTTTTAATTACAAAACTTTCCCTGTCGTGGTTGCT TCTCTCTATCCTTCTAAACTCCCTTTCAATTTCTTCTCTTCTGTAACATA TTTGTGCCCAAAATCTTCTGCTTTCTGAAATATTTTATCTTTTTCTTCCA CACTATCTCTTATTTTCCAATTTTAATCATTAAATTATATTATGTCTTAT AAAACTAATCCCACATATAAACCCCTATGATAATTTCAGTTTGTCCCTAG TATGAAGTTCTTTAAAGATGTGTAGTTTTCTAACTTTCATGCTCTCCAAT TAATTATAAACTTCATTTTCCACTCTGAAAAGGAGATGTCTGATCTCAGC TATTTCCATCCTATTTGAAAACCAGATTTAGTTTTAAACCAGAGGAAGGG AATCTCAAGTCTTTACCTCCCACAGTCTGGTGTGATTCTCTCTCTTTTGG TATTACCTTCCTCCACATTGGAACACTCCAGCCAATGCATAGGCTGAGAG GCTATCTCAGATTCAGAAAGATTTGGCCTCATCCCAGGGGAGGGTACAGA GGAGCTGATGACTATGAATTCTGAAATGGAACTGTTCCAGGTTGAAGAAA TAAGAAAGGGAATTGGGAAGAGCAATGCCCAGTGAAAAAGAAGAAATAAT ATTTTAGGAAGTGAATGCTAATTTTATTTTAAACAAAATAAGAACTCAAG GAATAAGAGGGTTCTTCCAATAGGTTAGAGTGATCCTGTCAAACATATAT GCTTCTAGATTTTTTTAAAGACTGTTTCTACTAAGAAAGCATAGACCGCT ATTGAGAAAGATCATTAAACTGGAATTTAGGAGGTCTGCCTTCTGATTCT GACTTCTTGAATGTATTGTTAGCCATTTAACCACACTGTGTTGTTTCTCA TTCTACCTGTAGAATCTCAAAGTTCTTTCCCACTTCTATACAAAACTATA ATTCTGAACATCCTTTTTGTTTAATATAAGTCTGCATTTCCTGTTTGAAG ATATGTGTCCCAGACCCTAAATGACTGACAAATTTTAAATCTCCAATAGG AAAGATGACAAACTCTATGGAAACTTGGCTTCTGAAGAACTCCTAGAAGC TTTCCAAAGTCATCAGTGTTTCCTAAGAAGGCAGAGAAATCAAACACATG GTCTTTTCCTCCAGACAAGCTCCTTTGGGTCATCAGGATTTCTTCAACAA TAAAATGTAATAATTCCAAATGTTTGTAACAGAATGGGTAGGACTTTCTT CACTTATTTAAATACTCCCTTTTTTATGCAACTGAGTTTTCATCAACAAG TACAAGCTTGTGAAGGAGTACTTTAAAATGCAATTTCTCTCTATTTTTGT GGGGGCTAATATTTTATTTCTCATATTGACAATTTATTATGCTGTTTTTA AAAAGttcattcatcaagtatttcttgagctttttctatgagacaggcac tgttttaggcaagtaattatgcactgaacaatgcaaaaagtttccctgca ctcatggactttaattttacatttatgaaaagctacaaatattagaataa gtaaaataCTGCCTGGAGGCTAAAGCATATTTTGATCACTTATTCCCTAA 
TTCTTTTAGAAGAGAACTCACCTGTCGGTTAGCTGAACCACTGCCAGTGA TATCCAACTATACATTCAATCCCACCATACCTCATTATCACACCTATTCA CTCACAAGCTTAAACTCTTAACTTTTCTCCACATATCAGTGACTATTTCC TACAGCTTTTCTTTTACTTTCCATGTTTGCAGTGACAATATACATAAACA GTGTATGAAAACTCAAGTAAAATCTACTCTCTCAGGTGTTCATAATGTAT CAATGTATATTGCTTTAAGCCTGAAGGTAACCTAAGTAAAGATGTACCAT GTTCCACCAATGCTTCTTTTGATCATCATTTTATCCTGTTTTTTCTTTAG GATTCTTTCTTATTCCTTCCCCTGACCCTTCTTTTATTCTCCAAATTTCT TTCCAATTCATCTTTGTTCTTCCCTTTCCTTTTTACTCTCTTTAAACATT CTATGGACTCTGCCTCCTTCACACTGATATTGAACGCCCATAGTTTCATA TTTTGGATTGCGATTGTTTTATTTTAAAATGGCAAATGTTCATGTTATAA AGAGAATTTTTCAGTCTTTAGACTAATAGGTTCATGTAGTTTGGGATTTT CCTCTTTAAGAAAATTAATTATCACTCACACTCCAAGACAAACACCATTT CAGTAGCAATATGAATTTCAGTAGTAATAGGAATCTCCAAATATGACAAA GTAATTCAGACATTAATTGCTTTTGTTTTGGAATTGCTCTTATAAGATGA AATATCACTTTCATGATGAGAGTCCTAGAGTGCTTGGTTTATATATTGTA TCTTAGTTTTAACAGGATAAAACACTTGATCCTAAGCAGTAAACATGATT CTTCAGCTTCAACTTCATTTCTTTATAAATAACTATTTATGAATTGGTGT TGAGCTTAGTAAGTCACCAAACACCTTCTGCTCAGCAGCATAAAGGACAT TTCCATGAAACCTCCCAGGGATAATCTTATTTACTCTATAATGTTTCCCG GGTTCAATTCCTCTCCCAAAATTCTTTGTTCTTAAGCCCCTATGATCTGG GTGATCTAAATATGGGTAAGAAGTCCAGGGATAGCACTATGAATGAAGTG AAAATAGTAAAACATAGTTAAAAATGTAcagatgctctctgacttataat agggttacgtcctgataaatccatcataagtcaaaaatgcatttaatatt cctaatgtacctcacatcatagtttggcctagcctaccttaaatgtgctc agaacactttcattagcttatataagatcacctaatacaaagcctatttt ataataaaatattgaatagctcacgtaatatactgactactatactcaag tacagtttcttctgaatgcatgtcactttctcaccattgtaaagtcaaac aattataagtcaaactatcacaagccagggaccatcCATATGTATTTCAT TCAGAAAATGCTGGAAAGAGCATTTCGGAGAATATCTAGATGAGAGAAGG TAGAAAGCCATGCACAAATTCACTGAGAGTTTAAAAAAATACATGCATAT TGTGGAGATAGAAATCAAATCTATTTGTCTCCATCTGCTGTATTCTTCCC AAAATATTATCTCTTCTTATCCCATTGTACTATATTGCATTTCTTTGACC ATTTATTGTGTATCTCTTAATATTTCCCACTTCATCATTACTAACCTCAC TCACTCTGAACTTGATGAGAGCACCTGAGCATTAATTTTTCTTATAATTA TTTAATGATTACCAGAATTCGTTCAGTATGGCCAGCTCTGGTCAAAGTGA GGCAGGCAAGATGCTTTGTCAACTGCCTGGATGGAATGTCTCAAAAGGTT TCCATTTCATGGTAGCATTATGCAAAGTTCAAGACGTTTAATCAAGACCC TTCACTTACTTAACTATACCTCCTTGAGAATCCCATCTATGAAAAAATTC 
TAGTCATTATAAAAATGATTGATTAAATGAGGGAAGTAGTAGAGTTCTTC ATTTCTTTAGTTGGTTTAGTCTCCTATGAGTCAATCCTATTTTCAAAATT CTTAATAAACCATTTATTCCTTCAACTTTCTATGCCATTTGATGTTTTGT AAAAAAAAAAATATAATATGTATACAAAAAGATATTTCAAAATCTAGAAA GAGAGCTTTAGAGCTTTGTAAAGCTCTTTTAAAAATCAAAAACAACTACT GTTAATTAACATGTTGTACTATGCAATTTGTTTACCATTATTACTCTTGG TATTTTTAAGAAAAGTCTTTCCATTGTTATTATAAATGCTTCTATTGATA TTTATTTTAATAACTGTTATTACAGTCCGTCATGTACATACACTATACTT AAACCTAATGTTTGGTATTTAAATCGTTTCAAGATTTTATCACTGTCAAC AAAGTATGATGAATATTTTTATGCTGAAAACTTCTGTAAAAATAGaattc caagagtattattgcaccaaaaggcatggacttaaaattcttgatacatg atttcaaaatattttctttaaggtttgaatcagtctatattccctccagc agcgtataaaagtgccaatttctctgatccttagccagtttgggtaataa taattgtaaaacttttttttctttttttttgagacagagtctccctctgt cgccaggctgaagtgcagtggcgcaatctcggctcactgcaacctccgcc tcccggggtcaagctattctcctgcctcagcctcccaagtagctgggact acaggcatgcaccaccatgcccagctaatttttgttatttttagtagaga tggagtttccccatgttggacaggatggtctcgatctcttgacctcgtga tccaccctcctcggcctcccaaagtgctgggataacaggcgtgaacaacc atgcccggcctgtaaaactttttcctaatttaacagaaaaataatagtat tatattttatcatatttctttgatttctaAGacacacatacacacacaca cacacaTATCTGTATATACAAATACACGTATAGCTTACATTTTAATTCTT CATTTCATTTGTTCATTTATTAGGTCTTGGAGATTTTGTGAAACTGTTTA AATTCTTTTTTATACTATGAAGATATCAACCTTTTGTCTCTACAGCATTT CAAATTCAAGTATGATTCACGTGTTGGTTTGGGGTAGATCATTATAGGCA CATGTAGGAAACAGCTTTCAGAGATGCCTTAACCGTAATTATGCATTTGT ATTCTAATTTTTATTTAATGTTATTATTGATTGCATTTTTAAAGATTCTG TATTTTTTAAACCATTTATTTGTATATGTTGGTATACAATCTTGCCATTT TCTGGGATTTCATATTTCCTTATTTTTGTTTTTTACCTTTTTTGGCTTGA ATTTTTTGAGTTTTTATGCATTCTTTTCCAGTTTCTTAAGATGCTAATAA GTTCATGTATTTGAGCAATTGAGAACATTTAAAGCAATAGACTGCCTCTG AGCACAGCTTTGTCCATATTACATTAACCTTTTATACCCTGGGTTCCCAC TAGTTTTTAAATAATCTACTATCAAATAAAAGATTTGTTAATAATAAATT TTAAATCATTAACACTTAACGCATTATTTTCAGTCACACTAAGTTGATTC CTTCGTTTCTTTCAGGTTGCTTCAGAGTCTTCCCTTCTATCTGATTCAGT GGACCAAGTAAATGACTCTCTGGTAACAGAATTTGTATTACTTGGACTTG CACAATCCTTGGAAATGCAGTTTTTCCTTTTTCTCTTCTTCTCTTTATTC TATGTGGGAATTATCCTGGGAAAACTCTTCATTGTGTTCACAGTGATCTT TGATCCTCACTTACACTCCCCCATGTATATTCTGCTGGCCAACCTATCGC 
TCATTGACTTGAGCCTTTCATCTACCACAGTTCCTAGGTTGATCTACGAT CTTTTTACTGATTGTAAAGTTATTTCCTTCCATAATTGCATGATACAAAA GTTCTTTATCCATGTTATGGGAGGAGTTGAAATGGTGCTGCTGATAGTCA TGGCATATGATAGGTACACTGCGATCTGCAAGCCTCTCCACTATCCAACT ATTATGAATCCCAAAATGTGCATGTTTTTGGTAGCAGCAGCTTGGGTCAT TGGGGTGATTCATGCTATGTCTCAGTTTGTTTTTGTCATAAATTTACCCT TCTGTGGCCCTAATAATGTGGGGAGCTTTTATTGTGATTTTCCTCGGGTT ATTAAACTTGCATGCATGGACACTTATGGGCTAGAATTTGTGGTCACTGC CAACAGTGGATTCATATCGATGGGCACCTTCTTTTTCTTAATTGTATCAT ACATTTTTATTCTGGTCACTGTCCAACGACATTCCTCAAATGATTTATCC AAAGCATTCTTCACTTCGTCGGCTCACATCACCGTAGTGGTTTTGTTTTT TGCTCCATGCATGTTTCTCTACGTGTGGCCTTTCCCTACTAAGTCATTGG ATAAATTTTTTGCCATCATGAACTTTGTTGTCACCCCTGTCGTAAATCCT GCCATCTATACTTTAAGGAACAAAGATATGAAGTTTGCAATGAGAAGGCT GAATCAACATATTTTAAATTCTATGGAGACGACATAACACATTTGGTTGA TGAGAGCACAGGATAAATGCCATGGACCATCAAGACTCCTGTGATCACCA TGATCACTATGGAACGCGCACATTTTTAGTATTGCCTGAAAAAACTGAAA AATCTGCAAAAAGGATGCATTAAATCTAAGAATTGTATTTCAGATAAAGT TGCAACATTTTTTGTTAATCATAAAAAGTATATATTTCTATCTAATGTGT GTATCTAATTAACAGCAATGACTACCTTTAATTTTGATGTAGTTATTTTA TATCTGTATATAAGCACATACACATATATATGACCTAGGTTTATTTATCA GTATTTTTATGCTGATAATAAGCATCACTGGAAATTAATTTTCTTATGGA AATTATGTGGATCCAATGGATAAAATATGAGTTTATATAAATTAGTAAAT GCCAAAATCAAGGAAGAAACAATTTTTATTTTAATTGTACTTTAAGTTAG ATAAATGGTAAGGTCAACAGCTTGTTACAACCCTTAAGTATTATTTTCAG GCTGATTGTCAATATGTTTTGTACAatgttctcacttataggtgggaatt gaacaatgagaacacatggacacaggaaggggaacatcacacaccggggc ctgttgtggggtggggggaagggggagggatagcattaggagatataact agtgttaaatgacgagttaatgggtgcagcacacccacatggcacatgta tacatatgtaactaacctgcacattgtgcacatgtaccctagaacttaaa gtataataaaaaaaaaTAGACTCTAGTACTCTGTATTATGCAAAATTTGT CTATGTTACACTTTTTTAACAACACAATCCTATTGCCCTTGAAATCTTCT TCAAAGCATTTCTCGAGTCACTCTTAAAAAGCATCTACAACCTAAAAGTA TAGGAAGAGATTTATTTCCTGGAGAAGAGACCCCATTGAGATCTTAAAAG CACATTTAATGTGCCTGTGCTTAACTTAAGGTGCTTAGGACAAAGAAGGC GATTGACATCTTTCAGGTAAAACCTGGTAAGTTTGGTGGTCAAGGAACAC AACTGAGACATCACTTGGATGTATTCCTATGACTATTTTAAGAAACATAA ATTGTGGTGACTCACTCAGCTCACTTTTAACTACTGCATGGTAATTAAAG ATGCAAAATAAAATAAGTTACAAGAAGTGAGGTTTTTTATTGGTTAAAGC 
AATTTTTCTATATTTTCTCCGCAAGTTGGTCATAAAAGTTCTAAGCATTC CTCTTTTTATAAAATCGAAGCATTATTACTTACTCTCTTGTTAACCTATC TGGATTTTAATTTTGTAACTTTATTATATTTGTTTTGCTGTGATTCTTTA AAAAGCACCTTTAGACTCAGTGAGATAGCAAAAATATCCAAATAGGCCAA AAAATTGTGGCAATGTCCTCTCACTCAGGAAAATTCTGTGTGTTTTCTCT AATGGCCAAGGGAAAACTTGTGAGACTATAAAAGTTAGTCTCAGTACACA AAGCTCAGACTGGCTATTCCCAGATCTCTTCAGGTACATCTAGTCCATTC ATAAAGGGCTTTTAATTAACCAAGTGGTTTACTAAAAAGGACAATTCACT ACATATTATTCTCTTACAGTTTTTATGCCTCATTCTGTGAAAATTGCTGT AGTCTCTTCCAGTTATGAAGAAGGTAGGTGGAAACAAAGACAAAACACAT ATATTAGAAGAATGAATGAAATTGTAGCATTTTATTGACAATGAGATGGT TCTATTAGTAGGAATCTATTCTGCATAATTCCATTTTGTGTTTACCTTCT GGAAAAATGAAAGGATTCTGTATGGTTAACTTAAATACTTAGAGAAATTA ATATGAATAATGTTAGCAAGAATAACCCTTGTTATAAGTATTATGCTGGC AACAATTGTCGAGTCCTCCTCCTCACTCTTCTGGGCTAATTTGTTCTTTT CTCCCCATTTAATAGTCCTTTTCCCCATCTTTCCCCAGGTCCGGTGTTTT CTTACCCACCTCCTTCCCTCCTTTTTATAATACCAGTGAAACTTGGTTTG GAGCATTTCTTTCACATAAAGGTACAaatcatactgctagagttgtgagg atttttacagcttttgaaagaataaactcattttaaaaacaggaaagcta aggcccagagatttttaaatgatattcccatgatcacactgtgaatttgt gccagaacccaaatgcctactcccatctcactgaGACTTACTATAAGGAC ATAAGGCatttatatatatatatattatatatactatatatttatatata ttacatattatatatataatatatattatataatatatattatattatat aatatataatataaatataatataaattatattatataatatataatata aatataatataaattatataaatataatatatattttattatataatata atatatattatataaatataatatataaattatataatataatatatatt atataatataatatattttattatataaatatatattatattatataata tatattttattatataatatatattatatatttatagaatataatatata ttttattatataatatatattatataatatatattatatttatatataac atatattattatataaaatatgtataatatatattatataaatatattta tatattatataaatatatatattatatataatTCTAATGGTTGAATTCCA AGAATAATCTATGGCATGAAAGATTTTACCTGTCAACAGTGGCTGGCTCT TCATGGTTGCTACAATGAGTGTGTAAGATTCTGAAGGACTCCTTTAATAA GCCTAAACTTAATGTTCAACTTAGAATAAATACAATTCTTCTAATTTTTT TTGAATAATTTTTAAAAAGTCAGAAATGAGCTTTGAAAGAATTATGGTGG TGAAGGATCCCCTCAGCAGCACAAATTCAGGAGAGAGATGTCTTAACTAC GTTAGCAAGAAATTCCTTTTGCTAAAGAATAGCATTCCTGAATTCTTACT AACAGCCATGATAGAAAGTCTTTTGCTACAGATGAGAACCCTCGGGTCAA CCTCATCCTTGGCATATTTCATGTGAAGATATAACTTCAAGATTGTCCTT 
GCCTATCAATGAAATGAATTAATTTTATGTCAATGCATATTTAAGGTCTA TTCTAAATTGCACACTTTGATTCAAAAGAAACAGTCCAACCAACCAGTCA GGACAGAAATTATCTCACAATAAAAATCCTATCGTTTGTACTGTCAATGA TTAGTATGATTATATTTATTACCGTGCTAAGCAGAAGAGAAATGAAGTGA ATGTTCATGATTTATTCCACTATTAGACTTCTCTTTATTCTTAAAAATAT TTAAGATCACTAAATTTTTATAGGACTTTAAAAACAGTAATGTGCTGCTT TGAGTGTGTAGGACTAAGAAATGGGATTCAGAGTAGTAAAGAGAAAAGTG GAATTTCCAAGCACTATGAATTACTGTTCTTTAAAAAACAGCAAAAATCA AATAACAGTATTCCTCCAAAAAAGATGGCAAGTGTAAACTCTATACCTTC ATGTCTCCCGTGGAATGTTAGTGATCAATTTCCACTTCTCTCTTTTACAT CTTACTTGCCCATTAACTCTTATACCTAATCCAAAGATTGTTAATATGGC TATGTCTCACTTTCAGGACACCTTTTATTTGTTACTTCTCTTCACTGCAA AACTTCTTGAAACAGTACTTATTTTCTCTCCTCCATACACAATTGAAATG GCTCTCAACTCATGCCCAGAAGTCAGTGTTCAGTCTCTCACCTGGCAGAT AGCAACTTACAAAGATGCCCCAACAATACCTCCTTGTGTCTAGACAGTCA TCATTATCCTTTACCTTTTTCTGTATTTATTTCTGCTCCTAAAAGGGATC TCTATGTAAAGTATTGTTATACTAGTGCTTGTTATAATTATTATCAGAGT TAAAGCCATCACAATGTTCCCAATTACTTAAAGACATTGGAATAACATTT TTTTTATTTTCCACATCTTGCCAAAAAATATTTTGTTATCAGTACCTTaa taatggctattatatattgaccattactatttgctagaaaatttatatac ctggtcgtatccaatcctcacagaacttctataaagttgtgctattatca cctatattttccagatgtggccgtaagactgaaatcacttaggtgacttg tctaaggtcattcagatacatagtagataacccaggatttgaacacaggc ctcctagcacacaagctcatatcttaactactttaatacgttgctcGATG GGATCTTACAGGTCTTCATTCACCCCTTTCCTGCTCACACAACCACAACC TGCAGCTATTACCTATTGTTAGGCTTAAAATAATTACTTGGCTTCATTTC CAAGCTCCCTCCCTTCCAATTCACATTGAGTCCAGAGCTAAATTAAACAA TCATTCAAAATTTTTCAGTAGTTCTTGTCTCTATAATAAAACAGAAATGC TTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTC CTTCACactttctctcatttaaactttattgcattttcctcactttttct cacttcacttttgaattccctattcttttatcctctgttaatttttaagt attatatttgtgatattattttttctttttttctattttttatctttcat ttcattttggcctatttttttctcttAAGAACTTTAATATCACCAAATAA CATGTGTGCTACAAACTGTTTTGTAGTTCAAAGAAAAAGGAGATAAACAT AGAGTTATGGCATAGACTTAATCTGGCAGAGAGACAAGCATAAATAATGG TATTTTATATTAGGAATAAACCTAACATTAATGGAGACACTGAGAAGCCG AGATAACTGAATTATAAGGCATAGCCAGGGAAGTAGTGCGAGATAGAATT ATGATCTTGTTGAATTCTGAATGTCTTTAAGTAATAGATTATAGAAAGTC ACTGTAAGAGTGAGCAGAATGATATAAAATGAGGCTTTGAATTTGAATAT 
AATAATTCTGACTTCCTTCTCCTTCTCTTCTTCAAGGTAACTGCAGAGGC TATTTCCTGGAATGAATCAACGAGTGAAACGAATAACTCTATGGTGACTG AATTCATTTTTCTGGGTCTCTCTGATTCTCAGGAACTCCAGACCTTCCTA TTTATGTTGTTTTTTGTATTCTATGGAGGAATCGTGTTTGGAAACCTTCT TATTGTCATAACAGTGGTATCTGACTCCCACCTTCACTCTCCCATGTACT TCCTGCTAGCCAACCTCTCACTCATTGATCTGTCTCTGTCTTCAGTCACA GCCCCCAAGATGATTACTGACTTTTTCAGCCAGCGCAAAGTCATCTCTTT CAAGGGCTGCCTTGTTCAGATATTTCTCCTTCACTTCTTTGGTGGGAGTG AGATGGTGATCCTCATAGCCATGGGCTTTGACAGATATATAGCAATATGC AAGCCCCTACACTACACTACAATTATGTGTGGCAACGCATGTGTCGGCAT TATGGCTGTCACATGGGGAATTGGCTTTCTCCATTCGGTGAGCCAGTTGG CGTTTGCCGTGCACTTACTCTTCTGTGGTCCCAATGAGGTCGATAGTTTT TATTGTGACCTTCCTAGGGTAATCAAACTTGCCTGTACAGATACCTACAG GCTAGATATTATGGTCATTGCTAACAGTGGTGTGCTCACTGTGTGTTCTT TTGTTCTTCTAATCATCTCATACACTATCATCCTAATGACCATCCAGCAT CGCCCTTTAGATAAGTCGTCCAAAGCTCTGTCCACTTTGACTGCTCACAT TACAGTAGTTCTTTTGTTCTTTGGACCATGTGTCTTTATTTATGCCTGGC CATTCCCCATCAAGTCATTAGATAAATTCCTTGCTGTATTTTATTCTGTG ATCACCCCTCTCTTGAACCCAATTATATACACACTGAGGAACAAAGACAT GAAGACGGCAATAAGACAGCTGAGAAAATGGGATGCACATTCTAGTGTAA AGTTTTAGATCTTATATAACTGTGAGATTAATCTCAGATAATGACACAAA ATATAGTGAAGTTGGTAAGTTATTTAGTAAAGCTCATGAAAATTGTGCCC TCCATTCCCATATAATTTAGTAATTGTCTAGGAACTTCCACATACATTGC CTCAATTTATCTTTCAACAACTTGTGTGTTATATTTTGGAATACAGATAC AAAGTTATTATGCTTTCAAAATATTCTTTTGCTAATTCTTAGAACAAAGA AAGGCATAAATATATTAGTATTTGTGTACACCTGTTCCTTCCTGTGTGAC CCTAAGTTTAGTAGAAGAAAGGAGAGAAAATATAGCCTAGCttataaatt taaaaaaaaatttatttGGTCCATTTTGTGAAAAACATAAAAAAAGAACT GTCACATCTTAATTTAAAAAATATATGCTTAGTGGTAAGGAGATATATGT CAACTTTTAAGAGGTTGAAAAACAAACGCCTCCCATTATAAGTTTATACT TCAcctcccaccactataacaacccagaatccatgagggcattatcagga gtgagtggaagagtaagtttgccaatgtgaaatgtgccttctaggtccta gacgtctgtggtataactgctcataagcagtagaaagaatttagagggat ccaggctctcatcacgttggcacaaagtatattacttggatccatctatg tcattttccatgGTTAATGTTTaaaagcacaggctttaaagtaaaaaaca aagagctggattcaactctactgactcttattaatcatgattttgggcac attacgtagctttcatgagctttagtttctacatttataaacaggagatt atacctattatgcatggttattatgaaggaaaatgacaaaatagatataa atcaaatagcccacttcgagacatattaagcatgaataaacattagatac 
tattaAAATCCTATATATTAACAAAGCCAAAAGTTTCAAACTTTACTTTT TCCCAACATTCTTGTGAAATATGACACATCCCAATCTTAACAGATGCTCA TTTGGGATACTGTACTTGTGAGTGGAAGTGTGTATATTTGTGTGCAAGTG TGTACTCATATACTTCCACCTTACCACCCTAGAAAGGCATGATGAAAATT TAAGATAGAAGGAAAATATAAATTGAAAAAAAAAAACCTTAACAAATGAT TCTGACAAATATCTTCTCTTTCCAGGGAGAATCACTGAGCCAGAATAAAA TTGAACACTAAATATTCTAAGAAAAAAGGAATCTAGTTTGTCAAAATGTG ACTTGAATTAATAGATAAGGAGAGTCAGATGATAAGAGGGTCAAAATTAT GTTTATCTTAGGAAAAGTagaatagaaaatttataagcagattaaaaaca cataataaaagtagtaaataataatgacagtatctcaaatcagtgcaggg gggaaaggcctactaatgtgatggtgggataattggatagcaatatggga aaagatatatttaatttatttgctacaccaaatgccaggacaatctctaa gtgaattcaagacataactcttttttcaaaaaaactatgcaaatattaaa agaaaacaagttaatgtttttataatctatgaatatggtaaagatGGATA ACATTGACTATCAAATTAATTTTTAATGCGTAATAAAACTATGAGAAAAT TTAAAAGTGAGAAGAAACTACTTGTAACTCACATAATAGActagtacttc taacacatagggaacttctaaaacaaaacccaaaatattaataggaaaat gggcaaaacagttaaacttacagttcataCATAAGGAGAATCAGTCTTTT TTTTTTTTTTTACAGTTGTAGGCAGAAAACTTTTATTTTTCATTTATTTG TAAAATTTACCCCTAATTTATTCATAATTCATTTAACTGCTAAGGGCATT AATGTGTACAACGCCATGGGAGAAACCAGTATATTCAGAATTTCTCCTGA AATTTGACCAGAAGTTATGGGCATCCCTCCCCTGGGAAGGAGGCAGGCAG AAAAGTTTGGAATCTATGTAGTAAAATATGTTACTCTTTtatatatatac atatatgtgtgtatatgtgtatatatatatacacacatatatacatacat acatacatacatatTATCTGAATTAGGCCTGGTCttttttaatactttaa gttctgggatacatgtgcagaatgtacaggtttgttacacaggtatacac ctgccatggttgtttgctgcacccatcaactcaccatctacattaggtat ttctcctaacgttatccctctccttgcctcccacctcccgacaggccctg gtgtgtgatattcccttccctgtgcccatatgttctcattggtcaactcc catttatgagtgagaacatgcggtgtttggttttctgttcttgtgttagt ttgcggagaatgatggtttccagcttcatccatgtccctgcaaaggacat gaactcattcttttttatggctgcaagaaatgcaaatcaaaaccacaatg agatgccatctcacaccagttagaatggcaatcattaaaaagtcaggaaa caatagatgctggagaggatgtggagaaataggaatgcttttacactgtt ggtgggagcgtacattagttcaaccattgtggaagacagtgtggtgtttc ctcaaggatctaaaactagaaataccatttgacccagcaatcccattact gggtatatacccaaacgattgtaagtcattctactacaaagacacatgca caggtatgtttattgcagcactattcacaatagggaagacttggaaccaa cccaaatgcccgtcaatgttagactagataaaatgtggcacatagacCTG 
GTCTTAAAATCAAGAACAGAGATTGTTACTTTTACATCCATTCCTAATTG ATAAACCATTCAGTTATACCACATCTTAGCTTCTGGACTACAATGACCAT ATTTGGGGttttctttctaatttcattataggttcagagggtacatgtgc aggtttgagacaaaggtatattgcatgatactaaggtttggagtacaaat gattccacctcccaggtagcaagaataatacccaatatgtagtttttcaa ctctttcccctcttcctccatcctccctctgctactctgtggtgtctgtt tttctcatctttatgtccatgtgtactcgatgtttagctcccccttgtta ggtgagaacatgtggtatttggttttctgtttcagtgttaattcacttag gataatggcctccaactgcattcatgctgctgcaaaggatgtgactttct tcttattagctgcatatattttgtggtggatttgtaccacatttacttta tctagtccaaagttgttgggcacccaggtggattccatgtctttgctatt gtgaatagcactgggacaacccatacaagttcatgtgtctttttggtaaa acaatgtattttcctttgggcatatatgcggtgatggaattgctggatcg agtggtagtttaactcttagttctttgagaaatccccagactgttctcca cagtggctggactaagttgcattcccaccagcagtgtagaagtgttcccc attctctgtagcctcaccagcacatgttAAACTATCTttaaatatatgaa aaaaatgttcaagtctctcagattaagatgcatgcaaagtaaaatgatac ttaaatatcagttctaacctataaaatatcaaatatctgacctcaatatt tgataatccaacctgttgatgaagctgtagagagaggcaccctTtttttt ttttttaattatactttaagttttagggtacatgtgcaccttgtgcaggt tagttacatatgtatacatgtgccatgctggtgcgctgaacccactaact cgtcatctagcattaggtatatctcccaatgctatccctcccccctcccc ccaccccacaacagtccccagagtgtgatattccccttcctgtgtccatg tgatctcattgttcacttcccacctatgagtgagaatatgcggtgtttgg ttttttgttcttgcgatagtttactgagaatgatgatttccagtttcatc catgtccctacaaaggacatgaactcatcattttttatggctgcatagta ttccatggtgtatatgtgccacattttcttaatccagtctatcattgttg gacatttgggttggttccaagtctttgctattgtgaataatgccgcaata aacatacgtgtgcatgtgtctttatagcagcatgatttatagtcctttgg gtatatacccagtaatgggatggctgggtcaaatggtatttccagttcga gatccctgaggaatcgccacactgacttccacaatggttgaactagttta cagtcccaccaacagtgtaaaagtgttcctatttctccacatcctctcca gcacctgttgtttcctgactttttaatgattgccattctaactggtgtga gatgatatctcattgtggttttgatttgcatttctctgatggccagtgat gatgagcattttttcatgtgttttttggctgcatagatgtcttcttttga gaagtgtctgttcatgtccttcgcccacttgttgatggggttgtttgttt ttttcttgtaaatttgtttgagttcattgtagattctggatattagccct ttgtcagatgagtaggttgcaaaaattttctcccattttctgggttgcct gttcactctgatggtagtttcttttgctgtgcagaagctctttagtttaa 
ttagatcccatttgtcaattttgtcttttgttgccattgcttttgTCcca ccgatcccacagaaatacaaactaccatcagagaatactacaaacacctc tacgcaaataaactagaaaatctagaagaaatggataaattcctggacac atacactctcccaagcctaaaccaggaagaagttgaatctctgaatagac caataacagaagctgaaattgtggcaataatcaatagcttaccaaccaaa aagagtccaggaccagatggattcacagccgaattctaccagaggtacaa ggaggaactggtaccattccttctgaaactattccaatcaatagaaaaag agggagtcctccctaactcattttatgaggccagcatcattctgatacca aagccaggcagagacacaacaaaaaaagagaattttagaccaatatcctt gatgaacattgatgcaaaaatcctcaataaaatactggcaaaacgaatcc agcagcacatcaaaaagcttatccaccaagatcaagtgggcttcatccct gggatgcaaggctggttcaatatacgcaaatcaataaatgtaatccagca tataaacagagccaaagacaaaaaccacatgattatctcaatagatgcag aaaaggcctttgacaaaattcaacaacccttcatgctaaaaactctcaat aaattaggtattgatgggacgtatttcaaaataataagagctatctatga caaacccacagccaatatcatactgaatgggcaaaaactggaagcattcc ctttgaaaactggcacaagacagggatgccctctctcaccactcctattc aacatagtgttggaagttctggccagggcaattaggcaggagaaggaaat aaagggtattcagttaggaaaagaggaagtcaaattgtccctgtttgcag acgacatgattgtatatctagaaaaccccattgtctcagcccaaaatctt cctaagctgataagcaacttcagcaaagtctcaggatacaaaatcaatgt acaaaaatcacaagcattcttatacaccaacaacagacaaacagagagcc aaaccatgagtgaactcccattcacaattgtttcaaagagaataaaatac ctaggaatccaacttacaagggacgtgaaggacctcttcaaggagaacta caaatcactgctcaaggaaataaaagaggatacaaagaaatggaagaaca ttccatgctcatgggtaggaagaatcaatatcgtgaaaatggccatactg cccaaggtaatttacagattcaatgccatccccatcaagctaccaatgac tttcttcacagaattggaaaaaactactttaaagttcatatggaaccaaa aaagagcctgcattgccaagtcaatcctaagccaaaagaacaaagctgga ggcatcacgctacctgacttcaaactatacgacaaggctacagtaaccaa aacagcatggtactggtaccaaaacagagatatagatcaatggaacagaa cagagccctcagaaataatgccgcatatctacaactatctgatctttgac aaacctgagaaaaacaagcaatggggaaaggattccctatttaataaatg gtgctgggaaaactggctagccatatgtagaaagctgaaactggatccct tccttacaccttatacaaaaatcaattcaagatggattaaagacttaaac gttagacctcaaaccataaaaaccctagaagaaaacctaggctttaccat tcaggacataggcatgggcaaggacttcatgtctaaaacaccgagagagg cactcttatgcattgttggtgagaatacaaaatggtacaactcttggcaa tatcttaaaaaatttacatggtactgacttttggtctagcaatcctactt 
ctatcctaaagatatattggcaaaaatacaaaataattgatgcactcaag tctattcattgaagcattgtttttcatagtaaacggaaagtaggccgggc gtggtggctcatgcctgtgatcccagcattttgggaggctgaggcgggca gatcacttgaggccaggaattcaagaccagcgtggctaacatggcgaaac cccatctctaccaaaaatacaaaaattagctgggcgtggtggtgcacact tgtaattccagctacttgagaggctgaggtgggaggatcgcttgaacctg ggaggcagaagtttcagtgagcccagaacgtgcctctgcactccagccag gatgacagagcaagactccatctcaaaaaaaaaaaaaaaaaaaaaggaaa ataaccaaatgacaattagtgagtactacttgcaaaacttgtacgcaata gagtatgaagcaactataaaatgagagagaaatatctccaaatactactc taaagtaatctacaaggtataccttaactgaaaagaaacaaaaaagtgac accagaatgctatttttatgttaaaacagggataaataCATTGGATTTAC ATGCatatataagtatatattttataaatgtttaaataaGCATACTTAAA ATGGCAAAAACGTAATACATATATAATTTTCTTATGGCAGGAGGAGGAAA CAGGGCAAGGCACAGGGATAAAAGTTATTCTGAATACATCTTATTTTATA TTTTTGACTTTGAAATCCTGTAGCTGTTTTATGTAATATAAAAATGTAAT TAAATTAACAGAAAAAAATTACAACTGCTAAAAATCAAGATCTGGCATTT TAATTAAGTTATAAAACATCGGAGAAAAGAATTGTTTCATGGGACACTAA CATACAGACAAATTCATTTGGAACCCAATGAATTAATGGGCCTAAGATAA CAACCAATAGAAGCTAAAATGACGAATAACTGTTTCAGAAGAAAACATAT ATGGAATGAATCAGCTGAAAATACCTGAACCTACTGATCAATTTTTATAT CACATGAAGTGAATACACATAAAGTATAATATGGAGCACATAGAACCAAC TAGAAATGAGCCTAATTGTTAAATATTCTCTATTTTATGAcaatatacag gaaatatgtcgaagagagaaacatgcaagaacaccgtagggtttaataag ataatcacaaggtatggaatattcaacaggatgagtatcctggattattc agcaaatacacagagctaaaaagcaggagaaaggaattcatatatatttt taaaaactaaaaagatatattagctgatgcaactttgaaacttctttaga tcctgattcaaatagagcaaatttaacaaatatatttgaaactattaaaa taatttaaaaatgaccaagtatttgattatatcaaatatagacaataata accttgaatgtacatggattaaatgtccacttaGGggctgggtgtggtgg ctcatgactataattccagcactttgggaggccaaggcagaaggattgct tgaggtcagaggttcaagtgcagcctggtcaacacagtgaaaccctatct ctacaaaaaacaaacaaaaataaaaaaTTAACtaattttaaaaaatatat atttCTTCtaaattctccacctgaaagatatagactgactgaatgaattt taactatgatctgactatgtgcttccctgaacaaatgcactttacctgta aaacacatattaactaaaagaaaagagatggaaaaaggtattccatgaac agaaaccaaaatgagtaggagtagctatacttctgtcagacaaaacagac tttaagtcaaaactagctttagaaaaaagacaaaaatgcttattatacaa cgataaaggaatcaatccagaaagaggatataacaattttaaatatatat 
gcagccaacactggagcagccagattcataaagcaaatactactagatca aaacagagaggtagactcaaatataataatagtgaaggacttcaacaccc cactttcagcattaaacagatcatctaataagaaaaccaatctcgcagcc ctcaccctggagagtccacaggtaccaggggttggtctgaacccccagca cagagcacctgcctcacagaagagtggctgcatttttcttcctgcagttt tcagtcctcacttctccttaccaagcagggccacctggcctgggactccg gtacaactaccctgccccccacctgacgacttcaataagaagtagcccag catttctccaaggaggaaataccagagtcaattcacaaccactgcaattg cagtggtaccaccataacagcccttgggctgcagaaggaactaagagtct agtcactacagtggcaccttcagcacaccacagccaccatacagagagga atccagccccctcccctgggaacccccaccacccactccaccaggcacag cacccagctcataactgcagatcagttgccccacccacagctgagcttac ctactggcagtggcccagactttccctagggagaggctcccagaggcaaa cggcagcctctctgcccgtgtcacagcagcagttctatccatgctgtcct caggcttggaaagaaacaaagcgcctgaaggctgcacctgaacttacagc atgccacagttcccatatggagaggagaccagtctctcctcccagtgagc cctaaaccccctgatccccaacaagcagagccctaacctcacaccagcag tacagctgccccatcccccaggctgaacattcccagtaatagcagctcca cctggagatggaacccccagggtcaactaaaagcccctctgccactgcct ctacagtggtactacccctgctacccttgaactaacaaaggagcaaagac cccagtgctttatccacacctccaacaagctgcagtcgaccacaaagaag aaacacgtctgtctcccatgggtcctacccacaccccctgctgttcacca tggatgatagagtcaacagtgtgaaaacgaccatactgccaaaagcaacc tacaaattcaatgcaattcccatcaaaataccaccatcattcttcacaga actagaaaaaacaaggctaaaattcacatggaaccaaaaaagagcccaca tagccaaagcaagactaagcaaaaagaataaatctagaggcatcacatta ctcgacttcaaactatactataaggccatagtcaccaaaacagcatggta ctggtataaaaataggcatatagaccaatggaatagaataaagaacccag aaataaagccaaatactttcagccaactgatctttgacaaagcaagcaaa aacataaagtggggaaaggacaccctattcaacaaatggtgctggtataa ttggcaagccacatgtagaagaatgcaactggatcctcatctctcacctt ataaacaaatcaactcaagatggttcacagacttaaatctaagacctgaa accataaaaattctagaagataagattggaaaaacccttctagacattgg cttaggcaaagacttcacaatcaagaacccaaaagcaaacacaacaaaac aaagataaatagatgggacttaattaaactgaaagccttctgcacatcaa aataaataatcagcagagtaaacagacaacccacagagtgggagaaaatc ttcacaaactatgcatccaacagaggactaatatccagaatctacaaaga attggaacaaatcagcaagaaaaaaaaccaaacaCAAGGATGACAGTGGA AATACAAAAACAAGACATAAATATTCTGAATAGTGATAATAAAACAGTGC 
ATACCAGAATAcaaactgtttccaagttacaatggttcaaccatttttca gctttatggtggtgtgaaagtgatatccattcattagaaaccatgctcca ggatgggcgcagtgggtcacgcctgtaatcctagcactttgggaggccga ggagggcggatcacaaggtcaagagatcaagaccatcctggccaacatgg tgaaaccccgtctctcctaaaaatacaaaaattagctgggcattgtggtg cgtgcctgtaatcccagctattcgggaggctgaggcaggagaatcacttg aaccagggagtcggaggtgttgcagtgagccgagatcgtgccactgcctc cagcctggcaacagagtgagactccatctcaaaaaaaagaaagaaaccct actccgaattttgaattttgatattttcctggactaccaatatgtggcac aatgctctctcacaatgttgtgcaacagcggtgagctgcagcttccagtc agctaaatgataataaaggtagataatccatcttgatatcttcctgaaga acataatgcctgcctaccatcaacaggcatcaatactttctaccagctat tctcaaccctcatgatcggaagagacagagactgactgtgtcaaagtatt agtcccatcattcagcaattaactttagctcaatgcttcaaaaattcttc aggccctgtgtaatttcagctacgtacattaatgatgagtacccatacaa ccattctgtttcttattttcagtaccatatttaataaatatcagttattc aatactttatttagacattttgttagattattttgaccaactgaagtcta atctaaatgttctgagcatgttcaaagtaagctaggccaacctataattt tcggtgtgctaaatgcatttttaacttatgatattttcagtttacggggg tttgttgagacataacttcatcatacatcaaggagcatctgTAtatggga tatagttaaagcagtgatcagaggaaaatctatagccttaacacatttat taataaaagtgtaggaattaaattatcagctgaaaaatgtaaaaagtatc taaaagagtaagcagaaagtacaagaaagaacccaaagtagaaaaaagtg aaaattaataaaataagaagccaaaaaacagatcaaatcagtaaaccaaa aatcttgttctttaaacaaatcaacaaagttgacaaaaaaattagatctt ttaatcatgaataaaaaaaagagaaagcacaaaaatgaataaggaatggt gagagaaataactattgataatcagcaaataaaaaatcattaaaaacaat gttgttcacatctatgaaaaacattgaaagctagagggaatgggtaattt tctagaaaaatacaattcaccacaactgacttcaaaaaaaaaaaaaaaaa aaaagaagtaccgcacttatgtgagcaatttccatagagaaatacagttg tcatggaattataacacacacacaaacactaggtttagatgttttcacag agaattccaccaaacctttaGAAATCAGATCGTCCAaaggcaaattaaca actctcagccatttgaggcaaaatattacaattgaggcaagatatactgt actgaaaacttgaggaaaaagcaggagagaaagttcctttgggaaattcg aatactcaaaagtgcttacatacaatgaaaaatttggaaatccataagca tggccaaggtgggacacatgctcagaaaaggcctgagaagacactaataa ctcacctttagtaattcctaggctcacagcaagaaaaaatgaaggctaag gcagaattatatatggctccgctaagtgttgagggagccccaatacagag tcagtaagcaaagtctgggagaagtttttcatatttttttctttcttggc 
tccttgcagtcaaggaaatcatttttaaatcactaaatgctaaatgaaca caagctaaaggaaccgagccttcaaacatcaaatataaaaaagaatgcag atattacaaaaccagtttacaaaagttactaaacaaataaaaactacatc ccacagtgggtaacaaaaataaccttgaagaagggaaaaatttggtttcc agaataaacacattataatatccaaaatgtccagttttcaacaaaaatta agaagcatgcaaataaacacaaaactatggcccatttacagaagaaataa atgagactctccctgagtaagcagatattgaaaatattagacaaaaactt tatataactgtcttaaataaacttaaagagctaaagaaacccaagagaat gacatataaataaataagaaatatgaatttttttaaaggtacaaaaaaat tctgaggctgaaaagtacaataagtaaaaagttactttttacttagggtt ccaatagaagatttgagcagctggaaaaaagaatcagtgaacttgataga tcaaatgaaatgattcagtctgaagagcaggaaaatgaaagaatgacaac aaaaaagaatagagcctaaagacctgtgtaacaacatcaagaatgcctac atacagaatcctggtggggagtgaggggcaggaagactatttgaagaaat gtgtttgaaagcttcccaaatttcactaaaaacaaatatatacattcaaa aagctcagtgaacttcatcaaggaaatatacaaagatattcacaccaaga cacactatgtttcaaattgtcaaaaggcaaagcgaatgtttgaaagcagc aagagaaaggcaacgcgtcatttacaaaggatcctcaataagtttgacag cagatagtgcattataagccatggatgccagaagagcttaggaaaaaggc aacgcgtcatttacaaaggatcctcagtaagtttgacagcagagagctca ttataaaccatgggtgccagaagagcttaggatgacattttaaagttctg aaagaaaaaaacactgtcaaccaaaaattctataacttggaagatgcccc ttcaagtattaaggataaattacacattcccagattaaaaaaaagaaaga gagagagagagaaagagaaagaaagaaagagaaagaaagaaagaaagaaa gaaagaaagaaagaaagaaagaagagaaagaaagaaagaagaaagagaaa gaaagaaagaaagagagagagaaagagagagaaagaaaaagaaggaaaga aagaaagaaagaaaaaagaaagaaaaagaaagaaagaaagaaagaaagaa agaaagaaagaaagaaagaaagaaagaaagaaagaaagaaaagcaagcaa gctttaaaagttcatgtttggtaggctgtacttcaagatacacttttaaa aaaaagactccttcagatacaaactaaaaaacactagaaagtaactcaaa accacataaagaaataactccagtaaagataactacataggtaaatataa aagcaattatcacattttttgtaagtcttttttaatattctatatgtttt aaaacaaatgtgtaaaataatgactataaatctatgttaatgaagcatga tgtatacagatgtggtttgtgaaattaccaacataaagaaattcatagga aactaaataataatagagattttgtatactattgaagttgtttcaattta ctctaaattgttccaaattaagaatgttaattgtaaatccccatggtaac cactaagttaatatcttttgaaaatacagaaaaggaaagcacagggtaaa cacagtgatatgctacaaaatagcaactaaacacaaaagaaggcgataat tgaggaaattaggaacaaaggaggtataagacatacagaaaacaaaagca 
aaatggtaggagtaagcccctctttatcagtaattacattaaatacaaat gaattaaactctccaatccaaagaaagagattaacagaatggatttttta aaaatgatccaactatattgtccacaagatactcactttagatcaaaata cacaatgagttgaaatgaaaggatgggagaaaatattccatgtaagtaat aaccaaaggagatctgaggcaaatatacttatatcagacaaaatagactt taagtcaaaaactgttacaaaatacaaagaacagtatatattgatttcaa aattaattaagaagatataacaattataaatatatgtacaccaactaaca gggctccaaaatatataatgtaaccattgagagaattaaagggagagaca gacaattccacgaaaattgttgggcatttgaaaacccaactttaaataaa agataaaacatctagagcaaatatcaagggaggaattagaggatttgaat aaaactataagcaataactatagataacacttctctcaaaaactgcagag tacacattcttctcaagtgaacatggaacattctccagcacagatgatat gttaggccataagataagctcaataaacttaaaaagattgaaatcatgca aagtatcttcactggccacaatggaatgaaataagatatcaataacaaaa gaaaaactagaaaatttacaaatatttggaaattaaacaacacagtattt accaaccaatgaatcaaagaacaaatcatgagggaaattagaaaatgttt agagacgattgaaaacaaatatataacaagatgggtgtgatatatcaaaa gcagtgctcagagttgtaacacctacattttaaaaaagaaacatgtcaaa tcaataaccaaactttactcaataaaccgtaaaaggaagagcaaacaaaa tccagagctagcagaaggaaggaaatgaagattagagcagagataaatga aattgagaattaaaaaattatacagagatcaacaaaattaaaagttggtt cttttaaaatatcaataaaattaatatacttttacatagactaagcaaaa catctctattcagctgactttttttacaagggagccaacattattcagtg gggaataatagctttttcaacaaaaagtgctgggaatactgaatattcat atgcaaaaaaaatgaagctggacccctacctcacattatatacaaaatct agattggatcaataatgtaaatataagagtgaaaaccatacatgcttaga agaaaacatggaaataaaacattgctgtggattggcaatgcgttcttaga taatacaccaaaaatacaagcatgaaacaaacaaatGCAGCCAAAATGTA CCAGAATCTGAAAACATCTATTATCTATGaagaattagaggggaatttgg tgaaagaaatatgggagaatgggacattgctctgtgaatgcttttgtgca taattgtacatttttaattaagttaatcttttacactctcaaagtgtgat attaagcaagcaaagataagttattacaagactctaaaaccgaatgcaat gagaaacaagtgaatccaaatatatttcaaatgaatgaatgacataatca aacttaaggggaaaataataattaatctgattaatttttgactgttcttt tagttcaaattgacttttgaacatacttggactacataccattgcttgaa aaaataaaatatctgcaaaaaattattaaatcttcatgataggctttttt ctttttatattagtataaatataacaattctgaaacaaatgtatgtgcat tgtaagattaagccaatgagtaaatattaatatatttgtattgctagaac cccagattctcactgtgaaaggacagagatacagatatggaataagacaa 
ggaaagaagcagcccactgagttacattagaatcagtattatcaacataa aTATGCAATGTGCTCTCTCACATGCTCTTTCCTTCTCTTAAAAAAATATA ATATGGACATATTATATATTATATGCATAGACACACGTGTGTCTATACAT ATCCTATCTATACATATTGAGGATTAACAGGTGCTAGTAGAAAATATTAA CTTTCTTTGTATTAACAGGTGTTAGTAGAAAGTAGTAGTAGGTGCTAAGA TAAAAGCCATAATTAAAcctcctggtgaatgaacacaccatcacctacaa tcttaccaaaaatagaatcaagcacgtgtcctagtcaaacctctggattc aactgtcatttggataaaacgcaaaggatagtgaaaatgtcgatcttcac tgagagtctaaccagcaaatttcacagtgtggacatcaagtgacaaaaat cccaaatttttcaacaaatatattgtatgggaaagaaaactttgaaaaga aacctgtatgttagaagagattttaaaaacatgacaaATGAAAAAAAATG GGCAAGACTAAAACTTTTAAAAAAGtttgagacagggtctcactctgtca cccaggctggagtgcagtggtgtgaccatggctcactgtggcctcaacct cctggctcaagtgatcctaccacctcagtcttccatgtagctgggactac agctgcgtgccaccacatctggctcatttttttttcttttttaagtagag acggggacttgctatgttgcccaggctagtctcaaactcctaagcacaag cgatcctcccgcctcggcccctgaaagtgctgggattgcaggcatgagcc accacacccggccAAAAGTTGCTTTTGAGGAGTTATTGCTGTGTGGATGT GATATAACCCTTTCTGTCATCTCTTCACAAAACTTTCTGTAAAACATAAA AATCACCTGGACCTTCAGAGATGAGTTTGtttatttttttattttttaaa aaattGCTAATTTACAGAACATGGAGATGAGTATGTTTTGAAGGCTTGGA AGCATGCAAGTGGGAGAAGAAAGGAGTCAGCTACATTCTGGCTGTGTGCA GAGGCAGGTCACTGTGGTGGGAGTGTTCCTGTTTCATGGACTCTGCAAAT CGCAATGCTTGGCATGGCCTCCCGACCCTGATGGCAGAGAAGCAAACACC AGTCGGAGAGCTGGGGTCCTCCCAGCCCTCTTGGCCCTGTGGCCAATTTT TTCTTCAATAGCCTCATAAAATCACATTATTTGAGTGCCCATGGCTCCAA AACAAGCAGGGATGCCCATGGACCCTGATTATCCATTGTCACCCTTCCCT CCAAACAGCCACCTCTCCCCTGGAGACAGCCCCATACTCCACTCAGACCT GTGCACTTTCTGGTATCCTTGTCACCTGCTTTTTATGTCTCATTTTACAA ACACCAAATTGGAAGACAGCAGGAGCTGCCCCATAATACCAGTAAAGTGA GAAGCAGAGATAAACTAGTCCTAGACAGCCGACTCATGTTGGGGGCAGCC CACTCACAGTGGCCCTGACCCAACTCTGACTAGAGGCCACTTGctctcaa caccagggtgctcaatggcccgtcctggtactctgctcttctctctccac cttcgctttcctgcaatctatgcagcctgtgactccatccatgggctagt gacccccagaccttctcctgggaccacaggcctgtgtctctatctgctgc tcaatacctcccctcgaacatccatggctaaaactgagctcctgatactc tctccctacccgcttctctgtggattccccacctccgcgaaggacagctt catcctttcagctactcaggccagaagattgaagtcatctccttctccag gaaatcgtattgagggagctacaaatatccaaaatccgatcgcttctcct 
ccactacacccgaggcccgccacccatttttgcctgaattgctgcagcag cctcctaaccgatctctgctttcacgtgggcacctcagttttttccagaa caacaaccagagagatctgctcacacccaagtcagaccaggttactcctc tgctctcatagcatttggaggaaaacccagagtgctcgtgttggccggca gagccggcccccatctcctctgacctcctccccacctcttgccctcagca cccagagtgctcgtgacggccagcagagccagcctccatctcctctgacc tcccacctctcgccctcagcaccCAGAGTGCTCGTGTTGGCCAGCAAAGC CGGCCCCCATCTCCTCTGACCTCCCACCTCTCGCCCTCTGCACCCAGAGT GCTCGTGACGGCCAGCAGAGCCGGcccccatctcctctgacctcccacct ctctccctcagctagtcctcgaacatgtctgatgtggtcccaccttggga cccacattgctactcctctgcctgtaggggtacccacagttatccacaca gttcactcctgtctttcaggtctttgtgcaaatatcaccttctcagtgga gactACaccttcaggacttaggctgtgcctggcacatagtaggtgctcag tagacactggttgtaggaaggaatCTACAGGTTGAAATAAGGAGATCATT TCCCTGAGGTTCCGAAGCTCATATTTACTCACCATTTGTTGTTTACTGCT AATATTGAGCACTATCAGTAAAATACATAAAACCCtttgccaatccagga agtgaaaatgacactttactgttttagtttgcatttctctgcttacaaat ggattacacgcattttcatgtgctgttggctACTTATTCATTCAGAAAAC ATACTAAGTGCTGGCTCTTTTTCATGTCCTTTATCAAGTTTGGATCATGT CATTTGCTGTTTTCTTTCTGATGTAAACTCTCAAAGTTTGAAGGGTATTG TCTTTTCCTGACACATACGTTGTAAATAATTTTCTGGCTTACATTTTGAC TTTTAATTTCATTCACGATGTTTTTAATGAATAATTTTAATTTTTATGAA TGCAAGTTAAAATAATTCTTTCATTGTGGTTTCTGACATGTCATGCCAAT AAGGGTCTTCTCCTCCAAGAGCACAGAAATATTTGCCAATACTGTCCTTA AAATCGGTCACAGTTTCATTTTTTATATATGCATTTTACTTCAATTGGGG CTTCATTTTACTGGCCCTATTTGAAGCAAGTTTCTCAGTTAATTCTTTTC TCAAAGTGCTAAGTATGGTAGATTGCAAACATAAGTGGCCACATAATACT CCCACCTCctttgcctcctctcccaggaggagatagcctccatctttcca ctccttaatctgggcttggccaagtgacttacactggccaatgggatatt aacaagtctgatgtgcacagaggctgtagaatgtgcactggggcttggtc tctcttgctgccctggagaccagctgccccacgaaggaaacagagccaac ctgctgCTTCCTGGGGGGAGACAGTCCCTCAGTCCCTCTGTCTCTGCCAA TCAGTTAACCTGCTGCTTCCTGGAGGAAGACAGTCCCTCAGTCCCTCTGT CTCTGCCAACCAGTTAACCTGCTGCTTCCTGGAGGAAGACAGTCCCTCAG TCCCTCTGTCTCTGCCAACCAGTTAACCTGCTGCTTCATGGAGGAAGACA GTCCCTCAGTCCCTCTGTCTCTGCCAACCAGTTAACCTGCTGCTTCCTGG AGGAAGACAGTCCCTCTGTCCCTCTGTCTCTGCCAACCAGTTAACCTGCT GCTTCCTGGAGGAAGACAGTCCCTCTGTCCCTCTGTCTCTGCCAACCAGT TAACCTGCTGCTTCCTGGAGGAAGACAGTCACTCTGTCTCTGccaaccca 
gttgaccgcagacatgcaggtctgctcaggtaagaccagcacagtccctg ccctgtgagccaaaccaaatggtccagccacagaatcgtgagcaaataag tgatgcttaagtcactaagatttgggCAAAAGCTGAGCATTTATCCCAAT CCCAATACTGTTTGTCCTTCTGTTTATCTGTCTGTCCTTCTCTGCTCATT TAAAATGCCCCCACTGCATCTAGTACATTTTTATAGGATCAGGGATCTGC TCTTGGATTTATGTCATGTTCCCACCTCGAGGCAGCTTTGTAAGCTTCTG AGCACTTCCCAATTCCGGGTGACTTCAGGCGCTGGGAGGCCTGTGCATCA GCTGCTGCTGTCTGTAGCTGAGTTCCTTCACCCCTCTGCTGTCCTCAGCT CCTTCGCCCCTGGGCCTCAGGAAATCAATGTCATGCTGACATCACTCTAG ATCTAAAACTTGGGTTCTTGgaccaggtgcggtggctcacatctgtaatc ccagcaatttgggaggccgaggcgggtggatcacaaggtcaggagatcaa gacgatcctggctaacacggtgaaaccccgtctctactaaaaatacaaaa aaattagccgggtttggtggcaggtgcctgtagccccagctacttgggag gctgaagcaggagaatggcgtgaacctgggaggtggagctggcagtgagc caagatcacgccactgcactccagactgggagagagagcgagactttctc aaaaaaaaaaaaaTCTTAGGTTCTTGGATGTTCGGGAAAGGGGGTTATTA TCTAGAATCCTTGAAGCGCCCCCAAGGGCATCTTCTCAAAGTTGGATGTG TGCATTTTCCTGAGAGGAAAGCTTTCCCACATTATACAGCTTCTGAAAGG GTTGCTTGACCCACAGATGTGAAGCTGAGGCTGAAGGAGACTGATGTGGT TTCTCCTCAGTTTCTCTGTGTGGCACCAGGTGGCAGCAGAGGTCAGCAAG GCAAACCCGAGCCCAGGGATGCGGGGTGGGGGCAGGTACATCCTCTCTTG AGCTACAGCAGATTAACTCTGTTCTGTTTCATTGTGGTTGTTTAGTTTGC GTTTTTTTTTCTCCAACTTTGTGCTTCATCGGGAAAAGCTTTGGATCACA ATTCCCAGtgctgaagaaaaggccaaactctggaaaaaatttgaatattt tgagccaaatgtgaggaccacaacctgtgagaacggaaaataaatcctgg gaccccagactcactaagccaaagggaaaagccaagctgggaactggctt atgcaaacctgcttcccatctggttcctaaataagatagctattacacaa agacaaaaaagctacatccctgcctctacctccatcgcatgcaaaatgtg tattcagtgaacgctgaccaaagacagaagaatgcaaccatttgcctctg atttacccacacccattttttccacttcttcccctttccccaatacccgc acttttcccctttacttactgaggtccccagacaacctttgggaaaagca cggaccacagtttttcctgtggttctctgttcttttctcaggtgtgtcct taaccttgcaaatagatttcttgaaatgattgagactcaccttggttgtg ttctttgattAGTgcctgtgacgcagcttcaggaggtcctgagaacgtgt gcacagtttagtcggcagaaacttagggaaatgtaagaccaccatcagca cataggagttctgcattggtttggtctgcattggtttggtctggaaggag gaaaattcaaagtaatggggcttacaggtcatagatagattcaaagattt tctgattgtcaattggttgaaagaattattatctacagacctgctatcaa tagaaaggagagtctgggttaagataagagactgtggagaccGTGCATAG 
TTGCTTCCTGATCAGCTCTTTATTTGATTGAGAGTGAGGCAGGGAAGATT AGAGGGAAGCTTACAGTGGAATTCAGGGCTGAGGCTGCTATTCTTTTGCT CCTTGTAACTTCCTACAGTGTTGTCAGCATCCACATACTTCTCTGTGGGG TTggtctcagagccaggttaccttgtcttaggtccagtggcaccctgact ggcttggtgtccttgaacaagttacctaacctctccaaacctcagtccct cagttgtaaaattaaaaaaaaaaaaaagaagaagaagagtacctactgta tagcattgatttgaagattgaatgagctggtattatacaacgtttagaag cagtgcctgacacgcaaaaggctctcaacaaatACTATCCTTTACTAATA TCCTGTGTGTCTGTATCAGAGCTGGTGGGGTGGAGGGACAGAAACAAGTG GGAGAAGGTaaagagatggacaaatgatctctaaagtctctctggcacta acaCAATTCTTTATTATGTGTTTTGTCTGGCTCTTTATATTGATAGCTGT TCCAGAGGCAATCAATAGCTATTAGTCGGTTTTATTCTTATTTTTCTGTC TGATCTTACAGGGGAGCAAACTGTGGCAAAGTATGAACTTACTTCTCAGG AAATTAACCATTATATTGGCAATCACTGTGATTATTTGAACTTCAGCGTC TGGACAAATTTAGTCACATGAAATACAGAAGAGAGATTTCTCATGGTTAA AACGAAGCtctctttatttgcttctgctaattaaaaaatcagagctaaag atacttaaacactacagttaaaatgccatggttgtctattggcttaacga attctcttatgaaatcaactctaaaatgctatccatcataaatcatgaaa cgcaatttttcttattctctttagagctttacaattcatcttaaagacca gtgtttacactctcttctgtaggttgtacaataacttttggcgagaaaaa ataaaagtctggctttctgacTCATAGGTGTGTTCCCTTTAACAGAAAAA GAAAATATGTCCTCTTTAAAACTGATGATCATTGGTCACCTCAATTTTAT TGAAGTTCACTTCTGACCTCTTTAGATGTAGTTCTCTACATAAAACTGCC CAACAGAATTCTCTGTCTGAATGTCTCCTCCACAAACAAAATTTTAAGAA CTAAAATTATCATCTTTCCTTCCAAATATGCTCTCCCTATGTCCCCAGGG CTCTCCATGTGTAGAGCTGAGACCATTTGCCACTCAGTTTCCTCACCCAA TTAATTACAAGTCCCAACAATTTTCCGGtttttttgtttttgtttttgtt tttagacggagtcttgctctgtcaccaggctggtgtgcggtggtgcaatc tcagctcactgcaacctccgctgcctgtgttcaagcgattctcctgcctc agcttcccaagtagctgggattataggtgtgtgccactacatccagataa tttttgtatttttagtagagaggggatttcaccatattggcccagatgat ctcaatctcttgacctcatgatctgcccaccttggcctcccaaagtgctg ggattacaggtgtgagccgccatccctggccCAGTTTTGCCTTTTTAACA TCCCTCAGCTCTTCAAATCCATTTTCTcttctctaacacctccccattcc ccagctcgtaatgaactcgtaagtagattactacaatcacctcccaaatg gtcttcctggctccatcagccttgtgaccttcaagttcattttccacatg gatgtcagagtaactttctaaaatgaaaatctgaccacgttactctcttg cctaaatccgcctatggccgctgttaggatcaagtctaaactcccgaccc tggaacatcaggtcttcgtgctctgttcactgcttctctacctcacctgc 
aaccaACACCACTCCCACATCCATATGCTGCTCACCGTGTATCAACATGA ACAGGAGGTGGGTGTTTCAGTCCCCAGGAAGACACTGGGCCTTTTCAATC ATCTACTGCTGTGTAATAACCACCCCGCAAACTGACCACATGATTTCATT TTGCAAGGGTTCCTTCCTTgggctgtgttcagcaaaagggtttactgagc tggcaggtccaagatggcctcactcacaggactggctgttgatgggagcc ttgatgctcttgggctcaccccttatcctccagtaggttagagcttctta cagtggtttcaggcagcatctgaagacagtaaaagcagaagctccaaggc ttcttacattctagcctggaaaatcacatcacattgcttccttcatattt ttttggcaaatcaggttgcaaggcttgcccagattagggtaaagaggcaa agaggctccttttcttTTCttttctttttttttttttttttttttttgag tcagaatctcgctctgttgcccaggctggagtgcagtggtgcaatctagg ctcactgcaagctctgcctcctgggttcacgtcattctcctgcctcaggc tcccaagtagctgagactacaggcacctaccaccacgcctggctaatttt tttttattttttattttttagtagagactgtgtttcactgtgttagccag gatggtctccatctcctgacctcgtgatccTTGCAAAGGGACATGCAGAC CACATTAGTGAGAATATGTGCCTGTATTTTGCAATCTGTAACATGGGCAT AAACTAAATGTTTTCCAAAGGGAATAGGGCAAAACAAAAAGGACCTTGAC CACTCCTTGGCCCTGAATAAATCCAGGAAGCCTAAGAGTATGACTATCCT GAGGTAGAAAGAGGGTCACATGCTGGATAAGAGGTACCTGGGCTCTCCAC TTACAAGAAGAGAGCATGGTTACATTTATAATCACCATTCCCAACATGCT GTGAGTGCAGGCAGCTACCAGGAGGAGAACAAAGGAAATAACCAGGACAC TCATCTCTAAACCTGTTAATTTAATCACACGGAACACTTCTATTTAAAAT TCCCGAGAGTTAAGATGTAAGAATGCTTATCAAGGTAAATGCTGTTCACA CTGCTTGGAGTGTCAGGCCTAGATCTCTATCCATCAGAaacaacaatatc aataacaacaacagcaacaTGATGATGGGGCAATTTCTTAAAAGCACCAT GTATTTTATCGATACATGTCCGTTGCAGAAAATCCAGGTGAATCCAAAGA AGAAATAAATGTCTTCCACAATCCCATAGCCCAGAGCTAACTAACCACTA TAAAGAACCCAGCGTGGTTTTAACTAATGGATCAAAAGATGCTCATCAAA GGCTCTGAGCTTTCCTGAGTGCTAACAGGAAACATCCAGCATCACTGGTC TCTCCAAGGCTGCAGGTGTCTTTGCCCATAGTGCCTGTTTTGTGTCAGGG AAAGAATCAACCTGGGAGCCAAGCCCAGGAATCAGGATGACCAAGACATA CTGCACAAGGAGGGAACAAACCCATCCAAGGACACTCAAGGACAAATCAA GCAAATGAATTTAAGGGAGACGTGCTCATGGTCTGCTTTGCTGCTCAGCA TGGCTGGGAGGCACAGTGGAAGATCATGCATCCTGCCCCTGGGACTCCTC TGCCAGAGCCTGAGAGCTTTCTCCTGCCCACAGGCTAGGGGTAGGGCAGT TGGAATTGATCCATGCCTTCTAGCTAGACTGTGGGTCCCCTCAGTCTTGG GCATGGTGACAGCCCAGCATCAGACAGAGGTCAGTATCAAACTAGAAAAT TTAATAAATACTGTCAGATTTGTAGACCCAAGAAAATATAAACTGCCAAT CACGGAGGAAAAAAATCTCTCAATGATCTTATCTTTATATGATTCCCTTG 
CTGCCTGGAGATTGACATTTCCTTGGGGATAATCTGGTCATAGGATTGGT GAAGGTGGAAGGGAGGCAACCTCCAAAGGTGGGGCCCTCTGCTCACCTGG GACAGGGAGGGCCTGAGGTAGGTGTCTGTGTGGGCTGGGGAGGAGGATGG GAGCAGTGCTTCTAGATGTTTCCACTTTCTCCTCATTAGATAATAACGAA TGGGTGATTTCCCTAGTCACTGCAGTGTGAGGAAATCTACAAAATTAATT TCACAATACGCTTTACAGGATAGGTGGAGAAACACATGAAGTACAACTGC AGTGGGTTATAAAAAACGGCCTTTCGAGTTGAGCAATAAATTCGTTCAAG CAGCCATTCTGAAGGACAAACTGGCTCTGTATTTAAGAGGGGCATTCCAG CACTTCTCTAGCCACTGGGTTGACAATGACTCACCAAAGCCTCTGGTAGC CACCACAGGACACCCAGAGCATATGTTTTAAAGCTGAACACCAAACTGCG GACTTCGGGAGTAAGTGAACTGACTGGTTTTTATTTTGTTTTACTGCTTT TAACATTACAGTAACTGTTACAGGTTCCAGCAGGCTAACTGGGTGGAAAT GAGTTTGGTTTCACTTAGTCTCTCTAAAGAGAAAGCAAGTCGGTAGACTA ATACCTAATAAAAGCAAAGCTGCCAACAATTGAAATTGCCTAGGCTGCTC TGTGTGTCCCACAtgcatgggtgtgggtgccagtgtgtgtgcgtgtgtgc atgcatgtgcatgtgtgtTGGGATAGAGTGGTAAGAAAATGGGAAATAAT AAGAATGTTCAGTCCATAGCCCTTCATTATAAAAAGGTGAGCTGTAATAA ATACTAGTGCCACATTTAGCCAAAACTTTACTCCAGCCAAAGGTGATATT TTCATGATAACATCCTGTGATTGCTTTGTTCTTCGTCTTTTATGTTCTTC CTAGATGGGCTCAGAACATACAAGAATTAAGTACACATCTTATTTTCCAG TGATAATGCTACCGGCAAATTCTGTTGTTTGTATAAACATCAGCCATGTT TATATAACTAAACTAGTGTTTTGTTTTGTCAATTCAGCAAGAAATTAGAC CAAATGGTGGCTTAATGCTGCATTGATTTGGCTATCAATTTGTTTTCACT TTTCTGCAAAATAATTAATACATTATTAAATTGAATTGTGCTGATGCCAC AGTTGTTCTTATCTCAAGTGTCTTAAAATTCATTTAATTTGTTTTTCCTT TGGTTTCATTATTCAAATTTTAACTTCAGTTCTCAAGATTTTATCTGATG GAAGAGATGGAGTCCATTACTAAGGACTCCATTGTGCTCCATCATGCCAG AGTTGTAAAATAGATCTTTTAAAGGAAATTTACTGTGATTTTTTTTCTAT TTAAGAGCTTCCTCTCCAGTTGAGCATGTAAGAAAATTATACCAGGAGAA TACAGTAAACTCTATGAGGCAAGCTATAAACATGTAGCATTGTGATTAGG Gctggttctccttctagagacatggtaggattgcaatttcataccatcct tgaagttagagagagccacgtgactcatttagccaatgaactgtgagcag aatgacatgtcacttccagctgaagctttaacaatctgagagacattcat acattttccatgtgctgtagccttatacccaaagcctgggtcccaagtga ccatgacaggcagagctccctgttgagccacagagatttagagaatggct gttaacacagcataatccagcccatcctgactaatCTGATATTAACATGT ATAATAAAGAATTCTATCAATGCTGAGGGAAGATGACTAGTTAAGGTCCT AGGTTGCAAGTCTCAAAACCTCTTCTAAGGATTGTAGACAGGAAATTAAA TGACTTCTAGTCCCTAGAGTTCCCAATCTCCTACCATCCCATCCTAATAT 
GACAGAAGTAATTCCTGAGTTGCTTCTGAAACCAGAGCTTCCCTCAGAAC CCTTAGCCTGCCAGATGGCTTCTTGGAGAGCCCTCACTCACTTTTCTCCT TCTGCTATTGCTGCTCATTCATTCCAGTTTTTAAAAATTCATCTTTATCC AGGAACCTCGCTTCTAGAAAAGTCATACAGGTGCTTCCAGGAGGCTACAT GGGCACCCATATTTTTCTAGCCACTTTCATTAGACCAATGCAGCAGAGAA GAAAAGCCTCAATAATTATTATGACATGGCATGTTAGGATACCAAGTAAA TTGCATTTGTAAAATGTGATTTTCTGTTGGTGTTCACTTCAGCTCTACTG ACATTTGGTAAGTATTATTGACTGACTGACTAACTAATGTGGTCATTAGT CTTCATAAAGAAAGGCTCTCTACAAAAACGGAGGGATGCCCTTTTTCTGG CATTTAATACGTAAGAAATTGCCTCCAATAGAAACCAGAGTTGCCTGATT ACTATCAGCACAGGAGAAATGTATTAATGTGCCTTTCTAGTAACAGGTTT TTAGAAAGTCAAATATAAACAAATCTGTCTATTTGTGTGTGTGCATGTGG TAGTGGGGAGGGAAGAAAAAAGGAGGGGGAGAGAAAGAGAAATAAGAACC AAGTTTATTATACTGTATTCAGGGGGAAAAAATTTTCCCAAGGTCCTAAC AGAAGAGCAAAGTGCCACTGTCAATAGCCTCAGTAGTGTTAGGGTTGCTt ttatttatttatttatttatttatttatttatttatttatttttcctttt ttttctttctctttttttcttcttttttttttcttttctttctttttttt ttttttttttttttttggacagagtctcacactgtcacctgggctggagt gcattggtgcaatctcgactcactgcaacttctgcctcccaggttcaagt gattctcctgcctcagccgcccaagtagctgggattacaggtgtctgcca ccgtgcctagctaatttttttgtatttttagtagagatgaggtttcacta tgttggccaggctggtctcaaactcctgacctcatgatccacccacgttg gcctcccaaagtgctgggattacaggcgtgagccaccgcccctggccAGG ATTGCTTTTACAGCCAGTCTTCAGGTGCCCACTGTAGGAACAATGTCATT TAACCCTCGGGATTATTCTGTGCCAAATATGGATAATGACTAATATCCAA CACAGATATTCTCAGCTCAGAAGAGCAATTAGCAAATTCATAAATTAAGT GCTTGCTTCCTCTTTAGTCAAATACAAACGTTTGTTAAAAGATATTATTT TGCTTTACACTTTTTCTCTCAGAAATAAGCAGATGCTTGAATTCCCACAG TGCTGCTTGAGCCTCACACCATGTCATCCTGCCAGGCACCCAGATCCAGT TCTAGAGTTTCACATGATCGTGAgtgttggttaataagtcaatgtgaact gggaggggagatttttcaggagtgccacagggctctccctttaatcACAT ACACTCCCTGCTTTCATTGGAAAGTGTATAATGATGTCAGAGTGCCCCAG AATGGAGCTAGTTGGAAGACTGCCGTCATAGGGAtgccttagtgaattaa taaggttttaatttctggctctcaactttgtagatgtaaaagttgattta wham/indexes/0000755001532600153260000000000012054751660012505 5ustar yinanyinanwham/hitset.h0000644001532600153260000002003012003705361012501 0ustar yinanyinan#ifndef _HITSET_H_ #define _HITSET_H_ /** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 
WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* $Id: hitset.h 157 2012-07-25 05:58:09Z yinan $ */ #include #include #include "lib.h" #include "sequence.h" #include "short.h" #include "bitread.h" #include "error.h" #include "pair.h" #define MAX_NUM_HITS 100 #define MAX_NUM_COUNT 4 #define MAX_INT 2147483647 #define UNASSIGNED_QUAL 256 #define MODE_NORMAL 0 #define MODE_RAW 1 #define MODE_SAM 2 #define MODE_CAM 3 /* * Valid alignment struct */ typedef struct Hit { int id; unsigned int pos; int strand; ErrorVector error; // int nMismatch; int qual; int64 reference[8]; int64 query[8]; /* this becomes an optional field */ } Hit; /* * the set that containing all valid alignemtns of a particular read. */ class HitSet { public: int mode; Hit * hits; int maxWrite; //number of hits outputed int nHit; //number of hits stored int maxHit; int maxMatch; int maxQual; int length; CompactSequence * sequence; char str[256]; bool sorted; bool strata; bool unique; /* only one valid alignment */ bool properMatch; int counts[MAX_NUM_COUNT]; //counters for different error models double p[MAX_NUM_COUNT]; double psum; public: HitSet(int maxhit, int maxmatch, int maxqual, bool isSorted, bool isStrata) { int size; if (maxhit == 0) maxhit = MAX_INT; if (maxmatch == 0) maxmatch = MAX_INT; size = maxhit < maxmatch ? 
maxhit : maxmatch; hits = (Hit *) malloc(sizeof(Hit) * MAX_NUM_HITS); maxHit = maxWrite = maxhit; maxMatch = maxmatch; maxQual = maxqual; sorted = isSorted; strata = isSorted && isStrata; if (maxQual < 255 && maxhit < 10) { //keep up to 10 hits maxHit = 10; } unique = true; properMatch = true; } ~HitSet() { delete[] hits; } inline void init(CompactSequence * seq, int len) { sequence = seq; length = len; nHit = 0; for (int i = 0; i < MAX_NUM_COUNT; i++) counts[i] = 0; double pbase = 0.99; int i, j; for (i = 0; i < MAX_NUM_COUNT; i++) { //compute the binomial distribution p[i] = 1; for (j = 0; j < i; j++) p[i] *= 1 - pbase; for (j = 0; j < length - i; j++) p[i] *= pbase; for (j = 0; j < i; j++) { p[i] *= length - j; p[i] /= (j + 1); } } psum = 0; } inline int add(int64 * query, int64 * reference, unsigned int pos, strand s, ErrorVector * error, int qual, int rid) { int i; /* if there has been one element in the set, * there are multiple alignments for this read. */ // if (nHit == 1 && hits[0].pos != pos) // unique = false; if (nHit >= MAX_NUM_HITS) { // elog(WARNING, "WARNING: result set is full. Some reads may be discarded.\n"); return MSG_HITSETFULL; } if ((maxMatch == MAX_INT && nHit >= maxHit && !sorted) || nHit > maxMatch) return MSG_HITSETFULL; for (i = 0; i < nHit; i++) { /* linear search. can be replaced by binary search. 
*/ if (hits[i].pos == pos) return SUCCESS; } if (error->num < MAX_NUM_COUNT) { counts[error->num]++; psum += p[error->num]; } i = nHit; if (sorted) { for (; i > 0; i--) { if (hits[i - 1].error.num <= error->num) // if (hits[i-1].error.num <= error.num // || (hits[i-1].error.num == error.num && hits[i-1].qual <= qual)) break; hits[i] = hits[i - 1]; } } hits[i].pos = pos; hits[i].error = *error; hits[i].strand = s; hits[i].qual = UNASSIGNED_QUAL; hits[i].id = rid; // BITMAP_COPY(query, hits[i].query); BitRead::copy(reference, hits[i].reference); if (nHit < maxHit || (maxMatch != MAX_INT && nHit <= maxMatch)) nHit++; if ((maxMatch == MAX_INT && nHit >= maxHit && !sorted) || nHit > maxMatch) return MSG_HITSETFULL; //sorted return SUCCESS; } inline bool isFull() { if ((maxMatch == MAX_INT && nHit >= maxHit && !sorted) || nHit > maxMatch) return true; return false; } inline void reset() { nHit = 0; unique = true; properMatch = true; psum = 0.0001; for (int i = 0; i < MAX_NUM_COUNT; i++) counts[i] = 0; } /* * verify the quality score and shrink set under the limitation of maxWrite */ inline void verifyQual() { int i, j; /* * suppress all alignments if more than maxMatch valid * alignmetns exist for it. 
*/ if (nHit > maxMatch) { nHit = 0; return; } for (i = 0, j = 0; i < nHit; i++) { if (getQual(i) < maxQual) continue; if (i != j) hits[j] = hits[i]; j++; if (j >= maxWrite) break; } nHit = j; } inline int64 * getQuerySeq(int i) { return hits[i].query; } inline int64 * getReferenceSeq(int i) { return hits[i].reference; } inline uint32 getOffset(int i) { return hits[i].pos; } inline ErrorVector getErrorVector(int i) { return hits[i].error; } inline int getQual(int i) { if (hits[i].qual == UNASSIGNED_QUAL) { int num = hits[i].error.num; if (maxQual >= 255) hits[i].qual = 255; else { // if (num < MAX_NUM_COUNT && counts[num] >= 1) // return 0; if (num < MAX_NUM_COUNT) { hits[i].qual = (int) (-10 * log10(1 - p[num] / psum) + 0.5); if (hits[i].qual > 255) hits[i].qual = 255; } else { hits[i].qual = 3; } } } return hits[i].qual; } inline int getNumMismatch(int i) { return hits[i].error.num; } inline int getStrand(int i) { return hits[i].strand; } inline Hit * getHits() { return hits; } inline bool isProperMatch() { return properMatch; } inline int getNumHits() { return nHit; } inline int getNumAllHits() { return nHit; } }; /* * the set containing the valid aligments of paired-end reads. 
*/ class HitPairSet: public HitSet { public: HitPairSet(int maxhit, int maxmatch, int maxqual, bool isSorted, bool isStrata) : HitSet(maxhit * 2, maxmatch * 2, maxqual, isSorted, isStrata) { } int build(HitSet * set1, HitSet * set2, uint32 maxins, uint32 minins, bool pairStrand[][2], bool mateMatch); inline int getQual(int i) { i -= i % 2; if (hits[i].qual == UNASSIGNED_QUAL) { int num = hits[i].error.num + hits[i + 1].error.num; if (maxQual >= 255) hits[i].qual = 255; else { // if (num < MAX_NUM_COUNT && counts[num] >= 1) // return 0; hits[i].qual = (int) (-10 * log10(1 - p[num] / psum) + 0.5); if (hits[i].qual > 255) hits[i].qual = 255; } hits[i + 1].qual = hits[i].qual; } return hits[i].qual; } /* * verify the quality score and shrink set under the limitation of maxWrite */ inline void verifyQual() { int i, j; /* * suppress all alignments if more than maxMatch valid * alignmetns exist for it. */ if (nHit > maxMatch) { nHit = 0; return; } for (i = 0, j = 0; i < nHit; i += 2) { if (getQual(i) < maxQual) continue; if (i != j) hits[j] = hits[i]; j += 2; if (j >= maxWrite) break; } nHit = j; } private: int add(Hit * hit1, Hit * hit2); }; #endif wham/interval.h0000644001532600153260000000503112003705361013031 0ustar yinanyinan#ifndef _INTERVAL_H_ #define _INTERVAL_H_ /** * WHAM - high-throughput sequence aligner * Copyright (C) 2011 WHAM Group, University of Wisconsin * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
 */

/* $Id: interval.h 157 2012-07-25 05:58:09Z yinan $ */

#include "lib.h"

/* interval tree leaf level entry */
typedef struct IntervalLEntry {
	uint32 key; /* the offset in compact sequence */
	uint32 len;
	uint32 sid; /* the sequence id */
	uint32 offset; /* the offset in the original sequence */
} IntervalLEntry;

/* interval tree internal level entry */
typedef struct IntervalIEntry {
	uint32 key; /* the offset in compact sequence */
	uint32 offset; /* the position of the node in its child level */
} IntervalIEntry;

/* interval tree level: the append cursor of one level of the tree */
typedef struct IntervalLevel {
	uint32 curNode; /* the current node in the level */
	uint32 curEntry;/* the current entry in the current node */
} IntervalLevel;

/*
 * A static search tree over (key, offset) entries. Leaf entries live in
 * lpool and internal entries in ipool; level[] holds one append cursor
 * per level.
 */
class IntervalTree {
private:
	uint32 lenNSeg; /* presumably the N-segment length; added to the key in lookup() -- TODO confirm */
	uint32 numIEntry; /* the number of internal entries */
	uint32 numLEntry; /* the number of leaf entries */
	uint32 numLevel; /* the number of levels */
	uint32 fl; /* the fanout of leaf node */
	uint32 fi; /* the fanout of internal node */
	uint32 curLNode; /* the current allocation position in leaf node pool */
	uint32 curINode; /* the current allocation position in internal node pool */
	IntervalLEntry * lpool; /* the leaf node pool */
	IntervalIEntry * ipool; /* the internal node pool */
	IntervalLevel * level; /* the levels */

public:
	IntervalTree();
	IntervalTree(uint32 num, uint32 len);

	/* append one leaf entry */
	int append(uint32 key, uint32 len, uint32 sid, uint32 offset);
	/* append the final leaf entry and flush the tree */
	int flush(uint32 key, uint32 len, uint32 sid, uint32 offset);
	/* find the leaf entry for key; results are returned through sid and offset */
	int lookup(uint32 key, uint32 & sid, uint32 & offset);
	/* write the tree to / read it from "<path>.interval.whm" */
	int save(char * path);
	int load(char * path);

private:
	/* internal-level counterparts; l is the level index */
	int append(uint32 key, uint32 offset, uint32 l);
	int flush(uint32 key, uint32 offset, uint32 l);
};

#endif
wham/sequence.h0000644001532600153260000000635712003705361013031 0ustar yinanyinan#ifndef _SEQUENCE_H_
#define _SEQUENCE_H_
/**
 * WHAM - high-throughput sequence aligner
 * Copyright (C) 2011 WHAM Group, University of Wisconsin
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the
terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/* $Id: sequence.h 157 2012-07-25 05:58:09Z yinan $ */

#include "lib.h"
#include /* NOTE(review): header name missing in this copy; FILE is used below, so presumably <stdio.h> -- confirm */
#include "interval.h"

/* numeric codes for the bases */
#define BASE_A 0
#define BASE_C 1
#define BASE_G 2
#define BASE_T 3
#define BASE_N 4

#define NUM_LONGWORD(x) ((((x) - 1) >> BITS_LONGWORD_SHIFT) + 1)
#define NUM_LONGWORD_BASE(x) (((((x) - 1) >> BITS_LONGWORD_SHIFT) * BITS_PER_BASE) + 1)

/* test / set / clear bit y of x */
#define BITMAP_IS(x, y) (((x) >> (y)) & 0x1)
#define BITMAP_SET(x, y) ((x) |= (0x1 << (y)))
#define BITMAP_CLEAR(x, y) ((x) &= ~(0x1 << (y)))

#define MAX_NUM_CHAR 4294967295LLU
#define SEQUENCE_HEAD_WORDS 6
#define MAX_LENGTH_PATH 256

class Aligner;

class CompactSequence {
private:
	uint32 numChar; /* the number of characters in the compact sequence */
	uint32 numNSegment; /* the number of N segments */
	uint32 size; /* the size of the compact sequence (3bit/character) */
	int64 * sequence; /* the sequence (has an offset to the beginning of the pool array) */
	int64 * pool; /* the space for storing the compact sequence */
	char ** seqNames; /* file names */
	char * seqNamepool;
	uint32 * seqLens;
	int nSeq; /* the number of sequences */
	int len;
	int nError;
	bool skipMask; /* treat the masks (lowercase characters) as unknown characters */

	uint32 numRead;
	int64 * keys;
	uint32 * offsets;
public:
	IntervalTree * itree; /* the interval tree that is used for translation between original sequence offset and compact sequence offset.
 */

public:
	CompactSequence();
	CompactSequence(bool skip);
	int build(char ** fname, int numSeq, int length, int numError);
	int filter(Aligner * aligner, char ** fname, int numFile, char * path);
	int save(char * path);
	int load(char * path);
	int alignment(int lenKey, int nMismatch);
	int valid(int length, int numError);
	static void compose(char * str, int length, int64 * words);
	static void decompose(char * str, int length, int64 * words);
private:
	int preProcess(char * fname, uint32 numError, int64 & num,
			int64 & numNSegment, int64 & numFileSeq);
	int skipLine(FILE * file);
	int getSeqName(FILE * file, char * str);
	void extractFileName(char * dest, char * src);
public:
	/* number of characters in the compact sequence */
	unsigned int getNum() {
		return numChar;
	}
	/* number of sequences */
	int getNumSeq() {
		return nSeq;
	}
	/* pointer to the compact sequence words */
	int64 * getSequence() {
		return sequence;
	}
	/* name stored for sequence sid; sid is not range-checked */
	char * getSeqName(int sid) {
		return seqNames[sid];
	}
	/* length stored for sequence sid; sid is not range-checked */
	uint32 getSeqLen(int sid) {
		return seqLens[sid];
	}
};

//CompactSequence * sequenceLoad(char * fname, int length);

#endif
wham/interval.cpp0000644001532600153260000002201412003705361013364 0ustar yinanyinan/**
 * WHAM - high-throughput sequence aligner
 * Copyright (C) 2011 WHAM Group, University of Wisconsin
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/ /* $Id: interval.cpp 157 2012-07-25 05:58:09Z yinan $ */ #include #include #include #include #include "interval.h" #include "error.h" IntervalTree::IntervalTree() { } /* * IntervalTree::IntervalTree * allocate space for specified size of the interval tree. */ IntervalTree::IntervalTree(uint32 num, uint32 len) { uint32 k; lenNSeg = len; /* fanout of internal node and leaf node */ fi = CACHE_LINE_SIZE / sizeof(IntervalIEntry); fl = CACHE_LINE_SIZE / sizeof(IntervalLEntry); k = (uint32) ceil(double(num) / fl); numLEntry = k * fl; numIEntry = 0; numLevel = 1; do { numLevel++; k = (uint32) ceil(double(k) / fi); numIEntry += k * fi; } while (k > 1); /* allocate the space */ ipool = new IntervalIEntry[numIEntry]; lpool = new IntervalLEntry[numLEntry]; level = new IntervalLevel[numLevel]; curLNode = 0; curINode = 0; /* initialize the current entry in each level */ level[numLevel - 1].curNode = 0; level[numLevel - 1].curEntry = 0; curLNode += fl; for (int i = 0; i < numLevel - 1; i++) { level[i].curNode = i * fi; level[i].curEntry = 0; curINode += fi; } } /* * IntervalTree::append * append a new entry (a pair of values) to the end of the leaf * level of interval tree. 
*/ int IntervalTree::append(uint32 key, uint32 len, uint32 sid, uint32 offset) { uint32 curEntry; uint32 l = numLevel - 1; /* if the current node is full */ if (level[l].curEntry == fl) { level[l].curEntry = 0; level[l].curNode = curLNode; curLNode += fl; } /* copy the value of the new entry */ curEntry = level[l].curNode + level[l].curEntry; lpool[curEntry].key = key; lpool[curEntry].len = len; lpool[curEntry].sid = sid; lpool[curEntry].offset = offset; /* update the current entry */ level[l].curEntry++; /* if the current node is full */ if (level[l].curEntry == fl) { /* * insert the current node into the above level as * a new internal entry */ append(key, level[l].curNode, l - 1); } return SUCCESS; } /* * IntervalTree::append * append a new entry (a pair of values) to the end of a internal * level of interval tree. */ int IntervalTree::append(uint32 key, uint32 offset, uint32 l) { uint32 curEntry; /* if the current node is full */ if (level[l].curEntry == fi) { level[l].curEntry = 0; level[l].curNode = curINode; curINode += fi; } /* copy the value of the new entry */ curEntry = level[l].curNode + level[l].curEntry; ipool[curEntry].key = key; ipool[curEntry].offset = offset; /* update the current entry */ level[l].curEntry++; /* if the current node is full */ if (level[l].curEntry == fi) { /* * insert the current node into the above level as * a new internal entry */ if (l > 0) append(key, level[l].curNode, l - 1); } return SUCCESS; } /* IntervalTree::flush * insert the last entry and flush out the interval tree. Insert * the last node of the leaf level into its parent level. 
*/ int IntervalTree::flush(uint32 key, uint32 len, uint32 sid, uint32 offset) { uint32 i; uint32 curEntry; uint32 l = numLevel - 1; /* insert the last entry */ append(key, len, sid, offset); /* set all remaining keys the same as the last key */ for (i = level[l].curEntry; i < fl; i++) { curEntry = level[l].curNode + i; lpool[curEntry].key = key; lpool[curEntry].len = len; lpool[curEntry].sid = sid; lpool[curEntry].offset = offset; } /* * insert the current node to parent level, and flush the * parent level. */ flush(key, level[l].curNode, l - 1); return 1; } /* IntervalTree::flush * Insert the last entry and flush out the interval tree. Insert * the last node of the current internal level into its parent * level, if the current level is not the root level. */ int IntervalTree::flush(uint32 key, uint32 offset, uint32 l) { uint32 i; uint32 curEntry; /* insert the last entry */ append(key, offset, l); /* set all remaining keys the same as the last key */ for (i = level[l].curEntry; i < fi; i++) { curEntry = level[l].curNode + i; ipool[curEntry].key = key; ipool[curEntry].offset = offset; } /* * insert the current node to parent level, and flush the * parent level, if the current level is not the root level. */ if (l > 0) flush(key, level[l].curNode, l - 1); return 1; } /* * IntervalTree::lookup * This function is used to return the offset in the compact sequence * given an offset in the original sequence. To do this, we search the * interval tree top down, find the entry with the greatest original * sequence offset that is less or equal to the search key. The compact * sequence offset of the found entry is returned. */ int IntervalTree::lookup(uint32 skey, uint32 & sid, uint32 & offset) { uint32 i, id = 0; uint32 key, low, high, mid, entry; key = skey + lenNSeg; /* * search the internal nodes starting from the root node. With the * node in each level, we find the entry with the greatest original * sequence offset that is less or equal to the search key. 
Following * the offset field of the found entry, we go to the next level. */ for (i = 0; i < numLevel - 1; i++) { high = fi - 1; low = 0; /* binary search */ while (high > low) { mid = low + ((high - low) / 2); if (key >= ipool[id + mid].key + 1) low = mid + 1; else high = mid; } id = ipool[id + low].offset; } high = fl - 1; low = 0; /* binary search in the leaf level */ while (high > low) { mid = low + ((high - low) / 2); if (key >= lpool[id + mid].key + 1) low = mid + 1; else high = mid; } /* find the leaf entry */ entry = id + low; if (key < lpool[id + low].key) entry--; sid = lpool[entry].sid; /* * we compute the distance between the searched position * and the last N segment position in compact sequence, * add this distance to the last N segment position in the * original sequence. */ offset = key - lpool[entry].key + lpool[entry].offset; offset = offset >= lenNSeg ? offset - lenNSeg : 0; return 1; } /* * IntervalTree::save * save the in-memory interval tree to disk. */ int IntervalTree::save(char * path) { int ret; char fname[256]; FILE * file; if (strlen(path) > 240) return ERR_PARA; sprintf(fname, "%s.interval.whm", path); file = fopen(fname, "wb"); if (file == NULL) { elog(DEBUG1, "failed to open file: %s\n", fname); return ERR_PARA; } ret = fwrite(this, sizeof(IntervalTree), 1, file); if (ret != 1) { elog(DEBUG1, "ERROR: write head data file.\n"); return ERR_FILE; } ret = fwrite(lpool, sizeof(IntervalLEntry), numLEntry, file); if (ret != numLEntry) { elog(DEBUG1, "ERROR: write leaf entry.\n"); return ERR_FILE; } ret = fwrite(ipool, sizeof(IntervalIEntry), numIEntry, file); if (ret != numIEntry) { elog(DEBUG1, "ERROR: write interval entry.\n"); return ERR_FILE; } ret = fflush(file); if (ret != 0) { elog(DEBUG1, "ERROR: write head data file.\n"); return ERR_FILE; } ret = fclose(file); if (ret != 0) { return ERR_FILE; } return SUCCESS; } /* * IntervalTree:load * load the on-disk interval tree into memory. 
*/
/*
 * path: prefix of the input file; the tree is read from
 *       "<path>.interval.whm". Must not be NULL or longer than 240
 *       characters.
 *
 * The file starts with the raw bytes of an IntervalTree object, which
 * are read straight into *this, followed by numLEntry leaf entries and
 * numIEntry internal entries, which are read into freshly allocated
 * pools.
 *
 * Returns SUCCESS, ERR_PARA (bad path or the file cannot be opened),
 * ERR_MEM (allocation failure) or ERR_FILE (read or close failure).
 * The file is closed on every path. When reading or allocating a pool
 * fails, both pools are released and lpool/ipool are left NULL.
 *
 * NOTE(review): any other pointer member of IntervalTree is overwritten
 * with the address stored in the file by the header read and is not
 * rebuilt here -- confirm against the class declaration.
 */
int IntervalTree::load(char * path)
{
	char fname[256];
	FILE * file;
	int status = SUCCESS;

	if (path == NULL || strlen(path) > 240)
		return ERR_PARA;

	snprintf(fname, sizeof(fname), "%s.interval.whm", path);

	file = fopen(fname, "rb");
	if (file == NULL)
		return ERR_PARA;

	/* the header is the raw image of the object */
	if (fread(this, sizeof(IntervalTree), 1, file) != 1) {
		elog(DEBUG1, "ERROR: read interval tree structure data file.\n");
		fclose(file);
		return ERR_FILE;
	}

	/*
	 * The pool pointers just read are addresses from the process that
	 * saved the file; drop them so that no error path below leaves a
	 * stale pointer behind.
	 */
	lpool = NULL;
	ipool = NULL;

	/*
	 * leaf pool
	 * NOTE(review): a plain new[] normally throws instead of returning
	 * NULL; the NULL checks are kept for builds where it does not.
	 */
	lpool = new IntervalLEntry[numLEntry];
	if (lpool == NULL) {
		status = ERR_MEM;
	} else if (fread(lpool, sizeof(IntervalLEntry), numLEntry, file) != numLEntry) {
		elog(DEBUG1, "ERROR: read leaf entry data file.\n");
		status = ERR_FILE;
	}

	/* internal pool, only if the leaf pool was loaded */
	if (status == SUCCESS) {
		ipool = new IntervalIEntry[numIEntry];
		if (ipool == NULL) {
			status = ERR_MEM;
		} else if (fread(ipool, sizeof(IntervalIEntry), numIEntry, file) != numIEntry) {
			elog(DEBUG1, "ERROR: read internal entry data file.\n");
			status = ERR_FILE;
		}
	}

	/* a failed load must not leave half-initialised pools behind */
	if (status != SUCCESS) {
		delete [] lpool;
		delete [] ipool;
		lpool = NULL;
		ipool = NULL;
	}

	/* always release the handle; a close error only matters after a clean read */
	if (fclose(file) != 0 && status == SUCCESS)
		status = ERR_FILE;

	return status;
}