wham/ 0000755 0015326 0015326 00000000000 12054751660 011046 5 ustar yinan yinan wham/hash.h 0000644 0015326 0015326 00000022366 12054750341 012146 0 ustar yinan yinan #ifndef _HASH_H_
#define _HASH_H_
/**
* WHAM - high-throughput sequence aligner
* Copyright (C) 2011 WHAM Group, University of Wisconsin
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/* $Id: hash.h 166 2012-11-26 20:28:17Z yinan $ */
#include
#include
#include
#include "lib.h"
#include "sequence.h"
#include "hitset.h"
#include "embedhash.h"
#define L2_CACHE_SIZE (6 * 1024 * 1024)
#define HASH_COLLISION_MASK(x) ((x) << 31)
#define HASH_EMPTY (0x7fffffff)
#define HASH_NOT_FOUND (HASH_EMPTY)
#define HASH_IS_EMPTY(x) ((x) == HASH_EMPTY)
#define HASH_IS_COLLISION(x) ((x) >> 31)
#define HASH_SET_EMPTY(x) ((x) = HASH_EMPTY)
#define HASH_SET_COLLISION(x) ((x) |= 0x80000000)
#define HASH_SET_NON_COLLISION(x) ((x) &= 0x7fffffff)
#define HASH_GET_OFFSET(x) ((x) & 0x7fffffff)
#define HASH_SET_OFFSET(x, y) ((x) = ((x) & 0x80000000) | (y))
#define HASH_IS_END(x) ((x) >> 31)
#define HASH_SET_END(x) ((x) |= 0x80000000)
#define HASH_CLEAR_END(x) ((x) &= 0x7fffffff)
#define HASH_OVERFLOW_INIT (0xffffffff)
/*
 * Additive / XOR hash functions over an array of key words.
 *
 * x         pointer to the first key word (evaluated several times, so it
 *           must be free of side effects)
 * numBucket modulus applied to the combined words
 *
 * Which words are combined:
 *   _64   word 2
 *   _128  words 1..2
 *   _192  words 0..2
 *   _256  words 0..3
 *   _384  words 0..5 (sum, or XOR for HASH_XOR_FUNCTION_384)
 *
 * The argument x is fully parenthesized so that an expression argument
 * (e.g. "cond ? a : b" or "base + k") indexes the intended word.
 */
#define HASH_FUNCTION_64(x, numBucket) ((*((x) + 2)) % (numBucket))
#define HASH_FUNCTION_128(x, numBucket) ((*((x) + 1) + *((x) + 2)) % (numBucket))
#define HASH_FUNCTION_192(x, numBucket) ((*(x) + *((x) + 1) + *((x) + 2)) % (numBucket))
#define HASH_FUNCTION_256(x, numBucket) ((*(x) + *((x) + 1) + *((x) + 2) + *((x) + 3)) % (numBucket))
#define HASH_FUNCTION_384(x, numBucket) ((*(x) + *((x) + 1) + *((x) + 2) + *((x) + 3) + *((x) + 4) + *((x) + 5)) % (numBucket))
#define HASH_XOR_FUNCTION_384(x, numBucket) ((*(x) ^ *((x) + 1) ^ *((x) + 2) ^ *((x) + 3) ^ *((x) + 4) ^ *((x) + 5)) % (numBucket))
/*
 * HASH_DEBUG(x) expands to x when DEBUG_HASH_PRINT is defined and to nothing
 * otherwise, so wrapped statements vanish from builds without that symbol.
 */
#ifdef DEBUG_HASH_PRINT
#define HASH_DEBUG(x) x
#else
#define HASH_DEBUG(x)
#endif
/*
 * Hash a key into a bucket index using the mixing steps of Jenkins'
 * one-at-a-time hash, applied per 32-bit word instead of per byte.
 *
 * x         key buffer; it is read both as 32-bit words (indices 11, 10, 9,
 *           in that order) and as 64-bit words (indices 0..4), so it must
 *           address at least twelve 32-bit words
 * numBucket modulus applied to the final value
 *
 * Returns the mixed value plus the first five 64-bit key words, reduced
 * modulo numBucket. The final sum mixes a uint32 with int64 operands, so it
 * is evaluated in the promoted type before the modulo.
 */
static inline uint32 JenkinsHash(int64 * x, uint32 numBucket) {
    const uint32 * words = (const uint32 *) x;
    uint32 acc = 0;

    /* Absorb 32-bit words 11, 10 and 9, highest index first. */
    for (int idx = 11; idx >= 9; idx--) {
        acc += words[idx];
        acc += (acc << 10);
        acc ^= (acc >> 6);
    }

    /* Final avalanche. */
    acc += (acc << 3);
    acc ^= (acc >> 11);
    acc += (acc << 15);

    return (acc + x[0] + x[1] + x[2] + x[3] + x[4]) % numBucket;
}
/*
 * Hash a key into a bucket index with MurmurHash2-style mixing
 * (multiplier 0x5bd1e995, shift 24) over 32-bit words of the key.
 *
 * x         key buffer; it is read as 32-bit words at indices 11 down to 6
 *           and as 64-bit words at indices 0..2, so it must address at least
 *           twelve 32-bit words
 * numBucket modulus applied to the final value
 *
 * Words 11, 10 and 9 are always mixed in; words 8, 7 and 6 are mixed in only
 * when they are nonzero. The result is the finalized state plus the first
 * three 64-bit key words, reduced modulo numBucket.
 */
static inline uint32 murmurHash(int64 * x, uint32 numBucket) {
    const unsigned int mult = 0x5bd1e995;
    const int shift = 24;
    const uint32 * words = (const uint32 *) x;
    uint32 state = 322 ^ 3;

    for (int idx = 11; idx >= 6; idx--) {
        /* The three lowest-indexed words are skipped when they are zero. */
        if (idx <= 8 && words[idx] == 0)
            continue;

        uint32 block = words[idx];
        block *= mult;
        block ^= block >> shift;
        block *= mult;

        state *= mult;
        state ^= block;
    }

    /* Finalization. */
    state ^= state >> 13;
    state *= mult;
    state ^= state >> 15;

    return (state + x[0] + x[1] + x[2]) % numBucket;
}
/*
 * HASH_FUNCTION(x, numBucket, words, bucket) stores the bucket index for key
 * x into bucket. The active definition uses murmurHash; the two commented
 * lines are alternative definitions kept for reference. The "words"
 * parameter is not used by the active definition.
 */
//#define HASH_FUNCTION(x, numBucket, words, bucket) bucket = HASH_FUNCTION_384(x, numBucket)
//#define HASH_FUNCTION(x, numBucket, words, bucket) bucket = JenkinsHash(x, numBucket)
#define HASH_FUNCTION(x, numBucket, words, bucket) bucket = murmurHash(x, numBucket)
/* 2147483648 == 2^31, i.e. one more than the largest 31-bit offset. */
#define COMPRESS_TABLE_SIZE 2147483648LLU
#define STAT_DISTRIBUTION_NUM 10
/* Number of slots used by HitPositionList (pos % HITLIST_BUCKET_NUM). */
#define HITLIST_BUCKET_NUM 631
/* Only referenced by the commented-out HitPositionList variant in this header. */
#define HITLIST_BUCKET_LENGTH 100
/*
class HitPositionList
{
public:
uint32 counters[HITLIST_BUCKET_NUM];
uint32 buckets[HITLIST_BUCKET_LENGTH][HITLIST_BUCKET_NUM];
int npos;
public:
HitPositionList() {
npos = 0;
for (int i = 0; i < HITLIST_BUCKET_NUM; i++)
counters[i] = 0;
}
inline bool insert(uint32 pos) {
int bucketId = pos % HITLIST_BUCKET_NUM;
uint32 * list = buckets[bucketId];
for (int i = 0; i < counters[bucketId]; i++) {
if (buckets[i][bucketId] == pos)
return true;
}
if (counters[bucketId] < HITLIST_BUCKET_LENGTH) {
buckets[counters[bucketId]++][bucketId] = pos;
npos++;
} else {
printf("warning: hit list hash list is full.\n");
}
return false;
}
};
*/
class HitPositionList {
public:
int64 array[HITLIST_BUCKET_NUM / 64 + 1];
HitPositionList () {
memset(array, 0, sizeof(int64) * (HITLIST_BUCKET_NUM / 64 + 1));
}
inline bool insert(uint32 pos) {
int bucketId = pos % HITLIST_BUCKET_NUM;
int offset = 63 - (bucketId & 0x3f);
int64 * data = &array[bucketId >> 6];
// printf("{%d %d %u} ", bucketId, 63 - offset, *data);
int64 exist = (*data >> offset) & 1ULL;
*data = *data | (1ULL << offset);
// printf("{%d %d %u} ", bucketId, 63 - offset, *data);
return (bool)exist;
}
};
/*
 * Plain data holder: a counter plus a fixed array of 1000 uint32 slots.
 * No bounds checking is provided here; users must keep indices below 1000.
 */
class HitPositionArray {
public:
uint32 num; /* presumably the number of valid entries in array -- confirm against users */
uint32 array[1000];
};
class EmbedHashTable;
/*
 * Hash index over a reference sequence.
 *
 * Only the small inline accessors are defined in this header; all other
 * member functions are defined elsewhere, and their behavior is not
 * documented here beyond their signatures.
 */
class HashTable {
private:
    int indexID;
    bool compressedTable;
    int length; /* the length of query sequence (characters) */
    int lenSeq; /* the length of query sequence (bits) */
    int lenPartition;
    int lenKey;
    int lenRest;
    int nMismatch; /* the number of allowed errors */
    int nSubstitute; /* the number of allowed substitutions */
    int nInsert; /* the number of allowed insertions */
    int nDelete; /* the number of allowed deletions */
    int nPartition;
    // int nLookup;
    uint32 numBucket; /* the number of buckets */
    uint32 numEntry; /* the number of entries */
    uint32 numOverflowEntry;/* the number of overflow entries (stored in the overflow list) */
    uint32 numCollision; /* the number of buckets with collisions */
    uint32 numEmpty; /* the number of empty buckets */
    uint32 * buckets; /* bucket array */
    uint32 * overflowPool; /* overflow pool array */
    CompactSequence * sequence; /* the reference sequence */
    unsigned char * emptyBits; /* the bitmap for empty buckets (only used in building phase) */
    unsigned char * collisionBits; /* the bitmap for collision buckets (only used in building phase) */
    unsigned char * overflowBits;
    uint32 maxScan;
    int nMaxError;
    int nMaxGap;
    int maxQual;
    int maxRepeat;
    int widthKeySpan;
    int keyPartitions[10]; /* filled by setLookupInfo(); at most 10 entries */
    const static uint32 nHistogram = 28;
    uint32 histogram[nHistogram];
    char seq1[256];
    char seq2[256];
    char align1[256];
    char align2[256];
    /* Counters cleared by resetStat() and reported by printStat(). */
    uint32 statSeqProbe;
    uint32 statProbe;
    uint32 statEmpty;
    uint32 statCollision;
    bool bUseEmbedTables;
    int64 headMask[WORDS_PER_READ];
    int64 embedHeadMask[WORDS_PER_READ];
    int embedShreshold;
    int numEmbedTables;
    int numLongLists;
    int numEmbedTablesPerList;
    int * embedTableSizes;
    uint32 * embedTableBucketIds;
    EmbedHashTable * embedTables;
    unsigned char * embedBits;
public:
    HashTable();
    ~HashTable();
    void init(CompactSequence * seq, int len, unsigned int nBucket,
            int numMismatch, int numInsert, int numDelete, int nPartition,
            int maxRepeat, bool useEmbedTables, int index);
    int preProcessInit();
    int preProcessEnd();
    int buildInit();
    void preProcessInsert(int64 * key); /*inline*/
    void insert(int64 * key, unsigned int offset); /*inline*/
    int buildEmbedTable();
    unsigned int lookupEmbedTable(int64 * orgkey, uint32 bucketId, int keyOffset,
            char * quals, strand s, int rid, HitSet * hits, bool noGap);
    void preinsertEmbedTableEntry(int64 * query, int embedId, int keyId);
    void insertEmbedTableEntry(int64 * query, uint32 seqOffset, int embedId, int keyId);
    unsigned int lookup(int64 * orgkey, int64 * key, int keyOffset, char * quals,
            strand s, int rid, HitSet * hits, bool noGap = false);
    bool lookup(int64 * key, uint32 offset);
    bool longOverflowList(uint32 bucketId, uint32 maxlen);
    int removeRepeat(uint32 num);
    void checkRepeat(uint32 num);
    int sortList();
    int check(int num);
    int bulkLookup(int64 * keys, int num);
    int save(char * path);
    int load(char * path, int index, CompactSequence * seq);
    int remove();
    uint32 printOverflowList(uint32 bucketId, uint32 keyOffset, int64 * key);
    void setScanThreshold(double r);

    /* Zeroes the four probe statistics counters. */
    void resetStat() {
        statProbe = 0;
        statSeqProbe = 0;
        statEmpty = 0;
        statCollision = 0;
    }

    /* Returns maxScan (stored as uint32, returned as int). */
    int getMaxScan() {
        return maxScan;
    }

    /* Returns whether embedded tables are enabled for this table. */
    bool useEmbedTables() {
        return bUseEmbedTables;
    }

    /*
     * Prints the probe statistics to stdout.
     * The average scan length is reported as 0.00 when no probe has been
     * counted, instead of dividing by zero.
     * NOTE(review): the 3000000.0 divisor is hard-coded; presumably a fixed
     * read count from a benchmark -- confirm. The "Avergae" spelling in the
     * second label is left untouched in case output is parsed downstream.
     */
    void printStat() {
        double avgScan = 0.0;
        if (statProbe != 0)
            avgScan = (double) statSeqProbe / statProbe;
        printf("Average scan length: %.2f\n", avgScan);
        printf("Avergae lookups: %.2f\n", statProbe / 3000000.0);
        printf("Scan: %u Same: %u \n", statSeqProbe, statEmpty);
    }

    /* Stores the error-model limits used by lookups. */
    void setErrorModel(int maxerr, int maxgap, int maxqual) {
        nMaxError = maxerr;
        nMaxGap = maxgap;
        maxQual = maxqual;
    }

    /*
     * Stores the key span and copies the first nPartition entries of
     * partitions into keyPartitions.
     * The copy length is clamped to the capacity of keyPartitions so that an
     * out-of-range nPartition cannot write past the array; nothing is copied
     * when nPartition is not positive or partitions is NULL.
     */
    void setLookupInfo(int keySpan, int * partitions) {
        widthKeySpan = keySpan;

        const int capacity = (int) (sizeof(keyPartitions) / sizeof(keyPartitions[0]));
        int count = nPartition;
        if (count > capacity)
            count = capacity;
        if (count > 0 && partitions != NULL)
            memcpy(keyPartitions, partitions, sizeof(int) * count);
    }
private:
    // void outputAlignment(int64 * orgkey, int64 * entrykey, uint32 offset, bool forward);
    // void outputSequence(int64 * key, int len, char * str);
    unsigned int nextPrime(unsigned int num);
};
#endif
wham/main.cpp 0000644 0015326 0015326 00000051650 12054641424 012501 0 ustar yinan yinan /**
* WHAM - high-throughput sequence aligner
* Copyright (C) 2011 WHAM Group, University of Wisconsin
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/* $Id: main.cpp 165 2012-11-26 10:23:16Z yinan $ */
#include
#include
#include
#include
#include
#include
#ifndef WIN32
#include
#endif
#include
#include
#include "error.h"
#include "aligner.h"
#include "sequence.h"
#include "pair.h"
#include "short.h"
#include "edit_distance.h"
#include "perfcounters.h"
#include "rdtsc.h"
#include "util.h"
using namespace std;
/* Logging level variable defined in another translation unit. */
extern int ELOG_LEVEL;
/* Version string printed by printversion(). */
char pgversion[] = "0.1.5";
char * pgcommand;
Aligner * aligner;
/* Output path buffer; starts as the empty string. */
char outputpath[MAX_LENGTH_PATH] = "";
/* Run-time settings with their initial values; presumably overridden by
 * command-line parsing elsewhere in this file -- confirm. */
int maxerr = 0;
int maxgap = 0;
int nThread;
int k = 1, m = 0;
bool sorted = false, strata = false;
PerfCounters perfcounter;
unsigned long long perfctr[4];
int minins = 0, maxins = 250;
char * alignFileName = NULL, *unalignFileName = NULL;
bool concatenate = true;
AlignRes * threadres;
/* Counters defined in other translation units; reported by printStatInfos(). */
extern int64 statHashLookup;
extern int64 statHashLookupEntry;
extern int64 statEmbedHashLookup;
extern int64 statEmbedHashLookupEntry;
/*
 * Bundle of an integer id, an AlignInfo value and a path buffer of
 * MAX_LENGTH_PATH characters; presumably one instance per worker thread --
 * confirm against the code that fills it.
 */
typedef struct ThreadInfo {
int id;
AlignInfo info;
char outputpath[MAX_LENGTH_PATH];
} ThreadInfo;
/*
 * Logs the four global hash lookup counters at DEBUG1 level.
 *
 * The counters are declared as int64 while the format uses %llu, so each
 * argument is cast to unsigned long long to make the variadic argument type
 * match the conversion specifier exactly.
 */
void printStatInfos() {
    elog(DEBUG1, "Hash Lookups: %llu\n", (unsigned long long) statHashLookup);
    elog(DEBUG1, "Hash Lookup Entries: %llu\n", (unsigned long long) statHashLookupEntry);
    elog(DEBUG1, "Embed Hash Lookups: %llu\n", (unsigned long long) statEmbedHashLookup);
    elog(DEBUG1, "Embed Hash Lookup Entries: %llu\n", (unsigned long long) statEmbedHashLookupEntry);
}
/*
 * skip a line in a text file
 *
 * Consumes characters from file up to and including the first line feed
 * (10) or carriage return (13), or until end of file / read error.
 *
 * Returns the number of fgetc() calls made, i.e. the characters consumed
 * including the terminator; a terminating EOF is counted as well, so the
 * result is always at least 1.
 *
 * The character is held in an int, as returned by fgetc(): storing it in a
 * char would make a 0xFF data byte compare equal to EOF where char is
 * signed, and would never match EOF where char is unsigned.
 */
int skipLine(FILE * file) {
    int consumed = 0;
    int c;

    do {
        c = fgetc(file);
        consumed++;
    } while (c != '\n' && c != '\r' && c != EOF);

    return consumed;
}
/*
 * split a comma list
 *
 * str is tokenized in place with strtok(), so it is modified and must be
 * writable. Empty fields (leading, trailing or consecutive commas) produce
 * no token.
 *
 * On return num holds the number of tokens actually stored, and the result
 * is a new[]-allocated array whose first num entries are new[]-allocated
 * copies of the tokens; the caller owns both levels. The array is sized for
 * (number of commas + 1) entries and unused entries are NULL.
 *
 * Previously num was set to (number of commas + 1) even when strtok()
 * yielded fewer tokens, which left the trailing entries uninitialized.
 */
char ** commaList(char * str, int & num) {
    /* Upper bound on the token count: one more than the number of commas. */
    int capacity = 1;
    for (char * comma = strchr(str, ','); comma != NULL; comma = strchr(comma + 1, ','))
        capacity++;

    char ** tok = new char *[capacity];
    for (int slot = 0; slot < capacity; slot++)
        tok[slot] = NULL;

    int count = 0;
    for (char * piece = strtok(str, ","); piece != NULL; piece = strtok(NULL, ",")) {
        tok[count] = new char[strlen(piece) + 1];
        strcpy(tok[count], piece);
        count++;
    }

    num = count;
    return tok;
}
/*
 * convert string parameter to integer
 *
 * Accepts a decimal number optionally followed by a size suffix:
 * k/K (x1024), m/M (x1024^2) or g/G (x1024^3). The suffix characters are
 * searched for in the order k, K, m, M, g, G and the first one found
 * anywhere in str is used; str is truncated at that character (it is
 * modified in place) before the number is parsed with atoi().
 *
 * Returns the scaled value, or atoi(str) when no suffix is present.
 *
 * The 'g' test used to be written "*c = 'g' || ..." (assignment), which only
 * worked because no other suffix can reach that branch.
 * NOTE(review): the multiplication is done in int and overflows for values
 * of 2g and above; callers must stay within int range.
 */
int getArguVal(char * str) {
    static const char suffixes[] = "kKmMgG";
    char * c = NULL;
    int base;

    /* Preserve the original priority: first 'k', then 'K', 'm', 'M', 'g', 'G'. */
    for (const char * s = suffixes; *s != '\0' && c == NULL; s++)
        c = strchr(str, *s);

    if (c == NULL)
        return atoi(str);

    switch (*c) {
    case 'k':
    case 'K':
        base = 1024;
        break;
    case 'm':
    case 'M':
        base = 1024 * 1024;
        break;
    case 'g':
    case 'G':
        base = 1024 * 1024 * 1024;
        break;
    default:
        base = 1;
        break;
    }

    *c = '\0';
    return base * atoi(str);
}
/*
 * Reads the first line of the file named "VERSION" in the current working
 * directory into buf.
 *
 * buf must have room for at least 16 characters: at most 15 characters of
 * the line are stored, followed by a terminating NUL.
 */
void getVersion(char * buf) {
    ifstream in;
    in.open("VERSION", ifstream::in);
    in.getline(buf, 16);
    in.close();
}
/*
 * Writes the program version line ("WHAM Version <pgversion>") to standard
 * output and returns SUCCESS.
 */
int printversion() {
    fprintf(stdout, "WHAM Version %s\n", pgversion);
    return SUCCESS;
}
int printhelp() {
printf("Usage:\n");
printf(
" wham [options]* { | -1 -2 }