gclib-0.12.7/000077500000000000000000000000001407072766100126735ustar00rootroot00000000000000gclib-0.12.7/.gitignore000066400000000000000000000004011407072766100146560ustar00rootroot00000000000000.svn # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app /Default/ gclib-0.12.7/GAlnExtend.cpp000066400000000000000000001075631407072766100154040ustar00rootroot00000000000000#include "GAlnExtend.h" //greedy gapped alignment extension //(mostly lifted from NCBI's megablast gapped extension code) int GXMemPool::kMinSpace = 1000000; // ifdef TRIMDEBUG char COLOR_buf[6]={0x1B,'[', 'n','m','m','\0'}; void color_fg(int c,FILE* f) { if (f!=stderr && f!=stdout) return; sprintf((char *)(&COLOR_buf[2]),"%dm",c+30); fwrite(COLOR_buf,1,strlen(COLOR_buf), f); } void color_bg(int c, FILE* f) { if (f!=stderr && f!=stdout) return; sprintf((char *)(&COLOR_buf[2]),"%dm",c+40); fwrite(COLOR_buf,1,strlen(COLOR_buf),f); }; void color_resetfg(FILE* f) { if (f!=stderr && f!=stdout) return; sprintf((char *)(&COLOR_buf[2]),"39m"); fwrite(COLOR_buf,1,strlen(COLOR_buf), f); }; void color_resetbg(FILE* f) { if (f!=stderr && f!=stdout) return; sprintf((char *)(&COLOR_buf[2]),"49m"); fwrite(COLOR_buf,1,strlen(COLOR_buf), f); } void color_reset(FILE* f) { if (f!=stderr && f!=stdout) return; sprintf((char *)(&COLOR_buf[2]),"0m"); fwrite(COLOR_buf,1,strlen(COLOR_buf), f); }; void color_normal(FILE* f) { if (f!=stderr && f!=stdout) return; sprintf((char *)(&COLOR_buf[2]),"22m"); fwrite(COLOR_buf,1,strlen(COLOR_buf), f); }; // endif char xgapcodes[4]={'S','I', 'D', 'X'}; int get_last(int **flast_d, int d, int diag, int *row1) { if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) { *row1 = flast_d[d-1][diag-1]; return diag-1; } if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) { *row1 = flast_d[d-1][diag]; 
return diag; } *row1 = flast_d[d-1][diag+1]; return diag+1; } void GapXEditScript::print() { //debug GapXEditScript* p=this; do { GMessage("%d%c ",p->num, xgapcodes[p->op_type]); } while ((p=p->next)!=NULL); GMessage("\n"); } int BLAST_Gcd(int a, int b) { int c; b = abs(b); if (b > a) c=a, a=b, b=c; while (b != 0) { c = a%b; a = b; b = c; } return a; } int BLAST_Gdb3(int* a, int* b, int* c) { int g; if (*b == 0) g = BLAST_Gcd(*a, *c); else g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c)); if (g > 1) { *a /= g; *b /= g; *c /= g; } return g; } uint16 get6mer(char* p) { uint16 r=gdna2bit(p,3); r <<= 6; r |= gdna2bit(p,3); return r; } void table6mers(const char* s, int slen, GVec* amers[]) { for (uint16 i=0; i <= slen-6; i++) { char* p = (char*)(s+i); uint16 v=get6mer(p); if (amers[v]==NULL) { amers[v]=new GVec(1); } amers[v]->Add(i); } } GVec* match6mer(char* start, GVec* amers[]) { //careful: this is broken if start+5 falls beyond the end of the string! uint16 r=get6mer(start); return amers[r]; } //signal that a diagonal is invalid static const int kInvalidOffset = -2; int s_FindFirstMismatch(const char *seq1, int len1, const char *seq2, int len2, int seq1_index, int seq2_index, //bool &fence_hit, bool reverse) { int start_index = seq1_index; /* Sentry detection here should be relatively inexpensive: The sentry value cannot appear in the query, so detection only needs to be done at exit from the subject-query matching loop. For uncompressed sequences, ambiguities in the query (i.e. 
seq1) always count as mismatches */ if (reverse) { while (seq1_index < len1 && seq2_index < len2 && //seq1[len1-1 - seq1_index] < 4 && seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) { ++seq1_index; ++seq2_index; } //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) { //if len2-1-seq2_index<=0) { // fence_hit = true; // } } else { //forward lookup while (seq1_index < len1 && seq2_index < len2 && //seq1[seq1_index] < 4 && seq1[seq1_index] == seq2[seq2_index]) { ++seq1_index; ++seq2_index; } //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) { //if (seq2_index==len2) { // fence_hit = true; //} } return seq1_index - start_index; } /** During the traceback for a non-affine greedy alignment, compute the diagonal that will result from the next traceback operation @param last_seq2_off Array of offsets into the second sequence; last_seq2_off[d][k] gives the largest offset into the second sequence that lies on diagonal k and has distance d [in] @param d Starting distance [in] @param diag Index of diagonal that produced the starting distance [in] @param seq2_index The offset into the second sequence after the traceback operation has completed [out] @return The diagonal resulting from the next traceback operation being applied */ int s_GetNextNonAffineTback(int **last_seq2_off, int d, int diag, int *seq2_index) { // choose the traceback operation that results in the // largest seq2 offset at this point, then compute the // new diagonal that is implied by the operation if (last_seq2_off[d-1][diag-1] > GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) { *seq2_index = last_seq2_off[d-1][diag-1]; return diag - 1; // gap in seq2 } if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) { *seq2_index = last_seq2_off[d-1][diag]; return diag; // match } *seq2_index = last_seq2_off[d-1][diag+1]; return diag + 1; // gap in seq1 } int GXGreedyExtend(const char* seq1, int len1, const char* seq2, int len2, bool reverse, //int xdrop_threshold, 
int match_cost, int mismatch_cost, int& seq1_align_len, int& seq2_align_len, CGreedyAlignData& aux_data, GXEditScript *edit_block) { //GapPrelimEditBlock *edit_block, //bool& fence_hit, SGreedySeed *seed) { int seq1_index; int seq2_index; int index; int d; int k; int diag_lower, diag_upper; int max_dist; int diag_origin; int best_dist; int best_diag; int** last_seq2_off; int* max_score; int xdrop_offset; int longest_match_run; bool end1_reached, end2_reached; GXMemPool* mem_pool; /* ordinary dynamic programming alignment, for each offset in seq1, walks through offsets in seq2 until an X-dropoff test fails, saving the best score encountered along the way. Instead of score, this code tracks the 'distance' (number of mismatches plus number of gaps) between seq1 and seq2. Instead of walking through sequence offsets, it walks through diagonals that can achieve a given distance. Note that in what follows, the numbering of diagonals implies a dot matrix where increasing seq1 offsets go to the right on the x axis, and increasing seq2 offsets go up the y axis. The gapped alignment thus proceeds up and to the right in the graph, and diagonals are numbered increasing to the right */ best_dist = 0; best_diag = 0; /* set the number of distinct distances the algorithm will examine in the search for an optimal alignment. 
The heuristic worst-case running time of the algorithm is O(max_dist**2 + (len1+len2)); for sequences which are very similar, the average running time will be sig- nificantly better than this */ max_dist = GMIN(GREEDY_MAX_COST, (len2/GREEDY_MAX_COST_FRACTION + 1)); /* the main loop assumes that the index of all diagonals is biased to lie in the middle of allocated bookkeeping structures */ diag_origin = max_dist + 2; // last_seq2_off[d][k] is the largest offset into seq2 that // lies on diagonal k and has distance d last_seq2_off = aux_data.last_seq2_off; /* Instead of tracking the best alignment score and using xdrop_theshold directly, track the best score for each unique distance and use the best score for some previously computed distance to implement the X-dropoff test. xdrop_offset gives the distance backwards in the score array to look */ xdrop_offset = aux_data.xdrop_ofs; // find the offset of the first mismatch between seq1 and seq2 index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse); // fence_hit, reverse, rem); // update the extents of the alignment, and bail out // early if no further work is needed seq1_align_len = index; seq2_align_len = index; seq1_index = index; /* seed->start_q = 0; seed->start_s = 0; seed->match_length = index; */ longest_match_run = index; if (index == len1 || index == len2) { /* Return the number of differences, which is zero here */ if (edit_block != NULL) //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index); edit_block->opRep(index); return 0; } // set up the memory pool mem_pool = aux_data.space; if (edit_block == NULL) { mem_pool = NULL; } else if (mem_pool == NULL) { aux_data.space = mem_pool = new GXMemPool(); } else { mem_pool->refresh(); } /* set up the array of per-distance maximum scores. 
There are max_diags + xdrop_offset distances to track, the first xdrop_offset of which are 0 */ max_score = aux_data.max_score + xdrop_offset; for (index = 0; index < xdrop_offset; index++) aux_data.max_score[index] = 0; // fill in the initial offsets of the distance matrix last_seq2_off[0][diag_origin] = seq1_index; max_score[0] = seq1_index * aux_data.match_reward; diag_lower = diag_origin - 1; diag_upper = diag_origin + 1; end1_reached = end2_reached = false; // for each distance for (d = 1; d <= max_dist; d++) { int xdrop_score; int curr_score; int curr_extent = 0; int curr_seq2_index = 0; int curr_diag = 0; int tmp_diag_lower = diag_lower; int tmp_diag_upper = diag_upper; // Assign impossible seq2 offsets to any diagonals that // are not in the range (diag_lower,diag_upper). // These will serve as sentinel values for the inner loop last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset; last_seq2_off[d - 1][diag_lower] = kInvalidOffset; last_seq2_off[d - 1][diag_upper] = kInvalidOffset; last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset; // compute the score for distance d corresponding to the X-dropoff criterion xdrop_score = max_score[d - xdrop_offset] + (aux_data.match_reward + aux_data.mismatch_penalty) * d - aux_data.x_drop; xdrop_score = (int)ceil((double)xdrop_score / (aux_data.match_reward>>1)); // for each diagonal of interest for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) { /* find the largest offset into seq2 that increases the distance from d-1 to d (i.e. keeps the alignment from getting worse for as long as possible), then choose the offset into seq1 that will keep the resulting diagonal fixed at k Note that this requires kInvalidOffset+1 to be smaller than any valid offset into seq2, i.e. 
to be negative */ seq2_index = GMAX(last_seq2_off[d - 1][k + 1], last_seq2_off[d - 1][k ]) + 1; seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]); seq1_index = seq2_index + k - diag_origin; if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) { // if no valid diagonal can reach distance d, or the // X-dropoff test fails, narrow the range of diagonals // to test and skip to the next diagonal if (k == diag_lower) diag_lower++; else last_seq2_off[d][k] = kInvalidOffset; continue; } diag_upper = k; /* slide down diagonal k until a mismatch occurs. As long as only matches are encountered, the current distance d will not change */ index = s_FindFirstMismatch(seq1, len1, seq2, len2, seq1_index, seq2_index, reverse); //fence_hit, reverse, rem); if (index > longest_match_run) { //seed->start_q = seq1_index; //seed->start_s = seq2_index; //seed->match_length = index; longest_match_run = index; } seq1_index += index; seq2_index += index; // set the new largest seq2 offset that achieves // distance d on diagonal k last_seq2_off[d][k] = seq2_index; // since all values of k are constrained to have the // same distance d, the value of k which maximizes the // alignment score is the one that covers the most of seq1 and seq2 if (seq1_index + seq2_index > curr_extent) { curr_extent = seq1_index + seq2_index; curr_seq2_index = seq2_index; curr_diag = k; } /* clamp the bounds on diagonals to avoid walking off either sequence. 
Because the bounds increase by at most one for each distance, diag_lower and diag_upper can each be of size at most max_diags+2 */ if (seq2_index == len2) { diag_lower = k + 1; end2_reached = true; } if (seq1_index == len1) { diag_upper = k - 1; end1_reached = true; } } // end loop over diagonals // compute the maximum score possible for distance d curr_score = curr_extent * (aux_data.match_reward / 2) - d * (aux_data.match_reward + aux_data.mismatch_penalty); // if this is the best score seen so far, update the // statistics of the best alignment if (curr_score > max_score[d - 1]) { max_score[d] = curr_score; best_dist = d; best_diag = curr_diag; seq2_align_len = curr_seq2_index; seq1_align_len = curr_seq2_index + best_diag - diag_origin; } else { max_score[d] = max_score[d - 1]; } // alignment has finished if the lower and upper bounds // on diagonals to check have converged to each other if (diag_lower > diag_upper) break; /* set up for the next distance to examine. Because the bounds increase by at most one for each distance, diag_lower and diag_upper can each be of size at most max_diags+2 */ if (!end2_reached) diag_lower--; if (!end1_reached) diag_upper++; if (edit_block == NULL) { // if no traceback is specified, the next row of // last_seq2_off can reuse previously allocated memory //WARNING The following assumes two arrays of // at least max_dist+4 int's have already been allocated last_seq2_off[d + 1] = last_seq2_off[d - 1]; } else { // traceback requires all rows of last_seq2_off to be saved, // so a new row must be allocated last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int)); // move the origin for this row backwards // dubious pointer arithmetic ?! 
//last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2; } } // end loop over distinct distances if (edit_block == NULL) return best_dist; //---- perform traceback d = best_dist; seq1_index = seq1_align_len; seq2_index = seq2_align_len; // for all positive distances //if (fence_hit && *fence_hit) // goto done; if (index==len1 || index==len2) d=0; while (d > 0) { int new_diag; int new_seq2_index; /* retrieve the value of the diagonal after the next traceback operation. best_diag starts off with the value computed during the alignment process */ new_diag = s_GetNextNonAffineTback(last_seq2_off, d, best_diag, &new_seq2_index); if (new_diag == best_diag) { // same diagonal: issue a group of substitutions if (seq2_index - new_seq2_index > 0) { edit_block->opRep(seq2_index - new_seq2_index); } } else if (new_diag < best_diag) { // smaller diagonal: issue a group of substitutions // and then a gap in seq2 */ if (seq2_index - new_seq2_index > 0) { edit_block->opRep(seq2_index - new_seq2_index); } //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1); edit_block->opIns(1); } else { // larger diagonal: issue a group of substitutions // and then a gap in seq1 if (seq2_index - new_seq2_index - 1 > 0) { edit_block->opRep(seq2_index - new_seq2_index - 1); } edit_block->opDel(1); } d--; best_diag = new_diag; seq2_index = new_seq2_index; } //done: // handle the final group of substitutions back to distance zero, // i.e. 
back to offset zero of seq1 and seq2 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, // last_seq2_off[0][diag_origin]); edit_block->opRep(last_seq2_off[0][diag_origin]); if (!reverse) edit_block->reverse(); return best_dist; } void printEditScript(GXEditScript* ed_script) { uint i; if (ed_script==NULL || ed_script->opnum == 0) return; for (i=0; iopnum; i++) { int num=((ed_script->ops[i]) >> 2); unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK ); if (op_type == 3) GError("Error: printEditScript encountered op_type 3 ?!\n"); GMessage("%d%c ", num, xgapcodes[op_type]); } GMessage("\n"); } GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart, bool editscript, int reward, int penalty, int xdrop) { int q_max=strlen(q_seq); //query int s_max=strlen(s_seq); //subj return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript); } struct GXSeedTable { int a_num, b_num; int a_cap, b_cap; char* xc; GXSeedTable(int a=12, int b=255) { a_cap=0; b_cap=0; a_num=0; b_num=0; xc=NULL; init(a,b); } ~GXSeedTable() { GFREE(xc); } void init(int a, int b) { a_num=a; b_num=b; bool resize=false; if (b_num>b_cap) { resize=true; b_cap=b_num;} if (a_num>a_cap) { resize=true; a_cap=a_num;} if (resize) { GFREE(xc); GCALLOC(xc, (a_num*b_num)); } else { //just clear up to a_max, b_max memset((void*)xc, 0, (a_num*b_num)); } } char& x(int ax, int by) { return xc[by*a_num+ax]; } }; const int a_m_score=2; //match score const int a_mis_score=-3; //mismatch const int a_dropoff_score=7; const int a_min_score=12; //at least 6 bases full match // ------------------ adapter matching - simple k-mer seed & extend, no indels for now //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme //check minimum score and //for 3' adapter trimming: // require that the right end of the alignment for either the adapter OR the read must be // < 3 distance from its right 
end // for 5' adapter trimming: // require that the left end of the alignment for either the adapter OR the read must // be at coordinate < 3 from start bool extendUngapped(const char* a, int alen, int ai, const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) { //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen //if (debug) { // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai); // } int a_l=ai; //alignment coordinates on a int a_r=ai+mlen-1; int b_l=bi; //alignment coordinates on b int b_r=bi+mlen-1; int ai_maxscore=ai; int bi_maxscore=bi; int score=mlen*a_m_score; int maxscore=score; int mism5score=a_mis_score; if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end //try to extend to the left first, if possible while (ai>0 && bi>0) { ai--; bi--; score+= (a[ai]==b[bi])? a_m_score : mism5score; if (score>maxscore) { ai_maxscore=ai; bi_maxscore=bi; maxscore=score; } else if (maxscore-score>a_dropoff_score) break; } a_l=ai_maxscore; b_l=bi_maxscore; //now extend to the right ai_maxscore=a_r; bi_maxscore=b_r; ai=a_r; bi=b_r; score=maxscore; //sometimes there are extra As at the end of the read, ignore those if (a[alen-2]=='A' && a[alen-1]=='A') { alen-=2; while (a[alen-1]=='A' && alen>ai) alen--; } while (ai=alen-2) { score+=a_m_score-(alen-ai-1); } } else { //mismatch score+=a_mis_score; } if (score>maxscore) { ai_maxscore=ai; bi_maxscore=bi; maxscore=score; } else if (maxscore-score>a_dropoff_score) break; } a_r=ai_maxscore; b_r=bi_maxscore; int a_ovh3=alen-a_r-1; int b_ovh3=blen-b_r-1; int mmovh3=(a_ovh3=a_min_score && mmovh3<2 && mmovh5<2) { if (a_l& seeds, GXSeqData& sd, GAlnTrimType trim_intent) { int bimin=GMAX(0,(sd.blen-sd.alen-6)); //from collectSeeds_R int bimax=GMIN((sd.alen+2), (sd.blen-6)); int b_start = (trim_intent==galn_TrimRight) ? bimin : 0; int b_end = (trim_intent==galn_TrimLeft) ? 
bimax : sd.blen-6; //gx.init(a_maxlen, b_maxlen); GXSeedTable gx(sd.alen, sd.blen); GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips for (int bi=b_start;bi<=b_end;bi++) { //for each 6-mer of seqb uint16 bv = get6mer((char*) & (sd.bseq[bi])); GVec* alocs = sd.amers[bv]; if (alocs==NULL) continue; //extend each hit for (int h=0;hCount();h++) { int ai=alocs->Get(h); //word match if (gx.x(ai,bi)) //already have a previous seed covering this region of this diagonal continue; if (trim_intent==galn_TrimLeft && sd.blen>sd.alen+6 && bi>ai+6) continue; //improper positioning for 5' trimming if (trim_intent==galn_TrimRight && sd.blen>sd.alen+6 && bisd.amlen) { //heuristics: very likely the best we can get //quick match shortcut diagstrips->qmatch=new GXSeed(ai,bi,len); return diagstrips; } if (bi>bimax && biaddSeed(newseed);//add it to all 3 adjacent diagonals //keep last resort terminal match to be used if no better alignment is there if (bi<2 && ai+len>=sd.alen-1 && (!diagstrips->tmatch_l || diagstrips->tmatch_l->lentmatch_l=newseed; //collectSeeds_R: if (ai<2 && bi+len>sd.blen-2 && (!diagstrips->tmatch_r || diagstrips->tmatch_r->lentmatch_r=newseed; } } //for each 6-mer of the read for (int i=0;iCount();i++) { diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds } diagstrips->setSorted(true); //sort by score return diagstrips; } int cmpSeedScore(const pointer p1, const pointer p2) { //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len); GXSeed* s1=(GXSeed*)p1; GXSeed* s2=(GXSeed*)p2; if (s1->len==s2->len) { return (s1->b_ofs-s2->b_ofs); } else return (s2->len-s1->len); } int cmpSeedScore_R(const pointer p1, const pointer p2) { //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len); GXSeed* s1=(GXSeed*)p1; GXSeed* s2=(GXSeed*)p2; if (s1->len==s2->len) { return (s2->b_ofs-s1->b_ofs); } else return (s2->len-s1->len); } int cmpSeedDiag(const pointer p1, const pointer p2) { GXSeed* s1=(GXSeed*)p1; GXSeed* 
s2=(GXSeed*)p2; return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs)); } int cmpDiagBands_R(const pointer p1, const pointer p2) { //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len); GXBand* b1=(GXBand*)p1; GXBand* b2=(GXBand*)p2; if (b1->score==b2->score) { return (b2->w_min_b-b1->w_min_b); } else return (b2->score-b1->score); } GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max, const char* s_seq, int s_alnstart, int s_max, int reward, int penalty, int xdrop, CGreedyAlignData* gxmem, CAlnTrim* trim, bool editscript) { GXEditScript* ed_script_fwd = NULL; GXEditScript* ed_script_rev = NULL; if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 ) GError("GreedyAlign() Error: invalid anchor coordinate.\n"); q_alnstart--; s_alnstart--; if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0) GError("GreedyAlign() Error: attempt to use an empty sequence string!\n"); /*if (q_seq[q_alnstart]!=s_seq[s_alnstart]) GError("GreedyAlign() Error: improper anchor (mismatch):\n%s (start %d len %d)\n%s (start %d len %d)\n", q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max); */ int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0; const char* q=q_seq+q_alnstart; int q_avail=q_max-q_alnstart; const char* s=s_seq+s_alnstart; int s_avail=s_max-s_alnstart; if (penalty<0) penalty=-penalty; GXAlnInfo* alninfo=NULL; bool freeAlnMem=(gxmem==NULL); if (freeAlnMem) { gxmem=new CGreedyAlignData(reward, penalty, xdrop); reward=gxmem->match_reward; penalty=gxmem->mismatch_penalty; xdrop=gxmem->x_drop; } else gxmem->reset(); int minMatch= trim ? 
trim->minMatch : 6; int MIN_GREEDY_SCORE=minMatch*reward; //minimum score for an alignment to be reported for 0 diffs int retscore = 0; int numdiffs = 0; if (trim!=NULL && trim->type==galn_TrimLeft) { //intent: trimming the left side if (editscript) ed_script_rev=new GXEditScript(); int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev); //check this extension here and bail out if it's not a good extension if (s_ext_l+(trim->seedlen>>1) < trim->safelen && q_alnstart+1-q_ext_l>1 && s_alnstart+1-s_ext_l>trim->l_boundary) { #ifdef TRIMDEBUG GMessage(".. 5' greedy alignment rejected (trim_len=%d, trim_safelen=%d, " "q_delta=%d, s_delta=%d, trim_l_boundary=%d)\n", s_ext_l+(trim->seedlen>>1), trim->safelen, q_alnstart+1-q_ext_l, s_alnstart+1-s_ext_l, trim->l_boundary); #endif delete ed_script_rev; if (freeAlnMem) delete gxmem; return NULL; } if (editscript) ed_script_fwd=new GXEditScript(); int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, //xdrop, reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd); numdiffs=numdiffs_r+numdiffs_l; //convert num diffs to actual score retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty); if (editscript) ed_script_rev->Append(ed_script_fwd); //combine the two extensions } else { if (editscript) { ed_script_fwd=new GXEditScript(); } int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, // xdrop, reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd); //check extension here and bail out if not a good right extension //assuming s_max is really at the right end of s_seq if (trim!=NULL && trim->type==galn_TrimRight && s_ext_r+(trim->seedlen>>1) < trim->safelen && q_alnstart+q_ext_rr_boundary) { delete ed_script_fwd; if (freeAlnMem) delete gxmem; return NULL; } if (editscript) ed_script_rev=new GXEditScript(); int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // 
xdrop, reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev); //convert num diffs to actual score numdiffs=numdiffs_r+numdiffs_l; retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty); if (editscript) ed_script_rev->Append(ed_script_fwd); //combine the two extensions } if (retscore>=MIN_GREEDY_SCORE) { alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l, q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r); int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r); alninfo->score=retscore; if (gxmem->scaled) alninfo->score >>= 1; alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length); #ifdef TRIMDEBUG //if (ed_script_rev) { // GMessage("Final Edit script ::: "); // printEditScript(ed_script_rev); // } #endif alninfo->editscript=ed_script_rev; alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1); } else { #ifdef TRIMDEBUG GMessage(".. greedy extension rejected (score=%d)\n", retscore); #endif //if (freeAlnMem) delete gxmem; delete ed_script_rev; delete alninfo; alninfo=NULL; } if (freeAlnMem) delete gxmem; delete ed_script_fwd; return alninfo; } GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max, const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem, CAlnTrim* trim, bool editscript) { int reward=2; int penalty=10; int xdrop=32; if (gxmem) { reward=gxmem->match_reward; penalty=gxmem->mismatch_penalty; xdrop=gxmem->x_drop; } return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max, reward, penalty, xdrop, gxmem, trim, editscript); } GXAlnInfo* match_adapter(GXSeqData& sd, GAlnTrimType trim_type, int minMatch, CGreedyAlignData* gxmem, double min_pid) { bool editscript=false; #ifdef TRIMDEBUG editscript=true; if (trim_type==galn_TrimLeft) { GMessage("=======> searching left (5') end : %s\n", sd.aseq); } else if (trim_type==galn_TrimRight) { GMessage("=======> searching right(3') end : %s\n", sd.aseq); } else if 
(trim_type==galn_TrimEither) { GMessage("==========> searching both ends : %s\n", sd.aseq); } #endif CAlnTrim trimInfo(trim_type, sd.bseq, sd.blen, sd.alen, minMatch, sd.amlen); GList rseeds(true,true,false); GXBandSet* alnbands=collectSeeds(rseeds, sd, trim_type); GList anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal //did we find a shortcut? if (alnbands->qmatch) { #ifdef TRIMDEBUG GMessage("::: Found a quick long match at %d, len %d\n", alnbands->qmatch->b_ofs, alnbands->qmatch->len); #endif anchor_seeds.Add(alnbands->qmatch); } else { int max_top_bands=5; int top_band_count=0; for (int b=0;bCount();b++) { if (alnbands->Get(b)->score<6) break; //#ifdef TRIMDEBUG //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score); //#endif top_band_count++; GXBand& band=*(alnbands->Get(b)); band.seeds.setSorted(cmpSeedScore); anchor_seeds.Add(band.seeds.First()); //band.tested=true; if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break; } //#ifdef TRIMDEBUG //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count()); //#endif } GList galns(true,true,false); for (int i=0;i>1)+1; int a2=aseed.b_ofs+(aseed.len>>1)+1; trimInfo.seedlen=aseed.len; #ifdef TRIMDEBUG GMessage("\t::: align from seed (%d, %d) of len %d.\n",aseed.a_ofs, aseed.b_ofs, aseed.len); #endif GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen, sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript); #ifdef TRIMDEBUG if (alninfo) { GMessage("\t::: aln pid=%4.2f (vs. 
min_pid=%.2f)\n", alninfo->pid, min_pid); alninfo->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen); } else GMessage("\t::: GreedyAlignRegion failed.\n"); #endif if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo)) galns.AddIfNew(alninfo, true); else delete alninfo; } if (galns.Count()==0) { //last resort: look for weaker terminal seeds GPVec tmatches(2,false); if (trim_type!=galn_TrimRight) { if (alnbands->tmatch_l) tmatches.Add(alnbands->tmatch_l); } if (trim_type!=galn_TrimLeft) { if (alnbands->tmatch_r) tmatches.Add(alnbands->tmatch_r); } for (int i=0;i>1; int a1=aseed.a_ofs+halfseed+1; int a2=aseed.b_ofs+halfseed+1; trimInfo.seedlen=aseed.len; #ifdef TRIMDEBUG GMessage("\t::: align from terminal seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs, aseed.len); #endif GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen, sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript); if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo)) galns.AddIfNew(alninfo, true); else delete alninfo; }//for each terminal seed } //---- found all alignments delete alnbands; #ifdef TRIMDEBUG //print all valid alignments found for (int i=0;iql, alninfo->qr, alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid); if (alninfo->gapinfo!=NULL) { GMessage("Alignment:\n"); alninfo->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen); } } #endif if (galns.Count()) { GXAlnInfo* bestaln=galns.Shift(); #ifdef TRIMDEBUG GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr, bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid); if (bestaln->gapinfo!=NULL) { bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen); } #endif return bestaln; } else return NULL; } gclib-0.12.7/GAlnExtend.h000066400000000000000000000614411407072766100150430ustar00rootroot00000000000000#ifndef _GALIGNEXTEND_H //greedy gapped alignment extension //(mostly lifted from NCBI's blast 
gapped extension code) #include "GBase.h" #include "GList.hh" #include "gdna.h" enum { gxEDIT_OP_MASK = 0x3, gxEDIT_OP_ERR = 0x0, gxEDIT_OP_INS = 0x1, gxEDIT_OP_DEL = 0x2, gxEDIT_OP_REP = 0x3 }; #define GX_EDITOP_VAL(op) ((op) >> 2) #define GX_EDITOP_GET(op) ((op) & gxEDIT_OP_MASK) #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & gxEDIT_OP_MASK)) enum {c_black=0, c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white }; void color_fg(int c, FILE* f=stderr); void color_bg(int c, FILE* f=stderr); void color_resetfg(FILE* f=stderr); void color_resetbg(FILE* f=stderr); void color_reset(FILE* f=stderr); void color_normal(FILE* f=stderr); struct GXEditScript{ uint32 *ops; // array of edit operations uint32 opsize, opnum; // size of allocation, number in use uint32 oplast; // most recent operation added //methods GXEditScript() { init(); } ~GXEditScript() { GFREE(ops); } void init() { ops = NULL; opsize = 0; opnum = 0; oplast = 0; getReady(8); } int getReady(uint32 n) { uint32 m = n + n/2; if (opsize <= n) { GREALLOC(ops, m*sizeof(uint32)); opsize = m; } return 1; } int getReady2(uint32 n) { if (opsize - opnum <= n) return getReady(n + opnum); return 1; } int Put(uint32 op, uint32 n) { if (!getReady2(2)) return 0; oplast = op; ops[opnum] = GX_EDITOP_CONS(op, n); opnum += 1; ops[opnum] = 0; // sentinel return 1; } uint32* First() { return opnum > 0 ? & ops[0] : NULL; } uint32* Next(uint32 *op) { // assumes flat address space ! 
if (&ops[0] <= op && op < &ops[opnum-1]) return op+1; else return 0; } int More(uint32 op, uint32 k) { if (op == gxEDIT_OP_ERR) { GError("GXEditScript::opMore: bad opcode %d:%d", op, k); return -1; } if (GX_EDITOP_GET(oplast) == op) { uint32 l=ops[opnum-1]; ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)), (GX_EDITOP_VAL(l) + k)); } else { Put(op, k); } return 0; } GXEditScript* Append(GXEditScript *et) { uint32 *op; for (op = et->First(); op; op = et->Next(op)) More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op)); return this; } int opDel(uint32 k) { return More(gxEDIT_OP_DEL, k); } int opIns(uint32 k) { return More(gxEDIT_OP_INS, k); } int opRep(uint32 k) { return More(gxEDIT_OP_REP, k); } GXEditScript *reverse() { const uint32 mid = opnum/2; const uint32 end = opnum-1; for (uint32 i = 0; i < mid; ++i) { const uint32 t = ops[i]; ops[i] = ops[end-i]; ops[end-i] = t; } return this; } }; /** Bookkeeping structure for greedy alignment. When aligning two sequences, the members of this structure store the largest offset into the second sequence that leads to a high-scoring alignment for a given start point */ struct SGreedyOffset { int insert_off; // Best offset for a path ending in an insertion int match_off; // Best offset for a path ending in a match int delete_off; // Best offset for a path ending in a deletion }; // ----- pool allocator ----- // works as a linked list of allocated memory blocks struct GXMemPool { SGreedyOffset* memblock; int used, size; GXMemPool *next; static int kMinSpace; //methods GXMemPool(int num_offsp=0) { //by default allocate a large block here (10M) num_offsp=GMAX(kMinSpace, num_offsp); GMALLOC(memblock, num_offsp*sizeof(SGreedyOffset)); if (memblock == NULL) { GError("Failed to allocated GXMemPool(%d) for greedy extension!\n",num_offsp); return; } used = 0; size = num_offsp; next = NULL; } void refresh() { GXMemPool* sp=this; while (sp) { sp->used = 0; sp = sp->next; } } ~GXMemPool() { GXMemPool* next_sp; GXMemPool* sp=this->next; while (sp) { 
next_sp = sp->next; GFREE(sp->memblock); delete sp; sp = next_sp; } GFREE(memblock); } SGreedyOffset* getSpace(int num_alloc) { // SGreedyOffset[num_alloc] array //can use the first found memory block with enough room, // or allocate a new large block SGreedyOffset* v; if (num_alloc < 0) return NULL; GXMemPool* S=this; while (used+num_alloc > S->size) { //no room in current block, get a new mem block if (next == NULL) { next=new GXMemPool(num_alloc); //allocates a large contiguous memory block } S = S->next; } v = S->memblock+S->used; S->used += num_alloc; //align to first 8-byte boundary int m8 = S->used & 7; //modulo 8 if (m8) S->used += 8 - m8; return v; } void* getByteSpace(int byte_size) { //amount to use or allocate memory, in bytes return (void*)getSpace(byte_size/sizeof(SGreedyOffset)); } }; #define GREEDY_MAX_COST_FRACTION 8 /* (was 2) sequence_length / (this number) is a measure of how hard the alignment code will work to find the optimal alignment; in fact this gives a worst case bound on the number of loop iterations */ #define GREEDY_MAX_COST 1000 // The largest diff distance (max indels+mismatches) to be examined for an optimal alignment // (should be increased for large sequences) #define GX_GALLOC_ERROR "Error: failed to allocate memory for greedy alignment!\n" // all auxiliary memory needed for the greedy extension algorithm class CGreedyAlignData { int d_diff; int max_d; public: int** last_seq2_off; // 2-D array of distances int* max_score; // array of maximum scores GXMemPool* space; // local memory pool for SGreedyOffset structs // bool scaled; //scores are all x2 int match_reward; int mismatch_penalty; int x_drop; int xdrop_ofs; // Allocate memory for the greedy gapped alignment algorithm CGreedyAlignData(int reward, int penalty, int xdrop) { scaled=false; xdrop_ofs = 0; //int max_d, diff_d; if (penalty<0) penalty=-penalty; if (reward % 2) { //scale params up scaled=true; match_reward = reward << 1; mismatch_penalty = (penalty << 1); x_drop = 
xdrop<<1; } else { match_reward=reward; mismatch_penalty = penalty; x_drop=xdrop; } xdrop_ofs=(x_drop + (match_reward>>1)) / (match_reward + mismatch_penalty) + 1; //if (gap_open == 0 && gap_extend == 0) // gap_extend = (reward >> 1) + penalty; const int max_dbseq_length=255; //adjust this accordingly max_d = GMIN(GREEDY_MAX_COST, (max_dbseq_length/GREEDY_MAX_COST_FRACTION + 1)); last_seq2_off=NULL; // 2-D array of distances max_score=NULL; // array of maximum scores space=NULL; // local memory pool for SGreedyOffset structs //if (score_params.gap_open==0 && score_params.gap_extend==0) { //non-affine, simpler Greedy algorithm d_diff = (x_drop+match_reward/2)/(mismatch_penalty+match_reward)+1; GMALLOC(last_seq2_off, ((max_d + 2) * sizeof(int*))); if (!last_seq2_off) GError(GX_GALLOC_ERROR); GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2)); //allocates contiguous memory for 2 rows here if (!last_seq2_off[0]) GError(GX_GALLOC_ERROR); last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; //memory allocated already for this row GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff))); space = new GXMemPool(); if (!max_score || !space) GError(GX_GALLOC_ERROR); } //consructor void reset() { space->refresh(); if (last_seq2_off) { GFREE((last_seq2_off[0])); } GFREE(max_score); GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2)); if (!last_seq2_off[0]) GError(GX_GALLOC_ERROR); //allocates contiguous memory for 2 rows here last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff))); if (!max_score) GError(GX_GALLOC_ERROR); } ~CGreedyAlignData() { if (last_seq2_off) { GFREE(last_seq2_off[0]); GFREE(last_seq2_off); } GFREE(max_score); delete space; } }; #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/ #define GAPALIGN_INS ((unsigned char)1) #define GAPALIGN_DEL ((unsigned char)2) #define GAPALIGN_DECLINE ((unsigned char)3) struct GapXEditScript { unsigned char 
// A gap within one of the aligned sequences: where it starts and how many
// positions it spans.
class CSeqGap {
 public:
   int offset; // position of the gap in the sequence
   int len;    // number of gap positions
   // Defaults describe a single-position gap at offset 0.
   CSeqGap(int gofs=0, int glen=1):offset(gofs), len(glen) { }
};
a_ofs seeds; //sorted by x coordinate (b_ofs) int score; //sum of seed scores (- overlapping_bases/2 - gaps) bool tested; GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) { diag=start_diag; min_a=MAX_INT; min_b=MAX_INT; max_a=0; max_b=0; score=0; avg_len=0; w_min_b=0; tested=false; if (seed!=NULL) addSeed(seed); } void addSeed(GXSeed* seed) { seeds.Add(seed); score+=seed->len; avg_len+=seed->len; w_min_b+=seed->b_ofs * seed->len; //if (diag<0) diag=seed->diag; //should NOT be done like this if (seed->a_ofs < min_a) min_a=seed->a_ofs; if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len; if (seed->b_ofs < min_b) min_b=seed->b_ofs; if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len; } void finalize() { //!! to be called only AFTER all seeds have been added // seeds are sorted by b_ofs //penalize seed gaps and overlaps on b sequence if (avg_len==0) return; w_min_b/=avg_len; avg_len>>=1; for (int i=1;imax_gap) Gswap(max_gap, min_gap); int _penalty=0; if (min_gap<0) { //overlap if (max_gap>0) { _penalty=GMAX((-min_gap), max_gap); } else _penalty=-min_gap; } else { //gap _penalty=max_gap; } score-=(_penalty>>1); //score-=_penalty; }//for each seed } //bands will be sorted by decreasing score eventually, after all seeds are added //more seeds better than one longer seed? bool operator<(GXBand& d){ //return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score); return ((score==d.score) ? 
w_min_bd.score); } bool operator==(GXBand& d){ //return (score==d.score && seeds.Count()==d.seeds.Count()); return (score==d.score && w_min_b==d.w_min_b); } }; class GXBandSet:public GList { public: GXSeed* qmatch; //long match (mismatches allowed) if a very good match was extended well GXSeed* tmatch_r; //terminal match to be used if there is no better alignment GXSeed* tmatch_l; //terminal match to be used if there is no better alignment int idxoffset; //global anti-diagonal->index offset (a_len-1) //used to convert a diagonal to an index //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1 //hence offset is a_len-1 GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs) return Get(diag+idxoffset); } GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs) return Get(b_ofs-a_ofs+idxoffset); } GXBandSet(int a_len, int b_len):GList(a_len+b_len-1, false, true, false) { idxoffset=a_len-1; qmatch=NULL; tmatch_l=NULL; //terminal match to be used if everything else fails tmatch_r=NULL; //diag will range from -a_len+1 to b_len-1, so after adjustment //by idxoffset we get a max of a_len+b_len-2 int bcount=a_len+b_len-1; for (int i=0;iAdd(new GXBand(i-idxoffset)); //unsorted, this should set fList[i] } ~GXBandSet() { delete qmatch; } void addSeed(GXSeed* seed) { //MUST be unsorted !!! int idx=(seed->b_ofs-seed->a_ofs)+idxoffset; fList[idx]->addSeed(seed); if (idx>0) fList[idx-1]->addSeed(seed); if (idxaddSeed(seed); } }; inline int calc_safelen(int alen) { int r=iround(double(alen*0.6)); return (r<22)? 
22 : r; } struct GXSeqData { const char* aseq; int alen; const char* bseq; int blen; GVec** amers; int amlen; //minimum alignment length that's sufficient to //trigger the quick extension heuristics GXSeqData(const char* sa=NULL, int la=0, const char* sb=NULL, int lb=0, GVec* mers[]=NULL):aseq(sa), alen(la), bseq(sb), blen(lb), amers(mers), amlen(0) { calc_amlen(); calc_bmlen(); } void calc_amlen() { if (alen) { int ah=calc_safelen(alen); if (amlen>ah) amlen=ah; } } void calc_bmlen() { if (blen) { int bh = iround(double(blen)*0.6); if (bh<22) bh=22; if (amlen>bh) amlen=bh; } } void update(const char* sa, int la, GVec** mers, const char* sb, int lb, int mlen=0) { aseq=sa; alen=la; amers=mers; if (mlen) { amlen=mlen; } else calc_amlen(); if (sb==bseq && blen==lb) return; bseq=sb; blen=lb; calc_bmlen(); } /* void update_b(const char* sb, int lb) { if (sb==bseq && blen==lb) return; bseq=sb; blen=lb; calc_bmlen(); }*/ }; uint16 get6mer(char* p); void table6mers(const char* s, int slen, GVec* amers[]); void printEditScript(GXEditScript* ed_script); int GXGreedyExtend(const char* seq1, int len1, const char* seq2, int len2, bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost, int& seq1_align_len, int& seq2_align_len, CGreedyAlignData& aux_data, GXEditScript *edit_block); enum GAlnTrimType { //Describes trimming intent galn_None=0, //no trimming, just alignment galn_TrimLeft, galn_TrimRight, galn_TrimEither //adapter should be trimmed from either end }; struct CAlnTrim { GAlnTrimType type; int minMatch; //minimum terminal exact match that will be removed from ends int l_boundary; //base index (either left or right) excluding terminal poly-A stretches int r_boundary; //base index (either left or right) excluding terminal poly-A stretches int alen; //query/adapter seq length (for validate()) int safelen; //alignment length > amlen should be automatically validated int seedlen; void prepare(const char* s, int s_len) { //type=trim_type; //amlen=smlen; 
l_boundary=0; r_boundary=0; //if (type==galn_TrimLeft) { int s_lbound=0; if (s[0]=='A' && s[1]=='A' && s[2]=='A') { s_lbound=3; while (s_lbound0 && s[r]=='A') r--; } else if (s[r-1]=='A' && s[r-2]=='A' && s[r-3]=='A') { r-=4; while (r>0 && s[r]=='A') r--; } r_boundary=r-3; // } } CAlnTrim(GAlnTrimType trim_type, const char* s, int s_len, int a_len, int minEndTrim, int smlen): type(trim_type), minMatch(minEndTrim), l_boundary(0), r_boundary(0), alen(a_len), safelen(smlen) { prepare(s, s_len); } bool validate_R(int sr, int admax, int badj, int adist) { if (adist>admax) return false; return (sr>=r_boundary+badj); } bool validate_L(int sl, int alnlen, int admax, int badj, int alnpid, int adist) { if (adist>admax) return false; //left match should be more stringent (5') if (alnpid<93) { if (alnlen<13 || alnlensr - alninfo->sl + 1; /* #ifdef TRIMDEBUG GMessage("\t::: alnlen=%d, safelen=%d, pid=%4.2f\n", alnlen, safelen, alninfo->pid); #endif */ if (alninfo->pid>90.0 && alnlen>=safelen) { //special case: heavy match, could be in the middle if (alninfo->pid>94) alninfo->strong=true; return true; } int sl=alninfo->sl; int sr=alninfo->sr; sl--;sr--; //boundary is 0-based int badj=0; //default boundary is 3 bases distance to end int admax=1; if (alnlen>20) ++admax; if (alnlen<13) { //stricter boundary check if (alninfo->pid<90) return false; badj=2; if (alnlen<=7) { badj++; admax=0; } } if (type==galn_TrimLeft) { return validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr); } else if (type==galn_TrimRight) { return validate_R(sr, admax, badj, alninfo->ql-1); } else if (type==galn_TrimEither) { return (validate_R(sr, admax, badj, alninfo->ql-1) || validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr)); } return true; /* if (type==galn_TrimRight) { return (sr>=boundary+badj); } else { //left match should be more stringent (5') if (alnpid<93) { if (alnlen<13) return false; admax=0; badj++; } return (sl<=boundary-badj); } */ } }; //GXBandSet* 
collectSeeds_R(GList& seeds, GXSeqData& sd); //for overlap at 3' end of seqb GXBandSet* collectSeeds(GList& seeds, GXSeqData& sd, GAlnTrimType trim_intent); //for overlap at 5' end of seqb //=galn_None // reward MUST be >1 for this function GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max, const char* s_seq, int s_alnstart, int s_max, int reward, int penalty, int xdrop, CGreedyAlignData* gxmem=NULL, CAlnTrim* trim=NULL, bool editscript=false); GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max, const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem, CAlnTrim* trim=NULL, bool editscript=false); GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart, bool editscript=false, int reward=2, int penalty=10, int xdrop=32); GXAlnInfo* match_adapter(GXSeqData& sd, GAlnTrimType trim_type, int minMatch, CGreedyAlignData* gxmem=NULL, double min_pid=90); //GXAlnInfo* match_RightEnd(GXSeqData& sd, CGreedyAlignData* gxmem=NULL, int min_pid=90); #endif gclib-0.12.7/GArgs.cpp000066400000000000000000000255321407072766100144110ustar00rootroot00000000000000#include "GBase.h" #include "GArgs.h" #include GArgs::GArgs(int argc, char* argv[], const char* format, bool nodigitopts) { /* format can be: {;|=} e.g. disable-test;PID=S= for --disable-test PID=50 (or --PID 50) S=3.5 etc. [:] e.g. 
p:hT for -p testing (or -ptesting) -h -T */ const char* fstr=format; fmtcount=0; count=0; nonOptCount=0; nonOptPos=0; optPos=0; errarg=0; err_valmissing=false; args=NULL; fmt=NULL; _argc=argc; _argv=argv; int fmtlen=strlen(format); //---- first parse the format string while (fstr-format < fmtlen ) { int l=strcspn(fstr, ";=:"); if (fstr[l]==0) { //end of string reached //all previous chars are just switches: GREALLOC(fmt, (fmtcount+l)*sizeof(fmtdef)); //store each switch for (int i=0; i1 && p[0]=='\'' && p[alen-1]=='\'') { p++; p[alen-2 ]='\0'; } } int GArgs::parseArgs(bool nodigitopts) { int p=1; //skip program name int f=0; while (p<_argc) { // silly patch for annnoying MacOS gdb/eclipse issues: #if defined(__APPLE__) && defined(DEBUG) dbg_dequote(_argv[p]); #endif //-- if (_argv[p][0]=='-' && (_argv[p][1]==0 || _argv[p][1]!='-')) { //single-dash argument int cpos=1; char c=_argv[p][cpos]; if (c==0 || (nodigitopts && isdigit(c)) || (c=='.' && isdigit(_argv[p][cpos+1]))) { //special case: plain argument '-' or just a negative number GREALLOC(args, (count+1)*sizeof(argdata)); args[count].opt=NULL; args[count].fmti=-1; if (c==0) { GCALLOC(args[count].value, 2); args[count].value[0]='-'; } else { //negative number given args[count].value=Gstrdup(_argv[p]); } count++; nonOptCount++; } else { //single-dash argument or switch COLLAPSED: if ((f=validShortOpt(c))>=0) { GREALLOC(args, (count+1)*sizeof(argdata)); GCALLOC(args[count].opt, 2); args[count].opt[0]=c; args[count].fmti=f; if (!fmt[f].req_value) {//switch type GCALLOC(args[count].value,1);//so getOpt() functions would not return NULL count++; // only switches can be grouped with some other switches or options if (_argv[p][cpos+1]!='\0') { cpos++; c=_argv[p][cpos]; goto COLLAPSED; } } else { //single-dash argument followed by a value if (_argv[p][cpos+1]=='\0') { if (p+1<_argc && _argv[p+1][0]!=0) { //value is the whole next argument p++; #if defined(__APPLE__) && defined(DEBUG) dbg_dequote(_argv[p]); #endif 
args[count].value=Gstrdup(_argv[p]); } else { errarg=p; err_valmissing=true; return errarg; } } else { //value immediately follows the dash-option args[count].value=Gstrdup(_argv[p]+cpos+1); } count++; } } //was validShortOpt else { //option not found in format definition! errarg=p; return errarg; } } } //-single-dash else {//not a single-dash argument char* ap=_argv[p]; bool is_longopt=false; if (*ap=='-' && ap[1]=='-') { //double-dash option is_longopt=true; ap+=2; } char* e=strchr(ap+1,'='); while (e!=NULL && *(e-1)=='\\') e=strchr(e,'='); if (e==NULL && is_longopt) { e=ap; while (*e!=0 && *e!=' ') e++; //e will be on eos or next space } if (e!=NULL && e>ap) { //this must be a long option //e is on eos, space or '=' if ((f=validLongOpt(ap,e-1))>=0) { GREALLOC(args, (count+1)*sizeof(argdata)); args[count].opt=Gstrdup(ap,e-1); args[count].fmti=f; if (fmt[f].req_value) { if (*e==0) { //value is the next argument if (p+1<_argc && _argv[p+1][0]!=0) { p++; #if defined(__APPLE__) && defined(DEBUG) dbg_dequote(_argv[p]); #endif args[count].value=Gstrdup(_argv[p]); } else { errarg=p; err_valmissing=true; return errarg; } } else { //value is in the same argument //while (*e!=0 && (*e==' ' || *e=='=')) e++; if (*e=='=') e++; if (*e==0) { errarg=p; err_valmissing=true; return errarg; } args[count].value=Gstrdup(e); } } //value required else { //no value expected GCALLOC(args[count].value,1); //do not return NULL } count++; } else { //error - this long argument not recognized errarg=p; return errarg; } } else { //just a plain non-option argument if (e==ap) { //i.e. 
just "--" errarg=p; return errarg; } GREALLOC(args, (count+1)*sizeof(argdata)); args[count].opt=NULL; //it's not an option args[count].value=Gstrdup(_argv[p]); args[count].fmti=-1; count++; nonOptCount++; } } p++;//check next arg string } //while arguments return errarg; } void GArgs::printError(FILE* fout, const char* usage, bool exitProgram) { if (errarg==0) return; if (usage) fprintf(fout, "%s\n", usage); if (err_valmissing) fprintf(fout, "Error: value required for option '%s'\n", _argv[errarg]); else fprintf(fout, "Error: invalid argument '%s'\n", _argv[errarg]); if (exitProgram) exit(1); } void GArgs::printError(const char* usage, bool exitProgram) { printError(stderr, usage, exitProgram); } void GArgs::printCmdLine(FILE* fout) { if (_argv==NULL) return; for (int i=0;i<_argc;i++) { fprintf(fout, "%s%c", _argv[i], (i==_argc-1)?'\n':' '); } } GArgs::GArgs(int argc, char* argv[], const GArgsDef fmtrecs[], bool nodigitopts) { fmtcount=0; count=0; nonOptCount=0; nonOptPos=0; optPos=0; errarg=0; err_valmissing=false; args=NULL; fmt=NULL; _argc=argc; _argv=argv; if (fmtrecs==NULL) return; const GArgsDef* frec=fmtrecs; while ((frec->longopt || frec->opt) && fmtcount<255) { fmtcount++; frec=&(fmtrecs[fmtcount]); } GCALLOC(fmt, fmtcount*sizeof(fmtdef)); for (int i=0;i=0 && fmt[args[i].fmti].code==c) return args[i].value; return NULL; } char* GArgs::getOptName(int c) { for (int i=0; i=0 && fmt[args[i].fmti].code==c) return args[i].opt; return NULL; } int GArgs::startNonOpt(){ //reset iteration through non-option arguments //returns the number of non-option arguments nonOptPos=0; return nonOptCount; } char* GArgs::nextNonOpt() { //get the next non-dashed argument //or NULL if no more for (int i=nonOptPos;i=0) { optPos=i+1; return fmt[args[i].fmti].code; } return 0; //must make sure that codes are > 0 for this to work properly } gclib-0.12.7/GArgs.h000066400000000000000000000076641407072766100140640ustar00rootroot00000000000000/* GArgs is a quick'n'dirty object oriented 
// One option definition record for the
// GArgs(argc, argv, const GArgsDef fmtrecs[], ...) constructor.
// That constructor reads records until it meets one where both longopt and
// opt are unset (or until 255 records have been read), so the array must end
// with such an all-zero record.
struct GArgsDef {
  const char* longopt; //long option name, or NULL if the option has none
  char opt; //equivalent one-char option, if any
  bool req_value; //true if the string that follows must be a value
  int code; //an enum code to be associated with this option
            //(GArgs::nextCode() returns 0 for "no more options", so codes
            // should be > 0)
};
p:hT for -p testing (or -ptesting) -h -T This means that the long options, if present, should be given at the beginning of the format string, before the single-dash, single-char options */ GArgs(int argc, char* argv[], const GArgsDef fmtrecs[], bool nodigitopts=false); ~GArgs(); int isError(); // returns the offending argv position or 0 if no error int getCount() { return count; } //total number of arguments given int getFmtCount() { return fmtcount; } //total number of option definitions int getNonOptCount() { return nonOptCount; } //total number of non-option arguments char* getOpt(const char* o); /* retrieve the value for option o returns NULL if option not given at all !=NULL if boolean option was given opt's value if value option was given */ char* getOpt(const char o); char* getOpt(int c); //retrieve value by enum code char* getOptName(int c); //retrieve name of by enum code int startOpt(); //init iteration through option arguments // returns number of option args char* nextOpt(); //get next option argument's string int nextCode(); //get next option argument's code int startNonOpt(void); //init iteration through non-option arguments // returns the number of non-option arguments void printError(FILE* fout, const char* usage=NULL, bool exitProgram=false); void printError(const char* usage=NULL, bool exitProgram=false); void printCmdLine(FILE* fout); char* nextNonOpt(); //get the next non-option argument }; #endif gclib-0.12.7/GBam.cpp000066400000000000000000000376461407072766100142250ustar00rootroot00000000000000#include "GBam.h" #include #include "kstring.h" #define _cigOp(c) ((c)&BAM_CIGAR_MASK) #define _cigLen(c) ((c)>>BAM_CIGAR_SHIFT) //auxiliary functions for low level BAM record creation uint8_t* realloc_bdata(bam1_t *b, int size) { if (b->m_data < size) { b->m_data = size; kroundup32(b->m_data); b->data = (uint8_t*)realloc(b->data, b->m_data); } if (b->data_lendata_len=size; return b->data; } uint8_t* dupalloc_bdata(bam1_t *b, int size) { //same as 
realloc_bdata, but does not free previous data //but returns it instead //it ALWAYS duplicates data b->m_data = size; kroundup32(b->m_data); uint8_t* odata=b->data; b->data = (uint8_t*)malloc(b->m_data); memcpy((void*)b->data, (void*)odata, b->data_len); b->data_len=size; return odata; //user must FREE this after } GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid, int pos, bool reverse, const char* qseq, const char* cigar, const char* quals):iflags(0), exons(1), clipL(0), clipR(0), mapped_len(0), uval(0) { novel=true; bam_header=NULL; b=bam_init1(); if (pos<=0 || gseq_tid<0) { b->core.pos=-1; //unmapped b->core.flag |= BAM_FUNMAP; gseq_tid=-1; } else b->core.pos=pos-1; //BAM is 0-based b->core.tid=gseq_tid; b->core.qual=255; b->core.mtid=-1; b->core.mpos=-1; int l_qseq=strlen(qseq); //this may not be accurate, setting CIGAR is the correct way //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1); b->core.l_qname=strlen(qname)+1; //includes the \0 at the end memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); set_cigar(cigar); //this will also set core.bin add_sequence(qseq, l_qseq); add_quals(quals); //quals must be given as Phred33 if (reverse) { b->core.flag |= BAM_FREVERSE ; } } GBamRecord::GBamRecord(const char* qname, int32_t samflags, int32_t g_tid, int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos, int insert_size, const char* qseq, const char* quals, GVec* aux_strings):iflags(0), exons(1), uval(0) { novel=true; bam_header=NULL; b=bam_init1(); b->core.tid=g_tid; b->core.pos = (pos<=0) ? 
-1 : pos-1; //BAM is 0-based b->core.qual=map_qual; int l_qseq=strlen(qseq); b->core.l_qname=strlen(qname)+1; //includes the \0 at the end memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); set_cigar(cigar); //this will also set core.bin add_sequence(qseq, l_qseq); add_quals(quals); //quals must be given as Phred33 set_flags(samflags); set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size); if (aux_strings!=NULL) { for (int i=0;iCount();i++) { add_aux(aux_strings->Get(i)); } } } void GBamRecord::set_cigar(const char* cigar) { //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call int doff=b->core.l_qname; uint8_t* after_cigar=NULL; int after_cigar_len=0; uint8_t* prev_bdata=NULL; if (b->data_len>doff) { //cigar string already allocated, replace it int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data after_cigar=b->data+d; after_cigar_len=b->data_len-d; } const char *s; char *t; int i, op; long x; b->core.n_cigar = 0; if (cigar != NULL && strcmp(cigar, "*") != 0) { for (s = cigar; *s; ++s) { if (isalpha(*s)) b->core.n_cigar++; else if (!isdigit(*s)) { GError("Error: invalid CIGAR character (%s)\n",cigar); } } if (after_cigar_len>0) { //replace/insert into existing full data prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len); memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len); free(prev_bdata); } else { realloc_bdata(b, doff + b->core.n_cigar * 4); } for (i = 0, s = cigar; i != b->core.n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; else if (op == 'I') op = BAM_CINS; else if (op == 'D') op = BAM_CDEL; else if (op == 'N') op = BAM_CREF_SKIP; //has_Introns=true; else if (op == 'S') op = BAM_CSOFT_CLIP; //soft_Clipped=true; else if (op == 'H') op = BAM_CHARD_CLIP; //hard_Clipped=true; else if (op == 'P') op = BAM_CPAD; else GError("Error: invalid CIGAR operation 
(%s)\n",cigar); s = t + 1; bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; } if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar); b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b))); } else {//no CIGAR string given if (!(b->core.flag&BAM_FUNMAP)) { GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data); b->core.flag |= BAM_FUNMAP; } b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1); } setupCoordinates(); } //set_cigar() void GBamRecord::add_sequence(const char* qseq, int slen) { //must be called AFTER set_cigar (cannot replace existing sequence for now) if (qseq==NULL) return; //should we ever care about this? if (slen<0) slen=strlen(qseq); int doff = b->core.l_qname + b->core.n_cigar * 4; if (strcmp(qseq, "*")!=0) { b->core.l_qseq=slen; if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b))) GError("Error: CIGAR and sequence length are inconsistent!(%s)\n", qseq); uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff; //also allocated quals memory memset(p, 0, (b->core.l_qseq+1)/2); for (int i = 0; i < b->core.l_qseq; ++i) p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2); } else b->core.l_qseq = 0; } void GBamRecord::add_quals(const char* quals) { //requires core.l_qseq already set //and must be called AFTER add_sequence(), which also allocates the memory for quals uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2); if (quals==NULL || strcmp(quals, "*") == 0) { for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff; return; } for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33; } void GBamRecord::add_aux(const char* str) { //requires: being called AFTER add_quals() int strl=strlen(str); //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux; //int doff0=doff; if (strl < 6 || str[2] != ':' || str[4] != ':') parse_error("missing colon in auxiliary data"); 
tag[0] = str[0]; tag[1] = str[1]; uint8_t atype = str[3]; uint8_t* adata=abuf; int alen=0; if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility atype='A'; alen=1; adata=(uint8_t*)&str[5]; } else if (atype == 'I' || atype == 'i') { long long x=strtoll(str+5, NULL, 10); //(long long)atoll(str + 5); //long x=(long)atol(str + 5); if (x < 0) { if (x >= -127) { atype='c'; abuf[0] = (int8_t)x; alen=1; } else if (x >= -32767) { atype = 's'; *(int16_t*)abuf = (int16_t)x; alen=2; } else { atype='i'; *(int32_t*)abuf = (int32_t)x; alen=4; if (x < -2147483648ll) GMessage("Parse warning: integer %lld is out of range.", x); } } else { //x >=0 if (x <= 255) { atype = 'C'; abuf[0] = (uint8_t)x; alen=1; } else if (x <= 65535) { atype='S'; *(uint16_t*)abuf = (uint16_t)x; alen=2; } else { atype='I'; *(uint32_t*)abuf = (uint32_t)x; alen=4; if (x > 4294967295ll) GMessage("Parse warning: integer %lld is out of range.", x); } } } //integer type else if (atype == 'f') { *(float*)abuf = (float)atof(str + 5); alen = sizeof(float); } else if (atype == 'd') { //? 
*(float*)abuf = (float)atof(str + 9); alen=8; } else if (atype == 'Z' || atype == 'H') { if (atype == 'H') { // check whether the hex string is valid if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even"); for (int i = 0; i < strl - 5; ++i) { int c = toupper(str[5 + i]); if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) parse_error("invalid hex character"); } } memcpy(abuf, str + 5, strl - 5); abuf[strl-5] = 0; alen=strl-4; } else parse_error("unrecognized aux type"); this->add_aux(tag, atype, alen, adata); }//add_aux() int interpret_CIGAR(char cop, int cl, int aln_start) { // returns the number of bases "aligned" (matches or mismatches) from the read // gpos = current genomic position (will end up as right coordinate on the genome) // rpos = read position (will end up as the length of the read) // cop = CIGAR operation, cl = operation length int mbases = 0; //count "aligned" bases (includes mismatches) int rpos = 0; int gpos = aln_start; int num_mismatches=0; //NM tag value = edit distance switch (cop) { case BAM_CDIFF: // X num_mismatches+=cl; case BAM_CMATCH: // M //have to actually check for mismatches: num_mismatches+=count_mismatches; case BAM_CEQUAL: // = //printf("[%d-%d]", gpos, gpos + cl - 1); gpos+=cl; rpos+=cl; ++mbases; break; case BAM_CPAD: // printf("[%d-%d]", pos, pos + cl - 1); // Spans positions, No Coverage gpos+=cl; break; case BAM_CHARD_CLIP: // printf("[%d]", pos); // No coverage // gpos is not advanced by this operation break; case BAM_CSOFT_CLIP: // S //soft clipped bases, present in SEQ rpos+=cl; break; case BAM_CINS: // I // No Coverage // adds cl bases "throughput" but not genomic position "coverage" (gap in genomic seq) // should also add cl to the number of "mismatches" (unaligned read bases) num_mismatches+=cl; // How you handle this is application dependent // gpos is not advanced by this operation rpos+=cl; break; case BAM_CDEL: // D //deletion in reference sequence relative to the read (gap in read 
// Walks the record's CIGAR once and derives from it:
//   start, end       - 1-based genomic coordinates of the alignment
//   exons            - one segment per run between N (intron) operations
//   mapped_len       - sum of the lengths of those segments
//   clipL, clipR     - soft-clip lengths before / after the aligned part
//   has_Introns, soft_Clipped, hard_Clipped flags
// Nothing is computed for records flagged as unmapped.
// NOTE(review): segments are appended to exons without clearing it first, so
// this presumably expects exons to be empty on entry - confirm with callers.
void GBamRecord::setupCoordinates() {
   const bam1_core_t *c = &b->core;
   if (c->flag & BAM_FUNMAP) return; /* skip unmapped reads */
   uint32_t *cigar = bam1_cigar(b);
   //uint32_t *p = bam1_cigar(b);
   //--- prevent alignment error here (reported by UB-sanitizer):
   //uint32_t *cigar= new uint32_t[c->n_cigar];
   //memcpy(cigar, p, c->n_cigar * sizeof(uint32_t));
   //--- UBsan protection end
   int l=0; // reference bases consumed so far (M, =, X, D and N operations)
   mapped_len=0;
   clipL=0;
   clipR=0;
   start=c->pos+1; //genomic start coordinate, 1-based (BAM core.pos is 0-based)
   GSeg exon;
   int exstart=c->pos; // 0-based start of the segment currently being built
   bool intron=false; // true while the previous reference-consuming op was N
   bool ins=false;    // true right after an insertion (I)
   for (int i = 0; i < c->n_cigar; ++i) {
        unsigned char op = _cigOp(cigar[i]);
        switch(op) {
          case BAM_CEQUAL:    // =
          case BAM_CDIFF:     // X
          case BAM_CMATCH:    // M
          case BAM_CDEL:      // D
            // all of these advance on the reference and stay within the
            // current segment
            l+=_cigLen(cigar[i]);
            intron=false;
            ins=false;
            break;
          case BAM_CINS:      // I
            //rpos+=cl; //gpos not advanced
            //take care of cases where there is an ins within an intron
            ins=true;
            break;
          case BAM_CREF_SKIP: // N
            //intron starts
            //exon ends here
            // the segment is closed here unless this N directly follows an
            // insertion that itself followed an N (insertion inside an intron)
            if(!ins || !intron) { // insertion in the middle of an intron --> adjust last exon
              exon.start=exstart+1;
              exon.end=c->pos+l;
              exons.Add(exon);
              mapped_len+=exon.len();
            }
            has_Introns=true;
            l += _cigLen(cigar[i]);
            exstart=c->pos+l; // next segment starts after the skipped region
            intron=true;
            break;
          case BAM_CSOFT_CLIP: // S
            soft_Clipped=true;
            // a clip seen after some reference bases were consumed is the
            // right-hand clip, otherwise it is the left-hand one
            if (l) clipR=_cigLen(cigar[i]);
            else clipL=_cigLen(cigar[i]);
            intron=false;
            ins=false;
            break;
          case BAM_CHARD_CLIP:
            hard_Clipped=true;
            intron=false;
            ins=false;
            break;
          case BAM_CPAD:
            //gpos+=cl;
            intron=false;
            ins=false; //?
            break;
          default:
            int cl=_cigLen(cigar[i]);
            fprintf(stderr, "Unhandled CIGAR operation %d:%d\n", op, cl);
        }
   }
   // close the last (or only) segment
   exon.start=exstart+1;
   exon.end=c->pos+l;
   exons.Add(exon);
   mapped_len+=exon.len();
   end=exon.end; //genomic end coordinate
}
c : '.'); } char* GBamRecord::sequence() { //user must free this after use char *s = (char*)bam1_seq(b); char* qseq=NULL; GMALLOC(qseq, (b->core.l_qseq+1)); int i; for (i=0;i<(b->core.l_qseq);i++) { int8_t v = bam1_seqi(s,i); qseq[i] = bam_nt16_rev_table[v]; } qseq[i] = 0; return qseq; } char* GBamRecord::qualities() {//user must free this after use char *qual = (char*)bam1_qual(b); char* qv=NULL; GMALLOC(qv, (b->core.l_qseq+1) ); int i; for(i=0;i<(b->core.l_qseq);i++) { qv[i]=qual[i]+33; } qv[i]=0; return qv; } char* GBamRecord::cigar() { //returns text version of the CIGAR string; must be freed by user kstring_t str; str.l = str.m = 0; str.s = 0; if (b->core.n_cigar == 0) kputc('*', &str); else { for (int i = 0; i < b->core.n_cigar; ++i) { kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); } } return str.s; } gclib-0.12.7/GBam.h000066400000000000000000000337501407072766100136620ustar00rootroot00000000000000#ifndef _G_BAM_H #define _G_BAM_H #include "GBase.h" #include "GList.hh" #include "bam.h" #include "sam.h" class GBamReader; class GBamWriter; class GBamRecord: public GSeg { friend class GBamReader; friend class GBamWriter; bam1_t* b; // b->data has the following strings concatenated: // qname (including the terminal \0) // +cigar (each event encoded on 32 bits) // +seq (4bit-encoded) // +qual // +aux union { uint16_t iflags; struct { bool novel :1; //if set, the destructor must free b bool hard_Clipped :1; bool soft_Clipped :1; bool has_Introns :1; }; }; bam_header_t* bam_header; char tag[2]; uint8_t abuf[512]; public: GVec exons; //coordinates will be 1-based int clipL; //soft clipping data, as seen in the CIGAR string int clipR; int mapped_len; //sum of exon lengths int uval; //user value (e.g. 
file index) bool isHardClipped() { return hard_Clipped; } bool isSoftClipped() { return soft_Clipped; } bool hasIntrons() { return has_Introns; } //created from a reader: void bfree_on_delete(bool b_free=true) { novel=b_free; } GBamRecord(bam1_t* from_b=NULL, bam_header_t* b_header=NULL, bool b_free=true):iflags(0), exons(1), clipL(0), clipR(0), mapped_len(0), uval(0) { bam_header=NULL; if (from_b==NULL) { b=bam_init1(); novel=true; } else { b=from_b; //it'll take over from_b novel=b_free; } bam_header=b_header; setupCoordinates();//set 1-based coordinates (start, end and exons array) } GBamRecord(GBamRecord& r):GSeg(r.start, r.end), iflags(0), exons(r.exons), clipL(r.clipL), clipR(r.clipR), mapped_len(r.mapped_len), uval(0) { //copy constructor //makes a new copy of the bam1_t record etc. clear(); b=bam_dup1(r.b); novel=true; //will also free b when destroyed } const GBamRecord& operator=(GBamRecord& r) { //copy operator //makes a new copy of the bam1_t record etc. clear(); b=bam_dup1(r.b); iflags=r.iflags; novel=true; //will also free b when destroyed start=r.start; end=r.end; exons = r.exons; clipL = r.clipL; clipR = r.clipR; uval = r.uval; mapped_len=r.mapped_len; return *this; } void setupCoordinates(); void clear() { if (novel) { bam_destroy1(b); //novel=false; } b=NULL; exons.Clear(); mapped_len=0; bam_header=NULL; iflags=0; } ~GBamRecord() { clear(); } void parse_error(const char* s) { GError("BAM parsing error: %s\n", s); } bam1_t* get_b() { return b; } void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available int32_t isize=0) { //mate info for current record b->core.mtid=mtid; b->core.mpos=m0pos; // should be -1 if '*' b->core.isize=isize; //should be 0 if not available } void set_flags(uint16_t samflags) { b->core.flag=samflags; } //creates a new record from 1-based alignment coordinate //quals should be given as Phred33 //Warning: pos and mate_pos must be given 1-based! 
GBamRecord(const char* qname, int32_t gseq_tid, int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL); GBamRecord(const char* qname, int32_t samflags, int32_t g_tid, int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos, int insert_size, const char* qseq, const char* quals=NULL, GVec* aux_strings=NULL); //const std::vector* aux_strings=NULL); void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format void add_quals(const char* quals); //quality values string in Phred33 format void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1") void add_aux(const char tag[2], char atype, int len, uint8_t *data) { //IMPORTANT: strings (Z,H) should include the terminal \0 int addz=0; if ((atype=='Z' || atype=='H') && data[len-1]!=0) { addz=1; } int ori_len = b->data_len; b->data_len += 3 + len + addz; b->l_aux += 3 + len + addz; if (b->m_data < b->data_len) { b->m_data = b->data_len; kroundup32(b->m_data); b->data = (uint8_t*)realloc(b->data, b->m_data); } b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; b->data[ori_len + 2] = atype; if (addz) { b->data[ori_len+len+4]=0; } memcpy(b->data + ori_len + 3, data, len); } void add_tag(const char tag[2], char atype, int len, uint8_t *data) { //same with add_aux() add_aux(tag,atype,len,data); } //--query methods: uint32_t flags() { return b->core.flag; } //return SAM flags bool isUnmapped() { return ((b->core.flag & BAM_FUNMAP) != 0); } bool isMapped() { return ((b->core.flag & BAM_FUNMAP) == 0); } bool isPaired() { return ((b->core.flag & BAM_FPAIRED) != 0); } const char* name() { return bam1_qname(b); } int pairOrder() { //which read in the pair: 0 = unpaired, 1=first read, 2=second read int r=0; if ((b->core.flag & BAM_FREAD1) != 0) r=1; else if ((b->core.flag & BAM_FREAD2) != 0) r=2; return r; } bool 
revStrand() { //this is the raw alignment strand, NOT the transcription/splice strand return ((b->core.flag & BAM_FREVERSE) != 0); } const char* refName() { return (bam_header!=NULL) ? ((b->core.tid<0) ? "*" : bam_header->target_name[b->core.tid]) : NULL; } int32_t refId() { return b->core.tid; } int32_t mate_refId() { return b->core.mtid; } const char* mate_refName() { return (bam_header!=NULL) ? ((b->core.mtid<0) ? "*" : bam_header->target_name[b->core.mtid]) : NULL; } int32_t insertSize() { return b->core.isize; } int32_t mate_start() { return b->core.mpos<0? 0 : b->core.mpos+1; } //int find_tag(const char tag[2], uint8_t* & s, char& tag_type); uint8_t* find_tag(const char tag[2]); //position s at the beginning of tag data, tag_type is set to the found tag type //returns length of tag data, or 0 if tag not found char* tag_str(const char tag[2]); //return tag value for tag type 'Z' int tag_int(const char tag[2]); //return numeric value of tag (for numeric types) float tag_float(const char tag[2]); //return float value of tag (for float types) char tag_char(const char tag[2]); //return char value of tag (for type 'A') char tag_char1(const char tag[2]); //return char value of tag (for type 'A') char spliceStrand(); // '+', '-' from the XS tag, or '.' 
if no XS tag char* sequence(); //user should free after use char* qualities();//user should free after use char* cigar(); //returns text version of the CIGAR string; user must free }; // from sam.c: #define FTYPE_BAM 1 #define FTYPE_READ 2 class GBamReader { samfile_t* bam_file; char* fname; // from bam_import.c: struct samtools_tamFile_t { gzFile fp; void *ks; void *str; uint64_t n_lines; int is_first; }; public: void bopen(const char* filename, bool forceBAM=false) { if (strcmp(filename, "-")==0) { //if stdin was given, we assume it's text SAM, unless forceBAM was given if (forceBAM) bam_file=samopen(filename, "rb", 0); else bam_file=samopen(filename, "r", 0); } else { FILE* f=Gfopen(filename); if (f==NULL) { GError("Error opening SAM/BAM file %s!\n", filename); } if (forceBAM) { //directed to open this as a BAM file if (forceBAM) bam_file=samopen(filename, "rb", 0); } else { //try to guess if it's BAM or SAM //BAM files have the zlib signature bytes at the beginning: 1F 8B 08 //if that's not present then we assume text SAM byte fsig[3]; size_t rd=fread(fsig, 1, 3, f); fclose(f); if (rd<3) GError("Error reading from file %s!\n",filename); if ((fsig[0]==0x1F && fsig[1]==0x8B && fsig[2]==0x08) || (fsig[0]=='B' && fsig[1]=='A' && fsig[2]=='M')) { bam_file=samopen(filename, "rb", 0); //BAM or uncompressed BAM } else { //assume text SAM file bam_file=samopen(filename, "r", 0); } } } if (bam_file==NULL) GError("Error: could not open SAM file %s!\n",filename); fname=Gstrdup(filename); } GBamReader(const char* fn, bool forceBAM=false) { bam_file=NULL; fname=NULL; bopen(fn, forceBAM); } bam_header_t* header() { return bam_file? 
bam_file->header : NULL; } void bclose() { if (bam_file) { samclose(bam_file); bam_file=NULL; } } ~GBamReader() { bclose(); GFREE(fname); } int64_t fpos() { if ( bam_file->type & FTYPE_BAM ) return bgzf_tell(bam_file->x.bam); else return (int64_t)gztell(((samtools_tamFile_t*)(bam_file->x.tamr))->fp); } int64_t fseek(int64_t offs) { if ( bam_file->type & FTYPE_BAM ) return bgzf_seek(bam_file->x.bam, offs, SEEK_SET); else return (int64_t)gzseek(((samtools_tamFile_t*)(bam_file->x.tamr))->fp, offs, SEEK_SET); } void rewind() { if (fname==NULL) { GMessage("Warning: GBamReader::rewind() called without a file name.\n"); return; } bclose(); char* ifname=fname; bopen(ifname); GFREE(ifname); } GBamRecord* next() { if (bam_file==NULL) GError("Warning: GBamReader::next() called with no open file.\n"); bam1_t* b = bam_init1(); if (samread(bam_file, b) >= 0) { GBamRecord* bamrec=new GBamRecord(b, bam_file->header, true); return bamrec; } else { bam_destroy1(b); return NULL; } } }; class GBamWriter { samfile_t* bam_file; bam_header_t* bam_header; bool sharedHeader; public: void create(const char* fname, bool uncompressed=false) { if (bam_header==NULL) GError("Error: no bam_header for GBamWriter::create()!\n"); if (uncompressed) { bam_file=samopen(fname, "wbu", bam_header); } else { bam_file=samopen(fname, "wb", bam_header); } if (bam_file==NULL) GError("Error: could not create BAM file %s!\n",fname); } void create(const char* fname, bam_header_t* bh, bool uncompressed=false) { bam_header=bh; create(fname,uncompressed); } GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false):sharedHeader(false) { create(fname, bh, uncompressed); } GBamWriter(bam_header_t* bh, const char* fname, bool uncompressed=false):sharedHeader(true) { create(fname, bh, uncompressed); } GBamWriter(const char* fname, const char* samfname, bool uncompressed=false):sharedHeader(false) { tamFile samf_in=sam_open(samfname); if (samf_in==NULL) GError("Error: could not open SAM file %s\n", 
samfname); bam_header=sam_header_read(samf_in); if (bam_header==NULL) GError("Error: could not read SAM header from %s!\n",samfname); sam_close(samf_in); create(fname, uncompressed); } ~GBamWriter() { samclose(bam_file); if (!sharedHeader) bam_header_destroy(bam_header); } bam_header_t* get_header() { return bam_header; } int32_t get_tid(const char *seq_name) { if (bam_header==NULL) GError("Error: missing SAM header (get_tid())\n"); return bam_get_tid(bam_header, seq_name); } //just a convenience function for creating a new record, but it's NOT written //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based) GBamRecord* new_record(const char* qname, const char* gseqname, int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) { int32_t gseq_tid=get_tid(gseqname); if (gseq_tid < 0 && strcmp(gseqname, "*")) { if (bam_header->n_targets == 0) { GError("Error: missing/invalid SAM header\n"); } else GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n", gseqname); } return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual)); } GBamRecord* new_record(const char* qname, int32_t samflags, const char* gseqname, int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos, int insert_size, const char* qseq, const char* quals=NULL, GVec* aux_strings=NULL) { int32_t gseq_tid=get_tid(gseqname); if (gseq_tid < 0 && strcmp(gseqname, "*")) { if (bam_header->n_targets == 0) { GError("Error: missing/invalid SAM header\n"); } else GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n", gseqname); } int32_t mgseq_tid=-1; if (mgseqname!=NULL) { if (strcmp(mgseqname, "=")==0) { mgseq_tid=gseq_tid; } else { mgseq_tid=get_tid(mgseqname); if (mgseq_tid < 0 && strcmp(mgseqname, "*")) { GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n", mgseqname); } } } return (new GBamRecord(qname, samflags, gseq_tid, pos, map_qual, cigar, 
mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings)); } void write(GBamRecord* brec) { if (brec!=NULL) samwrite(this->bam_file,brec->get_b()); } void write(bam1_t* b) { samwrite(this->bam_file, b); } }; #endif gclib-0.12.7/GBase.cpp000066400000000000000000000641601407072766100143670ustar00rootroot00000000000000#include "GBase.h" #include #include #ifndef S_ISDIR #define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR) #endif #ifndef S_ISREG #define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) #endif //#ifdef _WIN32 // int (WINAPIV * __vsnprintf)(char *, size_t, const char*, va_list) = _vsnprintf; //#endif //************************* Debug helpers ************************** // Assert failed routine void GAssert(const char* expression, const char* filename, unsigned int lineno){ char msg[4096]; sprintf(msg,"%s(%d): ASSERT(%s) failed.\n",filename,lineno,expression); fprintf(stderr,"%s",msg); #ifdef DEBUG // modify here if you [don't] want a core dump abort(); #endif exit(1); } // Error routine (prints error message and exits!) 
void GError(const char* format,...){ #ifdef _WIN32 char msg[4096]; va_list arguments; va_start(arguments,format); _vsnprintf(msg, 4095, format, arguments); vfprintf(stderr, format, arguments); // if a console is available msg[4095]=0; va_end(arguments); OutputDebugString(msg); MessageBox(NULL,msg,NULL,MB_OK|MB_ICONEXCLAMATION|MB_APPLMODAL); #else va_list arguments; va_start(arguments,format); vfprintf(stderr,format,arguments); va_end(arguments); #ifdef DEBUG // comment this if you do NOT want a core dump abort(); #endif #endif exit(1); } // Warning routine (just print message without exiting) void GMessage(const char* format,...){ #ifdef _WIN32 char msg[4096]; va_list arguments; va_start(arguments,format); vfprintf(stderr, format , arguments); // if a console is available _vsnprintf(msg, 4095, format, arguments); msg[4095]=0; va_end(arguments); fflush(stderr); //OutputDebugString(msg); #else va_list arguments; va_start(arguments,format); vfprintf(stderr,format,arguments); va_end(arguments); fflush(stderr); #endif } char* Gstrdup(const char* str, int xtracap) { if (str==NULL) return NULL; char *copy=NULL; GMALLOC(copy, strlen(str)+1+xtracap); strcpy(copy,str); return copy; } char* newEmptyStr() { char* zs=NULL; GMALLOC(zs,1); zs[0]=0; return zs; } char* Gstrdup(const char* sfrom, const char* sto) { if (sfrom==NULL || sto==NULL) return NULL; char *copy=NULL; if (sfrom[0]==0 || sto dirstack(4); // stack of directories that should be created while (psep>gpath && *(psep-1)=='/') --psep; //skip double slashes *psep='\0'; int fexists=0; while ((fexists=fileExists(gpath))==0) { dirstack.Push(psep); do { --psep; } while (psep>gpath && *psep!='/'); if (psep<=gpath) { psep=NULL; break; } while (psep>gpath && *(psep-1)=='/') --psep; *psep='\0'; } if (psep) *psep='/'; while (dirstack.Count()>0) { psep=dirstack.Pop(); int mkdir_err=0; if ((mkdir_err=G_mkdir(gpath, perms))!=0) { GMessage("Warning: mkdir(%s) failed: %s\n", gpath, strerror(errno)); GFREE(gpath); 
umask(process_mask); return -1; } *psep='/'; } GFREE(gpath); umask(process_mask); return 0; } bool haveStdInput() { #ifdef _WIN32 HANDLE hIn = GetStdHandle(STD_INPUT_HANDLE); DWORD stype = GetFileType(hIn); return (stype!=FILE_TYPE_CHAR); #else return !(isatty(fileno(stdin))); #endif } FILE* Gfopen(const char *path, char *mode) { FILE* f=NULL; if (mode==NULL) f=fopen(path, "rb"); else f=fopen(path, mode); if (f==NULL) GMessage("Error opening file '%s': %s\n", path, strerror(errno)); return f; } #define IS_CHR_DELIM(c) ( c == ' ' || c == '\t' || c == ':' ) void GRangeParser::parse(char* s) { //parses general range format: [+\-|.]refID[+\-\.][ |:][start][-|.\s]end[\s:][+\-\.] // if ref ID has ':' a space delimited format is preferred // or just separate the ref ID from the coordinate range: [[+/-]refstart[[-..][end]] //the safest way would be to parse from the end in case the ref ID has ':' characters //if the whole chromosome is intended (no coordinates to speficy) this->start=0; this->end=0; this->strand=0; int slen=strlen(s); if (slen==0) return; while (isspace(s[slen-1])) { slen--; s[slen]=0; } //trim trailing spaces while (isspace(*s)) { ++s; --slen; } //trim prefix spaces char c=*s; if (c=='+' || c=='-') { strand=c; ++s;slen--; } if (strand && (*s==':' || *s==' ')) //ignore { s++;slen--; } char* p=s; //parsing position for coordinate string char* isep=strpbrk(s, " \t"); if (isep==NULL) isep=strchr(s, ':'); if (isep) { //chr (ref) ID ending found p=isep+1; *isep=0; //character after the delimiter can only be a strand if it's followed by another delimiter //e.g. 
chr1 + 134551-204326 or chr1:+:134551-204326 c=*(isep+1); if (strand==0 && (c=='+' || c=='-' || c=='.') && IS_CHR_DELIM(*(isep+2))) { strand=c; p=isep+3; } if (strand==0) { c=*(isep-1); //character before the delimiter could be the strand if (c=='+' || c=='-') { //not '.', sorry isep--; strand=c; *isep=0; //ref is now parsable } } this->refName=Gstrdup(s,isep-1); } //here we are after ref ID (and possibly strand) delimiter char* pend=p; if (isdigit(*pend)) { //parse the start coordinate then do { pend++; } while (isdigit(*pend)); c=*pend; *pend=0; this->start=atoi(p); p=pend; *p=c; } while (*p=='-' || *p=='.' || *p==' ' || *p=='\t') ++p; pend=p; while (isdigit(*pend)) pend++; if (pend>p) { //parse the 2nd coordinate c=*pend; *pend=0; this->end=atoi(p); *pend=c; } if (start && end && endstart, this->end); //if (strand==0) { ? c=s[slen-1]; //peek at the end of the string for strand if (c=='+' || c=='-' || c=='.') { if (end || IS_CHR_DELIM(s[slen-2])) strand=c; //slen--;s[slen]=0; } } bool GstrEq(const char* a, const char* b) { if (a==NULL || b==NULL) return false; return (strcmp(a, b)==0); } #ifdef __CYGWIN__ int strcasecmp (const char *s1, const char *s2) { int d = 0; for ( ; ; ) { const int c1 = tolower(*s1++); const int c2 = tolower(*s2++); if (((d = c1 - c2) != 0) || (c2 == '\0')) break; } return d; } int strncasecmp (const char *s1, const char *s2, size_t n) { int d = 0; for ( ; n != 0; n--) { const int c1 = tolower(*s1++); const int c2 = tolower(*s2++); if (((d = c1 - c2) != 0) || (c2 == '\0')) break; } return d; } #endif bool GstriEq(const char* a, const char* b) { if (a==NULL || b==NULL) return false; return (strcasecmp(a, b)==0); } int Gstricmp(const char* a, const char* b, int n) { if (a==NULL || b==NULL) return a==NULL ? 
-1 : 1; if (n>=0) return strncasecmp(a,b,n); else return strcasecmp(a,b); } int strsplit(char* str, GDynArray& fields, const char* delim, int maxfields) { //splits by placing 0 where any of the delim chars are found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int tidx=0; bool afterdelim=true; int i=0; fields.Reset(); while (str[i]!=0 && tidx& fields, const char delim, int maxfields) { //splits by placing 0 where delim is found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int tidx=0; bool afterdelim=true; int i=0; fields.Reset(); while (str[i]!=0 && tidx& fields, int maxfields) { //splits by placing 0 where delim is found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int tidx=0; bool afterdelim=true; int i=0; fields.Reset(); while (str[i]!=0 && tidx=str) { if (*p==ch) return p; p--; } return NULL; } /* DOS/UNIX safer fgets : reads a text line from a (binary) file and update the file position accordingly and the buffer capacity accordingly. The given buf is resized to read the entire line in memory -- even when it's abnormally long */ char* fgetline(char* & buf, int& buf_cap, FILE *stream, off_t* f_pos, int* linelen) { //reads a char at a time until \n and/or \r are encountered int c=0; GDynArray arr(buf, buf_cap); off_t fpos=(f_pos!=NULL) ? 
*f_pos : 0; while ((c=getc(stream))!=EOF) { if (c=='\n' || c=='\r') { if (c=='\r') { if ((c=getc(stream))!='\n') ungetc(c,stream); else fpos++; } fpos++; break; } fpos++; arr.Push((char)c); } //while i=str) { for (i=0; i=0 && s[i]==suffix[j]) { i--; j--; } return (j==-1); } bool endsiWith(const char* s, const char* suffix) { if (suffix==NULL || s==NULL) return false; if (suffix[0]==0) return true; //special case: empty suffix int j=strlen(suffix)-1; int i=strlen(s)-1; if (i=0 && tolower(s[i])==tolower(suffix[j])) { i--; j--; } return (j==-1); } bool trimSuffix(char* s, const char* suffix) { if (suffix==NULL || s==NULL) return false; if (suffix[0]==0) return true; //special case: empty suffix int j=strlen(suffix)-1; int i=strlen(s)-1; if (i=0 && s[i]==suffix[j]) { i--; j--; } if (j==-1) { //suffix found s[i+1]='\0'; //cut here return true; } return false; } bool trimiSuffix(char* s, const char* suffix) { if (suffix==NULL || s==NULL) return false; if (suffix[0]==0) return true; //special case: empty suffix int j=strlen(suffix)-1; int i=strlen(s)-1; if (i=0 && tolower(s[i])==tolower(suffix[j])) { i--; j--; } if (j==-1) { //suffix found s[i+1]='\0'; //cut here return true; } return false; } char* reverseChars(char* str, int slen) { if (slen==0) slen=strlen(str); int l=0; int r=slen-1; char c; while (l=lend) { for (i=0;i>24; h&=0x0fffffff; } GASSERT(h<=0x0fffffff); return h; } int djb_hash(const char* cp) { int h = 5381; while (*cp) h = (int)(33 * h ^ (unsigned char) *cp++); return (h & 0x7FFFFFFF); //always positive //return h; //return absolute value of this int: //int mask = (h >> (sizeof(int) * CHAR_BIT - 1)); //return (h + mask) ^ mask; } /* Fowler/Noll/Vo (FNV) hash function, variant 1a */ int fnv1a_hash(const char* cp) { int h = 0x811c9dc5; while (*cp) { h ^= (unsigned char) *cp++; h *= 0x01000193; } //return h; return (h & 0x7FFFFFFF); } // removes the last part (file or directory name) of a full path // this is a destructive operation for the given string!!! 
// the trailing '/' is guaranteed to be there void delFileName(char* filepath) { char *p, *sep; if (filepath==NULL) return; for (p=filepath, sep=filepath;*p!='\0';p++) if (*p=='/' || *p=='\\') sep=p+1; *sep='\0'; // truncate filepath } // returns a pointer to the last file or directory name in a full path const char* getFileName(const char* filepath) { const char *p, *sep; if (filepath==NULL) return NULL; for (p=filepath, sep=filepath;*p!='\0';p++) if (*p=='/' || *p=='\\') sep=p+1; return sep; } // returns a pointer to the file "extension" part in a filename const char* getFileExt(const char* filepath) { const char *p, *dp, *sep; if (filepath==NULL) return NULL; for (p=filepath, dp=filepath, sep=filepath;*p!='\0';p++) { if (*p=='.') dp=p+1; else if (*p=='/' || *p=='\\') sep=p+1; } return (dp>sep) ? dp : NULL ; } int fileExists(const char* fname) { struct stat stFileInfo; int r=0; // Attempt to get the path attributes int fs = stat(fname,&stFileInfo); if (fs == 0) { r=3; // We were able to get the file attributes // so the path exists if (S_ISREG (stFileInfo.st_mode)) { r=2; } if (S_ISDIR (stFileInfo.st_mode)) { r=1; } } return r; } int64 fileSize(const char* fpath) { #ifdef _WIN32 WIN32_FILE_ATTRIBUTE_DATA fad; if (!GetFileAttributesEx(fpath, GetFileExInfoStandard, &fad)) return -1; // error condition, could call GetLastError to find out more LARGE_INTEGER size; size.HighPart = fad.nFileSizeHigh; size.LowPart = fad.nFileSizeLow; return size.QuadPart; #else struct stat results; if (stat(fpath, &results) == 0) // The size of the file in bytes is in return (int64)results.st_size; else //An error occurred //GMessage("Error at stat(%s)!\n", fpath); return -1; #endif } bool parseNumber(char* &p, double& v) { //skip any spaces.. while (*p==' ' || *p=='\t') p++; char* start=p; /*if (*p=='-') p++; else if (*p=='+') { p++;start++; }*/ /* while ((*p>='1' && *p<='9') || *p=='0' || *p=='.' 
|| *p=='-' || tolower(*p)=='e') p++; */ int numlen=strspn(start, "0123456789eE.-+"); p=start+numlen; //now p is on a non-digit; if (*start=='-' && p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; v=strtod(start,&endptr); *p=saved; if (endptr!=p) return false; return true; } bool parseDouble(char* &p, double& v) { return parseNumber(p,v); } bool parseFloat(char* &p, float& v) { double dv; bool parsed=parseNumber(p,dv); if (parsed) v=(float)dv; return parsed; } bool parseInt(char* &p, int& i) { //pointer p is advanced after the number while (*p==' ' || *p=='\t') p++; char* start=p; char* p0=p; if (*p=='-') p++; else if (*p=='+') { p++;start++; } char* atdigits=p; while (*p>='0' && *p<='9') p++; //now p should be past the digits if (atdigits==p) {//no digits found! p=p0; return false; } char* endptr=NULL; long l=strtol(start,&endptr,10); i=(int)l; if (endptr!=p || endptr==start || i!=l) { p=p0; return false; } return true; } bool strToInt(char* p, int& i) { while (*p==' ' || *p=='\t') p++; char* start=p; if (*p=='-') p++; else if (*p=='+') { p++;start++; } char* atdigits=p; while (*p>='0' && *p<='9') p++; //now p should be past the digits if (atdigits==p) //no digits found! 
return false; char* endptr=NULL; long l=strtol(start,&endptr,10); i=(int)l; if (endptr!=p || endptr==start || i!=l) return false; return true; } bool strToUInt(char* p, uint& i) { while (*p==' ' || *p=='\t') p++; char* start=p; if (*p=='-') return false; else if (*p=='+') { p++;start++; } while (*p>='0' && *p<='9') p++; //now p is on a non-digit; if (start==p) return false; char* endptr=NULL; unsigned long l=strtoul(start,&endptr,10); i=(uint) l; if (endptr!=p || endptr==start || i!=l) return false; return true; } bool parseUInt(char* &p, uint& i) { //pointer p is advanced after the number while (*p==' ' || *p=='\t') p++; char* p0=p; char* start=p; if (*p=='-') return false; else if (*p=='+') { p++;start++; } while (*p>='0' && *p<='9') p++; //now p is on a non-digit; if (start==p) { p=p0; return false; } char* endptr=NULL; unsigned long l=strtoul(start,&endptr,10); i=(uint) l; if (endptr!=p || endptr==start || i!=l) { p=p0; return false; } return true; } bool parseHex(char* &p, uint& i) { //skip initial spaces/prefix while (*p==' ' || *p=='\t' || *p=='0' || *p=='x') p++; char* start=p; if (*p=='-') return false; else if (*p=='+') { p++;start++; } while (isxdigit(*p)) p++; //now p is on a non-hexdigit; if (p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; unsigned long l=strtoul(start,&endptr,16); i=(uint) l; *p=saved; if (endptr!=p || i!=l) return false; return true; } //write a formatted fasta record, fasta formatted void writeFasta(FILE *fw, const char* seqid, const char* descr, const char* seq, int linelen, int seqlen) { fflush(fw); // write header line only if given! 
if (seqid!=NULL) { if (descr==NULL || descr[0]==0) fprintf(fw,">%s\n",seqid); else fprintf(fw,">%s %s\n",seqid, descr); } fflush(fw); if (seq==NULL || *seq==0) return; //nothing to print if (linelen==0) { //unlimited line length: write the whole sequence on a line if (seqlen>0) fwrite((const void*)seq, 1, seqlen,fw); else fprintf(fw,"%s",seq); fprintf(fw,"\n"); fflush(fw); return; } int ilen=0; if (seqlen>0) { //seq length given, so we know when to stop for (int i=0; i < seqlen; i++, ilen++) { if (ilen == linelen) { fputc('\n', fw); ilen = 0; } fputc(seq[i], fw); } fputc('\n', fw); } else { //seq length not given, stop when 0 encountered for (int i=0; seq[i]!=0; i++, ilen++) { if (ilen == linelen) { fputc('\n', fw); ilen = 0; } fputc(seq[i], fw); } //for fputc('\n', fw); } fflush(fw); } char* commaprintnum(uint64 n) { char retbuf[48]; int comma = ','; char *p = &retbuf[sizeof(retbuf)-1]; int i = 0; *p = '\0'; do { if(i%3 == 0 && i != 0) *--p = comma; *--p = '0' + n % 10; n /= 10; i++; } while(n != 0); return Gstrdup(p); } gclib-0.12.7/GBase.h000066400000000000000000000545741407072766100140440ustar00rootroot00000000000000#ifndef G_BASE_DEFINED #define G_BASE_DEFINED #define GCLIB_VERSION "0.12.7" #ifdef HAVE_CONFIG_H #include "config.h" #endif #if defined(__WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(_WIN64) || defined(__MINGW64__) || defined(__WINDOWS__) #ifndef _WIN32 #define _WIN32 #endif #ifndef _WIN64 #define _WIN64 #endif #define __USE_MINGW_ANSI_STDIO 1 //#define __ISO_C_VISIBLE 1999 #endif #define XSTR(x) STR(x) #define STR(x) #x #ifdef _WIN32 #include #include #define CHPATHSEP '\\' #undef off_t #define off_t int64_t #ifndef popen #define popen _popen #endif /* #ifndef fseeko #ifdef _fseeki64 #define fseeko(stream, offset, origin) _fseeki64(stream, offset, origin) #else #define fseeko fseek #endif #endif #ifndef ftello #ifdef _ftelli64 #define ftello(stream) _ftelli64(stream) #else #define ftello ftell #endif #endif */ #else #define CHPATHSEP 
'/' #ifdef __CYGWIN__ #define _BSD_SOURCE #endif #include #endif #ifdef DEBUG #undef NDEBUG #define _DEBUG 1 #define _DEBUG_ 1 #endif #include #include #include #include #include #include #include #include #include #include typedef int64_t int64; typedef uint64_t uint64; typedef int32_t int32; typedef uint32_t uint32; typedef int16_t int16; typedef uint16_t uint16; typedef unsigned char uchar; typedef uint8_t byte; typedef unsigned int uint; typedef void* pointer; #ifndef MAXUINT #define MAXUINT ((unsigned int)-1) #endif #ifndef MAXINT #define MAXINT INT_MAX #endif #ifndef MAX_UINT #define MAX_UINT ((unsigned int)-1) #endif #ifndef MAX_INT #define MAX_INT INT_MAX #endif /****************************************************************************/ #ifndef EXIT_FAILURE #define EXIT_FAILURE 1 #endif #ifndef EXIT_SUCCESS #define EXIT_SUCCESS 0 #endif /****************************************************************************/ #define ERR_ALLOC "Error allocating memory.\n" //------------------- #define GEXIT(a) { \ fprintf(stderr, "Error: "); fprintf(stderr, a); \ GError("Exiting from line %i in file %s\n",__LINE__,__FILE__); \ } // Debug helpers #ifndef NDEBUG #define GASSERT(exp) ((exp)?((void)0):(void)GAssert(#exp,__FILE__,__LINE__)) #define GVERIFY(condition) \ if (!(condition)) { \ fprintf(stderr, "Assumption \"%s\"\nFailed in file %s: at line:%i\n", \ #condition,__FILE__,__LINE__); \ GEXIT(#condition);} #ifdef TRACE #define GTRACE(exp) (GMessage(exp)) #else #define GTRACE(exp) #endif #else #define GASSERT(exp) #define GTRACE(exp) #define GVERIFY(condition) #endif #define GERROR(exp) (GError(exp)) // Abolute value #define GABS(val) (((val)>=0)?(val):-(val)) // Min and Max #define GMAX(a,b) (((a)>(b))?(a):(b)) #define GMIN(a,b) (((a)>(b))?(b):(a)) // Min of three #define GMIN3(x,y,z) ((x)<(y)?GMIN(x,z):GMIN(y,z)) // Max of three #define GMAX3(x,y,z) ((x)>(y)?GMAX(x,z):GMAX(y,z)) // Return minimum and maximum of a, b #define GMINMAX(lo,hi,a,b) 
// Three-way integer comparison for use in sort/compare callbacks.
// Returns a negative value if a<b, 0 if a==b and a positive value if a>b
// (specifically -1, 0 or 1).
// The result is derived from comparisons rather than from a-b, because the
// subtraction overflows (undefined behaviour, wrong sign in practice) when
// the operands have opposite signs and large magnitude.
inline int Gintcmp(int a, int b) {
 return (a>b) - (a<b);
}
char* Gstrdup(const char* sfrom, const char* sto); //same as GStrdup, but with an early termination (e.g. on delimiter) char* Gsubstr(const char* str, char* from, char* to=NULL); //extracts a substring, allocating it, including boundaries (from/to) char* replaceStr(char* &str, char* newvalue); //conversion: to Lower/Upper case // creating a new string: char* upCase(const char* str); char* loCase(const char* str); // changing string in place: char* strlower(char * str); char* strupper(char * str); //strstr but for memory zones: scans a memory region //for a substring: void* Gmemscan(void *mem, unsigned int len, void *part, unsigned int partlen); FILE* Gfopen(const char *path, char *mode=NULL); // test if a char is in a string: bool chrInStr(char c, const char* str); char* rstrchr(char* str, char ch); /* returns a pointer to the rightmost occurence of ch in str - like rindex for platforms missing it*/ char* strchrs(const char* s, const char* chrs); //strchr but with a set of chars instead of only one char* rstrfind(const char* str, const char *substr); // like rindex() but for strings; right side version of strstr() char* reverseChars(char* str, int slen=0); //in place reversal of string char* rstrstr(const char* rstart, const char *lend, const char* substr); /*the reversed, rightside equivalent of strstr: starts searching from right end (rstart), going back to left end (lend) and returns a pointer to the last (right) matching character in str */ char* strifind(const char* str, const char* substr); // case insensitive version of strstr -- finding a string within another string // returns NULL if not found //Determines if a string begins with a given prefix //(returns false when any of the params is NULL, // but true when prefix is '' (empty string)!) 
bool startsWith(const char* s, const char* prefix); bool startsiWith(const char* s, const char* prefix); //case insensitive bool endsWith(const char* s, const char* suffix); //Note: returns true if suffix is empty string, but false if it's NULL bool endsiWith(const char* s, const char* suffix); //case insensitive version //like endsWith but also remove the suffix if found //returns true if the given suffix was found and removed bool trimSuffix(char* s, const char* suffix); //case insensitive version: bool trimiSuffix(char* s, const char* suffix); // ELF hash function for strings int strhash(const char* str); //alternate hash functions: int fnv1a_hash(const char* cp); int djb_hash(const char* cp); //---- generic base GSeg : genomic segment (interval) -- // coordinates are considered 1-based (so 0 is invalid) struct GSeg { uint start; //starte) { start=e;end=s; } else { start=s;end=e; } } //check for overlap with other segment uint len() { return end-start+1; } bool overlap(GSeg* d) { return (start<=d->end && end>=d->start); } bool overlap(GSeg& d) { return (start<=d.end && end>=d.start); } bool overlap(GSeg& d, int fuzz) { return (start<=d.end+fuzz && end+fuzz>=d.start); } bool overlap(uint x) { return (start<=x && x<=end); } bool overlap(uint s, uint e) { if (s>e) { Gswap(s,e); } return (start<=e && end>=s); } //return the length of overlap between two segments int overlapLen(GSeg* r) { if (startstart) { if (r->start>end) return 0; return (r->end>end) ? end-r->start+1 : r->end-r->start+1; } else { //r->start<=start if (start>r->end) return 0; return (r->endend-start+1 : end-start+1; } } int overlapLen(uint rstart, uint rend) { if (rstart>rend) { Gswap(rstart,rend); } if (startend) return 0; return (rend>end) ? 
end-rstart+1 : rend-rstart+1; } else { //rstart<=start if (start>rend) return 0; return (rendstart && end>=s->end); } bool contained(GSeg* s) { return (s->start<=start && s->end>=end); } bool equals(GSeg& d){ return (start==d.start && end==d.end); } bool equals(GSeg* d){ return (start==d->start && end==d->end); } //fuzzy coordinate matching: bool coordMatch(GSeg* s, uint fuzz=0) { //caller must check for s!=NULL if (fuzz==0) return (start==s->start && end==s->end); uint sd = (start>s->start) ? start-s->start : s->start-start; uint ed = (end>s->end) ? end-s->end : s->end-end; return (sd<=fuzz && ed<=fuzz); } void expand(int by) { //expand in both directions start-=by; end+=by; } void expandInclude(uint rstart, uint rend) { //expand to include given coordinates if (rstart>rend) { Gswap(rstart,rend); } if (rstartend) end=rend; } //comparison operators required for sorting bool operator==(GSeg& d){ return (start==d.start && end==d.end); } bool operator<(GSeg& d){ return (start==d.start)?(end=fCount) GError(GDynArray_INDEX_ERR, x, fCount) #endif #define GDynArray_MAXCOUNT UINT_MAX-1 #define GDynArray_NOIDX UINT_MAX //basic dynamic array (vector) template for simple/primitive types or structs //Warning: uses malloc so it will never call the item's default constructor when growing template class GDynArray { protected: bool byptr; //in-place copy (pointer) takeover of existing OBJ[] OBJ *fArray; uint fCount; uint fCapacity; // size of allocated memory const static uint dyn_array_defcap = 16; // initial capacity (in elements) public: GDynArray(int initcap=dyn_array_defcap):byptr(false), fArray(NULL), fCount(0), fCapacity(initcap) { // constructor GMALLOC(fArray, fCapacity*sizeof(OBJ)); } GDynArray(const GDynArray &a):byptr(false), fArray(NULL), fCount(a.fCount), fCapacity(a.fCapacity) { // copy constructor GMALLOC(fArray, sizeof(OBJ)*a.fCapacity); memcpy(fArray, a.fArray, sizeof(OBJ)* a.fCapacity); } GDynArray(OBJ* ptr, uint pcap):byptr(true), fArray(ptr), fCount(0), 
fCapacity(pcap) { //this will never deallocate the passed pointer } virtual ~GDynArray() { if (!byptr) { GFREE(fArray); } } GDynArray& operator = (const GDynArray &a) { // assignment operator if (this == &a) return *this; if (a.fCount == 0) { Clear(); return *this; } growTo(a.fCapacity); //set size memcpy(fArray, a.fArray, sizeof(OBJ)*a.fCount); return *this; } OBJ& operator[] (uint idx) {// get array item GDynArray_TEST_INDEX(idx); return fArray[idx]; } void Grow() { int delta = (fCapacity>16) ? (fCapacity>>2) : 2; if (GDynArray_MAXCOUNT-delta<=fCapacity) delta=GDynArray_MAXCOUNT-fCapacity; if (delta<=1) GError("Error at GDynArray::Grow(): max capacity reached!\n"); growTo(fCapacity + delta); } #define GDynArray_ADD(item) \ if (fCount==MAX_UINT-1) GError("Error at GDynArray: cannot add item, maximum count reached!\n"); \ if ((++fCount) > fCapacity) Grow(); \ fArray[fCount-1] = item; uint Add(OBJ* item) { // Add item to the end of array //element given by pointer if (item==NULL) return GDynArray_NOIDX; GDynArray_ADD( (*item) ); return (fCount-1); } uint Add(OBJ item) { // Add OBJ copy to the end of array GDynArray_ADD(item); return (fCount-1); } uint Push(OBJ item) { //same as Add GDynArray_ADD(item); return (fCount-1); } OBJ Pop() { //shoddy.. Do NOT call this for an empty array! if (fCount==0) return (OBJ)NULL; //a NULL cast operator is required --fCount; return fArray[fCount]; } uint Count() { return fCount; } // get size of array (elements) uint Capacity() { return fCapacity; } void growTo(uint newcap) { if (newcap==0) { Clear(); return; } if (newcap <= fCapacity) return; //never shrink! 
(use Pack() for shrinking) GREALLOC(fArray, newcap*sizeof(OBJ)); fCapacity=newcap; } void append(OBJ* arr, uint count) { //fast adding of a series of objects growTo(fCount+count); memcpy(fArray+fCount, arr, count*sizeof(OBJ)); fCount+=count; } void append(GDynArray arr) { //fast adding of a series of objects growTo(fCount+arr.fCount); memcpy(fArray+fCount, arr.fArray, arr.fCount*sizeof(OBJ)); fCount+=arr.fCount; } void Trim(int tcount=1) { //simply cut (discard) the last tcount items //new Count is now fCount-tcount //does NOT shrink capacity accordingly! if (fCount>=tcount) fCount-=tcount; } void Pack() { //shrink capacity to fCount+dyn_array_defcap if (fCapacity-fCount<=dyn_array_defcap) return; int newcap=fCount+dyn_array_defcap; GREALLOC(fArray, newcap*sizeof(OBJ)); fCapacity=newcap; } void zPack(OBJ z) { //shrink capacity to fCount+1 and adds a z terminator if (fCapacity-fCount<=1) { fArray[fCount]=z; return; } int newcap=fCount+1; GREALLOC(fArray, newcap*sizeof(OBJ)); fCapacity=newcap; fArray[fCount]=z; } inline void Shrink() { Pack(); } void Delete(uint idx) { GDynArray_TEST_INDEX(idx); --fCount; if (idx& fields, const char* delim, int maxfields=MAX_INT); //splits a string by placing 0 where any of the delim chars are found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int strsplit(char* str, GDynArray& fields, const char delim, int maxfields=MAX_INT); //splits a string by placing 0 where the delim char is found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int strsplit(char* str, GDynArray& fields, int maxfields=MAX_INT); //splits by tab or space //splits a string by placing 0 where tab or space is found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed // ************** simple line reading class for text files //GLineReader -- text line reading/buffering class class 
GLineReader { bool closeFile; //int len; //int allocated; GDynArray buf; int textlen; //length of actual text, without newline character(s) bool isEOF; FILE* file; off_t filepos; //current position bool pushed; //pushed back int lcount; //line counter (read lines) public: char* chars() { return buf(); } char* line() { return buf(); } int readcount() { return lcount; } //number of lines read void setFile(FILE* stream) { file=stream; } int blength() { return buf.Count(); } //binary/buffer length, including newline character(s) int charcount() { return buf.Count(); } //line length, including newline character(s) int tlength() { return textlen; } //text length excluding newline character(s) int linelen() { return textlen; } //line length, excluding newline character(s) //int size() { return buf.Count(); } //same as size(); bool isEof() {return isEOF; } bool eof() { return isEOF; } off_t getfpos() { return filepos; } off_t getFpos() { return filepos; } char* nextLine() { return getLine(); } char* getLine() { if (pushed) { pushed=false; return buf(); } else return getLine(file); } char* getLine(FILE* stream) { if (pushed) { pushed=false; return buf(); } else return getLine(stream, filepos); } char* getLine(FILE* stream, off_t& f_pos); //read a line from a stream and update // the given file position void pushBack() { if (lcount>0) pushed=true; } // "undo" the last getLine request // so the next call will in fact return the same line GLineReader(const char* fname):closeFile(false),buf(1024), textlen(0), isEOF(false),file(NULL),filepos(0), pushed(false), lcount(0) { FILE* f=fopen(fname, "rb"); if (f==NULL) GError("Error opening file '%s'!\n",fname); closeFile=true; file=f; } GLineReader(FILE* stream=NULL, off_t fpos=0):closeFile(false),buf(1024), textlen(0), isEOF(false),file(stream), filepos(fpos), pushed(false), lcount(0) { } ~GLineReader() { if (closeFile) fclose(file); } }; /* extended fgets() - to read one full line from a file and update the file position correctly ! 
buf will be reallocated as necessary, to fit the whole line */ char* fgetline(char* & buf, int& buflen, FILE* stream, off_t* f_pos=NULL, int* linelen=NULL); //print int/values nicely formatted in 3-digit groups char* commaprintnum(uint64 n); /*********************** File management functions *********************/ // removes the last part (file or directory name) of a full path // WARNING: this is a destructive operation for the given string! void delFileName(char* filepath); // returns a pointer to the last file or directory name in a full path const char* getFileName(const char* filepath); // returns a pointer to the file "extension" part in a filename const char* getFileExt(const char* filepath); int fileExists(const char* fname); //returns 0 if path doesn't exist // 1 if it's a directory // 2 if it's a regular file // 3 something else (but entry exists) int64 fileSize(const char* fpath); //write a formatted fasta record, fasta formatted void writeFasta(FILE *fw, const char* seqid, const char* descr, const char* seq, int linelen=60, int seqlen=0); //parses the next number found in a string at the current position //until a non-digit (and not a '.', 'e','E','-','+') is encountered; //updates the char* pointer to be after the last digit parsed bool parseNumber(char* &p, double& v); bool parseDouble(char* &p, double& v); //just an alias for parseNumber bool parseFloat(char* &p, float& v); bool strToInt(char* p, int& i); bool strToUInt(char* p, uint& i); bool parseInt(char* &p, int& i); //advance pointer p after the number bool parseUInt(char* &p, uint& i); //advance pointer p after the number bool parseHex(char* &p, uint& i); #endif /* G_BASE_DEFINED */ gclib-0.12.7/GBitVec.h000066400000000000000000000327741407072766100143440ustar00rootroot00000000000000#ifndef __GBITVEC_H__ #define __GBITVEC_H__ #include "GBase.h" //this code is lifted from LLVM (llvm.org, BitVector.h) /// bitCount_32 - this function counts the number of set bits in a value. /// Ex. 
/// bitCountTrailingZeros_32 - counts the zero bits below the least
/// significant set bit of Value, e.g. 0xFF00FF00 -> 8.
/// Returns 32 when Value is zero.
inline unsigned bitCountTrailingZeros_32(uint32_t Value) {
#if __GNUC__ >= 4
  if (Value == 0) return 32;
  return __builtin_ctz(Value);
#else
  // Lookup table indexed by (isolated lowest set bit) mod 37; index 0 is the
  // Value==0 case.
  static const unsigned kLowBitMod37[] = {
    32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13,
    4, 7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9,
    5, 20, 8, 19, 18
  };
  const uint32_t lowestSetBit = Value & (0u - Value);
  return kLowBitMod37[lowestSetBit % 37];
#endif
}
__builtin_ctzll(Value) : 64; #else static const unsigned Mod67Position[] = { 64, 0, 1, 39, 2, 15, 40, 23, 3, 12, 16, 59, 41, 19, 24, 54, 4, 64, 13, 10, 17, 62, 60, 28, 42, 30, 20, 51, 25, 44, 55, 47, 5, 32, 65, 38, 14, 22, 11, 58, 18, 53, 63, 9, 61, 27, 29, 50, 43, 46, 31, 37, 21, 57, 52, 8, 26, 49, 45, 36, 56, 7, 48, 35, 6, 34, 33, 0 }; return Mod67Position[(-Value & Value) % 67]; #endif } class GBitVec { typedef unsigned long BitWord; enum { BITWORD_SIZE = (uint)sizeof(BitWord) * CHAR_BIT }; BitWord *fBits; // Actual bits. uint Size; // Size of GBitVec in bits. uint Capacity; // Size of allocated memory in BitWord. public: // Encapsulation of a single bit. class GBitRef { friend class GBitVec; BitWord *WordRef; uint BitPos; GBitRef(); // Undefined public: GBitRef(GBitVec &b, uint Idx) { WordRef = &b.fBits[Idx / BITWORD_SIZE]; BitPos = Idx % BITWORD_SIZE; } ~GBitRef() {} GBitRef &operator=(GBitRef t) { *this = bool(t); return *this; } GBitRef& operator=(bool t) { if (t) *WordRef |= 1L << BitPos; else *WordRef &= ~(1L << BitPos); return *this; } operator bool() const { return ((*WordRef) & (1L << BitPos)) ? true : false; } }; /// GBitVec default ctor - Creates an empty GBitVec. GBitVec() : Size(0), Capacity(0) { fBits = 0; } /// GBitVec ctor - Creates a GBitVec of specified number of bits. All /// bits are initialized to the specified value. 
explicit GBitVec(uint bitsize, bool value = false) : Size(bitsize) { if (bitsize==0) { Capacity=0; fBits=0; return; } Capacity = NumBitWords(bitsize); //fBits = (BitWord *)std::malloc(Capacity * sizeof(BitWord)); GMALLOC(fBits, Capacity * sizeof(BitWord)); init_words(fBits, Capacity, value); if (value) clear_unused_bits(); } unsigned long getMemorySize() const { unsigned long r = ((unsigned long) Capacity) * sizeof(BitWord); return r; } GBitVec(const GBitVec* RHS) { if (RHS==NULL) { Size = 0; fBits = 0; Capacity = 0; return; } Capacity = NumBitWords(RHS->size()); GMALLOC(fBits, Capacity * sizeof(BitWord)); memcpy(fBits, RHS->fBits, Capacity * sizeof(BitWord)); } /// GBitVec copy ctor. GBitVec(const GBitVec &RHS) : Size(RHS.size()) { if (Size == 0) { fBits = 0; Capacity = 0; return; } Capacity = NumBitWords(RHS.size()); GMALLOC(fBits, Capacity * sizeof(BitWord)); memcpy(fBits, RHS.fBits, Capacity * sizeof(BitWord)); } ~GBitVec() { GFREE(fBits); } /// empty - Tests whether there are no bits in this GBitVec. bool empty() const { return Size == 0; } /// size - Returns the number of bits in this GBitVec. uint size() const { return Size; } void bitSizeError() { GError("Error at GBitVec: unsupported BitWord size (%d)!\n", sizeof(BitWord)); } /// count - Returns the number of bits which are set. uint count() { uint NumBits = 0; for (uint i = 0; i < NumBitWords(size()); ++i) if (sizeof(BitWord) == 4) NumBits += bitCount_32((uint32_t)fBits[i]); else if (sizeof(BitWord) == 8) NumBits += bitCount_64(fBits[i]); else bitSizeError(); return NumBits; } /// any - Returns true if any bit is set. bool any() { for (uint i = 0; i < NumBitWords(size()); ++i) if (fBits[i] != 0) return true; return false; } /// all - Returns true if all bits are set. bool all() { // TODO: Optimize this. return count() == size(); } /// none - Returns true if none of the bits are set. bool none() { return !any(); } /// find_first - Returns the index of the first set bit, -1 if none /// of the bits are set. 
int find_first() { for (uint i = 0; i < NumBitWords(size()); ++i) if (fBits[i] != 0) { if (sizeof(BitWord) == 4) return i * BITWORD_SIZE + bitCountTrailingZeros_32((uint32_t)fBits[i]); else if (sizeof(BitWord) == 8) return i * BITWORD_SIZE + bitCountTrailingZeros_64(fBits[i]); else bitSizeError(); } return -1; } /// find_next - Returns the index of the next set bit following the /// "Prev" bit. Returns -1 if the next set bit is not found. int find_next(uint Prev) { ++Prev; if (Prev >= Size) return -1; uint WordPos = Prev / BITWORD_SIZE; uint BitPos = Prev % BITWORD_SIZE; BitWord Copy = fBits[WordPos]; // Mask off previous bits. Copy &= ~0UL << BitPos; if (Copy != 0) { if (sizeof(BitWord) == 4) return WordPos * BITWORD_SIZE + bitCountTrailingZeros_32((uint32_t)Copy); else if (sizeof(BitWord) == 8) return WordPos * BITWORD_SIZE + bitCountTrailingZeros_64(Copy); else bitSizeError(); } // Check subsequent words. for (uint i = WordPos+1; i < NumBitWords(size()); ++i) if (fBits[i] != 0) { if (sizeof(BitWord) == 4) return i * BITWORD_SIZE + bitCountTrailingZeros_32((uint32_t)fBits[i]); else if (sizeof(BitWord) == 8) return i * BITWORD_SIZE + bitCountTrailingZeros_64(fBits[i]); else bitSizeError(); } return -1; } /// clear - Clear all bits; does NOT release memory void clear() { Size = 0; } /// resize - Grow or shrink the GBitVec. void resize(uint N, bool value = false) { if (N > Capacity * BITWORD_SIZE) { uint OldCapacity = Capacity; grow(N); init_words(&fBits[OldCapacity], (Capacity-OldCapacity), value); } // Set any old unused bits that are now included in the GBitVec. This // may set bits that are not included in the new vector, but we will clear // them back out below. 
if (N > Size) set_unused_bits(value); // Update the size, and clear out any bits that are now unused uint OldSize = Size; Size = N; if (value || N < OldSize) clear_unused_bits(); } void reserve(uint N) { if (N > Capacity * BITWORD_SIZE) grow(N); } // Set, reset, flip GBitVec &set() { init_words(fBits, Capacity, true); clear_unused_bits(); return *this; } GBitVec &set(uint Idx) { #ifndef NDEBUG indexCheck(Idx, Size); #endif fBits[Idx / BITWORD_SIZE] |= 1L << (Idx % BITWORD_SIZE); return *this; } GBitVec &reset() { init_words(fBits, Capacity, false); return *this; } GBitVec &reset(uint Idx) { #ifndef NDEBUG indexCheck(Idx, Size); #endif fBits[Idx / BITWORD_SIZE] &= ~(1L << (Idx % BITWORD_SIZE)); return *this; } GBitVec &flip() { for (uint i = 0; i < NumBitWords(size()); ++i) fBits[i] = ~fBits[i]; clear_unused_bits(); return *this; } GBitVec &flip(uint Idx) { #ifndef NDEBUG indexCheck(Idx, Size); #endif fBits[Idx / BITWORD_SIZE] ^= 1L << (Idx % BITWORD_SIZE); return *this; } // No argument flip. GBitVec operator~() const { return GBitVec(*this).flip(); } inline static void indexCheck(uint vIdx, uint vSize) { if (vIdx >= vSize) GError("Error at GBitVec: index %d out of bounds (size %d)\n", (int)vIdx, vSize); } // Indexing. GBitRef operator[](uint Idx) { //assert (Idx < Size && "Out-of-bounds Bit access."); #ifndef NDEBUG indexCheck(Idx, Size); #endif return GBitRef(*this, Idx); } bool operator[](uint Idx) const { #ifndef NDEBUG indexCheck(Idx, Size); #endif BitWord Mask = 1L << (Idx % BITWORD_SIZE); return (fBits[Idx / BITWORD_SIZE] & Mask) != 0; } bool test(uint Idx) const { return (*this)[Idx]; } // Comparison operators. bool operator==(const GBitVec &RHS) const { uint ThisWords = NumBitWords(size()); uint RHSWords = NumBitWords(RHS.size()); uint i; uint imax=GMIN(ThisWords, RHSWords); for (i = 0; i != imax; ++i) if (fBits[i] != RHS.fBits[i]) return false; // Verify that any extra words are all zeros. 
// Union: ORs every word of RHS into this vector, first resizing this vector
// when RHS holds more bits than it does.
GBitVec &operator|=(const GBitVec &RHS) {
  if (RHS.size() > size()) resize(RHS.size());
  const size_t rhsWords = NumBitWords(RHS.size());
  size_t w = 0;
  while (w != rhsWords) {
    fBits[w] |= RHS.fBits[w];
    ++w;
  }
  return *this;
}
GFREE(fBits); fBits = NewBits; return *this; } void swap(GBitVec &RHS) { Gswap(fBits, RHS.fBits); Gswap(Size, RHS.Size); Gswap(Capacity, RHS.Capacity); } private: uint NumBitWords(uint S) const { return (S + BITWORD_SIZE-1) / BITWORD_SIZE; } // Set the unused bits in the high words. void set_unused_bits(bool value = true) { // Set high words first. uint UsedWords = NumBitWords(Size); if (Capacity > UsedWords) init_words(&fBits[UsedWords], (Capacity-UsedWords), value); // Then set any stray high bits of the last used word. uint ExtraBits = Size % BITWORD_SIZE; if (ExtraBits) { BitWord ExtraBitMask = ~0UL << ExtraBits; if (value) fBits[UsedWords-1] |= ExtraBitMask; else fBits[UsedWords-1] &= ~ExtraBitMask; } } // Clear the unused bits in the high words. void clear_unused_bits() { set_unused_bits(false); } void grow(uint NewSize) { Capacity = GMAX(NumBitWords(NewSize), Capacity * 2); //fBits = (BitWord *)std::realloc(fBits, Capacity * sizeof(BitWord)); GREALLOC(fBits, Capacity * sizeof(BitWord)); clear_unused_bits(); } void init_words(BitWord *B, uint NumWords, bool value) { memset(B, 0 - (int)value, NumWords*sizeof(BitWord)); } }; inline GBitVec operator&(const GBitVec &LHS, const GBitVec &RHS) { GBitVec Result(LHS); Result &= RHS; return Result; } inline GBitVec operator|(const GBitVec &LHS, const GBitVec &RHS) { GBitVec Result(LHS); Result |= RHS; return Result; } inline GBitVec operator^(const GBitVec &LHS, const GBitVec &RHS) { GBitVec Result(LHS); Result ^= RHS; return Result; } inline void Gswap(GBitVec &LHS, GBitVec &RHS) { LHS.swap(RHS); } #endif gclib-0.12.7/GCdbYank.cpp000066400000000000000000000361511407072766100150270ustar00rootroot00000000000000#include "GCdbYank.h" #include "GBase.h" #include #define ERR_READ "GCdbYank: error reading from file.\n" #define ERR_READFMT "GCdbYank read error: incorrect file format.\n" #define ERR_RANGEFMT "Sequence range parsing error for key '%s'\n" #define ERR_RANGE_INVALID "Invalid range (%d-%d) specified for sequence 
'%s' of length %d\n" // 1MB memory buffer: #define MAX_MEM_RECSIZE 1048576 #ifndef O_BINARY #define O_BINARY 0x0000 #endif //default size of the index record for records stored with 32bit offsets uint32 irec_size32=8; #ifdef ENABLE_COMPRESSION GCdbZFasta::GCdbZFasta(FILE* azf, int zrsize, char* r_delim) { zrecsize=-1; zpos=0; recdelim=Gstrdup(r_delim); zf=azf; decomp_start(zrsize); chrhandler=new GFastaCharHandler(recdelim); } GCdbZFasta::~GCdbZFasta() { //if (zf!=NULL && zf!=stdout && zf!=stdin) fclose(zf); // FULL_FLUSH method instead of finish delete chrhandler; GFREE(recdelim); decomp_end(); } void GCdbZFasta::decomp_start(int zrsize) { zstream.zalloc = (alloc_func)0; zstream.zfree = (free_func)0; zstream.opaque = (voidpf)0; zstream.next_in = (Bytef*)sbuf; zstream.avail_in = 0; zstream.next_out = (Bytef*)lbuf; int err = inflateInit(&zstream); if (err!=Z_OK) GMessage("Error at inflateInit()\n"); //-- now read and discard the first record, so we can use random access later // (needed by zlib) int bytes_read=fread(sbuf, 1, zrsize, zf); if (bytes_read=0) { if (fseeko(zf, zfofs, 0)) GError("GCdbZFasta::decompress: error fseeko() to %d\n", zfofs); } else if (feof(zf)) return 0; bool in_rec=true; int err=0; int total_read=0; int total_written=0; chrhandler->init(&rec, seqCallBack); while (in_rec) { // read loop int to_read=0; int bytes_read=0; if (csize<=0) { //read one byte at a time to_read=1; int c; if ((c =fgetc(zf))!=EOF) { bytes_read = 1; sbuf[0]=c; } else { //bytes_read=0; return 0; //eof } total_read+=bytes_read; } else { to_read = csize-total_read>GCDBZ_SBUF_LEN ? 
// Converts the 0-terminated string c to lower case in place.
// A NULL pointer is accepted and ignored.
// Each character is passed to tolower() as unsigned char: handing it a plain
// (possibly negative) char is undefined behaviour for bytes >= 0x80.
void inplace_Lower(char* c) {
  if (c==NULL) return;
  for (char* p=c; *p!='\0'; ++p) {
    *p=(char)tolower((unsigned char)*p);
  }
}
b, uint32& pos, uint32 *num) { char buf[4]; buf_get(b, pos, buf, 4); uint32_unpack(buf,num); } int read_dbinfo(int fd, char** fnameptr, cdbInfo& dbstat) { //this is messy due to the need of compatibility with the //old 32bit file-length char* dbname=*fnameptr; //read just the tag first: 4 bytes ID lseek(fd, -cdbInfoSIZE, SEEK_END); int r=read(fd, &dbstat, cdbInfoSIZE ); if (r!=cdbInfoSIZE) return 2; //GMessage("Size of dbstat=%d\n", cdbInfoSIZE); if (strncmp(dbstat.oldtag, "CIDX", 4)==0) { //old dbstat structure -- convert it dbstat.num_keys=gcvt_uint(&dbstat.oldnum[0]); dbstat.num_records=gcvt_uint(&dbstat.oldnum[1]); dbstat.dbsize=gcvt_uint(&dbstat.old_dbsize); dbstat.idxflags = gcvt_uint(&dbstat.old_idxflags); //position on the dbnamelen entry dbstat.dbnamelen = gcvt_uint(&dbstat.old_dbnamelen); //GMessage("dbnamelen=%d\n", dbstat.dbnamelen); lseek(fd, -(off_t)(cdbInfoSIZE-4+dbstat.dbnamelen), SEEK_END); } else if (strncmp(dbstat.tag, "CDBX", 4)!=0) { GMessage("Error: this doesn't appear to be a cdbfasta created file!\n"); return 1; } else { // new CDBX type: dbstat.dbsize = gcvt_offt(&dbstat.dbsize); dbstat.num_keys=gcvt_uint(&dbstat.num_keys); dbstat.num_records=gcvt_uint(&dbstat.num_records); dbstat.idxflags = gcvt_uint(&dbstat.idxflags); //position on the dbnamelen entry dbstat.dbnamelen = gcvt_uint(&dbstat.dbnamelen); //GMessage("dbnamelen=%d\n", dbstat.dbnamelen); lseek(fd, -(off_t)(cdbInfoSIZE+dbstat.dbnamelen), SEEK_END); } GMALLOC(dbname, dbstat.dbnamelen+1); dbname[dbstat.dbnamelen]='\0'; r=read(fd, dbname, dbstat.dbnamelen); *fnameptr=dbname; if (r!=dbstat.dbnamelen) return 2; return 0; } int parse_int(char* buf, const char* key, int& e) { char* p, *q; while (e!=EOF && isspace(e)) { //skip any spaces if (e=='\n') GError(ERR_RANGEFMT, key); e=fgetc(stdin); } if (e==EOF) GError(ERR_RANGEFMT, key); //now e is the first non-space p=buf; q=p; while (e!=EOF && !isspace(e)) { *q=e; q++; e=fgetc(stdin); } *q='\0'; //now p is the starting coordinate string 
return atoi(p); //now the file pointer should be on the first space after the parsed value } int parse_int(char*& f, const char* key, int& e) { char* p, *q; char buf[16]; while (e!='\0' && isspace(e)) { //skip any spaces if (e=='\n') GError(ERR_RANGEFMT, key); f++; e=*f; } if (e=='\0') GError(ERR_RANGEFMT, key); //now e is the first non-space char p=buf; q=p; while (e!='\0' && !isspace(e)) { *q=e; q++; f++; e=*f; } *q='\0'; return atoi(p); //now f and e should be on the first space after the parsed value (or '\0') } GCdbYank::GCdbYank(const char* fidx, const char* recsep) { is_compressed=false; fd=-1; cdb=NULL; warnings=0; #ifdef ENABLE_COMPRESSION cdbz=NULL; #endif fdb=-1; fz=NULL; dbname=NULL; recdelim=Gstrdup(recsep); if (fidx==NULL) GError("GCdbYank Error: NULL index file name!"); idxfile=Gstrdup(fidx); cdb=new GCdbRead(idxfile); fd=cdb->getfd(); db_size=0; dbstat.dbsize=0; info_dbname=NULL; int r=read_dbinfo(fd, &info_dbname, dbstat); lseek(fd, 0, SEEK_SET); if (r==1) GError("This file does not seem to be a cdbfasta generated file.\n"); else if (r==2) GError("Error reading info chunk!\n"); /*try to find the database file rules: if given, only the -d given filename is used otherwise: 1) the same directory with the given index file(stripping the suffix) 2) the dbstat filepath/name stored by cdbfasta */ if (dbname==NULL) { char* p = rstrchr(idxfile, '.'); if (p!=NULL) { /*GError("%s\ncdbyank error: cannot use %s as an index file. 
When no -d is\n\ given, so the database file can be located in the same directory \n\ by removing the index file suffix (.cidx)\n", USAGE, idxfile);*/ int nlen=p-idxfile; char* namebuf=NULL; GMALLOC(namebuf, nlen+1); strncpy(namebuf, idxfile, nlen); namebuf[nlen]='\0'; if (fileExists(namebuf)) dbname=namebuf; } // strip the index file extenstion // 2) try the stored dbstat name if (dbname==NULL) { if (fileExists(info_dbname)) dbname=info_dbname; else GError("Cannot locate the database file for this index\n"); } }// database name not given is_compressed=(dbstat.idxflags & CDBMSK_OPT_COMPRESS); if (is_compressed) //try to open the dbname as a compressed file fz=fopen(dbname, "rb"); else fdb=open(dbname, O_RDONLY|O_BINARY); if (fdb==-1 && fz==NULL) GError("Error: cannot open database file %s\n",dbname); if (is_compressed) { #ifndef ENABLE_COMPRESSION GError(err_COMPRESSED); #else fclose(fz);//just to start fresh here //determine size: int ftmp = open(dbname, O_RDONLY|O_BINARY); if (ftmp == -1) GError("Error reopening db '%s'?\n",dbname); struct stat fdbstat; fstat(ftmp, &fdbstat); db_size=fdbstat.st_size; close(ftmp); //-------- reopen here cdbz=openCdbz(dbname); if (cdbz==NULL) GError("Error opening the cdbz file '%s'\n"); fz=cdbz->getZFile(); #endif } else { struct stat fdbstat; if (stat(dbname, &fdbstat)!=0) { perror("stat()"); exit(1); } db_size=fdbstat.st_size; } //abort if the database size was read and it doesn't match the cdbfasta stored size if (dbstat.dbsize>0 && dbstat.dbsize!=db_size) GError("Error: invalid %d database size - (%lld vs %lld) please rerun cdbfasta for '%s'\n", fdb, dbstat.dbsize, db_size, dbname); fastahandler=new GFastaCharHandler(recdelim); } //* GCdbYank constructor *// GCdbYank::~GCdbYank() { if (is_compressed) { fclose(fz); #ifdef ENABLE_COMPRESSION delete cdbz; #endif } else close(fdb); GFREE(info_dbname); delete fastahandler; GFREE(recdelim); GFREE(dbname); GFREE(idxfile); delete cdb; close(fd); } int GCdbYank::getRecord(const char* 
key, FastaSeq& rec, charFunc* seqCallBack) { //assumes fdb is open, cdb was created on the index file int r=cdb->find(key); if (r==0) return 0; if (r==-1) GError("cdbyank: error searching for key %s in %s\n", key, idxfile); /* while (r>0) { */ off_t pos = cdb->datapos(); //position of this key's record in the index file unsigned int len=cdb->datalen(); // length of this key's record char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length if (cdb->read(bytes,len,pos) == -1) GError("cdbyank: error at GCbd::read (%s)!\n", idxfile); off_t fpos; //this will be the fastadb offset uint32 reclen; //this will be the fasta record offset if (len>irec_size32) { //64 bit file offset was used fpos=gcvt_offt(bytes); reclen=gcvt_uint(&bytes[sizeof(uint32)<<1]); } else { //32bit offset used fpos=gcvt_uint(bytes); reclen=gcvt_uint(&bytes[sizeof(uint32)]); } //GMessage("reclen=%d\n", reclen); /* if (showQuery) fprintf(fout, "%c%s%c\t", delimQuery, key, delimQuery);*/ /*========= FETCHING RECORD CONTENT ======= */ if (is_compressed) { //for now: ignore special retrievals, just print the whole record #ifdef ENABLE_COMPRESSION return cdbz->decompress(rec, reclen, fpos, seqCallBack); #else GError(err_COMPRESSED); #endif } else { // not compressed -- position into the file and build an ad hoc GFastaFile lseek(fdb, fpos, SEEK_SET); // read it char by char and return it as output char c='\0'; int charsread=0; fastahandler->init(&rec, seqCallBack); while (reclen-- && read(fdb, &c, 1)==1) { fastahandler->processChar(c); charsread++; } fastahandler->done(); return charsread; } // not compressed /* if (many) r=cdb->findnext(key, strlen(key)); else r=0; } */ return 0; } off_t GCdbYank::getRecordPos(const char* key, uint32* record_len) { //assumes fdb is open, cdb was created on the index file int r=cdb->find(key); if (r==0 && warnings) { GMessage("cdbyank: key \"%s\" not found in %s\n", key, idxfile); return -1; } if (r==-1) GError("cdbyank: error searching for key 
%s in %s\n", key, idxfile); off_t pos = cdb->datapos(); //position of this key's record in the index file unsigned int len=cdb->datalen(); // length of this key's record char bytes[64]; // data buffer -- should just accomodate fastarec_pos, fastarec_length if (cdb->read(bytes,len,pos) == -1) GError("cdbyank: error at GCbd::read (%s)!\n", idxfile); off_t fpos; //this will be the fastadb offset uint32 rlen; //this will be the fasta record length if (len>irec_size32) { //64 bit file offset was used fpos=gcvt_offt(bytes); rlen=gcvt_uint(&bytes[offsetof(CIdxData, reclen)]); } else { //32bit offset used fpos=gcvt_uint(bytes); rlen=gcvt_uint(&bytes[offsetof(CIdxData32, reclen)]); } if (record_len!=NULL) *record_len=rlen; return fpos; } gclib-0.12.7/GCdbYank.h000066400000000000000000000036401407072766100144710ustar00rootroot00000000000000#ifndef _GCDBYANK_H #define _GCDBYANK_H #include "gcdb.h" #include #include "GFastaFile.h" // FastaSeq class and *charFunc() callback type #define DEF_CDBREC_DELIM ">" #ifdef ENABLE_COMPRESSION #include #define GCDBZ_SBUF_LEN 8192 #define GCDBZ_LBUF_LEN 8192*2 class GCdbZFasta { private: char* recdelim; char lbuf[GCDBZ_LBUF_LEN]; //larger buffer char sbuf[GCDBZ_SBUF_LEN]; //smaller buffer char* defline; //defline copy storage -- compression only int defline_cap; //currently allocated length of defline int defline_len; //currently used length of defline z_stream zstream; // de/compression stream FILE* zf; //compressed file long zpos; //current position in zf int zrecsize; // the size of the compressed record GFastaCharHandler* chrhandler; public: GCdbZFasta(FILE* af, int zrsize=0, char* r_delim=DEF_CDBREC_DELIM); ~GCdbZFasta(); FILE* getZFile() { return zf; } void decomp_start(int zrsize); void decomp_end(); int decompress(FastaSeq& rec, int csize=0, int zfofs=-1, charFunc* seqCallBack=NULL); // uncompress csize bytes from file zf, from optional file offset zfofs, // and send the uncompressed stream to callbackFn }; #endif class GCdbYank { 
char* idxfile; //char* dbfile; char* recdelim; //record delimiter -- typically ">" int warnings; bool is_compressed; char* dbname; char* info_dbname; off_t db_size; cdbInfo dbstat; GCdbRead* cdb; #ifdef ENABLE_COMPRESSION GCdbZFasta* cdbz; #endif int fdb; int fd; FILE* fz; // if compressed GFastaCharHandler* fastahandler; #ifdef ENABLE_COMPRESSION protected: GCdbZFasta* openCdbz(char* p); #endif public: GCdbYank(const char* fidx, const char* recsep=DEF_CDBREC_DELIM); ~GCdbYank(); int getRecord(const char* key, FastaSeq& rec, charFunc* seqCallBack=NULL); off_t getRecordPos(const char* key, uint32* record_len=NULL); char* getDbName() { return dbname; } }; #endif gclib-0.12.7/GFaSeqGet.cpp000066400000000000000000000267331407072766100151600ustar00rootroot00000000000000#include "GFaSeqGet.h" #include "gdna.h" #include GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, const char* seqid) { if (gfasta.fastaPath==NULL) return NULL; return gfasta.fetch(seqid); } void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto, uint maxseqlen) { if (sovl==0) { GFREE(sq); sqstart=sstart; uint max_len=(maxseqlen>0) ? maxseqlen : MAX_FASUBSEQ; sqlen = (slen==0 ? 
max_len : slen); GMALLOC(sq, sqlen); return; } //overlap -- copy the overlapping region char* newsq=NULL; GMALLOC(newsq, slen); memcpy((void*)&newsq[qto], (void*)&sq[qfrom], sovl); GFREE(sq); sq=newsq; sqstart=sstart; sqlen=slen; } void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) { fh=fopen(fn,"rb"); if (fh==NULL) { GError("Error (GFaSeqGet) opening file '%s'\n",fn); } fname=Gstrdup(fn); initialParse(fofs, validate); lastsub=new GSubSeq(); } GFaSeqGet::GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen):fname(NULL), fh(NULL), fseqstart(0), seq_len(0), line_len(0), line_blen(0), lastsub(NULL), seqname(NULL) { //for GFastaIndex use mostly -- the important difference is that //the file offset is to the sequence, not to the defline fh=fopen(faname,"rb"); if (fh==NULL) { GError("Error (GFaSeqGet) opening file '%s'\n",faname); } fname=Gstrdup(faname); line_len=l_len; line_blen=l_blen; seq_len=seqlen; if (line_blen fseqname(64); fseqname.DetachPtr(); //will not free the allocated memory fseqstart=fofs; int c=getc(fh); fseqstart++; if (c!='>') //fofs must be at the beginning of a FASTA record! 
GError("Error (GFaSeqGet): not a FASTA record?\n"); bool getName=true; while ((c=getc(fh))!=EOF) { fseqstart++; if (getName) { if (c<=32) getName=false; else //seqname.append((char)c); fseqname.Add((char)c); } if (c=='\n' || c=='\r') { break; } //end of defline } fseqname.Add('\0'); //terminate the string seqname=fseqname(); //takeover the string pointer if (c==EOF) GError(gfa_ERRPARSE); line_len=0; uint lendlen=0; while ((c=getc(fh))!=EOF) { if (c=='\n' || c=='\r') { //end of line encountered if (line_len>0) { //end of the first "sequence" line lendlen++; break; } else {// another EoL char at the end of defline fseqstart++; continue; } }// end-of-line characters line_len++; } //we are at the end of first sequence line while ((c=getc(fh))!=EOF) { if (c=='\n' || c=='\r') lendlen++; else { ungetc(c,fh); break; } } line_blen=line_len+lendlen; if (c==EOF) return; // -- you don't need to check it all if you're sure it's safe if (checkall) { //validate the rest of the FASTA records uint llen=0; //last line length uint elen=0; //length of last line ending bool waseol=true; while ((c=getc(fh))!=EOF) { if (c=='>' && waseol) { ungetc(c,fh); break; } if (c=='\n' || c=='\r') { // eol char elen++; if (waseol) continue; //2nd eol char waseol=true; elen=1; continue; } if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered //--- on a seq char here: if (waseol) {//beginning of a seq line if (elen && (llen!=line_len || elen!=lendlen)) //GError(gfa_ERRPARSE); GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\ the sequence lines have the same length (except for the last line)"); waseol=false; llen=0; elen=0; } llen++; } //while reading chars }// FASTA checking was requested fseeko(fh,fseqstart,SEEK_SET); } const char* GFaSeqGet::subseq(uint cstart, int& clen) { //cstart is 1-based genomic coordinate within current fasta sequence int maxlen=(seq_len>0)?seq_len : MAX_FASUBSEQ; //GMessage("--> call: subseq(%u, %d)\n", cstart, clen); if (clen>maxlen) { 
GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen); return NULL; } if (seq_len>0 && clen+cstart-1>seq_len) { //GMessage("Error (GFaSeqGet): end coordinate (%d) cannot be larger than sequence length %d\n", clen+cstart-1, seq_len); //Adjust request: clen=seq_len-cstart+1; } if (lastsub->sq==NULL || lastsub->sqlen==0) { lastsub->setup(cstart, clen, 0,0,0,seq_len); loadsubseq(cstart, clen); lastsub->sqlen=clen; return (const char*)lastsub->sq; } //allow extension up to MAX_FASUBSEQ uint bstart=lastsub->sqstart; uint bend=lastsub->sqstart+lastsub->sqlen-1; uint cend=cstart+clen-1; int qlen=0; //only the extra len to be allocated/appended/prepended uint qstart=cstart; //start coordinate of the new seq block of length qlen to be read from file int newlen=0; //the new total length of the buffered sequence lastsub->sq int kovl=0; int czfrom=0;//0-based offsets for copying a previously read sequence chunk int czto=0; uint newstart=cstart; if (cstart>=bstart && cend<=bend) { //new reg contained within existing buffer return (const char*) &(lastsub->sq[cstart-bstart]) ; } //extend downward uint newend=GMAX(cend, bend); if (cstartMAX_FASUBSEQ) { newlen=MAX_FASUBSEQ; newend=cstart+newlen-1; //keep newstart, set newend } qlen=bstart-cstart; if (newend>bstart) { //overlap if (newend>bend) {// new region is larger & around the old one - so we have two regions to update kovl=bend-bstart+1; czfrom=0; czto=bstart-cstart; lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc and copy the kovl subseq qlen=bstart-cstart; loadsubseq(newstart, qlen); qlen=newend-bend; int toread=qlen; loadsubseq(bend+1, qlen); clen-=(toread-qlen); lastsub->sqlen=clen; return (const char*)lastsub->sq; } //newend<=bend kovl=newend-bstart+1; } else { //no overlap with previous buffer if (newend>bend) kovl=bend-bstart+1; else kovl=newend-bstart+1; } qlen=bstart-cstart; czfrom=0; czto=qlen; } //cstart=bstart, possibly extend upwards newstart=bstart; 
newlen=(newend-newstart+1); if (newlen>MAX_FASUBSEQ) { newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend, assign newstart newlen=MAX_FASUBSEQ; if (newstart<=bend) { //overlap with old buffer kovl=bend-newstart+1; czfrom=newstart-bstart; czto=0; } else { //not overlapping old buffer kovl=0; } } //newstart reassigned else { //we can extend the buffer to include the old one qlen=newend-bend; //how much to read from file qstart=bend+1; kovl=bend-bstart+1; czfrom=0; czto=0; } } lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc but copy any overlapping region lastsub->sqlen-=qlen; //appending may result in a premature eof int toread=qlen; loadsubseq(qstart, qlen); //read the missing chunk, if any clen-=(toread-qlen); lastsub->sqlen+=qlen; return (const char*)(lastsub->sq+(cstart-newstart)); } char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) { if (cstart>cend) { Gswap(cstart, cend); } int clen=cend-cstart+1; const char* gs=subseq(cstart, clen); if (gs==NULL) return NULL; char* r=NULL; GMALLOC(r,clen+1); r[clen]=0; memcpy((void*)r,(void*)gs, clen); if (revCmpl) reverseComplement(r,clen); if (upCase) { for (int i=0;isq space allocated previously //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart] if (cstart>seq_len || lastsub->sqstart>cstart) { clen=0; //invalid request return NULL; } int eol_size=line_blen-line_len; char* seqp=lastsub->sq+(int)(cstart-lastsub->sqstart); //should be positive offset? //find the proper file offset and read the appropriate lines cstart--; //seq start offset, 0-based int lineofs = cstart % line_len; //file offset, relative to the first letter of the sequence in the file off_t f_start= ((int)(cstart/line_len))*line_blen + lineofs; uint letters_toread=clen; //actual sequence letters to read int maxlen=(seq_len>0)? 
seq_len-cstart : MAX_FASUBSEQ ; if (clen==0) letters_toread=maxlen; //read max allowed, or to the end of file uint c_end=cstart+letters_toread; //cstart+clen off_t f_end= ((int)(c_end/line_len))*line_blen + c_end % line_len; int bytes_toRead=f_end-f_start; f_start+=fseqstart; // file offset from the beginning of the file fseeko(fh, f_start, SEEK_SET); size_t actual_read=0; char* smem=NULL; GMALLOC(smem, bytes_toRead); actual_read=fread((void*)smem, 1, bytes_toRead, fh); if (actual_read==0) { //error reading any bytes from the file, or invalid request clen=0; return (const char*)seqp; } uint mp=0; //current read offset in smem uint sublen=0; //current sequence letter storage offset in seqp //copySeqOnly(seqp, smem, actualrlen); bool rdone=false; if (lineofs>0) { //read the partial first line uint reqrlen=line_len-lineofs; if (reqrlen>letters_toread) { reqrlen=letters_toread; //in case we need to read just a few chars rdone=true; } if (reqrlen>actual_read) { reqrlen=actual_read; //incomplete file read? 
rdone=true; } memcpy((void*)seqp, (void*)smem, reqrlen); if (rdone) { //eof reached prematurely GFREE(smem); clen=reqrlen; return (const char*)seqp; } letters_toread-=reqrlen; sublen+=reqrlen; mp+=reqrlen+eol_size; if (mp>actual_read) { GFREE(smem); clen=reqrlen; return (const char*)seqp; } }//loading first line //read the rest of the lines while (letters_toread>=line_len && mp+line_len=actual_read) { GFREE(smem); clen=sublen; return (const char*)seqp; } // read the last partial line, if any if (letters_toread>0) { if (mp+letters_toread>actual_read) letters_toread=actual_read-mp; if (letters_toread>0) { memcpy((void*)(&seqp[sublen]), (void*)(&smem[mp]), letters_toread); sublen+=letters_toread; } } //lastsub->sqlen+=sublen; GFREE(smem); clen=sublen; return (const char*)seqp; } gclib-0.12.7/GFaSeqGet.h000066400000000000000000000247071407072766100146240ustar00rootroot00000000000000#ifndef GFASEQGET_H #define GFASEQGET_H #include "GFastaIndex.h" #define MAX_FASUBSEQ 0x20000000 //max 512MB sequence data held in memory at a time class GSubSeq { public: uint sqstart; //1-based coord of subseq start on sequence uint sqlen; //length of subseq loaded char* sq; //actual subsequence data will be stored here // (with end-of-line characters removed) /*char* xseq; //the exposed pointer to the last requested subsequence start off_t xstart; //the coordinate start for the last requested subseq off_t xlen; //the last requested subseq len*/ GSubSeq() { sqstart=0; sqlen=0; sq=NULL; /* xseq=NULL; xstart=0; xlen=0;*/ } void forget() { //forget about pointer data, so we can reuse it sq=NULL; sqstart=0; sqlen=0; } ~GSubSeq() { GFREE(sq); } // genomic, 1-based coordinates: void setup(uint sstart, int slen, int sovl=0, int qfrom=0, int qto=0, uint maxseqlen=0); //check for overlap with previous window and realloc/extend appropriately //returns offset from seq that corresponds to sstart // the window will keep extending until MAX_FASUBSEQ is reached }; // class GFaSeqGet { char* fname; //file 
name where the sequence resides FILE* fh; off_t fseqstart; //file offset where the sequence actually starts uint seq_len; //total sequence length, if known (when created from GFastaIndex) uint line_len; //length of each line of text uint line_blen; //binary length of each line // = line_len + number of EOL character(s) GSubSeq* lastsub; void initialParse(off_t fofs=0, bool checkall=true); const char* loadsubseq(uint cstart, int& clen); void finit(const char* fn, off_t fofs, bool validate); public: //GStr seqname; //current sequence name char* seqname; GFaSeqGet(): fname(NULL), fh(NULL), fseqstart(0), seq_len(0), line_len(0), line_blen(0), lastsub(NULL), seqname(NULL) { } GFaSeqGet(const char* fn, off_t fofs, bool validate=false):fname(NULL), fh(NULL), fseqstart(0), seq_len(0), line_len(0), line_blen(0), lastsub(NULL), seqname(NULL) { finit(fn,fofs,validate); } GFaSeqGet(const char* fn, bool validate=false):fname(NULL), fh(NULL), fseqstart(0), seq_len(0), line_len(0), line_blen(0), lastsub(NULL), seqname(NULL) { finit(fn,0,validate); } GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen); //constructor from GFastaIndex record GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false); ~GFaSeqGet() { if (fname!=NULL) { GFREE(fname); fclose(fh); } GFREE(seqname); delete lastsub; } const char* seq(uint cstart=1, int clen=0) { int cend = clen==0 ? 
0 : cstart+clen-1; return getRange(cstart, cend); } const char* subseq(uint cstart, int& clen); const char* getRange(uint cstart=1, uint cend=0) { if (cend==0) cend=(seq_len>0)?seq_len : MAX_FASUBSEQ; if (cstart>cend) { Gswap(cstart, cend); } int clen=cend-cstart+1; //int rdlen=clen; return subseq(cstart, clen); } //caller is responsible for deallocating the return string char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false); //uncached, read and return allocated buffer //caller is responsible for deallocating the return string char* fetchSeq(int* retlen=NULL) { int clen=(seq_len>0) ? seq_len : MAX_FASUBSEQ; if (lastsub) { delete lastsub; lastsub=NULL; } subseq(1, clen); if (retlen) *retlen=clen; char* r=lastsub->sq; lastsub->forget(); if (clen>0) { r[clen]=0; } else { r=NULL; } return r; } void loadall(uint32 max_len=0) { //TODO: better read the whole sequence differently here - line by line //so when EOF or another '>' line is found, the reading stops! int clen=(seq_len>0) ? seq_len : ((max_len>0) ? max_len : MAX_FASUBSEQ); subseq(1, clen); } void load(uint cstart, uint cend) { //cache as much as possible if (seq_len>0 && cend>seq_len) cend=seq_len; //correct a bad request int clen=cend-cstart+1; subseq(cstart, clen); } int getsublen() { return lastsub!=NULL ? 
lastsub->sqlen : 0 ; } int getseqlen() { return seq_len; } //known when loaded with GFastaIndex off_t getseqofs() { return fseqstart; } int getLineLen() { return line_len; } int getLineBLen() { return line_blen; } //reads a subsequence starting at genomic coordinate cstart (1-based) }; //multi-fasta sequence handling class GFastaDb { public: char* fastaPath; GFastaIndex* faIdx; //could be a cdb .cidx file //int last_fetchid; const char* last_seqname; GFaSeqGet* faseq; //GCdbYank* gcdb; GFastaDb(const char* fpath=NULL, bool forceIndexFile=true):fastaPath(NULL), faIdx(NULL), last_seqname(NULL), faseq(NULL) { //gcdb=NULL; init(fpath, forceIndexFile); } void init(const char* fpath, bool writeIndexFile=true) { if (fpath==NULL || fpath[0]==0) return; //last_fetchid=-1; last_seqname=NULL; if (!fileExists(fpath)) GError("Error: file/directory %s does not exist!\n",fpath); fastaPath=Gstrdup(fpath); //GStr gseqpath(fpath); if (fileExists(fastaPath)>1) { //exists and it's not a directory char* fainame=Gstrdup(fastaPath,4); int fainamelen=strlen(fainame); //int fainame_len=strlen(fainame); if (trimSuffix(fastaPath, ".fai")) { //.fai index file given directly if (!fileExists(fastaPath)) GError("Error: cannot find fasta file for index %s !\n", fastaPath); } else { //append .fai as needed strcpy(fainame+fainamelen, ".fai"); fainamelen+=4; } //GMessage("creating GFastaIndex with fastaPath=%s, fainame=%s\n", fastaPath, fainame.chars()); faIdx=new GFastaIndex(fastaPath, fainame); char* fainamecwd=fainame; //will hold just the file name without the path char* plast=strrchr(fainamecwd, '/'); //CHPATHSEP if (plast!=NULL) { fainamecwd=plast+1; //point to the file name only } if (!faIdx->hasIndex()) { //could not load index file .fai //try current directory (Warning: might not be the correct index for that file!) 
if (plast==NULL) { if (fileExists(fainamecwd)>1) { faIdx->loadIndex(fainamecwd); } } } //tried to load index if (!faIdx->hasIndex()) { //no index file to be loaded, build the index //if (forceIndexFile) // GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath); faIdx->buildIndex(); //build index in memory only if (faIdx->getCount()==0) GError("Error: no fasta records found!\n"); if (writeIndexFile) { //GMessage("Fasta index rebuilt.\n"); FILE* fcreate=fopen(fainame, "w"); char* idxfname=fainame; if (fcreate==NULL) { GMessage("Warning: cannot create fasta index file %s! (permissions?)\n", fainame); if (fainame!=fainamecwd) { //try cwd idxfname=fainamecwd; GMessage(" Attempting to create the index in the current directory..\n"); if ((fcreate=fopen(fainamecwd, "w"))==NULL) GError("Error: cannot create fasta index file %s!\n", fainamecwd); } } if (fcreate!=NULL) { if (faIdx->storeIndex(fcreate)getCount()) GMessage("Warning: error writing the index file %s!\n", idxfname); else GMessage("FASTA index file %s created.\n", idxfname); } } //file storage of index requested } //creating FASTA index GFREE(fainame); } //multi-fasta file } GFaSeqGet* fetchFirst(const char* fname, bool checkFasta=false) { faseq=new GFaSeqGet(fname, checkFasta); faseq->loadall(); //last_fetchid=gseq_id; GFREE(last_seqname); last_seqname=Gstrdup(faseq->seqname); return faseq; } char* getFastaFile(const char* gseqname) { if (fastaPath==NULL) return NULL; int gnl=strlen(gseqname); char* s=Gstrdup(fastaPath, gnl+8); int slen=strlen(s); if (s[slen-1]!='/') {//CHPATHSEP ? 
s[slen]='/'; slen++; s[slen]='\0'; } //s.append(gseqname); strcpy(s+slen, gseqname); slen+=gnl; if (!fileExists(s)) { //s.append(".fa") strcpy(s+slen, ".fa"); slen+=3; } if (!fileExists(s)) { strcpy(s+slen, "sta"); slen+=3; } if (fileExists(s)) return Gstrdup(s); else { GMessage("Warning: cannot find genomic sequence file %s/%s{.fa,.fasta}\n",fastaPath, s); return NULL; } GFREE(s); } GFaSeqGet* fetch(const char* gseqname) { if (fastaPath==NULL) return NULL; if (last_seqname!=NULL && (strcmp(gseqname, last_seqname)==0) && faseq!=NULL) return faseq; delete faseq; faseq=NULL; //last_fetchid=-1; GFREE(last_seqname); last_seqname=NULL; //char* gseqname=GffObj::names->gseqs.getName(gseq_id); if (faIdx!=NULL) { //fastaPath was the multi-fasta file name and it must have an index GFastaRec* farec=faIdx->getRecord(gseqname); if (farec!=NULL) { faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos, farec->line_len, farec->line_blen); faseq->loadall(); //just cache the whole sequence, it's faster //last_fetchid=gseq_id; last_seqname=Gstrdup(gseqname); } else { GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname); return NULL; } } else { //directory with FASTA files named as gseqname char* sfile=getFastaFile(gseqname); if (sfile!=NULL) { faseq=new GFaSeqGet(sfile); faseq->loadall(); //last_fetchid=gseq_id; GFREE(sfile); } } //one fasta file per contig //else GMessage("Warning: fasta index not available, cannot retrieve sequence %s\n", // gseqname); return faseq; } ~GFastaDb() { GFREE(fastaPath); GFREE(last_seqname); //delete gcdb; delete faIdx; delete faseq; } }; GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, const char* seqid); #endif gclib-0.12.7/GFastaFile.h000066400000000000000000000573441407072766100150260ustar00rootroot00000000000000#ifndef GFASTAFILE_H #define GFASTAFILE_H #include "GBase.h" #define CAPINC 64 #define SEQCAPINC 256 #define DEF_FASTA_DELIM (char*)">" class GFastaCharHandler; class GFastaFile; class FastaSeq { /* fasta record storage */ friend 
GFastaCharHandler; friend GFastaFile; protected: int id_cap; /* allocated size of the sequence name string*/ int namelen; // real length of seq name int d_cap; /* allocated size of the description */ int descrlen; /* real length of the description */ //-------actual sequence : int s_cap; /* allocated length of the sequence string */ public: int len; /* the actual string length of seq */ char *id; /* id only, up to first space */ char *descr; /* any comment on the defline, after the first space */ char* seq; /* the sequence buffer itself */ protected: //---- void detach() { //when pointers are taken over by another object //clear and forget id=NULL; descr=NULL; seq=NULL; init(); } void init(const char* cname, const char* cdescr=NULL, const char* cseq=NULL, int sbeg=-1, int send=-1) { //Warning: sbeg and send are 0-based! int l=0; if (cname==NULL) { GMALLOC(id, CAPINC); id_cap=CAPINC; namelen=0; id[0]='\0'; GMALLOC(descr, CAPINC); } else { l=strlen(cname); GMALLOC(id, l+1);strcpy(id,cname); id_cap=l+1; namelen=l; } if (cdescr==NULL) { GMALLOC(descr, CAPINC); descr[0]='\0'; d_cap=CAPINC; descrlen=0; } else {//copy given description l=strlen(cdescr); GMALLOC(descr, l+1); strcpy(descr,cdescr); d_cap=l+1; descrlen=l; } if (cseq==NULL) { GMALLOC(seq, SEQCAPINC); seq[0]='\0'; len=0; s_cap=SEQCAPINC; } else { //sequence given if (sbeg>=0) { //sequence range given if (send<0) send=strlen(cseq)-1; len=send-sbeg+1; if (len>0) { s_cap=len+1; GMALLOC(seq, s_cap); strncpy(seq, cseq+sbeg, len); seq[len]=0; } else { //null range GMALLOC(seq, SEQCAPINC); seq[0]='\0'; len=0; s_cap=SEQCAPINC; } } else {// copy whole cseq l=strlen(cseq); GMALLOC(seq, l+1); strcpy(seq,cseq); len=l; s_cap=l+1; } } } //init(alldata, range) void init(int seqalloc=0) { //ntCompTableInit(); GMALLOC(id, CAPINC); id_cap=CAPINC; namelen=0; id[0]='\0'; GMALLOC(descr, CAPINC); descr[0]='\0'; d_cap=CAPINC; descrlen=0; if (seqalloc<=0) { s_cap=SEQCAPINC; GMALLOC(seq, SEQCAPINC); } else { s_cap=seqalloc; 
GMALLOC(seq, seqalloc); } seq[0]='\0'; len=0; } public: FastaSeq(const char* cname, const char* cdescr=NULL, const char* bseq=NULL,int bseq_len=0) { if (bseq_len>0) init(cname, cdescr, bseq, 0, bseq_len-1); else init(cname, cdescr, bseq); } FastaSeq(int seqalloc=0) { init(seqalloc); } //copy constructor: FastaSeq(const FastaSeq& fa,int sbeg=-1,int send=-1) { if (sbeg<0) { sbeg=0; send=fa.len-1; } else if (send<0) send=fa.len-1; if (send>fa.len-1) send=fa.len-1; init(fa.id, fa.descr, fa.seq, sbeg, send); } FastaSeq(FastaSeq& fa, bool takeover) { if (takeover) { id_cap=fa.id_cap; id=fa.id; namelen=fa.namelen; descr=fa.descr; d_cap=fa.d_cap; descrlen=fa.descrlen; s_cap=fa.s_cap; len=fa.len; seq=fa.seq; fa.detach(); }else { init(fa.id, fa.descr, fa.seq); } } void clear() { GFREE(id);id_cap=0;namelen=0;id=NULL; GFREE(descr);d_cap=0;descrlen=0;descr=NULL; GFREE(seq);s_cap=0;len=0;seq=NULL; } ~FastaSeq() { clear(); } int getNameLen() { return namelen; } const char* getName() { return (const char*) id; } const char* name() { return (const char*) id; } const char* getSeqName() { return (const char*) id; } const char* getId() { return (const char*) id; } const char* getDescr() { return (const char*) descr; } int getDescrLen() { return descrlen; } const char* getSeq() { return (const char*) seq; } int getSeqLen() { return len; } void extendId(char c) { if (namelen+1 >= id_cap) { id_cap += CAPINC; GREALLOC(id, id_cap); } id[namelen]= c; namelen++; } void extendSeqName(char c) { extendId(c); } void extendName(char c) { extendId(c); } void extendDescr(char c) { if (descrlen+1 >= d_cap) { d_cap += CAPINC; GREALLOC(descr, d_cap); } descr[descrlen]= c; descrlen++; } void endId() { id[namelen]=0; } void endName() { id[namelen]=0; } void endSeqName() { id[namelen]=0; } void endDescr() { descr[descrlen]=0; } void endSeq() { seq[len]=0; } void extendSeq(char c) { if (len+1 >= s_cap) { s_cap += SEQCAPINC; GREALLOC(seq, s_cap); } seq[len]= c; len++; } void compactIdMem() { if (namelen>0) 
{ GREALLOC(id, namelen+1); id_cap=namelen+1; } } void compactDescrMem() { if (descrlen>0) { GREALLOC(descr, descrlen+1); d_cap=descrlen+1; } } void compactSeqMem() { if (len>0) { GREALLOC(seq, len+1); s_cap=len+1; } } void compactMem() { compactIdMem(); compactDescrMem(); compactSeqMem(); } char* detachSeqPtr() { //such that the sequence allocated memory is no longer // freed when the FastaSeq object is destroyed // the returned pointer MUST be deallocated by the the user, later! char* p=seq; GMALLOC(seq, SEQCAPINC); s_cap=SEQCAPINC; len=0; return p; } char* setSeqPtr(char* newseq, int newlen=0, int newcap=0) { if (newlen==0) newlen=strlen(newseq); if (newcap<=newlen) newcap=newlen+1; GFREE(seq); seq=newseq; len=newlen; s_cap=newcap; return seq; } void reset() {// allocated space remains the same! namelen=0;id[0]=0; descrlen=0;descr[0]=0; len=0;seq[0]=0; } /* //reverse-complement a nucleotide sequence: // -- requires gdna.h void reverseComplement() { if (len==0) return; //ntCompTableInit(); reverseChars(seq,len); for (int i=0;i0) fprintf(fout, "%s %s\n", id, descr); else fprintf(fout, ">%s\n", id); } int l=len; char* p=seq; while (l>0) { int to_write=GMIN(line_len, l); fwrite(p,1,to_write,fout); fprintf(fout,"\n"); p+=line_len; l-=line_len; } } // static void write(FILE *fh, const char* seqid, const char* descr, char* seq, const int linelen=70, const int seqlen=0) { writeFasta(fh, seqid, descr, seq, linelen, seqlen); //from GBase.cpp } }; typedef int charFunc(char c, int pos, FastaSeq* fseq); //char processing function /* passes: c = current sequence character (generally aminoacid or nucleotide) pos = 0-based coordinate of the given character within the sequence fseq = FastaSeq pointer (useful for retrieving sequence defline info) the return value is not used yet */ //(for reading/writing variable length records, etc.) 
enum fileMode { fmRead, fmWrite }; class GFastaFile { char* fname; FILE* fh; fileMode fmode; long int rec_fpos; //the input stream offset of the current record to be read long int cur_fpos; //the input stream offset of the current byte to be read uint seqcoord; //1-based coordinate of the current record's sequence reading position //(updated by getSeqRange() mostly) protected: void bad_fastafmt() { GError("Error parsing file '%s'. Not a Fasta file?\n", fname); } void check_eof(int c) { if (c == EOF) bad_fastafmt(); } public: GFastaFile(const char* filename, fileMode filemode=fmRead) { fh=NULL; cur_fpos=0; rec_fpos=0; fmode=filemode; seqcoord=0; const char *mode=(filemode==fmRead) ? "rb" : "wb"; if (filename == NULL || filename[0]=='\0') { fh = (filemode == fmRead) ? stdin : stdout; fname=NULL; } else { if ((fh = fopen(filename, mode)) == NULL) GError("Cannot open file '%s'!", filename); fname=Gstrdup(filename); } /* GCALLOC(curseqid, CAPINC); curseqidlen=CAPINC; GCALLOC(curdescr, CAPINC); curdescrlen=CAPINC;*/ } //attach a GFastaFile object to an already open handle GFastaFile(FILE* fhandle, fileMode filemode=fmRead, const char* filename=NULL) { fh=fhandle; cur_fpos=ftell(fh); fmode=filemode; rec_fpos=cur_fpos; seqcoord=0; if (filename == NULL || filename[0]=='\0') { fname=NULL; } else fname=Gstrdup(filename); } void reset() { if (fh!=NULL && fh!=stdout && fh!=stdin) { fseeko(fh,0L, SEEK_SET); cur_fpos=0; rec_fpos=0; } else GError("Cannot use GFastaFile::reset() on stdin, stdout or NULL handles.\n"); } void seek(int pos) { if (fh!=NULL && fh!=stdout && fh!=stdin) { fseeko(fh, pos, SEEK_SET); cur_fpos=pos; seqcoord=0; //seqcoord agnostic after a seek } else GError("Cannot use GFastaFile::seek() on stdin, stdout or NULL handles.\n"); } ~GFastaFile() { if (fh!=NULL && fh!=stdout && fh!=stdin) fclose(fh); fh=NULL; GFREE(fname); /*GFREE(curseqid); GFREE(curdescr);*/ } int getReadPos() { return cur_fpos; } /* returns current read position in the input stream (can be used 
within callback) */ int ReadSeqPos() {return rec_fpos; } /* returns the input stream offset of the last fasta record processed by getFastaSeq*/ bool readHeader(FastaSeq& seq) { return (readHeader(&seq)!=NULL); } FastaSeq* readSeq(int seqalloc=0) { //allocate a new FastaSeq, reads the next record and returns it //caller is responsible for deallocating returned FastaSeq memory! FastaSeq* r=readHeader(NULL, seqalloc); int len=0; int c=-1; //load the whole sequence in FastaSeq while ((c = getc(fh)) != EOF && c != '>') { cur_fpos++; //if (isspace(c) || c<31) if (c<=32) { //before = (c=='\n' || c=='\r')?1:0; continue; /* skip spaces */ } if (len >= r->s_cap-1) { GREALLOC(r->seq, r->s_cap + SEQCAPINC); r->s_cap+=SEQCAPINC; } r->seq[len] = c; //before=0; len++; } r->seq[len] = '\0'; r->len=len; return r; } FastaSeq* readHeader(FastaSeq* seq=NULL, int seqalloc=0, const char* trim_delim=NULL) { /* reads the Fasta sequence header the first character must be '>' for this call, after any spaces, if seq is NULL a new FastaSeq object is allocated and returned, otherwise id and descr are updated */ seqcoord=0; int* buflen; int* buflenstr; char** buf; int before; if (feof(fh)) return NULL; int c = getc(fh); if (c==EOF) return NULL; cur_fpos++; while (c!=EOF && c<=32) { c=getc(fh); cur_fpos++; }//skip spaces etc. 
if (c == EOF) return NULL; if (c != '>') bad_fastafmt(); if (seq==NULL) seq=new FastaSeq(seqalloc); else { seq->clear(); seq->init(seqalloc); } int len = 0; //chars accumulated so far buflen=&(seq->id_cap); buf=&(seq->id); buflenstr=&(seq->namelen); before=1; //before seq ID was completed bool trim_done=false; int dt_len=0; //only set if defline trim was requested int dt_match=0; // trim_delim match char idx+1 while ((c = getc(fh)) != EOF) { cur_fpos++; if (c=='\n' || c=='\r') break; if (trim_done) continue; if (len >= *buflen-1) { GREALLOC(*buf, *buflen + CAPINC); *buflen+=CAPINC; } if (before) {//seq ID parsing if (c<=32) { // space encountered => seq_name finished before=0; (*buf)[len]='\0'; *buflenstr=len; buf=&seq->descr; buflen=&seq->d_cap; buflenstr=&seq->descrlen; len=0; if (trim_delim!=NULL) { //trimming the defline was requested dt_len=strlen(trim_delim); if (c<32 && c==trim_delim[0]) { trim_done=true; continue; } } //if (c!=1) // special case, nrdb concatenation continue; // skip this "space" } } else { //seq description parsing if (c<32 && dt_len>0 && c==trim_delim[0]) { //end the defline parsing here (e.g. nrdb concatenation) trim_done=true; continue; } if (dt_len) { if (dt_match==0) { if (c==trim_delim[0]) { //quick way out? if (dt_len==1) { trim_done=true; continue; } dt_match=1; } } else if (c==trim_delim[dt_match-1]) { if (dt_match==dt_len) { len-=dt_len-1; trim_done=true; continue; } dt_match++; } else dt_match=0; //cancel this match attempt } //trim delimiter matching } (*buf)[len]=c; len++; } //while reading defline (*buf)[len]='\0'; /* terminate the comment string */ *buflenstr = len; check_eof(c); /* it's wrong to have eof here */ seqcoord=1; return (seq->namelen==0) ? 
NULL : seq; } FastaSeq *getFastaSeq(bool& is_last, FastaSeq* seq, charFunc* callbackFn = NULL, const char* trim_descr_at=NULL) { /* seq must be a pointer to a initialized FastaSeq structure if seq is NULL, the sequence is not actually read, but just skipped and the file pointer set accordingly, while the returned "pointer" will not be a FastaSeq one but just NULL or not NULL (depending if eof was encountered) if callbackFn is NULL, the sequence is read entirely in memory in a FastaSeq.seq field otherwise only the defline is parsed into FastaSeq::id and FastaSeq::descr but actual sequence letters are passed one by one to the callback function and the actual sequence is never stored in memory (unless the callback does it) The optional trim_descr_at is there for nrdb-collapsed entries, it's supposed to discard any text from the description line following the trim_descr_at string; special case for "\x01" string: ^A delimiter */ int c, len; int before; rec_fpos=cur_fpos; len = 0; //chars accumulated so far if (fh==NULL || feof(fh)) return NULL; // -------- read the defline first if (seq==NULL) { // navigate only! don't read/parse anything but the record delimiter before=1; while ((c = getc(fh)) != EOF && c != '\n' && c !='\r') cur_fpos++; // skip defline if (c==EOF && cur_fpos<=rec_fpos+2) return NULL; check_eof(c); /* it's wrong to have eof here! */ cur_fpos++; //to account for the '\n' read /*----- read the sequence now: */ before=1; /* "newline before" flag */ while ((c = getc(fh)) != EOF && c != '>') { cur_fpos++; before = (c=='\n' || c=='\r') ? 
1 : 0; } //we should end up at a '>' character here, or EOF } /* fasta fmt navigation to next sequence, no seq storage */ else { // sequence storage: if (!readHeader(seq, 0, trim_descr_at)) { is_last=true; return NULL; } /*----- read the actual sequence now: */ len=0; before=1; //newline before indicator if (callbackFn==NULL) { //load the whole sequence in FastaSeq while ((c = getc(fh)) != EOF && c != '>') { cur_fpos++; //if (isspace(c) || c<31) if (c<=32) { before = (c=='\n' || c=='\r')?1:0; continue; /* skip spaces */ } if (len >= seq->s_cap-1) { GREALLOC(seq->seq, seq->s_cap + CAPINC); seq->s_cap+=CAPINC; } seq->seq[len] = c; before=0; len++; } seq->seq[len] = '\0'; seq->len=len; } /* sequence storage */ else { //use the callback for each letter, do not store the whole sequence in FastaSeq while ((c = getc(fh)) != EOF && c != '>') { cur_fpos++; if (c<=32) { before = (c=='\n' || c=='\r')?1:0; continue; /* skip spaces within sequence*/ } (*callbackFn)(c, len, seq); //call the user function for each letter before=0; len++; } seq->len=len; } /* callback sequence reading (no storage)*/ } /* sequence parsing */ if (c=='>') { if (!before) bad_fastafmt(); /* '>' must only be at start of line, never within the sequence ! */ is_last=false; /* FALSE - not the last one */ ungetc(c, fh); } else is_last=true; /* TRUE - eof() here */ return ((seq==NULL) ? (FastaSeq*)fh : seq); //alwayws return non NULL here! } //getFastaSeq //simplified call to ignore the is_last flag FastaSeq *getFastaSeq(FastaSeq* seq, charFunc* callbackFn = NULL) { bool b; if (fh==NULL || feof(fh)) return NULL; return getFastaSeq(b, seq, callbackFn); } FastaSeq *getFastaSeq(FastaSeq* seq, const char* trim_descr_at) { //special case to trim nrdb collapsed descriptions bool b; if (fh==NULL || feof(fh)) return NULL; return getFastaSeq(b, seq, NULL, trim_descr_at); } uint seqSkip(uint slen, int& c){ //assumes the header was read ! 
//skip exactly slen characters in the actual aa or nt sequence //(spaces are not counted) uint skipacc=0; while (skipacc=seqcoord (i.e. when sequence ranges are read sequentially) if rcoord>=seqcoord assumes the header has been read already! Returns the actual length of the sequence returned (0 if rcoord>seq_length) and updates seqcoord, cur_fpos accordingly (rec_fpos is unchanged) */ uint getSeqRange(FastaSeq& seq, uint rcoord, uint rlen=0) { int c; uint len; rec_fpos=cur_fpos; if (!seqcoord || seqcoord>rcoord) { // slow -- go back to the beginning of the record seek(rec_fpos); readHeader(&seq); //this will also reset seqcoord to 1 } if (rcoord!=seqcoord) { seqSkip(rcoord-seqcoord, c); check_eof(c); if (c=='>') GError("Error: '>' character found while skipping through sequence!\n"); } len = 0; //chars accumulated so far seq.seq[0]='\0'; seq.len=0; //----- read the actual subsequence now: len=0; while ((c = getc(fh)) != EOF && c != '>') { cur_fpos++; if (c<=32) continue; // skip spaces if (len >= (uint) (seq.s_cap-1)) { GREALLOC(seq.seq, seq.s_cap + CAPINC); seq.s_cap+=CAPINC; } seq.seq[len] = c; len++; seqcoord++; if (rlen>0 && len==rlen) break; } seq.seq[len] = '\0'; seq.len=len; if (c=='>') bad_fastafmt(); /* '>' must only be at start of line, never within the sequence ! 
*/ return len; } //getSeqRange //only for writing void putFastaSeq(FastaSeq *fa, const int linelen=60) { writeFasta(fh, fa->id, fa->descr, fa->seq, linelen); } /* static void writeFasta(FILE *fh, char* seqid, char* descr, char* seq, const int linelen=60, const int seqlen=0) { FastaSeq::write(fh, seqid, descr, seq, linelen, seqlen); } */ }; // ------------- FASTA parser/handler ---- // REQUIRES the first character processed after init() // to be the first character of the record delimiter // (default: ">") class GFastaCharHandler { protected: char* recdelim; charFunc* seqCallBack; bool in_delim; int delim_pos; bool in_seqname; bool in_descr; bool in_seq; FastaSeq* rec; unsigned int seq_pos; void reset() { in_delim=true; delim_pos=0; in_seqname=false; in_descr=false; in_seq=false; seq_pos=0; } public: GFastaCharHandler(char* recdel=DEF_FASTA_DELIM) { reset(); rec=NULL; recdelim=recdel; seqCallBack=NULL; } GFastaCharHandler(charFunc* chrCallBack, FastaSeq* r=NULL, char* recdel=DEF_FASTA_DELIM) { reset(); rec=r; recdelim=recdel; seqCallBack=chrCallBack; if (rec!=NULL) rec->reset(); } void init() { init(rec, seqCallBack); } void init(charFunc* chrCallBack) { init(rec,chrCallBack); } void init(FastaSeq* r) { init(r,seqCallBack); } void init(FastaSeq* r, charFunc* chrCallBack) { rec=r; seqCallBack=chrCallBack; if (rec==NULL) GError("GFastaCharHandler::init() Error: cannot use NULL FastaSeq!\n"); rec->reset(); reset(); } void done() { if (rec==NULL) GError("GFastaCharHandler::done() Error: cannot use NULL FastaSeq!\n"); rec->endId(); rec->endDescr(); rec->endSeq(); } //~GFastaCharHandler(); void processChar(char c) { if (in_delim) { //skip record delimiter -- but it must be there! 
if (recdelim[delim_pos]!=c) {//the only way to detect an Id starting in_seqname=true; in_delim=false; } delim_pos++; } if (in_seqname) { if (rec->namelen>0 && c<=32) { //breaking out of seq_name rec->endId(); if (c=='\n' || c=='\r') { //end defline in_seqname=false; in_seq=true; } else { //seqname break, not defline end in_seqname=false; in_descr=true; } } // seqname termination else { //seqname continues if (c>32) rec->extendId(c); } return; } // in_seqname if (in_descr) { if (c=='\n' || c=='\r') { //end defline rec->endDescr(); in_descr=false; in_seq=true; } else rec->extendDescr(c); return; } // in_descr if (in_seq && c>32) { seq_pos++; // 1-based sequence position ! if (seqCallBack==NULL) rec->extendSeq(c); else (*seqCallBack)(c,seq_pos,rec); } } }; #endif gclib-0.12.7/GFastaIndex.cpp000066400000000000000000000133641407072766100155430ustar00rootroot00000000000000/* * GFastaIndex.cpp * * Created on: Aug 25, 2010 * Author: gpertea */ #include "GFastaIndex.h" #define ERR_FAIDXLINE "Error parsing fasta index line: \n%s\n" #define ERR_FALINELEN "Error: sequence lines in a FASTA record must have the same length!\n" void GFastaIndex::addRecord(const char* seqname, uint seqlen, off_t foffs, int llen, int llen_full) { GFastaRec* farec=records.Find(seqname); if (farec!=NULL) { GMessage("Warning: duplicate sequence ID (%s) added to the fasta index! 
Only last entry data will be kept.\n"); farec->seqlen=seqlen; farec->fpos=foffs; farec->line_len=llen; farec->line_blen=llen_full; } else { farec=new GFastaRec(seqlen,foffs,llen,llen_full); records.Add(seqname,farec); farec->seqname=records.getLastKey(); } } int GFastaIndex::loadIndex(const char* finame) { //load record info from existing fasta index if (finame==NULL) finame=fai_name; if (finame!=fai_name) { fai_name=Gstrdup(finame); } if (fai_name==NULL) GError("Error: GFastaIndex::loadIndex() called with no file name!\n"); records.Clear(); haveFai=false; FILE* fi=fopen(fai_name,"rb"); if (fi==NULL) { GMessage("Warning: cannot open fasta index file: %s!\n",fai_name); return 0; } GLineReader fl(fi); char* s=NULL; while ((s=fl.nextLine())!=NULL) { if (*s=='#') continue; char* p=strchrs(s,"\t "); if (p==NULL) GError(ERR_FAIDXLINE,s); *p=0; //s now holds the genomic sequence name p++; uint len=0; int line_len=0, line_blen=0; #ifdef _WIN32 long offset=-1; sscanf(p, "%d%ld%d%d", &len, &offset, &line_len, &line_blen); #else long long offset=-1; sscanf(p, "%u%lld%d%d", &len, &offset, &line_len, &line_blen); #endif if (len==0 || line_len==0 || line_blen==0 || line_blen0); return records.Count(); } int GFastaIndex::buildIndex() { //this parses the whole fasta file, so it could be slow for large files //builds the index in memory only if (fa_name==NULL) GError("Error: GFastaIndex::buildIndex() called with no fasta file!\n"); FILE* fa=fopen(fa_name,"rb"); if (fa==NULL) { GMessage("Warning: cannot open fasta index file: %s!\n",fa_name); return 0; } records.Clear(); GLineReader fl(fa); char* s=NULL; uint seqlen=0; int line_len=0,line_blen=0; bool newSeq=false; //set when FASTA header is encountered off_t newSeqOffset=0; //int prevOffset=0; char* seqname=NULL; int last_len=0; bool mustbeLastLine=false; //true if the line length decreases while ((s=fl.nextLine())!=NULL) { if (s[0]=='>') { if (seqname!=NULL) { if (seqlen==0) GError("Warning: empty FASTA record skipped 
(%s)!\n",seqname); else { //seqlen!=0 addRecord(seqname, seqlen,newSeqOffset, line_len, line_blen); } } char *p=s; while (*p > 32) p++; *p=0; GFREE(seqname); seqname=Gstrdup(&s[1]); newSeq=true; newSeqOffset=fl.getfpos(); last_len=0; line_len=0; line_blen=0; seqlen=0; mustbeLastLine=false; } //defline parsing else { //sequence line int llen=fl.tlength(); int lblen=fl.blength(); //fl.getFpos()-prevOffset; if (newSeq) { //first sequence line after defline line_len=llen; line_blen=lblen; } else {//next seq lines after first if (mustbeLastLine) { //could be empty line, adjust for possible spaces if (llen>0) { char *p=s; //trim spaces, tabs etc. on the last line while (*p > 32) ++p; llen=(p-s); } if (llen>0) GError(ERR_FALINELEN); } else { if (llenlast_len) GError(ERR_FALINELEN); } } seqlen+=llen; last_len=llen; newSeq=false; } //sequence line //prevOffset=fl.getfpos(); }//for each line of the fasta file if (seqlen>0) addRecord(seqname, seqlen, newSeqOffset, line_len, line_blen); GFREE(seqname); fclose(fa); return records.Count(); } int GFastaIndex::storeIndex(const char* finame) { //write the hash to a file if (records.Count()==0) GError("Error at GFastaIndex:storeIndex(): no records found!\n"); FILE* fai=fopen(finame, "w"); if (fai==NULL) GError("Error creating fasta index file: %s\n",finame); int rcount=storeIndex(fai); GFREE(fai_name); fai_name=Gstrdup(finame); return rcount; } int GFastaIndex::storeIndex(FILE* fai) { int rcount=0; GList reclist(true,false,true); //sorted, don't free members, unique records.startIterate(); GFastaRec* rec=NULL; while ((rec=records.NextData())!=NULL) { reclist.Add(rec); } //reclist has records sorted by file offset for (int i=0;iseqname,reclist[i]->seqlen,(long)reclist[i]->fpos, reclist[i]->line_len, reclist[i]->line_blen); #else int written=fprintf(fai, "%s\t%d\t%lld\t%d\t%d\n", reclist[i]->seqname, reclist[i]->seqlen, (long long)(reclist[i]->fpos), reclist[i]->line_len, reclist[i]->line_blen); #endif if (written>0) rcount++; else 
break; //couldn't write anymore } fclose(fai); haveFai=(rcount>0); return rcount; } gclib-0.12.7/GFastaIndex.h000066400000000000000000000037451407072766100152120ustar00rootroot00000000000000/* * GFaIdx.h * * Created on: Aug 25, 2010 * Author: gpertea */ #ifndef GFAIDX_H_ #define GFAIDX_H_ #include "GHashMap.hh" #include "GList.hh" class GFastaRec { public: const char* seqname; uint seqlen; off_t fpos; int line_len; //effective line length (without EoL) int line_blen; //length of line including EoL characters GFastaRec(uint slen=0, off_t fp=0, int llen=0, int llenb=0) { seqname=NULL; //only a pointer copy seqlen=slen; fpos=fp; line_len=llen; line_blen=llenb; } bool operator==(GFastaRec& d){ return (fpos==d.fpos); } bool operator>(GFastaRec& d){ return (fpos>d.fpos); } bool operator<(GFastaRec& d){ return (fpos records; void addRecord(const char* seqname, uint seqlen, off_t foffs, int llen, int llen_full); GFastaRec* getRecord(const char* seqname) { return records.Find(seqname); } bool hasIndex() { return haveFai; } int loadIndex(const char* finame); int buildIndex(); //build index in memory by parsing the whole fasta file int storeIndex(const char* finame); int storeIndex(FILE* fai); int getCount() { return records.Count(); } GFastaIndex(const char* fname, const char* finame=NULL):records() { if (fileExists(fname)!=2) GError("Error: fasta file %s not found!\n",fname); if (fileSize(fname)<=0) GError("Error: invalid fasta file %s !\n",fname); fa_name=Gstrdup(fname); fai_name=finame!=NULL ? 
Gstrdup(finame) : NULL; if (fileSize(fa_name)==0) { GError("Error creating GFastaIndex(%s): invalid fasta file!\n",fa_name); } haveFai=false; if (fai_name!=NULL && fileSize(fai_name)>0) { //try to load the index file if it exists loadIndex(fai_name); haveFai=(records.Count()>0); } } ~GFastaIndex() { GFREE(fa_name); GFREE(fai_name); } }; #endif /* GFAIDX_H_ */ gclib-0.12.7/GHash.hh000066400000000000000000000405451407072766100142160ustar00rootroot00000000000000/******************************************************************************** * Hash table class template (char* based) * *********************************************************************************/ #ifndef GHash_HH #define GHash_HH #include "GBase.h" /** * This class maintains a fast-access hash table of entities * indexed by a character string (essentially, maps strings to pointers) */ //#define HASH_DBG_PRINT 1 #define GSTR_HASH(s) strhash(s) //#define GSTR_HASH(s) djb_hash(s) //#define GSTR_HASH(s) fnv1a_hash(s) //#define GSTR_HASH(s) murmur3(s) template class GHash { protected: struct GHashEntry { char* key; // Key string bool keyalloc; // shared key flag (to free/not the key) int hash; // Hash value of key pointer data; // Data }; GHashEntry* hash; // Hash int fCapacity; // table size int fCount; // number of valid entries int fCurrentEntry; char* lastkeyptr; //pointer to last key string added //---------- Raw data retrieval (including empty entries) // Return key at position pos. 
const char* Key(uint pos) const { return hash[pos].key; } // return data OBJ* at given position OBJ* Data(uint pos) const { return (OBJ*) hash[pos].data; } // Return position of first filled slot, or >= fCapacity int First() const; // Return position of last filled slot or -1 int Last() const; // Return position of next filled slot in hash table // or a value greater than or equal to fCapacity if no filled // slot was found int Next(int pos) const; //Return position of previous filled slot in hash table //or a -1 if no filled slot was found int Prev(int pos) const; private: GHash(const GHash&); GHash &operator=(const GHash&); GFreeProc* fFreeProc; //procedure to free item data protected: public: static void DefaultFreeProc(pointer item) { delete (OBJ*)item; } public: GHash(GFreeProc* freeProc); // constructs of an empty hash GHash(bool doFree=true); // constructs of an empty hash (free the item objects) void setFreeItem(GFreeProc *freeProc) { fFreeProc=freeProc; } void setFreeItem(bool doFree) { fFreeProc=(doFree)? &DefaultFreeProc : NULL; } int Capacity() const { return fCapacity; } // table's size, including the empty slots. void Resize(int m); // Resize the table to the given size. int Count() const { return fCount; }// the total number of entries in the table. // Insert a new entry into the table given key. // If there is already an entry with that key, leave it unchanged OBJ* Add(const char* ky, OBJ* ptr=NULL); //same with Add, but frees the old element if it's a replacement OBJ* fAdd(const char* ky, OBJ* ptr=NULL); //same as Add, but the key pointer is stored directly, no string copy needed //(shared-key-Add) OBJ* shkAdd(const char* ky, OBJ* ptr); // Replace data at key. If there was no existing entry, // a new entry is inserted. OBJ* Replace(const char* ky, OBJ* ptr); // Remove a given key and its data OBJ* Remove(const char* ky); // Find data OBJ* given key. 
OBJ* Find(const char* ky, char** keyptr=NULL); bool hasKey(const char* ky); char* getLastKey() { return lastkeyptr; } OBJ* operator[](const char* ky) { return Find(ky); } void startIterate(); //iterator-like initialization char* NextKey(); //returns next valid key in the table (NULL if no more) OBJ* NextData(); //returns next valid hash[].data OBJ* NextData(char*& nextkey); //returns next valid hash[].data //or NULL if no more //nextkey is SET to the corresponding key GHashEntry* NextEntry() { //returns a pointer to a GHashEntry int pos=fCurrentEntry; while (pos GHash::GHash(GFreeProc* freeProc) { GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); fCurrentEntry=-1; fFreeProc=freeProc; lastkeyptr=NULL; for (uint i=0; i GHash::GHash(bool doFree) { GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); fCurrentEntry=-1; lastkeyptr=NULL; fFreeProc = (doFree)?&DefaultFreeProc : NULL; for (uint i=0; i void GHash::Resize(int m) { int i,n,p,x,h; GHashEntry *k; GASSERT(fCount<=fCapacity); if(m>2)>m) n>>=1; // Shrink until n/4 <= m while((n>>1)>1)); GASSERT(DEF_HASH_SIZE<=n); if(n!=fCapacity){ GASSERT(m<=n); GMALLOC(k, sizeof(GHashEntry)*n); for(i=0; i=0){ p=HASH1(h,n); GASSERT(0<=p && p OBJ* GHash::Add(const char* ky, OBJ* pdata) { int p,i,x,h,n; if(!ky) GError("GHash::insert: NULL key argument.\n"); GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::fAdd(const char* ky, OBJ* pdata) { int p,i,x,h,n; if(!ky) GError("GHash::insert: NULL key argument.\n"); GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::shkAdd(const char* ky, OBJ* pdata) { int p,i,x,h,n; if(!ky) GError("GHash::insert: NULL key argument.\n"); GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::Replace(const char* ky, OBJ* pdata){ int p,i,x,h,n; if(!ky){ GError("GHash::replace: NULL key argument.\n"); } GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::Remove(const char* ky){ int p,x,h,n; if(!ky){ 
GError("GHash::remove: NULL key argument.\n"); } OBJ* removed=NULL; if(0 bool GHash::hasKey(const char* ky) { int p,x,h,n; if(!ky){ GError("GHash::find: NULL key argument.\n"); } if(0 OBJ* GHash::Find(const char* ky, char** keyptr){ int p,x,h,n; if(!ky){ GError("GHash::find: NULL key argument.\n"); } if (fCount==0) return NULL; h=GSTR_HASH(ky); GASSERT(0<=h); p=HASH1(h,fCapacity); GASSERT(0<=p && p void GHash::startIterate() {// initialize a key iterator; call fCurrentEntry=0; } template char* GHash::NextKey() { int pos=fCurrentEntry; while (pos OBJ* GHash::NextData() { int pos=fCurrentEntry; while (pos OBJ* GHash::NextData(char* &nextkey) { int pos=fCurrentEntry; while (pos int GHash::First() const { int pos=0; while(pos int GHash::Last() const { int pos=fCapacity-1; while(0<=pos){ if(0<=hash[pos].hash) break; pos--; } GASSERT(pos<0 || 0<=hash[pos].hash); return pos; } // Find next valid entry template int GHash::Next(int pos) const { GASSERT(0<=pos && pos int GHash::Prev(int pos) const { GASSERT(0<=pos && pos= 0){ if(0<=hash[pos].hash) break; } GASSERT(pos<0 || 0<=hash[pos].hash); return pos; } // Remove all template void GHash::Clear(){ int i; for(i=0; i=0){ if (hash[i].keyalloc) GFREE((hash[i].key)); if (FREEDATA) (*fFreeProc)(hash[i].data); } } GFREE(hash); GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); //reinitialize it for (i=0; i GHash::~GHash(){ for(int i=0; i=0){ if (hash[i].keyalloc) GFREE((hash[i].key)); if (FREEDATA) (*fFreeProc)(hash[i].data); } } GFREE(hash); } class GStrSet:public GHash { protected: bool free_keys; public: GStrSet(bool shared_keys=false):GHash(false), free_keys(!shared_keys) { } void Add(const char* str) { if (free_keys) { //allocates a copy of str GHash::Add(str, NULL); } else this->shkAdd(str, NULL); } void add(const char* str) { this->Add(str); } void push(const char* str) { this->Add(str); } bool has(const char* str) { return hasKey(str); } }; #endif 
gclib-0.12.7/GHashMap.hh000066400000000000000000000336141407072766100146530ustar00rootroot00000000000000/******************************************************************************** * Hash map class templates *********************************************************************************/ #ifndef GHashMap_HH #define GHashMap_HH #include "GBase.h" #include "khashl.hh" #include #include #define XXH_INLINE_ALL 1 #include "xxhash.h" #include "wyhash.h" template struct GHashKey_xxHash32 { //K generic (class, primitive, pointer except const char* ) //template inline typename std::enable_if< std::is_trivial::value, uint32_t>::type uint32_t operator()(const K& s) const { //only works for trivial types! static_assert(std::is_trivial::value, "Error: cannot use this for non-trivial types!\n"); return XXH32((const void *) &s, sizeof(K), 0); } }; template <> struct GHashKey_xxHash32 { inline uint32_t operator()(const char* s) const { return XXH32(s, strlen(s), 0); } }; template struct GHashKey_xxHash { //K generic (class, primitive, pointer except const char* ) //template inline typename std::enable_if< std::is_trivial::value, uint32_t>::type uint64_t operator()(const K& s) const { //only works for trivial types! static_assert(std::is_trivial::value, "Error: cannot use this for non-trivial types!\n"); return XXH64((const void *) &s, sizeof(K), 0); } }; template <> struct GHashKey_xxHash { inline uint32_t operator()(const char* s) const { return XXH64(s, strlen(s), 0); } }; template struct GHashKey_wyHash { //K generic (class, primitive, pointer except const char* ) //template inline typename std::enable_if< std::is_trivial::value, uint32_t>::type uint64_t operator()(const K& s) const { //only works for trivial types! 
static_assert(std::is_trivial::value, "Error: cannot use this for non-trivial types!\n"); return wyhash((const void *) &s, sizeof(K), 0, _wyp); } }; template <> struct GHashKey_wyHash { inline uint32_t operator()(const char* s) const { return wyhash(s, strlen(s), 0, _wyp); } }; template struct GHashKey_Eq { //K is a type having the == operator defined inline bool operator()(const K& x, const K& y) const { return (x == y); //requires == operator to be defined for K } }; template <> struct GHashKey_Eq { inline bool operator()(const char* x, const char* y) const { return (strcmp(x, y) == 0); } }; // GHashSet never makes a deep copy of a char* key, it only stores the pointer // - for pointer keys like char*, key allocation must be managed separately (and should always survive the GHashSet) template , class Eq=GHashKey_Eq, typename khInt_t=uint64_t > class GHashSet: public std::conditional< is_char_ptr::value, klib::KHashSetCached< K, Hash, Eq, khInt_t >, klib::KHashSet< K, Hash, Eq, khInt_t > >::type { protected: khInt_t i_iter=0; public: inline khInt_t Add(const K ky) { // return -1 if the key already exists int absent=-1; khInt_t i=this->put(ky, &absent); if (absent==1) //key was actually added return i; return -1; } inline khInt_t Remove(K ky) { //return index being removed, or -1 if no such key exists khInt_t i=this->get(ky); if (i!=this->end()) { this->del(i); return i; } return -1; } inline void Clear() { this->clear(); //does not shrink } inline void Reset() { this->clear(); GFREE(this->used); GFREE(this->keys); this->bits=0; this->count=0; } ~GHashSet() { this->Reset(); } inline bool operator[](K ky) { //RH only (read-only), cannot assign (use Add instead) return (this->get(ky)!=this->end()); } inline bool hasKey(K ky) { return (this->get(ky)!=this->end()); } int Find(K ky) {//return internal slot location if found, // or -1 if not found khInt_t r=this->get(ky); if (r==this->end()) return -1; return (int)r; } void startIterate() { //iterator-like initialization 
i_iter=0; } K* Next() { //returns a pointer to next valid key in the table (NULL if no more) if (this->count==0) return NULL; uint32_t nb=this->n_buckets(); while (i_iter_used(i_iter)) i_iter++; if (i_iter==nb) return NULL; K* k=&(this->key(i_iter-1)); ++i_iter; return k; } inline uint32_t Count() { return this->count; } }; // GStrSet always allocates a new copy of each added string; // if you don't want that, just use GHashSet instead and manage the key allocation separately template , class Eq=GHashKey_Eq, typename khInt_t=uint64_t> class GStrSet: public GHashSet { protected: const char* lastKey=NULL; public: inline int Add(const char* ky) { // return -1 if the key already exists int absent=-1; khInt_t i=this->put(ky, &absent); if (absent==1) {//key was actually added const char* s=Gstrdup(ky); this->key(i)=s; //store a copy of the key string lastKey=s; return i; } //key was already there return -1; } inline const char* getLastKey() { return lastKey; } int Remove(const char* ky) { //return index being removed, or -1 if no such key exists khInt_t i=this->get(ky); if (i!=this->end()) { const char* s=this->key(i); if (s==lastKey) lastKey=NULL; GFREE(s); //free string copy this->del(i); return i; } return -1; } inline void Clear() { khInt_t nb=this->n_buckets(); for (khInt_t i = 0; i != nb; ++i) { if (!this->_used(i)) continue; //deallocate string copy GFREE(this->key(i)); } lastKey=NULL; this->clear(); //does not shrink ! 
}
  // ---- tail of the GStrSet class (its beginning lies before this chunk) ----
  // Empties the set through Clear(), frees the `used` and `keys` arrays and
  // zeroes the bookkeeping fields.
  inline void Reset() {
    this->Clear();
    GFREE(this->used);
    GFREE(this->keys);
    lastKey=NULL;
    this->bits=0;
    this->count=0;
  }

  ~GStrSet() {
    this->Reset();
  }
};

// Generic hash map where keys and values can be of any type
// Note: keys are always copied (shared) as simple value, there is no deep copy/allocation for pointers
//       so pointer keys must be managed separately
// Note: pointer values are automatically deallocated on container destruction by default,
//       use GHashMap(false) to disable that when V is a pointer
//
// The base class is chosen at compile time with std::conditional: klib::KHashMapCached
// when the is_char_ptr test holds, klib::KHashMap otherwise.
//
// NOTE(review): in this copy of the file the template parameter/argument lists are
// truncated (e.g. "template , class Eq=...", "is_char_ptr::value", "template inline ...")
// and the loop conditions of Next()/NextData() are damaged ("i_iter_used(i_iter)").
// The code tokens below are left exactly as found; restore them from the upstream
// header before relying on this text.
template , class Eq=GHashKey_Eq, typename khInt_t=uint64_t>
  class GHashMap:public std::conditional< is_char_ptr::value,
    klib::KHashMapCached< K, V, Hash, Eq, khInt_t>,
    klib::KHashMap< K, V, Hash, Eq, khInt_t> >::type {
 protected:
  khInt_t i_iter=0;     // bucket cursor used by startIterate()/Next()/NextData()
  bool freeItems=false; // when true, the pointer-value overloads `delete` stored values
 public:
  //---- these should be reimplemented for GHash
  // Inserts (ky, val) only if ky is not present yet.
  // Returns the bucket index given by put() for a newly added key,
  // or -1 if the key already exists (the stored value is then left untouched).
  inline int Add(const K ky, const V val) {
    int absent=-1;
    khInt_t i=this->put(ky, &absent);
    if (absent==1) { //key was actually added
      this->value(i)=val; //value is always copied
      return i;
    }
    return -1;
  }

  // Remove(), pointer-value overload: also deletes the stored value when freeItems is set.
  // Returns the index of the removed bucket, -1 if the key was not found.
  template inline typename std::enable_if< std::is_pointer::value, int>::type
  Remove(K ky) { //return index being removed
    khInt_t i=this->get(ky);
    if (i!=this->end()) {
      if (freeItems) delete this->value(i);
      this->del(i);
      return i;
    }
    return -1;
  }

  // Remove(), non-pointer overload: only drops the bucket.
  template inline typename std::enable_if< !std::is_pointer::value, int>::type
  Remove(K ky) { //return index being removed
    khInt_t i=this->get(ky);
    if (i!=this->end()) {
      this->del(i);
      return i;
    }
    return -1;
  }

  // Clear(), pointer-value overload: deletes every stored value of a used bucket
  // (only when freeItems is set), then empties the table.
  template inline typename std::enable_if< std::is_pointer::value, void>::type
  Clear() {
    if (!freeItems) {
      this->clear(); //does not shrink !
      return;
    }
    khInt_t nb=this->n_buckets();
    for (khInt_t i = 0; i != nb; ++i) {
      if (!this->_used(i)) continue;
      if (freeItems) delete this->value(i); // freeItems is always true here (early return above)
    }
    this->clear();
  }

  // Clear(), non-pointer overload: nothing to free, just empties the table.
  template inline typename std::enable_if< !std::is_pointer::value, void>::type
  Clear() {
    this->clear();
  }

  // Clear() followed by releasing the `used` and `keys` arrays and zeroing bits/count.
  inline void Reset() {
    this->Clear();
    GFREE(this->used);
    GFREE(this->keys);
    this->bits=0;
    this->count=0;
  }

  ~GHashMap() {
    this->Reset();
  }

  // -- these can be shared with GHash:
  // doFree selects whether pointer values are deleted by Remove()/Clear().
  // NOTE(review): the assignment to doFree in the body happens after freeItems has
  // already been initialised from it, so it does not change freeItems.
  GHashMap(bool doFree=std::is_pointer::value):freeItems(doFree) {
    static_assert(std::is_trivial::value, "Error: cannot use this for non-trivial types!\n");
    if (!std::is_pointer::value) doFree=false;
  };

  //return pointer to stored value if found, NULL otherwise
  // if the stored value is a pointer, it's going to be a pointer to that
  // (pointer-value overload: returns the stored pointer itself)
  template inline typename std::enable_if< std::is_pointer::value, T>::type
  Find(const K ky) {
    khInt_t r=this->get(ky);
    if (r==this->end()) return NULL;
    return this->value(r);
  }

  // (non-pointer overload: returns the address of the value stored in the table)
  template inline typename std::enable_if< !std::is_pointer::value, T*>::type
  Find(const K ky) {
    khInt_t r=this->get(ky);
    if (r==this->end()) return NULL;
    return &(this->value(r));
  }

  //-- operator[] should be defined just like Find?
  // Same lookup as Find(): never inserts, yields NULL when the key is absent.
  template inline typename std::enable_if< std::is_pointer::value, T>::type
  operator[](const K ky) {
    khInt_t r=this->get(ky);
    if (r==this->end()) return NULL;
    return this->value(r);
  }

  template inline typename std::enable_if< !std::is_pointer::value, T*>::type
  operator[](const K ky) {
    khInt_t r=this->get(ky);
    if (r==this->end()) return NULL;
    return &(this->value(r));
  }

  inline bool hasKey(K ky) {
    return (this->get(ky)!=this->end());
  }

  inline void startIterate() { //iterator-like initialization
    i_iter=0;
  }

  // Next(val): advances the cursor to the next bucket, copies its value into val and
  // returns the key (its address in this overload); NULL when there are no more entries.
  template inline typename std::enable_if< !std::is_pointer::value, T*>::type
  Next (V& val) {
    //returns a pointer to next key entry in the table (NULL if no more)
    if (this->count==0) return NULL;
    khInt_t nb=this->n_buckets();
    while (i_iter_used(i_iter)) i_iter++;
    if (i_iter==nb) return NULL;
    val=this->value(i_iter);
    K* k=&(this->key(i_iter));
    ++i_iter;
    return k;
  }

  // Next(val), overload returning the key by value.
  template inline typename std::enable_if< std::is_pointer::value, T>::type
  Next (V& val) {
    //returns the next key in the table (NULL if no more)
    if (this->count==0) return NULL;
    khInt_t nb=this->n_buckets();
    while (i_iter_used(i_iter)) i_iter++;
    if (i_iter==nb) return NULL;
    val=this->value(i_iter);
    K k = this->key(i_iter);
    ++i_iter;
    return k;
  }

  // NextData(): like Next() but yields the stored value instead of the key
  // (its address in this overload); NULL when there are no more entries.
  template inline typename std::enable_if< !std::is_pointer::value, T*>::type
  NextData () {
    if (this->count==0) return NULL;
    khInt_t nb=this->n_buckets();
    while (i_iter_used(i_iter)) i_iter++;
    if (i_iter==nb) return NULL;
    T* val=&(this->value(i_iter));
    ++i_iter;
    return val;
  }

  // NextData(), overload returning the stored value itself.
  template inline typename std::enable_if< std::is_pointer::value, T>::type
  NextData () {
    if (this->count==0) return NULL;
    khInt_t nb=this->n_buckets();
    while (i_iter_used(i_iter)) i_iter++;
    if (i_iter==nb) return NULL;
    T val=this->value(i_iter);
    ++i_iter;
    return val;
  }

  // Number of entries; note the result is narrowed to uint32_t.
  inline uint32_t Count() { return this->count; }

};

// GHash(doFree=true) -- basic
// string hashmap
// Note: this hash map always makes a copy of the string key which can be costly
//       use GHashMap for a faster alternative
// NOTE(review): template parameter/argument lists are truncated in this copy of the
// file ("template , class Eq=...", "GHashMap {", "template using GIntHash = GHashMap, ...");
// the code tokens are kept exactly as found.
template , class Eq=GHashKey_Eq, typename khInt_t=uint64_t >
  class GHash:public GHashMap {
 protected:
  const char* lastKey=NULL; // key copy made by the most recent successful Add()
 public:
  GHash(bool doFree=true) { this->freeItems=doFree; };

  //---- these should be now reimplemented
  // Adds (ky, val) if ky is not present: the table keeps its own Gstrdup() copy of ky.
  // Returns the bucket index of the new entry, or -1 if the key already exists.
  inline int Add(const char* ky, const V val) {
    int absent=-1;
    khInt_t i=this->put(ky, &absent);
    if (absent==1) { //key was actually added
      const char* s=Gstrdup(ky);
      this->key(i)=s; //store a copy of the key string
      lastKey=s;
      this->value(i)=val; //value is always copied
      return i;
    }
    return -1;
  }

  // Key copy stored by the last successful Add(); NULL after that key is removed or
  // the table is cleared.
  inline const char* getLastKey() { return lastKey; }

  // Remove(), pointer-value overload: frees the key copy and, when freeItems is set,
  // deletes the stored value. Returns the removed bucket index, -1 if not found.
  template inline typename std::enable_if< std::is_pointer::value, int>::type
  Remove(const char* ky) { //return index being removed
    khInt_t i=this->get(ky);
    if (i!=this->end()) {
      const char* s=this->key(i);
      if (s==lastKey) lastKey=NULL;
      GFREE(s); //free string copy
      if (this->freeItems) delete this->value(i);
      this->del(i);
      return i;
    }
    return -1;
  }

  // Remove(), non-pointer overload: frees only the key copy.
  template inline typename std::enable_if< !std::is_pointer::value, int>::type
  Remove(const char* ky) { //return index being removed
    khInt_t i=this->get(ky);
    if (i!=this->end()) {
      const char* s=this->key(i);
      if (s==lastKey) lastKey=NULL;
      GFREE(s); //free string copy
      this->del(i);
      return i;
    }
    return -1;
  }

  // Clear(), pointer-value overload: frees every key copy (and every value when
  // freeItems is set) before emptying the table.
  template inline typename std::enable_if< std::is_pointer::value, void>::type
  Clear() {
    khInt_t nb=this->n_buckets();
    for (khInt_t i = 0; i != nb; ++i) {
      if (!this->_used(i)) continue;
      if (this->freeItems) delete this->value(i);
      GFREE(this->key(i));
    }
    lastKey=NULL;
    this->clear();
  }

  // Clear(), non-pointer overload: frees every key copy before emptying the table.
  template inline typename std::enable_if< !std::is_pointer::value, void>::type
  Clear() {
    khInt_t nb=this->n_buckets();
    for (khInt_t i = 0; i != nb; ++i) {
      if (! this->_used(i) ) continue;
      GFREE(this->key(i));
    }
    lastKey=NULL;
    this->clear();
  }

  // Clear() followed by releasing the `used` and `keys` arrays and zeroing bits/count.
  inline void Reset() {
    this->Clear();
    GFREE(this->used);
    GFREE(this->keys);
    this->bits=0;
    this->count=0;
  }

  ~GHash() {
    this->Reset();
  }
};

// Integer-keyed GHashMap alias (argument lists truncated in this copy, see note above).
template using GIntHash = GHashMap, GHashKey_Eq, uint32_t>;

#endif
// (archive member header of the next file, kept as found)
gclib-0.12.7/GIntHash.hh000066400000000000000000000277151407072766100146750ustar00rootroot00000000000000#ifndef _GHASHT_HH
#define _GHASHT_HH
#include "GBase.h"
//----------------------------------------------
// Int Hash table templates
// ---------------------------------------------
// Maps 32-bit integers to user data
// Uses open addressing with linear probing.
// In the m_cells array, key = 0 is reserved to indicate an unused cell.
// Actual value for key 0 (if any) is stored in m_zeroCell.
// The hash table automatically doubles in size when it becomes 75% full.
// The hash table never shrinks in size
// unless you explicitly call Clear() or Compact().
//----------------------------------------------

// Rounds v up to a power of two by smearing the highest set bit of v-1 into all lower
// bits and adding 1. A value that is already a power of two is returned unchanged;
// v==0 yields 0 (the decrement wraps and the final increment wraps back).
inline uint32_t upper_power_of_two(uint32_t v) {
  v--;
  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;
  v++;
  return v;
}

// 64-bit overload of the above (one extra smear step for the upper 32 bits).
inline uint64_t upper_power_of_two(uint64_t v) {
  v--;
  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;
  v |= v >> 32;
  v++;
  return v;
}

// Low-level open-addressing table of CELL objects; the code here reads and writes
// CELL::key and copies whole cells with operator=.
// NOTE(review): the template parameter list is truncated in this copy ("template class").
template class GHashT {
 public: //protected:
  CELL* m_cells;       // array of m_arraySize cells; key==0 marks a free cell
  uint32 m_arraySize;  // always a power of two (asserted in init()/Resize())
  uint32 m_population; // number of stored keys, including key 0 if present
  bool m_zeroUsed;     // true when key 0 is stored (in m_zeroCell)
  CELL m_zeroCell;     // dedicated storage for key 0
  void Resize(uint32 desiredSize);
  //for iteration over elements
  //public:
  void init(uint32 initialSize = 32);
  GHashT(uint32 initialSize = 32):m_cells(NULL),m_arraySize(0), m_population(0),m_zeroUsed(false), m_zeroCell(), m_cur(NULL) {
    init(initialSize);
  }
  ~GHashT() { delete[] m_cells; }
  uint32 Count() { return m_population; }
  // Basic operations
  CELL* Lookup(uint32 key);
  CELL* Insert(uint32 key);//Important: set the value to Insert()->value
  void Delete(CELL* cell);
  // Drops the whole cell array and starts over with initSize cells.
  void Clear(uint32 initSize = 32) {
    delete[] m_cells;
    init(initSize);
  }
  // Shrinks (or grows) the array to the smallest power of two that holds the current
  // population at about 75% load.
  void Compact() {
Resize(upper_power_of_two((m_population * 4 + 3) / 3)); } //---------------------------------------------- // Iteration //---------------------------------------------- //protected: CELL* m_cur; //public: void startIterate() { m_cur = &m_zeroCell; if (!m_zeroUsed) NextCell(); } CELL* NextCell(); }; template class GIntHash { //OBJ requires a copy operator= protected: struct Cell { uint32 key; OBJ value; Cell():key(0) { } }; GHashT ghash; public: GIntHash():ghash() {} OBJ* Add(uint32 key, OBJ val) { Cell* c=ghash.Insert(key); c->value = val; return &(c->value); } OBJ* set(uint32 key, OBJ val) { Cell* c=ghash.Insert(key); c->value = val; return &(c->value); } uint32 Count() { return ghash.Count(); } OBJ Replace(uint32 key, OBJ val) { //just like set() but returns a copy of the *old* value, if any Cell* c=ghash.Insert(key); OBJ oldv=c->value; c->value=val; return oldv; } void Clear() { ghash.Clear(); } void Compact() { ghash.Compact(); } void startIterate() { ghash.startIterate(); } void Delete(uint32 key) { Cell* cell = ghash.Lookup(key); if (cell) Delete(cell); } OBJ* Find(uint32 key) { Cell* cell = ghash.Lookup(key); return (cell ? & cell->value : NULL); } OBJ* get(uint32 key) { Cell* cell = ghash.Lookup(key); return (cell ? & cell->value : NULL); } OBJ* operator[](const uint32 ky) { Cell* cell = ghash.Lookup(ky); return (cell ? 
& cell->value : NULL); } OBJ* Next(uint32& nextky) { Cell* cell=ghash.NextCell(); if (cell) { nextky=cell->key; return & (cell->value); } else { nextky=0; return NULL; } } uint32 NextKey() { Cell* cell=ghash.NextCell(); if (cell) return cell->key; else return 0; } OBJ* NextValue() { Cell* cell=ghash.NextCell(); if (cell) return & (cell->value); else return NULL; } }; template class GIntHashP { protected: struct Cell { uint32 key; OBJ* value; Cell():key(0),value(NULL) { } }; GHashT ghash; bool doFreeItems; public: GIntHashP(bool freeItems=true):ghash(),doFreeItems(freeItems) {} ~GIntHashP() { Clear(); } OBJ* Add(uint32 key, OBJ* val) { Cell* c=ghash.Insert(key); c->value = val; return c->value; } OBJ* set(uint32 key, OBJ* val) { Cell* c=ghash.Insert(key); c->value = val; return c->value; } uint32 Count() { return ghash.Count(); } OBJ* Replace(uint32 key, OBJ* val) { //just like set() but returns a copy of the *old* value, if any Cell* c=ghash.Insert(key); OBJ* oldv=c->value; c->value=val; return oldv; } void startIterate() { ghash.startIterate(); } void Compact() { ghash.Compact(); } void Clear() { if (doFreeItems) { if (ghash.m_zeroUsed) delete ghash.m_zeroCell.value; ghash.startIterate(); while (Cell* cell=ghash.NextCell()) { delete cell->value; } } ghash.Clear(); } void Delete(uint32 key) { Cell* cell = ghash.Lookup(key); if (cell) { if (doFreeItems) { delete cell->value; } Delete(cell); } } OBJ* Find(uint32 key) { Cell* cell = ghash.Lookup(key); return (cell ? cell->value : NULL); } OBJ* get(uint32 key) { Cell* cell = ghash.Lookup(key); return (cell ? cell->value : NULL); } OBJ* operator[](const uint32 ky) { Cell* cell = ghash.Lookup(ky); return (cell ? 
cell->value : NULL); } OBJ* Next(uint32& nextky) { Cell* cell=ghash.NextCell(); if (cell) { nextky=cell->key; return cell->value; } else { nextky=0; return NULL; } } uint32 NextKey() { Cell* cell=ghash.NextCell(); if (cell) return cell->key; else return 0; } OBJ* NextValue() { Cell* cell=ghash.NextCell(); if (cell) return cell->value; else return NULL; } }; // from code.google.com/p/smhasher/wiki/MurmurHash3 inline uint32_t integerHash(uint32_t h) { h ^= h >> 16; h *= 0x85ebca6b; h ^= h >> 13; h *= 0xc2b2ae35; h ^= h >> 16; return h; } // from code.google.com/p/smhasher/wiki/MurmurHash3 inline uint64_t integerHash(uint64_t k) { k ^= k >> 33; k *= 0xff51afd7ed558ccd; k ^= k >> 33; k *= 0xc4ceb9fe1a85ec53; k ^= k >> 33; return k; } #define GIHASH_FIRST_CELL(hash) (m_cells + ((hash) & (m_arraySize - 1))) #define GIHASH_CIRCULAR_NEXT(c) ((c) + 1 != m_cells + m_arraySize ? (c) + 1 : m_cells) #define GIHASH_CIRCULAR_OFFSET(a, b) ((b) >= (a) ? (b) - (a) : m_arraySize + (b) - (a)) //---------------------------------------------- // constructor //---------------------------------------------- template void GHashT::init(uint32 initialSize) { // Initialize regular cells m_arraySize = initialSize; GASSERT((m_arraySize & (m_arraySize - 1)) == 0); // Must be a power of 2 m_cells = new CELL[m_arraySize]; memset(m_cells, 0, sizeof(CELL) * m_arraySize); m_population = 0; // Initialize zero cell m_zeroUsed = 0; m_zeroCell.key = 0; //m_zeroCell.value = 0; } //---------------------------------------------- // Lookup key //---------------------------------------------- template CELL* GHashT::Lookup(uint32 key) { if (key) { // Check regular cells for (CELL* cell = GIHASH_FIRST_CELL(integerHash(key));; cell = GIHASH_CIRCULAR_NEXT(cell)) { if (cell->key == key) return cell; if (!cell->key) return NULL; } } else { // Check zero cell if (m_zeroUsed) return &m_zeroCell; return NULL; } }; //----------------------------------------------------------------------- // Adding a key pair to the 
// hash table, returns CELL
//IMPORTANT: Caller is responsible of setting the value into CELL->value
//-----------------------------------------------------------------------
// Returns the cell for key, creating it if needed. For an existing key the existing
// cell is returned unchanged. The table is doubled before a regular insertion that
// would reach 75% load, and the probe is then restarted in the new array.
// NOTE(review): template parameter lists ("template <...>", "GHashT<...>") are
// truncated throughout this part of the file; code tokens are kept as found.
template CELL* GHashT::Insert(uint32 key) {
  if (key) {
    // Check regular cells
    for (;;) {
      for (CELL* cell = GIHASH_FIRST_CELL(integerHash(key));; cell = GIHASH_CIRCULAR_NEXT(cell)) {
        if (cell->key == key) {
          // Found
          //cell->value=val;
          return cell;
        }
        if (cell->key == 0) {
          // Insert here
          if ((m_population + 1) * 4 >= m_arraySize * 3) {
            // Time to resize
            Resize(m_arraySize * 2);
            break; // leave the probe loop; the outer for(;;) retries in the new array
          }
          ++m_population;
          cell->key = key;
          //cell->value = val;
          return cell;
        }
      }
    }
  }
  else {
    // Check zero cell
    if (!m_zeroUsed) {
      // Insert here
      m_zeroUsed = true;
      if (++m_population * 4 >= m_arraySize * 3) {
        // Even though we didn't use a regular slot, let's keep the sizing rules consistent
        Resize(m_arraySize * 2);
      }
    }
    //m_zeroCell.value=val;
    return &m_zeroCell;
  }
}

//----------------------------------------------
// Delete a key-value pair in the hash table
//----------------------------------------------
// cell must be a pointer previously returned by Lookup()/Insert() for a stored key.
template void GHashT::Delete(CELL* cell) {
  if (cell != &m_zeroCell) {
    // Delete from regular cells
    GASSERT(cell >= m_cells && cell - m_cells < m_arraySize);
    GASSERT(cell->key);
    // Remove this cell by shuffling neighboring cells so there are no gaps in anyone's probe chain
    for (CELL* neighbor = GIHASH_CIRCULAR_NEXT(cell);; neighbor = GIHASH_CIRCULAR_NEXT(neighbor)) {
      if (!neighbor->key) {
        // There's nobody to swap with. Go ahead and clear this cell, then return
        cell->key = 0;
        //cell->value = 0;
        m_population--;
        // NOTE(review): the source text is damaged here -- everything between
        // "m_population" and "key));" (the rest of this condition, the end of this
        // branch and the start of the statement that computes `ideal`, the home slot
        // of the neighbor) is missing from this copy. Restore from upstream.
        if (m_populationkey));
      if (GIHASH_CIRCULAR_OFFSET(ideal, cell) < GIHASH_CIRCULAR_OFFSET(ideal, neighbor)) {
        // Swap with neighbor, then make neighbor the new cell to remove.
        *cell = *neighbor;
        cell = neighbor;
      }
    }
  }
  else {
    // Delete zero cell
    GASSERT(m_zeroUsed);
    m_zeroUsed = false;
    //cell->value = 0;
    m_population--;
    // NOTE(review): the source text is damaged here as well -- the remainder of this
    // condition, the end of Delete() and the "template <...>" introducer of Resize()
    // are missing from this copy.
    if (m_population void GHashT::Resize(uint32 desiredSize) {
  GASSERT((desiredSize & (desiredSize - 1)) == 0); // Must be a power of 2
  GASSERT(m_population * 4 <= desiredSize * 3);
  // Get start/end pointers of old array
  CELL* oldCells = m_cells;
  CELL* end = m_cells + m_arraySize;
  // Allocate new array
  m_arraySize = desiredSize;
  m_cells = new CELL[m_arraySize];
  memset(m_cells, 0, sizeof(CELL) * m_arraySize);
  // Iterate through old array
  for (CELL* c = oldCells; c != end; c++) {
    if (c->key) {
      // Insert this element into new array
      for (CELL* cell = GIHASH_FIRST_CELL(integerHash(c->key));; cell = GIHASH_CIRCULAR_NEXT(cell)) {
        if (!cell->key) {
          // Insert here
          *cell = *c;
          break;
        }
      }
    }
  }
  delete[] oldCells; // Delete old array (m_cur is not updated here)
}

//--------------------------------------------------
// return next cell (requires startIterate() first)
//--------------------------------------------------
// Advances m_cur to the next occupied regular cell and returns it; returns NULL (and
// leaves m_cur NULL) when the end of the array is reached.
// NOTE: the cursor is moved *before* anything is returned, so the cell m_cur pointed
// to on entry -- including m_zeroCell -- is never returned by this function.
template CELL* GHashT::NextCell() {
  // Already finished?
  if (!m_cur) return m_cur;
  // Iterate past zero cell
  if (m_cur == &m_zeroCell) m_cur = & (m_cells[-1]); // one before the first cell; pre-incremented below
  // Iterate through the regular cells
  CELL* end = m_cells + m_arraySize;
  while (++m_cur != end) {
    if (m_cur->key) return m_cur;
  }
  // Finished
  return m_cur = NULL;
}

#endif
// (archive member header of the next file, kept as found)
gclib-0.12.7/GIntervalTree.hh000066400000000000000000000432531407072766100157360ustar00rootroot00000000000000#ifndef E_INTERVAL_TREE
#define E_INTERVAL_TREE
#include "GBase.h"
#include "GVec.hh"

// This is an interval tree implementation based on red-black-trees
// as described in the book _Introduction_To_Algorithms_ by Cormen, Leiserson, and Rivest.
// One node of the interval tree. The node is ordered by `key` (the interval start)
// and additionally tracks `maxHigh`, which the tree maintains as the largest `high`
// found in the subtree rooted at this node.
class GIntervalTreeNode {
  friend class GIntervalTree;
protected:
  GSeg* storedInterval; // the interval this node stands for (not deleted by the node)
  int key;              // interval start
  int high;             // interval end
  int maxHigh;          // max `high` over this node's subtree
  int red; /* if red=0 then the node is black */
  GIntervalTreeNode* left;
  GIntervalTreeNode* right;
  GIntervalTreeNode* parent;
public:
  // Debug dump of this node to stdout; `nil` and `root` are the tree's sentinels and
  // are printed as "NULL" when they appear as child/parent.
  void Print(GIntervalTreeNode* nil, GIntervalTreeNode* root) const {
    printf(", k=%i, h=%i, mH=%i",key,high,maxHigh);
    printf(" l->key=");
    if( left == nil) printf("NULL"); else printf("%i",left->key);
    printf(" r->key=");
    if( right == nil) printf("NULL"); else printf("%i",right->key);
    printf(" p->key=");
    if( parent == root) printf("NULL"); else printf("%i",parent->key);
    printf(" red=%i\n",red);
  }

  // Blank node (used for the tree's sentinels).
  GIntervalTreeNode():storedInterval(NULL), key(0), high(0),maxHigh(0),red(0),
    left(NULL), right(NULL), parent(NULL) {}

  // Node for newInterval: key/high are taken from its start/end.
  GIntervalTreeNode(GSeg * newInterval): storedInterval (newInterval),
    key(newInterval->start), high(newInterval->end) ,
    maxHigh(high), red(0), left(NULL), right(NULL), parent(NULL) { }

  ~GIntervalTreeNode() {}
};

struct G_ITRecursionNode {
public:
  // this structure stores the information needed when we take the
  // right branch in searching for intervals but possibly come back
  // and check the left branch as well.
  GIntervalTreeNode * start_node;
  unsigned int parentIndex;
  int tryRightBranch;
} ;

// Red-black interval tree of GSeg intervals.
class GIntervalTree {
private:
  // explicit stack used by Enumerate() instead of recursion
  unsigned int recursionNodeStackSize;
  G_ITRecursionNode * recursionNodeStack;
  unsigned int currentParent;
  unsigned int recursionNodeStackTop;
protected:
  // A sentinel is used for root and for nil. root->left should always
  // point to the node which is the root of the tree. nil points to a
  // node which should always be black but has arbitrary children and
  // parent and no key or info; These sentinels are used so
  // that the root and nil nodes do not require special treatment in the code
  GIntervalTreeNode* root;
  GIntervalTreeNode* nil;

  // INPUT: the node to rotate on
  // Rotates as described in _Introduction_To_Algorithms by
  // Cormen, Leiserson, Rivest (Chapter 14).
  // Basically this
  // makes the parent of x be to the left of x, x the parent of
  // its parent before the rotation and fixes other pointers
  // accordingly. Also updates the maxHigh fields of x and y
  // after rotation.
  // In terms of the code below: y = x->right takes x's place under x's parent and
  // x becomes y's left child.
  void LeftRotate(GIntervalTreeNode* x) {
    GIntervalTreeNode* y;
    // originally wrote this function to use the sentinel for
    // nil to avoid checking for nil. However this introduces a
    // very subtle bug because sometimes this function modifies
    // the parent pointer of nil. This can be a problem if a
    // function which calls LeftRotate also uses the nil sentinel
    // and expects the nil sentinel's parent pointer to be unchanged
    // after calling this function. For example, when DeleteFixUP
    // calls LeftRotate it expects the parent pointer of nil to be
    // unchanged.
    y=x->right;
    x->right=y->left;
    if (y->left != nil) y->left->parent=x; // used to use sentinel here
    // and do an unconditional assignment instead of testing for nil
    y->parent=x->parent;
    // instead of checking if x->parent is the root as in the book, we
    // count on the root sentinel to implicitly take care of this case
    if( x == x->parent->left) {
      x->parent->left=y;
    } else {
      x->parent->right=y;
    }
    y->left=x;
    x->parent=y;
    // x is now below y, so x's maxHigh must be recomputed first
    x->maxHigh=GMAX(x->left->maxHigh, GMAX(x->right->maxHigh,x->high));
    y->maxHigh=GMAX(x->maxHigh,GMAX(y->right->maxHigh,y->high));
  }

  // RightRotate is the mirror image of LeftRotate: x = y->left takes y's place under
  // y's parent, y becomes x's right child and the other pointers are fixed
  // accordingly. Also updates the maxHigh fields of x and y
  // after rotation.
void RightRotate(GIntervalTreeNode*y) { GIntervalTreeNode* x; x=y->left; y->left=x->right; if (nil != x->right) x->right->parent=y; //used to use sentinel here // and do an unconditional assignment instead of testing for nil // instead of checking if x->parent is the root as in the book, we // count on the root sentinel to implicitly take care of this case x->parent=y->parent; if( y == y->parent->left) { y->parent->left=x; } else { y->parent->right=x; } x->right=y; y->parent=x; y->maxHigh=GMAX(y->left->maxHigh,GMAX(y->right->maxHigh,y->high)); x->maxHigh=GMAX(x->left->maxHigh,GMAX(y->maxHigh,x->high)); } // Inserts z into the tree as if it were a regular binary tree // using the algorithm described in _Introduction_To_Algorithms_ // by Cormen et al. This function is only intended to be called // by the InsertTree function and not by the user void TreeInsertHelp(GIntervalTreeNode* z) { // this should only be called by the Insert method GIntervalTreeNode* x; GIntervalTreeNode* y; z->left=z->right=nil; y=root; x=root->left; while( x != nil) { y=x; if ( x->key > z->key) { x=x->left; } else { // x->key <= z->key x=x->right; } } z->parent=y; if ( (y == root) || (y->key > z->key) ) { y->left=z; } else { y->right=z; } #if defined(DEBUG_ASSERT) Assert(!nil->red,"nil not red in ITTreeInsertHelp"); Assert((nil->maxHigh=MIN_INT), "nil->maxHigh != MIN_INT in ITTreeInsertHelp"); #endif } void TreePrintHelper(GIntervalTreeNode* x) const { if (x != nil) { TreePrintHelper(x->left); x->Print(nil,root); TreePrintHelper(x->right); } } // FUNCTION: FixUpMaxHigh // INPUTS: x is the node to start from // EFFECTS: Travels up to the root fixing the maxHigh fields after // an insertion or deletion void FixUpMaxHigh(GIntervalTreeNode* x) { while(x != root) { x->maxHigh=GMAX(x->high,GMAX(x->left->maxHigh,x->right->maxHigh)); x=x->parent; } } // FUNCTION: DeleteFixUp // INPUTS: x is the child of the spliced // out node in DeleteNode. 
// EFFECT: Performs rotations and changes colors to restore red-black // properties after a node is deleted void DeleteFixUp(GIntervalTreeNode* x) { GIntervalTreeNode * w; GIntervalTreeNode * rootLeft = root->left; while( (!x->red) && (rootLeft != x)) { if (x == x->parent->left) { w=x->parent->right; if (w->red) { w->red=0; x->parent->red=1; LeftRotate(x->parent); w=x->parent->right; } if ( (!w->right->red) && (!w->left->red) ) { w->red=1; x=x->parent; } else { if (!w->right->red) { w->left->red=0; w->red=1; RightRotate(w); w=x->parent->right; } w->red=x->parent->red; x->parent->red=0; w->right->red=0; LeftRotate(x->parent); x=rootLeft; // this is to exit while loop } } else { // the code below is has left and right switched from above w=x->parent->left; if (w->red) { w->red=0; x->parent->red=1; RightRotate(x->parent); w=x->parent->left; } if ( (!w->right->red) && (!w->left->red) ) { w->red=1; x=x->parent; } else { if (!w->left->red) { w->right->red=0; w->red=1; LeftRotate(w); w=x->parent->left; } w->red=x->parent->red; x->parent->red=0; w->left->red=0; RightRotate(x->parent); x=rootLeft; // this is to exit while loop } } } x->red=0; } // Make sure the maxHigh fields for everything makes sense. void CheckMaxHighFields(GIntervalTreeNode * x) const { if (x != nil) { CheckMaxHighFields(x->left); if(!(CheckMaxHighFieldsHelper(x,x->maxHigh,0) > 0)) { GEXIT("Error found in CheckMaxHighFields.\n"); } CheckMaxHighFields(x->right); } } int CheckMaxHighFieldsHelper(GIntervalTreeNode * y, const int currentHigh, int match) const { if (y != nil) { match = CheckMaxHighFieldsHelper(y->left,currentHigh,match) ? 1 : match; GVERIFY(y->high <= currentHigh); if (y->high == currentHigh) match = 1; match = CheckMaxHighFieldsHelper(y->right,currentHigh,match) ? 
1 : match; } return match; }public: GIntervalTree():recursionNodeStackSize(128), recursionNodeStack(NULL), currentParent(0), recursionNodeStackTop(1), root(new GIntervalTreeNode), nil(new GIntervalTreeNode) { //nil = new IntervalTreeNode; nil->left = nil->right = nil->parent = nil; nil->red = 0; nil->key = nil->high = nil->maxHigh = INT_MIN; nil->storedInterval = NULL; //root = new IntervalTreeNode; root->parent = root->left = root->right = nil; root->key = root->high = root->maxHigh = INT_MAX; root->red=0; root->storedInterval = NULL; /* the following are used for the Enumerate function */ //recursionNodeStackSize = 128; GMALLOC(recursionNodeStack, recursionNodeStackSize*sizeof(G_ITRecursionNode)); //recursionNodeStackTop = 1; recursionNodeStack[0].start_node = NULL; } ~GIntervalTree() { GIntervalTreeNode * x = root->left; GVec stuffToFree; if (x != nil) { if (x->left != nil) { stuffToFree.Push(x->left); } if (x->right != nil) { stuffToFree.Push(x->right); } // delete x->storedInterval; delete x; while( stuffToFree.Count()>0 ) { x = stuffToFree.Pop(); if (x->left != nil) { stuffToFree.Push(x->left); } if (x->right != nil) { stuffToFree.Push(x->right); } // delete x->storedInterval; delete x; } } delete nil; delete root; GFREE(recursionNodeStack); } void Print() const { TreePrintHelper(root->left); } // FUNCTION: DeleteNode // // INPUTS: tree is the tree to delete node z from // OUTPUT: returns the Interval stored at deleted node // EFFECT: Deletes z from tree and but don't call destructor // Then calls FixUpMaxHigh to fix maxHigh fields then calls // DeleteFixUp to restore red-black properties GSeg* DeleteNode(GIntervalTreeNode* z) { GIntervalTreeNode* y; GIntervalTreeNode* x; GSeg* returnValue = z->storedInterval; y= ((z->left == nil) || (z->right == nil)) ? z : GetSuccessorOf(z); x= (y->left == nil) ? 
y->right : y->left; if (root == (x->parent = y->parent)) { // assignment of y->p to x->p is intentional root->left=x; } else { if (y == y->parent->left) { y->parent->left=x; } else { y->parent->right=x; } } if (y != z) { // y should not be nil in this case #ifdef DEBUG_ASSERT Assert( (y!=nil),"y is nil in DeleteNode \n"); #endif // y is the node to splice out and x is its child y->maxHigh = INT_MIN; y->left=z->left; y->right=z->right; y->parent=z->parent; z->left->parent=z->right->parent=y; if (z == z->parent->left) { z->parent->left=y; } else { z->parent->right=y; } FixUpMaxHigh(x->parent); if (!(y->red)) { y->red = z->red; DeleteFixUp(x); } else y->red = z->red; delete z; } else { FixUpMaxHigh(x->parent); if (!(y->red)) DeleteFixUp(x); delete y; } return returnValue; } // Before calling InsertNode the node x should have its key set // FUNCTION: InsertNode // INPUT: newInterval is the interval to insert // OUTPUT: This function returns a pointer to the newly inserted node // which is guaranteed to be valid until this node is deleted. // What this means is if another data structure stores this // pointer then the tree does not need to be searched when this // is to be deleted. // EFFECTS: Creates a node node which contains the appropriate key and // info pointers and inserts it into the tree. 
GIntervalTreeNode * Insert(GSeg* newInterval) {
  GIntervalTreeNode* y; // uncle of x during the fix-up
  GIntervalTreeNode* newNode;
  GIntervalTreeNode* x = new GIntervalTreeNode(newInterval);
  TreeInsertHelp(x);
  FixUpMaxHigh(x->parent);
  newNode = x;
  x->red=1;
  // standard red-black insertion fix-up
  while(x->parent->red) { // use sentinel instead of checking for root
    if (x->parent == x->parent->parent->left) {
      y=x->parent->parent->right;
      if (y->red) {
        x->parent->red=0;
        y->red=0;
        x->parent->parent->red=1;
        x=x->parent->parent;
      } else {
        if (x == x->parent->right) {
          x=x->parent;
          LeftRotate(x);
        }
        x->parent->red=0;
        x->parent->parent->red=1;
        RightRotate(x->parent->parent);
      }
    } else { // case for x->parent == x->parent->parent->right
      // this part is just like the section above with
      // left and right interchanged
      y=x->parent->parent->left;
      if (y->red) {
        x->parent->red=0;
        y->red=0;
        x->parent->parent->red=1;
        x=x->parent->parent;
      } else {
        if (x == x->parent->left) {
          x=x->parent;
          RightRotate(x);
        }
        x->parent->red=0;
        x->parent->parent->red=1;
        LeftRotate(x->parent->parent);
      }
    }
  }
  root->left->red=0; // the real tree root is always black
  return(newNode);
}

// FUNCTION: GetPredecessorOf
// INPUTS: x is the node to get predecessor of
// OUTPUT: This function returns the in-order predecessor of x, or the nil
//         sentinel if no predecessor exists.
// FIX: this function used to contain the successor search (minimum of the right
//      subtree) although it is named GetPredecessorOf; it now does what its name says.
GIntervalTreeNode * GetPredecessorOf(GIntervalTreeNode* x) const {
  GIntervalTreeNode* y;
  if (nil != (y = x->left)) { // assignment to y is intentional
    while(y->right != nil) { // returns the maximum of the left subtree of x
      y=y->right;
    }
    return(y);
  } else {
    // climb until we leave a right subtree; that ancestor is the predecessor
    y=x->parent;
    while(x == y->left) {
      if (y == root) return(nil); // x was the smallest node
      x=y;
      y=y->parent;
    }
    return(y);
  }
}

// FUNCTION: GetSuccessorOf
// INPUTS: x is the node we want the successor of
// OUTPUT: This function returns the in-order successor of x, or the nil
//         sentinel if no successor exists.
// FIX: this function used to contain the predecessor search (maximum of the left
//      subtree) although it is named GetSuccessorOf; it now does what its name says.
//      DeleteNode only needs an in-order neighbour that has at most one child, which
//      holds for the successor of a node with two children just as for its predecessor.
GIntervalTreeNode * GetSuccessorOf(GIntervalTreeNode* x) const {
  GIntervalTreeNode* y;
  if (nil != (y = x->right)) { // assignment to y is intentional
    while(y->left != nil) { // returns the minimum of the right subtree of x
      y=y->left;
    }
    return(y);
  } else {
    // climb until we leave a left subtree; that ancestor is the successor
    y=x->parent;
    while(x == y->right) { // sentinel used instead of checking for nil
      x=y;
      y=y->parent;
    }
    if (y == root) return(nil); // x was the largest node
    return(y);
  }
}

// FUNCTION: Enumerate
// INPUTS: the closed interval [low,high] to look for overlaps with
// OUTPUT: newly allocated stack (vector) containing pointers to the stored
//         intervals overlapping [low,high]; the caller owns it
// EFFECT: Returns a stack containing pointers to
//         intervals which overlap [low,high] in O(max(N,k*log(N)))
//         where N is the number of intervals in the tree and k is
//         the number of overlapping intervals
// Note: This basic idea for this function comes from the
//       _Introduction_To_Algorithms_ book by Cormen et al, but
//       modifications were made to return all overlapping intervals
//       instead of just the first overlapping interval as in the
//       book. The natural way to do this would require recursive
//       calls of a basic search function. I translated the
//       recursive version into an iterative version with a stack
//       as described below.
// The basic idea for the function below is to take the IntervalSearch
// function from the book and modify to find all overlapping intervals
// instead of just one. This means that any time we take the left
// branch down the tree we must also check the right branch if and only if
// we find an overlapping interval in that left branch. Note this is a
// recursive condition because if we go left at the root then go left
// again at the first left child and find an overlap in the left subtree
// of the left child of root we must recursively check the right subtree
// of the left child of root as well as the right child of root.
GVec * Enumerate(int low, int high) { GVec * enumResultStack; GIntervalTreeNode* x=root->left; int stuffToDo = (x != nil); // Possible speed up: add min field to prune right searches #ifdef DEBUG_ASSERT Assert((recursionNodeStackTop == 1), "recursionStack not empty when entering IntervalTree::Enumerate"); #endif currentParent = 0; enumResultStack = new GVec(4); while(stuffToDo) { //if (Overlap(low,high,x->key,x->high) ) { if (low<=x->high && x->key<=high) { enumResultStack->Push(x->storedInterval); recursionNodeStack[currentParent].tryRightBranch=1; } if(x->left->maxHigh >= low) { // implies x != nil if ( recursionNodeStackTop == recursionNodeStackSize ) { recursionNodeStackSize *= 2; recursionNodeStack = (G_ITRecursionNode *) realloc(recursionNodeStack, recursionNodeStackSize * sizeof(G_ITRecursionNode)); if (recursionNodeStack == NULL) GEXIT("realloc failed in IntervalTree::Enumerate\n"); } recursionNodeStack[recursionNodeStackTop].start_node = x; recursionNodeStack[recursionNodeStackTop].tryRightBranch = 0; recursionNodeStack[recursionNodeStackTop].parentIndex = currentParent; currentParent = recursionNodeStackTop++; x = x->left; } else { x = x->right; } stuffToDo = (x != nil); while( (!stuffToDo) && (recursionNodeStackTop > 1) ) { if(recursionNodeStack[--recursionNodeStackTop].tryRightBranch) { x=recursionNodeStack[recursionNodeStackTop].start_node->right; currentParent=recursionNodeStack[recursionNodeStackTop].parentIndex; recursionNodeStack[currentParent].tryRightBranch=1; stuffToDo = ( x != nil); } } } #ifdef DEBUG_ASSERT Assert((recursionNodeStackTop == 1), "recursionStack not empty when exiting IntervalTree::Enumerate"); #endif return(enumResultStack); } }; #endif gclib-0.12.7/GList.hh000066400000000000000000000526611407072766100142500ustar00rootroot00000000000000//--------------------------------------------------------------------------- /* Sortable collections of objects and object pointers */ #ifndef _GList_HH #define _GList_HH #include "GVec.hh" 
#define GLIST_SORTED_ERR "Operation not allowed on a sorted list!\n"
#define GLIST_UNSORTED_ERR "Operation not allowed on an unsorted list!\n"

//------ useful macros:
// Guards for void member functions: report an error through GError and return when
// the container is in the wrong mode (sorted means fCompareProc is set).
#define BE_UNSORTED if (fCompareProc!=NULL) { GError(GLIST_SORTED_ERR); return; }
#define BE_SORTED if (fCompareProc==NULL) { GError(GLIST_UNSORTED_ERR); return; }

#define SORTED (fCompareProc!=NULL)
#define UNSORTED (fCompareProc==NULL)

// GArray is the sortable array type, requires the comparison operator < to be defined
// The array is "sorted" whenever fCompareProc is non-NULL.
// NOTE(review): template parameter/argument lists are truncated in this copy of the
// file ("template class GArray:public GVec {"); code tokens are kept as found.
template class GArray:public GVec {
  protected:
    bool fUnique;
    // Three-way comparison built only from operator<: 1 if item1 is greater, -1 if
    // it is smaller, 0 otherwise.
    static int DefaultCompareProc(const pointer item1, const pointer item2) {
      //operator< MUST be defined for OBJ class!
      if (*((OBJ*)item2) < *((OBJ*)item1)) return 1;
      else if (*((OBJ*)item1) < *((OBJ*)item2)) return -1;
      else return 0;
    }
    GCompareProc* fCompareProc;
  public:
    GArray(GCompareProc* cmpFunc=NULL);
    GArray(bool sorted, bool unique=false);
    GArray(int init_capacity, bool sorted, bool unique=false);
    GArray(const GArray& array); //copy constructor
    GArray& operator=(const GArray& array);
    //~GArray();
    //assignment operator
    void setSorted(GCompareProc* cmpFunc);
    // Switches sorting on (with DefaultCompareProc, re-sorting only if that was not
    // already the active comparison) or off.
    void setSorted(bool sorted) {
      if (sorted) {
        if (fCompareProc!=&DefaultCompareProc) {
          fCompareProc=&DefaultCompareProc;
          Sort();
        }
      }
      else fCompareProc=NULL;
    }
    //sort the array if cmpFunc not NULL or changes
    int Add(OBJ* item); // specific implementation if sorted
    int Add(OBJ& item) { return Add(&item); } //both will CREATE a new OBJ and COPY to it
                       // using OBJ new operator=
    int AddIfNew(OBJ& item, int* fidx=NULL); //requires == operator
        //if equal item not found, item is added and return the index of it
        //otherwise returns -1 and fidx is set to the equal item location
    // by-value conveniences forwarding to Add(OBJ*)
    int cAdd(OBJ item) { return Add(&item); }
    int cPush(OBJ item) { return Add(&item); }
    int Push(OBJ& item) { return Add(&item); }

    void Add(GArray& list); //add copies of all items from another list
    //this will reject identical items in sorted lists only!
    void setUnique(bool beUnique) { fUnique = beUnique; };
    void Sort(); //explicit sort may be requested
    bool Sorted() { return fCompareProc!=NULL; }
    void Replace(int idx, OBJ& item); //Put, use operator= to copy
    int Unique() { return fUnique; }
    int IndexOf(OBJ& item);
         //this needs the == operator to have been defined for OBJ
    bool Found(OBJ& item, int& idx); // for sorted arrays only;
         //search by content; if found, returns true and idx will be the index
         //of the first item found matching for which fCompareProc returns 0
    bool Exists(OBJ& item); //same as above without existing index info
    //unsorted only, place item at position idx:
    void Move(int curidx, int newidx);
    void Insert(int idx, OBJ* item);
    void Insert(int idx, OBJ item) { Insert(idx,&item); }
};

//GList is a sortable collection of pointers to objects; requires operator< to be defined, or a custom compare function
// (the remainder of this class continues past this point)
template class GList:public GPVec {
  protected:
    bool fUnique;
    GCompareProc* fCompareProc; //a pointer to a Compare function
    // Three-way comparison built only from operator< (same as in GArray above).
    static int DefaultCompareProc(const pointer item1, const pointer item2) {
      //operator< MUST be defined for OBJ class!
      if (*((OBJ*)item2) < *((OBJ*)item1)) return 1;
      else if (*((OBJ*)item1) < *((OBJ*)item2)) return -1;
      else return 0;
    }
  public:
    void sortInsert(int idx, OBJ* item); //special insert in sorted lists
         //WARNING: the caller must know the insert index such that the sort order is preserved!
GList(GCompareProc* compareProc=NULL); //free by default
    GList(GCompareProc* compareProc, //unsorted by default
          GFreeProc *freeProc, bool beUnique=false);
    GList(bool sorted, bool free_elements=true, bool beUnique=false);
    GList(int init_capacity, bool sorted, bool free_elements=true, bool beUnique=false);
    GList(const GList& list); //copy constructor
    GList(GList&& list); //move constructor
    GList(GList* list); //kind of a copy constructor
    GList& operator=(GList& list); //copy operator
    GList& operator=(GList&& list); //move operator
    //void Clear();
    //~GList();
    void setSorted(GCompareProc* compareProc);
       //sorted if compareProc not NULL; sort the list if compareProc changes !
    bool Sorted() { return fCompareProc!=NULL; }
    // Switch to the default (operator<) ordering, sorting if it was not
    // already in use, or drop the ordering entirely.
    void setSorted(bool sorted) {
      if (sorted) {
        if (fCompareProc!=&DefaultCompareProc) {
          fCompareProc=&DefaultCompareProc;
          Sort();
        }
      }
      else fCompareProc=NULL;
    }
    int Add(OBJ* item); //-- specific implementation if sorted - may become an Insert()
    void Add(GList& list); //add all pointers from another list
    OBJ* AddIfNew(OBJ* item, bool deleteIfFound=true, int* fidx=NULL);
       // default: delete item if Found() (and pointers are not equal)!
       //returns the equal (==) object if it's in the list already
       //or the item itself if it is unique and actually added
    int AddedIfNew(OBJ* item);
       // if Found(item) (and pointers are not equal) delete item and returns -1
       // if added, returns the new item index
    int Unique() { return fUnique; }
    //this will reject identical items in sorted lists only!
void setUnique(bool beUnique) { fUnique = beUnique; }; GCompareProc* GetCompareProc() {return fCompareProc;} int IndexOf(OBJ* item); //this has a specific implementation for sorted lists //if list is sorted, item data is located by binary search //based on the Compare function //if not, a linear search is performed, but //this needs the == operator to have been defined for OBJ void Put(int idx, OBJ* item, bool re_sort=false); bool Found(OBJ* item, int & idx); // sorted only; //search by content; if found, returns true and idx will be the index //of the first item found matching for which GTCompareProc returns 0 bool Exists(OBJ* item); //same as above without existing index info bool Exists(OBJ& item); //same as above without existing index info void Sort(); //explicit sort may be requested using this function int Remove(OBJ* item); //search for pointer, using binary search if sorted void Insert(int idx, OBJ* item); //unsorted only, place item at position idx void Move(int curidx, int newidx); }; //GList //-------------------- TEMPLATE IMPLEMENTATION------------------------------- template GArray::GArray(const GArray& array):GVec(0) { //copy constructor this->fCount=array.fCount; this->fCapacity=array.fCapacity; this->fArray=NULL; if (this->fCapacity>0) { this->fArray=new OBJ[this->fCapacity]; } this->fCount=array.fCount; fUnique=array.fUnique; fCompareProc=array.fCompareProc; // uses OBJ operator= for (int i=0;ifCount;i++) this->fArray[i]=array[i]; } template GArray& GArray::operator=(const GArray& array) { if (&array==this) return *this; GVec::Clear(); this->fCount=array.fCount; this->fUnique=array.fUnique; this->fCapacity=array.fCapacity; if (this->fCapacity>0) { //GMALLOC(this->fArray, this->fCapacity*sizeof(OBJ)); this->fArray=new OBJ[this->fCapacity]; } this->fCompareProc=array.fCompareProc; this->fCount=array.fCount; // uses OBJ operator= for (int i=0;ifCount;i++) { this->fArray[i]=array[i]; } return *this; } template GArray::GArray(GCompareProc* 
cmpFunc):GVec(0) { fCompareProc = cmpFunc; fUnique = false; //only affects sorted lists } template GArray::GArray(bool sorted, bool unique):GVec(0) { fUnique=unique; fCompareProc = sorted ? DefaultCompareProc : NULL; } template GArray::GArray(int init_capacity, bool sorted, bool unique):GVec(init_capacity) { fUnique=unique; fCompareProc=sorted ? DefaultCompareProc : NULL; } template void GArray::setSorted(GCompareProc* cmpFunc) { GCompareProc* old_proc=fCompareProc; fCompareProc=cmpFunc; if (fCompareProc!=old_proc && fCompareProc!=NULL) Sort(); //new compare method } template int GArray::IndexOf(OBJ& item) { int result=0; if (Found(item, result)) return result; else return -1; } template bool GArray::Exists(OBJ& item) { int result=0; if (Found(item, result)) return true; else return false; } template int GArray::Add(OBJ* item) { if (item==NULL) return -1; int result; if (SORTED) { if (Found(*item, result)) if (fUnique) return -1; //cannot add a duplicate! //Found sets result to the position where the item should be! 
GVec::Insert(result, *item); } else { if (fUnique && Found(*item, result)) return -1; //set behaviour result = this->fCount; if (result==this->fCapacity) GVec::Grow(); this->fArray[result] = *item; //operator=, copies the item this->fCount++; } return result; } template void GArray::Add(GArray& list) { if (list.Count()==0) return; if (SORTED) { for (int i=0;isetCapacity(this->fCapacity+list.fCount); int s=this->fCount; for (int i=0;ifArray[s+i]=list.fArray[i]; this->fCount+=list.fCount; } } //returns -1 if existing equal object exists, sets fidx to that equal item index //or returns the index where the item was added/inserted template int GArray::AddIfNew(OBJ& item, int* fidx) { int rpos; if (Found(item, rpos)) { if (fidx) *fidx=rpos; //the position where the item should be inserted: return -1; //found and not added } //not found, let's insert it if (SORTED) { //Found() set result to the position where the item should be inserted GVec::Insert(rpos, item); } else { //simply append rpos = this->fCount; if (rpos==this->fCapacity) GVec::Grow(); this->fArray[rpos] = item; //operator= copies the item this->fCount++; } if (fidx!=NULL) *fidx=rpos; return rpos; } template bool GArray::Found(OBJ& item, int& idx) { //search the list by using fCompareProc (if defined) //or == operator for a non-sortable list //for sorted lists, even when the result is false, the idx is //set to the closest matching object! int i; idx=-1; if (this->fCount==0) { idx=0; return false;} if (SORTED) { //binary search based on fCompareProc //do the simplest tests first: if ((*fCompareProc)(&(this->fArray[0]),&item)>0) { idx=0; return false; } if ((*fCompareProc)(&item, &(this->fArray[this->fCount-1]))>0) { idx=this->fCount; return false; } int l=0; int h = this->fCount - 1; int c; while (l <= h) { i = (l + h) >> 1; c = (*fCompareProc)(&(this->fArray[i]), &item); if (c < 0) l = i + 1; else { h = i - 1; if (c == 0) { //found! 
idx=i; return true; } } } //while idx = l; return false; } else {//not sorted: use linear search // needs == operator to compare user defined objects ! i=0; while (ifCount) { if (this->fArray[i]==item) { //requires operator== idx=i; return true; } i++; } return false; } } template void GArray::Insert(int idx, OBJ* item) { //idx can be [0..fCount] so an item can be actually added BE_UNSORTED; //forbid this operation on sorted data GVec::Insert(idx, item); } template void GArray::Move(int curidx, int newidx) { BE_UNSORTED; //cannot do this in a sorted list! if (curidx!=newidx || newidx>=this->fCount) GError(GVEC_INDEX_ERR, newidx); OBJ tmp=this->fArray[curidx]; //copy constructor here this->fArray[curidx]=this->fArray[newidx]; this->fArray[newidx]=tmp; } template void GArray::Replace(int idx, OBJ& item) { //TEST_INDEX(idx); if (idx<0 || idx>=this->fCount) GError(GVEC_INDEX_ERR, __FILE__,__LINE__, idx); this->fArray[idx]=item; if ( SORTED ) Sort(); //re-sort ! this could be very expensive, don't do it } template void GArray::Sort() { if (fCompareProc==NULL) { fCompareProc=DefaultCompareProc; } if (this->fArray!=NULL && this->fCount>0) this->qSort(0, this->fCount-1, fCompareProc); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //*=> GList implementation -- sortable array of pointers to OBJ template GList::GList(const GList& list):GPVec(list) { //copy constructor fUnique=list.fUnique; fCompareProc=list.fCompareProc; } template GList::GList(GList&& list):GPVec(list) { //move constructor fUnique=list.fUnique; fCompareProc=list.fCompareProc; } template GList::GList(GList* plist):GPVec(*plist) { fUnique=plist->fUnique; fCompareProc=plist->fCompareProc; } template void GList::Add(GList& list) { if (list.Count()==0) return; for (int i=0;i GList::GList(GCompareProc* compareProc, GFreeProc* freeProc, bool beUnique) { fCompareProc = compareProc; this->fFreeProc = freeProc; fUnique = beUnique; //only affects sorted lists } template 
GList::GList(GCompareProc* compareProc) { fCompareProc = compareProc; this->fFreeProc = GPVec::DefaultFreeProc; fUnique = false; //only affects sorted lists } template GList::GList(bool sorted, bool free_elements, bool beUnique) { if (sorted) { if (free_elements) { fCompareProc=&DefaultCompareProc; this->fFreeProc = GPVec::DefaultFreeProc; fUnique=beUnique; } else { fCompareProc=&DefaultCompareProc; this->fFreeProc=NULL; fUnique=beUnique; } } else { if (free_elements) { fCompareProc=NULL; this->fFreeProc=GPVec::DefaultFreeProc; fUnique=beUnique; } else { fCompareProc=NULL; this->fFreeProc=NULL; fUnique=beUnique; } } } template GList::GList(int init_capacity, bool sorted, bool free_elements, bool beUnique):GPVec(init_capacity, free_elements) { if (sorted) { fCompareProc=&DefaultCompareProc; fUnique=beUnique; } else { fCompareProc=NULL; fUnique=beUnique; } } template GList& GList::operator=(GList& list) { if (&list!=this) { GPVec::Clear(); fCompareProc=list.fCompareProc; this->fFreeProc=list.fFreeProc; //Attention: the object pointers are copied directly, //but the actual objects are NOT duplicated for (int i=0;i GList& GList::operator=(GList&& list) { if (&list!=this) { GPVec::Clear(); fCompareProc=list.fCompareProc; this->fCount=list.fCount; this->fFreeProc=list.fFreeProc; this->fList=list.fList; list.fList=NULL; list.fCount=0; list.fCapacity=0; } return *this; } template void GList::setSorted(GCompareProc* compareProc) { GCompareProc* old_proc=fCompareProc; fCompareProc=compareProc; if (fCompareProc!=old_proc && fCompareProc!=NULL) Sort(); //new compare method } template int GList::IndexOf(OBJ* item) { int result=0; if (Found(item, result)) return result; else return -1; } template bool GList::Exists(OBJ& item) { int result=0; if (Found(&item, result)) return true; else return false; } template bool GList::Exists(OBJ* item) { int result=0; if (Found(item, result)) return true; else return false; } template int GList::Add(OBJ* item) { int result; if (item==NULL) 
return -1; if (SORTED) { if (Found(item, result)) if (fUnique) return -1; //duplicates forbidden //Found sets result to the position where the item should be! sortInsert(result, item); } else { if (fUnique && Found(item,result)) return -1; //set behaviour result = this->fCount; if (result==this->fCapacity) GPVec::Grow(); this->fList[result]=item; this->fCount++; } return result; } //by default, it deletes item if it an equal is found in the list! //returns the existing equal (==) object if it's in the list already //or returns the item itself if it's unique (and adds it) template OBJ* GList::AddIfNew(OBJ* item, bool deleteIfFound, int* fidx) { int r; if (Found(item, r)) { if (deleteIfFound && (pointer)item != (pointer)(this->fList[r])) { this->deallocate_item(item); } if (fidx!=NULL) *fidx=r; return this->fList[r]; //found } //not found: if (SORTED) { //Found() set result to the position where the item should be inserted: sortInsert(r, item); } else { r = this->fCount; if (r==this->fCapacity) GPVec::Grow(); this->fList[r]=item; this->fCount++; } if (fidx!=NULL) *fidx=r; return item; } //if item is found already in the list DELETE it and return -1 //otherwise the item is added and its index is returned template int GList::AddedIfNew(OBJ* item) { int r; if (Found(item, r)) { if ((pointer)item != (pointer)(this->fList[r])) { this->deallocate_item(item); } return -1; } //not found: if (SORTED) { //Found() set r to the position where the item should be inserted: sortInsert(r, item); } else { r = this->fCount; if (r==this->fCapacity) GPVec::Grow(); this->fList[r]=item; this->fCount++; } return r; } template bool GList::Found(OBJ* item, int& idx) { //search the list by using fCompareProc (if defined) //or == operator for a non-sortable list //for sorted lists, even when the result is false, the idx is //set to the closest matching object! 
int i; idx=-1; if (this->fCount==0) { idx=0;return false;} if (SORTED) { //binary search based on fCompareProc //do the simple test first: if ((*fCompareProc)(this->fList[0],item)>0) { idx=0; return false; } if ((*fCompareProc)(item, this->fList[this->fCount-1])>0) { idx=this->fCount; return false; } int l, h, c; l = 0; h = this->fCount - 1; while (l <= h) { i = l + ((h-l)>>1); c = (*fCompareProc)(this->fList[i], item); if (c < 0) l = i + 1; else { h = i - 1; if (c == 0) { idx=i; return true; } } } //while idx = l; return false; } else {//not sorted: use linear search // needs == operator to compare user defined objects ! i=0; while (ifCount) { if (*this->fList[i]==*item) { idx=i; return true; } i++; } return false; } } template void GList::sortInsert(int idx, OBJ* item) { //idx must be the new position this new item must have //so the allowed range is [0..fCount] //the current fList[idx] and all the above will be shifted +1 if (idx<0 || idx>this->fCount) GError(GVEC_INDEX_ERR, idx); if (this->fCount==this->fCapacity) { GPVec::Grow(idx, item); //expand and also copy/move data and insert the new item return; } //room still left, just move data around and insert the new one if (idxfCount) //copy/move pointers only! memmove(&(this->fList[idx+1]), &(this->fList[idx]), (this->fCount-idx)*sizeof(OBJ*)); this->fList[idx]=item; this->fCount++; } template void GList::Insert(int idx, OBJ* item) { //idx can be [0..fCount] so an item can be actually added BE_UNSORTED; //cannot do that with a sorted list! GPVec::Insert(idx,item); } template void GList::Move(int curidx, int newidx) { BE_UNSORTED; //cannot do this in a sorted list! GPVec::Move(curidx,newidx); } template void GList::Put(int idx, OBJ* item, bool re_sort) { //WARNING: this will never free the replaced item! 
// this may BREAK the sort order unless the "re_sort" parameter is given if (idx<0 || idx>this->fCount) GError(GVEC_INDEX_ERR, idx); this->fList[idx]=item; if (SORTED && item!=NULL && re_sort) Sort(); //re-sort } template int GList::Remove(OBJ* item) { //removes an item if it's in our list int result=IndexOf(item); if (result>=0) GPVec::Delete(result); return result; } template void GList::Sort() { if (fCompareProc==NULL) fCompareProc = DefaultCompareProc; if (this->fList!=NULL && this->fCount>0) this->qSort(0, this->fCount-1, fCompareProc); } //--------------------------------------------------------------------------- #endif gclib-0.12.7/GResUsage.cpp000066400000000000000000000211711407072766100152260ustar00rootroot00000000000000#include "GResUsage.h" #if defined(__APPLE__) && defined(__MACH__) #include #include #include #include #ifndef MAC_OS_X_VERSION_10_12 #define MAC_OS_X_VERSION_10_12 101200 #endif #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 #define G_gettime(s) clock_gettime(CLOCK_MONOTONIC, &s); #else #include #define MACHGT_NANO (+1.0E-9) #define MACHGT_GIGA UINT64_C(1000000000) void mach_gettime( struct timespec* t) { // be more careful in a multithreaded environement static double machgt_timebase = 0.0; static uint64_t machgt_timestart = 0; if (!machgt_timestart) { mach_timebase_info_data_t tb; tb.numer=0;tb.denom=0; mach_timebase_info(&tb); machgt_timebase = tb.numer; machgt_timebase /= tb.denom; machgt_timestart = mach_absolute_time(); } ; double diff = (mach_absolute_time() - machgt_timestart) * machgt_timebase; t->tv_sec = diff * MACHGT_NANO; t->tv_nsec = diff - (t->tv_sec * MACHGT_GIGA); } #define G_gettime(s) mach_gettime(&s) #endif #else #ifdef _WIN32 //Windows implementation: #include LARGE_INTEGER getFILETIMEoffset() { SYSTEMTIME s; FILETIME f; LARGE_INTEGER t; s.wYear = 1970; s.wMonth = 1; s.wDay = 1; s.wHour = 0; s.wMinute = 0; s.wSecond = 0; s.wMilliseconds = 0; SystemTimeToFileTime(&s, &f); t.QuadPart = f.dwHighDateTime; 
t.QuadPart <<= 32;
  t.QuadPart |= f.dwLowDateTime;
  return (t);
}

// Splits a FILETIME tick count into seconds and microseconds:
// 10000000 ticks per second, 10 ticks per microsecond.
static void usage_to_timeval(FILETIME *ft, struct timeval *tv) {
  ULARGE_INTEGER time;
  time.LowPart = ft->dwLowDateTime;
  time.HighPart = ft->dwHighDateTime;
  tv->tv_sec = time.QuadPart / 10000000;
  tv->tv_usec = (time.QuadPart % 10000000) / 10;
}

//implementation of getrusage for Windows
// Zeroes *usage, then fills ru_stime/ru_utime (plus ru_majflt and ru_maxrss
// for RUSAGE_SELF). Returns 0 on success and -1 on an API failure or for any
// other value of who.
int getrusage(int who, rusage *usage) {
  FILETIME creation_time, exit_time, kernel_time, user_time;
  PROCESS_MEMORY_COUNTERS pmc;
  memset(usage, 0, sizeof(rusage));
  if (who == RUSAGE_SELF) {
    if (!GetProcessTimes(GetCurrentProcess(), &creation_time, &exit_time, &kernel_time, &user_time)) {
      GMessage("Error: GetProcessTimes() failed!\n");
      return -1;
    }
    if (!GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(pmc))) {
      GMessage("Error: GetProcessMemoryInfo() failed!\n");
      return -1;
    }
    usage_to_timeval(&kernel_time, &usage->ru_stime);
    usage_to_timeval(&user_time, &usage->ru_utime);
    usage->ru_majflt = pmc.PageFaultCount;
    usage->ru_maxrss = pmc.PeakWorkingSetSize / 1024; // stored divided by 1024
    return 0;
  } else if (who == RUSAGE_THREAD) {
    if (!GetThreadTimes(GetCurrentThread(), &creation_time, &exit_time, &kernel_time, &user_time)) {
      GMessage("Error: GetThreadTimes() failed!\n");
      return -1;
    }
    usage_to_timeval(&kernel_time, &usage->ru_stime);
    usage_to_timeval(&user_time, &usage->ru_utime);
    return 0;
  } else {
    return -1;
  }
}

// Fills *ts with the time elapsed since a starting offset captured on the
// first call. Uses the performance counter when QueryPerformanceFrequency()
// succeeds, otherwise the system FILETIME clock.
// The static state below is initialised on first use without any locking.
void win_gettime(struct timespec* ts) {
  LARGE_INTEGER t;
  FILETIME f;
  double microseconds;
  static LARGE_INTEGER offset;
  static double frequencyToMicroseconds;
  static int initialized = 0;
  static BOOL usePerformanceCounter = 0;
  if (!initialized) {
    LARGE_INTEGER performanceFrequency;
    initialized = 1;
    usePerformanceCounter = QueryPerformanceFrequency(&performanceFrequency);
    if (usePerformanceCounter) {
      QueryPerformanceCounter(&offset);
      frequencyToMicroseconds = (double)performanceFrequency.QuadPart / 1000000.;
    } else {
      offset = getFILETIMEoffset();
      frequencyToMicroseconds = 10.; // divisor applied to FILETIME ticks below
    }
  }
  if (usePerformanceCounter) QueryPerformanceCounter(&t);
  else
{ GetSystemTimeAsFileTime(&f); t.QuadPart = f.dwHighDateTime; t.QuadPart <<= 32; t.QuadPart |= f.dwLowDateTime; } t.QuadPart -= offset.QuadPart; microseconds = (double)t.QuadPart / frequencyToMicroseconds; t.QuadPart = microseconds; ts->tv_sec = t.QuadPart / 1000000; ts->tv_nsec = (t.QuadPart % 1000000)*1000; } #define G_gettime(s) win_gettime(&s) #else //assume Linux compatible #define G_gettime(s) clock_gettime(CLOCK_MONOTONIC, &s) #endif #endif // Returns the peak (maximum so far) resident set size (physical // memory use) measured in bytes size_t getPeakMemUse() { #if defined(_WIN32) // -- Windows PROCESS_MEMORY_COUNTERS info; GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); return (size_t)info.PeakWorkingSetSize; #elif defined(__APPLE__) && defined(__MACH__) && defined(MACH_TASK_BASIC_INFO) struct mach_task_basic_info info; mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT; if ( task_info( mach_task_self( ), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount ) != KERN_SUCCESS ) return (size_t)info.resident_size_max; else return (size_t)0L; // Can't access? 
#else // defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) // asssume BSD, Linux, or OSX struct rusage rusage; getrusage( RUSAGE_SELF, &rusage ); #if defined(__APPLE__) return (size_t)(rusage.ru_maxrss); #else //linux returns this in kilobytes return (size_t)(rusage.ru_maxrss*1024L); #endif #endif } /** * Returns the current resident set size (physical memory use) measured * in bytes */ size_t getCurrentMemUse() { #if defined(_WIN32) // -- Windows PROCESS_MEMORY_COUNTERS info; GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); return (size_t)info.WorkingSetSize; #elif defined(__APPLE__) && defined(__MACH__) #if defined MACH_TASK_BASIC_INFO struct mach_task_basic_info info; #else struct task_basic_info info; #define MACH_TASK_BASIC_INFO TASK_BASIC_INFO #define MACH_TASK_BASIC_INFO_COUNT TASK_BASIC_INFO_COUNT #endif mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT; if ( task_info( mach_task_self( ), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount ) != KERN_SUCCESS ) return (size_t)0L; // Can't access? return (size_t)info.resident_size; #else //-- assume Linux long progsize = 0L; FILE* fp = NULL; if ( (fp = fopen( "/proc/self/statm", "r" )) == NULL ) return (size_t)0L; /* Can't open? */ //if ( fscanf( fp, "%*s%ld", &rss ) != 1 ) if ( fscanf( fp, "%*s%ld", &progsize ) != 1 ) { fclose( fp ); return (size_t)0L; /* Can't read? 
*/ } fclose( fp ); int page_size=sysconf(_SC_PAGESIZE); return ((size_t)progsize * (size_t)page_size); #endif } void printMemUsage(FILE* fout) { double rs= getCurrentMemUse(); rs/=1024; fprintf(fout, "Resident Size: %6.1fMB\n", rs); } double get_usecTime() { struct timespec start_ts; G_gettime(start_ts); return (((double)start_ts.tv_sec)*1000000.0 + ((double)start_ts.tv_nsec)/1000.0); } double GResUsage::start() { started=true; stopped=false; start_mem=getCurrentMemUse(); double mem=(double)start_mem/1024; GMessage(" start_mem=%.2f\n", mem); getrusage(RUSAGE_SELF, &start_ru); G_gettime(start_ts); double tm=start_ts.tv_sec*1000000.0 + start_ts.tv_nsec/1000.0; return tm; } double GResUsage::stop() { if (started!=true) GError("Error: calling GResUsage::stop() without starting it first?\n"); stopped=true; G_gettime(stop_ts); getrusage(RUSAGE_SELF, &stop_ru); double tm=stop_ts.tv_sec*1000000.0 + stop_ts.tv_nsec/1000.0; stop_mem=getCurrentMemUse(); double mem=(double)stop_mem/1024; GMessage(" stop_mem=%.2f\n", mem); return tm; } #define RUSAGE_STOPCHECK void GResUsage::stopCheck(const char* s) { if (!started || !stopped) GError("Error: GResUsage::%s() cannot be used before start&stop\n", s); } double GResUsage::elapsed() { stopCheck("elapsed"); double st=start_ts.tv_sec*1000000.0 + start_ts.tv_nsec/1000.0; double et=stop_ts.tv_sec*1000000.0 + stop_ts.tv_nsec/1000.0; return (et-st); } double GResUsage::u_elapsed() { stopCheck("u_elapsed"); double st=start_ru.ru_utime.tv_sec*1000000.0 + start_ru.ru_utime.tv_usec; double et=stop_ru.ru_utime.tv_sec*1000000.0 + stop_ru.ru_utime.tv_usec; return (et-st); } double GResUsage::s_elapsed() { stopCheck("s_elapsed"); double st=start_ru.ru_stime.tv_sec*1000000.0 + start_ru.ru_stime.tv_usec; double et=stop_ru.ru_stime.tv_sec*1000000.0 + stop_ru.ru_stime.tv_usec; return (et-st); } double GResUsage::memoryUsed() { //in kilobytes stopCheck("memoryUsed"); return (((double)stop_mem-(double)start_mem)/1024.0); } 
gclib-0.12.7/GResUsage.h000066400000000000000000000030441407072766100146720ustar00rootroot00000000000000#ifndef _GRESUSAGE_ #define _GRESUSAGE_ #include "GBase.h" #if defined _WIN32 && ! defined __CYGWIN__ #define RUSAGE_SELF 0 /* calling process */ #define RUSAGE_CHILDREN -1 /* terminated child processes */ #define RUSAGE_THREAD 1 struct rusage { struct timeval ru_utime; /* user time used */ struct timeval ru_stime; /* system time used */ long ru_maxrss; long ru_majflt; }; #else #include #endif #include // report the memory usage of the current process in bytes size_t getCurrentMemUse(); //current memory usage of the program (RSS) size_t getPeakMemUse(); //maximum memory usage (RSS) for the program until now void printMemUsage(FILE* fout=stderr); //in kilobytes double get_usecTime(); class GResUsage { protected: bool started; bool stopped; size_t start_mem; size_t stop_mem; struct rusage start_ru; struct rusage stop_ru; struct timespec start_ts; struct timespec stop_ts; void stopCheck(const char* s); public: GResUsage(bool do_start=false):started(false), stopped(false), start_mem(0), stop_mem(0) { if (do_start) start(); } double start(); //returns microseconds time using clock_gettime(CLOCK_MONOTONIC) double stop(); //stop the stopwatch, returns the current time in microseconds double elapsed(); //microseconds elapsed between start and stop (wallclock time) double u_elapsed(); //microseconds of user time elapsed double s_elapsed(); //microseconds of system time elapsed double memoryUsed(); //memory increase between start and stop in KB (can be negative) }; #endif gclib-0.12.7/GStr.cpp000066400000000000000000001144611407072766100142650ustar00rootroot00000000000000//--------------------------------------------------------------------------- #include "GStr.h" #include #include #include #include "GBase.h" #include #include //--------------------------------------------------------------------------- GStr::Data GStr::null_data; 
//========================================= GStr::Data * GStr::new_data(uint len, uint addcap) { //static method to return a new Data object (allocate length) //content is undefined, but it's null terminated if (len > 0) { Data* data; GMALLOC(data, sizeof(Data)+len+addcap); data->ref_count = 0; data->length = len; data->cap=len+addcap; data->chars[len] = '\0'; return data; } else return &null_data; } GStr::Data* GStr::new_data(const char* str, uint addcap) { //static method to return a new Data object (allocate: length+addcap) //as a copy of a given string //if (str==NULL) return &null_data; int len= (str==NULL)? 0 : strlen(str); if (len+addcap > 0) { Data* data; GMALLOC(data, sizeof(Data)+len+addcap); if (len) strcpy(data->chars, str); data->ref_count = 0; data->cap=len+addcap; data->length = len; data->chars[len] = '\0'; return data; } else return &null_data; } void GStr::prep_data(uint len, uint addcap) { uint newcap=len+addcap; if (newcap > 0 && my_data->ref_count <= 1 && my_data->cap>=newcap && my_data->cap-newcap<(newcap>>1)+2) { //no need to shrink/reallocate the already allocated space my_data->length = len; my_data->chars[len]=0; return; } if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); if (len + addcap> 0) { GMALLOC(my_data, sizeof(Data)+len+addcap); my_data->ref_count = 1; my_data->length = len; my_data->cap=len+addcap; my_data->chars[len] = 0; } else my_data = &null_data; } GStr& GStr::clear(int init_cap) { make_unique(); //edit operation ahead prep_data(0, init_cap); return *this; } void GStr::replace_data(Data *data) { if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); if (data != &null_data) data->ref_count++; my_data = data; } void GStr::make_unique() {//make sure it's not a reference to other string if (my_data->ref_count > 1) { Data *data = new_data(my_data->length, 0); ::memcpy(data->chars, my_data->chars, my_data->length); my_data->ref_count--; my_data = data; my_data->ref_count++; } } bool 
operator==(const char *s1, const GStr& s2){ if (s1==NULL) return s2.is_empty(); return (strcmp(s1, s2.chars()) == 0); } bool operator<(const char *s1, const GStr& s2) { if (s1==NULL) return !s2.is_empty(); return (strcmp(s1, s2.chars()) < 0); } bool operator<=(const char *s1, const GStr& s2){ if (s1==NULL) return true; return (strcmp(s1, s2.chars()) <= 0); } bool operator>(const char *s1, const GStr& s2) { if (s1==NULL) return false; return (strcmp(s1, s2.chars()) > 0); } GStr::GStr():my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; } //detach from the string data, returning a pointer to it char* GStr::detach() { make_unique(); char *r=my_data->chars; my_data=&null_data; fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; return r; } GStr::GStr(const GStr& s): my_data(&null_data){ fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; replace_data(s.my_data); } GStr::GStr(const char *s, uint addcap): my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; my_data=new_data(s, addcap); my_data->ref_count = 1; } GStr::GStr(const int i, uint addcap): my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; char buf[20]; sprintf(buf,"%d",i); my_data=new_data(buf, addcap); } GStr::GStr(const double f): my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; char buf[20]; sprintf(buf,"%f",f); const int len = ::strlen(buf); prep_data(len); ::memcpy(chrs(), buf, len); } GStr::GStr(const char c, int n): my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; prep_data(n); ::memset(chrs(), c, n); } GStr::~GStr() { if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); 
GFREE(fTokenDelimiter); GFREE(readbuf); } char& GStr::operator[](int idx){ //returns reference to char (can be l-value) if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("operator[]"); make_unique(); //because the user will probably modify this char! return chrs()[idx]; } char GStr::operator[](int idx) const { //returns char copy (cannot be l-value!) if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("operator[]"); return my_data->chars[idx]; } GStr& GStr::operator=(const GStr& s) { if (s.my_data!=my_data) { make_unique(); //edit operation ahead replace_data(s.my_data); } return *this; } GStr& GStr::operator=(const char *s) { make_unique(); //edit operation ahead if (s==NULL) { prep_data(0); return *this; } const int len = ::strlen(s); prep_data(len); ::memcpy(my_data->chars, s, len); return *this; } GStr& GStr::assign(const char* s) { make_unique(); //edit operation ahead if (s==NULL) { prep_data(0, my_data->cap); return *this; } const int len = ::strlen(s); prep_data(len, my_data->cap-len-2); ::memcpy(my_data->chars, s, len); return *this; } GStr& GStr::assign(const int v) { make_unique(); //edit operation ahead char buf[20]; sprintf(buf,"%d",v); const int len = ::strlen(buf); prep_data(len, my_data->cap-len-2); ::memcpy(my_data->chars, buf, len); return *this; } GStr& GStr::operator=(const double f) { make_unique(); //edit operation ahead char buf[20]; sprintf(buf,"%f",f); const int len = ::strlen(buf); prep_data(len); ::memcpy(my_data->chars, buf, len); return *this; } GStr& GStr::operator=(const int i) { make_unique(); //edit operation ahead char buf[20]; sprintf(buf,"%d",i); const int len = ::strlen(buf); prep_data(len); ::memcpy(my_data->chars, buf, len); return *this; } bool GStr::operator==(const GStr& s) const { if (s.is_empty()) return is_empty(); return (length() == s.length()) && (memcmp(my_data->chars, s.chars(), length()) == 0); } bool GStr::operator==(const char *s) const { if (s==NULL) 
return is_empty(); return (strcmp(my_data->chars, s) == 0); } bool GStr::operator<(const GStr& s) const { if (s.is_empty()) return false; return (strcmp(my_data->chars, s.chars()) < 0); } bool GStr::operator<(const char *s) const { if (s==NULL) return false; return (strcmp(my_data->chars, s) < 0); } bool GStr::operator<=(const GStr& s) const { if (s.is_empty()) return is_empty(); return (strcmp(my_data->chars, s.chars()) <= 0); } bool GStr::operator<=(const char *s) const { if (s==NULL) return is_empty(); return (strcmp(my_data->chars, s) <= 0); } bool GStr::operator>(const GStr& s) const { if (s.is_empty()) return !is_empty(); return (strcmp(my_data->chars, s.chars()) > 0); } bool GStr::operator>(const char *s) const { if (s==NULL) return !is_empty(); return (strcmp(my_data->chars, s) > 0); } bool GStr::operator>=(const GStr& s) const { if (s.is_empty()) return true; return (strcmp(my_data->chars, s.chars()) >= 0); } bool GStr::operator>=(const char *s) const { if (s==NULL) return true; return (strcmp(my_data->chars, s) >= 0); } bool GStr::operator!=(const GStr& s) const { if (s.is_empty()) return !is_empty(); return (length() != s.length()) || (memcmp(my_data->chars, s.chars(), length()) != 0); } bool GStr::operator!=(const char *s) const { if (s==NULL) return !is_empty(); return (strcmp(my_data->chars, s) != 0); } GStr& GStr::append(int i) { char buf[20]; sprintf(buf,"%d",i); return append(buf); } GStr& GStr::append(uint i) { char buf[20]; sprintf(buf,"%u",i); return append(buf); } GStr& GStr::append(long l) { char buf[20]; sprintf(buf,"%ld",l); return append(buf); } GStr& GStr::append(unsigned long l) { char buf[20]; sprintf(buf,"%lu", l); return append(buf); } GStr& GStr::append(double f) { char buf[30]; sprintf(buf,"%f",f); return append(buf); } bool GStr::is_empty() const { //return my_data == &null_data; return (length()==0); } GStr GStr::copy() const { GStr newstring(*this); return newstring; } int GStr::index(const GStr& s, int start_index) const { return 
index(s.chars(), start_index); } bool GStr::contains(const GStr& s) const { return (index(s, 0) >= 0); } bool GStr::contains(const char *s) const { return (index(s, 0) >= 0); } bool GStr::startsWith(const char *s) const { //return (index(s, 0) == 0); return ::startsWith(my_data->chars, s); } bool GStr::startsWith(const GStr& s) const { //return (index(s, 0) == 0); return ::startsWith(my_data->chars, s.chars()); } bool GStr::endsWith(const char *s) const { //return (index(s, 0) == 0); return ::endsWith(my_data->chars, s); } bool GStr::endsWith(const GStr& s) const { //return (index(s, 0) == 0); return ::endsWith(my_data->chars, s.chars()); } bool GStr::contains(char c) const { return (index(c, 0) >= 0); } GStr& GStr::format(const char *fmt,...) { // Format as in sprintf make_unique(); //edit operation ahead char* buf; GMALLOC(buf, strlen(fmt)+1024); va_list arguments; va_start(arguments,fmt); //+1K buffer, should be enough for common expressions int len=vsprintf(buf,fmt,arguments); va_end(arguments); prep_data(len); //this also adds the '\0' at the end! //and sets the right len ::memcpy(chrs(), buf, len); GFREE(buf); return *this; } GStr& GStr::appendfmt(const char *fmt,...) 
{ // Format as in sprintf make_unique(); //edit operation ahead char* buf; GMALLOC(buf, strlen(fmt)+1024); va_list arguments; va_start(arguments,fmt); //+1K buffer, should be enough for common expressions vsprintf(buf,fmt,arguments); va_end(arguments); append(buf); GFREE(buf); return *this; } GStr& GStr::trim(char c) { int istart; int iend; for (istart=0; istartchars[istart]==c;istart++) ; if (istart==length()) { make_unique(); //edit operation ahead prep_data(0); //string was entirely trimmed return *this; } for (iend=length()-1; iend>istart && my_data->chars[iend]==c;iend--) ; int newlen=iend-istart+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, &my_data->chars[istart], newlen); replace_data(data); return *this; } GStr& GStr::trim(const char* c) { int istart; int iend; for (istart=0; istartchars[istart])!=NULL ;istart++) ; if (istart==length()) { prep_data(0); //string was entirely trimmed return *this; } for (iend=length()-1; iend>istart && strchr(c, my_data->chars[iend])!=NULL;iend--) ; int newlen=iend-istart+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, & (my_data->chars[istart]), newlen); replace_data(data); return *this; } GStr& GStr::trimR(char c) { //only trim the right end int iend; for (iend=length()-1; iend>=0 && my_data->chars[iend]==c;iend--) ; if (iend==-1) { make_unique(); prep_data(0); //string was entirely trimmed return *this; } int newlen=iend+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead my_data->length=newlen; my_data->chars[newlen]='\0'; /* Data *data = new_data(newlen); ::memcpy(data->chars, my_data->chars, newlen); replace_data(data); */ return *this; } GStr& GStr::trimR(const char* c) { int iend; for (iend=length()-1; iend>=0 && strchr(c,my_data->chars[iend])!=NULL;iend--) ; if (iend==-1) { 
make_unique(); prep_data(0); //string was entirely trimmed return *this; } int newlen=iend+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead my_data->length=newlen; my_data->chars[newlen]='\0'; /* Data *data = new_data(newlen); ::memcpy(data->chars, my_data->chars, newlen); replace_data(data); */ return *this; } GStr& GStr::chomp(const char* cstr) { int iend; if (cstr==NULL || *cstr==0) return *this; //check if this ends with cstr int cend=strlen(cstr)-1; iend=my_data->length-1; while (iend>=0 && cend>=0) { if (my_data->chars[iend]!=cstr[cend]) return *this; iend--; cend--; } if (iend==-1) { make_unique(); prep_data(0); //string will be entirely trimmed return *this; } int newlen=iend+1; make_unique(); //edit operation ahead my_data->length=newlen; my_data->chars[newlen]='\0'; //Data *data = new_data(newlen); //::memcpy(data->chars, my_data->chars, newlen); //replace_data(data); return *this; } GStr& GStr::trimL(char c) { int istart; for (istart=0; istartchars[istart]==c;istart++) ; if (istart==length()) { prep_data(0); //string was entirely trimmed return *this; } int newlen=length()-istart; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, &my_data->chars[istart], newlen); replace_data(data); return *this; } GStr& GStr::trimL(const char* c) { int istart; for (istart=0; istartchars[istart])!=NULL;istart++) ; if (istart==length()) { prep_data(0); //string was entirely trimmed return *this; } int newlen=length()-istart; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, &my_data->chars[istart], newlen); replace_data(data); return *this; } GStr& GStr::padR(uint len, char c) { //pad with c until total string length is len if (my_data->length>=len) return *this; //no room for padding make_unique(); //edit operation ahead if 
(my_data->cap>=len) { ::memset(my_data->chars, c, len-my_data->length); my_data->length=len; return *this; } Data *data = new_data(len); ::memset(data->chars,c,len-my_data->length); ::memcpy(&data->chars[len-length()], my_data->chars, my_data->length); replace_data(data); return *this; } GStr& GStr::padL(uint len, char c) { //align left the string if (my_data->length>=len) return *this; //no room for padding make_unique(); //edit operation ahead Data *data = new_data(len); ::memcpy(data->chars, my_data->chars, length()); ::memset(&data->chars[length()],c,len-length()); replace_data(data); return *this; } GStr& GStr::padC(uint len, char c) { if (my_data->length>=len) return *this; //no room for padding make_unique(); //edit operation ahead uint istart=(len-length())/2; Data *data = new_data(len); if (istart>0) ::memset(data->chars, c, istart); ::memcpy(&data->chars[istart], my_data->chars, length()); uint iend=istart+length(); if (iendchars[iend],c,len-iend); replace_data(data); return *this; } GStr operator+(const char *s1, const GStr& s2) { const int s1_length = ::strlen(s1); if (s1_length == 0) return s2; else { GStr newstring; newstring.prep_data(s1_length + s2.length()); ::memcpy(newstring.chrs(), s1, s1_length); ::memcpy(&(newstring.chrs())[s1_length], s2.chars(), s2.length()); return newstring; } } //========================================= GStr GStr::operator+(const GStr& s) const { if (length() == 0) return s; else if (s.length() == 0) return *this; else { GStr newstring; newstring.prep_data(length() + s.length()); ::memcpy(newstring.chrs(), my_data->chars, length()); ::memcpy(&(newstring.chrs())[length()], s.chars(), s.length()); return newstring; } } //========================================= GStr GStr::operator+(const char *s) const { const int s_length = ::strlen(s); if (s_length == 0) return *this; else { GStr newstring; newstring.prep_data(length() + s_length); ::memcpy(newstring.chrs(), my_data->chars, length()); 
::memcpy(&(newstring.chrs())[length()], s, s_length); return newstring; } } GStr GStr::operator+(const int i) const { char buf[20]; sprintf(buf, "%d", i); const int s_length = ::strlen(buf); GStr newstring; newstring.prep_data(length() + s_length); ::memcpy(newstring.chrs(), my_data->chars, length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } GStr GStr::operator+(const char c) const { char buf[4]; sprintf(buf, "%c", c); const int s_length = ::strlen(buf); GStr newstring; newstring.prep_data(length() + s_length); ::memcpy(newstring.chrs(), my_data->chars, length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } GStr GStr::operator+(const double f) const { char buf[30]; sprintf(buf, "%f", f); const int s_length = ::strlen(buf); GStr newstring; newstring.prep_data(length() + s_length); ::memcpy(newstring.chrs(), my_data->chars, length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } //========================================= bool GStr::is_space() const { if (my_data == &null_data) return false; for (const char *p = my_data->chars; *p; p++) if (!isspace(*p)) return false; return true; } //========================================= GStr GStr::substr(int idx, int len) const { // A negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); else if (idx>=length()) { len=0; idx=length(); } if (len) { // A length of -1 specifies the rest of the string. 
if (len < 0 || len>length()-idx)
      len = length() - idx;
    if (idx<0 || idx>=length() || len<0 )
      invalid_args_error("substr()");
  }
  GStr newstring;
  if (len) {
    newstring.prep_data(len);
    ::memcpy(newstring.chrs(), &my_data->chars[idx], len);
  }
  return newstring;
}

// Reverse the characters of the string in place.
GStr& GStr::reverse() {
  make_unique();
  int l=0;
  int r=my_data->length-1;
  char c;
  //swap the two ends, moving towards the middle
  while (l<r) {
    c=my_data->chars[l];
    my_data->chars[l]=my_data->chars[r];
    my_data->chars[r]=c;
    l++;r--;
  }
  return *this;
}

//transform: any character from 'from' is replaced with a corresponding
//char from 'to'; when 'to' is NULL or empty the characters are removed
GStr& GStr::tr(const char *rfrom, const char* rto) {
  if (length() == 0 || rfrom==NULL || strlen(rfrom)==0)
    return *this;
  unsigned int l=strlen(rfrom);
  if (rto!=NULL && rto[0]==0) rto=NULL;
  if (rto!=NULL && strlen(rto)!=l)
    invalid_args_error("tr()");
  make_unique(); //edit operation ahead
  if (rto==NULL) { //delete all characters
    Data *data = new_data(length());
    char* s = my_data->chars;
    char* p=NULL;
    char* dest = data->chars;
    do {
      if ((p=strpbrk(s,rfrom))!=NULL) {
        memcpy(dest,s,p-s);
        dest+=p-s;
        s=p+1;
      }
      else {
        strcpy(dest, s);
        dest+=strlen(s);
      }
    } while (p!=NULL);
    (*dest)='\0';
    data->length=strlen(data->chars);
    replace_data(data);
  }
  else { //char substitution case - easier!
const char* p=NULL; for (int i=0; ichars[i]))!=NULL) my_data->chars[i]=rto[p-rfrom]; } } return *this; } // search and replace all the occurences of a string with another string // or just remove the given string (if replacement is NULL) GStr& GStr::replace(const char *rfrom, const char* rto) { if (length() == 0 || rfrom==NULL || strlen(rfrom)==0) return *this; unsigned int l=strlen(rfrom); unsigned int tl= (rto==NULL)?0:strlen(rto); make_unique(); //edit operation ahead char* p; char* dest; char* newdest=NULL; char* s = my_data->chars; if (tl!=l) { //reallocation if (tl>l) { //possible enlargement GMALLOC(newdest, length()*(tl-l+1)+1); } else {//delete or replace with a shorter string GMALLOC(newdest, length() + 1); } dest=newdest; if (tl==0) {//deletion while ((p=strstr(s,rfrom))!=NULL) { //rfrom found at position p memcpy(dest,s,p-s); dest+=p-s; s+=p-s+l; //s positioned in string after rfrom } //no more occurences, copy the remaining string strcpy(dest, s); } else { //replace with another string while ((p=strstr(s,rfrom))!=NULL) { memcpy(dest,s,p-s); //copy up rto the match dest+=p-s; memcpy(dest,rto,tl); //put the replacement string dest+=tl; s+=p-s+l; } //not found any more, copy rto end of string strcpy(dest, s); } Data* data=new_data(newdest); replace_data(data); GFREE(newdest); } else { //inplace editing: no need rto reallocate while ((p=strstr(s,rfrom))!=NULL) { memcpy(p,rto,l); s+=p-s+l; } } return *this; } GStr& GStr::cut(int idx, int len) { if (len == 0) return *this; make_unique(); //edit operation ahead // A negative idx specifies an idx from the right of the string, // so the left part will be cut out if (idx < 0) idx += length(); // A length of -1 specifies the rest of the string. 
if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("cut()"); Data *data = new_data(length() - len); if (idx > 0) ::memcpy(data->chars, my_data->chars, idx); ::strcpy(&data->chars[idx], &my_data->chars[idx+len]); replace_data(data); return *this; } //========================================= GStr& GStr::paste(const GStr& s, int idx, int len) { // A negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); make_unique(); //edit operation ahead // A length of -1 specifies the rest of the string. if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("replace()"); if (len == s.length() && my_data->ref_count == 1) ::memcpy(&chrs()[idx], s.chars(), len); else { Data *data = new_data(length() - len + s.length()); if (idx > 0) ::memcpy(data->chars, my_data->chars, idx); if (s.length() > 0) ::memcpy(&data->chars[idx], s.chars(), s.length()); ::strcpy(&data->chars[idx+s.length()], &my_data->chars[idx+len]); replace_data(data); } return *this; } //========================================= GStr& GStr::paste(const char *s, int idx, int len) { // A negative idx specifies an idx from the right of the string. make_unique(); //edit operation ahead if (idx < 0) idx += length(); // A length of -1 specifies the rest of the string. 
if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("replace()"); const int s_length = ::strlen(s); if (len == s_length && my_data->ref_count == 1) ::memcpy(&chrs()[idx], s, len); else { Data *data = new_data(length() - len + s_length); if (idx > 0) ::memcpy(data->chars, my_data->chars, idx); if (s_length > 0) ::memcpy(&data->chars[idx], s, s_length); ::strcpy(&data->chars[idx+s_length], &my_data->chars[idx+len]); replace_data(data); } return *this; } //========================================= GStr& GStr::insert(const GStr& s, int idx) { make_unique(); //edit operation ahead // A negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("insert()"); if (s.length() > 0) { Data *data = new_data(length() + s.length()); if (idx > 0) ::memcpy(data->chars, my_data->chars, idx); ::memcpy(&data->chars[idx], s.chars(), s.length()); ::strcpy(&data->chars[idx+s.length()], &my_data->chars[idx]); replace_data(data); } return *this; } //========================================= GStr& GStr::insert(const char *s, int idx) { // A negative idx specifies an idx from the right of the string. 
make_unique(); //edit operation ahead if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("insert()"); const int s_length = ::strlen(s); if (s_length > 0) { Data *data = new_data(length() + s_length); if (idx > 0) ::memcpy(data->chars, my_data->chars, idx); ::memcpy(&data->chars[idx], s, s_length); ::strcpy(&data->chars[idx+s_length], &my_data->chars[idx]); replace_data(data); } return *this; } //========================================= GStr& GStr::append(char c) { make_unique(); //edit operation ahead uint newlen=my_data->length+1; if (my_data->cap==0) { prep_data(1, 6); my_data->chars[0]=c; return *this; } if (newlen>my_data->cap) { //not enough room to append this char GREALLOC(my_data, sizeof(Data)+newlen); my_data->cap=newlen; } my_data->chars[my_data->length]=c; my_data->length++; my_data->chars[my_data->length]='\0'; return *this; } GStr& GStr::append(const char* s) { make_unique(); //edit operation ahead uint len=::strlen(s); uint newlen=len+my_data->length; if (newlen<=my_data->length) return *this; if (my_data->length==0 && my_data->capchars, s, len); return *this; } if (newlen>=my_data->cap) { //not enough room to append these chars GREALLOC(my_data, sizeof(Data)+newlen+1); my_data->cap=newlen+1; } ::memcpy(my_data->chars+my_data->length, s, len); my_data->length=newlen; my_data->chars[newlen]='\0'; return *this; } GStr& GStr::appendQuoted(const char* s, char q, bool onlyIfSpaced) { if (onlyIfSpaced) { if (strpbrk(s, "\t ")==NULL) return this->append(s); } char qend=q; if (q=='[' || q=='{' || q=='<') qend=q+2; else if (q=='(') qend=')'; this->append(q); this->append(s); this->append(qend); return *this; } GStr& GStr::append(const char* s, int len) { make_unique(); //edit operation ahead //uint len=::strlen(s); uint newlen=len+my_data->length; if (newlen<=my_data->length) return *this; if (my_data->length==0 && my_data->capchars, s, len); return *this; } if (newlen>=my_data->cap) { //not enough room to append these chars 
GREALLOC(my_data, sizeof(Data)+newlen+1); my_data->cap=newlen+1; } //strncpy(my_data->chars+my_data->length, s, len); newlen=my_data->length; for (int i=0;s[i]!='\0' && ichars[my_data->length+i]=s[i]; ++newlen; } my_data->length=newlen; my_data->chars[newlen]='\0'; return *this; } GStr& GStr::appendmem(const char* m, int len) { if (len<=0) return *this; make_unique(); //edit operation ahead uint newlen=len+my_data->length; //if (newlength<=my_data->length) return *this; if (my_data->length==0) { prep_data(len); ::memcpy(my_data->chars, m, len); return *this; } //faster solution with realloc if (newlen>=my_data->cap) { //not enough room to append these chars GREALLOC(my_data, sizeof(Data)+newlen+1); my_data->cap=newlen+1; } ::memcpy(my_data->chars + my_data->length, m, len); my_data->length=newlen; my_data->chars[newlen]='\0'; return *this; } GStr& GStr::append(const GStr& s) { return appendmem(s.chars(), s.length()); } GStr& GStr::upper() { make_unique(); //edit operation ahead for (char *p = chrs(); *p; p++) *p = (char) toupper(*p); return *this; } //========================================= GStr& GStr::lower() { make_unique(); for (char *p = chrs(); *p; p++) *p = (char) tolower(*p); return *this; } //========================================= int GStr::index(const char *s, int start_index) const { // A negative index specifies an index from the right of the string. if (strlen(s)>(size_t)length()) return -1; if (start_index < 0) start_index += length(); if (start_index < 0 || start_index >= length()) invalid_index_error("index()"); const char* idx = strstr(&my_data->chars[start_index], s); if (!idx) return -1; else return idx - my_data->chars; } //========================================= int GStr::index(char c, int start_index) const { // A negative index specifies an index from the right of the string. 
if (length()==0) return -1; if (start_index < 0) start_index += length(); if (start_index < 0 || start_index >= length()) invalid_index_error("index()"); if (c == '\0') return -1; const char *idx=(char *) ::memchr(&my_data->chars[start_index], c, length()-start_index); if (idx==NULL) return -1; else return idx - my_data->chars; } int GStr::rindex(char c, int end_index) const { if (c == 0 || length()==0 || end_index>=length()) return -1; if (end_index<0) end_index=my_data->length-1; for (int i=end_index;i>=0;i--) { if (my_data->chars[i]==c) return i; } return -1; } int GStr::rindex(const char* str, int end_index) const { if (str==NULL || *str == '\0' || length()==0 || end_index>=length()) return -1; int slen=strlen(str); if (end_index<0) end_index=my_data->length-1; //end_index is the index of the right-side boundary //the scanning starts at the end if (end_index>=0 && end_index=0;i--) { if (memcmp((void*)(my_data->chars+i),(void*)str, slen)==0) return i; } return -1; } GStr GStr::split(const char* delim) { /* splits "this" in two parts, at the first (left) encounter of delim: 1st would stay in "this", 2nd part will be returned as a new string! */ GStr result; int i=index(delim); if (i>=0){ result=substr(i+strlen(delim)); cut(i); return result; } return result; } GStr GStr::split(char c) { /* splits "this" in two parts, at the first (left) encounter of delim: 1st would stay in "this", 2nd part will be returned as a new string! 
*/ GStr result; int i=index(c); if (i>=0){ result=substr(i+1); cut(i); return result; } return result; } GStr GStr::splitr(const char* delim) { GStr result; int i=rindex(delim); if (i>=0){ result=substr(i+strlen(delim)); cut(i); return result; } return result; } GStr GStr::splitr(char c) { GStr result; int i=rindex(c); if (i>=0){ result=substr(i+1); cut(i); return result; } return result; } void GStr::startTokenize(const char* delimiter, enTokenizeMode tokenizemode) { GFREE(fTokenDelimiter); if (delimiter) { GMALLOC(fTokenDelimiter,strlen(delimiter)+1); strcpy(fTokenDelimiter, delimiter); } fLastTokenStart=0; fTokenizeMode=tokenizemode; } bool GStr::nextToken(GStr& token) { if (fTokenDelimiter==NULL) { GError("GStr:: no token delimiter; use StartTokenize first\n"); } if (fLastTokenStart>=length()) {//no more GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } int dlen=strlen(fTokenDelimiter); char* delpos=NULL; //delimiter position int tlen=0; if (fTokenizeMode==tkFullString) { //exact string as a delimiter delpos=(char*)strstr(my_data->chars+fLastTokenStart,fTokenDelimiter); if (delpos==NULL) delpos=(char*)(my_data->chars+length()); //empty records may be returned if (my_data->chars+fLastTokenStart == delpos) { //empty token fLastTokenStart=(delpos-my_data->chars)+dlen; token=""; return true; } else { tlen=delpos-(my_data->chars+fLastTokenStart); token.prep_data(tlen); ::memcpy(token.chrs(), &my_data->chars[fLastTokenStart], tlen); fLastTokenStart=(delpos-my_data->chars)+dlen; return true; } } else { //tkCharSet - any character is a delimiter //empty records are never returned ! 
if (fLastTokenStart==0) {//skip any starting delimiters delpos=(char*)my_data->chars; while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL) delpos++; if (*delpos!='\0') fLastTokenStart = delpos-my_data->chars; else { //only delimiters here,no tokens GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } } //now fLastTokenStart is on a non-delimiter char //GMessage("String at fLastTokenStart=%d is %s\n", fLastTokenStart, delpos); char* token_end=NULL; delpos=(char*)strpbrk(my_data->chars+fLastTokenStart,fTokenDelimiter); if (delpos==NULL) delpos=(char*)(my_data->chars+length()); token_end=delpos-1; while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL) delpos++; //skip any other delimiters in the set! //now we know that delpos is on the beginning of next token tlen=(token_end-my_data->chars)-fLastTokenStart+1; if (tlen==0) { GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } token.prep_data(tlen); ::memcpy(token.chrs(), &my_data->chars[fLastTokenStart], tlen); fLastTokenStart=delpos-my_data->chars; return true; } //return true; } size_t GStr::read(FILE* stream, const char* delimiter, size_t bufsize) { //read up to (and including) the given delimiter string //if delimiter is NULL or zero length, it will read the whole file if (readbuf==NULL) { GMALLOC(readbuf, bufsize); readbufsize=bufsize; } else if (bufsize!=readbufsize) { GFREE(readbuf); if (bufsize>0) { GMALLOC(readbuf, bufsize); } readbufsize=bufsize; } if (bufsize==0) { prep_data(0); return 0; //clear the string and free the buffer } size_t numread; size_t acc_len=0; //accumulated length int dlen=0; if (delimiter!=NULL && delimiter[0]!=0) dlen=strlen(delimiter); void* p=NULL; Data* data = &null_data; do { numread=fread(readbuf, 1, bufsize, stream); if (numread) { if (dlen>0) p=Gmemscan(readbuf, bufsize, (void*) delimiter, dlen); if (p!=NULL) {//found the delimiter //position the stream after it int l = (char*)p-(char*)readbuf; fseek(stream, l+dlen-numread, SEEK_CUR); 
numread=l+dlen; } else {//not found, go back if not eof if (numread==bufsize) { if (dlen>0) { fseek(stream, -dlen, SEEK_CUR); //check if this works! numread-=dlen; } } } if (data==&null_data) { data=new_data(numread); ::memcpy(data->chars, readbuf, numread); acc_len+=numread; } else { GREALLOC(data, sizeof(Data)+acc_len+numread); memcpy(&data->chars[acc_len], readbuf, numread); acc_len+=numread; data->length=acc_len; data->chars[acc_len]='\0'; } } //if something read } while (p==NULL && numread!=0); replace_data(data); return acc_len; } int GStr::asInt(int base /*=10 */) { return strtol(text(), NULL, base); } bool GStr::asInt(int& r, int base) { errno=0; char*endptr; long val=strtol(text(), &endptr, base); if (errno!=0) return false; if (endptr == text()) return false; /* If we got here, strtol() successfully parsed a number */ r=val; return true; } double GStr::asReal() { return strtod(text(), NULL); } bool GStr::asReal(double& r) { errno=0; char* endptr; double val=strtod(text(), &endptr); if (errno!=0) return false; if (endptr == text()) return false; //no digits to parse r=val; return true; } int GStr::peelInt() const { if (is_empty()) return 0; char buf[24]; bool started=false; int j=0; int i; for (i=0;ichars[i])) j++; //set coord else break; //finished } else if (isdigit(my_data->chars[i])) { j++; started=true; } } if (j>0) { strncpy(buf, &my_data->chars[i-j], j); buf[j]='\0'; return strtol(buf, NULL, 10); } return 0; } int GStr::peelIntR() const { if (is_empty()) return 0; char buf[24]; bool started=false; int j=0; int i; for (i=length()-1;i>=0;i--) { if (started) { if (isdigit(my_data->chars[i])) j++; //set length else break; //finished } else if (isdigit(my_data->chars[i])) { j++; started=true; } } if (j>0) { strncpy(buf, &my_data->chars[i+1], j); buf[j]='\0'; return strtol(buf, NULL, 10); } return 0; } GStr GStr::to(char c) { //return the first part up to first occurence of c int i=index(c); if (i>=0) return substr(0,i); else return (*this); } //or whole 
string if c not found GStr GStr::from(char c) { //same as to, but starting from the right side int i=rindex(c); if (i>=0) return substr(i+1); else return (*this); } int GStr::count(char c){ //return the number of occurences of char c within the string int result=0; for (int i=0;ichars[i]==c) result++; return result; } //========================================= void GStr::invalid_args_error(const char *fname) { GError("GStr:: %s - invalid arguments\n", fname); } //**************************************************************************** void GStr::invalid_index_error(const char *fname) { GError("GStr:: %s - invalid index\n", fname); } //**************************************************************************** gclib-0.12.7/GStr.h000066400000000000000000000233101407072766100137220ustar00rootroot00000000000000//--------------------------------------------------------------------------- #ifndef GSTR_H #define GSTR_H //--------------------------------------------------------------------------- #include "GBase.h" #include #include #include // This class uses reference counting and copy-on-write semantics // All indexes are zero-based. For all functions that accept an index, a // negative index specifies an index from the right of the string. Also, // for all functions that accept a length, a length of -1 specifies the rest // of the string. 
enum enTokenizeMode { tkFullString, tkCharSet }; class GStr { friend GStr operator+(const char* s1, const GStr& s2); friend bool operator==(const char* s1, const GStr& s2); friend bool operator<(const char* s1, const GStr& s2); friend bool operator<=(const char* s1, const GStr& s2); friend bool operator>(const char* s1, const GStr& s2); friend bool operator>=(const char* s1, const GStr& s2); friend bool operator!=(const char* s1, const GStr& s2); friend void Gswap(GStr& s1, GStr& s2); public: GStr(); GStr(const GStr& s); //minimize reallocation when suffixes are added GStr(const char* s, uint addcap=8); GStr(const int i, uint addcap=8); GStr(const double f); GStr(const char c, int n = 1); ~GStr(); operator const char* () const { return my_data->chars;} //inline here char& operator[](int index); char operator[](int index) const; GStr& assign(const char* s); //never shrinks the allocated space GStr& assign(const int v); //never shrinks the allocated space GStr& operator=(const GStr& s); GStr& operator=(const char* s); GStr& operator=(const int i); GStr& operator=(const double f); GStr operator+(const GStr& s) const; GStr operator+(const char* s) const; GStr operator+(const char c) const; GStr operator+(const int i) const; GStr operator+(const double f) const; bool operator==(const GStr& s) const; bool operator==(const char* s) const; bool operator<(const GStr& s) const; bool operator<(const char* s) const; bool operator<=(const GStr& s) const; bool operator<=(const char* s) const; bool operator>(const GStr& s) const; bool operator>(const char* s) const; bool operator>=(const GStr& s) const; bool operator>=(const char* s) const; bool operator!=(const GStr& s) const; bool operator!=(const char* s) const; GStr& operator+=(const GStr& s) { return append(s.chars()); } GStr& operator+=(const char* s) { return append(s); } GStr& operator+=(char c) { return append(c); } GStr& operator+=(int i) { return append(i); } GStr& operator+=(uint i) { return append(i); } GStr& 
operator+=(long l) { return append(l); } GStr& operator+=(unsigned long l) { return append(l); } GStr& operator+=(double f); //interface: int length() const; bool is_empty() const; bool is_space() const; GStr substr(int index = 0, int len = -1) const; GStr to(char c); //return the first part up to first occurence of c //or whole string if c not found GStr from(char c); //same as to, but starting from the right side GStr copy() const; GStr& format(const char *fmt,...); GStr& reverse(); GStr& appendfmt(const char *fmt,...); GStr& cut(int index = 0, int len = -1); //delete a specified length GStr& remove(int from, int to) { return cut(from, to-from+1); } //paste a string at the specified position GStr& paste(const GStr& s, int index = 0, int len=-1); GStr& paste(const char* s, int index = 0, int len = -1); GStr& replace(const char* from, const char* to=NULL); GStr& insert(const GStr& s, int index = 0); GStr& insert(const char* s, int index = 0); GStr& append(const char* s); GStr& appendQuoted(const char* s, char q='"', bool onlyIfSpaced=false); GStr& appendmem(const char* m, int len); GStr& append(const char* m, int len); //same as appendmem but stops at '\0' GStr& append(const GStr& s); GStr& append(char c); GStr& append(int i); GStr& append(long l); GStr& append(double f); GStr& append(uint i); GStr& append(unsigned long l); GStr& upper(); GStr& lower(); GStr& clear(int init_cap=0);//make empty, but can specify initial capacity //character translation or removal: GStr& tr(const char* from, const char* to=NULL); //number of occurences of a char in the string: int count(char c); void startTokenize(const char* delimiter=" \t\n", enTokenizeMode tokenizemode=tkCharSet); bool nextToken(GStr& token); int asInt(int base=10); double asReal(); double asDouble() { return asReal(); } bool asReal(double& r); bool asDouble(double& r) { return asReal(r); } bool asInt(int& r, int base=10); int index(const GStr& s, int start_index = 0) const; int index(const char* s, int start_index 
= 0) const; int index(char c, int start_index = 0) const; int rindex(char c, int end_index = -1) const; int rindex(const char* str, int end_index = -1) const; bool contains(const GStr& s) const; bool contains(const char* s) const; bool contains(char c) const; bool startsWith(const char* s) const; bool startsWith(const GStr& s) const; bool endsWith(const char* s) const; bool endsWith(const GStr& s) const; GStr split(const char* delim); GStr split(char c); /* splits "this" in two parts, at the first (leftmost) encounter of delim: 1st would stay in "this" (which this way is truncated) 2nd will go to the returned string */ GStr splitr(const char* delim); GStr splitr(char c); /* splits "this" in two parts, at the last (rightmost) encounter of delim: 1st would stay in "this" 2nd will be returned */ int peelInt() const; //extract an integer, (left to right), from a //mixed alphanumeric string, e.g. 'T24HC1234b'=> 2 int peelIntR() const; //same as above, but starts from the right side //e.g. 'T2HC1234b'=> 1234 GStr& trim(char c); GStr& trim(const char* c=" \t\n\r"); //trim both ends of characters in given set GStr& trimR(const char* c=" \t\n\r"); //trim only right end GStr& trimR(char c=' '); GStr& chomp(char c='\n') { return trimR(c); } GStr& chomp(const char* cstr); //like trimR, but given string is taken as a whole GStr& trimL(const char* c=" \t\n\r"); //trim only left end GStr& trimL(char c=' '); GStr& padR(uint len, char c=' '); //align it in len spaces to the right GStr& padL(uint len, char c=' '); //align it in len spaces to the left GStr& padC(uint len, char c=' '); //center it size_t read(FILE* stream, const char* delimiter="\n", size_t bufsize=4096); //read next token from stream, using the given string as //a marker where the block should stop const char* chars() const; const char* text() const; char* detach(); //returns pointer to the string, giving up on its memory management protected: char* fTokenDelimiter; int fLastTokenStart; enTokenizeMode fTokenizeMode; 
void* readbuf; //file read buffer for the read() function size_t readbufsize; //last setting for the readbuf static void invalid_args_error(const char* fname); static void invalid_index_error(const char* fname); struct Data {//structure holding actual //string data and reference count information Data():ref_count(0), cap(0),length(0) { chars[0] = 0; } uint ref_count; //reference count uint cap; //allocated string capacity (excluding \0 end char) uint length; //actual string length (excluding \0 end char) char chars[1]; }; static Data* new_data(uint len, uint addcap=0); //alloc a specified length string's Data static Data* new_data(const char* str, uint addcap=0); //alloc a copy of a specified string, with an additional cap void prep_data(uint len, uint addcap=0); //allocates memory for the string, if needed void replace_data(Data* data); //WARNING (dangerous): direct access to pointer; string editing cannot change the length! char* chrs(); void make_unique(); static Data null_data; //a null (empty) string Data is available here Data* my_data; //pointer to a Data object holding actual string data }; /***************************************************************************/ inline int GStr::length() const { return my_data->length; } inline const char *GStr::chars() const { return my_data->chars; } inline char *GStr::chrs() { //allows direct modification of the chars ! 
return my_data->chars; } inline const char *GStr::text() const { return my_data->chars; } inline bool operator>=(const char *s1, const GStr& s2) { return (strcmp(s1, s2.chars()) >= 0); } inline bool operator!=(const char *s1, const GStr& s2) { return (strcmp(s1, s2.chars()) != 0); } inline void Gswap(GStr& s1, GStr& s2) { GStr::Data *tmp = s1.my_data; s1.my_data = s2.my_data; s2.my_data = tmp; } #endif gclib-0.12.7/GThreads.cpp000066400000000000000000000307521407072766100151070ustar00rootroot00000000000000/* Copyright (c) 2010 Marcus Geelnard (with minor modifications and naming changes by Geo Pertea) This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. */ #include "GThreads.h" #if defined(_GTHREADS_POSIX_) #include // #include #elif defined(_GTHREADS_WIN32_) #include #endif #include //------------------------------------------------------------------------------ // condition_variable //------------------------------------------------------------------------------ // NOTE 1: The Win32 implementation of the condition_variable class is based on // the corresponding implementation in GLFW, which in turn is based on a // description by Douglas C. 
Schmidt and Irfan Pyarali: // http://www.cs.wustl.edu/~schmidt/win32-cv-1.html // // NOTE 2: Windows Vista actually has native support for condition variables // (InitializeConditionVariable, WakeConditionVariable, etc), but we want to // be portable with pre-Vista Windows versions, so TinyThread++ does not use // Vista condition variables. //------------------------------------------------------------------------------ #if defined(_GTHREADS_WIN32_) #define _CONDITION_EVENT_ONE 0 #define _CONDITION_EVENT_ALL 1 #endif int GThread::tcounter=0; int GThread::num_created=0; #if defined(_GTHREADS_WIN32_) GConditionVar::GConditionVar() : mWaitersCount(0) { mEvents[_CONDITION_EVENT_ONE] = CreateEvent(NULL, FALSE, FALSE, NULL); mEvents[_CONDITION_EVENT_ALL] = CreateEvent(NULL, TRUE, FALSE, NULL); InitializeCriticalSection(&mWaitersCountLock); } #endif #if defined(_GTHREADS_WIN32_) GConditionVar::~GConditionVar() { CloseHandle(mEvents[_CONDITION_EVENT_ONE]); CloseHandle(mEvents[_CONDITION_EVENT_ALL]); DeleteCriticalSection(&mWaitersCountLock); } #endif #if defined(_GTHREADS_WIN32_) void GConditionVar::_wait() { // Wait for either event to become signaled due to notify_one() or // notify_all() being called int result = WaitForMultipleObjects(2, mEvents, FALSE, INFINITE); // Check if we are the last waiter EnterCriticalSection(&mWaitersCountLock); -- mWaitersCount; bool lastWaiter = (result == (WAIT_OBJECT_0 + _CONDITION_EVENT_ALL)) && (mWaitersCount == 0); LeaveCriticalSection(&mWaitersCountLock); // If we are the last waiter to be notified to stop waiting, reset the event if(lastWaiter) ResetEvent(mEvents[_CONDITION_EVENT_ALL]); } #endif #if defined(_GTHREADS_WIN32_) void GConditionVar::notify_one() { // Are there any waiters? 
EnterCriticalSection(&mWaitersCountLock); bool haveWaiters = (mWaitersCount > 0); LeaveCriticalSection(&mWaitersCountLock); // If we have any waiting threads, send them a signal if(haveWaiters) SetEvent(mEvents[_CONDITION_EVENT_ONE]); } #endif #if defined(_GTHREADS_WIN32_) void GConditionVar::notify_all() { // Are there any waiters? EnterCriticalSection(&mWaitersCountLock); bool haveWaiters = (mWaitersCount > 0); LeaveCriticalSection(&mWaitersCountLock); // If we have any waiting threads, send them a signal if(haveWaiters) SetEvent(mEvents[_CONDITION_EVENT_ALL]); } #endif //------------------------------------------------------------------------------ // POSIX pthread_t to unique thread::id mapping logic. // Note: Here we use a global thread safe std::map to convert instances of // pthread_t to small thread identifier numbers (unique within one process). // This method should be portable across different POSIX implementations. //------------------------------------------------------------------------------ /* #if defined(_GTHREADS_POSIX_) static thread::id _pthread_t_to_ID(const pthread_t &aHandle) { static mutex idMapLock; static std::map idMap; static unsigned long int idCount(1); lock_guard guard(idMapLock); if(idMap.find(aHandle) == idMap.end()) idMap[aHandle] = idCount ++; return thread::id(idMap[aHandle]); } #endif // _GTHREADS_POSIX_ */ void gthreads_errExit(int err, const char* msg) { if (msg!=NULL) fprintf(stderr, "GThreads Error: %s (%s)\n", msg, strerror(err)); else fprintf(stderr, "GThreads Error: %s\n", strerror(err)); exit(EXIT_FAILURE); } void GThread::update_counter(int inc, GThread* t_update) { static GMutex counterLock; GLockGuard guard(counterLock); if (inc==1) { //joinable thread creation GThread::num_created++; t_update->mId = GThread::num_created; } GThread::tcounter+=inc; if (t_update!=NULL && inc<0) t_update->mId=0; // thread terminated } //------------------------------------------------------------------------------ // thread 
//------------------------------------------------------------------------------ /// Information to pass to the new thread (what to run). struct _thread_start_info { /* void * mArg; ///< Function argument for the thread function. GThread * mThread; ///< Pointer to the thread object. */ GThreadData threadData; //void (*mFunction)(void *, GThread*); void (*mFunction)(void *); ///< Pointer to the function to be executed. void (*gtFunction)(GThreadData&); //custom variant, passing GThreadData //handy constructors: _thread_start_info():threadData() { mFunction=NULL; gtFunction=NULL; } _thread_start_info(GThread* t, void (*aFunc)(void *), void* udata): threadData(udata, t) { mFunction=aFunc; gtFunction=NULL; } _thread_start_info(GThread* t, void (*gtFunc)(GThreadData &), void* udata): threadData(udata, t) { mFunction=NULL; gtFunction=gtFunc; } }; // Thread wrapper function. #if defined(_GTHREADS_WIN32_) unsigned WINAPI GThread::wrapper_function(void * aArg) #elif defined(_GTHREADS_POSIX_) void * GThread::wrapper_function(void * aArg) #endif { // Get thread startup information _thread_start_info * ti = (_thread_start_info *) aArg; /* try { // Call the actual client thread function ti->mFunction(ti->mArg, ti->mThread); } catch(...) 
{ // Uncaught exceptions will terminate the application (default behavior // according to the C++11) std::terminate(); } */ //ti->mFunction(ti->mArg, ti->mThread); //cheap trick to pass current GThread pointer //when the user doesn't pass anything if (ti->gtFunction) { ti->gtFunction(ti->threadData); } else { if (ti->threadData.udata) { ti->mFunction(ti->threadData.udata); } else { ti->mFunction(ti->threadData.thread); } } // The thread is no longer executing GLockGuard guard(ti->threadData.thread->mDataMutex); ti->threadData.thread->mNotAThread = true; GThread::update_counter(-1, ti->threadData.thread); // The thread is responsible for freeing the startup information delete ti; return 0; } void GThread::initStart(void* tidata, size_t stacksize) { _thread_start_info * ti = (_thread_start_info *) tidata; /*ti->mFunction = aFunction; ti->mArg = aArg; ti->mThread = this;*/ // The thread is now alive mNotAThread = false; // Create the thread #if defined(_GTHREADS_WIN32_) mHandle = (HANDLE) _beginthreadex(0, 0, wrapper_function, (void *) ti, 0, &mWin32ThreadID); #elif defined(_GTHREADS_POSIX_) if (stacksize>0) { pthread_attr_t attr; int r=pthread_attr_init(&attr); if (r!=0) gthreads_errExit(r, "pthread_attr_init()"); r = pthread_attr_setstacksize(&attr, stacksize); if (r!=0) gthreads_errExit(r, "pthread_attr_setstacksize()"); stack_size=stacksize; r=pthread_create(&mHandle, &attr, wrapper_function, (void *) ti); if (r!=0) { gthreads_errExit(r, "pthread_create()"); //mHandle = 0; } r=pthread_attr_destroy(&attr); if (r!=0) gthreads_errExit(r, "pthread_attr_destroy()"); } else { int r=pthread_create(&mHandle, NULL, wrapper_function, (void *) ti); if (r!= 0) gthreads_errExit(r, "pthread_create()"); //mHandle = 0; } #endif // Did we fail to create the thread? 
if(!mHandle) { mNotAThread = true; delete ti; } else GThread::update_counter(1, this); } GThread::GThread(void (*aFunction)(void *), void * aArg, size_t stacksize): mId(0), mHandle(0), mNotAThread(true) #if defined(_GTHREADS_WIN32_) , mWin32ThreadID(0) #endif { kickStart(aFunction, aArg, stacksize); } void GThread::kickStart(void (*aFunction)(void *), void * aArg, size_t stacksize) { // Serialize access to this thread structure GLockGuard guard(mDataMutex); // Fill out the thread startup information (passed to the thread wrapper, // which will eventually free it) _thread_start_info * ti = new _thread_start_info(this, aFunction, aArg); initStart(ti, stacksize); } //custom alternate constructor (non-C++11 compatible), passing GThreadData back to the //user function in order to easily retrieve current GThread object //(better alternative to this_thread) GThread::GThread(void (*gFunction)(GThreadData& thread_data), void * aArg, size_t stacksize) { kickStart(gFunction, aArg, stacksize); } void GThread::kickStart(void (*gFunction)(GThreadData& thread_data), void * aArg, size_t stacksize) { // Serialize access to this thread structure GLockGuard guard(mDataMutex); // Fill out the thread startup information (passed to the thread wrapper, // which will eventually free it) _thread_start_info * ti = new _thread_start_info(this, gFunction, aArg); initStart(ti, stacksize); } GThread::~GThread() { if(joinable()) { //std::terminate(); -- why?? 
GThread::update_counter(-1, this); mDataMutex.lock(); #if defined(_TTHREAD_WIN32_) CloseHandle(mHandle); #elif defined(_TTHREAD_POSIX_) pthread_detach(mHandle); #endif mDataMutex.unlock(); } } void GThread::join() { if(joinable()) { #if defined(_GTHREADS_WIN32_) WaitForSingleObject(mHandle, INFINITE); CloseHandle(mHandle); #elif defined(_GTHREADS_POSIX_) pthread_join(mHandle, NULL); #endif } } void GThread::detach() { mDataMutex.lock(); if(!mNotAThread) { #if defined(_TTHREAD_WIN32_) CloseHandle(mHandle); #elif defined(_TTHREAD_POSIX_) pthread_detach(mHandle); #endif mNotAThread = true; } mDataMutex.unlock(); } void GThread::wait_all() { while (GThread::num_running()>0) current_thread::sleep_for(1); } bool GThread::joinable() const { mDataMutex.lock(); bool result = !mNotAThread; mDataMutex.unlock(); return result; } int GThread::get_id() const { if(!joinable()) //return id(); return 0; //FIXME: don't use this else return mId; /* #if defined(_GTHREADS_WIN32_) return id((unsigned long int) mWin32ThreadID); #elif defined(_GTHREADS_POSIX_) return _pthread_t_to_ID(mHandle); #endif */ } unsigned GThread::hardware_concurrency() { #if defined(_GTHREADS_WIN32_) SYSTEM_INFO si; GetSystemInfo(&si); return (int) si.dwNumberOfProcessors; #elif defined(_SC_NPROCESSORS_ONLN) return (int) sysconf(_SC_NPROCESSORS_ONLN); #elif defined(_SC_NPROC_ONLN) return (int) sysconf(_SC_NPROC_ONLN); #else // The standard requires this function to return zero if the number of // hardware cores could not be determined. 
return 0; #endif } //------------------------------------------------------------------------------ // current_thread //------------------------------------------------------------------------------ /* int current_thread::get_id() { #if defined(_GTHREADS_WIN32_) return thread::id((unsigned long int) GetCurrentThreadId()); #elif defined(_GTHREADS_POSIX_) return _pthread_t_to_ID(pthread_self()); #endif } */ void current_thread::yield() { #if defined(_GTHREADS_WIN32_) Sleep(0); #else sched_yield(); #endif } // Blocks the calling thread for a certain time (given in milliseconds) // Example usage: // // Sleep for 100 milliseconds: // current_thread::sleep_for(100); void current_thread::sleep_for(const int mstime) { #if defined(_GTHREADS_WIN32_) Sleep(mstime); #else usleep((useconds_t)(mstime*1000)); #endif } gclib-0.12.7/GThreads.h000066400000000000000000000602511407072766100145510ustar00rootroot00000000000000#ifndef _GTHREADS_ #define _GTHREADS_ /* GThread - multi-platform thread management utility class This code is taken from the TinyThread++ 1.0 package by Marcus Geelnard (with only very minor alterations and naming changes). Original Copyright notice follows: ---- Copyright (c) 2010 Marcus Geelnard This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. 
This notice may not be removed or altered from any source distribution. */ /// @file /// @mainpage TinyThread++ API Reference /// /// @section intro_sec Introduction /// TinyThread++ is a minimal, portable implementation of basic threading /// classes for C++. /// /// They closely mimic the functionality and naming of the C++11 standard, and /// should be easily replaceable with the corresponding std:: variants. /// /// @section port_sec Portability /// The Win32 variant uses the native Win32 API for implementing the thread /// classes, while for other systems, the POSIX threads API (pthread) is used. /// /// @section class_sec Classes /// In order to mimic the threading API of the C++11 standard, subsets of /// several classes are provided. The fundamental classes are: /// @li GThread /// @li GMutex /// @li GRecursiveMutex /// @li GConditionVariable /// @li GLockGuard /// @li GFastMutex /// /// @section misc_sec Miscellaneous /// The following special keywords are available: #thread_local. /// /// For more detailed information (including additional classes), browse the /// different sections of this documentation. A good place to start is: /// tinythread.h. // Which platform are we on? 
#if !defined(_GTHREADS_PLATFORM_DEFINED_) #if defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__) #define _GTHREADS_WIN32_ #else #define _GTHREADS_POSIX_ #endif #define _GTHREADS_PLATFORM_DEFINED_ #endif // Check if we can support the assembly language level implementation (otherwise // revert to the system API) #if (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || \ (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || \ (defined(__GNUC__) && (defined(__ppc__))) #define _GFASTMUTEX_ASM_ #else #define _FAST_MUTEX_SYS_ #endif // Platform specific includes #if defined(_GTHREADS_WIN32_) #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #define __UNDEF_LEAN_AND_MEAN #endif #include #ifdef __UNDEF_LEAN_AND_MEAN #undef WIN32_LEAN_AND_MEAN #undef __UNDEF_LEAN_AND_MEAN #endif #else #ifdef __CYGWIN__ #define _BSD_SOURCE #endif #include #include #include #include #endif #include #include #include /// TinyThread++ version (major number). #define TINYTHREAD_VERSION_MAJOR 1 /// TinyThread++ version (minor number). #define TINYTHREAD_VERSION_MINOR 1 /// TinyThread++ version (full version). #define TINYTHREAD_VERSION (TINYTHREAD_VERSION_MAJOR * 100 + TINYTHREAD_VERSION_MINOR) // Do we have a fully featured C++11 compiler? #if (__cplusplus > 199711L) || (defined(__STDCXX_VERSION__) && (__STDCXX_VERSION__ >= 201001L)) #define _GTHREADS_CPP11_ #endif // ...at least partial C++11? #if defined(_GTHREADS_CPP11_) || defined(__GXX_EXPERIMENTAL_CXX0X__) || defined(__GXX_EXPERIMENTAL_CPP0X__) #define _GTHREADS_CPP11_PARTIAL_ #endif // Macro for disabling assignments of objects. #ifdef _GTHREADS_CPP11_PARTIAL_ #define _GTHREADS_DISABLE_ASSIGNMENT(name) \ name(const name&) = delete; \ name& operator=(const name&) = delete; #else #define _GTHREADS_DISABLE_ASSIGNMENT(name) \ name(const name&); \ name& operator=(const name&); #endif /// @def thread_local /// Thread local storage keyword. 
/// A variable that is declared with the \c thread_local keyword makes the
/// value of the variable local to each thread (known as thread-local storage,
/// or TLS). Example usage:
/// @code
/// // This variable is local to each thread.
/// thread_local int variable;
/// @endcode
/// @note The \c thread_local keyword is a macro that maps to the corresponding
/// compiler directive (e.g. \c __declspec(thread)). While the C++11 standard
/// allows for non-trivial types (e.g. classes with constructors and
/// destructors) to be declared with the \c thread_local keyword, most pre-C++11
/// compilers only allow for trivial types (e.g. \c int). So, to guarantee
/// portable code, only use trivial types for thread local storage.
/// @note This directive is currently not supported on Mac OS X (it will give
/// a compiler error), since compile-time TLS is not supported in the Mac OS X
/// executable format. Also, some older versions of MinGW (before GCC 4.x) do
/// not support this directive.
/// @hideinitializer

// Only provide the macro when the compiler is not C++11 (where thread_local
// is a keyword) and nobody defined it already.
#if !defined(_GTHREADS_CPP11_) && !defined(thread_local)
 #if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_CC) || defined(__IBMCPP__)
  #define thread_local __thread
 #else
  #define thread_local __declspec(thread)
 #endif
#endif

// HACK: Mac OS X and early MinGW do not support thread-local storage
#if defined(__APPLE__) || (defined(__MINGW32__) && (__GNUC__ < 4))
 #define GTHREADS_NO_TLS
#endif

// Prints a "GThreads Error" message for the given error code and exits
// (defined in GThreads.cpp).
void gthreads_errExit(int err, const char* msg=NULL);

// perror()-based variant: prints msg with the current errno text and exits.
#define pthreads_err(msg) \
    do { perror(msg); exit(EXIT_FAILURE); } while (0)

/// GMutex class
/// This is a mutual exclusion object for synchronizing access to shared
/// memory areas for several threads. The mutex is non-recursive (i.e. a
/// program may deadlock if the thread that owns a mutex object calls lock()
/// on that object).
/// @see GMutexRecursive
class GMutex {
  public:
    /// Constructor.
    GMutex()
#if defined(_GTHREADS_WIN32_)
      : mAlreadyLocked(false)
#endif
    {
#if defined(_GTHREADS_WIN32_)
      InitializeCriticalSection(&mHandle);
#else
      pthread_mutex_init(&mHandle, NULL);
#endif
    }

    /// Destructor.
    ~GMutex()
    {
#if defined(_GTHREADS_WIN32_)
      DeleteCriticalSection(&mHandle);
#else
      pthread_mutex_destroy(&mHandle);
#endif
    }

    /// Lock the mutex.
    /// The method will block the calling thread until a lock on the mutex can
    /// be obtained. The mutex remains locked until \c unlock() is called.
    /// @see GLockGuard
    inline void lock()
    {
#if defined(_GTHREADS_WIN32_)
      EnterCriticalSection(&mHandle);
      // a critical section can be re-entered by the thread that owns it, so
      // the flag is what makes a second lock() by the owner block
      while(mAlreadyLocked) Sleep(1000); // Simulate deadlock...
      mAlreadyLocked = true;
#else
      pthread_mutex_lock(&mHandle);
#endif
    }

    /// Try to lock the mutex.
    /// The method will try to lock the mutex. If it fails, the function will
    /// return immediately (non-blocking).
    /// @return \c true if the lock was acquired, or \c false if the lock could
    /// not be acquired.
    inline bool try_lock()
    {
#if defined(_GTHREADS_WIN32_)
      bool ret = (TryEnterCriticalSection(&mHandle) ? true : false);
      // entered, but this thread already holds the lock: undo and fail
      if(ret && mAlreadyLocked)
      {
        LeaveCriticalSection(&mHandle);
        ret = false;
      }
      return ret;
#else
      return (pthread_mutex_trylock(&mHandle) == 0) ? true : false;
#endif
    }

    /// Unlock the mutex.
    /// If any threads are waiting for the lock on this mutex, one of them will
    /// be unblocked.
    inline void unlock()
    {
#if defined(_GTHREADS_WIN32_)
      mAlreadyLocked = false;
      LeaveCriticalSection(&mHandle);
#else
      pthread_mutex_unlock(&mHandle);
#endif
    }

    // copy construction and assignment are disabled
    _GTHREADS_DISABLE_ASSIGNMENT(GMutex)

  private:
#if defined(_GTHREADS_WIN32_)
    CRITICAL_SECTION mHandle;
    bool mAlreadyLocked; // set while the mutex is held (see lock())
#else
    pthread_mutex_t mHandle;
#endif

    // GConditionVar::wait() needs direct access to mHandle
    friend class GConditionVar;
};

/// Recursive mutex class.
/// This is a mutual exclusion object for synchronizing access to shared
/// memory areas for several threads. The mutex is recursive (i.e. a thread
/// may lock the mutex several times, as long as it unlocks the mutex the same
/// number of times).
/// @see mutex class GMutexRecursive { public: /// Constructor. GMutexRecursive() { #if defined(_GTHREADS_WIN32_) InitializeCriticalSection(&mHandle); #else pthread_mutexattr_t attr; pthread_mutexattr_init(&attr); pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); pthread_mutex_init(&mHandle, &attr); #endif } /// Destructor. ~GMutexRecursive() { #if defined(_GTHREADS_WIN32_) DeleteCriticalSection(&mHandle); #else pthread_mutex_destroy(&mHandle); #endif } /// Lock the mutex. /// The method will block the calling thread until a lock on the mutex can /// be obtained. The mutex remains locked until \c unlock() is called. /// @see lock_guard inline void lock() { #if defined(_GTHREADS_WIN32_) EnterCriticalSection(&mHandle); #else pthread_mutex_lock(&mHandle); #endif } /// Try to lock the mutex. /// The method will try to lock the mutex. If it fails, the function will /// return immediately (non-blocking). /// @return \c true if the lock was acquired, or \c false if the lock could /// not be acquired. inline bool try_lock() { #if defined(_GTHREADS_WIN32_) return TryEnterCriticalSection(&mHandle) ? true : false; #else return (pthread_mutex_trylock(&mHandle) == 0) ? true : false; #endif } /// Unlock the mutex. /// If any threads are waiting for the lock on this mutex, one of them will /// be unblocked. inline void unlock() { #if defined(_GTHREADS_WIN32_) LeaveCriticalSection(&mHandle); #else pthread_mutex_unlock(&mHandle); #endif } _GTHREADS_DISABLE_ASSIGNMENT(GMutexRecursive) private: #if defined(_GTHREADS_WIN32_) CRITICAL_SECTION mHandle; #else pthread_mutex_t mHandle; #endif friend class GConditionVar; }; /// Fast mutex class. /// This is a mutual exclusion object for synchronizing access to shared /// memory areas for several threads. It is similar to the tthread::mutex class, /// but instead of using system level functions, it is implemented as an atomic /// spin lock with very low CPU overhead. 
/// /// The \c fast_mutex class is NOT compatible with the \c condition_variable /// class (however, it IS compatible with the \c lock_guard class). It should /// also be noted that the \c fast_mutex class typically does not provide /// as accurate thread scheduling as a the standard \c mutex class does. /// /// Because of the limitations of the class, it should only be used in /// situations where the mutex needs to be locked/unlocked very frequently. /// /// @note The "fast" version of this class relies on inline assembler language, /// which is currently only supported for 32/64-bit Intel x86/AMD64 and /// PowerPC architectures on a limited number of compilers (GNU g++ and MS /// Visual C++). /// For other architectures/compilers, system functions are used instead. class GFastMutex { public: /// Constructor. #if defined(_GFASTMUTEX_ASM_) GFastMutex() : mLock(0) {} #else GFastMutex() { #if defined(_GTHREADS_WIN32_) InitializeCriticalSection(&mHandle); #elif defined(_GTHREADS_POSIX_) pthread_mutex_init(&mHandle, NULL); #endif } #endif #if !defined(_GFASTMUTEX_ASM_) /// Destructor. ~GFastMutex() { #if defined(_GTHREADS_WIN32_) DeleteCriticalSection(&mHandle); #elif defined(_GTHREADS_POSIX_) pthread_mutex_destroy(&mHandle); #endif } #endif /// Lock the mutex. /// The method will block the calling thread until a lock on the mutex can /// be obtained. The mutex remains locked until \c unlock() is called. /// @see lock_guard inline void lock() { #if defined(_GFASTMUTEX_ASM_) bool gotLock; do { gotLock = try_lock(); if(!gotLock) { #if defined(_GTHREADS_WIN32_) Sleep(0); #elif defined(_GTHREADS_POSIX_) sched_yield(); #endif } } while(!gotLock); #else #if defined(_GTHREADS_WIN32_) EnterCriticalSection(&mHandle); #elif defined(_GTHREADS_POSIX_) pthread_mutex_lock(&mHandle); #endif #endif } /// Try to lock the mutex. /// The method will try to lock the mutex. If it fails, the function will /// return immediately (non-blocking). 
/// @return \c true if the lock was acquired, or \c false if the lock could /// not be acquired. inline bool try_lock() { #if defined(_GFASTMUTEX_ASM_) int oldLock; #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) asm volatile ( "movl $1,%%eax\n\t" "xchg %%eax,%0\n\t" "movl %%eax,%1\n\t" : "=m" (mLock), "=m" (oldLock) : : "%eax", "memory" ); #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) int *ptrLock = &mLock; __asm { mov eax,1 mov ecx,ptrLock xchg eax,[ecx] mov oldLock,eax } #elif defined(__GNUC__) && (defined(__ppc__)) int newLock = 1; asm volatile ( "\n1:\n\t" "lwarx %0,0,%1\n\t" "cmpwi 0,%0,0\n\t" "bne- 2f\n\t" "stwcx. %2,0,%1\n\t" "bne- 1b\n\t" "isync\n" "2:\n\t" : "=&r" (oldLock) : "r" (&mLock), "r" (newLock) : "cr0", "memory" ); #endif return (oldLock == 0); #else #if defined(_GTHREADS_WIN32_) return TryEnterCriticalSection(&mHandle) ? true : false; #elif defined(_GTHREADS_POSIX_) return (pthread_mutex_trylock(&mHandle) == 0) ? true : false; #endif #endif } /// Unlock the mutex. /// If any threads are waiting for the lock on this mutex, one of them will /// be unblocked. inline void unlock() { #if defined(_GFASTMUTEX_ASM_) #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) asm volatile ( "movl $0,%%eax\n\t" "xchg %%eax,%0\n\t" : "=m" (mLock) : : "%eax", "memory" ); #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) int *ptrLock = &mLock; __asm { mov eax,0 mov ecx,ptrLock xchg eax,[ecx] } #elif defined(__GNUC__) && (defined(__ppc__)) asm volatile ( "sync\n\t" // Replace with lwsync where possible? : : : "memory" ); mLock = 0; #endif #else #if defined(_GTHREADS_WIN32_) LeaveCriticalSection(&mHandle); #elif defined(_GTHREADS_POSIX_) pthread_mutex_unlock(&mHandle); #endif #endif } private: #if defined(_GFASTMUTEX_ASM_) int mLock; #else #if defined(_GTHREADS_WIN32_) CRITICAL_SECTION mHandle; #elif defined(_GTHREADS_POSIX_) pthread_mutex_t mHandle; #endif #endif }; /// Lock guard class. 
/// The constructor locks the mutex, and the destructor unlocks the mutex, so /// the mutex will automatically be unlocked when the lock guard goes out of /// scope. Example usage: /// @code /// mutex m; /// int counter; /// /// void increment() /// { /// lock_guard guard(m); /// ++ counter; /// } /// @endcode template class GLockGuard { public: typedef T mutex_type; GLockGuard() : mMutex(0) {} /// The constructor locks the mutex. explicit GLockGuard(mutex_type &aMutex) { mMutex = &aMutex; mMutex->lock(); } /// The destructor unlocks the mutex. ~GLockGuard() { if(mMutex) mMutex->unlock(); } private: mutex_type * mMutex; }; /// Condition variable class. /// This is a signalling object for synchronizing the execution flow for /// several threads. Example usage: /// @code /// // Shared data and associated mutex and condition variable objects /// int count; /// mutex m; /// condition_variable cond; /// /// // Wait for the counter to reach a certain number /// void wait_counter(int targetCount) /// { /// lock_guard guard(m); /// while(count < targetCount) /// cond.wait(m); /// } /// /// // Increment the counter, and notify waiting threads /// void increment() /// { /// lock_guard guard(m); /// ++ count; /// cond.notify_all(); /// } /// @endcode class GConditionVar { public: /// Constructor. #if defined(_GTHREADS_WIN32_) GConditionVar(); #else GConditionVar() { pthread_cond_init(&mHandle, NULL); } #endif /// Destructor. #if defined(_GTHREADS_WIN32_) ~GConditionVar(); #else ~GConditionVar() { pthread_cond_destroy(&mHandle); } #endif /// Wait for the condition. /// The function will block the calling thread until the condition variable /// is woken by \c notify_one(), \c notify_all() or a spurious wake up. /// @param[in] aMutex A mutex that will be unlocked when the wait operation /// starts, an locked again as soon as the wait operation is finished. 
template inline void wait(_mutexT &aMutex) { #if defined(_GTHREADS_WIN32_) // Increment number of waiters EnterCriticalSection(&mWaitersCountLock); ++ mWaitersCount; LeaveCriticalSection(&mWaitersCountLock); // Release the mutex while waiting for the condition (will decrease // the number of waiters when done)... aMutex.unlock(); _wait(); aMutex.lock(); #else pthread_cond_wait(&mHandle, &aMutex.mHandle); #endif } /// Notify one thread that is waiting for the condition. /// If at least one thread is blocked waiting for this condition variable, /// one will be woken up. /// @note Only threads that started waiting prior to this call will be /// woken up. #if defined(_GTHREADS_WIN32_) void notify_one(); #else inline void notify_one() { pthread_cond_signal(&mHandle); } #endif /// Notify all threads that are waiting for the condition. /// All threads that are blocked waiting for this condition variable will /// be woken up. /// @note Only threads that started waiting prior to this call will be /// woken up. #if defined(_GTHREADS_WIN32_) void notify_all(); #else inline void notify_all() { pthread_cond_broadcast(&mHandle); } #endif _GTHREADS_DISABLE_ASSIGNMENT(GConditionVar) private: #if defined(_GTHREADS_WIN32_) void _wait(); HANDLE mEvents[2]; ///< Signal and broadcast event HANDLEs. unsigned int mWaitersCount; ///< Count of the number of waiters. CRITICAL_SECTION mWaitersCountLock; ///< Serialize access to mWaitersCount. #else pthread_cond_t mHandle; #endif }; class GThread; struct GThreadData { void* udata; //user data GThread* thread; //current GThread object GThreadData(void* u=NULL, GThread* t=NULL):udata(u),thread(t) {} }; /// Thread class. 
class GThread { public: #if defined(_GTHREADS_WIN32_) typedef HANDLE native_handle_type; #else typedef pthread_t native_handle_type; #endif private: int mId; size_t stack_size; //available only for pthreads static int tcounter; //counts live, joinable GThread instances static int num_created; //counts all joinable GThread instances ever created by current process native_handle_type mHandle; ///< Thread handle. mutable GMutex mDataMutex; ///< Serializer for access to the thread private data. bool mNotAThread; ///< True if this object is not a thread of execution. #if defined(_GTHREADS_WIN32_) unsigned int mWin32ThreadID; ///< Unique thread ID (filled out by _beginthreadex). #endif public: /// Default constructor. /// Construct a thread object without an associated thread of execution /// (i.e. non-joinable). GThread(size_t stacksize=0) : mId(0), stack_size(stacksize), mHandle(0), mNotAThread(true) #if defined(_GTHREADS_WIN32_) , mWin32ThreadID(0) #endif {} /// Thread starting constructor. /// Construct a @c thread object with a new thread of execution. /// @param[in] aFunction A function pointer to a function of type: /// void fun(void * arg) /// @param[in] aArg Argument to the thread function. /// @note This constructor is not fully compatible with the standard C++ /// thread class. It is more similar to the pthread_create() (POSIX) and /// CreateThread() (Windows) functions. //GThread(void (*aFunction)(void *, GThread*), void * aArg); GThread(void (*aFunction)(void *), void * aArg=NULL, size_t stacksize=0); GThread(void (*aFunction)(GThreadData& thread_data), void * aArg, size_t stacksize=0); void kickStart(void (*aFunction)(GThreadData& thread_data), void * aArg, size_t stacksize=0); void kickStart(void (*aFunction)(void *), void * aArg=NULL, size_t stacksize=0); /// Destructor. /// @note If the thread is joinable upon destruction, \c std::terminate() /// will be called, which terminates the process. 
It is always wise to do /// \c join() before deleting a thread object. ~GThread(); /// Wait for the thread to finish (join execution flows). void join(); /// Check if the thread is joinable. /// A thread object is joinable if it has an associated thread of execution. bool joinable() const; /// Detach from the thread. /// After calling @c detach(), the thread object is no longer assicated with /// a thread of execution (i.e. it is not joinable). The thread continues /// execution without the calling thread blocking, and when the thread /// ends execution, any owned resources are released. void detach(); /// Return the thread ID of a thread object. int get_id() const; // { return mID; } size_t getStackSize() { return stack_size; } //only for pthreads /// Get the native handle for this thread. /// @note Under Windows, this is a \c HANDLE, and under POSIX systems, this /// is a \c pthread_t. inline native_handle_type native_handle() { return mHandle; } inline void yield() { #if defined(_GTHREADS_WIN32_) Sleep(0); #else sched_yield(); #endif } static int num_running() { //return number of running (live) threads static GFastMutex vLock; GLockGuard guard(vLock); int r=tcounter; return r; } #ifdef _GTHREADS_POSIX_ static size_t defaultStackSize() { pthread_attr_t attr; size_t stacksize; pthread_attr_init(&attr); pthread_attr_getstacksize(&attr, &stacksize); pthread_attr_destroy(&attr); return stacksize; } #endif static int liveCount() { //return number of running (live) threads return num_running(); } static void wait_all(); /// Determine the number of threads which can possibly execute concurrently. /// This function is useful for determining the optimal number of threads to /// use for a task. /// @return The number of hardware thread contexts in the system. /// @note If this value is not defined, the function returns zero (0). 
static unsigned hardware_concurrency(); _GTHREADS_DISABLE_ASSIGNMENT(GThread) private: void initStart(void* tidata, size_t stacksize=0); static void update_counter(int inc=1, GThread* t_update=NULL); //default: increments // This is the internal thread wrapper function. #if defined(_GTHREADS_WIN32_) static unsigned WINAPI wrapper_function(void * aArg); #else static void * wrapper_function(void * aArg); #endif }; /// The namespace "current_thread" provides methods for dealing with the /// calling thread. namespace current_thread { /// Return the thread ID of the calling thread. //thread::id get_id(); //this can be slow, better not use it /// Yield execution to another thread. /// Offers the operating system the opportunity to schedule another thread /// that is ready to run on the current processor. void yield(); // Blocks the calling thread for a certain time (given in milliseconds) // Example usage: // // Sleep for 100 milliseconds: // current_thread::sleep_for(100); void sleep_for(const int mstime); } // Define/macro cleanup #undef _GTHREADS_DISABLE_ASSIGNMENT #endif // _GTHREADS_ gclib-0.12.7/GVec.hh000066400000000000000000000650661407072766100140550ustar00rootroot00000000000000//--------------------------------------------------------------------------- /* Sortable collection of pointers to objects */ #ifndef _GVec_HH #define _GVec_HH #include "GBase.h" #include #define GVEC_INDEX_ERR "GVec error: invalid index: %d\n" #if defined(NDEBUG) || defined(NODEBUG) || defined(_NDEBUG) || defined(NO_DEBUG) #define TEST_INDEX(x) #else #define TEST_INDEX(x) \ if (x<0 || x>=fCount) GError(GVEC_INDEX_ERR, x) #endif #define GVEC_CAPACITY_ERR "GVec error: invalid capacity: %d\n" #define GVEC_COUNT_ERR "GVec error: invalid count: %d\n" #define MAXLISTSIZE INT_MAX-1 #define FREEDATA (fFreeProc!=NULL) template int DefLTCompareProc(pointer p1, pointer p2) { OBJ& o1 = *((OBJ*) p1); OBJ& o2 = *((OBJ*) p2); if (o1 < o2) return -1; else return ((o2 < o1) ? 
1 : 0 ); } //basic template for array of objects; //so it doesn't require comparison operators to be defined template class GVec { protected: OBJ* fArray; int fCount; int fCapacity; void qSort(int L, int R, GCompareProc* cmpFunc); public: GVec(int init_capacity=2); GVec(int init_count, const OBJ init_val); GVec(int init_count, OBJ* init_val, bool delete_initval=true); //convenience constructor for complex vectors GVec(const GVec& array); //copy constructor GVec(GVec&& array); //move constructor GVec& operator=(const GVec& array); //copy operator GVec& operator=(GVec&& array); //move operator virtual ~GVec(); void Insert(int idx, OBJ item) { Insert(idx, &item); } void Insert(int idx, OBJ* item); void idxInsert(int idx, OBJ& item) { Insert(idx, &item); } void Grow(); void Grow(int newCap); void Grow(int idx, OBJ& item); //grow and add/insert item copy void Reverse(); //WARNING: will break the sort order if SORTED! int Add(OBJ* item); // simply append to the end of fArray, reallocating as needed int Add(OBJ& item) { return Add(&item); } int cAdd(OBJ item) { return Add(&item); } //all these will CREATE a new OBJ and COPY to it // // using OBJ copy operator= // -- stack/queue usage: //int Push(OBJ& item) { return Add(&item); } int Push(OBJ& item) { return Add(&item); } int cPush(OBJ item) { return Add(&item); } OBJ Pop();// Stack use; removes and returns a copy of the last item OBJ Shift(); //Queue use: removes and returns a copy of the first item void Shift(int idx); //Queue use: removes first idx elements from array void Add(GVec& list); //append copies of all items from another list OBJ& Get(int idx) { TEST_INDEX(idx); return fArray[idx]; } inline OBJ& operator[](int i) { TEST_INDEX(i); return fArray[i]; } OBJ& Last() { TEST_INDEX(fCount-1); return fArray[fCount-1]; } OBJ& First() { TEST_INDEX(0); return fArray[0]; } void Clear(); void Delete(int index); void Replace(int idx, OBJ& item); //Put, use operator= to copy void Exchange(int idx1, int idx2); void Swap(int 
idx1, int idx2) { Exchange(idx1, idx2); } int Capacity() { return fCapacity; } //this will reject identical items in sorted lists only! inline void setCapacity(int NewCapacity) { if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); //error: NewCapacity MUST be > fCount //if you want to shrink it use Resize() or setCount() if (NewCapacity!=fCapacity) { if (NewCapacity==0) { delete[] fArray; fArray=nullptr; } else { OBJ* oldArray=fArray; fArray=new OBJ[NewCapacity]; if (oldArray) { std::move(&oldArray[0], &oldArray[fCount], & fArray[0]); delete[] oldArray; } //this relies on in-class initializers, default constructors etc. } fCapacity=NewCapacity; } } /* void insertGrow(int NewCapacity, int idx, OBJ& item) { OBJ* newList=new OBJ[NewCapacity]; // operator= required! std::move(& fArray[0], & fArray[idx], & newList[0]); newList[idx]=item; //copy data after idx std::move(& fArray[idx], & fArray[fCount], & newList[idx+1]); delete[] fArray; fArray=newList; fCapacity=NewCapacity; fCount++; } */ template typename std::enable_if< std::is_trivial::value, void>::type topOff(int NewCount) { memset(fArray+fCount, 0, (NewCount-fCount)*sizeof(OBJ)); } template typename std::enable_if< !std::is_trivial::value, void>::type topOff(int NewCount) { OBJ v; if (NewCount>fCount) for (int i=fCount;i0; } void Sort(GCompareProc* cmpFunc); void Sort(); }; //---- template for dynamic array of object pointers //---- it's faster than GVec and has item deallocation awareness template class GPVec { protected: OBJ* *fList; //pointer to an array of pointers to objects int fCount; //total number of entries in list int fCapacity; //current allocated size GFreeProc* fFreeProc; //useful for deleting objects //--- void Expand(); void Grow(); void Grow(int idx, OBJ* item); void qSort(int L, int R, GCompareProc* cmpFunc); public: static void DefaultFreeProc(pointer item) { delete (OBJ*)item; } virtual ~GPVec(); GPVec(int init_capacity=2, bool free_elements=true); 
//also the default constructor GPVec(bool free_elements); GPVec(const GPVec& list); //copy constructor GPVec(GPVec&& list); //move construstor GPVec(GPVec* list); //similar to a copy constructor GPVec& operator=(const GPVec& list); GPVec& operator=(GPVec&& list);//move assignment operator inline OBJ* Get(int i) { TEST_INDEX(i); return fList[i]; } //OBJ* operator[](int i) { return this->Get(i); } inline OBJ*& operator[](int i) { TEST_INDEX(i); return fList[i]; } void Reverse(); //reverse pointer array; WARNING: will break(reverse) the sort order if sorted! void freeItem(int idx); //calls fFreeProc (or DefaultFreeProc) on fList[idx] and sets NULL there, doesn't pack! //it will free even if fFreeProc is NULL! void setFreeItem(GFreeProc *freeProc) { fFreeProc=freeProc; } void setFreeItem(bool doFree) { if (doFree) fFreeProc=DefaultFreeProc; else fFreeProc=NULL; } // -- stack usage: int Push(OBJ* item) { return Add(item); } OBJ* Pop();// Stack use; removes and returns last item,but does NOT FREE it OBJ* Shift(); //Queue use: removes and returns first item, but does NOT FREE it void deallocate_item(OBJ*& item); //forcefully call fFreeProc or delete on item void Clear(); void Exchange(int idx1, int idx2); void Swap(int idx1, int idx2) { Exchange(idx1, idx2); } OBJ* First() { return (fCount>0)?fList[0]:nullptr; } OBJ* Last() { return (fCount>0)?fList[fCount-1]:nullptr;} bool isEmpty() { return fCount==0; } bool notEmpty() { return fCount>0; } int Capacity() { return fCapacity; } int Count() { return fCount; } void setCapacity(int NewCapacity); void setCount(int NewCount); //the same as setCapacity() but the new item range is filled with NULLs int Add(OBJ* item); //simply append the pointer copy void Add(GPVec& list); //add all pointers from another list void addNew(GPVec& list); //add new OBJ copies of items in another list void Insert(int idx, OBJ* item); void Move(int curidx, int newidx); void Put(int idx, OBJ* item); void Pack(); void Delete(int index); //also frees the 
item if fFreeProc!=NULL, and shifts the successor items void Forget(int idx); //simply places a NULL at fList[idx], nothing else int RemovePtr(pointer item); //always use linear search to find the pointer! calls Delete() if found int IndexOf(pointer item); //a linear search for pointer address! void Sort(GCompareProc* cmpFunc); void Sort(); }; //-------------------- TEMPLATE IMPLEMENTATION------------------------------- template GVec::GVec(int init_capacity) { fCount=0; fCapacity=0; fArray=nullptr; setCapacity(init_capacity); //if (set_count) fCount = init_capacity; } template GVec::GVec(int init_count, const OBJ init_val) { fCount=0; fCapacity=0; fArray=nullptr; setCapacity(init_count); fCount = init_count; for (int i=0;i GVec::GVec(int init_count, OBJ* init_val, bool delete_initval) { fCount=0; fCapacity=0; fArray=nullptr; setCapacity(init_count); fCount = init_count; for (int i=0;i GVec::GVec(const GVec& array) { //copy constructor fCount=array.fCount; fCapacity=array.fCapacity; fArray=nullptr; if (fCapacity>0) { fArray=new OBJ[this->fCapacity]; std::copy(& array.fArray[0], & array.fArray[fCount], & fArray[0]); } } template GVec::GVec(GVec&& array) { //move constructor fCount=array.fCount; fCapacity=array.fCapacity; fArray=array.fArray; array.fArray=nullptr; array.fCount=0; array.fCapacity=0; } template GVec& GVec::operator=(const GVec& array) { if (&array==this) return *this; Clear(); fCapacity=array.fCapacity; fCount=array.fCount; if (fCapacity>0) { fArray=new OBJ[this->fCapacity]; // uses OBJ operator= std::copy(& array.fArray[0], & array.fArray[fCount], & fArray[0]); } return *this; } template GVec& GVec::operator=(GVec&& array) { if (&array==this) return *this; Clear(); fCapacity=array.fCapacity; fCount=array.fCount; fArray=array.fArray; array.fArray=nullptr; array.fCapacity=0; array.fCount=0; return *this; } template GVec::~GVec() { this->Clear(); } template void GVec::Clear() { fCount=0; delete[] fArray; fArray=nullptr; fCapacity=0; } template void 
GVec::Grow() { int delta = (fCapacity>8) ? (fCapacity>>2) : 2 ; setCapacity(fCapacity + delta); } template void GVec::Grow(int newCap) { if (newCap void GVec::Reverse() { int l=0; int r=fCount-1; OBJ c; while (l void GVec::Grow(int idx, OBJ& item) { int delta = (fCapacity>8) ? (fCapacity>>2) : 1 ; int NewCapacity=fCapacity+delta; if (NewCapacity <= fCount || NewCapacity >= MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); //error: capacity not within range if (idx==fCount) { //append item setCapacity(NewCapacity); fArray[idx]=item; fCount++; return; } //expands and inserts item at idx at the same time //insertGrow(NewCapacity, idx, item); OBJ* newList=new OBJ[NewCapacity]; // operator= required! std::move(& fArray[0], & fArray[idx], & newList[0]); newList[idx]=item; //copy data after idx std::move(& fArray[idx], & fArray[fCount], & newList[idx+1]); delete[] fArray; fArray=newList; fCapacity=NewCapacity; fCount++; } template int GVec::Add(OBJ* item) { if (item==NULL) return -1; if (fCount==fCapacity) Grow(); fArray[fCount] = *item; //OBJ::operator= must copy OBJ properly! fCount++; return fCount-1; } template void GVec::Add(GVec& list) { if (list.Count()==0) return; //simply copy setCapacity(fCapacity+list.fCount); if (std::is_trivial::value) { memcpy( &fArray[fCount], list.fArray, list.fCount*sizeof(OBJ)); } else { for (int i=0;i OBJ GVec::Pop() { if (fCount<=0) GError("Error: invalid GVec::Pop() operation!\n"); fCount--; //OBJ o(fArray[fCount]); //copy constructor //o=fList[fCount]; //fArray[fCount]=NULL; return fArray[fCount]; //copy of the last element (copy constructor called) } //Queue usage: template OBJ GVec::Shift() { if (fCount<=0) GError("Error: invalid GVec::Shift() operation!\n"); fCount--; OBJ o(fArray[0]); //copy constructor if (fCount>0) memmove(&fArray[0], &fArray[1], (fCount)*sizeof(OBJ)); //fList[fCount]=NULL; //not that it matters.. 
return o; } template void GVec::Shift(int idx) { if (idx<=0 || fCount-idx<=0) GError("Error: invalid GVec::Shift() operation!\n"); fCount-=idx; if (fCount>0) memmove(&fArray[0], &fArray[idx], (fCount)*sizeof(OBJ)); } template void GVec::Insert(int idx, OBJ* item) { //idx must be the new position this new item must have //so the allowed range is [0..fCount] //the old idx item all the above will be shifted to idx+1 if (idx<0 || idx>fCount) GError(GVEC_INDEX_ERR, idx); if (fCount==fCapacity) { //need to resize the array //expand and also copy/move data and insert the new item //more efficient than copying everything and then moving elements Grow(idx, *item); return; } //move data around to make room for the new item if (idxshiftUp(idx); std::move_backward(& fArray[idx], & fArray[fCount], & fArray[fCount+1]); } fArray[idx]=*item; fCount++; } /*template void GVec::Move(int curidx, int newidx) { //swap if (curidx!=newidx || newidx>=fCount) GError(GVEC_INDEX_ERR, newidx); OBJ tmp=fArray[curidx]; //copy constructor here fArray[curidx]=fArray[newidx]; fArray[newidx]=tmp; }*/ template void GVec::Replace(int idx, OBJ& item) { TEST_INDEX(idx); fArray[idx]=item; } template void GVec::Exchange(int idx1, int idx2) { TEST_INDEX(idx1); TEST_INDEX(idx2); OBJ item=fArray[idx1]; fArray[idx1]=fArray[idx2]; fArray[idx2]=item; } template void GVec::Delete(int idx) { TEST_INDEX(idx); std::move(& fArray[idx+1], & fArray[fCount], & fArray[idx]); fCount--; /* if (std::is_trivial::value) { if (index void GVec::setCount(int NewCount) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); //if (NewCount > fCapacity) setCapacity(NewCount); Grow(NewCount); if (NewCount>fCount) this->topOff(NewCount); fCount = NewCount; //new items should be populated by the default object constructor(!) 
} /* template void GVec::setCount(int NewCount, OBJ* v) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); while (NewCount > fCapacity) Grow(); if (NewCount>fCount) { for (int i=fCount;i void GVec::setCount(int NewCount, OBJ v) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); Grow(NewCount); if (NewCount>fCount) { for (int i=fCount;i void GVec::qSort(int l, int r, GCompareProc* cmpFunc) { int i, j; OBJ p,t; do { i = l; j = r; p = this->fArray[l+ ((r - l) >> 1)]; do { while (cmpFunc(&(this->fArray[i]), &p) < 0) i++; while (cmpFunc(&(this->fArray[j]), &p) > 0) j--; if (i <= j) { t = this->fArray[i]; this->fArray[i] = this->fArray[j]; this->fArray[j] = t; i++; j--; } } while (i <= j); if (l < j) qSort(l, j, cmpFunc); l = i; } while (i < r); } template void GVec::Sort(GCompareProc* cmpFunc) { if (cmpFunc==NULL) { GMessage("Warning: NULL compare function given, useless Sort() call.\n"); return; } if (this->fArray!=nullptr && this->fCount>0) qSort(0, this->fCount-1, cmpFunc); } template void GVec::Sort() { GCompareProc* cmpFunc = DefLTCompareProc; Sort(cmpFunc); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //*=> GPVec implementation template GPVec::GPVec(const GPVec& list) { //copy constructor fCount=list.fCount; fCapacity=list.fCapacity; fList=nullptr; fFreeProc=list.fFreeProc; fCount=list.fCount; if (fCapacity>0) { //GMALLOC(fList, fCapacity*sizeof(OBJ*)); fList=new OBJ*[fCapacity]; std::move(&list.fList[0], &list.fList[fCount], &fList[0]); //memcpy(fList, list.fList, fCount*sizeof(OBJ*)); } } template GPVec::GPVec(GPVec&& list) { //copy constructor fCount=list.fCount; fCapacity=list.fCapacity; fList=list.fList; fFreeProc=list.fFreeProc; list.fCount=0; list.fCapacity=0; list.fList=nullptr; } template GPVec::GPVec(GPVec* plist) { //another copy constructor fCount=0; fCapacity=plist->fCapacity; fList=nullptr; fFreeProc=plist->fFreeProc; fCount=plist->fCount; if (fCapacity>0) { 
//GMALLOC(fList, fCapacity*sizeof(OBJ*)); fList=new OBJ*[fCapacity]; std::move(& (plist->fList[0]), & (plist->fList[fCount]), &fList[0]); //memcpy(fList, plist->fList, fCount*sizeof(OBJ*)); } } template GPVec& GPVec::operator=(const GPVec& list) { if (&list!=this) { Clear(); fFreeProc=list.fFreeProc; //Attention: only the *POINTERS* are copied, // the actual objects are NOT duplicated fCount=list.fCount; fCapacity=list.fCapacity; if (fCapacity>0) { //GMALLOC(fList, fCapacity*sizeof(OBJ*)); //memcpy(fList, list.fList, fCount*sizeof(OBJ*)); fList=new OBJ*[fCapacity]; std::move(&list.fList[0], &list.fList[fCount], &fList[0]); } } return *this; } template GPVec& GPVec::operator=(GPVec&& list) { if (&list!=this) { Clear(); fFreeProc=list.fFreeProc; //Attention: only the *POINTERS* are copied, // the actual objects are NOT duplicated fCount=list.fCount; fCapacity=list.fCapacity; fList=list.fList; list.fList=nullptr; list.fCapacity=0; list.fCount=0; } return *this; } template void GPVec::Add(GPVec& list) { if (list.Count()==0) return; //simply copy the pointers! -- the objects will be shared setCapacity(fCapacity+list.fCount); memcpy( & (fList[fCount]), list.fList, list.fCount*sizeof(OBJ*)); fCount+=list.fCount; } template void GPVec::addNew(GPVec& list) { if (list.Count()==0) return; //requires OBJ copy constructor setCapacity(fCapacity+list.fCount); //memcpy( & (fList[fCount]), list.fList, list.fCount*sizeof(OBJ*)); for (int i=fCount;i void GPVec::Reverse() { int l=0; int r=fCount-1; OBJ* c; while (l GPVec::GPVec(int init_capacity, bool free_elements) { fCount=0; fCapacity=0; fList=nullptr; fFreeProc=(free_elements) ? DefaultFreeProc : NULL; if (init_capacity>0) setCapacity(init_capacity); } template GPVec::GPVec(bool free_elements) { fCount=0; fCapacity=0; fList=nullptr; fFreeProc=(free_elements) ? 
DefaultFreeProc : NULL; } template GPVec::~GPVec() { this->Clear();//this will free the items if fFreeProc is defined } template void GPVec::setCapacity(int NewCapacity) { if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); //error: capacity not within range if (NewCapacity!=fCapacity) { if (NewCapacity==0) { //GFREE(fList); delete[] fList; fList=nullptr; } else { //GREALLOC(fList, NewCapacity*sizeof(OBJ*)); OBJ** oldList=fList; fList=new OBJ*[NewCapacity]; if (oldList) { std::move(&oldList[0], &oldList[fCount], & fList[0]); delete[] oldList; } } fCapacity=NewCapacity; } } template void GPVec::deallocate_item(OBJ* &item) { if (item==NULL) return; if (FREEDATA) { (*fFreeProc)(item); item=NULL; } else { delete item; item=NULL; } } template void GPVec::Clear() { if (FREEDATA) { for (int i=0; i void GPVec::Exchange(int idx1, int idx2) { TEST_INDEX(idx1); TEST_INDEX(idx2); OBJ* item=fList[idx1]; fList[idx1]=fList[idx2]; fList[idx2]=item; } template void GPVec::Expand() { if (fCount==fCapacity) Grow(); //return this; } template void GPVec::Grow() { int delta = (fCapacity>8) ? (fCapacity>>2) : 1; setCapacity(fCapacity + delta); } template void GPVec::Grow(int idx, OBJ* item) { int delta = (fCapacity>8) ? 
(fCapacity>>2) : 1 ; int NewCapacity=fCapacity+delta; if (NewCapacity <= fCount || NewCapacity > MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); if (idx==fCount) { //GREALLOC(fList, NewCapacity*sizeof(OBJ*)); setCapacity(NewCapacity); fList[idx]=item; fCount++; return; } OBJ** newList=new OBJ*[NewCapacity]; std::move(& fList[0], & fList[idx], & newList[0]); newList[idx]=item; //copy data after idx std::move(& fList[idx], & fList[fCount], & newList[idx+1]); delete[] fList; fList=newList; fCapacity=NewCapacity; fCount++; } /* GMALLOC(newList, NewCapacity*sizeof(OBJ*)); //copy data before idx memcpy(&newList[0],&fList[0], idx*sizeof(OBJ*)); newList[idx]=newitem; //copy data after idx memmove(&newList[idx+1],&fList[idx], (fCount-idx)*sizeof(OBJ*)); memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ*)); //data copied: GFREE(fList); fList=newList; fCount++; fCapacity=NewCapacity; */ template int GPVec::IndexOf(pointer item) { for (int i=0;i int GPVec::Add(OBJ* item) { int result; if (item==NULL) return -1; result = fCount; if (result==fCapacity) this->Grow(); fList[result]=item; fCount++; return fCount-1; } template void GPVec::Insert(int idx, OBJ* item) { //idx can be [0..fCount] so an item can be actually added if (idx<0 || idx>fCount) GError(GVEC_INDEX_ERR, idx); if (fCount==fCapacity) { Grow(idx, item); return; } if (idx void GPVec::Move(int curidx, int newidx) { //s //BE_UNSORTED; //cannot do that in a sorted list! if (curidx!=newidx || newidx>=fCount) GError(GVEC_INDEX_ERR, newidx); OBJ* p; p=Get(curidx); //this is a delete: fCount--; if (curidx void GPVec::Put(int idx, OBJ* item) { //WARNING: this will never free the replaced item! 
TEST_INDEX(idx); fList[idx]=item; } template void GPVec::Forget(int idx) { TEST_INDEX(idx); fList[idx]=NULL; //user should free that somewhere else } template void GPVec::freeItem(int idx) { TEST_INDEX(idx); if (fFreeProc!=NULL) { (*fFreeProc)(fList[idx]); } else this->DefaultFreeProc(fList[idx]); fList[idx]=NULL; } template void GPVec::Delete(int index) { TEST_INDEX(index); if (fFreeProc!=NULL && fList[index]!=NULL) { (*fFreeProc)(fList[index]); //freeItem } fList[index]=NULL; fCount--; if (index OBJ* GPVec::Pop() { if (fCount<=0) return NULL; fCount--; OBJ* o=fList[fCount]; fList[fCount]=NULL; return o; } //Queue usage: template OBJ* GPVec::Shift() { if (fCount<=0) return NULL; fCount--; OBJ* o=fList[0]; if (fCount>0) memmove(&fList[0], &fList[1], (fCount)*sizeof(OBJ*)); fList[fCount]=NULL; //not that it matters.. return o; } //linear search for the pointer address template int GPVec::RemovePtr(pointer item) { if (item==NULL) return -1; for (int i=0;i void GPVec::Pack() { for (int i=fCount-1; i>=0; i--) if (fList[i]==NULL) Delete(i); //shift rest of fList content accordingly } template void GPVec::setCount(int NewCount) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); if (NewCount > fCapacity) setCapacity(NewCount); if (NewCount > fCount) //pad with NULL pointers! 
memset(& fList[fCount], 0, (NewCount - fCount) * sizeof(OBJ*)); fCount = NewCount; } template void GPVec::qSort(int L, int R, GCompareProc* cmpFunc) { int I, J; OBJ* P; OBJ* T; do { I = L; J = R; P = this->fList[L + ((R-L)>>1)]; do { while (cmpFunc(this->fList[I], P) < 0) I++; while (cmpFunc(this->fList[J], P) > 0) J--; if (I <= J) { T = this->fList[I]; this->fList[I] = this->fList[J]; this->fList[J] = T; I++; J--; } } while (I <= J); if (L < J) qSort(L, J, cmpFunc); L = I; } while (I < R); } template void GPVec::Sort(GCompareProc* cmpFunc) { if (cmpFunc==NULL) { GMessage("Warning: NULL compare function given, useless Sort() call.\n"); return; } if (this->fList!=NULL && this->fCount>0) qSort(0, this->fCount-1, cmpFunc); } template void GPVec::Sort() { GCompareProc* cmpFunc = DefLTCompareProc; Sort(cmpFunc); } //--------------------------------------------------------------------------- #endif gclib-0.12.7/GapAssem.cpp000066400000000000000000001144431407072766100151060ustar00rootroot00000000000000#include "GapAssem.h" const unsigned char GA_flag_IS_REF=0; const unsigned char GA_flag_HAS_PARENT=1; const unsigned char GA_flag_BAD_ALIGN=7; const unsigned char GA_flag_PREPPED=5; //bool GASeq::debug=false; bool MSAColumns::removeConsGaps = true; bool MSAColumns::refineClipping = true; //unsigned int GSeqAlign::counter = 0; int qsortnuc(const void* p1, const void* p2) { GAlnColumn::NucCount* c1 = (GAlnColumn::NucCount*) p1; GAlnColumn::NucCount* c2 = (GAlnColumn::NucCount*) p2; return (c1->count > c2->count) ? -1 : ((c1->count < c2->count) ? 1 : 0); } int compareOrdnum(void* p1, void* p2) { int v1 = ((GSeqAlign*) p1)->ordnum; int v2 = ((GSeqAlign*) p2)->ordnum; return (v1 < v2) ? -1 : ((v1 > v2) ? 1 : 0); } int compareCounts(void* p1, void* p2) { int v1 = ((GSeqAlign*) p1)->Count(); int v2 = ((GSeqAlign*) p2)->Count(); return (v1 > v2) ? -1 : ((v1 < v2) ? 
1 : 0); } GASeq::GASeq(const char* sname, const char* sdescr, const char* sseq, int slen, int soffset) : FastaSeq(sname, sdescr, sseq, slen), numgaps(0), ofs(NULL), delops(false, true, false),flags(0),msa(NULL), msaidx(-1), seqlen(slen), offset(soffset), ng_ofs(soffset),revcompl(0), ext5(0), ext3(0), clp5(0), clp3(0) { seqlen = len; //FastaSeq constructor settles it if (seqlen>0) { GCALLOC(ofs, seqlen * sizeof(short)); #ifdef ALIGN_COVERAGE_DATA GCALLOC(cov,seqlen*sizeof(int)); #endif } } GASeq::GASeq(const char* sname, int soffset, int slen, int sclipL, int sclipR, char rev) : FastaSeq(sname), numgaps(0), ofs(NULL), delops(false, true, false),flags(0),msa(NULL), msaidx(-1), seqlen(slen), offset(soffset), ng_ofs(soffset),revcompl(rev), ext5(0), ext3(0), clp5(sclipL), clp3(sclipR) { if (seqlen>0) { GCALLOC(ofs, seqlen * sizeof(short)); #ifdef ALIGN_COVERAGE_DATA GCALLOC(cov,seqlen*sizeof(int)); #endif } } GASeq::GASeq(GASeq& aseq): FastaSeq(aseq.id, aseq.descr, aseq.seq, aseq.len), numgaps(0), ofs(NULL), delops(false, true, false),flags(0),msa(NULL), msaidx(-1), seqlen(aseq.len), offset(0), ng_ofs(0),revcompl(0), ext5(0), ext3(0), clp5(0), clp3(0) { if (seqlen>0) { GCALLOC(ofs, seqlen * sizeof(short)); #ifdef ALIGN_COVERAGE_DATA GCALLOC(cov,seqlen*sizeof(int)); #endif } } GASeq::GASeq(FastaSeq& faseq, bool takeover):FastaSeq(faseq, takeover), numgaps(0), ofs(NULL), delops(false, true, false),flags(0),msa(NULL), msaidx(-1), seqlen(len), offset(0), ng_ofs(0),revcompl(0), ext5(0), ext3(0), clp5(0), clp3(0) { if (seqlen>0) { GCALLOC(ofs, seqlen * sizeof(short)); #ifdef ALIGN_COVERAGE_DATA GCALLOC(cov,seqlen*sizeof(int)); #endif } } GASeq::~GASeq() { GFREE(ofs); #ifdef ALIGN_COVERAGE_DATA GFREE(cov); #endif } /* void GASeq::loadProcessing() { //process all delops for (int i = 0; i < delops->Count(); i++) { SeqDelOp& delop = *delops->Get(i); int pos = delop.revcompl ? 
len - delop.pos - 1 : delop.pos; removeBase(pos); } if (revcompl == 1) reverseComplement(); } */ void GASeq::finalize() { if (this->len==0) GError("Error: sequence for %s not loaded!\n",this->getId()); if (!this->hasFlag(GA_flag_PREPPED)) this->prepSeq(); } void GASeq::prepSeq() { //should only be called once (use hasFlag() before calling) //apply all deletions to the sequence for (int i = 0; i < delops.Count(); i++) { SeqDelOp& delop = *delops.Get(i); int pos = delop.revcompl ? len - delop.pos - 1 : delop.pos; removeBase(pos); } if (revcompl == 1) reverseComplement(); setFlag(GA_flag_PREPPED); } //set the gap length in this position void GASeq::setGap(int pos, short gaplen) { if (pos < 0 || pos >= seqlen) GError("Error: invalid gap position (%d) given for sequence %s\n", pos + 1, id); numgaps -= ofs[pos]; ofs[pos] = gaplen; numgaps += gaplen; } //add to the existing gap length in this position void GASeq::addGap(int pos, short gapadd) { if (pos < 0 || pos >= seqlen) GError("Error: invalid gap position (%d) given for sequence %s\n", pos + 1, id); numgaps += gapadd; ofs[pos] += gapadd; } void GASeq::removeBase(int pos) { if (pos < 0 || pos >= seqlen) GError("Error: invalid gap position (%d) given for sequence %s\n", pos + 1, id); //if there is a gap at that position, remove the gap //otherwise, remove the actual nucleotide! //if (ofs[pos]>0) { ofs[pos]--; numgaps--; // return; // } /* if it's end base or within clipping -- // don't remove, just adjust clipping if (revcompl!=0) { //reversed in this alignment if (pos<=clp3) { if (pos==clp3) clp3++; offset--; return; } if (pos>=seqlen-clp5-1) { if (pos==seqlen-clp5-1) clp5++; return; } } else {//forward if (pos<=clp5) { if (pos==clp5) clp5++; offset--; return; } if (pos>=seqlen-clp3-1) { if (pos==seqlen-clp3-1) clp3++; return; } } */ //-- couldn't just this do it ? /* ofs[pos]--; numgaps--; return; */ /* //=========================================== // worst case: modify/rebuild the whole sequence.. 
//short* newofs; seqlen--; memmove(ofs+pos, ofs+pos+1, seqlen-pos); //do the same for the actual sequence, if loaded if (len>0) {//sequence loaded len--; memmove(seq+pos, seq+pos+1, len-pos); endSeq(); } else { //push this sequence editing information on a stack for later processing delops->Add(new SeqDelOp(pos, revcompl!=0)); } */ } void GASeq::refineClipping(GDynArray& cons, int cpos, bool skipDels) { //check if endings match consensus.. //adjust clipping as appropriate //int clipL, clipR; if (clp3 == 0 && clp5 == 0) return; int& clipL = (revcompl != 0) ? clp3 : clp5; int& clipR = (revcompl != 0) ? clp5 : clp3; //build the gapped sequence string in memory char* gseq; int glen = seqlen + numgaps; int allocsize = glen; int gclipR = clipR; int gclipL = clipL; if (skipDels) { for (int i = 1; i <= clipR; i++) { if (ofs[seqlen - i] < 0) allocsize++; else gclipR += ofs[seqlen - i]; } for (int i = 0; i < clipL; i++) { if (ofs[i] < 0) allocsize++; else gclipL += ofs[i]; } } else { for (int i = 1; i <= clipR; i++) gclipR += ofs[seqlen - i]; for (int i = 0; i < clipL; i++) gclipL += ofs[i]; } int* gxpos; //mapping of positions from gseq to seq GMALLOC(gxpos, allocsize * sizeof(int)); GMALLOC(gseq, allocsize + 1); gseq[allocsize] = 0; int gseqpos = 0; for (int i = 0; i < seqlen; i++) { //bool notClip=(i>=clipL && i= clipL && i < seqlen - clipR) //in non-clipped region continue; //skip gaps else glen++; } for (int j = 0; j < ofs[i]; j++) { gseq[gseqpos] = '*'; gseqpos++; } gseq[gseqpos] = seq[i]; gxpos[gseqpos] = i; gseqpos++; } gseq[allocsize] = 0; if (glen != allocsize) GError( "Length mismatch (allocsize %d vs. 
glen %d) while refineClipping for seq %s !\n", allocsize, glen, id); //adjust end clipping, using a simple X-drop algorithm // match_reward=1, mismatch_penalty=-3 #define XDROP -16 #define MATCH_SC 1 #define MISMATCH_SC -3 if (clipR > 0) { //-------------- clipR ------------------------- // actual span of clipped regions in the alignment // could be larger then clipR/clipL due to gaps propagated // WITHIN the clipped regions //******** right end adjust // cp = last "matching" position on consensus int cp = cpos + glen - gclipR - 1; // sp = corresponding last match position on read int sp = glen - gclipR - 1; //we could be on a mismatch or a gap // so, first go backward to find the first match while (gseq[sp] != cons[cp] || gseq[sp] == '*') { if (gseq[sp] != '*') clipR++; sp--; cp--; if (sp < gclipL) { GMessage( "Warning: reached clipL trying to find an initial match on %s!\n", id); GFREE(gseq); return; //break } } //now go forward for as much as we can, using the dropoff test int score = MATCH_SC; int maxscore = MATCH_SC; int startpos = sp; int bestpos = sp; //new Right clipping position for maxscore while (score > XDROP && ++cp < (int)cons.Count() && ++sp < glen) { if (gseq[sp] == cons[cp]) { if (gseq[sp] != '*') { //real match score += MATCH_SC; //clpinc++; if (score > maxscore) { bestpos = sp; maxscore = score; } //better score than before } //real match } //match else { //mismatch if (gseq[sp] != '*') { score += MISMATCH_SC; //clipinc++; } } } //while XDROP if (bestpos > startpos) clipR = seqlen - gxpos[bestpos] - 1; } //<------- was clipR // ******** left end adjust if (clipL > 0) { // cp = last "matching" position on consensus int cp = cpos + gclipL; // sp = corresponding last match position on read int sp = gclipL; // we could be on a mismatch or a gap // so, first go backward to find the first match while (gseq[sp] != cons[cp] || gseq[sp] == '*') { if (gseq[sp] != '*') clipL++; sp++; cp++; if (sp >= glen - gclipR) { GMessage( "Warning: reached clipR trying 
to find an initial match on %s!\n", id); GFREE(gseq); return; //break } } //-- now go backward for as much as we can, using the dropoff test int score = MATCH_SC; int maxscore = MATCH_SC; int startpos = sp; int bestpos = sp; while (score > XDROP && --cp >= 0 && --sp >= 0) { if (gseq[sp] == cons[cp]) { if (gseq[sp] != '*') { //real match score += MATCH_SC; if (score > maxscore) { bestpos = sp; maxscore = score; } //better score than before } //real match } //match else { //mismatch if (gseq[sp] != '*') { score += MISMATCH_SC; } } } //while XDROP if (bestpos < startpos) clipL = gxpos[bestpos]; } //is clipL GFREE(gseq); GFREE(gxpos); } void GASeq::reverseGaps() { //--when reading mgblast alignments and gap info //the gap positions are reversed starting and shifted by 1 //because the first ofs is always 0 int l = 1; int r = seqlen - 1; while (l < r) { short c = ofs[l]; ofs[l] = ofs[r]; ofs[r] = c; l++; r--; } } void GASeq::revComplement(int alignlen) { if (alignlen > 0) { //sequence is in an alignment offset = alignlen - endOffset(); if (msa != NULL) { ng_ofs = msa->ng_len - endNgOffset(); if (msa->minoffset > offset) msa->minoffset = offset; if (msa->ng_minofs > ng_ofs) msa->ng_minofs = ng_ofs; } } revcompl = !revcompl; if (len == seqlen) //sequence is loaded, complement it reverseComplement(); reverseGaps(); //-- also reverse the coverage array: #ifdef ALIGN_COVERAGE_DATA int l=0;int r=seqlen-1; while (lseqlen) GError("GSeqAlign Error: invalid addCoverage %s(len %d) vs %s(len %d)\n", name(),seqlen, s->name(), s->seqlen); if (s->revcompl!=revcompl) { for (int i=0;icov[seqlen-i-1]; } else for (int i=0;icov[i]; } #endif void GASeq::printGappedSeq(FILE* f, int baseoffs) { // for now, simple console printing of short sequences -- for testing if (len == 0 || len != seqlen) GError( "GASeq print Error: invalid sequence data '%s' (len=%d, seqlen=%d)\n", id, len, seqlen); int i; int clipL, clipR; if (revcompl != 0) { clipL = clp3; clipR = clp5; } else { clipL = clp5; clipR = 
clp3; } for (i = 0; i < (offset - baseoffs); i++) fprintf(f, " "); for (i = 0; i < seqlen; i++) { if (ofs[i] < 0) continue; //deleted base for (int j = 0; j < ofs[i]; j++) fprintf(f, "-"); char c = seq[i]; if (i < clipL || i >= seqlen - clipR) c = (char) tolower(c); fprintf(f, "%c", c); } //for each base fprintf(f, "\n"); } void GASeq::printGappedFasta(FILE* f) { if (len == 0 || len != seqlen) GError( "GASeq print Error: invalid sequence data '%s' (len=%d, seqlen=%d)\n", id, len, seqlen); int i; /* //FIXME TESTME - original mblaor had this uncommented! int clipL, clipR; if (revcompl != 0) { clipL = clp3; clipR = clp5; } else { clipL = clp5; clipR = clp3; } */ int printed = 0; for (i = 0; i < seqlen; i++) { if (ofs[i] < 0) continue; //deleted base for (int j = 0; j < ofs[i]; j++) { fprintf(f, "*"); printed++; if (printed == 60) { fprintf(f, "\n"); printed = 0; } } char c = seq[i]; printed++; if (printed == 60) { fprintf(f, "%c\n", c); printed = 0; } else fprintf(f, "%c", c); } //for each base if (printed < 60) fprintf(f, "\n"); } void GASeq::printMFasta(FILE* f, int llen) { if (len == 0 || len != seqlen) GError("GASeq print Error: invalid sequence data '%s' (len=%d, seqlen=%d)\n", id, len, seqlen); if (this->descrlen>0) fprintf(f, ">%s %s\n", id, descr); else fprintf(f, ">%s\n", id); int i; int printed = 0; for (i=0;i= seqlen - clipR) { //with right clipping delgapsR += ofs[i]; ofs[i] = 0; } } //for offset += delgapsL; numgaps -= (delgapsL + delgapsR); return delgapsL + delgapsR; } void GASeq::toMSA(MSAColumns& msacols, int nucValue) { if (len == 0 || len != seqlen) GError( "GASeq::toMSA Error: invalid sequence data '%s' (len=%d, seqlen=%d)\n", id, len, seqlen); int i; int clipL, clipR; if (revcompl != 0) { clipL = clp3; clipR = clp5; } else { clipL = clp5; clipR = clp3; } int mincol = INT_MAX; int maxcol = 0; //for (i=0;i<(offset-msa.baseoffset);i++) col++; int col = offset - msa->minoffset; for (i = 0; i < seqlen; i++) { bool clipped; if (i < clipL || i >= seqlen 
- clipR) clipped = true; else { clipped = false; if (mincol == INT_MAX) mincol = col; } for (int j = 0; j < ofs[i]; j++) { //storing gap if (!clipped) msacols[col].addGap(nucValue); col++; } msacols[col].addNuc(this, i, clipped, nucValue); if (!clipped) maxcol = col; col++; } //for each base //update msa's min-max msacols.updateMinMax(mincol, maxcol); } //=================================== GSeqAlign =============================== // -- creation from a pairwise alignment // l1-r1 = coordinates of alignment region on s1 // l2-r2 = coordinates of alignment region on s2 // coordinates MUST be 0-based #ifdef ALIGN_COVERAGE_DATA GSeqAlign::GSeqAlign(GASeq* s1, int l1, int r1, GASeq* s2, int l2, int r2) //:GList(true,true,false) { :GList(false,true,false) { #else GSeqAlign::GSeqAlign(GASeq* s1, GASeq* s2) : GList(false, true, false), length(0), minoffset(0), refinedMSA(false), msacolumns(NULL), ordnum(0), ng_len(0),ng_minofs(0), badseqs(0), consensus(512), consensus_bq(512) { #endif s1->msa = this; s2->msa = this; //the offset for at least one sequence is 0 this->Add(s1); this->Add(s2); minoffset = GMIN(s1->offset, s2->offset); ng_minofs = minoffset; //no gaps in the clipped regions for now length = GMAX(s1->endOffset(), s2->endOffset()); length -= minoffset; ng_len = GMAX(s1->endNgOffset(), s2->endNgOffset()); ng_len -= ng_minofs; //-- according to the alignment, update the coverage for each sequence //-- overlaps are granted +1 bonus #ifdef ALIGN_COVERAGE_DATA for (int i=l1;icov[i]++; for (int i=l2;icov[i]++; //-- mismatch regions at the left end int msml=(l2>l1) ? l1:l2; for (int i=1;i<=msml;i++) { s1->cov[l1-msml]--; s2->cov[l2-msml]--; } //-- mismatch regions at the right end int cr1=s1->seqlen-r1-1; int cr2=s2->seqlen-r2-1; int msmr=(cr2>cr1) ? 
cr1:cr2; for (int i=1;i<=msmr;i++) { s1->cov[r1+msmr]--; s2->cov[r2+msmr]--; } #endif } //merge other alignment omsa into this msa //seq->id MUST be the same with oseq->id bool GSeqAlign::addAlign(GASeq* seq, GSeqAlign* omsa, GASeq* oseq) { //error checking -- could be disabled to speed it up a bit if (seq->seqlen != oseq->seqlen) GError("GSeqAlign Error: invalid merge %s(len %d) vs %s(len %d)\n", seq->getName(), seq->seqlen, oseq->getName(), oseq->seqlen); // for this merge to work, the shared sequence MUST have // the same orientation in both MSAs if (seq->revcompl != oseq->revcompl) omsa->revComplement(); //reverse-complement all sequences in omsa #ifdef ALIGN_COVERAGE_DATA //add coverage values: seq->addCoverage(oseq); #endif //--- now propagate gaps as appropriate for (int i = 0; i < seq->seqlen; i++) { int d = seq->gap(i) - oseq->gap(i); if (d > 0) { //extra gap in seq //propagate into all omsa omsa->injectGap(oseq, i, d); continue; } //extra gap in seq if (d < 0) { //extra gap in oseq //propagate into all msa injectGap(seq, i, -d); continue; } //extra gap in oseq } //--for each base position //--now add the sequences from omsa to this MSA for (int i = 0; i < omsa->Count(); i++) { GASeq* s = omsa->Get(i); if (s == oseq) continue; //adjust offset -- which can be extended by gaps in seq BEFORE s->offset //the offsets had been adjusted already (by injectGap() method) // to account for propagated gaps in both MSAs! //--add this sequence //cluster minoffset and length will be updated too! 
addSeq(s, seq->offset + s->offset - oseq->offset, seq->ng_ofs + s->ng_ofs - oseq->ng_ofs); } omsa->setFreeItem(false); //delete omsa; //we no longer need this alignment delete oseq; //also deletes oseq return true; } //just to automatically set the offset, msa, //and to update the MSA length if needed void GSeqAlign::addSeq(GASeq* s, int soffs, int ngofs) { s->offset = soffs; s->ng_ofs = ngofs; s->msa = this; this->Add(s); //keep track of minimum offset //this also adjusts length! if (soffs < minoffset) { length += minoffset - soffs; minoffset = soffs; } if (ngofs < ng_minofs) { ng_len += ng_minofs - ngofs; ng_minofs = ngofs; } //adjust length of alignment if very long sequences were added if (s->endOffset() - minoffset > length) length = s->endOffset() - minoffset; if (s->endNgOffset() - ng_minofs > ng_len) ng_len = s->endNgOffset() - ng_minofs; } //propagate a gap in a sequence into the whole alignment containing it // offsets of all seqs after the gap MUST be adjusted too! void GSeqAlign::injectGap(GASeq* seq, int pos, int xgap) { //find the actual alignment position of this pos in the layout int alpos = seq->offset + pos; for (int i = 0; i <= pos; i++) alpos += seq->gap(i); //now alpos = the exact offset of seq[pos] in this MSA for (int i = 0; i < Count(); i++) { GASeq* s = Get(i); int spos = 0; // finding out position of gap in seq s if (s == seq) spos = pos; else { //walk to lpos on sequence s int salpos = s->offset; if (salpos >= alpos) { //s->offset is AFTER this gap, so only the offset is affected s->offset += xgap; continue; } while (spos < s->seqlen) { salpos += 1 + s->gap(spos); if (salpos > alpos) break; spos++; } if (spos >= s->seqlen) //spos is AFTER the end of sequence s continue; // s not affected //--it is a valid position for this sequence //--TO DO: clipping? first/last positions? 
} s->addGap(spos, xgap); } //for each sequense in MSA length += xgap; } void GSeqAlign::removeColumn(int column) { int alpos = column + minoffset; for (int i = 0; i < Count(); i++) { GASeq* s = Get(i); int spos = 0; // finding out position of this base in seq s int salpos = s->offset; if (salpos >= alpos) { //s->offset is AFTER this gap, so only the offset is affected s->offset--; //deletion of 1 continue; } while (spos < s->seqlen) { salpos += 1 + s->gap(spos); if (salpos > alpos) break; spos++; } if (spos >= s->seqlen) //spos is AFTER the end of sequence s continue; // s not affected //--now spos is a valid position for this sequence // } s->removeBase(spos); } //for each sequence in MSA length--; } void GSeqAlign::removeBase(GASeq* seq, int pos) { //find the actual alignment position of this pos in the layout int alpos = seq->offset + pos; for (int i = 0; i <= pos; i++) alpos += seq->gap(i); //now alpos = the exact offset of seq[pos] in this MSA for (int i = 0; i < Count(); i++) { GASeq* s = Get(i); int spos = 0; // finding out position of this base in seq s if (s == seq) spos = pos; else { //walk to lpos on sequence s int salpos = s->offset; if (salpos >= alpos) { //s->offset is AFTER this gap, so only the offset is affected s->offset--; //deletion of 1 continue; } while (spos < s->seqlen) { salpos += 1 + s->gap(spos); if (salpos > alpos) break; spos++; } if (spos >= s->seqlen) //spos is AFTER the end of sequence s continue; // s not affected //--now spos is a valid position for this sequence } s->removeBase(spos); } //for each sequence in MSA length--; } void GSeqAlign::applyClipping(AlnClipOps& clipops) { for (int i = 0; i < clipops.Count(); i++) { SeqClipOp& cop = *clipops.Get(i); if (cop.clp[0] >= 0) cop.seq->clp5 = cop.clp[0]; if (cop.clp[1] >= 0) cop.seq->clp3 = cop.clp[1]; } } bool GSeqAlign::evalClipping(GASeq* seq, int c5, int c3, float clipmax, AlnClipOps& clipops) { //propagate trimming of a read to the rest of this container MSA //-- returns false 
if any of the reads in this MSA are clipped too much! //GList clipops(false,true,false); if (c5 >= 0) { //the position of the first/last non-clipped letter int pos = (seq->revcompl != 0) ? seq->seqlen - c5 - 1 : c5; //find the actual alignment position of this pos in the layout int alpos = seq->offset + pos; for (int i = 0; i <= pos; i++) alpos += seq->gap(i); //alpos = the position of seq[pos] in this MSA for (int i = 0; i < Count(); i++) { GASeq* s = Get(i); if (s == seq) { if (!clipops.add5(s, c5, clipmax)) return false; continue; } int spos = 0; // finding out position in seq s //walk to lpos on sequence s //salpos is going to be the position of seq[pos] in s int salpos = s->offset; if (salpos >= alpos) { //-- s starts AFTER this alpos position if (seq->revcompl != 0) { //clipping right side // which means ALL of s is to the right => clipped entirely! // !!! TODO: return false; //clipops.Add(new SeqClipOp(s, s->seqlen)); } continue; } while (spos < s->seqlen) { salpos += 1 + s->gap(spos); if (salpos > alpos) break; spos++; } if (spos >= s->seqlen) { //s ends BEFORE this alpos position if (seq->revcompl == 0) { //clipping left side // which means ALL of s is to the left => clipped entirely! 
return false; } continue; // s not affected } //--it is a valid position for this sequence //now spos is in the corresponding position of pos //trim s here if (seq->revcompl != 0) { //trim right side in this msa if (s->revcompl != 0) { if (!clipops.add5(s, s->seqlen - spos - 1, clipmax)) return false; /*if (s->clp5clp3,clipmax)) return false; clipops.Add(new SeqClipOp(s,newclp)); }*/ } else { if (!clipops.add3(s, s->seqlen - spos - 1, clipmax)) return false; /*if (s->clp3clp5,newclp,clipmax)) return false; clipops.Add(new SeqClipOp(s,-1,newclp)); }*/ } } else { //trim left side in this msa if (s->revcompl != 0) { if (!clipops.add3(s, spos, clipmax)) return false; /*if (s->clp3clp5,spos,clipmax)) return false; clipops.Add(new SeqClipOp(s,-1,spos)); }*/ } else { if (!clipops.add5(s, spos, clipmax)) return false; /*if (s->clp5clp3,clipmax)) return false; clipops.Add(new SeqClipOp(s,spos)); }*/ } } } //for each sequense in MSA } // 5' clipping case //--------------- if (c3 >= 0) { //the position of the first/last non-clipped letter int pos = (seq->revcompl != 0) ? c3 : seq->seqlen - c3 - 1; //find the actual alignment position of this pos in the layout int alpos = seq->offset + pos; for (int i = 0; i <= pos; i++) alpos += seq->gap(i); //now alpos = the exact offset of seq[pos] in this MSA for (int i = 0; i < Count(); i++) { GASeq* s = Get(i); if (s == seq) { if (!clipops.add3(s, c3, clipmax)) return false; /*if (s->clp3clp5,c3,clipmax)) return false; clipops.Add(new SeqClipOp(s,-1,c3)); }*/ continue; } int spos = 0; // finding out position in seq s //walk to lpos on sequence s int salpos = s->offset; if (salpos >= alpos) { //-- s starts AFTER this alpos position if (seq->revcompl == 0) { //clipping right side // which means ALL of s is to the right => clipped entirely! 
return false; //clipops.Add(new SeqClipOp(s, s->seqlen)); } continue; } while (spos < s->seqlen) { salpos += 1 + s->gap(spos); if (salpos > alpos) break; spos++; } if (spos >= s->seqlen) { //s ends BEFORE this alpos position if (seq->revcompl != 0) { //clipping left side // which means ALL of s is to the left => clipped entirely! return false; //clipops.Add(new SeqClipOp(s, s->seqlen)); } continue; // s not affected } //--it is a valid position for this sequence //now spos is in the corresponding position of pos //trim s here if (seq->revcompl != 0) { //trim left side in this msa if (s->revcompl != 0) { if (!clipops.add3(s, spos, clipmax)) return false; /*if (s->clp3clp5,spos,clipmax)) return false; clipops.Add(new SeqClipOp(s,-1, spos)); }*/ } else { if (!clipops.add5(s, spos, clipmax)) return false; /*if (s->clp5clp3,clipmax)) return false; clipops.Add(new SeqClipOp(s,spos)); }*/ } } else { //trim right side in this msa //int newclp=s->seqlen-spos-1; if (s->revcompl != 0) { if (!clipops.add5(s, s->seqlen - spos - 1, clipmax)) return false; /*if (s->clp5clp3,clipmax)) return false; clipops.Add(new SeqClipOp(s,newclp)); }*/ } else { if (!clipops.add3(s, s->seqlen - spos - 1, clipmax)) return false; /*if (s->clp3clp5,newclp,clipmax)) return false; clipops.Add(new SeqClipOp(s,-1, newclp)); }*/ } } } //for each sequense in MSA } // 3' clipping return true; } void GSeqAlign::revComplement() { for (int i = 0; i < Count(); i++) { GASeq* s = Get(i); s->revComplement(length); } Sort(); } void GSeqAlign::finalize() { //prepare for printing for (int i=0;ilen==0) GError("Error: sequence for %s not loaded!\n",s->getId()); if (!s->hasFlag(GA_flag_PREPPED)) s->prepSeq(); } } void GSeqAlign::print(FILE* f, char c) { finalize(); //this calls prepSeq as needed to reverse complement sequence etc. 
int max = 0; for (int i = 0; i < Count(); i++) { int n = Get(i)->getNameLen(); if (n > max) max = n; } char fmtstr[128]; fmtstr[0] = '%'; sprintf(&fmtstr[1], "%d", max); strcat(fmtstr, "s %c "); if (c != 0) { // draw a separator line built from c fprintf(f, fmtstr, " ", ' '); for (int k = 0; k < length; k++) fprintf(f, "%c", c); fprintf(f, "\n"); } for (int i = 0; i < Count(); i++) { GASeq* s = Get(i); char orientation = s->revcompl == 1 ? '-' : '+'; fprintf(f, fmtstr, s->name(), orientation); s->printGappedSeq(f, minoffset); } } void GSeqAlign::writeMSA(FILE* f, int linelen) { finalize(); for (int i = 0; i < Count(); i++) { GASeq& s = *Get(i); // s.printMFasta(f, linelen); } } char GAlnColumn::bestChar(int16_t *qscore) { //returns most frequent char -- could be a gap! if (layers == 0) return 0; if (consensus != 0) return consensus; if (!countsSorted) { qsort(counts, 6, sizeof(NucCount), qsortnuc); countsSorted = true; } int r = 0; char best = counts[0].nt; int bq = counts[0].count; for (;r<5;) { //if gap or N have the same freq as a real base, pick the base instead if ((best == '-' || best == 'N') && counts[r].count == counts[r + 1].count) { r++; best = counts[r].nt; bq = counts[r].count; } else break; } if (qscore!=NULL) { for (int q=0;q<6;++q) { if (q!=r) bq-=counts[q].count; } if (bq<=SHRT_MIN) bq=SHRT_MIN+1; if (bq>=SHRT_MAX) bq=SHRT_MAX-1; *qscore=(short)bq; } consensus = best; return best; } void GAlnColumn::remove() { if (hasClip) { clipnuc->seq->msa->removeBase(clipnuc->seq, clipnuc->pos); return; } if (nucs->Count() > 0) { NucOri* n = nucs->Get(0); n->seq->msa->removeBase(n->seq, n->pos); //this should also be enough to propagate the deletion // to all involved sequences! 
// (all affected ofs[] and offsets) return; } GMessage( "Warning: column remove() couldn't find a sequence at that position!\n"); } void GSeqAlign::buildMSA(bool refWeighDown) { if (msacolumns != NULL) GError("Error: cannot call buildMSA() twice!\n"); msacolumns = new MSAColumns(length, minoffset); for (int i = 0; i < Count(); i++) { GASeq* seq = Get(i); seq->msaidx = i; // if GSeqAlign is sorted by offset // this could speed up some later adjustments if (seq->seqlen - seq->clp3 - seq->clp5 < 1) { GMessage("Warning: sequence %s (length %d) was trimmed too badly (%d,%d)" " -- should be removed from MSA w/ %s!\n", seq->id, seq->seqlen, seq->clp5, seq->clp3, Get(0)->id); seq->setFlag(GA_flag_BAD_ALIGN); //bad-align flag! badseqs++; } int incVal=1; if (refWeighDown && !seq->hasFlag(GA_flag_IS_REF)) { incVal = 10; } seq->toMSA(*msacolumns, incVal); } //this->Pack(); } void GSeqAlign::freeMSA() { if (msacolumns != NULL) { delete msacolumns; msacolumns = NULL; } //free sequence data too! for (int i = 0; i < Count(); i++) { GASeq* seq = Get(i); char* p = seq->detachSeqPtr(); GFREE(p); } } void GSeqAlign::ErrZeroCov(int col) { int cnt = Count(); fprintf(stderr, "WARNING: 0 coverage column %d (mincol=%d) found within alignment of %d seqs!\n", col, msacolumns->mincol, cnt); for (int i = 0; i < cnt; i++) { GASeq* seq = Get(i); fprintf(stderr, "%s\n", seq->id); } exit(5); } void GSeqAlign::refineMSA(bool refWeighDown, bool redo_ends) { if (redo_ends) { //TODO: //recompute consensus at the ends of MSA INCLUDING trimmed sequence //and recompute the trimming accordingly } else { //freeze end trimming //if (msacolumns==NULL) buildMSA(refWeighDown); //populate MSAColumns only based on existing trimming } //==> remove columns and build consensus int cols_removed = 0; for (int col = msacolumns->mincol; col <= msacolumns->maxcol; col++) { int16_t qscore=0; char c = msacolumns->columns[col].bestChar(&qscore); if (c == 0) { //should never be the case! 
ErrZeroCov(col); c = '*'; } if (c == '-' || c == '*') { c = '*'; if (MSAColumns::removeConsGaps) { removeColumn(col - cols_removed); cols_removed++; //this will delete the corresponding nucleotides //from every involved read, also updating the offsets of //every read AFTER this column continue;//don't add this gap to the consensus } } extendConsensus(c, qscore); } //make sure consensus is 0 terminated: char e=0;consensus.Add(e);consensus.Pop(); //-- refine clipping and remove gaps propagated in the clipping regions for (int i = 0; i < Count(); i++) { GASeq* seq = Get(i); //if (seq->hasFlag(7)) continue; -- checking the badalign flag.. //refine clipping -- first pass: if (MSAColumns::refineClipping) seq->refineClipping(consensus, //consensus_len, seq->offset - minoffset - msacolumns->mincol); //..remove any "gaps" in the non-aligned (trimmed) regions int grem = 0; if (MSAColumns::removeConsGaps) grem = seq->removeClipGaps(); //if any gaps were removed, take one more shot at //refining the clipping -- we may get lucky and realign better.. 
if (grem != 0 && MSAColumns::refineClipping) seq->refineClipping(consensus, //consensus_len, seq->offset - minoffset - msacolumns->mincol, true); } refinedMSA = true; } void GSeqAlign::extendConsensus(char c, int16_t bq) { /* int newlen = consensus_len + 1; if (newlen >= consensus_cap) { consensus_cap += 128; if (consensus_len == 0) { GMALLOC(consensus, consensus_cap); } else { GREALLOC(consensus, consensus_cap); } } consensus[consensus_len] = c; consensus[newlen] = 0; consensus_len++; */ consensus.Add(c); if (bq!=SHRT_MIN && consensus_bq.Count()==consensus.Count()-1) { consensus_bq.Add(bq); } } void GSeqAlign::writeACE(FILE* f, const char* name, bool refWeighDown) { //--build a consensus sequence if (!refinedMSA) refineMSA(refWeighDown); //FastaSeq conseq((char*)name); //conseq.setSeqPtr(consensus, consensus_len, consensus_cap); int fwd = 0; //number of reversed reads int rvs = 0; //number of forward reads for (int i = 0; i < Count(); i++) { GASeq* seq = Get(i); if (seq->revcompl != 0) rvs++; else fwd++; } char consDir = (rvs > fwd) ? 'C' : 'U'; fprintf(f, "CO %s %d %d 0 %c\n", name, consensus.Count(), Count(), consDir); // conseq.len, Count()-badseqs, consDir); //conseq.fprint(f,60); FastaSeq::write(f, NULL, NULL, consensus(), 60, consensus.Count()); //fprintf(f, "\nBQ \n\n"); //TODO: print consensus_bq array values here! fprintf(f, "\nBQ\n"); int bl=0; for (uint i=0;ihasFlag(7)) continue; -- checking the badalign flag.. char sc = (seq->revcompl == 0) ? 'U' : 'C'; fprintf(f, "AF %s %c %d\n", seq->id, sc, seq->offset - minoffset - msacolumns->mincol + 1); } fprintf(f, "\n"); // a second pass to write the actual read entries.. 
for (int i = 0; i < Count(); i++) { GASeq* seq = Get(i); //if (seq->hasFlag(7)) continue; //badalign flag set int gapped_len = seq->seqlen + seq->numgaps; fprintf(f, "RD %s %d 0 0\n", seq->id, gapped_len); seq->printGappedFasta(f); int clpl, clpr; if (seq->revcompl == 0) { //forward clpl = seq->clp5; clpr = seq->clp3; } else { //reverse complement clpl = seq->clp3; clpr = seq->clp5; } int l = clpl; int r = clpr; for (int j = 1; j <= r; j++) clpr += seq->ofs[seq->seqlen - j]; for (int j = 0; j <= l; j++) clpl += seq->ofs[j]; int seql = clpl + 1; int seqr = gapped_len - clpr; if (seqr < seql) { fprintf(stderr, "Bad trimming for %s of gapped len %d (%d, %d)\n", seq->id, gapped_len, seql, seqr); seqr = seql + 1; } fprintf(f, "\nQA %d %d %d %d\nDS \n\n", seql, seqr, seql, seqr); } } void GSeqAlign::writeInfo(FILE* f, const char* name, bool refWeighDown) { /* File format should match assembly & asmbl_link tables in our db: >contig_name seq_count contig_sequence seqname seqlen offset asm_lend asm_rend seq_lend seq_rend pid alndata Notes: seq_lend>seq_rend if the sequence is reverse complemented */ //--build the actual MSA and a consensus sequence, if not done yet: // this will also remove the consensus gaps as appropriate (unless disabled) if (!refinedMSA) refineMSA(refWeighDown); //-- also compute this, just in case: // redundancy = sum(asm_rend-asm_lend+1)/contig_len //(and also the pid for each reads vs. 
consensus) fprintf(f, ">%s %d %s\n", name, Count(), consensus()); float redundancy = 0; // = sum(asm_rend-asm_lend+1)/contig_len for (int i = 0; i < Count(); i++) { GASeq* seq = Get(i); //GStr alndata; //if (seq->hasFlag(7)) continue; //badalign flag set int gapped_len = seq->seqlen + seq->numgaps; //fprintf(f, "RD %s %d 0 0\n", seq->id, gapped_len); /*seq->printGappedFasta(f);*/ int seqoffset = seq->offset - minoffset - msacolumns->mincol + 1; int clpl, clpr; int asml = seqoffset + 1; int asmr = asml - 1; float pid = 0; if (seq->revcompl == 0) { //forward clpl = seq->clp5; clpr = seq->clp3; } else { //reverse complement clpl = seq->clp3; clpr = seq->clp5; } int aligned_len = 0; //int indel_ofs = 0; //distance to last indel position for (int j = seq->clp5; j < seq->seqlen - seq->clp3; j++) { int indel = seq->ofs[j]; //char indel_type = 0; asmr += indel + 1; if (indel < 0) { //deletion //indel_type = 'd'; indel = -indel; } else { // indel>=0 //actually aligned nucleotide here //if (indel > 0) // indel_type = 'g'; //else // // indel==0, no indel at all // indel_ofs++; if (toupper(seq->seq[j]) == toupper(consensus[asmr - 1])) pid++; aligned_len++; } /* if (indel_type) { if (indel > 2) alndata.appendfmt("%d%c%d-", indel_ofs, indel_type, indel); else for (int r = 0; r < indel; r++) alndata += indel_type; indel_ofs = 0; } */ } pid = (pid * 100.0) / (float) aligned_len; redundancy += aligned_len; /*int l=clpl; int r=clpr; for (int j=1;j<=r;j++) clpr+=seq->ofs[seq->seqlen-j]; for (int j=0;j<=l;j++) clpl+=seq->ofs[j];*/ int seql = clpl + 1; int seqr = seq->len - clpr; if (seqr < seql) { fprintf(stderr, "WARNING: Bad trimming for %s of gapped len %d (%d, %d)\n", seq->id, gapped_len, seql, seqr); seqr = seql + 1; } if (seq->revcompl) Gswap(seqr, seql); // id ln of al ar sl sr pi an fprintf(f, "%s %d %d %d %d %d %d %4.2f\n", seq->id, seq->len, seqoffset, asml, asmr, seql, seqr, pid); //alndata.chars()); } redundancy /= (float) consensus.Count(); } 
gclib-0.12.7/GapAssem.h000066400000000000000000000355641407072766100145610ustar00rootroot00000000000000#ifndef G_GAP_ASSEM_DEFINED #define G_GAP_ASSEM_DEFINED #include "GFastaFile.h" #include "gdna.h" #include "GList.hh" #include "GHash.hh" #include class GSeqAlign; class MSAColumns; extern const unsigned char GA_flag_IS_REF; extern const unsigned char GA_flag_HAS_PARENT; extern const unsigned char GA_flag_BAD_ALIGN; extern const unsigned char GA_flag_PREPPED; struct SeqDelOp { int pos; bool revcompl; SeqDelOp(int p, bool r) { pos=p; revcompl=r;} // bool operator==(SeqDelOp& d){ return (this==&d); } bool operator<(SeqDelOp& d){ return (this<&d); } }; class GASeq : public FastaSeq { protected: int numgaps; //total number of accumulated gaps in this sequence short *ofs; //array of gaps at each position; //a negative value (-1) means DELETION of the nucleotide //at that position! #ifdef ALIGN_COVERAGE_DATA int* cov; //coverage of every nucleotide of this seq // it starts with 0 by itself // it'll be decreased by -1 for mismatching ends! #endif GList delops; //delete operations void prepSeq(); //reverse complement if needed, and apply deletions (delops) //should only be called once when the MSA is complete (by GSeqAlign::finalize()) public: unsigned char flags; //8 general purpose boolean flags (bits) // bad_align flag is the last bit -- i.e. 
bit 7 // all the others (0..6) are free for custom use GSeqAlign* msa; int msaidx; //actual index at which this sequence is to be found in GASeqAlign; int seqlen; // exactly the size of ofs[] int offset; //offset in the layout int ng_ofs; //non-gapped offset in the layout //(approx, for clipping constraints only) char revcompl; //0 = forward, 1=reverse int ext5; // layout-positional extension at 5' end int ext3; // layout-positional extension at 3' end //-- int clp5; //ever "disproved" region at 5' end int clp3; //ever "disproved" region at 3' end //------- comparison operators (for GList) : //sorting by offset in cluster void allupper() { for (int i=0;i(GASeq& d){ return (offset>d.offset); } bool operator<(GASeq& d){ return (offset0) { GCALLOC(ofs, seqlen * sizeof(short)); #ifdef ALIGN_COVERAGE_DATA GCALLOC(cov,seqlen*sizeof(int)); #endif } }; GASeq(const char* sname, const char* sdescr=NULL, const char* sseq=NULL, int slen=0, int soffset=0); GASeq(const char* sname, int soffset, int slen, int sclipL=0, int sclipR=0, char rev=0); ~GASeq(); void refineClipping(GDynArray& cons, int cpos, bool skipDels=false); void setGap(int pos, short gaplen=1); // set the gap in this pos void addGap(int pos, short gapadd); //extend the gap in this pos //bitno is 0 based here, for simplicity: inline void setFlag(unsigned char bitno) { flags |= ((unsigned char)1 << bitno); } inline void clearFlag(unsigned char bitno) { flags ^= ((unsigned char)1 << bitno); } inline bool hasFlag(unsigned char bitno) { return ( (((unsigned char)1 << bitno) & flags) !=0 ); } int getNumGaps() { return numgaps; } int gap(int pos) { return ofs[pos]; } void removeBase(int pos); //remove the nucleotide at that position int endOffset() { return offset+seqlen+numgaps; } int endNgOffset() { return ng_ofs+seqlen; } int removeClipGaps(); //remove gaps within clipped regions //offset should be corrected appropriately! 
void printGappedSeq(FILE* f, int baseoffs=0); void printGappedSeq(int baseoffs=0) { printGappedSeq(stdout, baseoffs); } void printGappedFasta(FILE* f); void printMFasta(FILE* f, int llen=60); //offset padded //void loadProcessing(); //to be called immediately after loading the sequence // it will revCompl if needed and apply delops void finalize(); //delete inserts and reverse complement sequence if needed #ifdef ALIGN_COVERAGE_DATA void addCoverage(GASeq* s); #endif void reverseGaps(); //don't update offset and flags //useful after reading mgblast gap info void revComplement(int alignlen=0); void toMSA(MSAColumns& msa, int nucValue=1); }; // -- nucleotide origin -- for every nucleotide in a MSA column // this info is needed by SNP reporting class NucOri { public: GASeq* seq; int pos; //0-based position of nucleotide letter NucOri() { seq=NULL; pos=0; } NucOri(GASeq* s, int p) { seq=s; pos=p; } bool operator==(NucOri& d){ return (strcmp(seq->id,d.seq->id)==0 && pos==d.pos); } bool operator>(NucOri& d){ int cmp=strcmp(seq->id,d.seq->id); if (cmp==0) return pos>d.pos; else return cmp>0; } bool operator<(NucOri& d){ int cmp=strcmp(seq->id,d.seq->id); if (cmp==0) return pos(SeqClipOp& d){ return (this>&d); } bool operator<(SeqClipOp& d){ return (this<&d); } }; class AlnClipOps :public GList { public: char q_rev; int d5; int d3; //--- int total; AlnClipOps():GList(false,true,false) { total=0; d5=0; d3=0; q_rev=false; } bool add5(GASeq* s, int clp, float clipmax) { if (s->clp50) { int maxovh = clipmax>1 ? (int)clipmax : iround(clipmax * (float)s->seqlen); if (clp>maxovh) return false; } //----- base test: the read should be left with no less than 25% of its length if (s->seqlen-s->clp3-clp < (s->seqlen >> 2)) return false; total+=10000+clp-s->clp5; Add(new SeqClipOp(s,clp,-1)); } return true; } bool add3(GASeq* s, int clp, float clipmax) { if (s->clp30) { int maxovh = clipmax>1 ? 
(int)clipmax : iround(clipmax * (float)s->seqlen); if (clp>maxovh) return false; } //----- base test: if the read is left with less than 25% of its length if (s->seqlen-s->clp5-clp < (s->seqlen >> 2)) return false; total+= 10000+clp-s->clp3; Add(new SeqClipOp(s,-1,clp)); } return true; } bool add(GASeq* s, int clp5, int clp3, float clipmax) { int newclp5=-1; int newclp3=-1; int add=0; if (s->clp50) { int maxovh = clipmax>1 ? (int)clipmax : iround(clipmax * (float)s->seqlen); if (clp5>maxovh) return false; } //----- base test: if the read is left with less than 25% of its length! if (s->seqlen-s->clp3-clp5 < (s->seqlen >> 2)) return false; add+= 10000+clp5-s->clp5; newclp5=clp5; } else clp5=s->clp5; if (s->clp30) { int maxovh = clipmax>1 ? (int)clipmax : iround(clipmax * (float)s->seqlen); if (clp3>maxovh) return false; } //----- base test: if the read is left with less than 25% of its length! if (s->seqlen-clp5-clp3 < (s->seqlen >> 2)) return false; add+=10000+clp3-s->clp3; newclp3=clp3; } if (add>0) { total+=add; Add(new SeqClipOp(s,newclp5,newclp3)); } return true; } }; class GAlnColumn { protected: struct NucCount { char nt; // A, C, G, T, N or - // precisely in this order (except after qsort) int count; void set(char l, int num=0) { nt=l;count=num; } }; enum { ncA=0, ncC, ncG, ncT, ncN, ncGap }; NucCount counts[6]; bool countsSorted; public: bool hasClip; char consensus; int layers; //total "thickness" NucOri* clipnuc; GList* nucs; friend int qsortnuc(const void* p1, const void* p2); //int total() { return numgaps+numN+numA()+numC()+numG()+numT(); } GAlnColumn():countsSorted(false),hasClip(false), consensus(0), layers(0), clipnuc(NULL), nucs(NULL) { //sorted?, free?, unique? 
nucs=new GList(false,true,false); /*lstC=new GList(false,true,false); lstG=new GList(false,true,false); lstT=new GList(false,true,false);*/ counts[ncA].set('A'); counts[ncC].set('C'); counts[ncG].set('G'); counts[ncT].set('T'); counts[ncN].set('N'); counts[ncGap].set('-'); } ~GAlnColumn() { delete nucs; if (clipnuc!=NULL) delete clipnuc; } void addGap(int nucVal=1) { counts[ncGap].count+=nucVal; layers++; //-- Not a "layer", actually //numgaps++; } void addNuc(GASeq* seq, int pos, bool clipped=false, short nucVal=1) { //assumes the seq is already loaded and reverse complemented if necessary //position is precisely where it should be if (clipped) { if (hasClip==false) { hasClip=true; clipnuc = new NucOri(seq,pos); } return; } char c=(char)toupper(seq->seq[pos]); switch (c) { case 'A':nucs->Add(new NucOri(seq,pos)); counts[ncA].count+=nucVal; layers++; break; case 'C':nucs->Add(new NucOri(seq,pos)); counts[ncC].count+=nucVal; layers++; break; case 'G':nucs->Add(new NucOri(seq,pos)); counts[ncG].count+=nucVal; layers++; break; case 'T':nucs->Add(new NucOri(seq,pos)); counts[ncT].count+=nucVal; layers++; break; case '-': //this shouldn't be the case! 
case '*':counts[ncGap].count+=nucVal; layers++; //numgaps++; break; default: nucs->Add(new NucOri(seq,pos)); counts[ncN].count+=nucVal; layers++; //numN++; }//switch } char bestChar(int16_t *qscore=NULL); void remove(); //removes a nucleotide from all involved sequences //adjust all affected offsets in the alignment }; // A MSA columns container class MSAColumns { int size; public: static bool removeConsGaps; static bool refineClipping; GAlnColumn* columns; int baseoffset; int mincol; int maxcol; MSAColumns(int len, int baseofs=0) { columns=new GAlnColumn[len]; size=len; baseoffset=baseofs; mincol=INT_MAX; maxcol=0; } ~MSAColumns() { size=0; baseoffset=0; delete[] columns; } GAlnColumn& operator[](int idx) { if (idx<0 || idx>=size) GError("MSAColumns op[]: bad index %d (size=%d)\n", idx,size); return columns[idx]; } int len() { return maxcol-mincol+1; } void updateMinMax(int minc, int maxc) { if (mincmaxcol) maxcol=maxc; } }; //----------------------------------------------- // a sequence alignment: could be pairwise or MSA class GSeqAlign :public GList { //static unsigned int counter; int length; int minoffset; //int consensus_cap; void buildMSA(bool refWeighDown=false); void ErrZeroCov(int col); public: bool refinedMSA; //if refineMSA() was applied MSAColumns* msacolumns; unsigned int ordnum; //order number -- when it was created // the lower the better (earlier=higher score) int ng_len; //ungapped length and minoffset (approximative, int ng_minofs; // for clipping constraints only) int badseqs; //char* consensus; //consensus sequence (built by refineMSA()) GDynArray consensus; GDynArray consensus_bq; //int consensus_len; friend class GASeq; bool operator==(GSeqAlign& d){ return (this==&d); } bool operator>(GSeqAlign& d){ return (this>&d); } bool operator<(GSeqAlign& d){ return (this<&d); } //-- //GSeqAlign():GList(true,true,false), length(0), minoffset(0), GSeqAlign():GList(false,true,false), length(0), minoffset(0), refinedMSA(false), msacolumns(NULL), 
ordnum(0), ng_len(0),ng_minofs(0), badseqs(0), consensus(512), consensus_bq(512) { //default is: sorted by GASeq offset, free nodes, non-unique } GSeqAlign(bool sorted, bool free_elements=true, bool beUnique=false) :GList(sorted,free_elements,beUnique), length(0), minoffset(0), refinedMSA(false), msacolumns(NULL), ordnum(0), ng_len(0),ng_minofs(0), badseqs(0), consensus(512), consensus_bq(512) { } //void incOrd() { ordnum = ++counter; } void incOrd() { ordnum++; } //first time creation from a pairwise alignment: #ifdef ALIGN_COVERAGE_DATA GSeqAlign(GASeq* s1, int l1, int r1, GASeq* s2, int l2, int r2); #else GSeqAlign(GASeq* s1, GASeq* s2); #endif ~GSeqAlign() { if (msacolumns!=NULL) delete msacolumns; } int len() { return length; } void revComplement(); void addSeq(GASeq* s, int soffs, int ngofs); void injectGap(GASeq* seq, int pos, int xgap); void removeBase(GASeq* seq, int pos); void extendConsensus(char c, int16_t bq=SHRT_MIN); //try to propagate the planned trimming of a read //to the whole MSA containing it // returns false if too much is trimmed of any component read void applyClipping(AlnClipOps& clipops); bool evalClipping(GASeq* seq, int c5, int c3, float clipmax, AlnClipOps& clipops); //merge other alignment into this msa //seq->id MUST be the same with oseq->id // *if OK, gaps AND coverage values are propagated // and omsa is deallocated // *if not OK <=> the layout doesn't accept the merge // due to clipmax constraint, then nothing happens bool addAlign(GASeq* seq, GSeqAlign* omsa, GASeq* oseq); void finalize(); //delete inserts and reverse complement sequences as needed void print(FILE* f, char c=0); //debug printing one-line alignments void print() { print(stdout); } void removeColumn(int column); void freeMSA(); void refineMSA(bool refWeighDown=false, bool redo_ends=false); // find consensus, refine clipping, remove gap-columns void writeACE(FILE* f, const char* name, bool refWeighDown=false); void writeMSA(FILE* f, int linelen=60); //write as 
multi-FASTA (MAF?) file void writeInfo(FILE* f, const char* name, bool refWeighDown=false); }; int compareOrdnum(void* p1, void* p2); int compareCounts(void* p1, void* p2); #endif gclib-0.12.7/LICENSE.txt000066400000000000000000000210541407072766100145200ustar00rootroot00000000000000 The Artistic License 2.0 Preamble This license establishes the terms under which a given free software Package may be copied, modified, distributed, and/or redistributed. The intent is that the Copyright Holder maintains some artistic control over the development of that Package while still keeping the Package available as open source and free software. You are always permitted to make arrangements wholly outside of this license directly with the Copyright Holder of a given Package. If the terms of this license do not permit the full use that you propose to make of the Package, you should contact the Copyright Holder and seek a different licensing arrangement. Definitions "Copyright Holder" means the individual(s) or organization(s) named in the copyright notice for the entire Package. "Contributor" means any party that has contributed code or other material to the Package, in accordance with the Copyright Holder's procedures. "You" and "your" means any person who would like to copy, distribute, or modify the Package. "Package" means the collection of files distributed by the Copyright Holder, and derivatives of that collection and/or of those files. A given Package may consist of either the Standard Version, or a Modified Version. "Distribute" means providing a copy of the Package or making it accessible to anyone else, or in the case of a company or organization, to others outside of your company or organization. "Distributor Fee" means any fee that you charge for Distributing this Package or providing support for this Package to another party. It does not mean licensing fees. 
"Standard Version" refers to the Package if it has not been modified, or has been modified only in ways explicitly requested by the Copyright Holder. "Modified Version" means the Package, if it has been changed, and such changes were not explicitly requested by the Copyright Holder. "Original License" means this Artistic License as Distributed with the Standard Version of the Package, in its current version or as it may be modified by The Perl Foundation in the future. "Source" form means the source code, documentation source, and configuration files for the Package. "Compiled" form means the compiled bytecode, object code, binary, or any other form resulting from mechanical transformation or translation of the Source form. Permission for Use and Modification Without Distribution (1) You are permitted to use the Standard Version and create and use Modified Versions for any purpose without restriction, provided that you do not Distribute the Modified Version. Permissions for Redistribution of the Standard Version (2) You may Distribute verbatim copies of the Source form of the Standard Version of this Package in any medium without restriction, either gratis or for a Distributor Fee, provided that you duplicate all of the original copyright notices and associated disclaimers. At your discretion, such verbatim copies may or may not include a Compiled form of the Package. (3) You may apply any bug fixes, portability changes, and other modifications made available from the Copyright Holder. The resulting Package will still be considered the Standard Version, and as such will be subject to the Original License. 
Distribution of Modified Versions of the Package as Source (4) You may Distribute your Modified Version as Source (either gratis or for a Distributor Fee, and with or without a Compiled form of the Modified Version) provided that you clearly document how it differs from the Standard Version, including, but not limited to, documenting any non-standard features, executables, or modules, and provided that you do at least ONE of the following: (a) make the Modified Version available to the Copyright Holder of the Standard Version, under the Original License, so that the Copyright Holder may include your modifications in the Standard Version. (b) ensure that installation of your Modified Version does not prevent the user installing or running the Standard Version. In addition, the Modified Version must bear a name that is different from the name of the Standard Version. (c) allow anyone who receives a copy of the Modified Version to make the Source form of the Modified Version available to others under (i) the Original License or (ii) a license that permits the licensee to freely copy, modify and redistribute the Modified Version using the same licensing terms that apply to the copy that the licensee received, and requires that the Source form of the Modified Version, and of any works derived from it, be made freely available in that license fees are prohibited but Distributor Fees are allowed. Distribution of Compiled Forms of the Standard Version or Modified Versions without the Source (5) You may Distribute Compiled forms of the Standard Version without the Source, provided that you include complete instructions on how to get the Source of the Standard Version. Such instructions must be valid at the time of your distribution. If these instructions, at any time while you are carrying out such distribution, become invalid, you must provide new instructions on demand or cease further distribution. 
If you provide valid instructions or cease distribution within thirty days after you become aware that the instructions are invalid, then you do not forfeit any of your rights under this license. (6) You may Distribute a Modified Version in Compiled form without the Source, provided that you comply with Section 4 with respect to the Source of the Modified Version. Aggregating or Linking the Package (7) You may aggregate the Package (either the Standard Version or Modified Version) with other packages and Distribute the resulting aggregation provided that you do not charge a licensing fee for the Package. Distributor Fees are permitted, and licensing fees for other components in the aggregation are permitted. The terms of this license apply to the use and Distribution of the Standard or Modified Versions as included in the aggregation. (8) You are permitted to link Modified and Standard Versions with other works, to embed the Package in a larger work of your own, or to build stand-alone binary or bytecode versions of applications that include the Package, and Distribute the result without restriction, provided the result does not expose a direct interface to the Package. Items That are Not Considered Part of a Modified Version (9) Works (including, but not limited to, modules and scripts) that merely extend or make use of the Package, do not, by themselves, cause the Package to be a Modified Version. In addition, such works are not considered parts of the Package itself, and are not subject to the terms of this license. General Provisions (10) Any use, modification, and distribution of the Standard or Modified Versions is governed by this Artistic License. By using, modifying or distributing the Package, you accept this license. Do not use, modify, or distribute the Package, if you do not accept this license. 
(11) If your Modified Version has been derived from a Modified Version made by someone other than you, you are nevertheless required to ensure that your Modified Version complies with the requirements of this license. (12) This license does not grant you the right to use any trademark, service mark, tradename, or logo of the Copyright Holder. (13) This license includes the non-exclusive, worldwide, free-of-charge patent license to make, have made, use, offer to sell, sell, import and otherwise transfer the Package with respect to any patent claims licensable by the Copyright Holder that are necessarily infringed by the Package. If you institute patent litigation (including a cross-claim or counterclaim) against any party alleging that the Package constitutes direct or contributory patent infringement, then this Artistic License to you shall terminate on the date that such litigation is filed. (14) Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. gclib-0.12.7/README.md000066400000000000000000000021161407072766100141520ustar00rootroot00000000000000## GCLib - Genomic C++ Library This is an eclectic collection of basic C++ code (functions, classes, templates) which is shared between a few of my bioinformatics projects. The main idea was to provide a core collection of data structures, trying to avoid unnecessary code dependencies of other heavy libraries, while minimizing build time. 
I had started gathering this code even before the C++ STL had been fully adopted as a cross-platform "standard". Even STL itself seems a bit on the heavy side (and keeps growing) compared to what I need in practice for many of my C++ projects, so often times I prefer to just use these simpler and leaner C++ classes and templates to provide most common data structures needed for my projects. ## Build/Install Do not build. Do not install. This is not meant to be built into an object library, it's a simple _source code library_ for other projects to include and link statically into the final executable(s). The makefile included here is just for simple, extemporaneous tests I occasionally perform as new functionality is added to this code collection. gclib-0.12.7/codons.cpp000066400000000000000000000113451407072766100146700ustar00rootroot00000000000000#include "codons.h" static char codonTable[32768]; //32K table for fasta codon decoding // codons are encoded as triplets of 5-bit-encoded nucleotides // (so any codon can be encoded/decoded as a unique 15-bit value) static char codonData[]={ //long list of 3+1 characters (codon+translation) 'A','A','A','K', 'A','A','C','N', 'A','A','G','K', 'A','A','R','K', 'A','A','T','N', 'A','A','Y','N', 'A','C','A','T', 'A','C','B','T', 'A','C','C','T', 'A','C','D','T', 'A','C','G','T', 'A','C','H','T', 'A','C','K','T', 'A','C','M','T', 'A','C','N','T', 'A','C','R','T', 'A','C','S','T', 'A','C','T','T', 'A','C','V','T', 'A','C','W','T', 'A','C','Y','T', 'A','G','A','R', 'A','G','C','S', 'A','G','G','R', 'A','G','R','R', 'A','G','T','S', 'A','G','Y','S', 'A','T','A','I', 'A','T','C','I', 'A','T','G','M', 'A','T','H','I', 'A','T','M','I', 'A','T','T','I', 'A','T','W','I', 'A','T','Y','I', 'C','A','A','Q', 'C','A','C','H', 'C','A','G','Q', 'C','A','R','Q', 'C','A','T','H', 'C','A','Y','H', 'C','C','A','P', 'C','C','B','P', 'C','C','C','P', 'C','C','D','P', 'C','C','G','P', 'C','C','H','P', 'C','C','K','P', 'C','C','M','P', 
'C','C','N','P', 'C','C','R','P', 'C','C','S','P', 'C','C','T','P', 'C','C','V','P', 'C','C','W','P', 'C','C','Y','P', 'C','G','A','R', 'C','G','B','R', 'C','G','C','R', 'C','G','D','R', 'C','G','G','R', 'C','G','H','R', 'C','G','K','R', 'C','G','M','R', 'C','G','N','R', 'C','G','R','R', 'C','G','S','R', 'C','G','T','R', 'C','G','V','R', 'C','G','W','R', 'C','G','Y','R', 'C','T','A','L', 'C','T','B','L', 'C','T','C','L', 'C','T','D','L', 'C','T','G','L', 'C','T','H','L', 'C','T','K','L', 'C','T','M','L', 'C','T','N','L', 'C','T','R','L', 'C','T','S','L', 'C','T','T','L', 'C','T','V','L', 'C','T','W','L', 'C','T','Y','L', 'G','A','A','E', 'G','A','C','D', 'G','A','G','E', 'G','A','R','E', 'G','A','T','D', 'G','A','Y','D', 'G','C','A','A', 'G','C','B','A', 'G','C','C','A', 'G','C','D','A', 'G','C','G','A', 'G','C','H','A', 'G','C','K','A', 'G','C','M','A', 'G','C','N','A', 'G','C','R','A', 'G','C','S','A', 'G','C','T','A', 'G','C','V','A', 'G','C','W','A', 'G','C','Y','A', 'G','G','A','G', 'G','G','B','G', 'G','G','C','G', 'G','G','D','G', 'G','G','G','G', 'G','G','H','G', 'G','G','K','G', 'G','G','M','G', 'G','G','N','G', 'G','G','R','G', 'G','G','S','G', 'G','G','T','G', 'G','G','V','G', 'G','G','W','G', 'G','G','Y','G', 'G','T','A','V', 'G','T','B','V', 'G','T','C','V', 'G','T','D','V', 'G','T','G','V', 'G','T','H','V', 'G','T','K','V', 'G','T','M','V', 'G','T','N','V', 'G','T','R','V', 'G','T','S','V', 'G','T','T','V', 'G','T','V','V', 'G','T','W','V', 'G','T','Y','V', 'M','G','A','R', 'M','G','G','R', 'M','G','R','R', 'N','N','N','X', 'R','A','Y','B', 'S','A','R','Z', 'T','A','A','.', 'T','A','C','Y', 'T','A','G','.', 'T','A','R','.', 'T','A','T','Y', 'T','A','Y','Y', 'T','C','A','S', 'T','C','B','S', 'T','C','C','S', 'T','C','D','S', 'T','C','G','S', 'T','C','H','S', 'T','C','K','S', 'T','C','M','S', 'T','C','N','S', 'T','C','R','S', 'T','C','S','S', 'T','C','T','S', 'T','C','V','S', 'T','C','W','S', 'T','C','Y','S', 'T','G','A','.', 'T','G','C','C', 
'T','G','G','W', 'T','G','T','C', 'T','G','Y','C', 'T','R','A','.', 'T','T','A','L', 'T','T','C','F', 'T','T','G','L', 'T','T','R','L', 'T','T','T','F', 'T','T','Y','F', 'X','X','X','X', 'Y','T','A','L', 'Y','T','G','L', 'Y','T','R','L' }; static bool isCodonTableReady=codonTableInit(); unsigned short packCodon(char n1, char n2, char n3) { //assumes they are uppercase already! byte b1=n1-'A'; byte b2=n2-'A'; byte b3=n3-'A'; b1 |= (b2 << 5); b2 = (b2 >> 3) | (b3 << 2); return ( ((unsigned short)b2) << 8) + b1; } bool codonTableInit() { memset((void*)codonTable, 'X', 32768); int cdsize=sizeof(codonData); for (int i=0;i unsigned short packCodon(char n1, char n2, char n3); //assumes n1,n2,n3 are UPPERCASE! struct Codon { char nuc[3]; Codon(char* str=NULL) { if (str==NULL) { nuc[0]='N'; nuc[1]='N'; nuc[2]='N'; } else { nuc[0]=toupper(str[0]); nuc[1]=toupper(str[1]); nuc[2]=toupper(str[2]); } } Codon(char s1, char s2, char s3) { nuc[0]=toupper(s1); nuc[1]=toupper(s2); nuc[2]=toupper(s3); } char& operator[](int idx) { if (idx<0 || idx>2) GError("Error: Codon index out of bounds!\n"); return nuc[idx]; } char operator[](int idx) const { if (idx<0 || idx>2) GError("Error: Codon index out of bounds!\n"); return nuc[idx]; } char translate(); }; //simple 1st frame forward translation of a given DNA string //will allocated memory for the translation -- the caller is // responsible for freeing the returned string! char* translateDNA(const char* dnastr, int& aalen, int dnalen=0); char translateCodon(const char* dna); //returns the aminoacid code for the 1st codon at dna bool codonTableInit(); #endif gclib-0.12.7/gcdb.cpp000066400000000000000000000600061407072766100143000ustar00rootroot00000000000000#include "gcdb.h" #include #ifdef _WIN32 /* mmap on Windows (from imagick sources) % Method mmap emulates the Unix method of the same name. 
% The format of the mmap method is:
%   void *mmap(char *address,size_t length,int protection,
%              int access,int file,off_t offset)
*/
void *mmap(char *address, size_t length, int protection, int access,
           int file, off_t offset)
{
  /* Pick the page protection and view access for the requested mode.
     PROT_READ and any unrecognized value get a read-only mapping.
     address and access are not used by this emulation. */
  DWORD pageProtect=PAGE_READONLY;
  DWORD viewAccess=FILE_MAP_READ;
  switch (protection) {
    case PROT_WRITE:
      pageProtect=PAGE_READWRITE;
      viewAccess=FILE_MAP_WRITE;
      break;
    case PROT_READWRITE:
      pageProtect=PAGE_READWRITE;
      viewAccess=FILE_MAP_ALL_ACCESS;
      break;
    case PROT_READ:
    default:
      break;
  }

  HANDLE mapping=CreateFileMapping((HANDLE) _get_osfhandle(file),0,
                                   pageProtect,0,length,0);
  if (!mapping)
    return((void *) MAP_FAILED);

  void *view=(void *) MapViewOfFile(mapping,viewAccess,0,0,length);
  /* the mapping handle is closed right after the view is requested,
     whether or not the view was obtained */
  CloseHandle(mapping);
  if (view == (void *) NULL)
    return((void *) MAP_FAILED);

  /* NOTE(review): the view is requested with both file-offset arguments 0
     and `length` bytes; offset is only added to the returned pointer.
     With a non-zero offset the result is not the address MapViewOfFile
     returned, which is what munmap() below hands to UnmapViewOfFile --
     confirm that callers only pass offset 0. */
  return((void *) ((char *) view+offset));
}

/* =========== m u n m a p ===========================
%
% Method munmap emulates the Unix method with the same name.
% The format of the munmap method is:
%   int munmap(void *map,size_t length)
% A description of each parameter follows:
% > status: Method munmap returns 0 on success; otherwise, it
%   returns -1 and sets errno to indicate the error.
% > map: The address of the binary large object.
% > length: The length of the binary large object.
% */ int munmap(void *map, size_t length) { if (!UnmapViewOfFile(map)) return(-1); return(0); } #endif int endianSetup=0; int cdbInfoSIZE=offsetof(cdbInfo, tag)+4; int IdxDataSIZE=offsetof(CIdxData, reclen)+sizeof(uint32); int IdxDataSIZE32=offsetof(CIdxData32, reclen)+sizeof(uint32); /* int IdxSeqDataSIZE=offsetof(CIdxSeqData, elen)+sizeof(byte); int IdxSeqDataSIZE32=offsetof(CIdxSeqData32, elen)+sizeof(byte); */ //===================================================== //------------- buffer stuff ------------------- //===================================================== //------------------------------------- //--------- misc utility functions ----- static int gcdb_seek_set(int fd,gcdb_seek_pos pos) { if (lseek(fd, pos, 0) == -1) return -1; return 0; } #define gcdb_seek_begin(fd) (gcdb_seek_set((fd),(gcdb_seek_pos) 0)) static unsigned int gcdb_strlen(const char *s) { register char *t; t = (char*)s; for (;;) { if (!*t) return t - s; ++t; /*if (!*t) return t - s; ++t; if (!*t) return t - s; ++t; if (!*t) return t - s; ++t; */ } } static int byte_diff(char *s, unsigned int n,char *t) { for (;;) { if (!n) return 0; if (*s != *t) break; ++s; ++t; --n; if (!n) return 0; if (*s != *t) break; ++s; ++t; --n; if (!n) return 0; if (*s != *t) break; ++s; ++t; --n; if (!n) return 0; if (*s != *t) break; ++s; ++t; --n; } return ((int)(unsigned int)(unsigned char) *s) - ((int)(unsigned int)(unsigned char) *t); } static void gcdb_byte_copy(char *to, unsigned int n, char *from) { for (;;) { if (!n) return; *to++ = *from++; --n; if (!n) return; *to++ = *from++; --n; if (!n) return; *to++ = *from++; --n; if (!n) return; *to++ = *from++; --n; } } static void gcdb_byte_copyr(char *to, unsigned int n, char *from) { to += n; from += n; for (;;) { if (!n) return; *--to = *--from; --n; if (!n) return; *--to = *--from; --n; if (!n) return; *--to = *--from; --n; if (!n) return; *--to = *--from; --n; } } #define ALIGNMENT 16 /* XXX: assuming that this alignment is enough */ #define SPACE 
4096 /* must be multiple of ALIGNMENT */ typedef union { char irrelevant[ALIGNMENT]; double d; } aligned; static aligned realspace[SPACE / ALIGNMENT]; #define space ((char *) realspace) static unsigned int avail = SPACE; /* multiple of ALIGNMENT; 0<=avail<=SPACE */ offt_conv_func gcvt_offt; uint_conv_func gcvt_uint; int16_conv_func gcvt_int16; char *gcdb_alloc(unsigned int n) { char *x; n = ALIGNMENT + n - (n & (ALIGNMENT - 1)); /* XXX: could overflow */ if (n <= avail) { avail -= n; return space + avail; } x = (char*) malloc(n); if (!x) return NULL; //if (!x) GError("Error: mgcdb_alloc(%d) failed !\n", n); return x; } int GCDBuffer::write_all(char* buf, unsigned int len) { int w; while (len) { w = op(fd,buf,len); if (w == -1) { if (errno == error_intr) continue; return -1; /* note that some data may have been written */ } /* if (w == 0) ; luser's fault */ buf += w; len -= w; } return 0; } int GCDBuffer::flush() { int pt=p; if (!pt) return 0; p = 0; //return allwrite(op,fd,x,pt); return write_all(x,pt); } int GCDBuffer::putalign(char *buf,unsigned int len) { unsigned int bn; while (len > (bn = n-p)) { gcdb_byte_copy(x + p,bn,buf); p += bn; buf += bn; len -= bn; if (GCDBuffer::flush() == -1) return -1; } /* now len <= s->n - s->p */ gcdb_byte_copy(x + p,len,buf); p += len; return 0; } int GCDBuffer::put(char *buf,unsigned int len) { unsigned int bn=n; if (len > bn - p) { if (GCDBuffer::flush() == -1) return -1; /* now s->p == 0 */ if (bn < GCDBUFFER_OUTSIZE) bn = GCDBUFFER_OUTSIZE; while (len > n) { if (bn > len) bn = len; if (write_all(buf, bn) == -1) return -1; buf += bn; len -= bn; } } /* now len <= s->n - s->p */ gcdb_byte_copy(x + p,len,buf); p += len; return 0; } int GCDBuffer::putflush(char *buf,unsigned int len) { if (flush() == -1) return -1; return write_all(buf,len); } int GCDBuffer::putsalign(char *buf) { return GCDBuffer::putalign(buf, gcdb_strlen(buf)); } int GCDBuffer::puts(char *buf) { return GCDBuffer::put(buf, gcdb_strlen(buf)); } int 
GCDBuffer::putsflush(char *buf) { return GCDBuffer::putflush(buf, gcdb_strlen(buf)); } static int oneread(opfunc op,int fd, char *buf,unsigned int len) { int r; for (;;) { r = op(fd,buf,len); if (r == -1 && errno == error_intr) continue; return r; } } int GCDBuffer::oneRead(char* buf, unsigned int len) { return op(fd,buf,len); /*int r; for (;;) { r = op(fd,buf,len); if (r == -1 && errno == error_intr) continue; return r; }*/ } int GCDBuffer::getthis(char *buf,unsigned int len) { if (len > p) len = p; p -= len; gcdb_byte_copy(buf, len,x + n); n += len; return len; } int GCDBuffer::feed() { int r; if (p) return p; r = oneRead(x,n); if (r <= 0) return r; p = r; n -= r; if (n > 0) gcdb_byte_copyr(x + n,r,x); return r; } int GCDBuffer::bget(char *buf,unsigned int len) { int r; if (p > 0) return getthis(buf,len); if (n <= len) return oneRead(buf,n); r = GCDBuffer::feed(); if (r <= 0) return r; return getthis(buf,len); } int GCDBuffer::get(char *buf,unsigned int len) { int r; if (p > 0) return getthis(buf,len); if (n <= len) return oneread(op,fd,buf,len); r = GCDBuffer::feed(); if (r <= 0) return r; return getthis(buf,len); } char* GCDBuffer::peek() { return x + n; } void GCDBuffer::seek(unsigned int len) { n += len; p -= len; } int GCDBuffer::copy(GCDBuffer* bin) { int n_in; char *x_in; for (;;) { n_in = bin->feed(); if (n_in < 0) return -2; if (!n_in) return 0; x_in = bin->peek(); if (GCDBuffer::put(x_in,n_in) == -1) return -3; bin->seek(n_in); } } //===================================================== //------------- cdb utils ------------------- //===================================================== int error_intr = #ifdef EINTR EINTR; #else -1; #endif int error_nomem = #ifdef ENOMEM ENOMEM; #else -2; #endif int error_proto = #ifdef EPROTO EPROTO; #else -15; #endif //------------------------------------------------ //------------ allocation routines: /* conversion of unsigned int offsets read from a file can also be used to prepare unsigned integers to be written 
into a file in an independent platform manner */ union UInt32Bytes { unsigned char b[4]; int32_t ui; }; union UInt16Bytes { unsigned char b[2]; int16_t ui; }; unsigned int uint32_sun(void* x86int) { UInt32Bytes ub; ub.b[3]=((unsigned char*)x86int)[0]; ub.b[0]=((unsigned char*)x86int)[3]; ub.b[1]=((unsigned char*)x86int)[2]; ub.b[2]=((unsigned char*)x86int)[1]; return ub.ui; } int16_t int16_sun(void* x86int) { UInt16Bytes ub; ub.b[1]=((unsigned char*)x86int)[0]; ub.b[0]=((unsigned char*)x86int)[1]; return ub.ui; } /* unsigned int uint32_sun(void* x86int) { unsigned char b[4]; b[3]=((unsigned char*)x86int)[0]; b[0]=((unsigned char*)x86int)[3]; b[1]=((unsigned char*)x86int)[2]; b[2]=((unsigned char*)x86int)[1]; return *((unsigned int*)b); return *ub; }*/ unsigned int uint32_x86(void* offt) { return *((unsigned int*)offt); } int16_t int16_x86(void* v) { return *((int16_t *)v); } //-------- 64bit types conversion : union ULongBytes { unsigned char b[8]; off_t ob; }; off_t offt_sun(void* offt) { //unsigned char b[8]; ULongBytes ub; if (sizeof(off_t)==8) { //64 bit? // upper words: ub.b[3]=((unsigned char*)offt)[4]; ub.b[0]=((unsigned char*)offt)[7]; ub.b[1]=((unsigned char*)offt)[6]; ub.b[2]=((unsigned char*)offt)[5]; //-- ub.b[7]=((unsigned char*)offt)[0]; ub.b[4]=((unsigned char*)offt)[3]; ub.b[5]=((unsigned char*)offt)[2]; ub.b[6]=((unsigned char*)offt)[1]; } else { ub.b[3]=((unsigned char*)offt)[0]; ub.b[0]=((unsigned char*)offt)[3]; ub.b[1]=((unsigned char*)offt)[2]; ub.b[2]=((unsigned char*)offt)[1]; } //return *((off_t*)b); return ub.ob; } /* off_t offt_sun(void* offt) { unsigned char b[8]; if (sizeof(off_t)==8) { //64 bit? 
// upper words: b[3]=((unsigned char*)offt)[4]; b[0]=((unsigned char*)offt)[7]; b[1]=((unsigned char*)offt)[6]; b[2]=((unsigned char*)offt)[5]; //-- b[7]=((unsigned char*)offt)[0]; b[4]=((unsigned char*)offt)[3]; b[5]=((unsigned char*)offt)[2]; b[6]=((unsigned char*)offt)[1]; } else { b[3]=((unsigned char*)offt)[0]; b[0]=((unsigned char*)offt)[3]; b[1]=((unsigned char*)offt)[2]; b[2]=((unsigned char*)offt)[1]; } return *((off_t*)b); } */ off_t offt_x86(void* offt) { return *((off_t*)offt); } //------------------------ platform independent uint32 : void uint32_pack(char s[4],uint32 u) { s[0] = u & 255; u >>= 8; s[1] = u & 255; u >>= 8; s[2] = u & 255; s[3] = u >> 8; } void uint32_pack_big(char s[4],uint32 u) { s[3] = u & 255; u >>= 8; s[2] = u & 255; u >>= 8; s[1] = u & 255; s[0] = u >> 8; } /* unpacking: */ void uint32_unpack(char s[4],uint32 *u) { uint32 result; result = (unsigned char) s[3]; result <<= 8; result += (unsigned char) s[2]; result <<= 8; result += (unsigned char) s[1]; result <<= 8; result += (unsigned char) s[0]; *u = result; } void uint32_unpack_big(char s[4],uint32 *u) { uint32 result; result = (unsigned char) s[0]; result <<= 8; result += (unsigned char) s[1]; result <<= 8; result += (unsigned char) s[2]; result <<= 8; result += (unsigned char) s[3]; *u = result; } /* big/little endian check */ int endian_test(void) { unsigned short v=0x0001; unsigned char* b = (unsigned char*)&v; return b[1]; } void gcvt_endian_setup() { if (endianSetup!=0) return; //check endianness if (endian_test()) { gcvt_uint = &uint32_sun; gcvt_offt = &offt_sun; gcvt_int16 = &int16_sun; } else { gcvt_uint = &uint32_x86; gcvt_offt = &offt_x86; gcvt_int16 = &int16_x86; } } //===================================================== //------------- cdb index ------------------- //===================================================== GCdbWrite::GCdbWrite(int afd) { //check endianness :) gcvt_endian_setup(); cdbuf=new GCDBuffer((opfunc)&write,(int) afd,(char*)bspace,sizeof bspace); 
head = NULL; split = 0; hash = 0; numentries = 0; fd = afd; pos = sizeof final; gcdb_seek_set(fd, pos); fname[0]='\0'; //should return and test the result of gcdb_seek_set!!! } GCdbWrite::GCdbWrite(char* afname) { #ifdef _WIN32 fd = open(afname,O_WRONLY | O_TRUNC | O_BINARY | O_CREAT, S_IREAD|S_IWRITE); #else fd = open(afname,O_WRONLY | O_NDELAY | O_TRUNC | O_CREAT, 0664); #endif if (fd == -1) GError("GCdbWrite: Error creating file '%s'\n", fname); gcvt_endian_setup(); cdbuf=new GCDBuffer((opfunc)&write,(int) fd,(char*)bspace,sizeof bspace); head = NULL; split = 0; hash = 0; numentries = 0; pos = sizeof final; gcdb_seek_set(fd, pos); strcpy(fname, afname); //should return and test the result of gcdb_seek_set!!! } GCdbWrite::~GCdbWrite() { cdbuf->flush(); #ifndef _WIN32 /* NFS silliness */ if (fsync(fd) == -1) GError("GCdbWrite: Error at fsync() for file '%s'\n", fname); #endif if (::close(fd) == -1) GError("GCdbWrite: Error at closing file '%s'\n", fname); delete cdbuf; if (head!=NULL) free(head); } int GCdbWrite::posplus(uint32 len) { uint32 newpos = pos + len; if (newpos < len) { //errno = error_nomem; return -1; } pos = newpos; return 0; } int GCdbWrite::addend(unsigned int keylen,unsigned int datalen,uint32 h) { struct cdb_hplist *chead = head; if (!chead || (chead->num >= CDB_HPLIST)) { chead = (struct cdb_hplist *) gcdb_alloc(sizeof(struct cdb_hplist)); if (!chead) return -1; chead->num = 0; chead->next = head; head = chead; } chead->hp[head->num].h = h; chead->hp[head->num].p = pos; ++chead->num; ++numentries; if (posplus(8) == -1) return -1; if (posplus(keylen) == -1) return -1; if (posplus(datalen) == -1) return -1; return 0; } int GCdbWrite::addbegin(unsigned int keylen,unsigned int datalen) { char buf[8]; //if (keylen > MAX_UINT) { /* errno = error_nomem; */return -1; } // if (datalen > MAX_UINT) { /*errno = error_nomem;*/ return -1; } uint32_pack(buf,keylen); uint32_pack(buf + 4,datalen); if (cdbuf->putalign(buf,8) == -1) return -1; return 0; } #define 
cdbuffer_PUTC(s,c) \ ( ((s).n != (s).p) \ ? ( (s).x[(s).p++] = (c), 0 ) \ : (s).put(&(c),1) \ ) int GCdbWrite::add(const char* key, char* recdata, unsigned int datalen) { unsigned int i; unsigned int klen=strlen(key); if (klen<1) { GMessage("Warning: zero length key found\n"); return 0; } //------------ adding record ----------------- if (addbegin(klen,datalen)==-1) GError("GCdbWrite: Error at addbegin(%d, %d)\n",klen, datalen); uint32 h=CDB_HASHSTART; for (i = 0;i < klen; ++i) { //if (cdbuffer_PUTC(c.cdbuf,key[i]) == -1) if ( ((cdbuf->n!=cdbuf->p) ? (cdbuf->x[cdbuf->p++]=(key[i]),0 ) : cdbuf->put((char*)&(key[i]),1) )==-1) GError("GCdbWrite: Error at cdbbuf.put, key '%s'\n", key); h = cdb_hashadd(h,key[i]); } if (cdbuf->put(recdata,datalen) == -1) GError("GCdbWrite: Error at final cdbuf.put() at key='%s', datalen=%d\n", key, datalen); if (addend(klen,datalen,h) == -1) GError("GCdbWrite: Error at addend(%d, %d, h)\n", klen, datalen); return 1; } int GCdbWrite::addrec(const char *key,unsigned int keylen,char *data,unsigned int datalen) { if (GCdbWrite::addbegin(keylen,datalen) == -1) return -1; if (cdbuf->putalign((char*)key,keylen) == -1) return -1; if (cdbuf->putalign(data,datalen) == -1) return -1; return GCdbWrite::addend(keylen,datalen,cdb_hash(key,keylen)); } int GCdbWrite::finish() { char buf[8]; int i; uint32 len; uint32 u; uint32 memsize; uint32 icount; uint32 where; struct cdb_hplist *x; struct cdb_hp *hp; for (i = 0;i < 256;++i) count[i] = 0; for (x = head;x;x = x->next) { i = x->num; while (i--) ++count[255 & x->hp[i].h]; } memsize = 1; for (i = 0;i < 256;++i) { u = count[i] * 2; if (u > memsize) memsize = u; } memsize += numentries; /* no overflow possible up to now */ u = (uint32) 0 - (uint32) 1; u /= sizeof(struct cdb_hp); if (memsize > u) { /* errno = error_nomem;*/ return -1; } split = (struct cdb_hp *) gcdb_alloc(memsize * sizeof(struct cdb_hp)); if (!split) return -1; hash = split + numentries; u = 0; for (i = 0;i < 256;++i) { u += count[i]; /* 
bounded by numentries, so no overflow */ start[i] = u; } for (x = head;x;x = x->next) { i = x->num; while (i--) split[--start[255 & x->hp[i].h]] = x->hp[i]; } for (i = 0;i < 256;++i) { icount = count[i]; len = icount + icount; /* no overflow possible */ uint32_pack(final + 8 * i,pos); uint32_pack(final + 8 * i + 4,len); for (u = 0;u < len;++u) hash[u].h = hash[u].p = 0; hp = split + start[i]; for (u = 0;u < icount;++u) { where = (hp->h >> 8) % len; while (hash[where].p) if (++where == len) where = 0; hash[where] = *hp++; } for (u = 0;u < len;++u) { uint32_pack(buf,hash[u].h); uint32_pack(buf + 4,hash[u].p); if (cdbuf->putalign(buf,8) == -1) return -1; if (posplus(8) == -1) return -1; } } if (cdbuf->flush() == -1) return -1; if (gcdb_seek_begin(fd) == -1) return -1; return cdbuf->putflush(final,sizeof(final)); } //===================================================== //------------- cdb ------------------- //===================================================== uint32 cdb_hashadd(uint32 h,unsigned char c) { h += (h << 5); return h ^ c; } uint32 cdb_hash(const char *buf,unsigned int len) { uint32 h; h = CDB_HASHSTART; while (len) { h = cdb_hashadd(h,*buf++); --len; } return h; } //--------------------------------------------------------------- //-------------------------- cdb methods ------------------------ GCdbRead::GCdbRead(int afd):map(NULL),loop(0) { struct stat st; char *x; gcvt_endian_setup(); findstart(); fd = afd; if (fstat(fd,&st) == 0) { if (st.st_size <= MAX_UINT) { #ifndef NO_MMAP x = (char *) mmap(0,st.st_size,PROT_READ,MAP_SHARED,fd,0); if (x + 1) { size = st.st_size; map = x; } else { GError("Error mapping the file (size=%ld)!\n",st.st_size); } #endif } else { GError("Error mapping the file (size %ld > MAX_UINT)\n", st.st_size); } } } GCdbRead::GCdbRead(char* afname):map(NULL) { struct stat st; char *x; gcvt_endian_setup(); findstart(); #ifdef _WIN32 fd = open(afname, O_RDONLY|O_BINARY); #else fd = open(afname, O_RDONLY); #endif if (fd == -1) 
GError("Error: cannot open file %s\n", afname); strcpy(fname, afname); if (fstat(fd,&st) == 0) { if (st.st_size <= MAX_UINT) { #ifndef NO_MMAP x = (char *) mmap(0,st.st_size,PROT_READ,MAP_SHARED,fd,0); if (x + 1) { size = st.st_size; map = x; } else { GError("GCdbRead: Error mapping the file (size=%ld)!\n",st.st_size); } #endif } else { GError("GCdbRead: Error mapping the file (size %ld > MAX_UINT)\n", st.st_size); } } } GCdbRead::~GCdbRead() { if (map!=NULL) { munmap(map,size); map = NULL; } } int GCdbRead::read(char *buf,unsigned int len, uint32 pos) { #ifndef NO_MMAP if (map) { if ((pos > size) || (size - pos < len)) { /* errno = error_proto; */ return -1; } gcdb_byte_copy(buf, len, map + pos); } else #endif { if (gcdb_seek_set(fd,pos) == -1) return -1; while (len > 0) { int r; do { r = ::read(fd,buf,len); } while ((r == -1) && (errno == error_intr)); if (r == -1) return -1; if (r == 0) { //errno = error_proto; return -1; } buf += r; len -= r; } } return 0; } int GCdbRead::match(const char *key, unsigned int len, uint32 pos) { char buf[32]; unsigned int n; while (len > 0) { n = sizeof buf; if (n > len) n = len; if (GCdbRead::read(buf,n,pos) == -1) return -1; if (byte_diff(buf,n,(char*)key)) return 0; pos += n; key += n; len -= n; } return 1; } int GCdbRead::findnext(const char *key,unsigned int len) { char buf[8]; uint32 pos; uint32 u; if (!loop) { u = cdb_hash(key,len); if (GCdbRead::read(buf,8,(u << 3) & 2047) == -1) return -1; uint32_unpack(buf + 4,&hslots); if (!hslots) return 0; uint32_unpack(buf,&pos); hpos=pos; khash = u; u >>= 8; u %= hslots; u <<= 3; kpos = hpos + u; } while (loop < hslots) { if (GCdbRead::read(buf,8,kpos) == -1) return - 1; uint32_unpack(buf + 4, &pos); if (!pos) return 0; loop += 1; kpos += 8; if (kpos == hpos + (hslots << 3)) kpos = hpos; uint32_unpack(buf,&u); if (u == khash) { if (GCdbRead::read(buf,8,pos) == -1) return -1; uint32_unpack(buf,&u); if (u == len) switch(GCdbRead::match(key,len,pos + 8)) { case -1: return -1; case 1: 
uint32_unpack(buf + 4,&dlen); dpos = pos + 8 + len; return 1; } } } return 0; } int GCdbRead::find(const char *key) { GCdbRead::findstart(); return GCdbRead::findnext(key,gcdb_strlen(key)); } //----- GReadBuf and GReadBufLine char* GReadBufLine::readline(int idx) { //reads a char at a time until \n and/or \r are encountered GFREE(buf[idx].chars); buf[idx].len=0; if (isEOF) return NULL; int len=0; buf[idx].fpos=filepos; int c=0; int allocated=256; GMALLOC(buf[idx].chars, allocated); while ((c=getc(file))!=EOF) { if (len>=allocated-1) { allocated+=256; GREALLOC(buf[idx].chars, allocated); } if (c=='\n' || c=='\r') { buf[idx].chars[len]='\0'; if (c=='\r') { //DOS file -- special case if ((c=getc(file))!='\n') ungetc(c,file); else filepos++; } filepos++; buf[idx].len=len; return buf[idx].chars; } filepos++; buf[idx].chars[len]=(char)c; len++; } //while i0) { //preserve the lines already in buffer int bidx=bufidx-1;//always leave room for PREVIOUS line, for putLine() for (int i=0;i=0 && bufidx0 && bufidx0 && bufidx #include #ifdef _WIN32 #define PROT_READ 1 #define PROT_WRITE 2 #define PROT_READWRITE 3 #define MAP_SHARED 1 #define MAP_PRIVATE 2 #define F_OK 0 #define R_OK 4 #define W_OK 2 #define RW_OK 6 #ifndef MAP_FAILED #define MAP_FAILED ((void *) -1) #endif void *mmap(char *,size_t,int,int,int,off_t); int munmap(void *,size_t); #else #include #endif //===================================================== //------------- buffer stuff ------------------- //===================================================== #define GCDBUFFER_INSIZE 8192 #define GCDBUFFER_OUTSIZE 8192 typedef int (*opfunc)(int, char*, size_t); //typedef unsigned long gcdb_seek_pos; typedef off_t gcdb_seek_pos; typedef unsigned int (*uint_conv_func)(void*); //uint conversion function pointer typedef off_t (*offt_conv_func)(void*); //uint conversion function pointer typedef int16_t (*int16_conv_func)(void*); //int16 conversion function pointer //conversion function --> to platform independent uint 
extern uint_conv_func gcvt_uint; extern offt_conv_func gcvt_offt; extern int16_conv_func gcvt_int16; /* unsigned int uint32_sun(void* x86int); unsigned int uint32_x86(void* x86int); //for file offsets: off_t runtime conversions: off_t offt_sun(void* offt); off_t offt_x86(void* offt); int16_t int16_sun(void* i16); int16_t int16_x86(void* i16); */ void gcvt_endian_setup(); class GCDBuffer { public: char *x; unsigned int p; unsigned int n; int fd; opfunc op; //methods: GCDBuffer():x(NULL),p(0),n(0),fd(0),op(NULL) { } GCDBuffer(opfunc aop,int afd,char *buf,unsigned int len) { //check endianness gcvt_endian_setup(); init(aop, afd, buf, len); } void init(opfunc aop,int afd,char *buf,unsigned int len) { x=buf; fd=afd; op=aop; p=0; n=len; } int flush(); int write_all(char* buf, unsigned int pt); int put(char* buf,unsigned int len); int putalign(char* buf,unsigned int len); int putflush(char* buf,unsigned int len); int puts(char *buf); int putsalign(char *buf); int putsflush(char *buf); int oneRead(char* buf, unsigned int len); int getthis(char* buf,unsigned int len); int get(char* buf,unsigned int len); int bget(char* buf,unsigned int len); int feed(); char *peek(); void seek(unsigned int len); int copy(GCDBuffer* bin); }; //===================================================== //------------- cdb utils ------------------- //===================================================== #ifndef _WIN32 extern int errno; #endif extern int error_intr; extern int error_nomem; extern int error_proto; //additional data to be appended to the cdb file: #define CDBMSK_OPT_MULTI 0x00000001 #define CDBMSK_OPT_C 0x00000002 #define CDBMSK_OPT_CADD 0x00000004 #define CDBMSK_OPT_COMPRESS 0x00000008 #define CDBMSK_OPT_GSEQ 0x00000010 //creates a compressed version of the database //uses plenty of unions for ensuring compatibility with // the old 'CIDX' info structure //trying to prevent [64bit] machines to align this to 64bit -- sizeof() gets it wrong! 
#pragma pack(4) // eek, gcc 2.95.3 alpha-decosf version does not // recognize this pragma directive //32 bit limits for index file size struct cdbInfo { uint32 num_keys; union { uint32 num_records; char oldtag[4]; // 'CIDX' for old tag style }; // data file size -- used to be uint32, now it could be 64bit union { int64_t dbsize; uint32 oldnum[2]; //num_keys, num_records }; union { uint32 idxflags; uint32 old_dbsize; }; union { int dbnamelen; int old_idxflags; }; // -- the actual db name precedes this fixed-size record union { char tag[4]; //'CDBX' for new files with LFS uint32 old_dbnamelen; }; }; // for passing around index data: struct CIdxData32 { uint32 fpos; uint32 reclen; }; /* struct CIdxSeqData32 { //4+4+2+1 = 11 bytes uint32 fpos; uint32 reclen; uint16_t linelen; //line length for FASTA-formatted seq byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows) }; */ struct CIdxData { off_t fpos; //64bit value on Linux uint32 reclen; }; /* struct CIdxSeqData { //8+4+2+1 = 15 bytes off_t fpos; //64bit value on Linux uint32 reclen; uint16_t linelen; //line length for FASTA-formatted seq byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows) }; */ #pragma pack() extern int cdbInfoSIZE; extern int IdxDataSIZE; extern int IdxDataSIZE32; /* extern int IdxSeqDataSIZE; extern int IdxSeqDataSIZE32; */ void uint32_pack(char *,uint32); void uint32_pack_big(char *,uint32); void uint32_unpack(char *,uint32 *); void uint32_unpack_big(char *,uint32 *); //===================================================== //------------- cdb index ------------------- //===================================================== #define CDB_HPLIST 1000 struct cdb_hp { uint32 h; uint32 p; } ; struct cdb_hplist { struct cdb_hp hp[CDB_HPLIST]; struct cdb_hplist *next; int num; }; //the index file should always be smaller than 4GB ! 
class GCdbWrite { GCDBuffer* cdbuf; char bspace[8192]; char fname[1024]; char final[2048]; uint32 count[256]; uint32 start[256]; struct cdb_hplist *head; struct cdb_hp *split; /* includes space for hash */ struct cdb_hp *hash; uint32 numentries; uint32 pos; //file position int posplus(uint32 len); int fd; //file descriptor public: //methods: GCdbWrite(int afd); //was: init GCdbWrite(char* fname); ~GCdbWrite(); int addbegin(unsigned int keylen,unsigned int datalen); int addend(unsigned int keylen,unsigned int datalen,uint32 h); int addrec(const char *key,unsigned int keylen,char *data,unsigned int datalen); int add(const char *key, char *data, unsigned int datalen); int getNumEntries() { return numentries; } int finish(); int close(); int getfd() { return fd; } char* getfile() { return fname; } }; //===================================================== //------------- cdb ------------------- //===================================================== #define CDB_HASHSTART 5381 uint32 cdb_hashadd(uint32,unsigned char); uint32 cdb_hash(const char *,unsigned int); #define MCDB_SLOT_BITS 8 /* 2^8 = 256 */ #define MCDB_SLOTS (1u< MCDB_HEADER_SZ */ #define MCDB_BLOCK_SZ (1u<<22) /* 4MB; must be >= MCDB_MMAP_SZ */ class GCdbRead { //struct mcdb_mmap *map; char *map; // ptr, mmap pointer uintptr_t size; // mmap size, initialized if map is nonzero uint32_t b; // hash table stride bits: (data < 4GB) ? 
3 : 4 uint32_t n; // num records in mcdb uint32 loop; // number of hash slots searched under this key uint32 hslots; // initialized if loop is nonzero uintptr_t kpos; // initialized if loop is nonzero uintptr_t hpos; // initialized if loop is nonzero uintptr_t dpos; // initialized if cdb_findnext() returns 1 uint32 dlen; // initialized if cdb_findnext() returns 1 uint32 klen; // initialized if cdb_findnext() returns 1 uint32 khash; // initialized if loop is nonzero char fname[1024]; //char *map; // 0 if no map is available int fd; public: //methods: GCdbRead(int fd); //was cdb_init GCdbRead(char* afname); //was cdb_init ~GCdbRead(); //was cdb_free int read(char *,unsigned int,uint32); int match(const char *key, unsigned int len, uint32 pos); void findstart() { loop =0; } int findnext(const char *key,unsigned int len); int find(const char *key); int datapos() { return dpos; } int datalen() { return dlen; } int getfd() { return fd; } char* getfile() { return fname; } }; class GReadBuf { protected: FILE* f; uchar* buf; int buflen; int bufused; // int bufpos; off_t fpos; bool eof; bool eob; int refill(bool repos=false) { //refill the buffer----------- if (repos && bufpos==0) return 0; //no need to repos if (eof) return 0; int fr=0; if (repos && bufposreturns the number of bytes read int get(uchar *outbuf, int len) { if (eob) return 0; int rd=0; //bytes read while (!eob && rd=bufused) { if (eof) eob=true; else refill(); } }//while return rd; } uchar* getStr(uchar *outbuf, int len) { int rd=get(outbuf,len); if (rd==0) return NULL; else { outbuf[rd]='\0'; return outbuf; } } // getc equivalent int getch() { if (eob) return -1; int ch=(int)(uchar)buf[bufpos]; bufpos++; if (bufpos>=bufused) { if (eof) eob=true; else refill(); } return ch; } //--- bool isEof() { return eob; } bool ended() { return eob; } off_t getPos() { //returns the virtual file position // = the actual file offset of the byte at bufpos return fpos-(bufused-bufpos); } //skip into the stream the specified 
number of bytes int skip(int skiplen) { if (eob) return 0; int r=0; //the actual number of bytes skipped while (skiplen && !eob) { int dif=GMIN(bufused-bufpos,skiplen); skiplen-=dif; bufpos+=dif; r+=dif; if (bufpos>=bufused) { if (eof) { eob=true; return r; } refill(); } } return r; } //look ahead without updating the read pointer (bufpos) //Cannot peek more than buflen! int peek(uchar* outbuf, int len) { if (eob) return -1; //if (eob || len>buflen) return -1; if (len>bufused-bufpos) refill(true); int mlen=GMIN((bufused-bufpos),len); memcpy((void*)outbuf, (void*)(buf+bufpos), mlen); return mlen; } char peekChar() { if (eob) return -1; //if (eob || len>buflen) return -1; if (1>bufused-bufpos) refill(true); return *(buf+bufpos); } uchar* peekStr(uchar* outbuf, int len) { int rd=peek(outbuf,len); if (rd>0) { outbuf[rd]='\0'; return outbuf; } else return NULL; } //looks ahead to check if what follows matches int peekCmp(char* cmpstr, int cmplen=-1) { if (cmplen==0) return 0; if (eob) //GError("GReadBuf::peekcmp error: eob!\n"); return -2; if (cmplen<0) cmplen=strlen(cmpstr); if (cmplen>bufused-bufpos) { refill(true); if (cmplen>bufused-bufpos) return -2; } //use memcmp return memcmp((void*)(buf+bufpos), cmpstr, cmplen); } }; //circular line buffer, with read-ahead (peeking) capability class GReadBufLine { protected: struct BufLine { off_t fpos; int len; char* chars; }; int bufcap; //total number of lines in the buf array int bufidx; // the "current line" index in buf array bool isEOF; int lno; FILE* file; off_t filepos; //current file/stream offset for the first char of buf[bufidx] BufLine* buf; //array of bufferred lines char* readline(int idx);//read line from file into the buffer int fillbuf(); bool isEOB; public: const char* line(); //gets current line and advances the "current line" pointer //use putLine() to revert/undo this advancement off_t fpos(); //gets current line's byte offset in the file // does NOT advance the "current line" pointer int len(); //gets 
current line's length // does NOT advance the "current line" pointer bool isEof() { return isEOB; } bool eof() { return isEOB; } off_t getfpos() { return fpos(); } const char* getline() { return line(); } const char* getLine() { return line(); } int getLen() { return len(); } int linenumber() { return lno; } int lineno() { return lno; } int getLineNo() { return lno; } void putLine(); GReadBufLine(FILE* stream, int bcap=20) { if (bcap<2) bcap=2; //at least 1 prev line is needed for putLine() bufcap=bcap; bufidx=-1; isEOB=false; isEOF=false; lno=0; GMALLOC(buf, bufcap * sizeof(BufLine)); for (int i=0;icolor(rgb); } GDImg::~GDImg() { gdImageDestroy(img); if (fout!=NULL && fout!=stdout) fclose(fout); } int GDImg::color(byte r, byte g, byte b) { return gdImageColorAllocate(img,(int)r,(int)g,(int)b); } void GDImg::line(int x1, int y1, int x2, int y2, int color) { if (color==-1) color=currentColor; gdImageLine(img,x1,y1,x2,y2,color); } void GDImg::rectangle(int x1, int y1, int x2, int y2, int color) { if (color==-1) color=currentColor; gdImageRectangle(img,x1,y1,x2,y2,color); } void GDImg::fillRectangle(int x1, int y1, int x2, int y2, int color) { if (color==-1) color=currentColor; gdImageFilledRectangle(img,x1,y1,x2,y2,color); } void GDImg::fillPolygon(gdPointPtr points, int ptotal, int color) { if (color==-1) color=currentColor; gdImageFilledPolygon(img, points,ptotal,color); } void GDImg::setTransparent(int cidx) { //cidx must be the color index of a color allocated previously for img! 
gdImageColorTransparent(img,cidx); } void GDImg::setFile(const char* fname) { if (fout!=NULL && fout!=stdout) fclose(fout); if (fname[0]=='-' && fname[1]==0) { //special "-" file name means stdout fout=stdout; } else { fout=fopen(fname, "wb"); if (fout==NULL) GError("Error: cannot open file %s for writing!\n",fname); } } void GDImg::setFile(FILE* f) { if (fout!=NULL && fout!=stdout) fclose(fout); fout=f; } void GDImg::write(const char* fname) { if (fname==NULL && fout==NULL) GError("Error at GDImg::writeGIF() - no destination file given!\n"); if (fname!=NULL) setFile(fname); gdImageGif(img,fout); } gclib-0.12.7/gdimg.h000066400000000000000000000037661407072766100141470ustar00rootroot00000000000000#ifndef _GDIMG_ #define _GDIMG_ #include "GBase.h" #include "gd.h" class GDImg { protected: gdImagePtr img; int imgW; int imgH; FILE* fout; int currentColor; int bgColor; //preallocated white by default void setFile(const char* fname); void setFile(FILE* f); static int defaultBg; public: void init(int w=64, int h=64, const char* fname=NULL, int bg_rgb=defaultBg); GDImg(int w=64, int h=64, const char* fname=NULL, int bg_rgb=defaultBg) { init(w,h, fname, bg_rgb); } GDImg(int w,int h, int bg_rgb) { init(w,h, (const char*)NULL, bg_rgb); } ~GDImg(); void write(const char* fname=NULL); //automatically write GIF void setTransparent(int cidx); // -1 means 'no transparency' void setTransparent(bool v=true) { setTransparent(v ? 
(int) bgColor : (int)-1); } int color(byte r, byte g, byte b); int color(int rgb) { return color( (byte)(rgb>>16) & 255, (byte)(rgb>>8) & 255, (byte)(rgb & 255)); } int colorAllocate(byte r, byte g, byte b) { return color(r,g,b); } int colorAllocate(int rgb) { return color(rgb); } void setColorIdx(int color) { currentColor=color; } //current color for drawing operations int setColor(int r, int g, int b) { currentColor=this->color(r,g,b); return currentColor; } int setColor(int rgb) { currentColor=this->color(rgb); return currentColor; } void setPixel(int x, int y, int color=-1) { if (color==-1) color=currentColor; gdImageSetPixel(img, x,y,color); } int getPixel(int x, int y) { return gdImageGetPixel(img, x, y); } void setBg(int rgb); void clear(int color=-1) { if (color==-1) color=bgColor; fillRectangle(0,0,imgW,imgH,color); } void line(int x1, int y1, int x2, int y2, int color=-1); void rectangle(int x1, int y1, int x2, int y2, int color=-1); void fillRectangle(int x1, int y1, int x2, int y2, int color=-1); void fillPolygon(gdPointPtr points, int ptotal, int color=-1); }; #endif gclib-0.12.7/gdna.cpp000066400000000000000000000040261407072766100143120ustar00rootroot00000000000000#include "gdna.h" #include const char* IUPAC_2BIT ="AACCTTGGTTAAAAAACCCCGGAAAAAACCAAAAAA"; const char* IUPAC_2BITN ="001133223300000011112200000011000000"; const char* IUPAC_DEFS ="AaCcTtGgUuMmRrWwSsYyKkVvHhDdBbNnXx-*"; const char* IUPAC_COMP ="TtGgAaCcAaKkYyWwSsRrMmBbDdHhVvNnXx-*"; #define A_2BIT 0 // 00 #define C_2BIT 1 // 01 #define G_2BIT 2 // 10 #define T_2BIT 3 // 11 static byte ntCompTable[256]; static byte nt2bit[256]; //maps any character to a 2bit base value (with N = A) static char v_2bit2nt[4] = {'A','C','G','T'}; //---------------------- static bool gdna_Ready=gDnaInit(); //---------------------- byte gdna2bit(char* &nt, int n) { // Pack n bases into a byte (n can be 1..4) byte out = 0; while (n && *nt) { n--; out <<= 2; out += nt2bit[(int)*nt]; nt++; } #ifdef GDEBUG if (n) { 
GError("Error: attempt to read 6-mer beyond the end of the string!\n"); } #endif return out; } char ntComplement(char c) { return ntCompTable[(int)c]; } char g2bit2base(byte v2bit) { return v_2bit2nt[v2bit & 0x03 ]; } //in place reverse complement of nucleotide (sub)sequence char* reverseComplement(char* seq, int slen) { if (slen==0) slen=strlen(seq); //reverseChars(seq,len); int l=0; int r=slen-1; char c; while (l5MB mouse intron const int GFF_MIN_INTRON = 4; //for mergeCloseExons option //bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings() int gff_fid_mRNA=0; //mRNA (has CDS) int gff_fid_transcript=1; // generic "transcript" feature int gff_fid_exon=2; // generic "exon"-like feature (exon,CDS,UTR,start/stop codon) int gff_fid_CDS=3; // CDS feature (CDS, start/stop codon) const char* exonTypes[]={ "None", "StartCodon", "StopCodon", "CDS", "UTR", "CDS+UTR", "exon" }; const GffScore GFFSCORE_NONE; //const uint gfo_flag_LEVEL_MSK = 0x00FF0000; //const byte gfo_flagShift_LEVEL = 16; void gffnames_ref(GffNames* &n) { if (n==NULL) n=new GffNames(); n->numrefs++; } void gffnames_unref(GffNames* &n) { if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n"); n->numrefs--; if (n->numrefs==0) { delete n; n=NULL; } } const byte CLASSCODE_OVL_RANK = 14; //rank value just above 'o' class code //rank value < this means exon overlap const byte CLASSCODE_J_RANK = 6; // all junctional based overlaps byte classcode_rank(char c) { switch (c) { case '=': return 0; //intron chain match or full exon chain match if strict matching is enabled case '~': return 1; //intron chain match when strict matching is enabled case 'c': return 4; //containment, perfect partial match (transfrag contained in reference) case 'k': return 4; // reverse containment (reference contained in transfrag) case 'm': return 6; // full span overlap with all reference introns either matching or retained case 'n': return 6; // partial overlap transfrag with 
at least one intron retention case 'j': return 6; // multi-exon transfrag overlap with at least one junction match OR intron overlap! case 'e': return 12; // single exon transfrag partially overlapping an intron of reference (possible pre-mRNA fragment) case 'o': return 12; // other generic exon overlap //**** >14 => no exon overlaps (not on the same strand) from here on ***** case 's': return 16; //"shadow" - an intron overlaps with a ref intron on the opposite strand (wrong strand mapping?) case 'x': return 18; // generic overlap on opposite strand (usually wrong strand mapping) case 'i': return 20; // intra-intron (transfrag fully contained within a reference intron) case 'y': return 30; // no exon overlap: ref exons fall within transfrag introns! (reverse of i) case 'p': return 90; //polymerase run case 'r': return 92; //repeats case 'u': return 94; //intergenic case 0 : return 100; default: return 96; } } const char* strExonType(char xtype) { static const char* extbl[7]={"None", "start_codon", "stop_codon", "CDS", "UTR", "CDS_UTR", "exon"}; if (xtype>0 && xtype<7) return extbl[(int)xtype]; else return "NULL"; } int gfo_cmpByLoc(const pointer p1, const pointer p2) { GffObj& g1=*((GffObj*)p1); GffObj& g2=*((GffObj*)p2); if (g1.gseq_id==g2.gseq_id) { if (g1.start!=g2.start) return (int)(g1.start-g2.start); else if (g1.getLevel()!=g2.getLevel()) return (int)(g1.getLevel()-g2.getLevel()); else if (g1.end!=g2.end) return (int)(g1.end-g2.end); else return strcmp(g1.getID(), g2.getID()); } else //return (int)(g1.gseq_id-g2.gseq_id); // input order ! return strcmp(g1.getGSeqName(), g2.getGSeqName()); //lexicographic ! 
} //comparator for ordering by reference sequence (chromosome) index int gfo_cmpRefByID(const pointer p1, const pointer p2) { GffObj& g1=*((GffObj*)p1); GffObj& g2=*((GffObj*)p2); if (g1.gseq_id==g2.gseq_id) { if (g1.start!=g2.start) return (int)(g1.start-g2.start); else if (g1.getLevel()!=g2.getLevel()) return (int)(g1.getLevel()-g2.getLevel()); else if (g1.end!=g2.end) return (int)(g1.end-g2.end); else return strcmp(g1.getID(), g2.getID()); } else return (g1.gseq_id-g2.gseq_id); // sort refs by their id# order } char* GffLine::extractGFFAttr(char* & infostr, const char* oline, const char* attr, bool caseStrict, bool enforce_GTF2, int* rlen, bool deleteAttr) { //parse a key attribute and remove it from the info string //(only works for attributes that have values following them after ' ' or '=') static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required for GTF) at line:\n%s\n"; int attrlen=strlen(attr); char cend=attr[attrlen-1]; //char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr); //must make sure attr is not found in quoted text char* pos=infostr; char prevch=0; bool in_str=false; bool notfound=true; int (*strcmpfn)(const char*, const char*, int) = caseStrict ? 
Gstrcmp : Gstricmp; while (notfound && *pos) { char ch=*pos; if (ch=='"') { in_str=!in_str; pos++; prevch=ch; continue; } if (!in_str && (prevch==0 || prevch==' ' || prevch == ';') && strcmpfn(attr, pos, attrlen)==0) { //attr match found //check for word boundary on right char* epos=pos+attrlen; if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') { notfound=false; break; } //not a perfect match, move on pos=epos; prevch=*(pos-1); continue; } //not a match or in_str prevch=ch; pos++; } if (notfound) return NULL; char* vp=pos+attrlen; while (*vp==' ') vp++; if (*vp==';' || *vp==0) { GMessage("Warning: cannot parse value of GFF attribute \"%s\" at line:\n%s\n", attr, oline); return NULL; } bool dq_enclosed=false; //value string enclosed by double quotes if (*vp=='"') { dq_enclosed=true; vp++; } if (enforce_GTF2 && !dq_enclosed) GError(GTF2_ERR, attr, oline); char* vend=vp; if (dq_enclosed) { while (*vend!='"' && *vend!=';' && *vend!=0) vend++; } else { while (*vend!=';' && *vend!=0) vend++; } if (enforce_GTF2 && *vend!='"') GError(GTF2_ERR, attr, oline); char *r=Gstrdup(vp, vend-1); if (rlen) *rlen = vend-vp; if (deleteAttr) {//-- remove this attribute from infostr while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++; if (*vend==0) vend--; for (char *src=vend, *dest=pos;;src++,dest++) { *dest=*src; //shift the rest of infostr (copy over) if (*src==0) break; } } return r; } BEDLine::BEDLine(GffReader* reader, const char* l): skip(true), dupline(NULL), line(NULL), llen(0), gseqname(NULL), fstart(0), fend(0), strand(0), ID(NULL), info(NULL), cds_start(0), cds_end(0), cds_phase(0), exons(1) { if (reader==NULL || l==NULL) return; llen=strlen(l); GMALLOC(line,llen+1); memcpy(line, l, llen+1); GMALLOC(dupline, llen+1); memcpy(dupline, l, llen+1); char* t[14]; int i=0; int tidx=1; t[0]=line; if (startsWith(line, "browser ") || startsWith(line, "track ")) return; while (line[i]!=0) { if (line[i]=='\t') { line[i]=0; t[tidx]=line+i+1; tidx++; //our custom 
BED-13+ format, with GFF3 attributes in 13th column if (tidx>12) { info=t[12]; break; } } i++; } /* if (tidx<6) { // require BED-6+ lines GMessage("Warning: 6+ BED columns expected, instead found:\n%s\n", l); return; } */ gseqname=t[0]; char* p=t[1]; if (!parseUInt(p,fstart)) { GMessage("Warning: invalid BED start coordinate at line:\n%s\n",l); return; } ++fstart; //BED start is 0 based p=t[2]; if (!parseUInt(p,fend)) { GMessage("Warning: invalid BED end coordinate at line:\n%s\n",l); return; } if (fend5) { strand=*t[5]; if (strand!='-' && strand !='.' && strand !='+') { GMessage("Warning: unrecognized BED strand at line:\n%s\n",l); return; } } else strand='.'; //if (tidx>12) ID=t[12]; // else ID=t[3]; ID=t[3]; //now parse the exons, if any if (tidx>11) { int numexons=0; p=t[9]; if (!parseInt(p, numexons) || numexons<=0) { GMessage("Warning: invalid BED block count at line:\n%s\n",l); return; } char** blen; char** bstart; GMALLOC(blen, numexons * sizeof(char*)); GMALLOC(bstart, numexons * sizeof(char*)); i=0; int b=1; blen[0]=t[10]; while (t[10][i]!=0 && b<=numexons) { if (t[10][i]==',') { t[10][i]=0; if (b0 at line:\n%s\n",exonstart, l); return; } exonstart+=fstart; uint exonend=exonstart+exonlen-1; if ((uint)exonstart>fend || exonend>fend) { GMessage("Warning: BED exon %d-%d is outside record boundary at line:\n%s\n",exonstart,exonend, l); return; } ex.start=exonstart;ex.end=exonend; exons.Add(ex); } GFREE(blen); GFREE(bstart); } else { //take it as single-exon transcript GSeg v(fstart, fend); exons.Add(v); } if (info!=NULL) { char* cdstr=GffLine::extractGFFAttr(info, dupline, "CDS="); if (cdstr) { char* p=strchr(cdstr, ':'); if (p!=NULL) { *p='\0'; ++p; } if (strToUInt(cdstr, cds_start) && cds_start>=fstart-1) { ++cds_start; if (!strToUInt(p, cds_end) || cds_end>fend) { GMessage("Warning: invalid CDS (%d-%d) discarded for line:\n%s\n", cds_start, cds_end, dupline); cds_start=0; cds_end=0; //invalid CDS coordinates } } char* cdstr_phase=NULL; if (cds_start>0 && 
(cdstr_phase=GffLine::extractGFFAttr(info, dupline, "CDSphase="))!=NULL) { cds_phase=cdstr_phase[0]; GFREE(cdstr_phase); } GFREE(cdstr); } } if (cds_start==0 && cds_end==0 && tidx>7) { //check if columns 7,8 can be reasonably assumed to be CDS start-end coordinates if (strToUInt(t[6], cds_start) && strToUInt(t[7], cds_end) && cds_end>cds_start) { if (cds_start>=fstart-1 && cds_end<=fend) cds_start++; else { cds_start=0; cds_end=0; } } } skip=false; } bool GffLine::parseSegmentList(GVec& segs, char* str) { bool segs_valid=false; char* p=strchr(str, '-'); if (p!=NULL && p>str) { GDynArray ss; strsplit(str, ss, ','); GSeg seg; segs_valid=true; for (uint i=0;i(int)fend){ segs_valid=false; break; } if (!strToInt(p, xend) || xend<(int)fstart || xend>(int)fend) { segs_valid=false; break; } if (xstart>xend) { seg.start=(uint)xend;seg.end=(uint)xstart; } else { seg.start=(uint)xstart;seg.end=(uint)xend; } segs.Add(seg); } //parse all CDS segments if (segs_valid) { if (segs.Count()>1) segs.Sort(); } else segs.Clear(); } return segs_valid; } void GffLine::ensembl_GFF_ID_process(char*& id) { char* n=NULL; if (startsWith(id, "gene:")) { n=Gstrdup(id+5); GFREE(id); id=n; } else if (startsWith(id, "transcript:")) { n=Gstrdup(id+11); GFREE(id); id=n; } } void GffLine::ensembl_GTF_ID_process(char*& id, const char* ver_attr) { char* v=NULL; v=extractAttr(ver_attr); if (v!=NULL) { char* n=Gstrdup(id, strlen(v)+1); strcat(n,".");strcat(n,v); GFREE(v); GFREE(id); id=n; } } GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len(0), dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL), ftype(NULL), ftype_id(-1), info(NULL), fstart(0), fend(0), //qstart(0), qend(0), qlen(0), score(0), score_decimals(-1), strand(0), flags(0), exontype(exgffNone), phase(0), cds_start(0), cds_end(0), exons(), cdss(), gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(0), ID(NULL) { llen=strlen(l); GMALLOC(line,llen+1); memcpy(line, l, llen+1); GMALLOC(dupline, 
llen+1); memcpy(dupline, l, llen+1); skipLine=true; //clear only if we make it to the end of this function char* t[9]; int i=0; int tidx=1; t[0]=line; char fnamelc[128]; while (line[i]!=0) { if (line[i]=='\t') { line[i]=0; t[tidx]=line+i+1; tidx++; //if (tidx>8) break; } i++; } if (tidx<8) { // ignore non-GFF lines return; } if (tidx>9) { GMessage("Warning: unexpected tab character in last column, line truncated:\n\%s\n",l); } gffWarnings=reader->gff_warns; gseqname=t[0]; track=t[1]; ftype=t[2]; info=t[8]; char* p=t[3]; if (!parseUInt(p,fstart)) { //chromosome_band entries in Flybase GMessage("Warning: invalid start coordinate at line:\n%s\n",l); return; } p=t[4]; if (!parseUInt(p,fend)) { GMessage("Warning: invalid end coordinate at line:\n%s\n",l); return; } if (fendfeats.addName(ftype); } else if (endsWith(fnamelc, "_gene_segment")) { is_transcript=true; is_t_data=true; is_gene_segment=true; } else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) { is_gene=true; is_t_data=true; //because its name will be attached to parented transcripts } char* Parent=NULL; /* Rejecting non-transcript lines early if only transcripts are requested ?! It would be faster to do this here but there are GFF cases when we reject an unusual parent feature here (e.g. protein with CDS children) and then their exon/CDS children show up and get assigned to an implicit parent mRNA The solution is to still load this parent as GffObj for now and BAN it later so its children get dismissed/discarded as well. 
*/ if (reader->ignoreLocus) { if (strcmp(ftype, "locus")==0) return; if (is_transcript || is_gene) { char* locus=NULL; if (reader->is_gff3 || reader->gff_type==0) locus=extractAttr("locus="); else locus=extractAttr("locus"); if (locus!=NULL) { GFREE(locus); } } } char *gtf_tid=NULL; char *gtf_gid=NULL; if (reader->is_gff3 || reader->gff_type==0) { ID=extractAttr("ID=",true); if (ID!=NULL && reader->procEnsemblID()) { ensembl_GFF_ID_process(ID); } Parent=extractAttr("Parent=",true); if (Parent!=NULL && reader->procEnsemblID()) { ensembl_GFF_ID_process(Parent); } if (reader->gff_type==0) { if (ID!=NULL || Parent!=NULL) reader->is_gff3=true; else { //check if it looks like a GTF gtf_tid=extractAttr("transcript_id", true, true); if (gtf_tid!=NULL) { if (reader->procEnsemblID()) ensembl_GTF_ID_process(gtf_tid, "transcript_version"); } else { //NULL gtf_tid, try gene_id gtf_gid=extractAttr("gene_id", true, true); if (gtf_gid!=NULL) { if (reader->procEnsemblID()) ensembl_GTF_ID_process(gtf_gid, "gene_version"); } else return; //cannot determine file type yet } reader->is_gtf=true; } } } if (reader->is_gff3) { //parse as GFF3 //if (ID==NULL && Parent==NULL) return; //silently ignore unidentified/unlinked features if (ID!=NULL) { //has ID attr so it's likely to be a parent feature //look for explicit gene name gene_name=getAttrValue("gene_name="); if (gene_name==NULL) { gene_name=getAttrValue("geneName="); if (gene_name==NULL) { gene_name=getAttrValue("gene_sym="); if (gene_name==NULL) { gene_name=getAttrValue("gene="); } } } gene_id=getAttrValue("geneID="); if (gene_id==NULL) { gene_id=getAttrValue("gene_id="); } //--parse exons for TLF char* segstr=extractAttr("exons="); bool exons_valid=false; if (segstr) { exons_valid=parseSegmentList(exons, segstr); char* exoncountstr=extractAttr("exonCount="); if (exoncountstr) { int exoncount=0; if (!strToInt(exoncountstr, exoncount) || exoncount!=(int)exons.Count()) GMessage("Warning: exonCount attribute value doesn't match the 
exons attribute!\n"); GFREE(exoncountstr); } GFREE(segstr); } if (exons_valid) { bool validCDS=false; segstr=extractAttr("CDS="); if (segstr) { char* p=strchr(segstr, ':'); if (p!=NULL) { // CDS=start:end format *p='\0'; ++p; validCDS=true; if (validCDS && strToUInt(segstr, cds_start) && cds_start>=fstart) { if (!strToUInt(p, cds_end) || cds_end>fend) { validCDS=false; } } if (!validCDS || (int)cds_start<=0 || (int)cds_end<=0) { GMessage("Warning: invalid CDS (%d-%d) discarded for line:\n%s\n", cds_start, cds_end, dupline); cds_start=0; cds_end=0; } } //CDS=start:end format else { //CDS = list of start-end segments, just like the exons validCDS=parseSegmentList(cdss, segstr); if (validCDS && cdss.Count()>0) { if (cds_start==0) cds_start=cdss.First().start; if (cds_end==0) cds_end=cdss.Last().end; } } GFREE(segstr); } if (validCDS) { char* cds_phase=NULL; if ((cds_phase=extractAttr("CDSphase="))!=NULL) { phase=cds_phase[0]; GFREE(cds_phase); } } //CDS found }//has valid exons }// has GFF3 ID if (Parent!=NULL) { //keep Parent attr //parse multiple parents num_parents=1; p=Parent; int last_delim_pos=-1; while (*p!=';' && *p!=0) { if (*p==',' && *(p+1)!=0 && *(p+1)!=';') { num_parents++; last_delim_pos=(p-Parent); } p++; } _parents_len=p-Parent+1; _parents=Parent; GMALLOC(parents, num_parents*sizeof(char*)); parents[0]=_parents; int i=1; if (last_delim_pos>0) { for (p=_parents+1;p<=_parents+last_delim_pos;p++) { if (*p==',') { char* ep=p-1; while (*ep==' ' && ep>_parents) ep--; *(ep+1)=0; //end the string there parents[i]=p+1; i++; } } } } //has Parent field //special case for gene_id: for genes, this is the ID if (gene_id==NULL) { if (is_gene) { if (ID!=NULL) gene_id=Gstrdup(ID); } else if (is_transcript) { if (Parent!=NULL) gene_id=Gstrdup(Parent); } } } //GFF3 else { // ----------------- GTF syntax ------------------ if (reader->transcripts_Only && !is_t_data) { return; //alwasys skip unrecognized non-transcript features in GTF } if (is_gene) { 
reader->gtf_gene=true; ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true); //Ensemble GTF might lack transcript_id ! if (ID!=NULL) { if (gtf_tid==NULL && reader->procEnsemblID()) ensembl_GTF_ID_process(ID, "transcript_version"); } gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true); if (gene_id!=NULL && gtf_gid==NULL && reader->procEnsemblID()) ensembl_GTF_ID_process(gene_id, "gene_version"); if (ID==NULL) { //no transcript_id -- this should not be valid GTF2 format, but Ensembl (Gencode?) //has being known to add "gene" features with only gene_id in their GTF if (gene_id!=NULL) { //likely a gene feature line (Ensembl!) ID=Gstrdup(gene_id); //take over as ID (for defective GTF lacking transcript_id) } } // else if (strcmp(gene_id, ID)==0) //GENCODE v20 gene feature ? } else if (is_transcript) { ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true); //gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID if (ID==NULL) { //something is wrong here, cannot parse the GTF ID GMessage("Warning: invalid GTF record, transcript_id not found:\n%s\n", l); return; } else if (gtf_tid==NULL && reader->procEnsemblID()) ensembl_GTF_ID_process(ID, "transcript_version"); gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true); if (gene_id!=NULL) { if (gtf_gid==NULL && reader->procEnsemblID()) ensembl_GTF_ID_process(gene_id, "gene_version"); Parent=Gstrdup(gene_id); } reader->gtf_transcript=true; is_gtf_transcript=1; } else { //must be an exon type ? Parent = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true); if (Parent!=NULL && gtf_tid==NULL && reader->procEnsemblID()) ensembl_GTF_ID_process(Parent, "transcript_version"); gene_id = (gtf_gid!=NULL) ? 
gtf_gid : extractAttr("gene_id", true, true); // for GTF this is the only attribute accepted as geneID if (gene_id!=NULL && gtf_gid==NULL && reader->procEnsemblID()) ensembl_GTF_ID_process(gene_id, "gene_version"); //old pre-GTF2 formats like Jigsaw's (legacy support) if (Parent==NULL && exontype==exgffExon) { if (startsWith(track,"jigsaw")) { is_cds=true; strcpy(track,"jigsaw"); p=strchr(info,';'); if (p==NULL) { Parent=Gstrdup(info); info=NULL; } else { Parent=Gstrdup(info,p-1); info=p+1; } } } if (Parent==NULL) { //something is wrong here couldn't parse the transcript ID for this feature GMessage("Warning: invalid GTF record, transcript_id not found:\n%s\n", l); return; } } //more GTF attribute parsing if (is_gene && gene_id==NULL && ID!=NULL) gene_id=Gstrdup(ID); gene_name=getAttrValue("gene_name"); if (gene_name==NULL) { gene_name=getAttrValue("gene_sym"); if (gene_name==NULL) { gene_name=getAttrValue("gene"); if (gene_name==NULL) gene_name=getAttrValue("genesymbol"); } } //*** IMPORTANT: prepare GTF for easy parseAttr by adding '=' character after the attribute name // for ALL attributes p=info; bool noed=true; //not edited after the last delim bool nsp=false; //non-space found after last delim while (*p!=0) { if (*p==' ') { if (nsp && noed) { *p='='; noed=false; p++; continue; } } else nsp=true; //non-space if (*p==';') { noed=true; nsp=false; } p++; } //-- GTF prepare parents[] if Parent found if (Parent!=NULL) { //GTF transcript_id found as a parent _parents=Parent; num_parents=1; _parents_len=strlen(Parent)+1; GMALLOC(parents, sizeof(char*)); parents[0]=_parents; } } //GTF if (ID==NULL && parents==NULL) { if (gffWarnings) GMessage("Warning: discarding unrecognized feature (no ID or Parent):\n%s\n",dupline); return; //skip } skipLine=false; } //FIXME - this should only be used AFTER finalize() was called, and must have cdss=NULL of course void GffObj::setCDS(uint cd_start, uint cd_end, char phase) { if (cd_startstart) { GMessage("Warning: setCDS() called 
for %s with an out of bounds CDS start %d!\n", gffID, cd_start); return; } if (cd_end>this->end) { GMessage("Warning: setCDS() called for %s with an out of bounds CDS end %d!\n", gffID, cd_end); return; } this->CDstart=cd_start; this->CDend=cd_end; this->CDphase=phase; isTranscript(true); subftype_id=gff_fid_exon; if (monoFeature()) { if (exons.Count()==0) addExon(this->start, this->end, exgffExon); else exons[0]->exontype=exgffExon; } } void GffObj::setCDS(GffObj* t) { //copy the cdss as well uint cd_start=t->CDstart; uint cd_end=t->CDend; uint phase=t->CDphase; if (cd_startstart) { GMessage("Warning: setCDS() called for %s with an out of bounds CDS start %d!\n", gffID, cd_start); return; } if (cd_end>this->end) { GMessage("Warning: setCDS() called for %s with an out of bounds CDS end %d!\n", gffID, cd_end); return; } this->CDstart=cd_start; this->CDend=cd_end; this->CDphase=phase; isTranscript(true); subftype_id=gff_fid_exon; if (monoFeature()) { if (exons.Count()==0) addExon(this->start, this->end, exgffExon); else exons[0]->exontype=exgffExon; } if (t->cdss!=NULL) { if (this->cdss!=NULL) delete cdss; cdss=new GList(true, true, false); for (int i=0;icdss->Count();i++) { cdss->Add(new GffExon(*(t->cdss->Get(i)))); } } } int GffObj::readExon(GffReader& reader, GffLine& gl) { // -- this should only be called before ::finalize()! //should make sure to get the right subftype_id! if (!isTranscript() && gl.exontype>exgffNone) { //subfeature recognized as exon-like, so this should be considered a transcript! 
isTranscript(true); } if (isTranscript()) { if (subftype_id<0) {//exon_ftype_id=gff_fid_exon; if (gl.exontype>exgffNone) subftype_id=gff_fid_exon; else subftype_id=names->feats.addName(gl.ftype); } //any recognized exon-like segment gets the generic "exon" type (also applies to CDS) if (gl.exontype==exgffNone && !gl.is_transcript) { //extraneous mRNA feature, discard if (reader.gff_warns) GMessage("Warning: discarding unrecognized transcript subfeature '%s' of %s\n", gl.ftype, gffID); return -1; } } else { //non-mRNA parent feature, check this subf type int subf_id=names->feats.addName(gl.ftype); if (subftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added) subftype_id=subf_id; else { if (subftype_id!=subf_id) { if (subftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) { //the existing exon was just a dummy one created by default, discard it? exons.Clear(); covlen=0; subftype_id=subf_id; //allow the new subfeature to completely takeover } else { //multiple subfeatures, prefer those exon-like if (reader.gff_warns) GMessage("Warning: multiple subfeatures (%s and %s) found for %s, discarding ", names->feats.getName(subf_id), names->feats.getName(subftype_id),gffID); if (gl.exontype>exgffNone) { //new feature is an exon, discard previously parsed subfeatures if (reader.gff_warns) GMessage("%s.\n", names->feats.getName(subftype_id)); subftype_id=subf_id; exons.Clear(); covlen=0; } else { //discard new feature if (reader.gff_warns) GMessage("Warning: skipping subfeature %s.\n", names->feats.getName(subf_id)); return -1; //skip this 2nd subfeature type for this parent! 
} } } //incoming subfeature is of different type } //new subfeature type } //non-mRNA parent int eidx=-1; GList* segs=NULL; //either cds or &exons if (gl.is_cds) { if (cdss==NULL) cdss=new GList(true, true, false); segs=cdss; } else { segs=&exons; } eidx=addExon(*segs, gl); if (eidx<0) { GMessage("Warning: addExon() failed for GFF line:\n%s\n",gl.dupline); return eidx; //this should never happen! } if (reader.keep_Attrs) { if (reader.noExonAttrs) { parseAttrs(attrs, gl.info, true); } else { //need all exon-level attributes parseAttrs((*segs)[eidx]->attrs, gl.info, true, gl.is_cds); } } return eidx; } int GffObj::addExon(GList& segs, GffLine& gl, int8_t exontype_override) { int ex_type=(exontype_override!=exgffNone) ? exontype_override : gl.exontype; GffScore exon_score(gl.score, gl.score_decimals); int eidx=addExon(gl.fstart, gl.fend, ex_type, gl.phase, exon_score, &segs); if (&segs==cdss && isGene() && gl.ID!=NULL && eidx>=0) { //special NCBI cases where CDS can be treated as discontiguous features, grouped by their ID //-- used for genes with X_gene_segment features //char* cds_id=Gstrdup(gl.ID); //segs[eidx]->uptr=cds_id; segs[eidx]->uptr=gl.ID; gl.ID=NULL; } return eidx; } int GffObj::exonOverlapIdx(GList& segs, uint s, uint e, int* ovlen, int start_idx) { //return the exons' index for the overlapping OR ADJACENT exon //ovlen, if given, will return the overlap length //if (s>e) Gswap(s,e); for (int i=start_idx;istart>e+1) break; if (s-1>segs[i]->end) continue; //-- overlap/adjacent if we are here: if (ovlen!=NULL) { int ovlend= (segs[i]->end>e) ? e : segs[i]->end; *ovlen= ovlend - ((s>segs[i]->start)? s : segs[i]->start)+1; } return i; } //for each exon *ovlen=0; return -1; } void GffObj::transferCDS(GffExon* cds) { //direct adding of a cds to the cdss pointer, without checking if (cdss==NULL) cdss=new GList(true, true, false); cdss->Add(cds); //now the caller must forget this exon! 
if (CDstart==0 || CDstart>cds->start) CDstart=cds->start; } int GffObj::addExon(uint segstart, uint segend, int8_t exontype, char phase, GffScore exon_score, GList* segs) { if (segstart>segend) { Gswap(segstart, segend); } if (segs==NULL) segs=&exons; if (exontype!=exgffNone) { //check for overlaps between exon/CDS-type segments //addExonSegment(gl.fstart, gl.fend, gl.score, gl.phase, gl.is_cds, exontype_override); int ovlen=0; int oi=-1; while ((oi=exonOverlapIdx(*segs, segstart, segend, &ovlen, oi+1))>=0) { //note: ovlen==0 for adjacent segments if ((*segs)[oi]->exontype>exgffNone && (*segs)[oi]->start<=segstart && (*segs)[oi]->end>=segend) { //existing feature contains this segment, so we do NOT need to add it //-- unless its the annoying NCBI exception: gene with multiple alternate // _gene_segment CDS features! if (!(this->isGene() && exontype==exgffCDS && (*segs)[oi]->exontype==exgffCDS )) return oi; } if (ovlen==0 || !(exontype==exgffCDS && (*segs)[oi]->exontype==exgffCDS)) { //always merge adjacent features //but NEVER merge two overlapping CDS (CDS programmed ribosomal shift aware) int8_t segtype=((*segs)[oi]->exontype==exgffCDS || exontype==exgffCDS) ? 
exgffCDS : exgffExon; //if expanded upward, may overlap the segment(s) above expandSegment(*segs, oi, segstart, segend, segtype); return oi; } } } //exon overlap/adjacent check //new exon/CDS, not merged in a previous one GffExon* enew=new GffExon(segstart, segend, exontype, phase, exon_score.score, exon_score.precision); int eidx=segs->Add(enew); if (eidx<0) { //this would actually be possible if the object is a "Gene" and "exons" are in fact isoforms delete enew; hasErrors(true); return -1; } if (start>segs->First()->start) start=segs->First()->start; if (endLast()->end) end=segs->Last()->end; if (isFinalized() && segs==&exons) { covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1; } return eidx; } void GffObj::expandSegment(GList& segs, int oi, uint segstart, uint segend, int8_t exontype) { //oi is the index of the *first* overlapping segment found that must be enlarged covlen-=segs[oi]->len(); if (segstartstart) segs[oi]->start=segstart; //if (qs && qsqstart) exons[oi]->qstart=qs; if (segend>segs[oi]->end) segs[oi]->end=segend; //if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe; //warning: score cannot be properly adjusted! e.g. 
if it's a p-value it's just going to get worse //if (sc!=0) segs[oi]->score=sc; //covlen+=exons[oi]->len(); //if (exons[oi]->exontype< exontype) -- always true segs[oi]->exontype = exontype; //if (exontype==exgffCDS) exons[oi]->phase=fr; //we must check if any more exons are also overlapping this int ni=oi+1; //next exon index after oi while (nistart<=segend+1) { // next segment overlaps OR adjacent to newly enlarged segment if (segs[ni]->exontype>0 && (segs[ni]->start==segend+1 || segs[ni]->exontype!=exgffCDS || exontype!=exgffCDS)) { if (segs[ni]->startstart) { segs[oi]->start=segs[ni]->start; if (strand=='+') segs[oi]->phase=segs[ni]->phase; } if (segs[ni]->end>segs[oi]->end) { segs[oi]->end=segs[ni]->end; if (strand=='-') segs[oi]->phase=segs[ni]->phase; } segs.Delete(ni); } else ++ni; } //until no more overlapping/adjacent segments found // -- make sure any other related boundaries are updated: if (isFinalized()) { if (&segs==&exons) { start=exons.First()->start; end=exons.Last()->end; //recalculate covlen covlen=0; for (int i=0;ilen(); } } else { if (start>segs.First()->start) start=segs.First()->start; if (endend) end=segs.Last()->end; } } void GffObj::removeExon(int idx) { if (idx<0 || idx>=exons.Count()) return; int segstart=exons[idx]->start; int segend=exons[idx]->end; exons.Delete(idx); if (isFinalized()) { covlen -= (int)(segend-segstart)+1; start=exons.First()->start; end=exons.Last()->end; if (isCDSOnly()) { CDstart=start; CDend=end; } } } void GffObj::removeExon(GffExon* p) { for (int idx=0;idxstart; int segend=exons[idx]->end; exons.Delete(idx); covlen -= (int)(segend-segstart)+1; if (exons.Count() > 0) { start=exons.First()->start; end=exons.Last()->end; if (isCDSOnly()) { CDstart=start; CDend=end; } } return; } } } GffObj::GffObj(GffReader& gfrd, BEDLine& bedline):GSeg(0,0), exons(true,true,false), cdss(NULL), gscore() { uptr=NULL; ulink=NULL; parent=NULL; udata=0; flags=0; CDstart=0; CDend=0; CDphase=0; attrs=NULL; gffID=NULL; track_id=-1; 
gseq_id=-1; //ftype_id=-1; //subftype_id=-1; strand='.'; gffnames_ref(names); //qlen=0;qstart=0;qend=0; covlen=0; geneID=NULL; gene_name=NULL; ftype_id=gff_fid_transcript; subftype_id=gff_fid_exon; start=bedline.fstart; end=bedline.fend; gseq_id=names->gseqs.addName(bedline.gseqname); track_id=names->tracks.addName("BED"); strand=bedline.strand; //setup flags from gffline isGene(false); isTranscript(true); gffID=Gstrdup(bedline.ID); for (int i=0;iaddExon(bedline.exons[i].start, bedline.exons[i].end, exgffExon); if (eidx<0 && gfrd.showWarnings()) GMessage("Warning: failed adding segment %d-%d for %s (discarded)!\n", bedline.exons[i].start, bedline.exons[i].end, gffID); } if (bedline.cds_start>0) { CDstart=bedline.cds_start; CDend=bedline.cds_end; if (CDstart>0 && bedline.cds_phase) CDphase=bedline.cds_phase; } if (gfrd.keep_Attrs && bedline.info!=NULL) this->parseAttrs(attrs, bedline.info); } GffObj::GffObj(GffReader &gfrd, GffLine& gffline): GSeg(0,0), exons(true,true,false), cdss(NULL), children(1,false), gscore() { uptr=NULL; ulink=NULL; parent=NULL; udata=0; flags=0; CDstart=0; CDend=0; CDphase=0; geneID=NULL; gene_name=NULL; attrs=NULL; gffID=NULL; track_id=-1; gseq_id=-1; //ftype_id=-1; subftype_id=-1; strand='.'; gffnames_ref(names); //qlen=0;qstart=0;qend=0; covlen=0; ftype_id=gffline.ftype_id; start=gffline.fstart; end=gffline.fend; gseq_id=names->gseqs.addName(gffline.gseqname); track_id=names->tracks.addName(gffline.track); strand=gffline.strand; /* qcov=0; qlen=gffline.qlen; qstart=gffline.qstart; qend=gffline.qend; */ //setup flags from gffline isCDSOnly(gffline.is_cds); //for now isGene(gffline.is_gene); isTranscript(gffline.is_transcript || gffline.exontype!=exgffNone); //fromGff3(gffline.is_gff3); isGeneSegment(gffline.is_gene_segment); if (gffline.parents!=NULL && !gffline.is_transcript) { //GTF style -- create a GffObj directly by subfeature //(also possible orphan GFF3 exon line, or an exon given before its parent (chado)) if 
(gffline.exontype!=exgffNone) { //recognized exon-like feature ftype_id=gff_fid_transcript; //so this is some sort of transcript subftype_id=gff_fid_exon; //subfeatures MUST be exons //typical GTF2 without "transcript" line // or malformed GFF3 with orphan or premature exon feature (found before the transcript line) gffID=Gstrdup(gffline.parents[0]); this->createdByExon(true); if (gfrd.is_gff3 && gfrd.showWarnings()) GMessage("Warning: exon feature found before transcript ID %s\n",gffID); //this is the first exon/segment of the transcript readExon(gfrd, gffline); } else {//unrecognized (non-exon) subfeature //make this GffObj of the same feature type ftype_id=names->feats.addName(gffline.ftype); if (gffline.ID!=NULL) { //unrecognized non-exon feature ? use the ID instead this->hasGffID(true); gffID=Gstrdup(gffline.ID); if (gfrd.keep_Attrs) this->parseAttrs(attrs, gffline.info); } else { //no ID, just Parent GMessage("Warning: unrecognized parented feature without ID found before its parent:\n%s\n", gffline.dupline); gffID=Gstrdup(gffline.parents[0]); this->createdByExon(true); readExon(gfrd, gffline); } } //unrecognized (non-exon) feature } //non-transcript parented subfeature given directly else { //non-parented feature OR a recognizable transcript //create a parent feature in its own right gscore.score=gffline.score; gscore.precision=gffline.score_decimals; if (gffline.ID==NULL || gffline.ID[0]==0) GError("Error: no valid ID found for GFF record\n"); this->hasGffID(true); gffID=Gstrdup(gffline.ID); //there must be an ID here //if (gffline.is_transcript) ftype_id=gff_fid_mRNA; //else if (gffline.is_transcript) { subftype_id=gff_fid_exon; if (ftype_id<0) ftype_id=names->feats.addName(gffline.ftype); if (gfrd.is_gff3) { if (gffline.exons.Count()>0) { //for compact GFF-like transcript line format (TLF), exons were already found as attributes for (int i=0;iaddExon(gffline.exons[i].start, gffline.exons[i].end, exgffExon, '.', gscore); if (eidx<0 && gfrd.showWarnings()) 
GMessage("Warning: failed adding exon %d-%d for %s (discarded)!\n", gffline.exons[i].start, gffline.exons[i].end, gffID); } } if (gffline.cds_start>0) { CDstart=gffline.cds_start; CDend=gffline.cds_end; } if (gffline.phase!=0) CDphase=gffline.phase; if (gffline.cdss.Count()>0) { //for compact GFF-like transcript line format (TLF), CDS might be already found as attributes if (cdss==NULL) cdss=new GList(true, true, false); for (int i=0;iaddExon(gffline.cdss[i].start, gffline.cdss[i].end, exgffCDS, 0, GFFSCORE_NONE, cdss); if (eidx<0 && gfrd.showWarnings()) GMessage("Warning: failed adding CDS segment %d-%d for %s (discarded)!\n", gffline.cdss[i].start, gffline.cdss[i].end, gffID); } } } } //is_transcript if (gfrd.keep_Attrs) this->parseAttrs(attrs, gffline.info); if (gfrd.is_gff3 && gffline.parents==NULL && gffline.exontype!=exgffNone) { //special case with bacterial genes just given as a CDS/exon, without parent! this->createdByExon(true); if (ftype_id<0) ftype_id=gff_fid_mRNA; readExon(gfrd, gffline); } if (ftype_id<0) ftype_id=names->feats.addName(gffline.ftype); }//no parent OR recognizable transcript if (gffline.gene_name!=NULL) { gene_name=Gstrdup(gffline.gene_name); } if (gffline.gene_id) { //only for gene features or GTF2 gene_id attribute if (!(this->isGene() && strcmp(gffID, gffline.gene_id)==0)) geneID=Gstrdup(gffline.gene_id); } /*//we cannot assume parents[0] is a gene! for NCBI miRNA, parent can be a primary_transcript feature! 
else if (gffline.is_transcript && gffline.parents!=NULL) { geneID=Gstrdup(gffline.parents[0]); } */ } BEDLine* GffReader::nextBEDLine() { if (bedline!=NULL) return bedline; //caller should free gffline after processing while (bedline==NULL) { int llen=0; buflen=GFF_LINELEN-1; char* l=fgetline(linebuf, buflen, fh, &fpos, &llen); if (l==NULL) return NULL; int ns=0; //first nonspace position while (l[ns]!=0 && isspace(l[ns])) ns++; if (l[ns]=='#' || llen<7) continue; bedline=new BEDLine(this, l); if (bedline->skip) { delete bedline; bedline=NULL; continue; } } return bedline; } GffLine* GffReader::nextGffLine() { if (gffline!=NULL) return gffline; //caller should free gffline after processing while (gffline==NULL) { int llen=0; buflen=GFF_LINELEN-1; char* l=fgetline(linebuf, buflen, fh, &fpos, &llen); if (l==NULL) { return NULL; //end of file } #ifdef CUFFLINKS _crc_result.process_bytes( linebuf, llen ); #endif int ns=0; //first nonspace position bool commentLine=false; while (l[ns]!=0 && isspace(l[ns])) ns++; if (l[ns]=='#') { commentLine=true; if (llen<10) { if (commentParser!=NULL) (*commentParser)(l, &gflst); continue; } } gffline=new GffLine(this, l); if (gffline->skipLine) { if (commentLine && commentParser!=NULL) (*commentParser)(gffline->dupline, &gflst); delete gffline; gffline=NULL; continue; } if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID //this might not be needed, already checked in the GffLine constructor if (gff_warns) GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n"); delete gffline; gffline=NULL; //continue; } } return gffline; } char* GffReader::gfoBuildId(const char* id, const char* ctg) { //caller must free the returned pointer char* buf=NULL; int idlen=strlen(id); GMALLOC(buf, idlen+strlen(ctg)+2); strcpy(buf, id); buf[idlen]='~'; strcpy(buf+idlen+1, ctg); return buf; } GffObj* GffReader::gfoAdd(GffObj* gfo) { GPVec* glst=phash.Find(gfo->gffID); if (glst==NULL) glst=new GPVec(false); int 
i=glst->Add(gfo); phash.Add(gfo->gffID, glst); return glst->Get(i); } GffObj* GffReader::gfoAdd(GPVec& glst, GffObj* gfo) { int i=glst.Add(gfo); return glst[i]; } GffObj* GffReader::gfoReplace(GPVec& glst, GffObj* gfo, GffObj* toreplace) { for (int i=0;i*& glst) { glst = phash.Find(id); return (glst!=NULL); } GffObj* GffReader::gfoFind(const char* id, GPVec*& glst, const char* ctg, char strand, uint start, uint end) { GPVec* gl=NULL; if (glst) { gl=glst; } else { gl = phash.Find(id); } GffObj* gh=NULL; if (gl) { for (int i=0;iCount();i++) { GffObj& gfo = *(gl->Get(i)); if (ctg!=NULL && strcmp(ctg, gfo.getGSeqName())!=0) continue; if (strand && gfo.strand!='.' && strand != gfo.strand) continue; if (start>0) { if (abs((int)start-(int)gfo.start)> (int)GFF_MAX_LOCUS) continue; if (end>0 && (gfo.start>end || gfo.endchildren.Add(newgfo); if (newgfo->parent==NULL) newgfo->parent=parent; newgfo->setLevel(parent->getLevel()+1); //if (parent->isGene()) { if (parent->gene_name!=NULL && newgfo->gene_name==NULL) newgfo->gene_name=Gstrdup(parent->gene_name); if (parent->geneID!=NULL && newgfo->geneID==NULL) newgfo->geneID=Gstrdup(parent->geneID); //} return newgfo; } GffObj* GffReader::newGffRec(GffLine* gffline, GffObj* parent, GffExon* pexon, GPVec* glst, bool replace_parent) { GffObj* newgfo=new GffObj(*this, *gffline); GffObj* r=NULL; gflst.Add(newgfo); //tag non-transcripts to be discarded later if (this->transcripts_Only && this->is_gff3 && gffline->ID!=NULL && gffline->exontype==exgffNone && !gffline->is_gene && !gffline->is_transcript) { //unrecognized non-exon entity, should be discarded newgfo->isDiscarded(true); this->discarded_ids.Add(gffline->ID, 1); } if (replace_parent && glst) { r=gfoReplace(*glst, newgfo, parent); updateParent(r, parent); } else { //regular case of new GffObj creation r=(glst) ? 
gfoAdd(*glst, newgfo) : gfoAdd(newgfo); if (parent!=NULL) { updateParent(r, parent); if (pexon!=NULL) parent->removeExon(pexon); } } return r; } GffObj* GffReader::newGffRec(BEDLine* bedline, GPVec* glst) { GffObj* newgfo=new GffObj(*this, *bedline); GffObj* r=NULL; gflst.Add(newgfo); r=(glst) ? gfoAdd(*glst, newgfo) : gfoAdd(newgfo); return r; } GffObj* GffReader::updateGffRec(GffObj* prevgfo, GffLine* gffline) { if (prevgfo==NULL) return NULL; //prevgfo->gffobj->createdByExon(false); if (gffline->ftype_id>=0) prevgfo->ftype_id=gffline->ftype_id; else prevgfo->ftype_id=prevgfo->names->feats.addName(gffline->ftype); prevgfo->start=gffline->fstart; prevgfo->end=gffline->fend; prevgfo->isGene(gffline->is_gene); prevgfo->isTranscript(gffline->is_transcript || gffline->exontype!=exgffNone); prevgfo->hasGffID(gffline->ID!=NULL); if (keep_Attrs) { if (prevgfo->attrs!=NULL) prevgfo->attrs->Clear(); prevgfo->parseAttrs(prevgfo->attrs, gffline->info); } return prevgfo; } bool GffReader::readExonFeature(GffObj* prevgfo, GffLine* gffline, GHash* pex) { //this should only be called before prevgfo->finalize()! bool r=true; if (gffline->strand!=prevgfo->strand) { if (prevgfo->strand=='.') { prevgfo->strand=gffline->strand; } else { GMessage("Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n", prevgfo->gffID, prevgfo->strand, gffline->fstart, gffline->fend, gffline->strand, prevgfo->getGSeqName()); return true; } } int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end : ((gffline->fendstart)? 
prevgfo->start-gffline->fend : 0 ); if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID); //validation_errors = true; r=false; if (!gff_warns) exit(1); } int eidx=prevgfo->readExon(*this, *gffline); if (pex!=NULL && eidx>=0) { //if (eidx==0 && gffline->exontype>0) prevgfo->isTranscript(true); if (gffline->ID!=NULL && gffline->exontype==exgffNone) subfPoolAdd(*pex, prevgfo); } return r; } CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash& pex, char*& subp_name) { CNonExon* subp=NULL; subp_name=NULL; for (int i=0;inum_parents;i++) { if (transcripts_Only && discarded_ids.Find(gffline->parents[i])!=NULL) continue; subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name subp=pex.Find(subp_name); if (subp!=NULL) return subp; GFREE(subp_name); } return NULL; } void GffReader::subfPoolAdd(GHash& pex, GffObj* newgfo) { //this might become a parent feature later if (newgfo->exons.Count()>0) { char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname); pex.Add(xbuf, new CNonExon(newgfo, newgfo->exons[0], *gffline)); GFREE(xbuf); } } GffObj* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash& pex) { GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene) //if (prevp!=gflst[subp->idx]) // GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID); subp->gffline->discardParent(); GffObj* gfoh=newGffRec(subp->gffline, prevp, subp->exon); pex.Remove(subp_name); //no longer a potential parent, moved it to phash already prevp->promotedChildren(true); return gfoh; //returns the holder of newly promoted feature } //In the rare cases where the GFF/GTF stream is properly formatted // i.e. when all sub-features are grouped with (and preceded by) their parent! 
GffObj* GffReader::readNext() { //user must free the returned GffObj* GffObj* gfo=NULL; //GSeg tseg(0,0); //transcript boundaries char* lastID=NULL; if (is_BED) { if (nextBEDLine()) { gfo=new GffObj(*this, *bedline); //tseg.start=gfo->start; //tseg.end=gfo->end; delete bedline; bedline=NULL; } //else return NULL; } else { //GFF parsing while (nextGffLine()!=NULL) { char* tid=gffline->ID; if (gffline->is_exon) tid=gffline->parents[0]; else //not an exon if (!(gffline->is_transcript || gffline->is_gene)) tid=NULL; //WARNING: only parsing transcript && gene records here //if (tid==NULL || gffline->num_parents>1) { if (tid==NULL) { //not a suitable transcript ID found, skip this line delete gffline; gffline=NULL; continue; } bool sameID=(lastID!=NULL && strcmp(lastID, tid)==0); if (sameID) { if (gfo==NULL) GError("Error: same transcript ID but GffObj not initialized?!(%s)\n", tid); //TODO: if gffline->is_transcript: trans-splicing! if (!gffline->is_exon) { GMessage("Warning: skipping unexpected non-exon record with previously seen ID:\n%s\n", gffline->dupline); delete gffline; gffline=NULL; continue; } readExonFeature(gfo, gffline); //also takes care of adding CDS segments } else { //new transcript if (gfo==NULL) { //start gathering this transcript's data now gfo=new GffObj(*this, *gffline); //GFREE(lastID); lastID=Gstrdup(tid); /*if (gffline->is_transcript) { tseg.start=gffline->fstart; tseg.end=gffline->fend; }*/ } else { //this gffline is for the next transcript! 
//return what we've got so far //return gfo; break; } } //transcript ID change //gffline processed, move on delete gffline; gffline=NULL; } //while nextgffline() } //GFF records GFREE(lastID); //gfo populated with all its sub-features (or eof reached) if (gfo!=NULL) { gfo->finalize(this); } return gfo; } //Usually we have to parse the whole file because exons and other subfeatures can be scattered, unordered in the input // (thanks to the annoyingly loose GFF3 specs) //Trans-splicing and fusions shall only be accepted in proper GFF3 format, i.e. multiple transcript features //with the same ID but NOT overlapping/continuous // *** BUT (exception): proximal xRNA features with the same ID, on the same strand, will be merged // and the segments will be treated like exons (e.g. TRNAR15 (rna1940) in RefSeq) void GffReader::readAll() { bool validation_errors = false; if (is_BED) { while (nextBEDLine()) { GPVec* prevgflst=NULL; GffObj* prevseen=gfoFind(bedline->ID, prevgflst, bedline->gseqname, bedline->strand, bedline->fstart); if (prevseen) { //duplicate ID -- but this could also be a discontinuous feature according to GFF3 specs //e.g. a trans-spliced transcript - but segments should not overlap if (prevseen->overlap(bedline->fstart, bedline->fend)) { //overlapping feature with same ID is going too far GMessage("Error: overlapping duplicate BED feature (ID=%s)\n", bedline->ID); //validation_errors = true; if (gff_warns) { //validation intent: just skip the feature, allow the user to see other errors delete bedline; bedline=NULL; continue; } else exit(1); } //create a separate entry (true discontinuous feature?) prevseen=newGffRec(bedline, prevgflst); if (gff_warns) { GMessage("Warning: duplicate BED feature ID %s (%d-%d) (discontinuous feature?)\n", bedline->ID, bedline->fstart, bedline->fend); } } else { newGffRec(bedline, prevgflst); } delete bedline; bedline=NULL; } } else { //regular GFF/GTF or perhaps TLF? 
//loc_debug=false; GHash pex; //keep track of any parented (i.e. exon-like) features that have an ID //and thus could become promoted to parent features while (nextGffLine()!=NULL) { GffObj* prevseen=NULL; GPVec* prevgflst=NULL; if (gffline->ID && gffline->exontype==exgffNone) { //parent-like feature ID (mRNA, gene, etc.) not recognized as an exon feature //check if this ID was previously seen on the same chromosome/strand within GFF_MAX_LOCUS distance prevseen=gfoFind(gffline->ID, prevgflst, gffline->gseqname, gffline->strand, gffline->fstart); if (prevseen) { //same ID seen in the same locus/region if (prevseen->createdByExon()) { if (gff_warns && (prevseen->startfstart || prevseen->end>gffline->fend)) GMessage("Warning: invalid coordinates for %s parent feature (ID=%s)\n", gffline->ftype, gffline->ID); //an exon of this ID was given before //this line has the main attributes for this ID updateGffRec(prevseen, gffline); } else { //possibly a duplicate ID -- but this could also be a discontinuous feature according to GFF3 specs //e.g. a trans-spliced transcript - though segments should not overlap! 
bool gtf_gene_dupID=(prevseen->isGene() && gffline->is_gtf_transcript); if (prevseen->overlap(gffline->fstart, gffline->fend) && !gtf_gene_dupID) { //in some GTFs a gene ID may actually be the same with the parented transcript ID (thanks) //overlapping feature with same ID is going too far GMessage("Error: discarding overlapping duplicate %s feature (%d-%d) with ID=%s\n", gffline->ftype, gffline->fstart, gffline->fend, gffline->ID); //validation_errors = true; if (gff_warns) { //validation intent: just skip the feature, allow the user to see other errors delete gffline; gffline=NULL; continue; } //else exit(1); } if (gtf_gene_dupID) { //special GTF case where parent gene_id matches transcript_id (sigh) prevseen=newGffRec(gffline, prevseen, NULL, prevgflst, true); } else { //create a separate entry (true discontinuous feature) prevseen=newGffRec(gffline, prevseen->parent, NULL, prevgflst); if (gff_warns) { GMessage("Warning: duplicate feature ID %s (%d-%d) (discontinuous feature?)\n", gffline->ID, gffline->fstart, gffline->fend); } } } //duplicate ID in the same locus } //ID seen previously in the same locus } //parent-like ID feature (non-exon) if (gffline->parents==NULL) { //top level feature (transcript, gene), no parents (or parents can be ignored) if (!prevseen) newGffRec(gffline, NULL, NULL, prevgflst); } else { //--- it's a child feature (exon/CDS or even a mRNA with a gene as parent) //updates all the declared parents with this child bool found_parent=false; if (gffline->is_gtf_transcript && prevseen && prevseen->parent) { found_parent=true; //parent already found in special GTF case } else { GffObj* newgfo=prevseen; GPVec* newgflst=NULL; GVec kparents; //kept parents (non-discarded) GVec< GPVec* > kgflst(false); GPVec* gflst0=NULL; for (int i=0;inum_parents;i++) { newgflst=NULL; //if (transcriptsOnly && ( if (discarded_ids.Find(gffline->parents[i])!=NULL) continue; if (!pFind(gffline->parents[i], newgflst)) continue; //skipping discarded parent feature 
kparents.Add(i); if (i==0) gflst0=newgflst; kgflst.Add(newgflst); } if (gffline->num_parents>0 && kparents.Count()==0) { kparents.cAdd(0); kgflst.Add(gflst0); } for (int k=0;kis_transcript || gffline->exontype==exgffNone) {//likely a transcript //parentgfo=gfoFind(gffline->parents[i], newgflst, gffline->gseqname, // gffline->strand, gffline->fstart, gffline->fend); if (newgflst!=NULL && newgflst->Count()>0) parentgfo = newgflst->Get(0); } else { //for exon-like entities we only need a parent to be in locus distance, //on the same strand parentgfo=gfoFind(gffline->parents[i], newgflst, gffline->gseqname, gffline->strand, gffline->fstart); } if (parentgfo!=NULL) { //parent GffObj parsed earlier found_parent=true; if ((parentgfo->isGene() || parentgfo->isTranscript()) && (gffline->is_transcript || gffline->exontype==exgffNone)) { //not an exon, but could be a transcript parented by a gene // *or* by another transcript (! miRNA -> primary_transcript) if (newgfo) { updateParent(newgfo, parentgfo); } else { newgfo=newGffRec(gffline, parentgfo); } } else { //potential exon subfeature? bool addingExon=false; if (transcripts_Only) { if (gffline->exontype>0) addingExon=true; } else { //always discard silly "intron" features if (! 
(gffline->exontype==exgffIntron && (parentgfo->isTranscript() || parentgfo->exons.Count()>0))) addingExon=true; } if (addingExon) if (!readExonFeature(parentgfo, gffline, &pex)) validation_errors=true; } } //overlapping parent feature found } //for each parsed parent Id if (!found_parent) { //new GTF-like record starting directly here as a subfeature //or it could be some chado GFF3 barf with exons coming BEFORE their parent :( //or it could also be a stray transcript without a parent gene defined previously //check if this feature isn't parented by a previously stored "child" subfeature char* subp_name=NULL; CNonExon* subp=NULL; if (!gffline->is_transcript) { //don't bother with this check for obvious transcripts if (pex.Count()>0) subp=subfPoolCheck(gffline, pex, subp_name); if (subp!=NULL) { //found a subfeature that is the parent of this (!) //promote that subfeature to a full GffObj GffObj* gfoh=promoteFeature(subp, subp_name, pex); //add current gffline as an exon of the newly promoted subfeature if (!readExonFeature(gfoh, gffline, &pex)) validation_errors=true; } } if (subp==NULL) { //no parent subfeature seen before //loc_debug=true; GffObj* ngfo=prevseen; if (ngfo==NULL) { //if it's an exon type, create directly the parent with this exon //but if it's recognized as a transcript, the object itself is created ngfo=newGffRec(gffline, NULL, NULL, newgflst); } if (!ngfo->isTranscript() && gffline->ID!=NULL && gffline->exontype==0) subfPoolAdd(pex, ngfo); //even those with errors will be added here! 
} GFREE(subp_name); } //no previous parent found } } //parented feature //-- delete gffline; gffline=NULL; }//while gff lines } if (gflst.Count()>0) { gflst.finalize(this); //force sorting by locus if so constructed } // all gff records are now loaded in GList gflst // so we can free the hash phash.Clear(); //tids.Clear(); if (validation_errors) { exit(1); } } void GfList::finalize(GffReader* gfr) { //if set, enforce sort by locus GList discarded(false,true,false); for (int i=0;ifinalize(gfr); if (fList[i]->isDiscarded()) { discarded.Add(fList[i]); //inform parent that thiis child is removed if (fList[i]->parent!=NULL) { GPVec& pchildren=fList[i]->parent->children; for (int c=0;cchildren.Count()>0) { //inform children that the parent was removed for (int c=0;cchildren.Count();c++) { fList[i]->children[c]->parent=NULL; if (gfr->keep_Attrs) //inherit the attributes of discarded parent (e.g. pseudo=true; ) fList[i]->children[c]->copyAttrs(fList[i]); } } this->Forget(i); } } if (discarded.Count()>0) { this->Pack(); } if (gfr->sortByLoc) { this->setSorted(false); if (gfr->refAlphaSort) this->setSorted((GCompareProc*)gfo_cmpByLoc); else this->setSorted((GCompareProc*)gfo_cmpRefByID); } } bool GffObj::reduceExonAttrs(GList& segs) { bool attrs_discarded=false; for (int a=0;aattrs->Count();a++) { int attr_id=segs[0]->attrs->Get(a)->attr_id; char* attr_name=names->attrs.getName(attr_id); char* attr_val =segs[0]->attrs->Get(a)->attr_val; bool sameExonAttr=true; bool discardAll=(GstrEq("exon_id", attr_name) || GstrEq("exon_number", attr_name)); if (!discardAll) for (int i=1;igetAttr(attr_id); if (ov==NULL || (strcmp(ov,attr_val)!=0)) { sameExonAttr=false; break; } } if (sameExonAttr) { //delete this attribute from exon level attrs_discarded=true; if (!discardAll) { //add the attribute to transcript level //rename it if it exists and is different for the transcript! 
char* t_val=NULL; bool same_aval=false; if (this->attrs!=NULL && (t_val=this->attrs->getAttr(attr_id))!=NULL) { //same attribute name already exists for the transcript! //write it using CDS_ or exon_ prefix same_aval=(strcmp(attr_val, t_val)==0); if (!same_aval) { //add renamed attribute const char* prefix = (&segs==cdss) ? "CDS_" : "exon_"; char* new_attr_name=NULL; GMALLOC(new_attr_name, strlen(prefix)+strlen(attr_name)+1); new_attr_name[0]=0; strcat(new_attr_name, prefix); strcat(new_attr_name, attr_name); this->attrs->add_or_update(names, new_attr_name, attr_val); GFREE(new_attr_name); } } else { //no such attribute exists for the transcript, copy it from the exon this->addAttr(attr_name, attr_val); } } for (int i=1;iattrs->freeItem(a); } //sameExonAttr } if (attrs_discarded) segs[0]->attrs->Pack(); return attrs_discarded; } //return the segs index of segment containing coord: int GffObj::whichExon(uint coord, GList* segs) { //segs MUST be sorted by GSeg order (start coordinate) if (segs==NULL) segs=&exons; if (segs->Count()==0) return -1; if (coordFirst()->start || coord>segs->Last()->end) return -1; if (segs->Count()<6) { //simple scan for (int i=0;iCount();i++) if ((*segs)[i]->overlap(coord)) return i; return -1; } else { //use quick search int i=0; int l=0; //lower boundary int h=segs->Count()-1; //higher boundary while (l<=h) { i = (l+h) >> 1; //range midpoint if (coord > segs->Get(i)->end) l=i+1; else { //coord <= segs->Get(i)->end if (coord >= segs->Get(i)->start) { return i; } //here: coord < segs->Get(i)->start h = i-1; } } } return -1; } bool GffObj::processGeneSegments(GffReader* gfr) { /* procedure: 1)store the info about any X_gene_segment entries in a GVec (just storing their index in gene->children[] list) 2)for each CDS, group them by ID in GHash (and a GPVec for storage) 3)for each GeneCDSChain, collect _gene_segments having a containment-relationship and rank them by lowest noncov 4)for each GeneCDSChain, pick best _gene_segment match (if any) 
and transfer CDSs to it */ GVec geneSegs; //X_gene_segment features (children transcripts of this gene) GHashMap cdsChainById(false); // hash of CDS chains: CDS feature grouped by ID GPVec cdsChains; // CDS chains storage if (cdss==NULL || cdss->Count()==0 || children.Count()==0) return false; //we shouldn't be here //check if we have any _gene_segment children for this gene for (int i=0;iflag_GENE_SEGMENT) { if (children[i]->hasCDS() || children[i]->cdss!=NULL) { GMessage("Warning: will not transfer CDS from %s to gene_segment %s which already has its own\n", gffID, children[i]->gffID); continue; } geneSegs.Add(i); } if (geneSegs.Count()==0) { if (gfr->gff_warns) GMessage("Warning: gene %s has CDS and transcripts but no suitable _gene_segment features\n",gffID); return false; //nothing to do } //group CDSs into CDS chains by their ID: for (int i=0;iCount();i++) { char* id=(char*)(cdss->Get(i)->uptr); if (id==NULL) continue; //should never happen GeneCDSChain *gcc=cdsChainById.Find(id); if (gcc!=NULL) gcc->addCDS(i, cdss->Get(i)->start, cdss->Get(i)->end); else { //new CDS chain: gcc=new GeneCDSChain(i, cdss->Get(i)->start, cdss->Get(i)->end); cdsChains.Add(gcc); cdsChainById.Add(id, gcc); } } for (int i=0;iCount();i++) { GFREE(cdss->Get(i)->uptr); //no CDS ID no longer needed } //collect _gene_segment containers for each CDS chain int cds_moved=0; for (int i=0;itransferCDS(cdss->Get(gc.cdsList[c].idx)); cdss->Forget(gc.cdsList[c].idx); cds_moved++; } // also remove it from the list of gene_segments to be mapped geneSegs.Delete(gc.mxs.First().gsegidx); //assigned, should no longer be checked against other CDS chains if (t->isFinalized()) t->finalize(gfr); } if (cds_moved>0) cdss->Pack(); if (cdss->Count()==0) { delete cdss; cdss=NULL; if (exons.Count()==0) isTranscript(false); } return true; } GffObj* GffObj::finalize(GffReader* gfr) { if (this->createdByExon() && this->end-this->start<10 && this->exons.Count()<=1) { //? 
misleading exon-like feature parented by an exon or CDS mistakenly // interpreted as a standalone transcript // example: GENCODE gff3 feature "stop_codon_redefined_as_selenocysteine" which is // parented by a CDS ! if (cdss==NULL || cdss->Count()<=1) { if (gfr->showWarnings()) { GMessage("Warning: discarding suspicious '%s' record (ID=%s)\n",this->getFeatureName(),gffID); } isDiscarded(true); } } if (!isDiscarded()) { bool noExons=(exons.Count()==0 && (cdss==NULL || cdss->Count()==0)); if (noExons) { if (isTranscript() || (isGene() && children.Count()==0 && gfr->gene2exon)) { //add exon feature to an exonless transcript/gene addExon(this->start, this->end, exgffExon); //effectively this becomes a transcript (even childless genes if gene2exon) isTranscript(true); } } else { //it has exons or CDSs if (cdss!=NULL && isGene() && children.Count()>0) { //check for X_gene_segment processing processGeneSegments(gfr);//distribute the cdss to children _gene_segments } // _gene_segment processing } } if (cdss!=NULL && isGene()) //in case we stored IDs for gene_segment features for (int i=0;iCount();i++) { GFREE(cdss->Get(i)->uptr); } if (gfr->transcripts_Only && !isTranscript() && !(gfr->keep_Genes && isGene())) { //discard non-transcripts, unless it's a gene and keepGenes was specified isDiscarded(true); } isFinalized(true); if (isDiscarded()) { //just in case we have cds with uptr in use (X_gene_segment), free them uptr=NULL; udata=0; return this; } if (isTranscript()) { isCDSOnly(cdss!=NULL && exons.Count()==0 && cdss->Count()>0); subftype_id=isCDSOnly() ? gff_fid_CDS : gff_fid_exon; } if (cdss!=NULL && cdss->Count()>0) { CDstart=cdss->First()->start; CDend=cdss->Last()->end; CDphase=(strand=='-')? cdss->Last()->phase : cdss->First()->phase; bool updatePhase=(CDphase=='.' 
|| CDphase==0); if (!updatePhase) for (int i=0;iCount();++i) if ((*cdss)[i]->phase<'0') { updatePhase=true; break; } if (updatePhase) updateCDSPhase(*cdss); //there are GFFs out there which only provide UTR and CDS records instead of full exons //so make sure we add all CDS segments to exons, if they are not already there for (int i=0;iCount();++i) { int eidx=addExon((*cdss)[i]->start, (*cdss)[i]->end, exgffExon, 0, (*cdss)[i]->score); if (eidx<0) GError("Error: could not reconcile CDS %d-%d with exons of transcript %s\n", (*cdss)[i]->start, (*cdss)[i]->end, gffID); } } else if (CDstart==0) {//no CDS, no phase CDphase=0; CDend=0; } //-- attribute reduction for some records which // repeat the exact same attr=value for every exon bool reduceAttributes=(gfr->keep_Attrs && !gfr->noExonAttrs && !gfr->keep_AllExonAttrs && exons.Count()>0 && exons[0]->attrs!=NULL); if (reduceAttributes) { //for each attribute of the 1st exon, if it has the //same value for all other exons, move it to transcript level //bool reduced=reduceExonAttrs(exons); reduceExonAttrs(exons); //if (gfr->showWarnings() && reduced) // GMessage("Info: duplicate exon attributes reduced for %s\n", gffID); //do the same for CDS segments, if any if (cdss!=NULL && cdss->Count()>0 && (*cdss)[0]->attrs!=NULL) { //reduced= reduceExonAttrs(*cdss); //if (gfr->showWarnings() && reduced) // GMessage("Info: duplicate CDS attributes reduced for %s\n", gffID); } } //merge close exons if requested if (exons.Count()>0 && isTranscript()) { if (gfr->merge_CloseExons) { for (int i=0;iend; while (nistart-mend-1); //<0 = overlap, 0 = adjacent, >0 = bases apart if (dist>GFF_MIN_INTRON) break; //no merging with next segment if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) { GMessage("Warning: merging adjacent/overlapping segments (distance=%d) of %s on %s (%d-%d, %d-%d)\n", dist, gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end); 
} mend=exons[ni]->end; exons[i]->end=mend; if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL || exons[i]->attrs->Count()attrs->Count())) { //use the other exon attributes, if it has more delete(exons[i]->attrs); exons[i]->attrs=exons[ni]->attrs; exons[ni]->attrs=NULL; } exons.Delete(ni); } //check for merge with next exon } //for each exon } //merge close exons if (isCDSOnly() && exons.Count()!=cdss->Count()) isCDSOnly(false); } //-- check features vs their exons' span if (isTranscript()) { if (exons.Count()>0) { if (gfr->gff_warns && (this->start!=exons.First()->start || this->end!=exons.Last()->end) ) GMessage("Warning: adjusted transcript %s boundaries according to terminal exons.\n", gffID); this->start=exons.First()->start; this->end=exons.Last()->end; } } else { //non-transcripts just have to be at least as wide as their sub-features if (exons.Count()>0) { bool adj=false; if (this->start>exons.First()->start) { this->start=exons.First()->start; adj=true; } if (this->endend) { this->end=exons.First()->end; adj=true; } if (gfr->gff_warns && adj) GMessage("Warning: adjusted %s %s boundaries according to terminal sub-features.\n", this->getFeatureName(), gffID); } } //-- update covlen covlen=0; for (int i=0;ilen(); //-- check if CDS segments are different from exons and thus worth keeping separately in cdss if (cdss!=NULL && cdss->Count()>0) { bool cds_exComp=true; //CDSs are exon-compatible (no need to keep them separately) if (cdss->Count()==1) { //check that the CDS segment is within a single exon int start_eidx=-1; int end_eidx=-1; for (int i=0;istart, // exons[i]->end); if (CDstart>=exons[i]->start && CDstart<=exons[i]->end) { start_eidx=i; } if (CDend>=exons[i]->start || CDend<=exons[i]->end ) { end_eidx=i; } if (start_eidx>=0 && end_eidx>=0) break; } cds_exComp=(start_eidx==end_eidx && start_eidx>=0); if (!cds_exComp) GMessage("Warning: transcript %s has incorrect CDS segment definition (%d-%d)!\n", gffID, CDstart, CDend); cds_exComp=true; //just to 
free cdss, even though it's wrong } else { if (cdss->Count()>exons.Count()) { cds_exComp=false; } else { //2 or more CDS segments //CDSs should be intron compatible with exons, and CDS ends should be within exons int imax=exons.Count()-1; int jmax=cdss->Count()-1; int i=0; int j=0; //find which exon has CDstart for (i=0;i<=imax;++i) if (CDstart>=exons[i]->start && CDstart<=exons[i]->end) break; if (i>imax) cds_exComp=false; else { //check the introns now while (iend!=(*cdss)[j]->end || exons[i+1]->start!=(*cdss)[j+1]->start) { cds_exComp=false; break; } ++i; ++j; } //now j must be the last segment of cdss and CDend must be within exon[i] if (cds_exComp) if (j!=jmax || CDend>exons[i]->end || CDendstart) cds_exComp=false; } } } //multiple CDS segments if (cds_exComp) { if (isCDSOnly() && cdss->Count()==exons.Count()) for (int i=0;iCount();i++) exons[i]->phase=cdss->Get(i)->phase; if (gfr->keep_Attrs && !gfr->noExonAttrs) { int eidx=whichExon((*cdss)[0]->start, &exons); if (eidx<0) GError("Error finding CDS coordinate inside exons (?) 
for %s\n", gffID); for (int i=0;iCount();i++) { if (isCDSOnly()) //eidx should be the same with i exons[eidx]->phase=cdss->Get(i)->phase; if ((*cdss)[i]->attrs!=NULL && (*cdss)[i]->attrs->Count()>0) { if (exons[eidx]->attrs==NULL) exons[eidx]->attrs=new GffAttrs(); exons[eidx]->attrs->copyAttrs((*cdss)[i]->attrs, true); if (exons[eidx]->attrs->Count()==0) { delete exons[eidx]->attrs; exons[eidx]->attrs=NULL; } } ++eidx; } } delete cdss; cdss=NULL; //this->isXCDS(false); } else this->isXCDS(true); }//cdss check //--- collect stats for the reference genomic sequence if (gfr->gseqtable.Count()<=gseq_id) { gfr->gseqtable.setCount(gseq_id+1); } GSeqStat* gsd=gfr->gseqtable[gseq_id]; if (gsd==NULL) { gsd=new GSeqStat(gseq_id,names->gseqs.getName(gseq_id)); //gfr->gseqtable.Put(gseq_id, gsd); gfr->gseqtable[gseq_id]=gsd; gfr->gseqStats.Add(gsd); } gsd->fcount++; if (startmincoord) gsd->mincoord=start; if (end>gsd->maxcoord) gsd->maxcoord=end; if (this->len()>gsd->maxfeat_len) { gsd->maxfeat_len=this->len(); gsd->maxfeat=this; } uptr=NULL; udata=0; return this; } void GffObj::printExonList(FILE* fout) { //print comma delimited list of exon intervals for (int i=0;i0) fprintf(fout, ","); fprintf(fout, "%d-%d",exons[i]->start, exons[i]->end); } } void GffObj::printCDSList(FILE* fout) { //print comma delimited list of CDS intervals if (!hasCDS()) return; GVec cds; this->getCDSegs(cds); //also uses/prepares the CDS phase for each CDS segment for (int i=0;i0) fprintf(fout, ","); fprintf(fout, "%d-%d", cds[i].start, cds[i].end); } } void BED_addAttribute(FILE* fout, int& acc, const char* format,... ) { ++acc; if (acc==1) fprintf(fout, "\t"); else fprintf(fout, ";"); va_list arguments; va_start(arguments,format); vfprintf(fout,format,arguments); va_end(arguments); } void GffObj::printBED(FILE* fout, bool cvtChars) { //print a BED-12 line + GFF3 attributes in 13th field const int DBUF_LEN=1024; //there should not be attribute values longer than 1K! 
char dbuf[DBUF_LEN]; int cd_start=CDstart>0? CDstart-1 : start-1; int cd_end=CDend>0 ? CDend : end; char cdphase=(CDphase>0) ? CDphase : '0'; fprintf(fout, "%s\t%d\t%d\t%s\t%d\t%c\t%d\t%d\t%c,0,0", getGSeqName(), start-1, end, getID(), 100, strand, cd_start, cd_end, cdphase); if (exons.Count()>0) { int i; fprintf(fout, "\t%d\t", exons.Count()); for (i=0;ilen()); fprintf(fout, "\t"); for (i=0;istart-start); } else { //no-exon feature(!), shouldn't happen fprintf(fout, "\t1\t%d,\t0,", len()); } //now add the GFF3 attributes for in the 13th field int numattrs=0; if (CDstart>0) BED_addAttribute(fout, numattrs,"CDS=%d:%d",CDstart-1, CDend); if (CDphase>0) BED_addAttribute(fout, numattrs,"CDSphase=%c", CDphase); if (geneID!=NULL) BED_addAttribute(fout, numattrs, "geneID=%s",geneID); if (gene_name!=NULL) fprintf(fout, ";gene_name=%s",gene_name); if (attrs!=NULL) { for (int i=0;iCount();i++) { const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id); const char* attrval=attrs->Get(i)->attr_val; if (attrval==NULL || attrval[0]=='\0') { BED_addAttribute(fout, numattrs,"%s",attrname); continue; } if (cvtChars) { decodeHexChars(dbuf, attrval, DBUF_LEN-1); BED_addAttribute(fout, numattrs, "%s=%s", attrname, dbuf); } else BED_addAttribute(fout, numattrs,"%s=%s", attrname, attrs->Get(i)->attr_val); } } fprintf(fout, "\n"); } void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon, bool CDSsrc) { if (names==NULL) GError(ERR_NULL_GFNAMES, "parseAttrs()"); if (atrlist==NULL) { atrlist=new GffAttrs(); } bool exon2transcript=(isExon && atrlist==this->attrs); char* endinfo=info+strlen(info); char* start=info; char* pch=start; while (startadd_or_update(this->names, start, ech, CDSsrc); //overwrite previous attr with the same name } start=pch; } //while info characters if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; } } void GffObj::addAttr(const char* attrname, const char* attrvalue) { if (this->attrs==NULL) this->attrs=new GffAttrs(); 
//this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue)); this->attrs->add_or_update(names, attrname, attrvalue); } void GffObj::copyAttrs(GffObj* from) { //typically from is the parent gene, and this is a transcript if (from==NULL || from->attrs==NULL || from->attrs->Count()==0) return; if (this->attrs==NULL) { this->attrs=new GffAttrs(); } //special RefSeq case int desc_attr_id=names->attrs.getId("description"); //from gene int prod_attr_id=names->attrs.getId("product"); //from transcript (this) char* prod = (prod_attr_id>=0) ? this->attrs->getAttr(prod_attr_id) : NULL; for (int i=0;iattrs->Count();++i) { //this->attrs->add_no_update(names, from->attrs->Get(i)->attr_id, from->attrs->Get(i)->attr_val); int aid=from->attrs->Get(i)->attr_id; //special case for GenBank refseq genes vs transcripts: if (prod && aid==desc_attr_id && strcmp(from->attrs->getAttr(desc_attr_id), prod)==0) continue; //skip description if product already there and the same bool haveit=false; for (int ai=0;aiattrs->Count();++ai) { //do we have it already? 
if (aid==this->attrs->Get(ai)->attr_id) { haveit=true; break; //skip this, don't replace } } if (!haveit) this->attrs->Add(new GffAttr(aid, from->attrs->Get(i)->attr_val)); } } void GffObj::setFeatureName(const char* feature) { //change the feature name/type for a transcript int fid=names->feats.addName(feature); if (monoFeature() && exons.Count()>0) this->subftype_id=fid; this->ftype_id=fid; } void GffObj::setRefName(const char* newname) { //change the feature name/type for a transcript int rid=names->gseqs.addName(newname); this->gseq_id=rid; } int GffObj::removeAttrs(GStrSet<>& attrSet) { //remove attributes NOT found in given set of attribute names if (this->attrs==NULL || attrSet.Count()==0) return 0; int delcount=0; for (int i=0;iattrs->Count();i++) { int aid=this->attrs->Get(i)->attr_id; if (!attrSet.hasKey(this->names->attrs.Get(aid)->name)) { delcount++; this->attrs->freeItem(i); } } if (delcount>0) this->attrs->Pack(); return delcount; } int GffObj::removeAttr(const char* attrname, const char* attrval) { if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0; int aid=this->names->attrs.getId(attrname); if (aid<0) return 0; int delcount=0; //could be more than one ? for (int i=0;iattrs->Count();i++) { if (aid==this->attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, this->attrs->Get(i)->attr_val)==0) { delcount++; this->attrs->freeItem(i); } } } if (delcount>0) this->attrs->Pack(); return delcount; } int GffObj::removeAttr(int aid, const char* attrval) { if (this->attrs==NULL || aid<0) return 0; int delcount=0; //could be more than one ? 
for (int i=0;iattrs->Count();i++) { if (aid==this->attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, this->attrs->Get(i)->attr_val)==0) { delcount++; this->attrs->freeItem(i); } } } if (delcount>0) this->attrs->Pack(); return delcount; } int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) { if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0; int aid=this->names->attrs.getId(attrname); if (aid<0) return 0; int delcount=0; //could be more than one for (int i=0;iCount();i++) { if (aid==exon.attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) { delcount++; exon.attrs->freeItem(i); } } } if (delcount>0) exon.attrs->Pack(); return delcount; } int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) { if (exon.attrs==NULL || aid<0) return 0; int delcount=0; //could be more than one for (int i=0;iCount();i++) { if (aid==exon.attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) { delcount++; exon.attrs->freeItem(i); } } } if (delcount>0) exon.attrs->Pack(); return delcount; } char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GMapSegments* seglst) { if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. 
) called!\n"); return NULL; } //restore normal coordinates: if (exons.Count()==0) return NULL; int fspan=end-start+1; const char* gsubseq=faseq->subseq(start, fspan); if (gsubseq==NULL) { GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end); } char* unspliced=NULL; int seqstart=exons.First()->start; int seqend=exons.Last()->end; int unsplicedlen = 0; if (seglst) seglst->Clear(strand); unsplicedlen += seqend - seqstart + 1; GMALLOC(unspliced, unsplicedlen+1); //allocate more here //uint seqstart, seqend; int s = 0; //resulting nucleotide counter if (strand=='-') { if (seglst!=NULL) seglst->add(s+1,s+1+seqend-seqstart, seqstart, seqend); for (int i=seqend;i>=seqstart;i--) { unspliced[s] = ntComplement(gsubseq[i-start]); s++; }//for each nt } // - strand else { // + strand if (seglst!=NULL) seglst->add(s+1,s+1+seqend-seqstart, seqstart, seqend); for (int i=seqstart;i<=seqend;i++) { unspliced[s]=gsubseq[i-start]; s++; }//for each nt } // + strand //assert(s <= unsplicedlen); unspliced[s]=0; if (rlen!=NULL) *rlen=s; return unspliced; } void GffObj::addPadding(int padLeft, int padRight) { this->start-=padLeft; this->end+=padRight; if (exons.Count()>0) { exons[0]->start-=padLeft; exons.Last()->end+=padRight; } covlen+=padLeft+padRight; } void GffObj::removePadding(int padLeft, int padRight) { this->start+=padLeft; this->end-=padRight; if (exons.Count()>0) { exons[0]->start+=padLeft; exons.Last()->end-=padRight; } covlen-=padLeft+padRight; } char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end, GMapSegments* seglst, bool cds_open) { //cds_open only makes sense when CDSonly is true by overriding CDS 3'end such that the end of //the sequence beyond the 3' CDS end is also returned (the 3' UTR is appended to the CDS) if (CDSonly && CDstart==0) { GMessage("Warning: getSpliced(CDSOnly) requested for transcript with no CDS (%s)!\n", gffID); //should never happen return NULL; } if (faseq==NULL) { GMessage("Warning: 
getSpliced() called with uninitialized GFaSeqGet object!\n"); //should never happen return NULL; } GList* xsegs=&exons; if (CDSonly && this->cdss!=NULL) xsegs=this->cdss; if (xsegs->Count()==0) return NULL; int fspan=end-start+1; const char* gsubseq=faseq->subseq(start, fspan); if (gsubseq==NULL) { GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end); } if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust int endadj=end-start+1-fspan; uint prevend=end; end-=endadj; if (CDend>end) CDend=end; if (xsegs->Last()->end>end) { xsegs->Last()->end=end; //this could be trouble if exon start is also > end if (xsegs->Last()->start>xsegs->Last()->end) { GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n", prevend,getGSeqName(), getID()); } covlen-=endadj; } } char* spliced=NULL; GMALLOC(spliced, covlen+1); //IMPORTANT: covlen must be correct here! uint g_start=0, g_end=0; int cdsadj=0; if (CDphase=='1' || CDphase=='2') { cdsadj=CDphase-'0'; } uint CDS_start=CDstart; uint CDS_stop=CDend; if (cdsadj>0) { if (strand=='-') CDS_stop-=cdsadj; else CDS_start+=cdsadj; } if (CDSonly) { g_start=CDS_start; g_end=CDS_stop; if (g_end-g_start<3) GMessage("Warning: CDS %d-%d too short for %s, check your data.\n", g_start, g_end, gffID); } else { //all exon content, not just CDS g_start=xsegs->First()->start; g_end=xsegs->Last()->end; cds_open=false; //override mistaken user request } if (seglst!=NULL) seglst->Clear(strand); int s=0; //resulting nucleotide counter if (strand=='-') { if (cds_open) {// appending 3'UTR g_start=xsegs->First()->start; //CDS_start=g_start; } for (int x=xsegs->Count()-1;x>=0;x--) { uint sgstart=xsegs->Get(x)->start; uint sgend=xsegs->Get(x)->end; if (g_endsgend) continue; if (g_start>=sgstart && g_start<=sgend) sgstart=g_start; //3' end within this segment if (g_end>=sgstart && g_end<=sgend) sgend=g_end; //5' end within this segment if (seglst!=NULL) 
seglst->add(s+1,s+1+sgend-sgstart,sgend,sgstart); for (uint i=sgend;i>=sgstart;i--) { spliced[s] = ntComplement(gsubseq[i-start]); s++; }//for each nt //--update local CDS start-end coordinates if (cds_start!=NULL && CDS_stop>=sgstart && CDS_stop<=sgend) { //CDS start in this segment *cds_start=s-(CDS_stop-sgstart); } if (cds_end!=NULL && CDS_start>=sgstart && CDS_start<=sgend) { //CDS stop in this segment *cds_end=s-(CDS_start-sgstart); } } //for each exon } // - strand else { // + strand if (cds_open) { // appending 3'UTR g_end=xsegs->Last()->end; //CDS_stop=g_end; } for (int x=0;xCount();x++) { uint sgstart=xsegs->Get(x)->start; uint sgend=xsegs->Get(x)->end; if (g_endsgend) continue; if (g_start>=sgstart && g_start<=sgend) sgstart=g_start; //seqstart within this segment if (g_end>=sgstart && g_end<=sgend) sgend=g_end; //seqend within this segment if (seglst!=NULL) seglst->add(s+1,s+1+sgend-sgstart, sgstart, sgend); for (uint i=sgstart;i<=sgend;i++) { spliced[s]=gsubseq[i-start]; s++; }//for each nt //--update local CDS start-end coordinates if (cds_start!=NULL && CDS_start>=sgstart && CDS_start<=sgend) { //CDS start in this segment *cds_start=s-(sgend-CDS_start); } if (cds_end!=NULL && CDS_stop>=sgstart && CDS_stop<=sgend) { //CDS stop in this segment *cds_end=s-(sgend-CDS_stop); } } //for each exon } // + strand spliced[s]=0; if (rlen!=NULL) *rlen=s; return spliced; } void GffObj::printSummary(FILE* fout) { if (fout==NULL) fout=stdout; fprintf(fout, "%s\t%c\t%d\t%d\t", gffID, strand, start, end); gscore.print(fout); fprintf(fout, "\n"); } //TODO we should also have an escapeChars function for some situations //when we want to write a GFF3 strictly compliant to the dang specification void GffObj::decodeHexChars(char* dbuf, const char* s, int maxlen) { int dlen=0; dbuf[0]=0; if (s==NULL) return; for (const char* p=s;(*p)!=0 && dlen'Z') a^=0x20; //toupper() if (a>'9') a=10+(a-'A'); else a-='0'; int b=*(++p); if (b>'Z') b^=0x20; if (b>'9') b=10+(b-'A'); else 
b-='0'; char c=(char)((a<<4)+b); if (c=='%') { dbuf[dlen]='p'; ++dlen; dbuf[dlen]='r'; ++dlen; c='c'; } else if (c==';') c='.'; else if (c<='\t') c=' '; if (c>=' ') { dbuf[dlen]=c; ++dlen; continue; } } dbuf[dlen]=*p; ++dlen; } dbuf[dlen]=0; } void GffObj::printGTab(FILE* fout, char** extraAttrs) { fprintf(fout, "%s\t%c\t%d\t%d\t%s\t", this->getGSeqName(), this->strand, this->start, this->end, this->getID()); if (exons.Count()) printExonList(fout); else fprintf(fout, "."); if (extraAttrs!=NULL) { //print a list of "attr=value;" pairs here as the last column //for requested attributes bool t1=true; for (int i=0;extraAttrs[i]!=NULL;++i) { const char* v=this->getAttr(extraAttrs[i]); if (v==NULL) continue; if (t1) { fprintf(fout, "\t"); t1=false; } fprintf(fout, "%s=%s;", extraAttrs[i], v); } } fprintf(fout,"\n"); } void GffObj::printGxfExon(FILE* fout, const char* tlabel, const char* gseqname, bool iscds, GffExon* exon, bool gff3, bool cvtChars) { const int DBUF_LEN=1024; //there should not be attribute values longer than 1K! char dbuf[DBUF_LEN]; exon->score.sprint(dbuf); if (exon->phase==0 || !iscds) exon->phase='.'; const char* ftype=iscds ? 
"CDS" : getSubfName(); const char* attrname=NULL; const char* attrval=NULL; if (gff3) { fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s", gseqname, tlabel, ftype, exon->start, exon->end, dbuf, strand, exon->phase, gffID); if (exon->attrs!=NULL) { for (int i=0;iattrs->Count();i++) { if (exon->attrs->Get(i)->cds!=iscds) continue; attrname=names->attrs.getName(exon->attrs->Get(i)->attr_id); if (cvtChars) { decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, DBUF_LEN-1); fprintf(fout,";%s=%s", attrname, dbuf); } else { fprintf(fout,";%s=%s", attrname, exon->attrs->Get(i)->attr_val); } } } fprintf(fout, "\n"); } //GFF3 else {//GTF fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";", gseqname, tlabel, ftype, exon->start, exon->end, dbuf, strand, exon->phase, gffID); if (geneID) fprintf(fout," gene_id \"%s\";",geneID); if (gene_name!=NULL) { fprintf(fout," gene_name \"%s\";",gene_name); } if (exon->attrs!=NULL) { bool trId=false; bool gId=false; for (int i=0;iattrs->Count();i++) { if (exon->attrs->Get(i)->attr_val==NULL) continue; if (exon->attrs->Get(i)->cds!=iscds) continue; attrname=names->attrs.getName(exon->attrs->Get(i)->attr_id); if (strcmp(attrname, "transcriptID")==0) { if (trId) continue; trId=true; } if (strcmp(attrname, "transcript_id")==0 && !trId) { attrname="transcriptID"; trId=true; } if (strcmp(attrname, "geneID")==0) { if (gId) continue; gId=true; } if (strcmp(attrname, "gene_id")==0 && !gId) { attrname="geneID"; gId=true; } if (Gstricmp(attrname, "gene_name")==0 && gene_name!=NULL) { continue; } fprintf(fout, " %s ",attrname); if (cvtChars) { decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, DBUF_LEN-1); attrval=dbuf; } else { attrval=exon->attrs->Get(i)->attr_val; } if (attrval[0]=='"') fprintf(fout, "%s;",attrval); else fprintf(fout, "\"%s\";",attrval); } } fprintf(fout, "\n"); }//GTF } bool GffObj::printAttrs(FILE* fout, const char* sep, bool GTFstyle, bool cvtChars, bool sepFirst) { //* this prints sep FIRST and then the 
list of attributes separated by sep //* does NOT print sep and newline at the end! // returns false if no attribute was printed at all const int DBUF_LEN=1024; //there should not be attribute values longer than 1K! char dbuf[DBUF_LEN]; const char* prsep=sepFirst ? sep : ""; //assumes ID or transcript_ID was already printed (without ending) bool pr=false; if (GTFstyle) { //for GTF also print gene_id here! (and gene_name) char* gid=NULL; if (geneID!=NULL) { gid=geneID; } else { gid=getAttr("gene_id"); if (gid==NULL) gid=gffID; //last resort, write gid the same with gffID } if (gid!=NULL) { fprintf(fout, "%sgene_id \"%s\"",prsep, gid); prsep=sep; pr=true; } if (gene_name!=NULL && getAttr("gene_name")==NULL && getAttr("GENE_NAME")==NULL) { fprintf(fout, "%sgene_name \"%s\"",prsep, gene_name); prsep=sep; pr=true; } if (attrs!=NULL) { bool trId=false; //bool gId=false; for (int i=0;iCount();i++) { const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id); const char* attrval=attrs->Get(i)->attr_val; if (attrval==NULL || attrval[0]=='\0') continue; if (strcmp(attrname, "transcriptID")==0) { if (trId) continue; trId=true; } if (strcmp(attrname, "transcript_id")==0 && !trId) { attrname="transcriptID"; trId=true; } if (Gstrcmp(attrname, "geneID")==0 && gid!=NULL && strcmp(attrval, gid)==0) continue; if (strcmp(attrname, "gene_id")==0) continue; if (cvtChars) { decodeHexChars(dbuf, attrval, DBUF_LEN-1); fprintf(fout,"%s%s \"%s\"", prsep, attrname, dbuf); } else fprintf(fout,"%s%s \"%s\"", prsep, attrname, attrs->Get(i)->attr_val); prsep=sep; pr=true; } } } else { //for GFF/BED/TLF etc, geneID and gene_name should be printed already //IF stored only separately and NOT as attributes //the initial sep is NOT printed! 
if (attrs!=NULL) { for (int i=0;iCount();i++) { const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id); const char* attrval=attrs->Get(i)->attr_val; if (attrval==NULL || attrval[0]=='\0') continue; //fprintf(fout,";%s",attrname); if (cvtChars) { decodeHexChars(dbuf, attrval, DBUF_LEN-1); fprintf(fout,"%s%s=%s", prsep, attrname, dbuf); } else fprintf(fout,"%s%s=%s", prsep, attrname, attrs->Get(i)->attr_val); prsep=sep; pr=true; } } } return pr; } void GffObj::printGxf(FILE* fout, GffPrintMode gffp, const char* tlabel, const char* gfparent, bool cvtChars) { char dbuf[10]; if (tlabel==NULL) { tlabel=track_id>=0 ? names->tracks.Get(track_id)->name : (char*)"gffobj" ; } if (gffp==pgffBED) { printBED(fout, cvtChars); return; } const char* gseqname=names->gseqs.Get(gseq_id)->name; bool gff3 = (gffp>=pgffAny && gffp<=pgffTLF); bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgtfBoth || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth); bool showExon = (gffp<=pgtfExon || gffp==pgtfBoth || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth); gscore.sprint(dbuf); if (gffp<=pgtfBoth && gffp>=pgtfAny) { //GTF output fprintf(fout, "%s\t%s\ttranscript\t%d\t%d\t%s\t%c\t.\ttranscript_id \"%s\"", gseqname, tlabel, start, end, dbuf, strand, gffID); printAttrs(fout, "; ", true, cvtChars); fprintf(fout,"\n"); } else if (gff3) { //print GFF3 transcript line: uint pstart, pend; if (gffp==pgffCDS) { pstart=CDstart; pend=CDend; } else { pstart=start;pend=end; } //const char* ftype=isTranscript() ? 
"mRNA" : getFeatureName(); const char* ftype=getFeatureName(); fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s", gseqname, tlabel, ftype, pstart, pend, dbuf, strand, gffID); bool parentPrint=false; if (gfparent!=NULL && gffp!=pgffTLF) { //parent override - also prevents printing gene_name and gene_id unless they were given as transcript attributes fprintf(fout, ";Parent=%s",gfparent); parentPrint=true; } else if (parent!=NULL && !parent->isDiscarded() && gffp!=pgffTLF) { fprintf(fout, ";Parent=%s",parent->getID()); if (parent->isGene()) parentPrint=true; } if (gffp==pgffTLF) { fprintf(fout, ";exonCount=%d",exons.Count()); if (exons.Count()>0) fprintf(fout, ";exons=%d-%d", exons[0]->start, exons[0]->end); for (int i=1;istart, exons[i]->end); } } if (CDstart>0 && (gffp==pgffTLF || !showCDS)) { if (cdss==NULL) fprintf(fout,";CDS=%d:%d",CDstart,CDend); else { fprintf(fout, ";CDS="); for (int i=0;iCount();++i) { if (i>0) fprintf(fout, ","); fprintf(fout, "%d-%d", (*cdss)[i]->start, (*cdss)[i]->end); } } } if (CDphase>0 && (gffp==pgffTLF || !showCDS)) fprintf(fout,";CDSphase=%c", CDphase); char* g_id=NULL; if (geneID!=NULL && !parentPrint && getAttr("geneID")==NULL && ((g_id=getAttr("gene_id"))==NULL || strcmp(g_id, geneID)!=0)) fprintf(fout, ";geneID=%s",geneID); if (gene_name!=NULL && !parentPrint && getAttr("gene_name")==NULL && getAttr("GENE_NAME")==NULL) fprintf(fout, ";gene_name=%s",gene_name); printAttrs(fout, ";", false, cvtChars); fprintf(fout,"\n"); }// gff3 transcript line if (gffp==pgffTLF) return; bool is_cds_only = (gffp==pgffBoth || gffp==pgtfBoth) ? 
false : isCDSOnly(); if (showExon) { //print exons for (int i=0;i0) { GVec cds; getCDSegs(cds); //also uses/prepares the CDS phase for each CDS segment for (int i=0;i& segs) { int cdsacc=0; if (CDphase=='1' || CDphase=='2') { cdsacc+= 3-(CDphase-'0'); } else CDphase='0'; if (strand=='-') { //reverse strand for (int i=segs.Count()-1;i>=0;i--) { segs[i]->phase='0'+ (3-cdsacc%3)%3; cdsacc+=segs[i]->end-segs[i]->start+1; } } else { //forward strand for (int i=0;iphase='0'+ (3-cdsacc%3)%3; cdsacc+=segs[i]->end-segs[i]->start+1; } } } void GffObj::getCDSegs(GVec& cds) { //like updateCDSPhase() above, also updates phase for each segment GffExon cdseg(true); cds.Clear(); if (cdss!=NULL) { //copy directly from cdss list for (int i=0;iCount();i++) { cdseg=(*cdss->Get(i)); cdseg.sharedAttrs=true; cds.Add(cdseg); } return; } int cdsacc=0; if (CDphase=='1' || CDphase=='2') { cdsacc+= 3-(CDphase-'0'); } if (strand=='-') { for (int x=exons.Count()-1;x>=0;x--) { uint sgstart=exons[x]->start; uint sgend=exons[x]->end; if (CDendsgend) continue; if (CDstart>=sgstart && CDstart<=sgend) sgstart=CDstart; //cdstart within this segment if (CDend>=sgstart && CDend<=sgend) sgend=CDend; //cdend within this segment cdseg.start=sgstart; cdseg.end=sgend; //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0); cdseg.phase='0'+ (3-cdsacc%3)%3; cdsacc+=sgend-sgstart+1; cdseg.attrs=exons[x]->attrs; cdseg.sharedAttrs=true; cds.Add(cdseg); } //for each exon cds.Reverse(); } // - strand else { // + strand for (int x=0;xstart; uint sgend=exons[x]->end; if (CDendsgend) continue; if (CDstart>=sgstart && CDstart<=sgend) sgstart=CDstart; //seqstart within this segment if (CDend>=sgstart && CDend<=sgend) sgend=CDend; //seqend within this segment cdseg.start=sgstart; cdseg.end=sgend; //cdseg.phase='0'+(cdsacc>0 ? 
(3-cdsacc%3)%3 : 0); cdseg.phase='0' + (3-cdsacc%3)%3 ; cdsacc+=sgend-sgstart+1; cdseg.attrs=exons[x]->attrs; cdseg.sharedAttrs=true; cds.Add(cdseg); } //for each exon } // + strand } //-- transcript match/overlap classification functions char transcriptMatch(GffObj& a, GffObj& b, int& ovlen, int trange) { //return '=' if exact exon match or transcripts ends are within tdelta distance // '~' if intron-chain match (or 80% overlap, for single-exon) // or 0 otherwise int imax=a.exons.Count()-1; int jmax=b.exons.Count()-1; ovlen=0; if (imax!=jmax) return false; //different number of exons, cannot match if (imax==0) //single-exon mRNAs return (singleExonTMatch(a,b,ovlen, trange)); if ( a.exons[imax]->startend || b.exons[jmax]->startend ) return 0; //intron chains do not overlap at all //check intron overlaps ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1; ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start; for (int i=1;i<=imax;i++) { if (ilen(); if ((a.exons[i-1]->end!=b.exons[i-1]->end) || (a.exons[i]->start!=b.exons[i]->start)) { return 0; //intron mismatch } } //--- full intron chain match //check if it's an exact if (abs((int)a.exons[0]->start-(int)b.exons[0]->start)<=trange && abs((int)a.exons.Last()->end-(int)b.exons.Last()->end)<=trange) return '='; return '~'; } char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen, int trange) { //return '=' if boundaries match within tdelta distance, // or '~' if the overlap is >=80% of the longer sequence length // return 0 if there is no overlap GSeg mseg(m.start, m.end); ovlen=mseg.overlapLen(r.start,r.end); if (ovlen<=0) return 0; // fuzzy matching for single-exon transcripts: // matching = overlap is at least 80% of the length of the longer transcript // *OR* in case of reverse containment (reference contained in m) // it's also considered "matching" if the overlap is at least 80% of // the reference len AND at least 70% of the query len if (abs((int)m.start-(int)r.start)<=trange && abs((int)m.end-(int)r.end)<=trange) 
return '='; if (m.covlen>r.covlen) { if ( (ovlen >= m.covlen*0.8) || (ovlen >= r.covlen*0.8 && ovlen >= m.covlen* 0.7 ) ) //allow also some fuzzy reverse containment return '~'; } else { if (ovlen >= r.covlen*0.8) return '~'; } return 0; } TOvlData getOvlData(GffObj& m, GffObj& r, bool stricterMatch, int trange) { TOvlData odta; if (!m.overlap(r.start,r.end)) return odta; int jmax=r.exons.Count()-1; //char rcode=0; if (m.exons.Count()==1) { //single-exon transfrag GSeg mseg(m.start, m.end); if (jmax==0) { //also single-exon ref char eqcode=0; if ((eqcode=singleExonTMatch(m, r, odta.ovlen, trange))>0) { odta.ovlcode=(stricterMatch) ? eqcode : '='; return odta; } if (m.covlen= m.covlen*0.8) { odta.ovlcode='c'; return odta; } } // fuzzy containment else if (odta.ovlen >= r.covlen*0.8 ) { odta.ovlcode='k'; return odta; } // fuzzy reverse containment odta.ovlcode='o'; return odta; //just plain overlapping } //-- single-exon qry overlaping multi-exon ref //check full pre-mRNA case (all introns retained): code 'm' if (m.start<=r.exons[0]->end && m.end>=r.exons[jmax]->start) { odta.ovlcode='m'; return odta; } for (int j=0;j<=jmax;j++) { //check if it's ~contained by an exon int exovlen=mseg.overlapLen(r.exons[j]); if (exovlen>0) { odta.ovlen+=exovlen; if (m.start>r.exons[j]->start-4 && m.endend+4) { odta.ovlcode='c'; return odta; //close enough to be considered contained in this exon } } if (j==jmax) break; //last exon here, no intron to check //check if it fully covers an intron (retained intron) if (m.startend && m.end>r.exons[j+1]->start) { odta.ovlcode='n'; return odta; } //check if it's fully contained by an intron if (m.endstart && m.start>r.exons[j]->end) { odta.ovlcode='i'; return odta; } // check if it's a potential pre-mRNA transcript // (if overlaps this intron at least 10 bases) uint introvl=mseg.overlapLen(r.exons[j]->end+1, r.exons[j+1]->start-1); //iovlen+=introvl; if (introvl>=10 && mseg.len()>introvl+10) { odta.ovlcode='e'; } } //for each ref exon if 
(odta.ovlcode>0) return odta; odta.ovlcode='o'; //plain overlap, uncategorized return odta; } //single-exon transfrag //-- multi-exon transfrag -- int imax=m.exons.Count()-1;// imax>0 here odta.jbits.resize(imax << 1); //num_junctions = 2 * num_introns if (jmax==0) { //single-exon reference overlap //any exon overlap? GSeg rseg(r.start, r.end); for (int i=0;i<=imax;i++) { //check if it's ~contained by an exon int exovlen=rseg.overlapLen(m.exons[i]); if (exovlen>0) { odta.ovlen+=exovlen; if (r.start>m.exons[i]->start-4 && r.endend+4) { odta.ovlcode='k'; return odta; //reference contained in this assembled exon } } if (i==imax) break; if (r.endstart && r.start>m.exons[i]->end) { odta.ovlcode='y'; //ref contained in this transfrag intron return odta; } } odta.ovlcode='o'; return odta; } // SET reference // -- MET comparison --- // check if qry contained by a ref intron for (int j=0;jstart && m.start>r.exons[j]->end) { odta.ovlcode='i'; return odta; } } if (m.exons[imax]->startend) { //qry intron chain ends before ref intron chain starts //check if last qry exon plugs the 1st ref intron if (m.exons[imax]->start<=r.exons[0]->end && m.exons[imax]->end>=r.exons[1]->start) { odta.ovlen=m.exonOverlapLen(r); odta.ovlcode='n'; return odta; } odta.ovlen=m.exons[imax]->overlapLen(r.exons[0]); odta.ovlcode='o'; //only terminal exons overlap return odta; } else if (r.exons[jmax]->startend) { //qry intron chain starts after ref intron chain ends //check if first qry exon plugs the last ref intron if (m.exons[0]->start<=r.exons[jmax-1]->end && m.exons[0]->end>=r.exons[jmax]->start) { odta.ovlen=m.exonOverlapLen(r); odta.ovlcode='n'; return odta; } odta.ovlen=m.exons[0]->overlapLen(r.exons[jmax]); odta.ovlcode='o'; //only terminal exons overlap return odta; } //check intron chain overlap (match, containment, intron retention etc.) 
int i=1; //index of exon to the right of current qry intron int j=1; //index of exon to the right of current ref intron bool intron_conflict=false; //overlapping introns have at least a mismatching splice site //from here on we check all qry introns against ref introns bool junct_match=false; //true if at least a junction match is found bool ichain_match=false; //if there is intron (sub-)chain match, to be updated by any mismatch bool intron_ovl=false; //if any intron overlap is found bool intron_retention=false; //if any ref intron is covered by a qry exon //intron chain (partial) match exon-index boundaries: int imfirst=0; //index of exon after first intron match in query (valid>0) int jmfirst=0; //index of exon after first intron match in reference (valid>0) int imlast=0; //index of exon after last intron match in query int jmlast=0; //index of exon after last intron match in reference //--keep track of the last overlapping introns in both qry and ref: odta.ovlen=m.exonOverlapLen(r); //int q_first_iovl=-1, r_first_iovl=-1, q_last_iovl=-1, r_last_iovl=-1; //check for intron matches while (i<=imax && j<=jmax) { uint mstart=m.exons[i-1]->end; //qry intron start-end uint mend=m.exons[i]->start; uint rstart=r.exons[j-1]->end; //ref intron start-end uint rend=r.exons[j]->start; if (rendoverlap(mstart+1, mend-1)) intron_conflict=true; //next ref exon overlaps this qry intron if (!intron_retention && rstart>=m.exons[i-1]->start && rend<=m.exons[i-1]->end) intron_retention=true; //this ref intron is covered by previous qry exons[i-1] if (intron_ovl) ichain_match=false; j++; continue; } //no overlap with this ref intron, skipping it if (rstart>mend) { //qry intron ends before ref intron starts //if qry intron overlaps the exon on the left, we have an intron conflict if (!intron_conflict && r.exons[j-1]->overlap(mstart+1, mend-1)) intron_conflict=true; if (!intron_retention && rstart>=m.exons[i]->start && rend<=m.exons[i]->end) intron_retention=true; if (intron_ovl) 
ichain_match=false; i++; continue; } //no intron overlap, skipping qry intron intron_ovl=true; //q_last_iovl=i; //keep track of the last overlapping introns in both qry and ref //r_last_iovl=j; //overlapping introns, test junction matching bool smatch=false; if (mstart==rstart) { smatch=true; odta.jbits.set( (i-1)<<1 ); odta.numJmatch++; junct_match=true; } bool ematch=false; if (mend==rend) { ematch=true; odta.jbits.set( ((i-1)<<1)+1 ); odta.numJmatch++; junct_match=true; } if (smatch && ematch) { //perfect match of this intron if (jmfirst==0) { ichain_match=true; jmfirst=j; imfirst=i; } if (ichain_match) { imlast=i; jmlast=j; } i++; j++; continue; } //intron overlapping but not fully matching intron_conflict=true; ichain_match=false; if (mend>rend) j++; else i++; } //while checking intron overlaps // --- when qry intron chain is contained within ref intron chain // qry terminal exons may poke (overhang) into ref's other introns int l_iovh=0; // overhang of q left boundary beyond the end of ref intron on the left int r_iovh=0; // same type of overhang through the ref intron on the right int qry_intron_poking=0; // --- when ref intron chain is contained within qry intron chain, // terminal exons of ref may poke (overhang) into qry other introns int l_jovh=0; // overhang of q left boundary beyond the end of ref intron to the left int r_jovh=0; // same type of overhang through the ref intron on the right int ref_intron_poking=0; if (ichain_match) { //intron (sub-)chain compatible so far (but there could still be conflicts) if (imfirst==1 && imlast==imax) { // qry full intron chain match if (jmfirst==1 && jmlast==jmax) {//identical intron chains if (stricterMatch) { odta.ovlcode= (abs((int)r.exons[0]->start-(int)m.exons[0]->start)<=trange && abs((int)r.exons.Last()->end-(int)m.exons.Last()->end)<=trange) ? 
'=' : '~'; return odta; } odta.ovlcode='='; return odta; } // -- a partial intron chain match if (jmfirst>1) { //find if m.start falls within any ref intron before jmfirst for (int j=jmfirst-1;j>0;--j) if (m.startstart) { if (m.start>r.exons[j-1]->end) { //m.start within this ref intron l_iovh = r.exons[j]->start - m.start; break; } else { intron_retention=true; ichain_match=false; } } } if (jmlast r.exons[j]->end) { if (m.end < r.exons[j+1]->start) { //m.end within this ref intron r_iovh = m.end - r.exons[j]->end; break; } else { intron_retention=true; ichain_match=false; } } } if (ichain_match && l_iovh<4 && r_iovh<4) { odta.ovlcode='c'; return odta; } qry_intron_poking=GMAX(l_iovh, r_iovh); } else if ((jmfirst==1 && jmlast==jmax)) {//ref intron chain match //check if the reference j-chain is contained in qry i-chain //check for ref ends poking into qry introns if (imfirst>1) { for (int i=imfirst-1;i>0;--i) if (m.exons[i]->start>r.start) { if (r.start>m.exons[i-1]->end) { l_jovh = m.exons[i]->start - r.start; break; } else { ichain_match = false; } } } if (imlast m.exons[i]->end) { if (r.end < m.exons[i+1]->start) { r_jovh = r.end - m.exons[i]->end; break; } else { ichain_match = false; } } } if (ichain_match && l_jovh<4 && r_jovh<4) { odta.ovlcode='k'; //reverse containment return odta; } ref_intron_poking=GMAX(l_jovh, r_jovh); } } //'=', 'c' and 'k' were checked and assigned, check for 'm' and 'n' before falling back to 'j' if (intron_retention) { //ref is boundary contained with qry intron chain ? 
that's not required for 'm' //GMessage("r_jovh=%d, r_iovh=%d, l_jovh=%d, l_iovh=%d\n", r_jovh, r_iovh, l_jovh, l_iovh); //GMessage("m.start=%d, r.exons[0]->end=%d, m.end=%d, r.exons[jmax]->start=%d\n", // m.start, r.exons[0]->end, m.end, r.exons[jmax]->start); //if (ref_intron_poking>0 && ) //we just need to have no intron poking going on if (!intron_conflict && ref_intron_poking<4 && qry_intron_poking<4) { odta.ovlcode='m'; return odta; } odta.ovlcode='n'; return odta; } //if (intron_ovl) { ? if (junct_match) { odta.ovlcode='j'; return odta; } //what's left could be intron overlap but with no junction match = 'o' if (odta.ovlen>4) { odta.ovlcode='o'; return odta; } //but if there is no exon overlap, we have 'i' or 'y' //exons are within the introns of the other! if (m.start>r.start && r.end > m.end) { odta.ovlcode='i'; return odta; } odta.ovlcode='y'; return odta; //all reference exons are within transfrag introns! } gclib-0.12.7/gff.h000066400000000000000000001455631407072766100136240ustar00rootroot00000000000000#ifndef GFF_H #define GFF_H //#define CUFFLINKS 1 #include "GBase.h" #include "gdna.h" #include "codons.h" #include "GFaSeqGet.h" #include "GList.hh" #include "GHashMap.hh" #include "GBitVec.h" #ifdef CUFFLINKS #include // for boost::crc_32_type #endif //reserved Gffnames::feats entries -- basic feature types extern int gff_fid_mRNA; // "mRNA" feature name extern int gff_fid_transcript; // *RNA, *transcript feature name extern int gff_fid_exon; extern int gff_fid_CDS; extern const uint GFF_MAX_LOCUS; extern const uint GFF_MAX_EXON; extern const uint GFF_MAX_INTRON; //extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent //extern const byte gfo_flagShift_LEVEL; //extern bool gff_show_warnings; #define GFF_LINELEN 4096 #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n" enum GffExonType { exgffIntron=-1, // useless "intron" feature exgffNone=0, //not recognizable or unitialized exonic segment 
exgffStartCodon, //from "start_codon" feature (within CDS) exgffStopCodon, //from "stop_codon" feature (may be outside CDS, but should) exgffCDS, //from "CDS" feature exgffUTR, //from "UTR" feature exgffCDSUTR, //from a merge of UTR and CDS feature exgffExon, //from "exon" feature }; extern const char* exonTypes[]; const char* strExonType(char xtype); class GfList; typedef void GFFCommentParser(const char* cmline, GfList* gflst); //comment parser callback //Useful for parsing/maintaining ref seq info from comment lines like this: //##sequence-region chr1 1 24895642 class GffReader; class GffObj; //---transcript overlapping - utility functions extern const byte CLASSCODE_OVL_RANK; //rank value just above 'o' class code byte classcode_rank(char c); //returns priority value for class codes struct TOvlData { char ovlcode; int ovlen; int16_t numJmatch; //number of matching junctions (not introns) GBitVec jbits; //bit array with 1 bit for each junctions (total = 2 * num_introns) TOvlData(char oc=0, int olen=0, int16_t nmj=0, GBitVec* jb=NULL):ovlcode(oc), ovlen(olen),numJmatch(nmj),jbits(jb) { } }; TOvlData getOvlData(GffObj& m, GffObj& r, bool stricterMatch=false, int trange=0); char transcriptMatch(GffObj& a, GffObj& b, int& ovlen, int trange=0); //generic transcript match test // -- return '=', '~' or 0 char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen, int trange=0); //single-exon transcript match test - returning '=', '~' or 0 //--- // -- tracking exon/CDS segments from local mRNA to genome coordinates class GMapSeg:public GSeg { public: uint gstart; //genome start location uint gend; //genome end location //gendgend) { //reverse strand mapping if (gcgstart) return 0; return (gstart-gc); } else { if (gcgend) return 0; return (gc-gstart); } } }; struct GffScore { float score; int8_t precision; GffScore(float sc=0, int8_t prec=-1):score(sc),precision(prec) { } void print(FILE* outf) { if (precision<0) fprintf(outf, "."); else fprintf(outf, "%.*f", precision, 
score); } void sprint(char* outs) { if (precision<0) sprintf(outs, "."); else sprintf(outs, "%.*f", precision, score); } bool operator<(GffScore& v) { return this->scorescore<=v.score; } bool operator>(GffScore& v) { return this->score>v.score; } bool operator>=(GffScore& v) { return this->score>=v.score; } bool operator==(GffScore& v) { return this->score==v.score; } }; extern const GffScore GFFSCORE_NONE; class GMapSegments:public GVec { public: int dir; //-1 or +1 (reverse/forward for genome coordinates) GSeg lreg; // always 1,max local coord GSeg greg; // genomic min,max coords GMapSegments(char strand='+'):lreg(0,0),greg(0,0) { dir=(strand=='-') ? -1 : 1; } void Clear(char strand='+') { lreg.start=0;lreg.end=0; greg.start=0;greg.end=0; dir = (strand=='-') ? -1 : 1;; GVec::Clear(); } int add(uint s, uint e, uint gs, uint ge) { if (dir<0) { if (gsgreg.end) greg.end=gs; if (gegreg.end) greg.end=ge; if (gslreg.end) lreg.end=gm.end; if (gm.start::Add(gm); } uint gmap(uint lc) { //takes a local coordinate and returns its mapping to genomic coordinates //returns 0 if mapping cannot be performed! 
if (lc==0 || fCount==0 || lclreg.end) return 0; //find local segment containing this coord int i=0; while (i=fArray[i].start && lc<=fArray[i].end) return (fArray[i].gstart+dir*(lc-fArray[i].start)); ++i; } return 0; } uint lmap(uint gc) { //takes a genome coordinate and returns its mapping to local coordinates if (gc==0 || fCount==0 || gcgreg.end) return 0; //find genomic segment containing this coord int i=0; while (i exons; BEDLine(GffReader* r=NULL, const char* l=NULL); ~BEDLine() { GFREE(dupline); GFREE(line); } }; class GffLine { protected: char* _parents; //stores a copy of the Parent attribute value, //with commas replaced by \0 int _parents_len; bool parseSegmentList(GVec& segs, char* str); public: char* dupline; //duplicate of original line char* line; //this will have tabs replaced by \0 int llen; char* gseqname; char* track; char* ftype; //feature name: mRNA/gene/exon/CDS int ftype_id; char* info; //the last, attributes' field, unparsed uint fstart; uint fend; /* uint qstart; //overlap coords on query, if available uint qend; uint qlen; //query len, if given */ float score; int8_t score_decimals; char strand; union { unsigned int flags; struct { bool is_exonlike:2; //CDS,codon, UTR, exon }; struct { bool is_cds:1; //"cds" or "start/stop_codon" features bool is_exon:1; //"exon" and "utr" features bool is_transcript:1; //if current feature is *RNA or *transcript bool is_gene:1; //current feature is *gene //bool is_gff3:1; //line appears to be in GFF3 format (0=GTF) bool is_gtf_transcript:1; //GTF transcript line with Parents parsed from gene_id bool skipLine:1; bool gffWarnings:1; bool is_gene_segment:1; //for NCBI's D/J/V/C_gene_segment }; }; int8_t exontype; // gffExonType char phase; // '.' 
, '0', '1' or '2', can be also given as CDSphase attribute in TLF uint cds_start; //if TLF: CDS=start:end attribute uint cds_end; GVec exons; //if TLF: exons= attribute GVec cdss; //if TLF: CDS=segment_list attribute char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3) char* gene_id; //GTF only: value of "gene_id" attribute if present char** parents; //for GTF only parents[0] is used int num_parents; char* ID; // if a ID=.. attribute was parsed, or a GTF with 'transcript' line (transcript_id) GffLine(GffReader* reader, const char* l); //parse the line accordingly void discardParent() { GFREE(_parents); _parents_len=0; num_parents=0; GFREE(parents); parents=NULL; } void ensembl_GFF_ID_process(char*& id); void ensembl_GTF_ID_process(char*& id, const char* ver_attr); static char* extractGFFAttr(char*& infostr, const char* oline, const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL, bool deleteAttr=true); char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL){ return extractGFFAttr(info, dupline, pre, caseStrict, enforce_GTF2, rlen, true); } char* getAttrValue(const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL) { return extractGFFAttr(info, dupline, pre, caseStrict, enforce_GTF2, rlen, false); } GffLine(GffLine& l): _parents(NULL), _parents_len(l._parents_len), dupline(NULL), line(NULL), llen(l.llen), gseqname(NULL), track(NULL), ftype(NULL), ftype_id(l.ftype_id), info(NULL), fstart(l.fstart), fend(l.fend), //qstart(l.fstart), qend(l.fend), qlen(l.qlen), score(l.score), score_decimals(l.score_decimals), strand(l.strand), flags(l.flags), exontype(l.exontype), phase(l.phase), cds_start(l.cds_start), cds_end(l.cds_end), exons(l.exons), cdss(l.cdss), gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(l.num_parents), ID(NULL) { //if (l==NULL || l->line==NULL) // GError("Error: invalid GffLine(l)\n"); 
//memcpy((void*)this, (void*)l, sizeof(GffLine)); GMALLOC(line, llen+1); memcpy(line, l.line, llen+1); GMALLOC(dupline, llen+1); memcpy(dupline, l.dupline, llen+1); //--offsets within line[] gseqname=line+(l.gseqname-l.line); track=line+(l.track-l.line); ftype=line+(l.ftype-l.line); info=line+(l.info-l.line); if (num_parents>0 && parents) { GMALLOC(parents, num_parents*sizeof(char*)); //_parents_len=l->_parents_len; copied above _parents=NULL; //re-init, forget pointer copy GMALLOC(_parents, _parents_len); memcpy(_parents, l._parents, _parents_len); for (int i=0;i(GffAttr& d){ return (this>&d); } bool operator<(GffAttr& d){ return (this<&d); } }; class GffNameList; class GffNames; class GffNameInfo { friend class GffNameList; public: int idx; char* name; GffNameInfo(const char* n=NULL):idx(-1),name(NULL) { if (n) name=Gstrdup(n); } ~GffNameInfo() { GFREE(name); } bool operator==(GffNameInfo& d){ return (strcmp(this->name, d.name)==0); } bool operator<(GffNameInfo& d){ return (strcmp(this->name, d.name)<0); } }; class GffNameList:public GPVec { friend class GffNameInfo; friend class GffNames; protected: GHashMap byName;//hash with shared keys int idlast; //fList index of last added/reused name int addStatic(const char* tname) {// fast add GffNameInfo* f=new GffNameInfo(tname); idlast=this->Add(f); f->idx=idlast; byName.Add(f->name,f); return idlast; } public: //GffNameList(int init_capacity=6):GList(init_capacity, false,true,true), byName(false) { GffNameList(int init_capacity=6):GPVec(init_capacity, true), byName(false) { idlast=-1; setCapacity(init_capacity); } char* lastNameUsed() { return idlast<0 ? 
NULL : Get(idlast)->name; } int lastNameId() { return idlast; } char* getName(int nid) { //retrieve name by its ID if (nid<0 || nid>=fCount) GError("GffNameList Error: invalid index (%d)\n",nid); return fList[nid]->name; } int addName(const char* tname) {//returns or create an id for the given name //check idlast first, chances are it's the same feature name checked /*if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0) return idlast;*/ GffNameInfo* f=byName.Find(tname); int fidx=-1; if (f!=NULL) fidx=f->idx; else {//add new entry f=new GffNameInfo(tname); fidx=this->Add(f); f->idx=fidx; byName.Add(f->name,f); } idlast=fidx; return fidx; } int addNewName(const char* tname) { GffNameInfo* f=new GffNameInfo(tname); int fidx=this->Add(f); f->idx=fidx; byName.Add(f->name,f); return fidx; } int getId(const char* tname) { //only returns a name id# if found GffNameInfo* f=byName.Find(tname); if (f==NULL) return -1; return f->idx; } int removeName() { GError("Error: removing names from GffNameList not allowed!\n"); return -1; } }; class GffNames { public: int numrefs; GffNameList tracks; GffNameList gseqs; GffNameList attrs; GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc. GffNames():tracks(),gseqs(),attrs(), feats() { numrefs=0; //the order below is critical! 
//has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS gff_fid_mRNA = feats.addStatic("mRNA");//index 0=gff_fid_mRNA gff_fid_transcript=feats.addStatic("transcript");//index 1=gff_fid_transcript gff_fid_exon=feats.addStatic("exon");//index 2=gff_fid_exon gff_fid_CDS=feats.addStatic("CDS"); //index 3=gff_fid_CDS } }; void gffnames_ref(GffNames* &n); void gffnames_unref(GffNames* &n); enum GffPrintMode { pgtfAny, //print record as read, if GTF pgtfExon, //print only exon features (CDS converted to exon if exons are missing) pgtfCDS, //print only CDS features pgtfBoth, //print both CDS and exon features pgffAny, //print record as read (if isCDSonly() prints only CDS) pgffExon, pgffCDS, pgffBoth, //enforce exon printing if isCDSOnly() pgffTLF, //exon and CDS data shown as additional GFF attributes //in the transcript line (Transcript Line Format) //every line has the whole transcript data pgffBED //print a BED line with all other GFF attributes in column 13 }; class GffAttrs:public GList { public: GffAttrs():GList(false,true,true) { } void add_if_new(GffNames* names, const char* attrname, const char* attrval) { //adding a new value without checking for cds status int nid=names->attrs.getId(attrname); if (nid>=0) { //attribute name found in the dictionary for (int i=0;iattr_id) { return; } //don't update existing } else { //adding attribute name to global attr name dictionary nid=names->attrs.addNewName(attrname); } this->Add(new GffAttr(nid, attrval)); } void add_if_new(GffNames* names, const char* attrname, const char* attrval, bool is_cds) { int nid=names->attrs.getId(attrname); if (nid>=0) { //attribute name found in the dictionary for (int i=0;iattr_id && is_cds==Get(i)->cds) { return; } //don't update existing } else { //adding attribute name to global attr name dictionary nid=names->attrs.addNewName(attrname); } this->Add(new GffAttr(nid, attrval, is_cds)); } void add_or_update(GffNames* names, const char* attrname, const char* val) { //adding a new value 
without checking for cds status int aid=names->attrs.getId(attrname); if (aid>=0) { //attribute found in the dictionary for (int i=0;iattr_id) { //update the existing value for this attribute Get(i)->setValue(val); return; } } } else { //adding attribute name to global attr name dictionary aid=names->attrs.addNewName(attrname); } this->Add(new GffAttr(aid, val)); } void add_or_update(GffNames* names, const char* attrname, const char* val, bool is_cds) { int aid=names->attrs.getId(attrname); if (aid>=0) { //attribute found in the dictionary for (int i=0;iattr_id && Get(i)->cds==is_cds) { //update the existing value for this attribute Get(i)->setValue(val, is_cds); return; } } } else { //adding attribute name to global attr name dictionary aid=names->attrs.addNewName(attrname); } this->Add(new GffAttr(aid, val, is_cds)); } int haveId(int attr_id, bool is_cds=false) { for (int i=0;iattr_id && Get(i)->cds==is_cds) return i; return -1; } int haveId(const char* attrname, GffNames* names, bool is_cds=false) { int aid=names->attrs.getId(attrname); if (aid>=0) { for (int i=0;iattr_id && Get(i)->cds==is_cds) return i; } return -1; } char* getAttr(GffNames* names, const char* attrname) { int aid=names->attrs.getId(attrname); if (aid>=0) for (int i=0;iattr_id) return Get(i)->attr_val; return NULL; } char* getAttr(GffNames* names, const char* attrname, bool is_cds) { int aid=names->attrs.getId(attrname); if (aid>=0) for (int i=0;iattr_id && Get(i)->cds==is_cds) return Get(i)->attr_val; return NULL; } char* getAttr(int aid) { if (aid>=0) for (int i=0;iattr_id) return Get(i)->attr_val; return NULL; } char* getAttr(int aid, bool is_cds) { if (aid>=0) for (int i=0;iattr_id && Get(i)->cds==is_cds) return Get(i)->attr_val; return NULL; } void copyAttrs(GffAttrs* attrs, bool is_cds=false) { //deep copy attributes from another GffAttrs list // (only the ones which do not exist yet) if (attrs==NULL) return; for (int i=0;iCount();i++) { int aid=attrs->Get(i)->attr_id; if (haveId(aid, 
is_cds)<0) Add(new GffAttr(aid, attrs->Get(i)->attr_val, is_cds)); } } }; class GffExon : public GSeg { public: bool sharedAttrs; //do not free attrs on destruct! GffAttrs* attrs; //other attributes kept for this exon/CDS GffScore score; // gff score column int8_t exontype; char phase; //GFF phase column - for CDS segments only! // '.' = undefined (UTR), '0','1','2' for CDS exons void* uptr; //for associating extended user data to this exon char* getAttr(GffNames* names, const char* atrname) { if (attrs==NULL || names==NULL || atrname==NULL) return NULL; return attrs->getAttr(names, atrname); } char* getAttr(int aid) { if (attrs==NULL) return NULL; return attrs->getAttr(aid); } GffExon(bool share_attributes):GSeg(0,0), sharedAttrs(share_attributes), attrs(NULL), score(), exontype(0), phase('.'), uptr(NULL){ } GffExon(uint s=0, uint e=0, int8_t et=0, char ph='.', float sc=0, int8_t sc_prec=0):sharedAttrs(false), attrs(NULL), score(sc,sc_prec), exontype(et), phase(ph), uptr(NULL) { if (scopyAttrs(ex.attrs); } } GffExon& operator=(const GffExon& o) = default; //prevent gcc 9 warnings: //yes, I want a shallow copy here ~GffExon() { //destructor if (attrs!=NULL && !sharedAttrs) delete attrs; } }; //only for mapping to spliced coding sequence: class GffCDSeg:public GSeg { public: char phase; int exonidx; }; //one GFF mRNA object -- e.g. 
a mRNA with its exons and/or CDS segments class GffObj:public GSeg { protected: char* gffID; // ID name for mRNA (parent) feature char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3) char* geneID; //value of gene_id attribute (GTF) if present, or the ID attribute of a parent gene feature (GFF3) union { unsigned int flags; struct { bool flag_HAS_ERRORS :1; bool flag_CHILDREN_PROMOTED :1; bool flag_IS_GENE :1; bool flag_IS_TRANSCRIPT :1; bool flag_HAS_GFF_ID :1; //found transcript/RNA feature line (GFF3 or GTF2 with transcript line) bool flag_BY_EXON :1; //created by subfeature (exon/CDS) directly bool flag_CDS_ONLY :1; //transcript defined by CDS features only (GffObj::isCDS()) bool flag_CDS_NOSTART :1; //partial CDS at 5' end (no start codon) bool flag_CDS_NOSTOP :1; //partial CDS at 3' end (no stop codon) bool flag_CDS_X :1; //transcript having CDS with ribosomal shift (i.e. after merging exons) //CDS segments stored in ::cdss are incompatible with the exon segments bool flag_GENE_SEGMENT :1; //a transcript-like C/D/J/V_gene_segment (NCBI's annotation) bool flag_TRANS_SPLICED :1; bool flag_DISCONTINUOUS :1; //discontinuous feature (e.g. cDNA_match) segments linked by same ID bool flag_TARGET_ONLY :1; //Target= feature (e.g. from RepeatMasker output), lacks ID bool flag_DISCARDED :1; //it will be discarded from the final GffReader list bool flag_LST_KEEP :1; //controlled by isUsed(); if set, this GffObj will not be //deallocated when GffReader is destroyed bool flag_FINALIZED :1; //if finalize() was already called for this GffObj unsigned int gff_level :4; //hierarchical level (0..15) }; }; //-- friends: friend class GffReader; friend class GffExon; public: static GffNames* names; // dictionary storage that holds the various attribute names etc. 
int track_id; // index of track name in names->tracks int gseq_id; // index of genomic sequence name in names->gseqs int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value int subftype_id; //index of child subfeature name in names->feats (subfeatures stored in "exons") //if ftype_id==gff_fid_mRNA then this value is ignored GList exons; //for non-mRNA entries, these can be any subfeature of type subftype_id GList* cdss; //only !NULL for cases of "programmed frameshift" when CDS boundaries do not match //exons boundaries GPVec children; GffObj* parent; int udata; //user data, flags etc. void* uptr; //user pointer (to a parent object, cluster, locus etc.) GffObj* ulink; //link to another GffObj (user controlled field) //---mRNA specific fields: //bool isCDS; //just a CDS, no UTRs uint CDstart; //CDS lowest coordinate uint CDend; //CDS highest coordinate char CDphase; //initial phase for CDS start ('.','0'..'2') //CDphase is at CDend if strand=='-' static void decodeHexChars(char* dbuf, const char* s, int maxlen=1023); bool hasErrors() { return flag_HAS_ERRORS; } void hasErrors(bool v) { flag_HAS_ERRORS=v; } bool hasGffID() { return flag_HAS_GFF_ID; } void hasGffID(bool v) {flag_HAS_GFF_ID=v; } bool createdByExon() { return flag_BY_EXON; } void createdByExon(bool v) {flag_BY_EXON=v; } bool isCDSOnly() { return flag_CDS_ONLY; } void isCDSOnly(bool v) { flag_CDS_ONLY=v; } bool isXCDS() { return flag_CDS_X; } void isXCDS(bool v) { flag_CDS_X=v; } bool isFinalized() { return flag_FINALIZED; } void isFinalized(bool v) { flag_FINALIZED=v; } bool isGene() { return flag_IS_GENE; } void isGene(bool v) {flag_IS_GENE=v; } bool isDiscarded() { return flag_DISCARDED; } void isDiscarded(bool v) { flag_DISCARDED=v; } bool isUsed() { return flag_LST_KEEP; } void isUsed(bool v) {flag_LST_KEEP=v; } bool isTranscript() { return flag_IS_TRANSCRIPT; } void isTranscript(bool v) {flag_IS_TRANSCRIPT=v; } bool isGeneSegment() { return 
flag_GENE_SEGMENT; } void isGeneSegment(bool v) {flag_GENE_SEGMENT=v; } bool promotedChildren() { return flag_CHILDREN_PROMOTED; } void promotedChildren(bool v) { flag_CHILDREN_PROMOTED=v; } void setLevel(byte v) { gff_level=v; } byte getLevel() { return gff_level; } byte incLevel() { gff_level++; return gff_level; } bool isValidTranscript() { //return (ftype_id==gff_fid_mRNA && exons.Count()>0); return (isTranscript() && exons.Count()>0); } //return the index of exon containing coordinate coord, or -1 if not int whichExon(uint coord, GList* segs=NULL); int readExon(GffReader& reader, GffLine& gl); int addExon(GList& segs, GffLine& gl, int8_t exontype_override=exgffNone); //add to cdss or exons int addExon(uint segstart, uint segend, int8_t exontype=exgffNone, char phase='.', GffScore exon_score=GFFSCORE_NONE, GList* segs=NULL); protected: bool reduceExonAttrs(GList& segs); //utility segment-merging function for addExon() void expandSegment(GList&segs, int oi, uint segstart, uint segend, int8_t exontype); bool processGeneSegments(GffReader* gfr); //for genes that have _gene_segment features (NCBI annotation) void transferCDS(GffExon* cds); public: void removeExon(int idx); void removeExon(GffExon* p); char strand; //'+', '-' or '.' 
GffScore gscore; int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths) GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature //constructor by gff line parsing: GffObj(GffReader& gfrd, BEDLine& bedline); GffObj(GffReader& gfrd, GffLine& gffline); //if gfline->Parent!=NULL then this will also add the first sub-feature // otherwise, only the main feature is created void copyAttrs(GffObj* from); void clearAttrs() { if (attrs!=NULL) { bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs); delete attrs; attrs=NULL; if (sharedattrs) exons[0]->attrs=NULL; } } GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), cdss(NULL), children(1,false), gscore() { //exons: sorted, free, non-unique gffID=NULL; uptr=NULL; ulink=NULL; flags=0; udata=0; parent=NULL; ftype_id=-1; subftype_id=-1; if (anid!=NULL) gffID=Gstrdup(anid); gffnames_ref(names); CDstart=0; // hasCDS <=> CDstart>0 CDend=0; CDphase=0; gseq_id=-1; track_id=-1; strand='.'; attrs=NULL; covlen=0; geneID=NULL; gene_name=NULL; } ~GffObj() { GFREE(gffID); GFREE(gene_name); GFREE(geneID); delete cdss; clearAttrs(); gffnames_unref(names); } //-------------- GffObj* finalize(GffReader* gfr); //complete parsing: must be called in order to merge adjacent/close proximity subfeatures void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false, bool CDSsrc=false); const char* getSubfName() { //returns the generic feature type of the entries in exons array return names->feats.getName(subftype_id); } void setCDS(uint cd_start, uint cd_end, char phase=0); void setCDS(GffObj* t); //set CDS from another transcript bool monoFeature() { return (exons.Count()==0 || (exons.Count()==1 && //exon_ftype_id==ftype_id && exons[0]->end==this->end && exons[0]->start==this->start)); } bool hasCDS() { return (CDstart>0); } const char* getFeatureName() { return names->feats.getName(ftype_id); } void setFeatureName(const char* feature); void addAttr(const char* attrname, const 
char* attrvalue); int removeAttr(const char* attrname, const char* attrval=NULL); int removeAttr(int aid, const char* attrval=NULL); int removeAttrs(GStrSet<>& attrSet); //remove attributes whose names are NOT in attrSet int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL); int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL); const char* getAttrName(int i) { if (attrs==NULL) return NULL; return names->attrs.getName(attrs->Get(i)->attr_id); } char* getAttr(const char* attrname, bool checkFirstExon=false) { if (names==NULL || attrname==NULL) return NULL; char* r=NULL; if (attrs==NULL) { if (!checkFirstExon) return NULL; } else r=attrs->getAttr(names, attrname); if (r!=NULL) return r; if (checkFirstExon && exons.Count()>0) { r=exons.First()->getAttr(names, attrname); } return r; } char* getExonAttr(GffExon* exon, const char* attrname) { if (exon==NULL || attrname==NULL) return NULL; return exon->getAttr(names, attrname); } char* getExonAttr(int exonidx, const char* attrname) { if (exonidx<0 || exonidx>=exons.Count() || attrname==NULL) return NULL; return exons[exonidx]->getAttr(names, attrname); } char* getAttrValue(int i) { if (attrs==NULL) return NULL; return attrs->Get(i)->attr_val; } const char* getGSeqName() { return names->gseqs.getName(gseq_id); } const char* getRefName() { return names->gseqs.getName(gseq_id); } void setRefName(const char* newname); const char* getTrackName() { return names->tracks.getName(track_id); } bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment //ignores strand! 
if (s>e) Gswap(s,e); for (int i=0;ioverlap(s,e)) return true; } return false; } bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment //if (gseq_id!=m.gseq_id) return false; // ignores strand and gseq_id, must check in advance for (int i=0;istart>m.exons[j]->end) continue; if (m.exons[j]->start>exons[i]->end) break; //-- overlap if we are here: return true; } } return false; } int exonOverlapIdx(GList& segs, uint s, uint e, int* ovlen=NULL, int start_idx=0); int exonOverlapLen(GffObj& m) { if (start>m.end || m.start>end) return 0; int i=0; int j=0; int ovlen=0; while (istart; uint iend=exons[i]->end; uint jstart=m.exons[j]->start; uint jend=m.exons[j]->end; if (istart>jend) { j++; continue; } if (jstart>iend) { i++; continue; } //exon overlap uint ovstart=GMAX(istart,jstart); if (iend(GffObj& d){ if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id); if (start==d.start) { if (getLevel()==d.getLevel()) { if (end==d.end) return (strcmp(gffID, d.gffID)>0); else return (end>d.end); } else return (getLevel()>d.getLevel()); } else return (start>d.start); } bool operator<(GffObj& d){ if (gseq_id!=d.gseq_id) return (gseq_id& cds); void updateCDSPhase(GList& segs); //for CDS-only features, updates GffExon::phase void printGTab(FILE* fout, char** extraAttrs=NULL); void printGxfExon(FILE* fout, const char* tlabel, const char* gseqname, bool iscds, GffExon* exon, bool gff3, bool cvtChars); void printGxf(FILE* fout, GffPrintMode gffp=pgffExon, const char* tlabel=NULL, const char* gfparent=NULL, bool cvtChars=false); void printGtf(FILE* fout, const char* tlabel=NULL, bool cvtChars=false) { printGxf(fout, pgtfAny, tlabel, NULL, cvtChars); } void printGff(FILE* fout, const char* tlabel=NULL, const char* gfparent=NULL, bool cvtChars=false) { printGxf(fout, pgffAny, tlabel, gfparent, cvtChars); } bool printAttrs(FILE* fout, const char* sep=";", bool GTFstyle=false, bool cvtChars=false, bool sepFirst=true); void printTranscriptGff(FILE* fout, char* tlabel=NULL, bool 
showCDS=false, const char* gfparent=NULL, bool cvtChars=false) { if (isValidTranscript()) printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent, cvtChars); } void printExonList(FILE* fout); //print comma delimited list of exon intervals void printCDSList(FILE* fout); //print comma delimited list of CDS intervals void printBED(FILE* fout, bool cvtChars); //print a BED-12 line + GFF3 attributes in 13th field void printSummary(FILE* fout=NULL); char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL, uint* cds_start=NULL, uint* cds_end=NULL, GMapSegments* seglst=NULL, bool cds_open=false); char* getUnspliced(GFaSeqGet* faseq, int* rlen, GMapSegments* seglst=NULL); void addPadding(int padLeft, int padRight); //change exons to include this padding on the sides void removePadding(int padLeft, int padRight); //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ? bool empty() { return (start==0); } }; typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2); //user callback after parsing a mapping object: // Returns: "done with it" status: // TRUE if gobj is no longer needed so it's FREEd upon return // FALSE if the user needs the gobj pointer and is responsible for // collecting and freeing all GffObj objects //GSeqStat: collect basic stats about a common underlying genomic sequence // for multiple GffObj class GSeqStat { public: int gseqid; //gseq id in the global static pool of gseqs char* gseqname; //just a pointer to the name of gseq int fcount;//number of features on this gseq uint mincoord; uint maxcoord; uint maxfeat_len; //maximum feature length on this genomic sequence GffObj* maxfeat; GSeqStat(int id=-1, char* name=NULL) { gseqid=id; gseqname=name; fcount=0; mincoord=MAXUINT; maxcoord=0; maxfeat_len=0; maxfeat=NULL; } bool operator>(GSeqStat& g) { return (gseqid>g.gseqid); } bool operator<(GSeqStat& g) { return (gseqid { public: GfList(bool sorted):GList(sorted,false,false) { } GfList():GList(false,false,false) { 
//GffObjs in this list are NOT deleted when the list is cleared //-- for deallocation of these objects, call freeAll() or freeUnused() as needed } void finalize(GffReader* gfr); void freeAll() { for (int i=0;iisUsed()) continue; /*//inform the children? for (int c=0;cchildren.Count();c++) { fList[i]->children[c]->parent=NULL; } */ delete fList[i]; fList[i]=NULL; } Clear(); } }; class CNonExon { //utility class used in subfeature promotion public: //int idx; GffObj* parent; GffExon* exon; GffLine* gffline; //CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) { CNonExon(GffObj* p, GffExon* e, GffLine& gl) { parent=p; exon=e; //idx=i; gffline=new GffLine(gl); } ~CNonExon() { delete gffline; } }; class GffReader { friend class GffObj; friend class GffLine; friend class GfList; char* linebuf; off_t fpos; int buflen; protected: union { unsigned int flags; unsigned int gff_type: 6; struct { bool is_gff3: 1; //GFF3 syntax was detected bool is_gtf:1; //GTF syntax was detected bool gtf_transcript:1; //has "transcript" features (2-level GTF) bool gtf_gene:1; //has "gene" features (3-level GTF ..Ensembl?) 
bool is_BED:1; //input is BED-12 format, possibly with attributes in 13th field bool is_TLF:1; //input is GFF3-like Transcript Line Format with exons= attribute //--other flags bool transcripts_Only:1; //default ; only keep recognized transcript features bool keep_Genes:1; //for transcriptsOnly, do not discard genes from gflst bool keep_Attrs:1; bool keep_AllExonAttrs:1; //when keep_Attrs, do not attempt to reduce exon attributes bool noExonAttrs:1; bool ignoreLocus:1; //discard locus features and attributes from input bool merge_CloseExons:1; bool gene2exon:1; bool sortByLoc:1; //if records should be sorted by location bool refAlphaSort:1; //if sortByLoc, reference sequences are // sorted lexically instead of their id# //Ensembl ID processing: bool xEnsemblID:1; //for ensemble GTF merge gene_version and transcript_version into the ID //for ensemble GFF3, cannot merge version (!), just remove "transcript:" and "gene:" prefixes bool gff_warns:1; }; }; //char* lastReadNext; FILE* fh; char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader GFFCommentParser* commentParser; GffLine* gffline; BEDLine* bedline; //bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features //bool gene2exon; // for childless genes: add an exon as the entire gene span GHash discarded_ids; //for transcriptsOnly mode, keep track // of discarded parent IDs GHash< GPVec* > phash; //transcript_id => GPVec(false) char* gfoBuildId(const char* id, const char* ctg); //void gfoRemove(const char* id, const char* ctg); GffObj* gfoAdd(GffObj* gfo); GffObj* gfoAdd(GPVec& glst, GffObj* gfo); GffObj* gfoReplace(GPVec& glst, GffObj* gfo, GffObj* toreplace); // const char* id, const char* ctg, char strand, GVec** glst, uint start, uint end bool pFind(const char* id, GPVec*& glst); GffObj* gfoFind(const char* id, GPVec* & glst, const char* ctg=NULL, char strand=0, uint start=0, uint end=0); CNonExon* subfPoolCheck(GffLine* gffline, GHash& pex, char*& 
subp_name); void subfPoolAdd(GHash& pex, GffObj* newgfo); GffObj* promoteFeature(CNonExon* subp, char*& subp_name, GHash& pex); #ifdef CUFFLINKS boost::crc_32_type _crc_result; #endif public: GPVec gseqtable; //table with all genomic sequences, but only current GXF gseq ID indices will have non-NULL //GffNames* names; //just a pointer to the global static Gff names repository GfList gflst; //keeps track of all GffObj records being read (when readAll() is used) GffObj* newGffRec(GffLine* gffline, GffObj* parent=NULL, GffExon* pexon=NULL, GPVec* glst=NULL, bool replace_parent=false); GffObj* newGffRec(BEDLine* bedline, GPVec* glst=NULL); //GffObj* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx); GffObj* updateGffRec(GffObj* prevgfo, GffLine* gffline); GffObj* updateParent(GffObj* newgfh, GffObj* parent); bool readExonFeature(GffObj* prevgfo, GffLine* gffline, GHash* pex = NULL); GPVec gseqStats; //populated after finalize() with only the ref seqs in this file GffReader(FILE* f=NULL, bool t_only=false, bool sort=false):linebuf(NULL), fpos(0), buflen(0), flags(0), fh(f), fname(NULL), commentParser(NULL), gffline(NULL), bedline(NULL), discarded_ids(true), phash(true), gseqtable(1,true), gflst(), gseqStats(1, false) { GMALLOC(linebuf, GFF_LINELEN); buflen=GFF_LINELEN-1; gffnames_ref(GffObj::names); //gff_warns=gff_show_warnings; transcripts_Only=t_only; sortByLoc=sort; noExonAttrs=true; //lastReadNext=NULL; } /* void init(FILE *f, bool t_only=false, bool sortbyloc=false, bool g2exon=false) { fname=NULL; fh=f; if (fh!=NULL) rewind(fh); fpos=0; flags=0; transcriptsOnly=t_only; gflst.sortedByLoc(sortbyloc); gene2exon=g2exon; } */ void gene2Exon(bool v) { gene2exon=v;} void procEnsemblID(bool v) { xEnsemblID=v;} bool procEnsemblID() { return xEnsemblID; } void enableSorting(bool sorting=true) { sortByLoc=sorting; } bool getSorting() { return sortByLoc; } void isBED(bool v=true) { is_BED=v; } //should be set before any parsing! 
void isTLF(bool v=true) { is_TLF=v; } //should be set before any parsing! void keepAttrs(bool keep_attrs=true, bool discardExonAttrs=true, bool preserve_exon_attrs=false) { keep_Attrs=keep_attrs; noExonAttrs=discardExonAttrs; keep_AllExonAttrs=preserve_exon_attrs; } void transcriptsOnly(bool t_only) { transcripts_Only=t_only; } bool transcriptsOnly() { return transcripts_Only; } void setIgnoreLocus(bool nolocus) { ignoreLocus=nolocus; } void keepGenes(bool keep_genes) { keep_Genes=keep_genes; } bool keepGenes() { return keep_Genes; } void mergeCloseExons(bool merge_close_exons=true) { merge_CloseExons=merge_close_exons; } void showWarnings(bool v) { gff_warns=v; //gff_show_warnings=v; } bool showWarnings() { return gff_warns; } void setRefAlphaSorted(bool v=true) { refAlphaSort=v; if (v) sortByLoc=true; } void setCommentParser(GFFCommentParser* cmParser=NULL) { commentParser=cmParser; } GffReader(const char* fn, bool t_only=false, bool sort=false):linebuf(NULL), fpos(0), buflen(0), flags(0), fh(NULL), fname(NULL), commentParser(NULL), gffline(NULL), bedline(NULL), discarded_ids(true), phash(true), gseqtable(1,true), gflst(), gseqStats(1,false) { //gff_warns=gff_show_warnings; gffnames_ref(GffObj::names); noExonAttrs=true; transcripts_Only=t_only; sortByLoc=sort; fname=Gstrdup(fn); fh=fopen(fname, "rb"); GMALLOC(linebuf, GFF_LINELEN); buflen=GFF_LINELEN-1; //lastReadNext=NULL; } ~GffReader() { delete gffline; gffline=NULL; fpos=0; if (fh && fh!=stdin) fclose(fh); gflst.freeUnused(); gflst.Clear(); discarded_ids.Clear(); phash.Clear(); GFREE(fname); GFREE(linebuf); //GFREE(lastReadNext); gffnames_unref(GffObj::names); } GffLine* nextGffLine(); BEDLine* nextBEDLine(); // load all subfeatures, re-group them: void readAll(); void readAll(bool keepAttr, bool mergeCloseExons=false, bool noExonAttr=true) { this->keep_Attrs=keepAttr; this->merge_CloseExons=mergeCloseExons; this->noExonAttrs=noExonAttr; readAll(); } //only for well-formed files: BED or GxF where exons are 
strictly grouped by their transcript_id/Parent GffObj* readNext(); //user must free the returned GffObj* ! #ifdef CUFFLINKS boost::crc_32_type current_crc_result() const { return _crc_result; } #endif }; // end of GffReader // ---------------------------------------------------------- // -- auxiliary classes for GffObj::processGeneSegments() -- class GSegMatch { //keep track of "matching" overlaps of a GeneCDSChain with multiple GeneSegment containers public: int child_idx; //index of matching _gene_segment GffObj in gene->children[] list int noncov; //number of "non-covered" bases in the GeneSegment int gsegidx; //index of _gene_segment in GVec geneSegs // (i.e. UTRs + implied introns if exons are missing) bool operator<(GSegMatch& o) { return (noncovcdss[] list GeneCDS(int i=-1, uint cstart=0, uint cend=0):GSeg(cstart, cend), idx(i) { } }; class GeneCDSChain: public GSeg { //keep track of CDS chains of the gene and their boundaries public: GVec cdsList; //all CDSs in this chain GArray mxs; //list of "matching" container X_gene_segment transcripts; GeneCDSChain():cdsList(),mxs() { } GeneCDSChain(int idx, uint cstart, uint cend):GSeg(cstart, cend), cdsList(),mxs(true) { addCDS(idx, cstart, cend); } void addCDS(int idx, uint cstart, uint cend) { GeneCDS cds(idx, cstart, cend); cdsList.Add(cds); expandInclude(cstart, cend); } void addMatch(int childidx, int ncov, int gsegidx) { GSegMatch segmatch(childidx, ncov, gsegidx); mxs.Add(segmatch); } bool singleExonCDSMatch(uint tstart, uint tend, int& ncov) { if (start>=tstart && end<=tend) { ncov=start-tstart + tend-end; //add all CDS-"introns" if (cdsList.Count()>1) //shouldn't really consider this a valid "match" for (int i=1;ioverlap(cdsList[0])) { if (cdsList[0].start>=t.exons[i]->start && cdsList[0].end<=t.exons[i]->end) { match=true; nc+=cdsList[0].start-t.exons[i]->start+t.exons[i]->end+cdsList[0].end; } //contained in this exon else return false; //overlap, but not contained continue; } nc+=t.exons[i]->len(); } if 
(!match) return false; ncov=nc; return true; } bool multiCDStoExon(GffObj &t, int& ncov) { //multi-CDS vs multi-exon t int nc=0; int e=0, c=0; int emax=t.exons.Count()-1; int cmax=cdsList.Count()-1; int mintrons=0; //matched introns while (e0 && (cdsList[c].end!=t.exons[e]->end || cdsList[c+1].start!=t.exons[e+1]->start)) return false; GSeg cintron(cdsList[c].end+1, cdsList[c+1].start-1); GSeg eintron(t.exons[e]->end+1, t.exons[e+1]->start-1); if (cintron.start>eintron.end) { nc+=t.exons[e]->len(); e++; continue; } if (eintron.start<=cintron.end) { //intron overlap if (cintron.start==eintron.start && cintron.end==eintron.end) { //intron match if (mintrons==0) { if (cdsList[c].startstart) return false; nc+=cdsList[c].start-t.exons[e]->start; } mintrons++; c++;e++; continue; } else return false; } c++; //should never get here, CDS shouldn't be have to catch up with e } if (mintronsend-cdsList[c].end; for(int i=e+1;ilen(); ncov=nc; return true; } bool containedBy(GffObj& t, int& ncov) { // (Warning: t may have no defined exons!) //if yes: ncov will be set to the number of non-CDS-covered bases in t if (t.exons.Count()<2) { if (t.exons.Count()==0) //no exons defined, just check boundaries return singleExonCDSMatch(t.start, t.end, ncov); else //single-exon return singleExonCDSMatch(t.exons[0]->start, t.exons[0]->end, ncov); } //single or no exon else { //multi-exon transcript if (startstart || end>t.exons.Last()->end) return false; //no containment possible; if (cdsList.Count()==1) return singleCDStoExon(t, ncov); //check intron compatibility! 
} return true; } }; #endif gclib-0.12.7/gsocket.cpp000066400000000000000000000216041407072766100150410ustar00rootroot00000000000000#include "gsocket.h" #include // For errno #ifdef _WIN32 static bool initialized = false; #endif void GSocketErr(GStr message, bool inclSysMsg) { if (inclSysMsg) { message.append(": "); message.append(strerror(errno)); } GError("%s\n",message.chars()); } // Function to fill in address structure given an address and port static void fillAddr(const GStr &address, unsigned short port, sockaddr_in &addr) { memset(&addr, 0, sizeof(addr)); // Zero out address structure addr.sin_family = AF_INET; // Internet address hostent *host; // Resolve name if ((host = gethostbyname(address.chars())) == NULL) { // strerror() will not work for gethostbyname() and hstrerror() // is supposedly obsolete GSocketErr("Failed to resolve name (gethostbyname())"); } addr.sin_addr.s_addr = *((unsigned long *) host->h_addr_list[0]); addr.sin_port = htons(port); // Assign port in network byte order } // GSocket Code GSocket::GSocket(int type, int protocol) { #ifdef _WIN32 if (!initialized) { WORD wVersionRequested; WSADATA wsaData; wVersionRequested = MAKEWORD(2, 0); // Request WinSock v2.0 if (WSAStartup(wVersionRequested, &wsaData) != 0) { // Load WinSock DLL GSocketErr("Unable to load WinSock DLL"); } initialized = true; } #endif // Make a new socket if ((sockDesc = socket(PF_INET, type, protocol)) < 0) { GSocketErr("GSocket creation failed (socket())", true); } } GSocket::~GSocket() { #ifdef _WIN32 ::closesocket(sockDesc); #else ::close(sockDesc); #endif sockDesc = -1; } GStr GSocket::getLocalAddress() { sockaddr_in addr; unsigned int addr_len = sizeof(addr); if (getsockname(sockDesc, (sockaddr *) &addr, (socklen_t *) &addr_len) < 0) { GSocketErr("Fetch of local address failed (getsockname())", true); } return inet_ntoa(addr.sin_addr); } unsigned short GSocket::getLocalPort() { sockaddr_in addr; unsigned int addr_len = sizeof(addr); if (getsockname(sockDesc, 
(sockaddr *) &addr, (socklen_t *) &addr_len) < 0) { GSocketErr("Fetch of local port failed (getsockname())", true); } return ntohs(addr.sin_port); } void GSocket::setLocalPort(unsigned short localPort) { // Bind the socket to its port sockaddr_in localAddr; memset(&localAddr, 0, sizeof(localAddr)); localAddr.sin_family = AF_INET; localAddr.sin_addr.s_addr = htonl(INADDR_ANY); localAddr.sin_port = htons(localPort); if (bind(sockDesc, (sockaddr *) &localAddr, sizeof(sockaddr_in)) < 0) { GSocketErr("Set of local port failed (bind())", true); } } void GSocket::setLocalAddressAndPort(const GStr &localAddress, unsigned short localPort) { // Get the address of the requested host sockaddr_in localAddr; fillAddr(localAddress, localPort, localAddr); if (bind(sockDesc, (sockaddr *) &localAddr, sizeof(sockaddr_in)) < 0) { GSocketErr("Set of local address and port failed (bind())", true); } } void GSocket::cleanUp() { #ifdef _WIN32 if (WSACleanup() != 0) { GSocketErr("WSACleanup() failed"); } #endif } unsigned short GSocket::resolveService(const GStr &service, const GStr &protocol) { struct servent *serv; /* Structure containing service information */ if ((serv = getservbyname(service.chars(), protocol.chars())) == NULL) return atoi(service.chars()); /* Service is port number */ else return ntohs(serv->s_port); /* Found port (network byte order) by name */ } // GCommSocket Code void GCommSocket::setTimeout(int microsecs) { #ifdef _WIN32 DWORD timeout = microsecs; setsockopt(sockDesc, SOL_SOCKET, SO_RCVTIMEO, (const char*)&timeout, sizeof(timeout)); #else struct timeval tv; if (microsecs>1000) { tv.tv_sec=microsecs / 1000; tv.tv_usec=microsecs % 1000; } else { tv.tv_sec=0; tv.tv_usec=microsecs; } setsockopt(sockDesc, SOL_SOCKET, SO_RCVTIMEO, (const char*)&tv,sizeof(struct timeval)); #endif } void GCommSocket::connect(const GStr &foreignAddress, unsigned short foreignPort) { // Get the address of the requested host sockaddr_in destAddr; fillAddr(foreignAddress, foreignPort, 
destAddr); // Try to connect to the given port if (::connect(sockDesc, (sockaddr *) &destAddr, sizeof(destAddr)) < 0) { GSocketErr("Connect failed (connect())", true); } } void GCommSocket::send(const void *buffer, int bufferLen) { if (::send(sockDesc, (raw_type *) buffer, bufferLen, 0) < 0) { GSocketErr("Send failed (send())", true); } } int GCommSocket::recv(void *buffer, int bufferLen) { int rtn; if ((rtn = ::recv(sockDesc, (raw_type *) buffer, bufferLen, 0)) < 0) { GSocketErr("Received failed (recv())", true); } return rtn; } GStr GCommSocket::recvline() { GStr r; char buf[1024]; char* p=NULL; while (p==NULL) { int rtn = ::recv(sockDesc, (raw_type *) buf, 1024, 0); if (rtn<0) GSocketErr("Received failed (recv())", true); if (rtn==0) return r; p=(char*)memchr((void*)buf, '\n', rtn); if (p) { r.appendmem(buf, p-buf); return r; } r.appendmem(buf, rtn); } return r; } GStr GCommSocket::getForeignAddress() { sockaddr_in addr; unsigned int addr_len = sizeof(addr); if (getpeername(sockDesc, (sockaddr *) &addr,(socklen_t *) &addr_len) < 0) { //GSocketErr("Fetch of foreign address failed (getpeername())", true); return ""; } return inet_ntoa(addr.sin_addr); } unsigned short GCommSocket::getForeignPort() { sockaddr_in addr; unsigned int addr_len = sizeof(addr); if (getpeername(sockDesc, (sockaddr *) &addr, (socklen_t *) &addr_len) < 0) { return 0; } return ntohs(addr.sin_port); } // GTCPServerSocket Code GTCPSocket *GTCPServerSocket::accept() { int newConnSD; if ((newConnSD = ::accept(sockDesc, NULL, 0)) < 0) { GSocketErr("Accept failed (accept())", true); } return new GTCPSocket(newConnSD); } void GTCPServerSocket::setListen(int queueLen) { if (listen(sockDesc, queueLen) < 0) GSocketErr("Set listening socket failed (listen())", true); } // GUDPSocket Code void GUDPSocket::setBroadcast() { // If this fails, we'll hear about it when we try to send. 
This will allow // system that cannot broadcast to continue if they don't plan to broadcast int broadcastPermission = 1; setsockopt(sockDesc, SOL_SOCKET, SO_BROADCAST, (raw_type *) &broadcastPermission, sizeof(broadcastPermission)); } void GUDPSocket::disconnect() { sockaddr_in nullAddr; memset(&nullAddr, 0, sizeof(nullAddr)); nullAddr.sin_family = AF_UNSPEC; // Try to disconnect if (::connect(sockDesc, (sockaddr *) &nullAddr, sizeof(nullAddr)) < 0) { #ifdef _WIN32 if (errno != WSAEAFNOSUPPORT) { #else if (errno != EAFNOSUPPORT) { #endif GSocketErr("Disconnect failed (connect())", true); } } } void GUDPSocket::sendTo(const void *buffer, int bufferLen, const GStr &foreignAddress, unsigned short foreignPort) { sockaddr_in destAddr; fillAddr(foreignAddress, foreignPort, destAddr); // Write out the whole buffer as a single message. if (sendto(sockDesc, (raw_type *) buffer, bufferLen, 0, (sockaddr *) &destAddr, sizeof(destAddr)) != bufferLen) { GSocketErr("Send failed (sendto())", true); } } int GUDPSocket::recvFrom(void *buffer, int bufferLen, GStr &sourceAddress, unsigned short &sourcePort) { sockaddr_in clntAddr; socklen_t addrLen = sizeof(clntAddr); int rtn; if ((rtn = recvfrom(sockDesc, (raw_type *) buffer, bufferLen, 0, (sockaddr *) &clntAddr, (socklen_t *) &addrLen)) < 0) { GSocketErr("Receive failed (recvfrom())", true); } sourceAddress = inet_ntoa(clntAddr.sin_addr); sourcePort = ntohs(clntAddr.sin_port); return rtn; } void GUDPSocket::setMulticastTTL(unsigned char multicastTTL) { if (setsockopt(sockDesc, IPPROTO_IP, IP_MULTICAST_TTL, (raw_type *) &multicastTTL, sizeof(multicastTTL)) < 0) { GSocketErr("Multicast TTL set failed (setsockopt())", true); } } void GUDPSocket::joinGroup(const GStr &multicastGroup) { struct ip_mreq multicastRequest; multicastRequest.imr_multiaddr.s_addr = inet_addr(multicastGroup.chars()); multicastRequest.imr_interface.s_addr = htonl(INADDR_ANY); if (setsockopt(sockDesc, IPPROTO_IP, IP_ADD_MEMBERSHIP, (raw_type *) &multicastRequest, 
sizeof(multicastRequest)) < 0) { GSocketErr("Multicast group join failed (setsockopt())", true); } } void GUDPSocket::leaveGroup(const GStr &multicastGroup) { struct ip_mreq multicastRequest; multicastRequest.imr_multiaddr.s_addr = inet_addr(multicastGroup.chars()); multicastRequest.imr_interface.s_addr = htonl(INADDR_ANY); if (setsockopt(sockDesc, IPPROTO_IP, IP_DROP_MEMBERSHIP, (raw_type *) &multicastRequest, sizeof(multicastRequest)) < 0) { GSocketErr("Multicast group leave failed (setsockopt())", true); } } gclib-0.12.7/gsocket.h000066400000000000000000000222611407072766100145060ustar00rootroot00000000000000#ifndef GSOCKET_DEFINED #define GSOCKET_DEFINED #include "GBase.h" #include "GStr.h" #ifdef _WIN32 #include // For socket(), connect(), send(), and recv() typedef int socklen_t; typedef char raw_type; // Type used for raw data on this platform #else #include // For data types #include // For socket(), connect(), send(), and recv() #include // For gethostbyname() #include // For inet_addr() #include // For close() #include // For sockaddr_in typedef void raw_type; // Type used for raw data on this platform #endif /** * Signals a problem with the execution of a socket call. * @param message explanatory message * @param incSysMsg true if system message (from strerror(errno)) * should be postfixed to the user provided message */ void GSocketErr(GStr message, bool inclSysMsg = false); /** * Base class representing basic communication endpoint */ class GSocket { public: // Close and deallocate this socket ~GSocket(); /** * Get the local address * @return local address of socket */ GStr getLocalAddress(); /** * Get the local port * @return local port of socket */ unsigned short getLocalPort(); /** * Set the local port to the specified port and the local address * to any interface * @param localPort local port */ void setLocalPort(unsigned short localPort); /** * Set the local port to the specified port and the local address * to the specified address. 
If you omit the port, a random port * will be selected. * @param localAddress local address * @param localPort local port */ void setLocalAddressAndPort(const GStr &localAddress, unsigned short localPort = 0); /** * If WinSock, unload the WinSock DLLs; otherwise do nothing. We ignore * this in our sample client code but include it in the library for * completeness. If you are running on Windows and you are concerned * about DLL resource consumption, call this after you are done with all * Socket instances. If you execute this on Windows while some instance of * Socket exists, you are toast. For portability of client code, this is * an empty function on non-Windows platforms so you can always include it. * @param buffer buffer to receive the data * @param bufferLen maximum number of bytes to read into buffer * @return number of bytes read, 0 for EOF, and -1 for error */ static void cleanUp(); /** * Resolve the specified service for the specified protocol to the * corresponding port number in host byte order * @param service service to resolve (e.g., "http") * @param protocol protocol of service to resolve. Default is "tcp". */ static unsigned short resolveService(const GStr &service, const GStr &protocol = "tcp"); private: // Prevent the user from trying to use value semantics on this object GSocket(const GSocket &sock); void operator=(const GSocket &sock); protected: int sockDesc; // Socket descriptor GSocket(int type, int protocol); GSocket(int sockDesc) { this->sockDesc = sockDesc; } }; /** * Socket which is able to connect, send, and receive */ class GCommSocket : public GSocket { public: /** * Establish a socket connection with the given foreign * address and port * @param foreignAddress foreign address (IP address or name) * @param foreignPort foreign port */ void setTimeout(int microsecs); void connect(const GStr &foreignAddress, unsigned short foreignPort); /** * Write the given buffer to this socket. 
Call connect() before * calling send() * @param buffer buffer to be written * @param bufferLen number of bytes from buffer to be written */ void send(const void *buffer, int bufferLen); void send(const GStr& str) { send(str.chars(), str.length()); } /** * Read into the given buffer up to bufferLen bytes data from this * socket. Call connect() before calling recv() * @param buffer buffer to receive the data * @param bufferLen maximum number of bytes to read into buffer * @return number of bytes read, 0 for EOF, and -1 for error */ int recv(void *buffer, int bufferLen); GStr recvline(); /** * Get the foreign address. Call connect() before calling recv() * @return foreign address */ GStr getForeignAddress(); /** * Get the foreign port. Call connect() before calling recv() * @return foreign port */ unsigned short getForeignPort(); protected: GCommSocket(int type, int protocol) : GSocket(type, protocol) { } GCommSocket(int newConnSD) : GSocket(newConnSD) { } }; // TCP socket for communication with other TCP sockets class GTCPSocket : public GCommSocket { public: // Construct a TCP socket with no connection GTCPSocket() : GCommSocket(SOCK_STREAM, IPPROTO_TCP) { } /** * Construct a TCP socket with a connection to the given foreign address * and port * @param foreignAddress foreign address (IP address or name) * @param foreignPort foreign port */ GTCPSocket(const GStr &foreignAddress, unsigned short foreignPort) : GCommSocket(SOCK_STREAM, IPPROTO_TCP) { connect(foreignAddress, foreignPort); } private: // Access for TCPServerSocket::accept() connection creation friend class GTCPServerSocket; GTCPSocket(int newConnSD) : GCommSocket(newConnSD) { } }; // TCP socket class for servers class GTCPServerSocket : public GSocket { public: /** * Construct a TCP socket for use with a server, accepting connections * on the specified port on any interface * @param localPort local port of server socket, a value of zero will * give a system-assigned unused port * @param queueLen maximum 
queue length for outstanding * connection requests (default 5) */ GTCPServerSocket(unsigned short localPort, int queueLen = 5) : GSocket(SOCK_STREAM, IPPROTO_TCP) { setLocalPort(localPort); setListen(queueLen); } /** * Construct a TCP socket for use with a server, accepting connections * on the specified port on the interface specified by the given address * @param localAddress local interface (address) of server socket * @param localPort local port of server socket * @param queueLen maximum queue length for outstanding * connection requests (default 5) */ GTCPServerSocket(const GStr &localAddress, unsigned short localPort, int queueLen = 5) : GSocket(SOCK_STREAM, IPPROTO_TCP) { setLocalAddressAndPort(localAddress, localPort); setListen(queueLen); } // Blocks until a new connection is established on this socket or error // @return new connection socket GTCPSocket *accept(); private: void setListen(int queueLen); }; /** * UDP socket class */ class GUDPSocket : public GCommSocket { public: // Construct a UDP socket GUDPSocket() : GCommSocket(SOCK_DGRAM, IPPROTO_UDP) { setBroadcast(); } // Construct a UDP socket with the given local port // @param localPort local port GUDPSocket(unsigned short localPort) : GCommSocket(SOCK_DGRAM, IPPROTO_UDP) { setLocalPort(localPort); setBroadcast(); } // Construct a UDP socket with the given local port and address // @param localAddress local address // @param localPort local port GUDPSocket(const GStr &localAddress, unsigned short localPort) : GCommSocket(SOCK_DGRAM, IPPROTO_UDP) { setLocalAddressAndPort(localAddress, localPort); setBroadcast(); } // Unset foreign address and port // @return true if disassociation is successful void disconnect(); /* Send the given buffer as a UDP datagram to the * specified address/port * @param buffer buffer to be written * @param bufferLen number of bytes to write * @param foreignAddress address (IP address or name) to send to * @param foreignPort port number to send to * @return true if send is 
successful */ void sendTo(const void *buffer, int bufferLen, const GStr &foreignAddress, unsigned short foreignPort); /* Read read up to bufferLen bytes data from this socket. The given buffer * is where the data will be placed * @param buffer buffer to receive data * @param bufferLen maximum number of bytes to receive * @param sourceAddress address of datagram source * @param sourcePort port of data source * @return number of bytes received and -1 for error */ int recvFrom(void *buffer, int bufferLen, GStr &sourceAddress, unsigned short &sourcePort); // Set the multicast TTL // @param multicastTTL multicast TTL void setMulticastTTL(unsigned char multicastTTL); // Join the specified multicast group // @param multicastGroup multicast group address to join void joinGroup(const GStr &multicastGroup); // Leave the specified multicast group // @param multicastGroup multicast group address to leave void leaveGroup(const GStr &multicastGroup); private: void setBroadcast(); }; #endif /* GSOCKET_DEFINED */ gclib-0.12.7/gstopwatch.cpp000066400000000000000000000016431407072766100155660ustar00rootroot00000000000000#include "gstopwatch.h" #ifdef _WIN32 double GStopWatch::LIToSecs( LARGE_INTEGER & L) { return ((double)L.QuadPart /(double)frequency.QuadPart); } GStopWatch::GStopWatch(){ timer.start.QuadPart=0; timer.stop.QuadPart=0; QueryPerformanceFrequency( &frequency ); } void GStopWatch::startTimer( ) { QueryPerformanceCounter(&timer.start); } void GStopWatch::stopTimer( ) { QueryPerformanceCounter(&timer.stop); } double GStopWatch::getElapsedTime() { LARGE_INTEGER time; time.QuadPart = timer.stop.QuadPart - timer.start.QuadPart; return LIToSecs( time) ; } #else //Linux code: void GStopWatch::startTimer( ) { gettimeofday(&(timer.start),NULL); } void GStopWatch::stopTimer( ) { gettimeofday(&(timer.stop),NULL); } double GStopWatch::getElapsedTime() { timeval res; timersub(&(timer.stop),&(timer.start),&res); return res.tv_sec + res.tv_usec/1000000.0; // 10^6 uSec per second } 
#endif gclib-0.12.7/gstopwatch.h000066400000000000000000000011651407072766100152320ustar00rootroot00000000000000#ifndef __GSTOPWATCH_H #define __GSTOPWATCH_H #include "GBase.h" #ifdef _WIN32 typedef struct { LARGE_INTEGER start; LARGE_INTEGER stop; } stopWatch; class GStopWatch { private: stopWatch timer; LARGE_INTEGER frequency; double LIToSecs( LARGE_INTEGER & L); public: GStopWatch(); void startTimer( ); void stopTimer( ); double getElapsedTime(); }; #else #include typedef struct { timeval start; timeval stop; } stopWatch; class GStopWatch { private: stopWatch timer; public: GStopWatch() {}; void startTimer( ); void stopTimer( ); double getElapsedTime(); }; #endif #endif gclib-0.12.7/gtest.cpp000066400000000000000000000107341407072766100145320ustar00rootroot00000000000000#include "GBase.h" #include "GArgs.h" #include "GStr.h" #include "GBitVec.h" #include "GList.hh" #include "GHash.hh" #define USAGE "Usage:\n\ gtest [--bit-test|-g|--genomic-fasta ] [-c|COV=] \n\ [-s|--seq ] [-o|--out ] [--disable-flag] [-t|--test ]\n\ [-p|PID=] file1 [file2 file3 ..]\n\ " enum { OPT_HELP=1, OPT_GENOMIC, OPT_COV, OPT_SEQ, OPT_OUTFILE, OPT_DISABLE_FLAG, OPT_TEST, OPT_PID, OPT_BITVEC, OPT_NUM }; GArgsDef opts[] = { {"help", 'h', 0, OPT_HELP}, {"genomic-fasta", 'g', 1, OPT_GENOMIC}, {"COV", 'c', 1, OPT_COV}, {"seq", 's', 1, OPT_SEQ}, {"out", 'o', 1, OPT_OUTFILE}, {"disable-flag", 0, 0, OPT_DISABLE_FLAG}, {"test", 't', 1, OPT_TEST}, {"PID", 'p', 1, OPT_PID}, {"bit-test", 'B', 0, OPT_BITVEC}, {"bignum", 'n', 1, OPT_NUM}, {0,0,0,0} }; void bitError(int idx) { GError("Error bit checking (index %d)!\n", idx); } struct Gint { int v; Gint(int vv=0):v(vv) {} int val() { return v; } ~Gint() { GMessage("Gint with val %d getting destroyed\n", v); } }; int cmpGint(pointer p1, pointer p2) { int v1=((Gint*)p1)->v; int v2=((Gint*)p2)->v; if (v1v2)? 
1 : 0; } void testGPVec() { GPVec vecs[3]; vecs[1].Add(new Gint(2)); vecs[2].Add(new Gint(3)); GMessage("Added to vecs[1]:%d\n", vecs[1][0]->val()); GMessage("Added to vecs[2]:%d\n", vecs[2][0]->val()); } int main(int argc, char* argv[]) { //GArgs args(argc, argv, "hg:c:s:t:o:p:help;genomic-fasta=COV=PID=seq=out=disable-flag;test="); GArgs args(argc, argv, opts); fprintf(stderr, "Command line was:\n"); args.printCmdLine(stderr); args.printError(USAGE, true); //if (args.getOpt('h') || args.getOpt("help")) GVec transcripts(true); transcripts.cAdd(0); fprintf(stderr,"after add transcript counts=%d\n",transcripts.Count()); exit(0); if (args.getOpt(OPT_HELP)) { GMessage("%s\n", USAGE); exit(1); } if (args.getOpt(OPT_NUM)) { GStr snum(args.getOpt(OPT_NUM)); int num=snum.asInt(); char* numstr=commaprintnum(num); GMessage("Number %d written with commas: %s\n", num, numstr); GFREE(numstr); } //--- GHash > ends; /* testGPVec(); //exit(0); //uint pos=3; //GStr spos((int)pos); //GVec *ev=ends[spos.chars()]; GPVec v; int r(5); int rr=v.Add(new Gint(3)); //if (rr<0) { // GMessage("Error adding 0! 
(code %d)\n",rr); // } v.Add(new Gint(r)); v.Add(new Gint(2)); v.Add(new Gint(1)); v.Add(new Gint(4)); rr=v.Add(new Gint(0)); v[rr]->v=-1; v.Sort(cmpGint); GMessage("collection has %d elements:\n",v.Count()); for (int i=0;iv); } exit(0); */ //--- int numopts=args.startOpt(); if (numopts) GMessage("#### Recognized %d option arguments:\n", numopts); int optcode=0; while ((optcode=args.nextCode())) { char* r=args.getOpt(optcode); GMessage("%14s\t= %s\n", args.getOptName(optcode), (r[0]==0)?"True":r); } int numargs=args.startNonOpt(); if (numargs>0) { GMessage("\n#### Found %d non-option arguments given:\n", numargs); char* a=NULL; while ((a=args.nextNonOpt())) { GMessage("%s\n",a); } } GStr s=args.getOpt('t'); if (!s.is_empty()) { GStr token; GMessage("Tokens in \"%s\" :\n",s.chars()); s.startTokenize(";,: \t"); int c=1; while (s.nextToken(token)) { GMessage("token %2d : \"%s\"\n",c,token.chars()); c++; } } if (args.getOpt(OPT_BITVEC)) { uint numbits=4156888234; GBitVec bits(numbits); GMessage(">>> -- BitVec(%u) created (size=%u, mem=%lu) -- \n", numbits, bits.size(), bits.getMemorySize()); bits[405523342]=true; GMessage(" memory size: %lu , size()=%u, count()=%d \n", bits.getMemorySize(), bits.size(), bits.count()); /* //GMessage(">>> -- Start BitVec Test -- \n"); if (bits[1092]) bitError(1092); bits.resize(2049); if (bits[2048]) bitError(2048); bits[2048]=true; if (!bits[2048]) bitError(2048); bits.resize(4097); if (!bits[2048]) bitError(2048); if (bits[4096]) bitError(4096); bits[4096]=true; if (!bits[4096]) bitError(4096); GBitVec bits2(64); Gswap(bits, bits2); if (!bits2[2048]) bitError(2048); if (!bits2[4096]) bitError(4096); */ //GMessage("<<< -- End BitVec Test (size: %d, count: %d, bits2 size=%d, count=%d) --\n", /// bits.size(), bits.count(), bits2.size(), bits2.count()); } } gclib-0.12.7/htest.cpp000066400000000000000000000414771407072766100145430ustar00rootroot00000000000000#include "GBase.h" #include "GArgs.h" #include "GStr.h" #include "GVec.hh" 
namespace old { #include "GHash.hh" } #include "GResUsage.h" #include //#include "tsl/hopscotch_map.h" //#include "tsl/robin_map.h" #include //#include "ska/bytell_hash_map.hpp" #include "GHashMap.hh" #define USAGE "Usage:\n\ htest [-Q] [-C] [-n num_clusters] textfile.. \n\ E.g. quick query test: ./htest -Q qtest_str.dta\n\ \n\ " //quick query test: ./htest -Q qtest_str.dta bool qryMode=false; bool checkRM=false; int numClusters=500; struct HStrData { int cmd; // 0=add, 1=remove, 2=clear GStr str; HStrData(char* s=NULL, int c=0):cmd(c), str(s) { } }; int loadStrings(FILE* f, GPVec& strgsuf, GPVec& strgs, int toLoad) { int num=0; GLineReader lr(f); char* line=NULL; int numcl=0; while ((line=lr.nextLine())!=NULL) { int len=strlen(line); if (len<3) continue; if (line[0]=='>') { numcl++; if (toLoad && numcl>toLoad) { break; } continue; } if (strcmp(line, "HCLR")==0) { strgs.Add(new HStrData(NULL, 2)); strgsuf.Add(new HStrData(NULL, 2)); continue; } if (startsWith(line, "RM ")) { strgsuf.Add(new HStrData(line+3,1) ); line[len-3]=0; strgs.Add(new HStrData(line+3,1)); continue; } strgsuf.Add(new HStrData(line)); line[len-3]=0; strgs.Add(new HStrData(line)); num++; } //while line return num; } void showTimings(GResUsage swatch) { char *wtime=commaprintnum((uint64_t)swatch.elapsed()); char *utime=commaprintnum((uint64_t)swatch.u_elapsed()); char *stime=commaprintnum((uint64_t)swatch.s_elapsed()); char *smem=commaprintnum((uint64_t)swatch.memoryUsed()); GMessage("Elapsed time (microseconds): %12s us\n", wtime); GMessage(" user time: %12s us\n", utime); GMessage(" system time: %12s us\n", stime); GMessage(" mem usage: %12s KB\n", smem); GFREE(wtime);GFREE(utime);GFREE(stime); GFREE(smem); } // default values recommended by http://isthe.com/chongo/tech/comp/fnv/ const uint32_t Prime = 0x01000193; // 16777619 const uint32_t Seed = 0x811C9DC5; // 2166136261 /// hash a single byte inline uint32_t fnv1a(unsigned char b, uint32_t h = Seed) { return (b ^ h) * Prime; } /// hash a 
C-style string uint32_t fnv1a(const char* text, uint32_t hash = Seed) { while (*text) hash = fnv1a((unsigned char)*text++, hash); return hash; } struct cstr_eq { inline bool operator()(const char* x, const char* y) const { return (strcmp(x, y) == 0); } }; struct cstr_hash { inline uint32_t operator()(const char* s) const { return XXH32(s, std::strlen(s),0); //return fnv1a(s); } }; void run_GHash(GResUsage& swatch, GPVec & hstrs, const char* label) { old::GHash ghash; // @suppress("Type cannot be resolved") int num_add=0, num_rm=0, num_clr=0; GMessage("----------------- %s ----------------\n", label); ghash.Clear(); swatch.start(); int cl_i=0; int prevcmd=2; for (int i=0;icmd==prevcmd) { if (prevcmd==2) continue; } else prevcmd=hstrs[i]->cmd; switch (hstrs[i]->cmd) { case 0: if (cl_i==0) cl_i=i; ghash.fAdd(hstrs[i]->str.chars(), new int(i)); num_add++; break; case 1: if (qryMode) break; ghash.Remove(hstrs[i]->str.chars()); num_rm++; break; case 2: //run tests here if (qryMode) { //run some query tests here for(int j=cl_i;jcmd) continue; int* v=ghash[hstrs[j]->str.chars()]; if (v==NULL) GError("Error at <%s>, key %s not found (count:%d, cl_i=%d, i=%d)!\n",label, hstrs[j]->str.chars(), ghash.Count(), cl_i, i ); if (*v!=j) GError("Error at <%s>, invalid value for key %s!\n",label, hstrs[j]->str.chars() ); } } cl_i=0; ghash.Clear(); num_clr++; break; } } swatch.stop(); ghash.Clear(); GMessage(" (%d inserts, %d deletions, %d clears)\n", num_add, num_rm, num_clr); } /* void run_Hopscotch(GResUsage& swatch, GPVec & hstrs, const char* label) { int num_add=0, num_rm=0, num_clr=0; //tsl::hopscotch_map hsmap; tsl::hopscotch_map>, 30, true> hsmap; GMessage("----------------- %s ----------------\n", label); swatch.start(); int cl_i=0; int prevcmd=2; for (int i=0;icmd==prevcmd) { if (prevcmd==2) continue; } else prevcmd=hstrs[i]->cmd; switch (hstrs[i]->cmd) { case 0: if (cl_i==0) cl_i=i; hsmap.insert({hstrs[i]->str.chars(), i}); num_add++; break; case 1: if (qryMode) break; 
hsmap.erase(hstrs[i]->str.chars()); num_rm++; break; case 2: if (qryMode) { //run some query tests here //with strings from hstrs[cl_i .. i-1] range for(int j=cl_i;jcmd) continue; int v=hsmap[hstrs[j]->str.chars()]; if (v!=j) GError("Error at <%s>, invalid value for key %s! (got %d, expected %d)\n",label, hstrs[j]->str.chars(), v, j ); } } cl_i=0; hsmap.clear(); num_clr++; break; } } swatch.stop(); hsmap.clear(); GMessage(" (%d inserts, %d deletions, %d clears)\n", num_add, num_rm, num_clr); } void run_Robin(GResUsage& swatch, GPVec & hstrs, const char* label) { int num_add=0, num_rm=0, num_clr=0; //tsl::hopscotch_map hsmap; tsl::robin_map>, true> rmap; GMessage("----------------- %s ----------------\n", label); swatch.start(); int cl_i=0; int prevcmd=2; for (int i=0;icmd==prevcmd) { if (prevcmd==2) continue; } else prevcmd=hstrs[i]->cmd; switch (hstrs[i]->cmd) { case 0: if (cl_i==0) cl_i=i; rmap.insert({hstrs[i]->str.chars(), i}); num_add++; break; case 1: if (qryMode) break; rmap.erase(hstrs[i]->str.chars()); num_rm++; break; case 2: if (qryMode) { //run some query tests here //with strings from hstrs[cl_i .. 
i-1] range for(int j=cl_i;jcmd) continue; int v=rmap[hstrs[j]->str.chars()]; if (v!=j) GError("Error at <%s>, invalid value for key %s!\n",label, hstrs[j]->str.chars() ); } } cl_i=0; rmap.clear(); num_clr++; break; } } swatch.stop(); rmap.clear(); GMessage(" (%d inserts, %d deletions, %d clears)\n", num_add, num_rm, num_clr); } void run_Bytell(GResUsage& swatch, GPVec & hstrs, const char* label) { int num_add=0, num_rm=0, num_clr=0; ska::bytell_hash_map bmap; GMessage("----------------- %s ----------------\n", label); swatch.start(); for (int i=0;icmd) { case 0:bmap.insert({hstrs[i]->str.chars(), 1}); num_add++; break; case 1:bmap.erase(hstrs[i]->str.chars()); num_rm++; break; case 2:bmap.clear(); num_clr++; break; } } swatch.stop(); bmap.clear(); GMessage(" (%d inserts, %d deletions, %d clears)\n", num_add, num_rm, num_clr); } */ void run_Khashl(GResUsage& swatch, GPVec & hstrs, const char* label) { int num_add=0, num_rm=0, num_clr=0; klib::KHashMapCached khmap; GMessage("----------------- %s ----------------\n", label); swatch.start(); int cl_i=0; int prevcmd=2; for (int i=0;icmd==prevcmd) { if (prevcmd==2) continue; } else prevcmd=hstrs[i]->cmd; switch (hstrs[i]->cmd) { case 0:if (cl_i==0) cl_i=i; khmap[hstrs[i]->str.chars()]=i; num_add++; break; case 1:if (qryMode) break; khmap.del(khmap.get(hstrs[i]->str.chars())); num_rm++; break; case 2: if (qryMode) { //run some query tests here for(int j=cl_i;jcmd) continue; int v=khmap[hstrs[j]->str.chars()]; if (v!=j) GError("Error at <%s>, invalid value for key %s!\n",label, hstrs[j]->str.chars() ); } } cl_i=0; khmap.clear(); num_clr++; break; } } swatch.stop(); khmap.clear(); GMessage(" (%d inserts, %d deletions, %d clears)\n", num_add, num_rm, num_clr); } void run_GHashMap(GResUsage& swatch, GPVec & hstrs, const char* label) { int num_add=0, num_rm=0, num_clr=0; //GKHashSet khset; //GHashSet<> khset; //GHash, uint32_t> khset; GHashMap khset; GMessage("----------------- %s ----------------\n", label); int cl_i=0; 
swatch.start(); int prevcmd=2; for (int i=0;icmd==prevcmd) { if (prevcmd==2) continue; } else prevcmd=hstrs[i]->cmd; switch (hstrs[i]->cmd) { case 0: if (cl_i==0) cl_i=i; khset.Add(hstrs[i]->str.chars(), i); num_add++; break; case 1:if (qryMode) break; if (khset.Remove(hstrs[i]->str.chars())<0) if (checkRM) GMessage("Warning: key %s could not be removed!\n", hstrs[i]->str.chars()); num_rm++; break; case 2: if (qryMode) { //run some query tests here //with strings from hstrs[cl_i .. i-1] range for(int j=cl_i;jcmd) continue; int* v=khset[hstrs[j]->str.chars()]; if (*v!=j) GError("Error at <%s>, invalid value for key %s!\n",label, hstrs[j]->str.chars() ); } } cl_i=0; khset.Clear(); num_clr++; break; } } swatch.stop(); khset.Clear(); GMessage(" (%d inserts, %d deletions, %d clears)\n", num_add, num_rm, num_clr); } void run_GxxHashMap(GResUsage& swatch, GPVec & hstrs, const char* label) { int num_add=0, num_rm=0, num_clr=0; //GHash khset; GHashMap, GHashKey_Eq, uint32_t > khset; GMessage("----------------- %s ----------------\n", label); int cl_i=0; swatch.start(); int prevcmd=2; for (int i=0;icmd==prevcmd) { if (prevcmd==2) continue; } else prevcmd=hstrs[i]->cmd; switch (hstrs[i]->cmd) { case 0: if (cl_i==0) cl_i=i; khset.Add(hstrs[i]->str.chars(), i); num_add++; break; case 1:if (qryMode) break; if (khset.Remove(hstrs[i]->str.chars())<0) if (checkRM) GMessage("Warning: key %s could not be removed!\n", hstrs[i]->str.chars()); num_rm++; break; case 2: if (qryMode) { //run some query tests here //with strings from hstrs[cl_i .. 
i-1] range for(int j=cl_i;jcmd) continue; int* v=khset[hstrs[j]->str.chars()]; if (*v!=j) GError("Error at <%s>, invalid value for key %s!\n",label, hstrs[j]->str.chars() ); } } cl_i=0; khset.Clear(); num_clr++; break; } } swatch.stop(); khset.Clear(); GMessage(" (%d inserts, %d deletions, %d clears)\n", num_add, num_rm, num_clr); } void run_GHashMapShk(GResUsage& swatch, GPVec & hstrs, const char* label) { int num_add=0, num_rm=0, num_clr=0; GHashMap khset; GMessage("----------------- %s ----------------\n", label); int cl_i=0; swatch.start(); int prevcmd=2; for (int i=0;icmd==prevcmd) { if (prevcmd==2) continue; } else prevcmd=hstrs[i]->cmd; switch (hstrs[i]->cmd) { case 0: if (cl_i==0) cl_i=i; khset.Add(hstrs[i]->str.chars(),i); num_add++; break; case 1:if (qryMode) break; if (khset.Remove(hstrs[i]->str.chars())<0) if (checkRM) GMessage("Warning: key %s could not be removed!\n", hstrs[i]->str.chars()); num_rm++; break; case 2: if (qryMode) { //run some query tests here //with strings from hstrs[cl_i .. 
i-1] range for(int j=cl_i;jcmd) continue; int* v=khset[hstrs[j]->str.chars()]; if (*v!=j) GError("Error at <%s>, invalid value for key %s!\n",label, hstrs[j]->str.chars() ); } } cl_i=0; khset.Clear(); num_clr++; break; } } swatch.stop(); khset.Clear(); GMessage(" (%d inserts, %d deletions, %d clears)\n", num_add, num_rm, num_clr); } struct SObj { GStr atr; int val; SObj(const char* a=NULL, const int v=0):atr(a),val(v) { } bool operator<(const SObj& o) const { return val strs; GPVec sufstrs; //GArgs args(argc, argv, "hg:c:s:t:o:p:help;genomic-fasta=COV=PID=seq=out=disable-flag;test="); GArgs args(argc, argv, "hQCn:"); //fprintf(stderr, "Command line was:\n"); //args.printCmdLine(stderr); args.printError(USAGE, true); if (args.getOpt('h') || args.getOpt("help")) GMessage(USAGE); GStr s=args.getOpt('n'); if (!s.is_empty()) { numClusters=s.asInt(); if (numClusters<=0) GError("%s\nError: invalid value for -n !\n", USAGE); } qryMode=(args.getOpt('Q')); checkRM=(args.getOpt('C')); int numargs=args.startNonOpt(); const char* a=NULL; FILE* f=NULL; int total=0; //==== quick test area /* std::unordered_map umap; GHash gh; GPVec ptrs(false); GQHash ihash(false); GQHash phash; GQStrHash shash; const char* tstrs[6] = {"twelve", "five", "nine", "eleven", "three", "nope"}; int vals[6] = { 12, 5, 9, 11, 3, 777 }; char buf[20]; for (int i=0;i<5;i++) { SObj* o=new SObj(tstrs[i], vals[i]*10); ptrs.Add(o); sprintf(buf, "%lx", o); GMessage("SObj (%s, %d) pointer added: %s\n",tstrs[i], o->val, buf); gh.Add(buf, new int(vals[i])); shash.Add(tstrs[i], o); ihash.Add(vals[i], o); phash.Add(o, vals[i]); umap[o]=vals[i]; } ptrs.Sort(); GMessage("shash has now %d entries.\n", shash.Count()); //enumerate shash entries: { shash.startIterate(); SObj* iv=NULL; while (const char* k=shash.Next(iv)) { GMessage("Enumerating shash entry: (%s => %lx)\n", k, iv); } } //qry: for (int i=0;iatr.chars()); if (v==NULL) GMessage("key <%s> not found in shash!\n", o->atr.chars()); int* iv=phash.Find(o); if 
(iv==NULL) GMessage("key <%lx> not found in phash!\n", o); //if (!oset[*o]) // GMessage("struct {%s, %d} not found in oset!\n", o->atr.chars(), o->val); //sprintf(buf, "%lx", o); //int* hv=gh[buf]; //GMessage("Item {%s, %d} : GHash retrieved flag = %d, umap retrieved flag = %d\n", // o->atr.chars(), o->val, *hv, umap[o]); } //SObj* n=new SObj("test", 10); //if (!pset[n]) // GMessage("key <%lx> not found in pset!\n", n); for (int i=0;i<6;i++) { SObj* o=shash[tstrs[i]]; if (o==NULL) GMessage("key <%s> not found in shash!\n", tstrs[i]); if (o && i<5) { if (o->atr!=tstrs[i]) GMessage("shash value does not match key <%s!\n", tstrs[i]); } } //delete n; //int v=umap[n]; //GMessage("Non-existing test entry returned value %d\n", v); */ /* auto found=umap.find(n); if (found!=umap.end()) { GMessage("Found flags %d for entry {\"%s\", %d}\n", found->second, n->atr.chars(), found->first->val ); } else GMessage("New test obj not found !\n"); return(0); */ //==== quick test area end if (numargs==0) { //a="htest_data.lst"; a="htest_over500.lst"; f=fopen(a, "r"); if (f==NULL) GError("Error: could not open file %s !\n", a); GMessage("loading %d clusters from file..\n", numClusters); int num=loadStrings(f, sufstrs, strs, numClusters); total+=num; fclose(f); } else { while ((a=args.nextNonOpt())) { f=fopen(a, "r"); if (f==NULL) GError("Error: could not open file %s !\n", a); int num=loadStrings(f, sufstrs, strs, numClusters); total+=num; fclose(f); } } GResUsage swatch; run_GHash(swatch, sufstrs, "GHash w/ suffix"); showTimings(swatch); //run_GHash(swatch, strs, "GHash no suffix"); //showTimings(swatch); /* run_Hopscotch(swatch, sufstrs, "hopscotch w/ suffix"); showTimings(swatch); run_Hopscotch(swatch, strs, "hopscotch no suffix"); showTimings(swatch); */ /* run_Robin(swatch, sufstrs, "robin w/ suffix"); showTimings(swatch); run_Robin(swatch, strs, "robin no suffix"); showTimings(swatch); */ run_Khashl(swatch, sufstrs, "khashl w/ suffix"); showTimings(swatch); /* run_Khashl(swatch, 
strs, "khashl no suffix"); showTimings(swatch); */ run_GHashMap(swatch, sufstrs, "GHashMap default w/ suffix"); showTimings(swatch); run_GxxHashMap(swatch, sufstrs, "GHashMap xxHash32 w/ suffix"); showTimings(swatch); //run_GHashMap(swatch, strs, "GHashMap no suffix"); //showTimings(swatch); //run_GHashMapShk(swatch, sufstrs, "GHashSetShk w/ suffix"); //showTimings(swatch); //run_GHashMapShk(swatch, strs, "GHashSetShk no suffix"); //showTimings(swatch); /* run_Bytell(swatch, sufstrs, "bytell w/ suffix"); showTimings(swatch); run_Bytell(swatch, strs, "bytell no suffix"); showTimings(swatch); */ } gclib-0.12.7/khashl.hh000066400000000000000000000231171407072766100144720ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, 2009, 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef __AC_KHASHL_HPP #define __AC_KHASHL_HPP #include // for std::equal_to #include // for malloc() etc #include // for memset() #include // for uint32_t namespace klib { /*********** * HashSet * ***********/ template, typename khint_t = uint32_t> class KHashSet { protected: khint_t bits, count; uint32_t *used; T *keys; static inline uint32_t __kh_used(const uint32_t *flag, khint_t i) { return flag[i>>5] >> (i&0x1fU) & 1U; }; static inline void __kh_set_used(uint32_t *flag, khint_t i) { flag[i>>5] |= 1U<<(i&0x1fU); }; static inline void __kh_set_unused(uint32_t *flag, khint_t i) { flag[i>>5] &= ~(1U<<(i&0x1fU)); }; static inline khint_t __kh_fsize(khint_t m) { return m<32? 1 : m>>5; } static inline khint_t __kh_h2b(uint32_t hash, khint_t bits) { return hash * 2654435769U >> (32 - bits); } static inline khint_t __kh_h2b(uint64_t hash, khint_t bits) { return hash * 11400714819323198485ULL >> (64 - bits); } public: KHashSet() : bits(0), count(0), used(NULL), keys(NULL) {}; ~KHashSet() { std::free(used); std::free(keys); }; inline khint_t n_buckets() const { return used? khint_t(1) << bits : 0; } inline khint_t end() const { return n_buckets(); } inline khint_t size() const { return count; } inline T &key(khint_t x) { return keys[x]; }; inline bool _used(khint_t i) const { return (used[i>>5] >> (i&0x1fU) & 1U); } void clear(void) { if (!used) return; memset(used, 0, __kh_fsize(n_buckets()) * sizeof(uint32_t)); count = 0; } khint_t get(const T &key) const { khint_t i, last, mask, nb; if (keys == 0) return 0; nb = n_buckets(); mask = nb - khint_t(1); i = last = __kh_h2b(Hash()(key), bits); while (__kh_used(used, i) && !Eq()(keys[i], key)) { i = (i + khint_t(1)) & mask; if (i == last) return nb; } return !__kh_used(used, i)? nb : i; } int resize(khint_t new_nb) { uint32_t *new_used = 0; khint_t j = 0, x = new_nb, nb, new_bits, new_mask; while ((x >>= khint_t(1)) != 0) ++j; if (new_nb & (new_nb - 1)) ++j; new_bits = j > 2? 
j : 2; new_nb = khint_t(1) << new_bits; if (count > (new_nb>>1) + (new_nb>>2)) return 0; // requested size is too small new_used = (uint32_t*)std::malloc(__kh_fsize(new_nb) * sizeof(uint32_t)); memset(new_used, 0, __kh_fsize(new_nb) * sizeof(uint32_t)); if (!new_used) return -1; /* not enough memory */ nb = n_buckets(); if (nb < new_nb) { /* expand */ T *new_keys = (T*)std::realloc(keys, new_nb * sizeof(T)); if (!new_keys) { std::free(new_used); return -1; } keys = new_keys; } /* otherwise shrink */ new_mask = new_nb - 1; for (j = 0; j != nb; ++j) { if (!__kh_used(used, j)) continue; T key = keys[j]; __kh_set_unused(used, j); while (1) { /* kick-out process; sort of like in Cuckoo hashing */ khint_t i; i = __kh_h2b(Hash()(key), new_bits); while (__kh_used(new_used, i)) i = (i + khint_t(1)) & new_mask; __kh_set_used(new_used, i); if (i < nb && __kh_used(used, i)) { /* kick out the existing element */ { T tmp = keys[i]; keys[i] = key; key = tmp; } __kh_set_unused(used, i); /* mark it as deleted in the old hash table */ } else { /* write the element and jump out of the loop */ keys[i] = key; break; } } } if (nb > new_nb) /* shrink the hash table */ keys = (T*)std::realloc(keys, new_nb * sizeof(T)); std::free(used); /* free the working space */ used = new_used, bits = new_bits; return 0; } khint_t put(const T &key, int *absent_ = 0) { khint_t nb, i, last, mask; int absent = -1; nb = n_buckets(); if (count >= (nb>>1) + (nb>>2)) { // rehashing if (resize(nb + khint_t(1)) < 0) { if (absent_) *absent_ = -1; return nb; } nb = n_buckets(); } // TODO: to implement automatically shrinking; resize() already support shrinking mask = nb - 1; i = last = __kh_h2b(Hash()(key), bits); while (__kh_used(used, i) && !Eq()(keys[i], key)) { i = (i + 1U) & mask; if (i == last) break; } if (!__kh_used(used, i)) { // not present at all keys[i] = key; __kh_set_used(used, i); ++count, absent = 1; } else absent = 0; /* Don't touch keys[i] if present */ if (absent_) *absent_ = absent; return i; 
} int del(khint_t i) { khint_t j = i, k, mask, nb = n_buckets(); if (keys == 0 || i >= nb) return 0; mask = nb - khint_t(1); while (1) { j = (j + khint_t(1)) & mask; if (j == i || !__kh_used(used, j)) break; // j==i only when the table is completely full k = __kh_h2b(Hash()(keys[j]), bits); if ((j > i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) keys[i] = keys[j], i = j; } __kh_set_unused(used, i); --count; return 1; } }; /*********** * HashMap * ***********/ template struct KHashMapBucket { KType key; VType val; }; template struct KHashMapHash { khint_t operator() (const T &a) const { return Hash()(a.key); } }; template struct KHashMapEq { bool operator() (const T &a, const T &b) const { return Eq()(a.key, b.key); } }; template, typename khint_t=uint32_t> class KHashMap : public KHashSet, KHashMapHash, Hash, khint_t>, KHashMapEq, Eq>, khint_t> { protected: typedef KHashMapBucket bucket_t; typedef KHashSet, KHashMapEq, khint_t> hashset_t; public: khint_t get(const KType &key) const { bucket_t t = { key, VType() }; return hashset_t::get(t); } khint_t put(const KType &key, int *absent) { bucket_t t = { key, VType() }; return hashset_t::put(t, absent); } inline KType &key(khint_t i) { return hashset_t::key(i).key; } inline VType &value(khint_t i) { return hashset_t::key(i).val; } inline VType &operator[] (const KType &key) { bucket_t t = { key, VType() }; return value(hashset_t::put(t)); } }; /**************************** * HashSet with cached hash * ****************************/ template struct KHashSetCachedBucket { KType key; khint_t hash; }; template struct KHashCachedHash { khint_t operator() (const T &a) const { return a.hash; } }; template struct KHashCachedEq { bool operator() (const T &a, const T &b) const { return a.hash == b.hash && Eq()(a.key, b.key); } }; template, typename khint_t = uint32_t> class KHashSetCached : public KHashSet, KHashCachedHash, khint_t>, KHashCachedEq, Eq>, khint_t> { typedef KHashSetCachedBucket bucket_t; typedef KHashSet, 
KHashCachedEq, khint_t> hashset_t; public: khint_t get(const KType &key) const { bucket_t t = { key, Hash()(key) }; return hashset_t::get(t); } khint_t put(const KType &key, int *absent) { bucket_t t = { key, Hash()(key) }; return hashset_t::put(t, absent); } inline KType &key(khint_t i) { return hashset_t::key(i).key; } }; /**************************** * HashMap with cached hash * ****************************/ template struct KHashMapCachedBucket { KType key; VType val; khint_t hash; }; template, typename khint_t = uint32_t> class KHashMapCached : public KHashSet, KHashCachedHash, khint_t>, KHashCachedEq, Eq>, khint_t> { protected: typedef KHashMapCachedBucket bucket_t; typedef KHashSet, KHashCachedEq, khint_t> hashset_t; public: khint_t get(const KType &key) const { bucket_t t = { key, VType(), Hash()(key) }; return hashset_t::get(t); } khint_t put(const KType &key, int *absent) { bucket_t t = { key, VType(), Hash()(key) }; return hashset_t::put(t, absent); } inline KType &key(khint_t i) { return hashset_t::key(i).key; } inline VType &value(khint_t i) { return hashset_t::key(i).val; } inline VType &operator[] (const KType &key) { bucket_t t = { key, VType(), Hash()(key) }; return value(hashset_t::put(t)); } }; } #endif /* __AC_KHASHL_HPP */ gclib-0.12.7/mdtest.cpp000066400000000000000000000025341407072766100147030ustar00rootroot00000000000000#include "GBase.h" #include "GArgs.h" #include "GStr.h" #define USAGE "Usage:\n\ mdtest '' \n\ Shows an example of basic parsing of a MD string.\n\ " int main(int argc, char* argv[]) { GArgs args(argc, argv, "h"); int numargs=args.startNonOpt(); if (args.getOpt('h') || numargs!=1) { GMessage("%s\n",USAGE); exit(0); } char* mdstr=args.nextNonOpt(); //--make a copy of the string, in case the original is a const string // (because the parseUInt() function modifies the string temporarily char* mdstring=Gstrdup(mdstr); char *p=mdstring; while (*p!='\0') { int num_matches=0; if (parseInt(p,num_matches)) { if (num_matches!=0) 
GMessage("%d matching bases\n", num_matches); continue; } if (*p=='^') { //deletion GDynArray deletion; //deletion string accumulates here (if needed) int del_length=0;//tracking deletion length char delbase=*(++p); while (delbase>='A' && delbase<='Z') { deletion.Add(delbase); del_length++; delbase=*(++p); } GMessage("%d base(s) deletion [", del_length); for (uint i=0;i='A' && *p<='Z') { GMessage("base mismatch [%c]\n",*p); p++; continue; } GMessage("Warning: skipping unrecognized char [%c]\n", *p); p++; } GFREE(mdstring); return 0; } gclib-0.12.7/proc_mem.cpp000066400000000000000000000053271407072766100152070ustar00rootroot00000000000000#include "proc_mem.h" #ifdef __APPLE__ #include void get_mem_usage(double& vm_usage, double& resident_set) { vm_usage=0; resident_set=0; struct task_basic_info t_info; mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT; if (KERN_SUCCESS == task_info(mach_task_self(), TASK_BASIC_INFO, (task_info_t)&t_info, &t_info_count)) { vm_usage=double(t_info.virtual_size)/1024; resident_set=double(t_info.resident_size)/1024; } // resident size is in t_info.resident_size; // virtual size is in t_info.virtual_size; } #elif defined(_WIN32) || defined(_WIN64) #include "windows.h" #include "psapi.h" void get_mem_usage(double& vm_usage, double& resident_set) { PROCESS_MEMORY_COUNTERS_EX pmc; GetProcessMemoryInfo(GetCurrentProcess(), (PPROCESS_MEMORY_COUNTERS)&pmc, sizeof(pmc)); //SIZE_T virtualMemUsedByMe = pmc.PrivateUsage; //SIZE_T physMemUsedByMe = pmc.WorkingSetSize; vm_usage=(double)pmc.PrivateUsage; resident_set=(double)pmc.WorkingSetSize; } #else //assume Linux #include #include #include #include void get_mem_usage(double& vm_usage, double& resident_set) { using std::ios_base; using std::ifstream; using std::string; vm_usage = 0.0; resident_set = 0.0; // 'file' stat seems to give the most reliable results ifstream stat_stream("/proc/self/stat",ios_base::in); // dummy vars for leading entries in stat that we don't care about string 
pid, comm, state, ppid, pgrp, session, tty_nr; string tpgid, flags, minflt, cminflt, majflt, cmajflt; string utime, stime, cutime, cstime, priority, nice; string O, itrealvalue, starttime; // the two fields we want // unsigned long vsize; long rss; stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt >> utime >> stime >> cutime >> cstime >> priority >> nice >> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest stat_stream.close(); long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages vm_usage = vsize / 1024.0; resident_set = rss * page_size_kb; } #endif ////////////////////////////////////////////////////////////////////////////// // get_mem_usage(double &, double &) - takes two doubles by reference, // attempts to read the system-dependent data for a process' virtual memory // size and resident set size, and return the results in KB. // // On failure, returns 0.0, 0.0 void print_mem_usage(FILE* fout) { double vs, rs; get_mem_usage(vs,rs); vs/=1024; rs/=1024; fprintf(fout, "VMSize: %6.1fMB\tRSize: %6.1fMB\n", vs, rs); } gclib-0.12.7/proc_mem.h000066400000000000000000000003611407072766100146450ustar00rootroot00000000000000#ifndef PROC_MEM_H #define PROC_MEM_H #include // a Linux-specific way to report the memory usage of the current process void get_mem_usage(double& vm_usage, double& resident_set); void print_mem_usage(FILE* fout=stderr); #endif gclib-0.12.7/tag_git.sh000077500000000000000000000005071407072766100146520ustar00rootroot00000000000000#!/bin/bash -e git checkout master ver=$(fgrep '#define GCLIB_VERSION ' GBase.h) ver=${ver#*\"} ver=${ver%%\"*} #git fetch --tags if [[ "$1" == "delete" || "$1" == "del" ]]; then echo "Deleting tag v$ver .." 
git tag -d v$ver git push origin :refs/tags/v$ver exit fi git tag -a "v$ver" -m "release $ver" git push --tags gclib-0.12.7/wyhash.h000066400000000000000000000270761407072766100143630ustar00rootroot00000000000000// This is free and unencumbered software released into the public domain under The Unlicense (http://unlicense.org/) // main repo: https://github.com/wangyi-fudan/wyhash // author: 王一 Wang Yi // contributors: Reini Urban, Dietrich Epp, Joshua Haberman, Tommy Ettinger, Daniel Lemire, Otmar Ertl, cocowalla, leo-yuriev, Diego Barrios Romero, paulie-g, dumblob, Yann Collet, ivte-ms, hyb, James Z.M. Gao, easyaspi314 (Devin), TheOneric /* quick example: string s="fjsakfdsjkf"; uint64_t hash=wyhash(s.c_str(), s.size(), 0, _wyp); */ #ifndef wyhash_final_version_3 #define wyhash_final_version_3 #ifndef WYHASH_CONDOM //protections that produce different results: //1: normal valid behavior //2: extra protection against entropy loss (probability=2^-63), aka. "blind multiplication" #define WYHASH_CONDOM 1 #endif #ifndef WYHASH_32BIT_MUM //0: normal version, slow on 32 bit systems //1: faster on 32 bit systems but produces different results, incompatible with wy2u0k function #define WYHASH_32BIT_MUM 0 #endif //includes #include #include #if defined(_MSC_VER) && defined(_M_X64) #include #pragma intrinsic(_umul128) #endif //likely and unlikely macros #if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) #define _likely_(x) __builtin_expect(x,1) #define _unlikely_(x) __builtin_expect(x,0) #else #define _likely_(x) (x) #define _unlikely_(x) (x) #endif //128bit multiply function static inline uint64_t _wyrot(uint64_t x) { return (x>>32)|(x<<32); } static inline void _wymum(uint64_t *A, uint64_t *B){ #if(WYHASH_32BIT_MUM) uint64_t hh=(*A>>32)*(*B>>32), hl=(*A>>32)*(uint32_t)*B, lh=(uint32_t)*A*(*B>>32), ll=(uint64_t)(uint32_t)*A*(uint32_t)*B; #if(WYHASH_CONDOM>1) *A^=_wyrot(hl)^hh; *B^=_wyrot(lh)^ll; #else *A=_wyrot(hl)^hh; *B=_wyrot(lh)^ll; #endif #elif 
defined(__SIZEOF_INT128__) __uint128_t r=*A; r*=*B; #if(WYHASH_CONDOM>1) *A^=(uint64_t)r; *B^=(uint64_t)(r>>64); #else *A=(uint64_t)r; *B=(uint64_t)(r>>64); #endif #elif defined(_MSC_VER) && defined(_M_X64) #if(WYHASH_CONDOM>1) uint64_t a, b; a=_umul128(*A,*B,&b); *A^=a; *B^=b; #else *A=_umul128(*A,*B,B); #endif #else uint64_t ha=*A>>32, hb=*B>>32, la=(uint32_t)*A, lb=(uint32_t)*B, hi, lo; uint64_t rh=ha*hb, rm0=ha*lb, rm1=hb*la, rl=la*lb, t=rl+(rm0<<32), c=t>32)+(rm1>>32)+c; #if(WYHASH_CONDOM>1) *A^=lo; *B^=hi; #else *A=lo; *B=hi; #endif #endif } //multiply and xor mix function, aka MUM static inline uint64_t _wymix(uint64_t A, uint64_t B){ _wymum(&A,&B); return A^B; } //endian macros #ifndef WYHASH_LITTLE_ENDIAN #if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) #define WYHASH_LITTLE_ENDIAN 1 #elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define WYHASH_LITTLE_ENDIAN 0 #else #warning could not determine endianness! Falling back to little endian. 
#define WYHASH_LITTLE_ENDIAN 1
  #endif
#endif

//read functions
//_wyr8 / _wyr4 copy 8 / 4 bytes from p with memcpy and return them as an integer.
//On targets where WYHASH_LITTLE_ENDIAN is 0 the value is byte-swapped after the copy.
#if (WYHASH_LITTLE_ENDIAN)
static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return v;}
static inline uint64_t _wyr4(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return v;}
#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return __builtin_bswap64(v);}
static inline uint64_t _wyr4(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return __builtin_bswap32(v);}
#elif defined(_MSC_VER)
static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return _byteswap_uint64(v);}
static inline uint64_t _wyr4(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return _byteswap_ulong(v);}
#else
//no byte-swap builtin available: swap with shifts and masks
static inline uint64_t _wyr8(const uint8_t *p) {
  uint64_t v; memcpy(&v, p, 8);
  return (((v >> 56) & 0xff)| ((v >> 40) & 0xff00)| ((v >> 24) & 0xff0000)| ((v >>  8) & 0xff000000)| ((v <<  8) & 0xff00000000)| ((v << 24) & 0xff0000000000)| ((v << 40) & 0xff000000000000)| ((v << 56) & 0xff00000000000000));
}
static inline uint64_t _wyr4(const uint8_t *p) {
  uint32_t v; memcpy(&v, p, 4);
  return (((v >> 24) & 0xff)| ((v >>  8) & 0xff00)| ((v <<  8) & 0xff0000)| ((v << 24) & 0xff000000));
}
#endif
//pack the first, middle (index k>>1) and last (index k-1) byte of a k-byte input into 24 bits; wyhash calls it for lengths 1..3
static inline uint64_t _wyr3(const uint8_t *p, size_t k) { return (((uint64_t)p[0])<<16)|(((uint64_t)p[k>>1])<<8)|p[k-1];}

//wyhash main function
//  key    : bytes to hash
//  len    : number of bytes readable at key
//  seed   : caller-chosen seed; XOR-ed with secret[0] before use
//  secret : array of four 64-bit words, secret[0..3] are all read (e.g. _wyp)
//  returns the 64-bit hash value
static inline uint64_t wyhash(const void *key, size_t len, uint64_t seed, const uint64_t *secret){
  const uint8_t *p=(const uint8_t *)key; seed^=*secret; uint64_t a, b;
  if(_likely_(len<=16)){
    //short input: build a and b from (possibly overlapping) 4-byte reads, or from 3 sampled bytes
    if(_likely_(len>=4)){ a=(_wyr4(p)<<32)|_wyr4(p+((len>>3)<<2)); b=(_wyr4(p+len-4)<<32)|_wyr4(p+len-4-((len>>3)<<2)); }
    else if(_likely_(len>0)){ a=_wyr3(p,len); b=0;}
    else a=b=0;
  }
  else{
    size_t i=len;
    if(_unlikely_(i>48)){
      //consume 48 bytes per iteration in three independent accumulators
      uint64_t see1=seed, see2=seed;
      do{
        seed=_wymix(_wyr8(p)^secret[1],_wyr8(p+8)^seed);
        see1=_wymix(_wyr8(p+16)^secret[2],_wyr8(p+24)^see1);
        see2=_wymix(_wyr8(p+32)^secret[3],_wyr8(p+40)^see2);
        p+=48; i-=48;
      }while(_likely_(i>48));
      seed^=see1^see2;
    }
    //consume remaining 16-byte chunks while more than 16 bytes are left
    while(_unlikely_(i>16)){ seed=_wymix(_wyr8(p)^secret[1],_wyr8(p+8)^seed); i-=16; p+=16; }
    //a and b are the final 16 bytes of the input (may overlap bytes already consumed)
    a=_wyr8(p+i-16); b=_wyr8(p+i-8);
  }
  return _wymix(secret[1]^len,_wymix(a^secret[1],b^seed));
}

//the default secret parameters
static const uint64_t _wyp[4] = {0xa0761d6478bd642full, 0xe7037ed1a0b428dbull, 0x8ebc6af09c88c6e3ull, 0x589965cc75374cc3ull};

//a useful 64bit-64bit mix function to produce deterministic pseudo random numbers that can pass BigCrush and PractRand
static inline uint64_t wyhash64(uint64_t A, uint64_t B){ A^=0xa0761d6478bd642full; B^=0xe7037ed1a0b428dbull; _wymum(&A,&B); return _wymix(A^0xa0761d6478bd642full,B^0xe7037ed1a0b428dbull);}

//The wyrand PRNG that pass BigCrush and PractRand
//advances *seed in place by a fixed constant on every call and returns a mix of the new state
static inline uint64_t wyrand(uint64_t *seed){ *seed+=0xa0761d6478bd642full; return _wymix(*seed,*seed^0xe7037ed1a0b428dbull);}

//convert any 64 bit pseudo random numbers to uniform distribution [0,1). It can be combined with wyrand, wyhash64 or wyhash.
//uses the top 52 bits of r (r>>12) scaled by 2^-52
static inline double wy2u01(uint64_t r){ const double _wynorm=1.0/(1ull<<52); return (r>>12)*_wynorm;}

//convert any 64 bit pseudo random numbers to APPROXIMATE Gaussian distribution. It can be combined with wyrand, wyhash64 or wyhash.
//sums three 21-bit fields of r, scales by 2^-20 and subtracts 3.0
static inline double wy2gau(uint64_t r){ const double _wynorm=1.0/(1ull<<20); return ((r&0x1fffff)+((r>>21)&0x1fffff)+((r>>42)&0x1fffff))*_wynorm-3.0;}

#if(!WYHASH_32BIT_MUM)
//fast range integer random number generation on [0,k) credit to Daniel Lemire. May not work when WYHASH_32BIT_MUM=1. It can be combined with wyrand, wyhash64 or wyhash.
static inline uint64_t wy2u0k(uint64_t r, uint64_t k){ _wymum(&r,&k); return k; } #endif //make your own secret static inline void make_secret(uint64_t seed, uint64_t *secret){ uint8_t c[] = {15, 23, 27, 29, 30, 39, 43, 45, 46, 51, 53, 54, 57, 58, 60, 71, 75, 77, 78, 83, 85, 86, 89, 90, 92, 99, 101, 102, 105, 106, 108, 113, 114, 116, 120, 135, 139, 141, 142, 147, 149, 150, 153, 154, 156, 163, 165, 166, 169, 170, 172, 177, 178, 180, 184, 195, 197, 198, 201, 202, 204, 209, 210, 212, 216, 225, 226, 228, 232, 240 }; for(size_t i=0;i<4;i++){ uint8_t ok; do{ ok=1; secret[i]=0; for(size_t j=0;j<64;j+=8) secret[i]|=((uint64_t)c[wyrand(&seed)%sizeof(c)])<> 1) & 0x5555555555555555; x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; x = (x * 0x0101010101010101) >> 56; if(x!=32){ ok=0; break; } #endif } }while(!ok); } } /* This is world's fastest hash map: 2x faster than bytell_hash_map. It does not store the keys, but only the hash/signature of keys. First we use pos=hash1(key) to approximately locate the bucket. Then we search signature=hash2(key) from pos linearly. If we find a bucket with matched signature we report the bucket Or if we meet a bucket whose signifure=0, we report a new position to insert The signature collision probability is very low as we usually searched N~10 buckets. By combining hash1 and hash2, we acturally have 128 bit anti-collision strength. hash1 and hash2 can be the same function, resulting lower collision resistance but faster. The signature is 64 bit, but can be modified to 32 bit if necessary for save space. The above two can be activated by define WYHASHMAP_WEAK_SMALL_FAST simple examples: const size_t size=213432; vector idx(size); // allocate the index of fixed size. idx MUST be zeroed. vector value(size); // we only care about the index, user should maintain his own value vectors. 
string key="dhskfhdsj" // the object to be inserted into idx size_t pos=wyhashmap(idx.data(), idx.size(), key.c_str(), key.size(), 1); // get the position and insert if(pos */ gclib-0.12.7/xxhash.h000066400000000000000000005505061407072766100143620ustar00rootroot00000000000000/* * xxHash - Extremely Fast Hash algorithm * Header File * Copyright (C) 2012-2020 Yann Collet * * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * You can contact the author at: * - xxHash homepage: https://www.xxhash.com * - xxHash source repository: https://github.com/Cyan4973/xxHash */ /* TODO: update */ /* Notice extracted from xxHash homepage: xxHash is an extremely fast hash algorithm, running at RAM speed limits. It also successfully passes all tests from the SMHasher suite. Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) Name Speed Q.Score Author xxHash 5.4 GB/s 10 CrapWow 3.2 GB/s 2 Andrew MurmurHash 3a 2.7 GB/s 10 Austin Appleby SpookyHash 2.0 GB/s 10 Bob Jenkins SBox 1.4 GB/s 9 Bret Mulvey Lookup3 1.2 GB/s 9 Bob Jenkins SuperFastHash 1.2 GB/s 1 Paul Hsieh CityHash64 1.05 GB/s 10 Pike & Alakuijala FNV 0.55 GB/s 5 Fowler, Noll, Vo CRC32 0.43 GB/s 9 MD5-32 0.33 GB/s 10 Ronald L. Rivest SHA1-32 0.28 GB/s 10 Q.Score is a measure of quality of the hash function. It depends on successfully passing SMHasher test set. 10 is a perfect score. Note: SMHasher's CRC32 implementation is not the fastest one. Other speed-oriented implementations can be faster, especially in combination with PCLMUL instruction: https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735 A 64-bit version, named XXH64, is available since r35. It offers much better speed, but for 64-bit applications only. Name Speed on 64 bits Speed on 32 bits XXH64 13.8 GB/s 1.9 GB/s XXH32 6.8 GB/s 6.0 GB/s */ #if defined (__cplusplus) extern "C" { #endif /* **************************** * INLINE mode ******************************/ /*! * XXH_INLINE_ALL (and XXH_PRIVATE_API) * Use these build macros to inline xxhash into the target unit. * Inlining improves performance on small inputs, especially when the length is * expressed as a compile-time constant: * * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html * * It also keeps xxHash symbols private to the unit, so they are not exported.
* * Usage: * #define XXH_INLINE_ALL * #include "xxhash.h" * * Do not compile and link xxhash.o as a separate object, as it is not useful. */ #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ && !defined(XXH_INLINE_ALL_31684351384) /* this section should be traversed only once */ # define XXH_INLINE_ALL_31684351384 /* give access to the advanced API, required to compile implementations */ # undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ # define XXH_STATIC_LINKING_ONLY /* make all functions private */ # undef XXH_PUBLIC_API # if defined(__GNUC__) # define XXH_PUBLIC_API static __inline __attribute__((unused)) # elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) # define XXH_PUBLIC_API static inline # elif defined(_MSC_VER) # define XXH_PUBLIC_API static __inline # else /* note: this version may generate warnings for unused static functions */ # define XXH_PUBLIC_API static # endif /* * This part deals with the special case where a unit wants to inline xxHash, * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such * as part of some previously included *.h header file. * Without further action, the new include would just be ignored, * and functions would effectively _not_ be inlined (silent failure). * The following macros solve this situation by prefixing all inlined names, * avoiding naming collision with previous inclusions. */ # ifdef XXH_NAMESPACE # error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" /* * Note: Alternative: #undef all symbols (it's a pretty large list). * Without #error: it compiles, but functions are actually not inlined. */ # endif # define XXH_NAMESPACE XXH_INLINE_ /* * Some identifiers (enums, type names) are not symbols, but they must * still be renamed to avoid redeclaration. * Alternative solution: do not redeclare them. * However, this requires some #ifdefs, and is a more dispersed action. 
* Meanwhile, renaming can be achieved in a single block */ # define XXH_IPREF(Id) XXH_INLINE_ ## Id # define XXH_OK XXH_IPREF(XXH_OK) # define XXH_ERROR XXH_IPREF(XXH_ERROR) # define XXH_errorcode XXH_IPREF(XXH_errorcode) # define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) # define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) # define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) # define XXH32_state_s XXH_IPREF(XXH32_state_s) # define XXH32_state_t XXH_IPREF(XXH32_state_t) # define XXH64_state_s XXH_IPREF(XXH64_state_s) # define XXH64_state_t XXH_IPREF(XXH64_state_t) # define XXH3_state_s XXH_IPREF(XXH3_state_s) # define XXH3_state_t XXH_IPREF(XXH3_state_t) # define XXH128_hash_t XXH_IPREF(XXH128_hash_t) /* Ensure the header is parsed again, even if it was previously included */ # undef XXHASH_H_5627135585666179 # undef XXHASH_H_STATIC_13879238742 #endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ /* **************************************************************** * Stable API *****************************************************************/ #ifndef XXHASH_H_5627135585666179 #define XXHASH_H_5627135585666179 1 /* specific declaration modes for Windows */ #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) # if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) # ifdef XXH_EXPORT # define XXH_PUBLIC_API __declspec(dllexport) # elif XXH_IMPORT # define XXH_PUBLIC_API __declspec(dllimport) # endif # else # define XXH_PUBLIC_API /* do nothing */ # endif #endif /*! * XXH_NAMESPACE, aka Namespace Emulation: * * If you want to include _and expose_ xxHash functions from within your own * library, but also want to avoid symbol collisions with other libraries which * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix * any public symbol from xxhash library with the value of XXH_NAMESPACE * (therefore, avoid empty or numeric values). 
*
 * Note that no change is required within the calling program as long as it
 * includes `xxhash.h`: Regular symbol names will be automatically translated
 * by this header.
 */
#ifdef XXH_NAMESPACE
#  define XXH_CAT(A,B) A##B
#  define XXH_NAME2(A,B) XXH_CAT(A,B)
#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
/* XXH32 */
#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
/* XXH64 */
#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
/* XXH3_64bits */
#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
#  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
/* XXH3_128bits */
#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
#  define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
#endif


/* *************************************
*  Version
***************************************/
#define XXH_VERSION_MAJOR    0
#define XXH_VERSION_MINOR    8
#define XXH_VERSION_RELEASE  0
/* single comparable integer: major*10000 + minor*100 + release */
#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
XXH_PUBLIC_API unsigned XXH_versionNumber (void);


/* ****************************
*  Definitions
******************************/
#include <stddef.h>   /* size_t */
typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;

/*-********************************************************************** * 32-bit hash ************************************************************************/ #if !defined (__VMS) \ && (defined (__cplusplus) \ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) # include typedef uint32_t XXH32_hash_t; #else # include # if UINT_MAX == 0xFFFFFFFFUL typedef unsigned int XXH32_hash_t; # else # if ULONG_MAX == 0xFFFFFFFFUL typedef unsigned long XXH32_hash_t; # else # error "unsupported platform: need a 32-bit type" # endif # endif #endif /*! * XXH32(): * Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input". * The memory between input & input+length must be valid (allocated and read-accessible). * "seed" can be used to alter the result predictably. * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s * * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, * and offers true 64/128 bit hash results. It provides a superior level of * dispersion, and greatly reduces the risks of collisions. */ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); /******* Streaming *******/ /* * Streaming functions generate the xxHash value from an incrememtal input. * This method is slower than single-call functions, due to state management. * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. * * An XXH state must first be allocated using `XXH*_createState()`. * * Start a new hash by initializing the state with a seed using `XXH*_reset()`. * * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. * * The function returns an error code, with 0 meaning OK, and any other value * meaning there is an error. * * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. * This function returns the nn-bits hash as an int or long long. 
* * It's still possible to continue inserting input into the hash state after a * digest, and generate new hash values later on by invoking `XXH*_digest()`. * * When done, release the state using `XXH*_freeState()`. */ typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); /******* Canonical representation *******/ /* * The default return values from XXH functions are unsigned 32 and 64 bit * integers. * This the simplest and fastest format for further post-processing. * * However, this leaves open the question of what is the order on the byte level, * since little and big endian conventions will store the same number differently. * * The canonical representation settles this issue by mandating big-endian * convention, the same convention as human-readable numbers (large digits first). * * When writing hash values to storage, sending them over a network, or printing * them, it's highly recommended to use the canonical representation to ensure * portability across a wider range of systems, present and future. * * The following functions allow transformation of hash values to and from * canonical format. 
*/ typedef struct { unsigned char digest[4]; } XXH32_canonical_t; XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); #ifndef XXH_NO_LONG_LONG /*-********************************************************************** * 64-bit hash ************************************************************************/ #if !defined (__VMS) \ && (defined (__cplusplus) \ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) # include typedef uint64_t XXH64_hash_t; #else /* the following type must have a width of 64-bit */ typedef unsigned long long XXH64_hash_t; #endif /*! * XXH64(): * Returns the 64-bit hash of sequence of length @length stored at memory * address @input. * @seed can be used to alter the result predictably. * * This function usually runs faster on 64-bit systems, but slower on 32-bit * systems (see benchmark). * * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, * and offers true 64/128 bit hash results. It provides a superior level of * dispersion, and greatly reduces the risks of collisions. 
*/ XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed); /******* Streaming *******/ typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); /******* Canonical representation *******/ typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); /*-********************************************************************** * XXH3 64-bit variant ************************************************************************/ /* ************************************************************************ * XXH3 is a new hash algorithm featuring: * - Improved speed for both small and large inputs * - True 64-bit and 128-bit outputs * - SIMD acceleration * - Improved 32-bit viability * * Speed analysis methodology is explained here: * * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html * * In general, expect XXH3 to run about ~2x faster on large inputs and >3x * faster on small ones compared to XXH64, though exact differences depend on * the platform. * * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash * on all platforms. * * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. * * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run * XXH3 at competitive speeds, even if XXH64 runs slowly. 
Further details are * explained in the implementation. * * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro. * * XXH3 offers 2 variants, _64bits and _128bits. * When only 64 bits are needed, prefer calling the _64bits variant, as it * reduces the amount of mixing, resulting in faster speed on small inputs. * * It's also generally simpler to manipulate a scalar return type than a struct. * * The 128-bit version adds additional strength, but it is slightly slower. * * Return values of XXH3 and XXH128 are officially finalized starting * with v0.8.0 and will no longer change in future versions. * Avoid storing values from before that release in long-term storage. * * Results produced by v0.7.x are not comparable with results from v0.7.y. * However, the API is completely stable, and it can safely be used for * ephemeral data (local sessions). * * The API supports one-shot hashing, streaming mode, and custom secrets. */ /* XXH3_64bits(): * default 64-bit variant, using default secret and default seed of 0. * It's the fastest variant. */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); /* * XXH3_64bits_withSeed(): * This variant generates a custom secret on the fly * based on default secret altered using the `seed` value. * While this operation is decently fast, note that it's not completely free. * Note: seed==0 produces the same results as XXH3_64bits(). */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* * XXH3_64bits_withSecret(): * It's possible to provide any blob of bytes as a "secret" to generate the hash. * This makes it more difficult for an external actor to prepare an intentional collision. * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN). * However, the quality of produced hash values depends on secret's entropy. 
* Technically, the secret must look like a bunch of random bytes. * Avoid "trivial" or structured data such as repeated sequences or a text document. * Whenever unsure about the "randomness" of the blob of bytes, * consider relabelling it as a "custom seed" instead, * and employ "XXH3_generateSecret()" (see below) * to generate a high entropy secret derived from the custom seed. */ #define XXH3_SECRET_SIZE_MIN 136 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); /******* Streaming *******/ /* * Streaming requires state maintenance. * This operation costs memory and CPU. * As a consequence, streaming is slower than one-shot hashing. * For better performance, prefer one-shot functions whenever applicable. */ typedef struct XXH3_state_s XXH3_state_t; XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void); XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state); /* * XXH3_64bits_reset(): * Initialize with default parameters. * digest will be equivalent to `XXH3_64bits()`. */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr); /* * XXH3_64bits_reset_withSeed(): * Generate a custom secret from `seed`, and store it into `statePtr`. * digest will be equivalent to `XXH3_64bits_withSeed()`. */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); /* * XXH3_64bits_reset_withSecret(): * `secret` is referenced, it _must outlive_ the hash streaming session. * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, * and the quality of produced hash values depends on secret's entropy * (secret's content should look like a bunch of random bytes). * When in doubt about the randomness of a candidate `secret`, * consider employing `XXH3_generateSecret()` instead (see below). 
*/ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); /* note : canonical representation of XXH3 is the same as XXH64 * since they both produce XXH64_hash_t values */ /*-********************************************************************** * XXH3 128-bit variant ************************************************************************/ typedef struct { XXH64_hash_t low64; XXH64_hash_t high64; } XXH128_hash_t; XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); /******* Streaming *******/ /* * Streaming requires state maintenance. * This operation costs memory and CPU. * As a consequence, streaming is slower than one-shot hashing. * For better performance, prefer one-shot functions whenever applicable. * * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). * Use already declared XXH3_createState() and XXH3_freeState(). * * All reset and streaming functions have same meaning as their 64-bit counterpart. */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); /* Following helper functions make it possible to compare XXH128_hast_t values. 
* Since XXH128_hash_t is a structure, this capability is not offered by the language. * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ /*! * XXH128_isEqual(): * Return: 1 if `h1` and `h2` are equal, 0 if they are not. */ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); /*! * XXH128_cmp(): * * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. * * return: >0 if *h128_1 > *h128_2 * =0 if *h128_1 == *h128_2 * <0 if *h128_1 < *h128_2 */ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2); /******* Canonical representation *******/ typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash); XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src); #endif /* XXH_NO_LONG_LONG */ #endif /* XXHASH_H_5627135585666179 */ #if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) #define XXHASH_H_STATIC_13879238742 /* **************************************************************************** * This section contains declarations which are not guaranteed to remain stable. * They may change in future versions, becoming incompatible with a different * version of the library. * These declarations should only be used with static linking. * Never use them in association with dynamic linking! ***************************************************************************** */ /* * These definitions are only present to allow static allocation * of XXH states, on stack or in a struct, for example. * Never **ever** access their members directly. 
*/ struct XXH32_state_s { XXH32_hash_t total_len_32; XXH32_hash_t large_len; XXH32_hash_t v1; XXH32_hash_t v2; XXH32_hash_t v3; XXH32_hash_t v4; XXH32_hash_t mem32[4]; XXH32_hash_t memsize; XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ }; /* typedef'd to XXH32_state_t */ #ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ struct XXH64_state_s { XXH64_hash_t total_len; XXH64_hash_t v1; XXH64_hash_t v2; XXH64_hash_t v3; XXH64_hash_t v4; XXH64_hash_t mem64[4]; XXH32_hash_t memsize; XXH32_hash_t reserved32; /* required for padding anyway */ XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */ }; /* typedef'd to XXH64_state_t */ #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */ # include # define XXH_ALIGN(n) alignas(n) #elif defined(__GNUC__) # define XXH_ALIGN(n) __attribute__ ((aligned(n))) #elif defined(_MSC_VER) # define XXH_ALIGN(n) __declspec(align(n)) #else # define XXH_ALIGN(n) /* disabled */ #endif /* Old GCC versions only accept the attribute after the type in structures. 
*/
/*
 * XXH_ALIGN_MEMBER(align, type): declares one struct member with the requested
 * alignment. Pre-C11 GCC gets the attribute after the declarator, every other
 * configuration gets it in front.
 */
#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
    && defined(__GNUC__)
#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
#else
#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
#endif

/* Sizes, in bytes, of the two byte arrays embedded in XXH3_state_s. */
#define XXH3_INTERNALBUFFER_SIZE 256
#define XXH3_SECRET_DEFAULT_SIZE 192

/*
 * Streaming state of XXH3.
 * The three leading arrays are each requested to be aligned on 64 bytes.
 */
struct XXH3_state_s {
   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
  /* used to store a custom secret generated from a seed */
   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
   XXH32_hash_t bufferedSize;
   XXH32_hash_t reserved32;
   size_t nbStripesSoFar;
   XXH64_hash_t totalLen;
   size_t nbStripesPerBlock;
   size_t secretLimit;
   XXH64_hash_t seed;
   XXH64_hash_t reserved64;
   const unsigned char* extSecret;  /* reference to external secret;
                                     * if == NULL, use .customSecret instead */
   /* note: there may be some padding at the end due to alignment on 64 bytes */
}; /* typedef'd to XXH3_state_t */

/* The helper is only needed for the struct above. */
#undef XXH_ALIGN_MEMBER

/* When the XXH3_state_t structure is merely emplaced on stack,
 * it should be initialized with XXH3_INITSTATE() or a memset()
 * in case its first reset uses XXH3_NNbits_reset_withSeed().
 * This init can be omitted if the first reset uses default or _withSecret mode.
 * This operation isn't necessary when the state is created with XXH3_createState().
 * Note that this doesn't prepare the state for a streaming operation,
 * it's still necessary to use XXH3_NNbits_reset*() afterwards.
 *
 * The macro expands to a braced block that sets only the `seed` field to 0.
 */
#define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }


/* ===   Experimental API   === */
/* Symbols defined below must be considered tied to a specific library version. */

/*
 * XXH3_generateSecret():
 *
 * Derive a high-entropy secret from any user-defined content, named customSeed.
 * The generated secret can be used in combination with `*_withSecret()` functions.
* The `_withSecret()` variants are useful to provide a higher level of protection than a 64-bit seed,
 * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
 *
 * The function accepts as input a custom seed of any length and any content,
 * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
 * into an already allocated buffer secretBuffer.
 * The generated secret is _always_ XXH3_SECRET_DEFAULT_SIZE bytes long.
 *
 * The generated secret can then be used with any `*_withSecret()` variant.
 * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
 * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
 * are part of this list. They all accept a `secret` parameter
 * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
 * _and_ feature very high entropy (consist of random-looking bytes).
 * These conditions can be a high bar to meet, so
 * this function can be used to generate a secret of proper quality.
 *
 * customSeed can be anything. It can have any size, even small ones,
 * and its content can be anything, even a stupidly "low entropy" source such as a bunch of zeroes.
 * The resulting `secret` will nonetheless provide all expected qualities.
 *
 * Supplying NULL as the customSeed copies the default secret into `secretBuffer`.
 * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
*/
XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);


/* simple short-cut to pre-selected XXH3_128bits variant */
XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);


#endif  /* XXH_NO_LONG_LONG */


/*
 * Either inlining switch also requests the implementation section of this
 * header, which tests XXH_IMPLEMENTATION.
 */
#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
#  define XXH_IMPLEMENTATION
#endif

#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */


/* ======================================================================== */
/* ======================================================================== */
/* ======================================================================== */


/*-**********************************************************************
 * xxHash implementation
 *-**********************************************************************
 * xxHash's implementation used to be hosted inside xxhash.c.
 *
 * However, inlining requires implementation to be visible to the compiler,
 * hence be included alongside the header.
 * Previously, implementation was hosted inside xxhash.c,
 * which was then #included when inlining was activated.
 * This construction created issues with a few build and install systems,
 * as it required xxhash.c to be stored in /include directory.
 *
 * xxHash implementation is now directly integrated within xxhash.h.
 * As a consequence, xxhash.c is no longer needed in /include.
 *
 * xxhash.c is still available and is still useful.
 * In a "normal" setup, when xxhash is not inlined,
 * xxhash.h only exposes the prototypes and public symbols,
 * while xxhash.c can be built into an object file xxhash.o
 * which can then be linked into the final binary.
************************************************************************/

/*
 * The implementation is compiled when any of the three switches is defined,
 * and XXH_IMPLEM_13a8737387 makes sure it is compiled at most once.
 */
#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
#  define XXH_IMPLEM_13a8737387

/* *************************************
*  Tuning parameters
***************************************/
/*!
 * XXH_FORCE_MEMORY_ACCESS:
 * By default, access to unaligned memory is controlled by `memcpy()`, which is
 * safe and portable.
 *
 * Unfortunately, on some target/compiler combinations, the generated assembly
 * is sub-optimal.
 *
 * The switch below allows selection of a different access method
 * in the search for improved performance.
 * Method 0 (default):
 *     Use `memcpy()`. Safe and portable. Default.
 * Method 1:
 *     `__attribute__((packed))` statement. It depends on compiler extensions
 *     and is therefore not portable.
 *     This method is safe if your compiler supports it, and *generally* as
 *     fast or faster than `memcpy`.
 * Method 2:
 *     Direct access via cast. This method doesn't depend on the compiler but
 *     violates the C standard.
 *     It can generate buggy code on targets which do not support unaligned
 *     memory accesses.
 *     But in some circumstances, it's the only known way to get the most
 *     performance (example: GCC + ARMv6)
 * Method 3:
 *     Byteshift. This can generate the best code on old compilers which don't
 *     inline small `memcpy()` calls, and it might also be faster on big-endian
 *     systems which lack a native byteswap instruction.
 * See https://stackoverflow.com/a/32095106/646947 for details.
* Prefer these methods in priority order (0 > 1 > 2 > 3)
 */
#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
   /* GCC (not clang) on ARMv6 with unaligned support: direct access */
#  if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
#    define XXH_FORCE_MEMORY_ACCESS 2
   /* ICC outside Windows, or GCC on ARMv7 and later: packed attribute */
#  elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
  (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
#    define XXH_FORCE_MEMORY_ACCESS 1
#  endif
   /* otherwise the macro stays undefined, which selects the memcpy() method */
#endif

/*!
 * XXH_ACCEPT_NULL_INPUT_POINTER:
 * If the input pointer is NULL, xxHash's default behavior is to dereference it,
 * triggering a segfault.
 * When this macro is enabled, xxHash actively checks the input for a null pointer.
 * If it is, the result for null input pointers is the same as a zero-length input.
 *
 * Note: XXH32_update() never dereferences a NULL input; it returns XXH_ERROR
 * when this macro is 0 and XXH_OK when it is >= 1.
 */
#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
#endif

/*!
 * XXH_FORCE_ALIGN_CHECK:
 * This is an important performance trick
 * for architectures without decent unaligned memory access performance.
 * It checks for input alignment, and when conditions are met,
 * uses a "fast path" employing direct 32-bit/64-bit read,
 * resulting in _dramatically faster_ read speed.
 *
 * The check costs one initial branch per hash, which is generally negligible, but not zero.
 * Moreover, it's not useful to generate binary for an additional code path
 * if memory access uses same instruction for both aligned and unaligned addresses.
 *
 * In these cases, the alignment check can be removed by setting this macro to 0.
 * Then the code will always use unaligned memory access.
 * Align check is automatically disabled on x86, x64 & arm64,
 * which are platforms known to offer good unaligned memory accesses performance.
 *
 * This option does not affect XXH3 (only XXH32 and XXH64).
*/ #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ # if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \ || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */ # define XXH_FORCE_ALIGN_CHECK 0 # else # define XXH_FORCE_ALIGN_CHECK 1 # endif #endif /*! * XXH_NO_INLINE_HINTS: * * By default, xxHash tries to force the compiler to inline almost all internal * functions. * * This can usually improve performance due to reduced jumping and improved * constant folding, but significantly increases the size of the binary which * might not be favorable. * * Additionally, sometimes the forced inlining can be detrimental to performance, * depending on the architecture. * * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the * compiler full control on whether to inline or not. * * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using * -fno-inline with GCC or Clang, this will automatically be defined. */ #ifndef XXH_NO_INLINE_HINTS # if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \ || defined(__NO_INLINE__) /* -O0, -fno-inline */ # define XXH_NO_INLINE_HINTS 1 # else # define XXH_NO_INLINE_HINTS 0 # endif #endif /*! * XXH_REROLL: * Whether to reroll XXH32_finalize, and XXH64_finalize, * instead of using an unrolled jump table/if statement loop. * * This is automatically defined on -Os/-Oz on GCC and Clang. */ #ifndef XXH_REROLL # if defined(__OPTIMIZE_SIZE__) # define XXH_REROLL 1 # else # define XXH_REROLL 0 # endif #endif /* ************************************* * Includes & Memory related functions ***************************************/ /*! * Modify the local functions below should you wish to use * different memory routines for malloc() and free() */ #include static void* XXH_malloc(size_t s) { return malloc(s); } static void XXH_free(void* p) { free(p); } /*! 
and for memcpy() */ #include static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); } #include /* ULLONG_MAX */ /* ************************************* * Compiler Specific Options ***************************************/ #ifdef _MSC_VER /* Visual Studio warning fix */ # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ #endif #if XXH_NO_INLINE_HINTS /* disable inlining hints */ # if defined(__GNUC__) # define XXH_FORCE_INLINE static __attribute__((unused)) # else # define XXH_FORCE_INLINE static # endif # define XXH_NO_INLINE static /* enable inlining hints */ #elif defined(_MSC_VER) /* Visual Studio */ # define XXH_FORCE_INLINE static __forceinline # define XXH_NO_INLINE static __declspec(noinline) #elif defined(__GNUC__) # define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused)) # define XXH_NO_INLINE static __attribute__((noinline)) #elif defined (__cplusplus) \ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ # define XXH_FORCE_INLINE static inline # define XXH_NO_INLINE static #else # define XXH_FORCE_INLINE static # define XXH_NO_INLINE static #endif /* ************************************* * Debug ***************************************/ /* * XXH_DEBUGLEVEL is expected to be defined externally, typically via the * compiler's command line options. The value must be a number. 
*/ #ifndef XXH_DEBUGLEVEL # ifdef DEBUGLEVEL /* backwards compat */ # define XXH_DEBUGLEVEL DEBUGLEVEL # else # define XXH_DEBUGLEVEL 0 # endif #endif #if (XXH_DEBUGLEVEL>=1) # include /* note: can still be disabled with NDEBUG */ # define XXH_ASSERT(c) assert(c) #else # define XXH_ASSERT(c) ((void)0) #endif /* note: use after variable declarations */ #define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0) /* ************************************* * Basic Types ***************************************/ #if !defined (__VMS) \ && (defined (__cplusplus) \ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) # include typedef uint8_t xxh_u8; #else typedef unsigned char xxh_u8; #endif typedef XXH32_hash_t xxh_u32; #ifdef XXH_OLD_NAMES # define BYTE xxh_u8 # define U8 xxh_u8 # define U32 xxh_u32 #endif /* *** Memory access *** */ #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) /* * Manual byteshift. Best for old compilers which don't inline memcpy. * We actually directly use XXH_readLE32 and XXH_readBE32. */ #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) /* * Force direct memory access. Only works on CPU which support unaligned memory * access in hardware. */ static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) /* * __pack instructions are safer but compiler specific, hence potentially * problematic for some compilers. * * Currently only defined for GCC and ICC. */ #ifdef XXH_OLD_NAMES typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; #endif static xxh_u32 XXH_read32(const void* ptr) { typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign; return ((const xxh_unalign*)ptr)->u32; } #else /* * Portable and safe solution. Generally efficient. 
* see: https://stackoverflow.com/a/32095106/646947
 */
/* Reads 4 bytes in native byte order by copying them into a local value. */
static xxh_u32 XXH_read32(const void* memPtr)
{
    xxh_u32 val;
    memcpy(&val, memPtr, sizeof(val));
    return val;
}

#endif   /* XXH_FORCE_MEMORY_ACCESS */


/* ***   Endianness   *** */
typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;

/*!
 * XXH_CPU_LITTLE_ENDIAN:
 * Defined to 1 if the target is little endian, or 0 if it is big endian.
 * It can be defined externally, for example on the compiler command line.
 *
 * If it is not defined, a runtime check (which is usually constant folded)
 * is used instead.
 */
#ifndef XXH_CPU_LITTLE_ENDIAN
/*
 * Try to detect endianness automatically, to avoid the nonstandard behavior
 * in `XXH_isLittleEndian()`
 */
#  if defined(_WIN32) /* Windows is always little endian */ \
     || defined(__LITTLE_ENDIAN__) \
     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#    define XXH_CPU_LITTLE_ENDIAN 1
#  elif defined(__BIG_ENDIAN__) \
     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#    define XXH_CPU_LITTLE_ENDIAN 0
#  else
/*
 * runtime test, presumed to simplify to a constant by compiler
 *
 * Returns the first byte of a 32-bit value holding 1: 1 when the least
 * significant byte is stored first, 0 otherwise.
 */
static int XXH_isLittleEndian(void)
{
    /*
     * Portable and well-defined behavior.
     * Don't use static: it is detrimental to performance.
     */
    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
    return one.c[0];
}
#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
#  endif
#endif




/* ****************************************
*  Compiler-specific Functions and Macros
******************************************/
/* major*100 + minor; tested with `>= 403` below to select the bswap builtin */
#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)

/* XXH_HAS_BUILTIN(x): __has_builtin(x) where available, otherwise 0. */
#ifdef __has_builtin
#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
#else
#  define XXH_HAS_BUILTIN(x) 0
#endif

/* XXH_rotl32 / XXH_rotl64: rotate left by r bits. */
#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
                               && XXH_HAS_BUILTIN(__builtin_rotateleft64)
#  define XXH_rotl32 __builtin_rotateleft32
#  define XXH_rotl64 __builtin_rotateleft64
/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
#elif defined(_MSC_VER)
#  define XXH_rotl32(x,r) _rotl(x,r)
#  define XXH_rotl64(x,r) _rotl64(x,r)
#else
   /* generic shift/or form; r equal to 0 would shift by the full width */
#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
#endif

/* XXH_swap32: reverses the byte order of a 32-bit value. */
#if defined(_MSC_VER)     /* Visual Studio */
#  define XXH_swap32 _byteswap_ulong
#elif XXH_GCC_VERSION >= 403
#  define XXH_swap32 __builtin_bswap32
#else
static xxh_u32 XXH_swap32 (xxh_u32 x)
{
    return  ((x << 24) & 0xff000000 ) |
            ((x <<  8) & 0x00ff0000 ) |
            ((x >>  8) & 0x0000ff00 ) |
            ((x >> 24) & 0x000000ff );
}
#endif


/* ***************************
*  Memory reads
*****************************/
/* Tells the read helpers whether the pointer may be dereferenced directly. */
typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;

/*
 * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
 *
 * This is ideal for older compilers which don't inline memcpy.
*/ #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) { const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; return bytePtr[0] | ((xxh_u32)bytePtr[1] << 8) | ((xxh_u32)bytePtr[2] << 16) | ((xxh_u32)bytePtr[3] << 24); } XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) { const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; return bytePtr[3] | ((xxh_u32)bytePtr[2] << 8) | ((xxh_u32)bytePtr[1] << 16) | ((xxh_u32)bytePtr[0] << 24); } #else XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) { return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); } static xxh_u32 XXH_readBE32(const void* ptr) { return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); } #endif XXH_FORCE_INLINE xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) { if (align==XXH_unaligned) { return XXH_readLE32(ptr); } else { return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); } } /* ************************************* * Misc ***************************************/ XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } /* ******************************************************************* * 32-bit hash functions *********************************************************************/ static const xxh_u32 XXH_PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ static const xxh_u32 XXH_PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ static const xxh_u32 XXH_PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ static const xxh_u32 XXH_PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ static const xxh_u32 XXH_PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */ #ifdef XXH_OLD_NAMES # define PRIME32_1 XXH_PRIME32_1 # define PRIME32_2 XXH_PRIME32_2 # define PRIME32_3 XXH_PRIME32_3 # define PRIME32_4 XXH_PRIME32_4 # define PRIME32_5 
XXH_PRIME32_5 #endif static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) { acc += input * XXH_PRIME32_2; acc = XXH_rotl32(acc, 13); acc *= XXH_PRIME32_1; #if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE) /* * UGLY HACK: * This inline assembly hack forces acc into a normal register. This is the * only thing that prevents GCC and Clang from autovectorizing the XXH32 * loop (pragmas and attributes don't work for some resason) without globally * disabling SSE4.1. * * The reason we want to avoid vectorization is because despite working on * 4 integers at a time, there are multiple factors slowing XXH32 down on * SSE4: * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on * newer chips!) making it slightly slower to multiply four integers at * once compared to four integers independently. Even when pmulld was * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE * just to multiply unless doing a long operation. * * - Four instructions are required to rotate, * movqda tmp, v // not required with VEX encoding * pslld tmp, 13 // tmp <<= 13 * psrld v, 19 // x >>= 19 * por v, tmp // x |= tmp * compared to one for scalar: * roll v, 13 // reliably fast across the board * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason * * - Instruction level parallelism is actually more beneficial here because * the SIMD actually serializes this operation: While v1 is rotating, v2 * can load data, while v3 can multiply. SSE forces them to operate * together. * * How this hack works: * __asm__("" // Declare an assembly block but don't declare any instructions * : // However, as an Input/Output Operand, * "+r" // constrain a read/write operand (+) as a general purpose register (r). 
* (acc) // and set acc as the operand * ); * * Because of the 'r', the compiler has promised that seed will be in a * general purpose register and the '+' says that it will be 'read/write', * so it has to assume it has changed. It is like volatile without all the * loads and stores. * * Since the argument has to be in a normal register (not an SSE register), * each time XXH32_round is called, it is impossible to vectorize. */ __asm__("" : "+r" (acc)); #endif return acc; } /* mix all bits */ static xxh_u32 XXH32_avalanche(xxh_u32 h32) { h32 ^= h32 >> 15; h32 *= XXH_PRIME32_2; h32 ^= h32 >> 13; h32 *= XXH_PRIME32_3; h32 ^= h32 >> 16; return(h32); } #define XXH_get32bits(p) XXH_readLE32_align(p, align) static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) { #define XXH_PROCESS1 do { \ h32 += (*ptr++) * XXH_PRIME32_5; \ h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \ } while (0) #define XXH_PROCESS4 do { \ h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \ ptr += 4; \ h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \ } while (0) /* Compact rerolled version */ if (XXH_REROLL) { len &= 15; while (len >= 4) { XXH_PROCESS4; len -= 4; } while (len > 0) { XXH_PROCESS1; --len; } return XXH32_avalanche(h32); } else { switch(len&15) /* or switch(bEnd - p) */ { case 12: XXH_PROCESS4; /* fallthrough */ case 8: XXH_PROCESS4; /* fallthrough */ case 4: XXH_PROCESS4; return XXH32_avalanche(h32); case 13: XXH_PROCESS4; /* fallthrough */ case 9: XXH_PROCESS4; /* fallthrough */ case 5: XXH_PROCESS4; XXH_PROCESS1; return XXH32_avalanche(h32); case 14: XXH_PROCESS4; /* fallthrough */ case 10: XXH_PROCESS4; /* fallthrough */ case 6: XXH_PROCESS4; XXH_PROCESS1; XXH_PROCESS1; return XXH32_avalanche(h32); case 15: XXH_PROCESS4; /* fallthrough */ case 11: XXH_PROCESS4; /* fallthrough */ case 7: XXH_PROCESS4; /* fallthrough */ case 3: XXH_PROCESS1; /* fallthrough */ case 2: XXH_PROCESS1; /* fallthrough */ case 1: XXH_PROCESS1; /* fallthrough */ case 0: return 
XXH32_avalanche(h32); } XXH_ASSERT(0); return h32; /* reaching this point is deemed impossible */ } } #ifdef XXH_OLD_NAMES # define PROCESS1 XXH_PROCESS1 # define PROCESS4 XXH_PROCESS4 #else # undef XXH_PROCESS1 # undef XXH_PROCESS4 #endif XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) { const xxh_u8* bEnd = input + len; xxh_u32 h32; #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) if (input==NULL) { len=0; bEnd=input=(const xxh_u8*)(size_t)16; } #endif if (len>=16) { const xxh_u8* const limit = bEnd - 15; xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; xxh_u32 v2 = seed + XXH_PRIME32_2; xxh_u32 v3 = seed + 0; xxh_u32 v4 = seed - XXH_PRIME32_1; do { v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; } while (input < limit); h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); } else { h32 = seed + XXH_PRIME32_5; } h32 += (xxh_u32)len; return XXH32_finalize(h32, input, len&15, align); } XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) { #if 0 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ XXH32_state_t state; XXH32_reset(&state, seed); XXH32_update(&state, (const xxh_u8*)input, len); return XXH32_digest(&state); #else if (XXH_FORCE_ALIGN_CHECK) { if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); } } return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); #endif } /******* Hash streaming *******/ XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) { return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); } XXH_PUBLIC_API XXH_errorcode 
XXH32_freeState(XXH32_state_t* statePtr) { XXH_free(statePtr); return XXH_OK; } XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) { memcpy(dstState, srcState, sizeof(*dstState)); } XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) { XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ memset(&state, 0, sizeof(state)); state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; state.v2 = seed + XXH_PRIME32_2; state.v3 = seed + 0; state.v4 = seed - XXH_PRIME32_1; /* do not write into reserved, planned to be removed in a future version */ memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); return XXH_OK; } XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t* state, const void* input, size_t len) { if (input==NULL) #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) return XXH_OK; #else return XXH_ERROR; #endif { const xxh_u8* p = (const xxh_u8*)input; const xxh_u8* const bEnd = p + len; state->total_len_32 += (XXH32_hash_t)len; state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); if (state->memsize + len < 16) { /* fill in tmp buffer */ XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); state->memsize += (XXH32_hash_t)len; return XXH_OK; } if (state->memsize) { /* some data left from previous update */ XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); { const xxh_u32* p32 = state->mem32; state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); } p += 16-state->memsize; state->memsize = 0; } if (p <= bEnd-16) { const xxh_u8* const limit = bEnd - 16; xxh_u32 v1 = state->v1; xxh_u32 v2 = state->v2; xxh_u32 v3 = state->v3; xxh_u32 v4 = state->v4; do { v1 = 
XXH32_round(v1, XXH_readLE32(p)); p+=4; v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; } while (p<=limit); state->v1 = v1; state->v2 = v2; state->v3 = v3; state->v4 = v4; } if (p < bEnd) { XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); state->memsize = (unsigned)(bEnd-p); } } return XXH_OK; } XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) { xxh_u32 h32; if (state->large_len) { h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); } else { h32 = state->v3 /* == seed */ + XXH_PRIME32_5; } h32 += state->total_len_32; return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); } /******* Canonical representation *******/ /* * The default return values from XXH functions are unsigned 32 and 64 bit * integers. * * The canonical representation uses big endian convention, the same convention * as human-readable numbers (large digits first). * * This way, hash values can be written into a file or buffer, remaining * comparable across different systems. * * The following functions allow transformation of hash values to and from their * canonical format. */ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) { XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); memcpy(dst, &hash, sizeof(*dst)); } XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) { return XXH_readBE32(src); } #ifndef XXH_NO_LONG_LONG /* ******************************************************************* * 64-bit hash functions *********************************************************************/ /******* Memory access *******/ typedef XXH64_hash_t xxh_u64; #ifdef XXH_OLD_NAMES # define U64 xxh_u64 #endif /*! * XXH_REROLL_XXH64: * Whether to reroll the XXH64_finalize() loop. 
*
 * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a
 * performance gain on 64-bit hosts, as only one jump is required.
 *
 * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit
 * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial
 * to unroll. The code becomes ridiculously large (the largest function in the
 * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is
 * also slightly faster because it fits into cache better and is more likely
 * to be inlined by the compiler.
 *
 * If XXH_REROLL is defined, this is ignored and the loop is always rerolled.
 *
 * Default: 1 on ILP32 targets, on targets not in the 64-bit list below,
 * or when SIZE_MAX is undefined or smaller than ULLONG_MAX; 0 otherwise.
 */
#ifndef XXH_REROLL_XXH64
#  if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
   || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \
     || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \
     || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \
     || defined(__mips64__) || defined(__mips64)) /* mips64 */ \
   || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */
#    define XXH_REROLL_XXH64 1
#  else
#    define XXH_REROLL_XXH64 0
#  endif
#endif /* !defined(XXH_REROLL_XXH64) */

/* 64-bit counterparts of the XXH_read32() variants, selected by the same macro. */
#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
/*
 * Manual byteshift. Best for old compilers which don't inline memcpy.
 * We actually directly use XXH_readLE64 and XXH_readBE64.
 */
#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))

/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; }

#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))

/*
 * __pack instructions are safer, but compiler specific, hence potentially
 * problematic for some compilers.
 *
 * Currently only defined for GCC and ICC.
*/ #ifdef XXH_OLD_NAMES typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; #endif static xxh_u64 XXH_read64(const void* ptr) { typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64; return ((const xxh_unalign64*)ptr)->u64; } #else /* * Portable and safe solution. Generally efficient. * see: https://stackoverflow.com/a/32095106/646947 */ static xxh_u64 XXH_read64(const void* memPtr) { xxh_u64 val; memcpy(&val, memPtr, sizeof(val)); return val; } #endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ #if defined(_MSC_VER) /* Visual Studio */ # define XXH_swap64 _byteswap_uint64 #elif XXH_GCC_VERSION >= 403 # define XXH_swap64 __builtin_bswap64 #else static xxh_u64 XXH_swap64 (xxh_u64 x) { return ((x << 56) & 0xff00000000000000ULL) | ((x << 40) & 0x00ff000000000000ULL) | ((x << 24) & 0x0000ff0000000000ULL) | ((x << 8) & 0x000000ff00000000ULL) | ((x >> 8) & 0x00000000ff000000ULL) | ((x >> 24) & 0x0000000000ff0000ULL) | ((x >> 40) & 0x000000000000ff00ULL) | ((x >> 56) & 0x00000000000000ffULL); } #endif /* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) { const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; return bytePtr[0] | ((xxh_u64)bytePtr[1] << 8) | ((xxh_u64)bytePtr[2] << 16) | ((xxh_u64)bytePtr[3] << 24) | ((xxh_u64)bytePtr[4] << 32) | ((xxh_u64)bytePtr[5] << 40) | ((xxh_u64)bytePtr[6] << 48) | ((xxh_u64)bytePtr[7] << 56); } XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) { const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; return bytePtr[7] | ((xxh_u64)bytePtr[6] << 8) | ((xxh_u64)bytePtr[5] << 16) | ((xxh_u64)bytePtr[4] << 24) | ((xxh_u64)bytePtr[3] << 32) | ((xxh_u64)bytePtr[2] << 40) | ((xxh_u64)bytePtr[1] << 48) | ((xxh_u64)bytePtr[0] << 56); } #else XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) { return XXH_CPU_LITTLE_ENDIAN ? 
XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); } static xxh_u64 XXH_readBE64(const void* ptr) { return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); } #endif XXH_FORCE_INLINE xxh_u64 XXH_readLE64_align(const void* ptr, XXH_alignment align) { if (align==XXH_unaligned) return XXH_readLE64(ptr); else return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); } /******* xxh64 *******/ static const xxh_u64 XXH_PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ static const xxh_u64 XXH_PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ static const xxh_u64 XXH_PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ static const xxh_u64 XXH_PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ static const xxh_u64 XXH_PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ #ifdef XXH_OLD_NAMES # define PRIME64_1 XXH_PRIME64_1 # define PRIME64_2 XXH_PRIME64_2 # define PRIME64_3 XXH_PRIME64_3 # define PRIME64_4 XXH_PRIME64_4 # define PRIME64_5 XXH_PRIME64_5 #endif static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) { acc += input * XXH_PRIME64_2; acc = XXH_rotl64(acc, 31); acc *= XXH_PRIME64_1; return acc; } static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) { val = XXH64_round(0, val); acc ^= val; acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; return acc; } static xxh_u64 XXH64_avalanche(xxh_u64 h64) { h64 ^= h64 >> 33; h64 *= XXH_PRIME64_2; h64 ^= h64 >> 29; h64 *= XXH_PRIME64_3; h64 ^= h64 >> 32; return h64; } #define XXH_get64bits(p) XXH_readLE64_align(p, align) static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) { #define XXH_PROCESS1_64 do { \ h64 ^= (*ptr++) * XXH_PRIME64_5; \ h64 = XXH_rotl64(h64, 
11) * XXH_PRIME64_1; \ } while (0) #define XXH_PROCESS4_64 do { \ h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; \ ptr += 4; \ h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \ } while (0) #define XXH_PROCESS8_64 do { \ xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ ptr += 8; \ h64 ^= k1; \ h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4; \ } while (0) /* Rerolled version for 32-bit targets is faster and much smaller. */ if (XXH_REROLL || XXH_REROLL_XXH64) { len &= 31; while (len >= 8) { XXH_PROCESS8_64; len -= 8; } if (len >= 4) { XXH_PROCESS4_64; len -= 4; } while (len > 0) { XXH_PROCESS1_64; --len; } return XXH64_avalanche(h64); } else { switch(len & 31) { case 24: XXH_PROCESS8_64; /* fallthrough */ case 16: XXH_PROCESS8_64; /* fallthrough */ case 8: XXH_PROCESS8_64; return XXH64_avalanche(h64); case 28: XXH_PROCESS8_64; /* fallthrough */ case 20: XXH_PROCESS8_64; /* fallthrough */ case 12: XXH_PROCESS8_64; /* fallthrough */ case 4: XXH_PROCESS4_64; return XXH64_avalanche(h64); case 25: XXH_PROCESS8_64; /* fallthrough */ case 17: XXH_PROCESS8_64; /* fallthrough */ case 9: XXH_PROCESS8_64; XXH_PROCESS1_64; return XXH64_avalanche(h64); case 29: XXH_PROCESS8_64; /* fallthrough */ case 21: XXH_PROCESS8_64; /* fallthrough */ case 13: XXH_PROCESS8_64; /* fallthrough */ case 5: XXH_PROCESS4_64; XXH_PROCESS1_64; return XXH64_avalanche(h64); case 26: XXH_PROCESS8_64; /* fallthrough */ case 18: XXH_PROCESS8_64; /* fallthrough */ case 10: XXH_PROCESS8_64; XXH_PROCESS1_64; XXH_PROCESS1_64; return XXH64_avalanche(h64); case 30: XXH_PROCESS8_64; /* fallthrough */ case 22: XXH_PROCESS8_64; /* fallthrough */ case 14: XXH_PROCESS8_64; /* fallthrough */ case 6: XXH_PROCESS4_64; XXH_PROCESS1_64; XXH_PROCESS1_64; return XXH64_avalanche(h64); case 27: XXH_PROCESS8_64; /* fallthrough */ case 19: XXH_PROCESS8_64; /* fallthrough */ case 11: XXH_PROCESS8_64; XXH_PROCESS1_64; XXH_PROCESS1_64; XXH_PROCESS1_64; return XXH64_avalanche(h64); case 31: 
XXH_PROCESS8_64; /* fallthrough */ case 23: XXH_PROCESS8_64; /* fallthrough */ case 15: XXH_PROCESS8_64; /* fallthrough */ case 7: XXH_PROCESS4_64; /* fallthrough */ case 3: XXH_PROCESS1_64; /* fallthrough */ case 2: XXH_PROCESS1_64; /* fallthrough */ case 1: XXH_PROCESS1_64; /* fallthrough */ case 0: return XXH64_avalanche(h64); } } /* impossible to reach */ XXH_ASSERT(0); return 0; /* unreachable, but some compilers complain without it */ } #ifdef XXH_OLD_NAMES # define PROCESS1_64 XXH_PROCESS1_64 # define PROCESS4_64 XXH_PROCESS4_64 # define PROCESS8_64 XXH_PROCESS8_64 #else # undef XXH_PROCESS1_64 # undef XXH_PROCESS4_64 # undef XXH_PROCESS8_64 #endif XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) { const xxh_u8* bEnd = input + len; xxh_u64 h64; #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) if (input==NULL) { len=0; bEnd=input=(const xxh_u8*)(size_t)32; } #endif if (len>=32) { const xxh_u8* const limit = bEnd - 32; xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; xxh_u64 v2 = seed + XXH_PRIME64_2; xxh_u64 v3 = seed + 0; xxh_u64 v4 = seed - XXH_PRIME64_1; do { v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; } while (input<=limit); h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); h64 = XXH64_mergeRound(h64, v1); h64 = XXH64_mergeRound(h64, v2); h64 = XXH64_mergeRound(h64, v3); h64 = XXH64_mergeRound(h64, v4); } else { h64 = seed + XXH_PRIME64_5; } h64 += (xxh_u64) len; return XXH64_finalize(h64, input, len, align); } XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) { #if 0 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ XXH64_state_t state; XXH64_reset(&state, seed); 
XXH64_update(&state, (const xxh_u8*)input, len); return XXH64_digest(&state); #else if (XXH_FORCE_ALIGN_CHECK) { if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); } } return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); #endif } /******* Hash Streaming *******/ XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) { return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); } XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) { XXH_free(statePtr); return XXH_OK; } XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) { memcpy(dstState, srcState, sizeof(*dstState)); } XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) { XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ memset(&state, 0, sizeof(state)); state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; state.v2 = seed + XXH_PRIME64_2; state.v3 = seed + 0; state.v4 = seed - XXH_PRIME64_1; /* do not write into reserved64, might be removed in a future version */ memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); return XXH_OK; } XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state, const void* input, size_t len) { if (input==NULL) #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) return XXH_OK; #else return XXH_ERROR; #endif { const xxh_u8* p = (const xxh_u8*)input; const xxh_u8* const bEnd = p + len; state->total_len += len; if (state->memsize + len < 32) { /* fill in tmp buffer */ XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); state->memsize += (xxh_u32)len; return XXH_OK; } if (state->memsize) { /* tmp buffer is full */ XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); 
state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); p += 32-state->memsize; state->memsize = 0; } if (p+32 <= bEnd) { const xxh_u8* const limit = bEnd - 32; xxh_u64 v1 = state->v1; xxh_u64 v2 = state->v2; xxh_u64 v3 = state->v3; xxh_u64 v4 = state->v4; do { v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; } while (p<=limit); state->v1 = v1; state->v2 = v2; state->v3 = v3; state->v4 = v4; } if (p < bEnd) { XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); state->memsize = (unsigned)(bEnd-p); } } return XXH_OK; } XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) { xxh_u64 h64; if (state->total_len >= 32) { xxh_u64 const v1 = state->v1; xxh_u64 const v2 = state->v2; xxh_u64 const v3 = state->v3; xxh_u64 const v4 = state->v4; h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); h64 = XXH64_mergeRound(h64, v1); h64 = XXH64_mergeRound(h64, v2); h64 = XXH64_mergeRound(h64, v3); h64 = XXH64_mergeRound(h64, v4); } else { h64 = state->v3 /*seed*/ + XXH_PRIME64_5; } h64 += (xxh_u64) state->total_len; return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); } /******* Canonical representation *******/ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) { XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); memcpy(dst, &hash, sizeof(*dst)); } XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) { return XXH_readBE64(src); } /* ********************************************************************* * XXH3 * New generation hash designed for speed on small keys and vectorization 
************************************************************************ */ /* === Compiler specifics === */ #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ # define XXH_RESTRICT restrict #else /* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ # define XXH_RESTRICT /* disable */ #endif #if (defined(__GNUC__) && (__GNUC__ >= 3)) \ || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ || defined(__clang__) # define XXH_likely(x) __builtin_expect(x, 1) # define XXH_unlikely(x) __builtin_expect(x, 0) #else # define XXH_likely(x) (x) # define XXH_unlikely(x) (x) #endif #if defined(__GNUC__) # if defined(__AVX2__) # include # elif defined(__SSE2__) # include # elif defined(__ARM_NEON__) || defined(__ARM_NEON) # define inline __inline__ /* circumvent a clang bug */ # include # undef inline # endif #elif defined(_MSC_VER) # include #endif /* * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while * remaining a true 64-bit/128-bit hash function. * * This is done by prioritizing a subset of 64-bit operations that can be * emulated without too many steps on the average 32-bit machine. * * For example, these two lines seem similar, and run equally fast on 64-bit: * * xxh_u64 x; * x ^= (x >> 47); // good * x ^= (x >> 13); // bad * * However, to a 32-bit machine, there is a major difference. * * x ^= (x >> 47) looks like this: * * x.lo ^= (x.hi >> (47 - 32)); * * while x ^= (x >> 13) looks like this: * * // note: funnel shifts are not usually cheap. * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); * x.hi ^= (x.hi >> 13); * * The first one is significantly faster than the second, simply because the * shift is larger than 32. This means: * - All the bits we need are in the upper 32 bits, so we can ignore the lower * 32 bits in the shift. * - The shift result will always fit in the lower 32 bits, and therefore, * we can ignore the upper 32 bits in the xor. 
* * Thanks to this optimization, XXH3 only requires these features to be efficient: * * - Usable unaligned access * - A 32-bit or 64-bit ALU * - If 32-bit, a decent ADC instruction * - A 32 or 64-bit multiply with a 64-bit result * - For the 128-bit variant, a decent byteswap helps short inputs. * * The first two are already required by XXH32, and almost all 32-bit and 64-bit * platforms which can run XXH32 can run XXH3 efficiently. * * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one * notable exception. * * First of all, Thumb-1 lacks support for the UMULL instruction which * performs the important long multiply. This means numerous __aeabi_lmul * calls. * * Second of all, the 8 functional registers are just not enough. * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need * Lo registers, and this shuffling results in thousands more MOVs than A32. * * A32 and T32 don't have this limitation. They can access all 14 registers, * do a 32->64 multiply with UMULL, and the flexible operand allowing free * shifts is helpful, too. * * Therefore, we do a quick sanity check. * * If compiling Thumb-1 for a target which supports ARM instructions, we will * emit a warning, as it is not a "sane" platform to compile for. * * Usually, if this happens, it is because of an accident and you probably need * to specify -march, as you likely meant to compile for a newer architecture. * * Credit: large sections of the vectorial and asm source code paths * have been contributed by @easyaspi314 */ #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) # warning "XXH3 is highly inefficient without ARM or Thumb-2." 
#endif /* ========================================== * Vectorization detection * ========================================== */ #define XXH_SCALAR 0 /* Portable scalar version */ #define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ #define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ #define XXH_AVX512 3 /* AVX512 for Skylake and Icelake */ #define XXH_NEON 4 /* NEON for most ARMv7-A and all AArch64 */ #define XXH_VSX 5 /* VSX and ZVector for POWER8/z13 */ #ifndef XXH_VECTOR /* can be defined on command line */ # if defined(__AVX512F__) # define XXH_VECTOR XXH_AVX512 # elif defined(__AVX2__) # define XXH_VECTOR XXH_AVX2 # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) # define XXH_VECTOR XXH_SSE2 # elif defined(__GNUC__) /* msvc support maybe later */ \ && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) # define XXH_VECTOR XXH_NEON # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ || (defined(__s390x__) && defined(__VEC__)) \ && defined(__GNUC__) /* TODO: IBM XL */ # define XXH_VECTOR XXH_VSX # else # define XXH_VECTOR XXH_SCALAR # endif #endif /* * Controls the alignment of the accumulator, * for compatibility with aligned vector loads, which are usually faster. 
*/ #ifndef XXH_ACC_ALIGN # if defined(XXH_X86DISPATCH) # define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ # elif XXH_VECTOR == XXH_SCALAR /* scalar */ # define XXH_ACC_ALIGN 8 # elif XXH_VECTOR == XXH_SSE2 /* sse2 */ # define XXH_ACC_ALIGN 16 # elif XXH_VECTOR == XXH_AVX2 /* avx2 */ # define XXH_ACC_ALIGN 32 # elif XXH_VECTOR == XXH_NEON /* neon */ # define XXH_ACC_ALIGN 16 # elif XXH_VECTOR == XXH_VSX /* vsx */ # define XXH_ACC_ALIGN 16 # elif XXH_VECTOR == XXH_AVX512 /* avx512 */ # define XXH_ACC_ALIGN 64 # endif #endif #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 # define XXH_SEC_ALIGN XXH_ACC_ALIGN #else # define XXH_SEC_ALIGN 8 #endif /* * UGLY HACK: * GCC usually generates the best code with -O3 for xxHash. * * However, when targeting AVX2, it is overzealous in its unrolling resulting * in code roughly 3/4 the speed of Clang. * * There are other issues, such as GCC splitting _mm256_loadu_si256 into * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which * only applies to Sandy and Ivy Bridge... which don't even support AVX2. * * That is why when compiling the AVX2 version, it is recommended to use either * -O2 -mavx2 -march=haswell * or * -O2 -mavx2 -mno-avx256-split-unaligned-load * for decent performance, or to use Clang instead. * * Fortunately, we can control the first one with a pragma that forces GCC into * -O2, but the other one we can't control without "failed to inline always * inline function due to target mismatch" warnings. */ #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ # pragma GCC push_options # pragma GCC optimize("-O2") #endif #if XXH_VECTOR == XXH_NEON /* * NEON's setup for vmlal_u32 is a little more complicated than it is on * SSE2, AVX2, and VSX. 
* * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. * * To do the same operation, the 128-bit 'Q' register needs to be split into * two 64-bit 'D' registers, performing this operation:: * * [ a | b ] * | '---------. .--------' | * | x | * | .---------' '--------. | * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] * * Due to significant changes in aarch64, the fastest method for aarch64 is * completely different than the fastest method for ARMv7-A. * * ARMv7-A treats D registers as unions overlaying Q registers, so modifying * D11 will modify the high half of Q5. This is similar to how modifying AH * will only affect bits 8-15 of AX on x86. * * VZIP takes two registers, and puts even lanes in one register and odd lanes * in the other. * * On ARMv7-A, this strangely modifies both parameters in place instead of * taking the usual 3-operand form. * * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the * lower and upper halves of the Q register to end up with the high and low * halves where we want - all in one instruction. * * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } * * Unfortunately we need inline assembly for this: Instructions modifying two * registers at once is not possible in GCC or Clang's IR, and they have to * create a copy. * * aarch64 requires a different approach. * * In order to make it easier to write a decent compiler for aarch64, many * quirks were removed, such as conditional execution. * * NEON was also affected by this. * * aarch64 cannot access the high bits of a Q-form register, and writes to a * D-form register zero the high bits, similar to how writes to W-form scalar * registers (or DWORD registers on x86_64) work. 
* * The formerly free vget_high intrinsics now require a vext (with a few * exceptions) * * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one * operand. * * The equivalent of the VZIP.32 on the lower and upper halves would be this * mess: * * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } * * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): * * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); * * This is available on ARMv7-A, but is less efficient than a single VZIP.32. */ /* * Function-like macro: * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) * { * outLo = (uint32x2_t)(in & 0xFFFFFFFF); * outHi = (uint32x2_t)(in >> 32); * in = UNDEFINED; * } */ # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ && defined(__GNUC__) \ && !defined(__aarch64__) && !defined(__arm64__) # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ do { \ /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ } while (0) # else # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ do { \ (outLo) = vmovn_u64 (in); \ (outHi) = vshrn_n_u64 ((in), 32); \ } while (0) # endif #endif /* XXH_VECTOR == XXH_NEON */ /* * VSX and Z Vector helpers. * * This is very messy, and any pull requests to clean this up are welcome. 
* * There are a lot of problems with supporting VSX and s390x, due to * inconsistent intrinsics, spotty coverage, and multiple endiannesses. */ #if XXH_VECTOR == XXH_VSX # if defined(__s390x__) # include # else /* gcc's altivec.h can have the unwanted consequence to unconditionally * #define bool, vector, and pixel keywords, * with bad consequences for programs already using these keywords for other purposes. * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined. * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler, * but it seems that, in some cases, it isn't. * Force the build macro to be defined, so that keywords are not altered. */ # if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__) # define __APPLE_ALTIVEC__ # endif # include # endif typedef __vector unsigned long long xxh_u64x2; typedef __vector unsigned char xxh_u8x16; typedef __vector unsigned xxh_u32x4; # ifndef XXH_VSX_BE # if defined(__BIG_ENDIAN__) \ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) # define XXH_VSX_BE 1 # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ # warning "-maltivec=be is not recommended. Please use native endianness." # define XXH_VSX_BE 1 # else # define XXH_VSX_BE 0 # endif # endif /* !defined(XXH_VSX_BE) */ # if XXH_VSX_BE /* A wrapper for POWER9's vec_revb. */ # if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) # define XXH_vec_revb vec_revb # else XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) { xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; return vec_perm(val, val, vByteSwap); } # endif # endif /* XXH_VSX_BE */ /* * Performs an unaligned load and byte swaps it on big endian. 
*/ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) { xxh_u64x2 ret; memcpy(&ret, ptr, sizeof(xxh_u64x2)); # if XXH_VSX_BE ret = XXH_vec_revb(ret); # endif return ret; } /* * vec_mulo and vec_mule are very problematic intrinsics on PowerPC * * These intrinsics weren't added until GCC 8, despite existing for a while, * and they are endian dependent. Also, their meaning swap depending on version. * */ # if defined(__s390x__) /* s390x is always big endian, no issue on this platform */ # define XXH_vec_mulo vec_mulo # define XXH_vec_mule vec_mule # elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ # define XXH_vec_mulo __builtin_altivec_vmulouw # define XXH_vec_mule __builtin_altivec_vmuleuw # else /* gcc needs inline assembly */ /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) { xxh_u64x2 result; __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); return result; } XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) { xxh_u64x2 result; __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); return result; } # endif /* XXH_vec_mulo, XXH_vec_mule */ #endif /* XXH_VECTOR == XXH_VSX */ /* prefetch * can be disabled, by declaring XXH_NO_PREFETCH build macro */ #if defined(XXH_NO_PREFETCH) # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ #else # if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ # include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) # define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) # else # define 
XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ # endif #endif /* XXH_NO_PREFETCH */ /* ========================================== * XXH3 default settings * ========================================== */ #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) # error "default keyset is not large enough" #endif /* Pseudorandom secret taken directly from FARSH */ XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, }; #ifdef XXH_OLD_NAMES # define kSecret XXH3_kSecret #endif /* * Calculates a 32-bit to 64-bit long multiply. * * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't * need to (but it shouldn't need to anyways, it is about 7 instructions to do * a 64x64 multiply...). 
Since we know that this will _always_ emit MULL, we * use that instead of the normal method. * * If you are compiling for platforms like Thumb-1 and don't have a better option, * you may also want to write your own long multiply routine here. * * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) * { * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); * } */ #if defined(_MSC_VER) && defined(_M_IX86) # include # define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) #else /* * Downcast + upcast is usually better than masking on older compilers like * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. * * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands * and perform a full 64x64 multiply -- entirely redundant on 32-bit. */ # define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) #endif /* * Calculates a 64->128-bit long multiply. * * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version. */ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) { /* * GCC/Clang __uint128_t method. * * On most 64-bit targets, GCC and Clang define a __uint128_t type. * This is usually the best way as it usually uses a native long 64-bit * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. * * Usually. * * Despite being a 32-bit platform, Clang (and emscripten) define this type * despite not having the arithmetic for it. This results in a laggy * compiler builtin call which calculates a full 128-bit multiply. * In that case it is best to use the portable one. 
* https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 */ #if defined(__GNUC__) && !defined(__wasm__) \ && defined(__SIZEOF_INT128__) \ || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; XXH128_hash_t r128; r128.low64 = (xxh_u64)(product); r128.high64 = (xxh_u64)(product >> 64); return r128; /* * MSVC for x64's _umul128 method. * * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); * * This compiles to single operand MUL on x64. */ #elif defined(_M_X64) || defined(_M_IA64) #ifndef _MSC_VER # pragma intrinsic(_umul128) #endif xxh_u64 product_high; xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); XXH128_hash_t r128; r128.low64 = product_low; r128.high64 = product_high; return r128; #else /* * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. * * This is a fast and simple grade school multiply, which is shown below * with base 10 arithmetic instead of base 0x100000000. * * 9 3 // D2 lhs = 93 * x 7 5 // D2 rhs = 75 * ---------- * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 * --------- * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 * --------- * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 * * The reasons for adding the products like this are: * 1. It avoids manual carry tracking. Just like how * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. * This avoids a lot of complexity. * * 2. 
It hints for, and on Clang, compiles to, the powerful UMAAL * instruction available in ARM's Digital Signal Processing extension * in 32-bit ARMv6 and later, which is shown below: * * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) * { * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); * *RdHi = (xxh_u32)(product >> 32); * } * * This instruction was designed for efficient long multiplication, and * allows this to be calculated in only 4 instructions at speeds * comparable to some 64-bit ALUs. * * 3. It isn't terrible on other platforms. Usually this will be a couple * of 32-bit ADD/ADCs. */ /* First calculate all of the cross products. */ xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); /* Now add the products together. These will never overflow. */ xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); XXH128_hash_t r128; r128.low64 = lower; r128.high64 = upper; return r128; #endif } /* * Does a 64-bit to 128-bit multiply, then XOR folds it. * * The reason for the separate function is to prevent passing too many structs * around by value. This will hopefully inline the multiply, but we don't force it. */ static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) { XXH128_hash_t product = XXH_mult64to128(lhs, rhs); return product.low64 ^ product.high64; } /* Seems to produce slightly better code on GCC for some reason. 
*/ XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) { XXH_ASSERT(0 <= shift && shift < 64); return v64 ^ (v64 >> shift); } /* * This is a fast avalanche stage, * suitable when input bits are already partially mixed */ static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) { h64 = XXH_xorshift64(h64, 37); h64 *= 0x165667919E3779F9ULL; h64 = XXH_xorshift64(h64, 32); return h64; } /* * This is a stronger avalanche, * inspired by Pelle Evensen's rrmxmx * preferable when input has not been previously mixed */ static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) { /* this mix is inspired by Pelle Evensen's rrmxmx */ h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); h64 *= 0x9FB21C651E98DF25ULL; h64 ^= (h64 >> 35) + len ; h64 *= 0x9FB21C651E98DF25ULL; return XXH_xorshift64(h64, 28); } /* ========================================== * Short keys * ========================================== * One of the shortcomings of XXH32 and XXH64 was that their performance was * sub-optimal on short lengths. It used an iterative algorithm which strongly * favored lengths that were a multiple of 4 or 8. * * Instead of iterating over individual inputs, we use a set of single shot * functions which piece together a range of lengths and operate in constant time. * * Additionally, the number of multiplies has been significantly reduced. This * reduces latency, especially when emulating 64-bit multiplies on 32-bit. * * Depending on the platform, this may or may not be faster than XXH32, but it * is almost guaranteed to be faster than XXH64. */ /* * At very short lengths, there isn't enough input to fully hide secrets, or use * the entire secret. * * There is also only a limited amount of mixing we can do before significantly * impacting performance. * * Therefore, we use different sections of the secret and always mix two secret * samples with an XOR. 
This should have no effect on performance on the * seedless or withSeed variants because everything _should_ be constant folded * by modern compilers. * * The XOR mixing hides individual parts of the secret and increases entropy. * * This adds an extra layer of strength for custom secrets. */ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); XXH_ASSERT(1 <= len && len <= 3); XXH_ASSERT(secret != NULL); /* * len = 1: combined = { input[0], 0x01, input[0], input[0] } * len = 2: combined = { input[1], 0x02, input[0], input[1] } * len = 3: combined = { input[2], 0x03, input[0], input[1] } */ { xxh_u8 const c1 = input[0]; xxh_u8 const c2 = input[len >> 1]; xxh_u8 const c3 = input[len - 1]; xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; return XXH64_avalanche(keyed); } } XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); XXH_ASSERT(secret != NULL); XXH_ASSERT(4 <= len && len < 8); seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; { xxh_u32 const input1 = XXH_readLE32(input); xxh_u32 const input2 = XXH_readLE32(input + len - 4); xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); xxh_u64 const keyed = input64 ^ bitflip; return XXH3_rrmxmx(keyed, len); } } XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); XXH_ASSERT(secret != NULL); XXH_ASSERT(8 <= len && len <= 16); { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; xxh_u64 const bitflip2 = 
(XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; xxh_u64 const acc = len + XXH_swap64(input_lo) + input_hi + XXH3_mul128_fold64(input_lo, input_hi); return XXH3_avalanche(acc); } } XXH_FORCE_INLINE XXH64_hash_t XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(len <= 16); { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); if (len) return XXH3_len_1to3_64b(input, len, secret, seed); return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); } } /* * DISCLAIMER: There are known *seed-dependent* multicollisions here due to * multiplication by zero, affecting hashes of lengths 17 to 240. * * However, they are very unlikely. * * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all * unseeded non-cryptographic hashes, it does not attempt to defend itself * against specially crafted inputs, only random inputs. * * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes * cancelling out the secret is taken an arbitrary number of times (addressed * in XXH3_accumulate_512), this collision is very unlikely with random inputs * and/or proper seeding: * * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a * function that is only called up to 16 times per hash with up to 240 bytes of * input. * * This is not too bad for a non-cryptographic hash function, especially with * only 64 bit outputs. * * The 128-bit variant (which trades some speed for strength) is NOT affected * by this, although it is always a good idea to use a proper seed if you care * about strength. 
*/
/*
 * XXH3_mix16B():
 * Mixes 16 input bytes with 16 secret bytes and the seed.
 *
 * The low 8 input bytes are XORed with (secret[0..8) + seed64), the high
 * 8 input bytes with (secret[8..16) - seed64); the two 64-bit lanes are then
 * multiplied to 128 bits and XOR-folded back to 64 bits.
 */
XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
{
#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
    /*
     * UGLY HACK:
     * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
     * slower code.
     *
     * By forcing seed64 into a register, we disrupt the cost model and
     * cause it to scalarize. See `XXH32_round()`
     *
     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
     * GCC 9.2, despite both emitting scalar code.
     *
     * GCC generates much better scalar code than Clang for the rest of XXH3,
     * which is why finding a more optimal codepath is an interest.
     */
    __asm__ ("" : "+r" (seed64));
#endif
    {   /* Seed-adjusted secret words. */
        xxh_u64 const key_lo = XXH_readLE64(secret)   + seed64;
        xxh_u64 const key_hi = XXH_readLE64(secret+8) - seed64;
        /* Input words keyed with the secret. */
        xxh_u64 const lane_lo = XXH_readLE64(input)   ^ key_lo;
        xxh_u64 const lane_hi = XXH_readLE64(input+8) ^ key_hi;
        return XXH3_mul128_fold64(lane_lo, lane_hi);
    }
}

/* For mid range keys, XXH3 uses a Mum-hash variant.
*/
/*
 * XXH3_len_17to128_64b():
 * Hashes an input of 17 to 128 bytes.
 *
 * Starting from len * XXH_PRIME64_1, it adds XXH3_mix16B() over 16-byte chunks
 * taken in pairs: one chunk counted from the start of the input and one
 * counted from the end, each with its own 16-byte secret offset. Longer
 * inputs enable more pairs (len > 32, > 64, > 96). The sum is avalanched.
 *
 * secretSize is only used by the assertion.
 */
XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                     XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   xxh_u64 acc = len * XXH_PRIME64_1;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc += XXH3_mix16B(input+48, secret+96, seed);
                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
                }
                acc += XXH3_mix16B(input+32, secret+64, seed);
                acc += XXH3_mix16B(input+len-48, secret+80, seed);
            }
            acc += XXH3_mix16B(input+16, secret+32, seed);
            acc += XXH3_mix16B(input+len-32, secret+48, seed);
        }
        /* Always mixed: the first and the last 16 bytes. */
        acc += XXH3_mix16B(input+0, secret+0, seed);
        acc += XXH3_mix16B(input+len-16, secret+16, seed);

        return XXH3_avalanche(acc);
    }
}

/* Largest length handled by the mid-size routine below. */
#define XXH3_MIDSIZE_MAX 240

/*
 * XXH3_len_129to240_64b():
 * Hashes an input of 129 to XXH3_MIDSIZE_MAX bytes.
 *
 * 1. The first 8 chunks of 16 bytes are mixed with secret bytes [0, 128).
 * 2. The accumulator is avalanched.
 * 3. The remaining whole 16-byte chunks reuse the secret from its start,
 *    shifted by XXH3_MIDSIZE_STARTOFFSET bytes.
 * 4. The last 16 input bytes are mixed with the 16 secret bytes that start
 *    XXH3_MIDSIZE_LASTOFFSET bytes before XXH3_SECRET_SIZE_MIN.
 *
 * secretSize is only used by the assertion.
 */
XXH_NO_INLINE XXH64_hash_t
XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    #define XXH3_MIDSIZE_STARTOFFSET 3
    #define XXH3_MIDSIZE_LASTOFFSET  17

    {   xxh_u64 acc = len * XXH_PRIME64_1;
        int const nbRounds = (int)len / 16;   /* number of whole 16-byte chunks */
        int i;
        for (i=0; i<8; i++) {
            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
        }
        acc = XXH3_avalanche(acc);
        XXH_ASSERT(nbRounds >= 8);
#if defined(__clang__)                                /* Clang */ \
    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
        /*
         * UGLY HACK:
         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
         * In everywhere else, it uses scalar code.
         *
         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
         * would still be slower than UMAAL (see XXH_mult64to128).
         *
         * Unfortunately, Clang doesn't handle the long multiplies properly and
         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
         * scalarized into an ugly mess of VMOV.32 instructions.
         *
         * This mess is difficult to avoid without turning autovectorization
         * off completely, but they are usually relatively minor and/or not
         * worth it to fix.
         *
         * This loop is the easiest to fix, as unlike XXH32, this pragma
         * _actually works_ because it is a loop vectorization instead of an
         * SLP vectorization.
         */
        #pragma clang loop vectorize(disable)
#endif
        for (i=8 ; i < nbRounds; i++) {
            acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
        }
        /* last bytes */
        acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
        return XXH3_avalanche(acc);
    }
}


/* =======     Long Keys     ======= */

/* Number of input bytes consumed by one call to XXH3_accumulate_512(). */
#define XXH_STRIPE_LEN 64
#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
/* Number of 64-bit accumulator lanes. */
#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))

/* Unprefixed aliases, only provided when XXH_OLD_NAMES is defined. */
#ifdef XXH_OLD_NAMES
#  define STRIPE_LEN XXH_STRIPE_LEN
#  define ACC_NB XXH_ACC_NB
#endif

/*
 * XXH_writeLE64():
 * Stores v64 at dst in little-endian byte order. The value is byte-swapped
 * first when XXH_CPU_LITTLE_ENDIAN is false; memcpy() is used for the store,
 * so dst is not dereferenced as a xxh_u64.
 */
XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
{
    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
    memcpy(dst, &v64, sizeof(v64));
}

/* Several intrinsic functions below are supposed to accept __int64 as argument,
 * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
 * However, several environments do not define __int64 type,
 * requiring a workaround.
 */
#if !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
    typedef int64_t xxh_i64;
#else
    /* the following type must have a width of 64-bit */
    typedef long long xxh_i64;
#endif

/*
 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
 *
 * It is a hardened version of UMAC, based off of FARSH's implementation.
*
 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
 * implementations, and it is ridiculously fast.
 *
 * We harden it by mixing the original input to the accumulators as well as the product.
 *
 * This means that in the (relatively likely) case of a multiply by zero, the
 * original input is preserved.
 *
 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
 * cross-pollination, as otherwise the upper and lower halves would be
 * essentially independent.
 *
 * This doesn't matter on 64-bit hashes since they all get merged together in
 * the end, so we skip the extra step.
 *
 * Both XXH3_64bits and XXH3_128bits use this subroutine.
 */

#if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH)

/* Fallback: no per-function target attribute unless one was defined earlier. */
#ifndef XXH_TARGET_AVX512
# define XXH_TARGET_AVX512  /* disable attribute target */
#endif

/*
 * XXH3_accumulate_512_avx512():
 * Accumulates one 64-byte stripe with a single 512-bit vector.
 *
 * acc    - accumulator block; must be 64-byte aligned (asserted).
 * input  - 64 bytes of input, read with an unaligned load.
 * secret - 64 bytes of secret, read with an unaligned load.
 *
 * Per 64-bit lane: acc += swap_adjacent_lanes(input)
 *                       + lo32(input ^ secret) * hi32(input ^ secret).
 */
XXH_FORCE_INLINE XXH_TARGET_AVX512 void
XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
                     const void* XXH_RESTRICT input,
                     const void* XXH_RESTRICT secret)
{
    XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc;
    XXH_ASSERT((((size_t)acc) & 63) == 0);
    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));

    {
        /* data_vec    = input[0]; */
        __m512i const data_vec    = _mm512_loadu_si512   (input);
        /* key_vec     = secret[0]; */
        __m512i const key_vec     = _mm512_loadu_si512   (secret);
        /* data_key    = data_vec ^ key_vec; */
        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
        /* data_key_lo = data_key >> 32; (upper 32 bits moved into the low half of each lane) */
        __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
        /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
        __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
        /* xacc[0] += swap(data_vec); (adjacent 64-bit lanes exchanged) */
        __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
        __m512i const sum       = _mm512_add_epi64(*xacc, data_swap);
        /* xacc[0] += product; */
        *xacc = _mm512_add_epi64(product, sum);
    }
}

/*
 * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
* * Multiplication isn't perfect, as explained by Google in HighwayHash: * * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to * // varying degrees. In descending order of goodness, bytes * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. * // As expected, the upper and lower bytes are much worse. * * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 * * Since our algorithm uses a pseudorandom secret to add some variance into the * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. * * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid * extraction. * * Both XXH3_64bits and XXH3_128bits use this subroutine. */ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { XXH_ASSERT((((size_t)acc) & 63) == 0); XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); /* xacc[0] ^= (xacc[0] >> 47) */ __m512i const acc_vec = *xacc; __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); /* xacc[0] ^= secret; */ __m512i const key_vec = _mm512_loadu_si512 (secret); __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); /* xacc[0] *= XXH_PRIME32_1; */ __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); } } XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) { XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); XXH_ASSERT(((size_t)customSecret & 63) == 0); 
(void)(&XXH_writeLE64); { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64); XXH_ALIGN(64) const __m512i* const src = (const __m512i*) XXH3_kSecret; XXH_ALIGN(64) __m512i* const dest = ( __m512i*) customSecret; int i; for (i=0; i < nbRounds; ++i) { /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*', * this will warn "discards ‘const’ qualifier". */ union { XXH_ALIGN(64) const __m512i* cp; XXH_ALIGN(64) void* p; } remote_const_void; remote_const_void.cp = src + i; dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed); } } } #endif #if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH) #ifndef XXH_TARGET_AVX2 # define XXH_TARGET_AVX2 /* disable attribute target */ #endif XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, const void* XXH_RESTRICT secret) { XXH_ASSERT((((size_t)acc) & 31) == 0); { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ const __m256i* const xinput = (const __m256i *) input; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ const __m256i* const xsecret = (const __m256i *) secret; size_t i; for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { /* data_vec = xinput[i]; */ __m256i const data_vec = _mm256_loadu_si256 (xinput+i); /* key_vec = xsecret[i]; */ __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); /* data_key = data_vec ^ key_vec; */ __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); /* data_key_lo = data_key >> 32; */ __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); /* xacc[i] += swap(data_vec); */ __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); /* xacc[i] += product; */ xacc[i] = _mm256_add_epi64(product, sum); } } } XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { XXH_ASSERT((((size_t)acc) & 31) == 0); { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ const __m256i* const xsecret = (const __m256i *) secret; const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); size_t i; for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { /* xacc[i] ^= (xacc[i] >> 47) */ __m256i const acc_vec = xacc[i]; __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); /* xacc[i] ^= xsecret; */ __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); /* xacc[i] *= XXH_PRIME32_1; */ __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); } } } XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) { XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); (void)(&XXH_writeLE64); XXH_PREFETCH(customSecret); { __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64, -(xxh_i64)seed64, (xxh_i64)seed64); XXH_ALIGN(64) const __m256i* const src = (const __m256i*) XXH3_kSecret; XXH_ALIGN(64) __m256i* dest = ( __m256i*) customSecret; # if defined(__GNUC__) || defined(__clang__) /* * On GCC & Clang, marking 'dest' as modified will cause the compiler: * - do not extract the secret from sse registers in the internal loop * - use less common registers, and avoid pushing these reg into stack * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with * customSecret, and on aarch64, this prevented LDP from merging two * loads together for free. Putting the loads together before the stores * properly generates LDP. 
*/ __asm__("" : "+r" (dest)); # endif /* GCC -O2 need unroll loop manually */ dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed); dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed); dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed); dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed); dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed); dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed); } } #endif #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) #ifndef XXH_TARGET_SSE2 # define XXH_TARGET_SSE2 /* disable attribute target */ #endif XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, const void* XXH_RESTRICT secret) { /* SSE2 is just a half-scale version of the AVX2 version. */ XXH_ASSERT((((size_t)acc) & 15) == 0); { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ const __m128i* const xinput = (const __m128i *) input; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ const __m128i* const xsecret = (const __m128i *) secret; size_t i; for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { /* data_vec = xinput[i]; */ __m128i const data_vec = _mm_loadu_si128 (xinput+i); /* key_vec = xsecret[i]; */ __m128i const key_vec = _mm_loadu_si128 (xsecret+i); /* data_key = data_vec ^ key_vec; */ __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); /* data_key_lo = data_key >> 32; */ __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); /* xacc[i] += swap(data_vec); */ __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); __m128i const sum = _mm_add_epi64(xacc[i], data_swap); /* xacc[i] += product; */ xacc[i] = _mm_add_epi64(product, sum); } } } XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { XXH_ASSERT((((size_t)acc) & 15) == 0); { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ const __m128i* const xsecret = (const __m128i *) secret; const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); size_t i; for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { /* xacc[i] ^= (xacc[i] >> 47) */ __m128i const acc_vec = xacc[i]; __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); /* xacc[i] ^= xsecret[i]; */ __m128i const key_vec = _mm_loadu_si128 (xsecret+i); __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); /* xacc[i] *= XXH_PRIME32_1; */ __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); } } } XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) { XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); (void)(&XXH_writeLE64); { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); # if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 // MSVC 32bit mode does not support _mm_set_epi64x before 2015 XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, -(xxh_i64)seed64 }; __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); # else __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64); # endif int i; XXH_ALIGN(64) const float* const src = (float const*) XXH3_kSecret; XXH_ALIGN(XXH_SEC_ALIGN) __m128i* dest = (__m128i*) customSecret; # if defined(__GNUC__) || defined(__clang__) /* * On GCC & Clang, marking 'dest' as modified will cause the compiler: * - do not extract the secret from sse registers in the internal loop * - use less common registers, and avoid pushing these reg into stack */ __asm__("" : "+r" (dest)); # endif for (i=0; i < nbRounds; ++i) { dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src+i*4)), seed); } } } #endif #if (XXH_VECTOR == 
XXH_NEON)

/*
 * XXH3_accumulate_512_neon():
 * Accumulates one 64-byte stripe as four 128-bit NEON vectors.
 * acc must be 16-byte aligned (asserted); input and secret are loaded
 * bytewise with vld1q_u8.
 */
XXH_FORCE_INLINE void
XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {
        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint8_t const* const xinput = (const uint8_t *) input;
        uint8_t const* const xsecret  = (const uint8_t *) secret;

        size_t i;
        for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
            /* data_vec = xinput[i]; */
            uint8x16_t data_vec    = vld1q_u8(xinput  + (i * 16));
            /* key_vec  = xsecret[i];  */
            uint8x16_t key_vec     = vld1q_u8(xsecret + (i * 16));
            uint64x2_t data_key;
            uint32x2_t data_key_lo, data_key_hi;
            /* xacc[i] += swap(data_vec); */
            uint64x2_t const data64  = vreinterpretq_u64_u8(data_vec);
            uint64x2_t const swapped = vextq_u64(data64, data64, 1);
            xacc[i] = vaddq_u64 (xacc[i], swapped);
            /* data_key = data_vec ^ key_vec; */
            data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
             * data_key_hi = (uint32x2_t) (data_key >> 32);
             * data_key = UNDEFINED; */
            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);

        }
    }
}

/*
 * XXH3_scrambleAcc_neon():
 * Per 64-bit lane: acc = ((acc ^ (acc >> 47)) ^ secret) * XXH_PRIME32_1.
 * acc must be 16-byte aligned (asserted).
 */
XXH_FORCE_INLINE void
XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   uint64x2_t* xacc       = (uint64x2_t*) acc;
        uint8_t const* xsecret = (uint8_t const*) secret;
        uint32x2_t prime       = vdup_n_u32 (XXH_PRIME32_1);

        size_t i;
        for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) {
            /* xacc[i] ^= (xacc[i] >> 47); */
            uint64x2_t acc_vec  = xacc[i];
            uint64x2_t shifted  = vshrq_n_u64 (acc_vec, 47);
            uint64x2_t data_vec = veorq_u64   (acc_vec, shifted);

            /* xacc[i] ^= xsecret[i]; */
            uint8x16_t key_vec  = vld1q_u8(xsecret + (i * 16));
            uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));

            /* xacc[i] *= XXH_PRIME32_1 */
            uint32x2_t data_key_lo, data_key_hi;
            /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
             * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
             * xacc[i] = UNDEFINED; */
            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
            {   /*
                 * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
                 *
                 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
                 * incorrectly "optimize" this:
                 *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
                 *   shifted = vshll_n_u32(tmp, 32);
                 * to this:
                 *   tmp     = "vmulq_u64"(a, b); // no such thing!
                 *   shifted = vshlq_n_u64(tmp, 32);
                 *
                 * However, unlike SSE, Clang lacks a 64-bit multiply routine
                 * for NEON, and it scalarizes two 64-bit multiplies instead.
                 *
                 * vmull_u32 has the same timing as vmul_u32, and it avoids
                 * this bug completely.
                 * See https://bugs.llvm.org/show_bug.cgi?id=39967
                 */
                uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
                /* xacc[i] = prod_hi << 32; */
                xacc[i] = vshlq_n_u64(prod_hi, 32);
                /* xacc[i] += data_key_lo * XXH_PRIME32_1; (the low 32 bits of each lane) */
                xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
            }
    }   }
}

#endif

#if (XXH_VECTOR == XXH_VSX)

/*
 * XXH3_accumulate_512_vsx():
 * Accumulates one 64-byte stripe as four 128-bit vectors.
 * Per 64-bit lane: acc += 32x32->64 product of the two halves of
 * (input ^ secret), plus the input with its two 64-bit halves swapped.
 */
XXH_FORCE_INLINE void
XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret)
{
          xxh_u64x2* const xacc     =       (xxh_u64x2*) acc;    /* presumed aligned */
    xxh_u64x2 const* const xinput   = (xxh_u64x2 const*) input;   /* no alignment restriction */
    xxh_u64x2 const* const xsecret  = (xxh_u64x2 const*) secret;    /* no alignment restriction */
    xxh_u64x2 const v32 = { 32, 32 };
    size_t i;
    for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
        /* data_vec = xinput[i]; */
        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
        /* key_vec = xsecret[i]; */
        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
        xxh_u64x2 const data_key = data_vec ^ key_vec;
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
        /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
        xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
        xacc[i] += product;

        /* swap high and low halves */
#ifdef __s390x__
        xacc[i] += vec_permi(data_vec, data_vec, 2);
#else
        xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
#endif
    }
}

/*
 * XXH3_scrambleAcc_vsx():
 * Per 64-bit lane: acc = ((acc ^ (acc >> 47)) ^ secret) * XXH_PRIME32_1,
 * assembled from the even/odd 32-bit multiplies.
 * acc must be 16-byte aligned (asserted).
 */
XXH_FORCE_INLINE void
XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {         xxh_u64x2* const xacc    =       (xxh_u64x2*) acc;
        const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
        /* constants */
        xxh_u64x2 const v32  = { 32, 32 };
        xxh_u64x2 const v47 = { 47, 47 };
        xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
        size_t i;
        for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
            /* xacc[i] ^= (xacc[i] >> 47); */
            xxh_u64x2 const acc_vec  = xacc[i];
            xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);

            /* xacc[i] ^= xsecret[i]; */
            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
            xxh_u64x2 const data_key = data_vec ^ key_vec;

            /* xacc[i] *= XXH_PRIME32_1 */
            /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF);  */
            xxh_u64x2 const prod_even  = XXH_vec_mule((xxh_u32x4)data_key, prime);
            /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
            xxh_u64x2 const prod_odd  = XXH_vec_mulo((xxh_u32x4)data_key, prime);
            xacc[i] = prod_odd + (prod_even << v32);
    }   }
}

#endif

/* scalar variants - universal */

/*
 * XXH3_accumulate_512_scalar():
 * Portable accumulation of one 64-byte stripe over XXH_ACC_NB 64-bit lanes.
 * For lane i: the raw input word is added to the neighbouring lane (i ^ 1),
 * and the product of the two 32-bit halves of (input ^ secret) is added to
 * lane i. acc must be aligned on XXH_ACC_ALIGN (asserted).
 */
XXH_FORCE_INLINE void
XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
                     const void* XXH_RESTRICT input,
                     const void* XXH_RESTRICT secret)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
    const xxh_u8* const xinput  = (const xxh_u8*) input;  /* no alignment restriction */
    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < XXH_ACC_NB; i++) {
        xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
        xacc[i ^ 1] += data_val; /* swap adjacent lanes */
        xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
    }
}

/*
 * XXH3_scrambleAcc_scalar():
 * Portable scramble. For each lane:
 * acc = (xorshift64(acc, 47) ^ secret word) * XXH_PRIME32_1.
 * acc must be aligned on XXH_ACC_ALIGN (asserted).
 */
XXH_FORCE_INLINE void
XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < XXH_ACC_NB; i++) {
        xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
        xxh_u64 acc64 = xacc[i];
        acc64 = XXH_xorshift64(acc64, 47);
        acc64 ^= key64;
        acc64 *= XXH_PRIME32_1;
        xacc[i] = acc64;
    }
}

/*
 * XXH3_initCustomSecret_scalar():
 * Portable derivation of a seeded secret from XXH3_kSecret. For every
 * 16-byte group the first 64-bit word gets +seed64 and the second gets
 * -seed64 (unsigned arithmetic); results are written little-endian.
 */
XXH_FORCE_INLINE void
XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
    /*
     * We need a separate pointer for the hack below,
     * which requires a non-const pointer.
     * Any decent compiler will optimize this out otherwise.
     */
    const xxh_u8* kSecretPtr = XXH3_kSecret;
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);

#if defined(__clang__) && defined(__aarch64__)
    /*
     * UGLY HACK:
     * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
     * placed sequentially, in order, at the top of the unrolled loop.
     *
     * While MOVK is great for generating constants (2 cycles for a 64-bit
     * constant compared to 4 cycles for LDR), long MOVK chains stall the
     * integer pipelines:
     *   I   L   S
     * MOVK
     * MOVK
     * MOVK
     * MOVK
     * ADD
     * SUB      STR
     *          STR
     * By forcing loads from memory (as the asm line causes Clang to assume
     * that XXH3_kSecretPtr has been changed), the pipelines are used more
     * efficiently:
     *   I   L   S
     *      LDR
     *  ADD LDR
     *  SUB     STR
     *          STR
     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
     *   without hack: 2654.4 MB/s
     *   with hack:    3202.9 MB/s
     */
    __asm__("" : "+r" (kSecretPtr));
#endif
    /*
     * Note: in debug mode, this overrides the asm optimization
     * and Clang will emit MOVK chains again.
     */
    XXH_ASSERT(kSecretPtr == XXH3_kSecret);

    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
        int i;
        for (i=0; i < nbRounds; i++) {
            /*
             * The asm hack causes Clang to assume that kSecretPtr aliases with
             * customSecret, and on aarch64, this prevented LDP from merging two
             * loads together for free. Putting the loads together before the stores
             * properly generates LDP.
             */
            xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i)     + seed64;
            xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
            XXH_writeLE64((xxh_u8*)customSecret + 16*i,     lo);
            XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
    }   }
}


/* Function-pointer types used to pass a chosen implementation to the long-input loop. */
typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);
typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);


/*
 * Compile-time dispatch: bind the generic names to the implementation
 * matching XXH_VECTOR. NEON and VSX use the scalar initCustomSecret.
 */
#if (XXH_VECTOR == XXH_AVX512)

#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512

#elif (XXH_VECTOR == XXH_AVX2)

#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2

#elif (XXH_VECTOR == XXH_SSE2)

#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
#define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2

#elif (XXH_VECTOR == XXH_NEON)

#define XXH3_accumulate_512 XXH3_accumulate_512_neon
#define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#elif (XXH_VECTOR == XXH_VSX)

#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
#define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#else /* scalar */

#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#endif



/* Prefetch distance in bytes ahead of the current stripe; overridable by the build. */
#ifndef XXH_PREFETCH_DIST
#  ifdef __clang__
# define XXH_PREFETCH_DIST 320 # else # if (XXH_VECTOR == XXH_AVX512) # define XXH_PREFETCH_DIST 512 # else # define XXH_PREFETCH_DIST 384 # endif # endif /* __clang__ */ #endif /* XXH_PREFETCH_DIST */ /* * XXH3_accumulate() * Loops over XXH3_accumulate_512(). * Assumption: nbStripes will not overflow the secret size */ XXH_FORCE_INLINE void XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT input, const xxh_u8* XXH_RESTRICT secret, size_t nbStripes, XXH3_f_accumulate_512 f_acc512) { size_t n; for (n = 0; n < nbStripes; n++ ) { const xxh_u8* const in = input + n*XXH_STRIPE_LEN; XXH_PREFETCH(in + XXH_PREFETCH_DIST); f_acc512(acc, in, secret + n*XXH_SECRET_CONSUME_RATE); } } XXH_FORCE_INLINE void XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) { size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; size_t const nb_blocks = (len - 1) / block_len; size_t n; XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); for (n = 0; n < nb_blocks; n++) { XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512); f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); } /* last partial block */ XXH_ASSERT(len > XXH_STRIPE_LEN); { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512); /* last stripe */ { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; #define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); } } } XXH_FORCE_INLINE xxh_u64 XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, 
const xxh_u8* XXH_RESTRICT secret) { return XXH3_mul128_fold64( acc[0] ^ XXH_readLE64(secret), acc[1] ^ XXH_readLE64(secret+8) ); } static XXH64_hash_t XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) { xxh_u64 result64 = start; size_t i = 0; for (i = 0; i < 4; i++) { result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); #if defined(__clang__) /* Clang */ \ && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ /* * UGLY HACK: * Prevent autovectorization on Clang ARMv7-a. Exact same problem as * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. * XXH3_64bits, len == 256, Snapdragon 835: * without hack: 2063.7 MB/s * with hack: 2560.7 MB/s */ __asm__("" : "+r" (result64)); #endif } return XXH3_avalanche(result64); } #define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, const void* XXH_RESTRICT secret, size_t secretSize, XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) { XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble); /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); /* do not align on 8, so that the secret is different from the accumulator */ #define XXH_SECRET_MERGEACCS_START 11 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); } /* * It's important for performance that XXH3_hashLong is not inlined. 
*/
XXH_NO_INLINE XXH64_hash_t
XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
{
    XXH64_hash_t digest;
    (void)seed64;   /* the caller-provided secret is used; the seed is ignored */
    digest = XXH3_hashLong_64b_internal(input, len,
                                        secret, secretLen,
                                        XXH3_accumulate_512, XXH3_scrambleAcc);
    return digest;
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 * Because it is not inlined, the compiler may fail to notice that the
 * `secret` argument is, in some scenarios, a compile-time constant.
 * This variant hard-wires XXH3_kSecret so the compiler can see that
 * and streamline the generated code accordingly.
 */
XXH_NO_INLINE XXH64_hash_t
XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
                          XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
{
    XXH64_hash_t digest;
    /* all three are unused: the default secret is always taken */
    (void)seed64;
    (void)secret;
    (void)secretLen;
    digest = XXH3_hashLong_64b_internal(input, len,
                                        XXH3_kSecret, sizeof(XXH3_kSecret),
                                        XXH3_accumulate_512, XXH3_scrambleAcc);
    return digest;
}

/*
 * XXH3_hashLong_64b_withSeed():
 * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
 * and then use this key for long mode hashing.
 *
 * This operation is decently fast but nonetheless costs a little bit of time.
 * Try to avoid it whenever possible (typically when seed==0).
 *
 * It's important for performance that XXH3_hashLong is not inlined. Not sure
 * why (uop cache maybe?), but the difference is large and easily measurable.
*/ XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, XXH64_hash_t seed, XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble, XXH3_f_initCustomSecret f_initSec) { if (seed == 0) return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble); { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; f_initSec(secret, seed); return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), f_acc512, f_scramble); } } /* * It's important for performance that XXH3_hashLong is not inlined. */ XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed(const void* input, size_t len, XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen) { (void)secret; (void)secretLen; return XXH3_hashLong_64b_withSeed_internal(input, len, seed, XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); } typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); XXH_FORCE_INLINE XXH64_hash_t XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, XXH3_hashLong64_f f_hashLong) { XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); /* * If an action is to be taken if `secretLen` condition is not respected, * it should be done here. * For now, it's a contract pre-condition. * Adding a check and a branch here would cost performance at every hash. * Also, note that function signature doesn't offer room to return an error. 
*/ if (len <= 16) return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); if (len <= 128) return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); } /* === Public entry point === */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) { return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); } XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) { return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); } XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) { return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); } /* === XXH3 streaming === */ /* * Malloc's a pointer that is always aligned to align. * * This must be freed with `XXH_alignedFree()`. * * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. * * This underalignment previously caused a rather obvious crash which went * completely unnoticed due to XXH3_createState() not actually being tested. * Credit to RedSpah for noticing this bug. * * The alignment is done manually: Functions like posix_memalign or _mm_malloc * are avoided: To maintain portability, we would have to write a fallback * like this anyways, and besides, testing for the existence of library * functions without relying on external build tools is impossible. 
* * The method is simple: Overallocate, manually align, and store the offset * to the original behind the returned pointer. * * Align must be a power of 2 and 8 <= align <= 128. */ static void* XXH_alignedMalloc(size_t s, size_t align) { XXH_ASSERT(align <= 128 && align >= 8); /* range check */ XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ { /* Overallocate to make room for manual realignment and an offset byte */ xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); if (base != NULL) { /* * Get the offset needed to align this pointer. * * Even if the returned pointer is aligned, there will always be * at least one byte to store the offset to the original pointer. */ size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ /* Add the offset for the now-aligned pointer */ xxh_u8* ptr = base + offset; XXH_ASSERT((size_t)ptr % align == 0); /* Store the offset immediately before the returned pointer. */ ptr[-1] = (xxh_u8)offset; return ptr; } return NULL; } } /* * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. */ static void XXH_alignedFree(void* p) { if (p != NULL) { xxh_u8* ptr = (xxh_u8*)p; /* Get the offset byte we added in XXH_malloc. 
*/ xxh_u8 offset = ptr[-1]; /* Free the original malloc'd pointer */ xxh_u8* base = ptr - offset; XXH_free(base); } } XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) { XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); if (state==NULL) return NULL; XXH3_INITSTATE(state); return state; } XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) { XXH_alignedFree(statePtr); return XXH_OK; } XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) { memcpy(dst_state, src_state, sizeof(*dst_state)); } static void XXH3_64bits_reset_internal(XXH3_state_t* statePtr, XXH64_hash_t seed, const void* secret, size_t secretSize) { size_t const initStart = offsetof(XXH3_state_t, bufferedSize); size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); XXH_ASSERT(statePtr != NULL); /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ memset((char*)statePtr + initStart, 0, initLength); statePtr->acc[0] = XXH_PRIME32_3; statePtr->acc[1] = XXH_PRIME64_1; statePtr->acc[2] = XXH_PRIME64_2; statePtr->acc[3] = XXH_PRIME64_3; statePtr->acc[4] = XXH_PRIME64_4; statePtr->acc[5] = XXH_PRIME32_2; statePtr->acc[6] = XXH_PRIME64_5; statePtr->acc[7] = XXH_PRIME32_1; statePtr->seed = seed; statePtr->extSecret = (const unsigned char*)secret; XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; } XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr) { if (statePtr == NULL) return XXH_ERROR; XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) { if (statePtr == NULL) return XXH_ERROR; 
XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize); if (secret == NULL) return XXH_ERROR; if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; return XXH_OK; } XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) { if (statePtr == NULL) return XXH_ERROR; if (seed==0) return XXH3_64bits_reset(statePtr); if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed); XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } /* Note : when XXH3_consumeStripes() is invoked, * there must be a guarantee that at least one more byte must be consumed from input * so that the function can blindly consume all stripes using the "normal" secret segment */ XXH_FORCE_INLINE void XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, const xxh_u8* XXH_RESTRICT input, size_t nbStripes, const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) { XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */ XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { /* need a scrambling operation */ size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512); f_scramble(acc, secret + secretLimit); XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512); *nbStripesSoFarPtr = nbStripesAfterBlock; } else { XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512); *nbStripesSoFarPtr += nbStripes; } } /* * Both XXH3_64bits_update and XXH3_128bits_update use this routine. 
*/ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) { if (input==NULL) #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) return XXH_OK; #else return XXH_ERROR; #endif { const xxh_u8* const bEnd = input + len; const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; state->totalLen += len; if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ XXH_memcpy(state->buffer + state->bufferedSize, input, len); state->bufferedSize += (XXH32_hash_t)len; return XXH_OK; } /* total input is now > XXH3_INTERNALBUFFER_SIZE */ #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ /* * Internal buffer is partially filled (always, except at beginning) * Complete it, then consume it. 
*/ if (state->bufferedSize) { size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); input += loadSize; XXH3_consumeStripes(state->acc, &state->nbStripesSoFar, state->nbStripesPerBlock, state->buffer, XXH3_INTERNALBUFFER_STRIPES, secret, state->secretLimit, f_acc512, f_scramble); state->bufferedSize = 0; } XXH_ASSERT(input < bEnd); /* Consume input by a multiple of internal buffer size */ if (input+XXH3_INTERNALBUFFER_SIZE < bEnd) { const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; do { XXH3_consumeStripes(state->acc, &state->nbStripesSoFar, state->nbStripesPerBlock, input, XXH3_INTERNALBUFFER_STRIPES, secret, state->secretLimit, f_acc512, f_scramble); input += XXH3_INTERNALBUFFER_SIZE; } while (inputbuffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); } XXH_ASSERT(input < bEnd); /* Some remaining input (always) : buffer it */ XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); state->bufferedSize = (XXH32_hash_t)(bEnd-input); } return XXH_OK; } XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) { return XXH3_update(state, (const xxh_u8*)input, len, XXH3_accumulate_512, XXH3_scrambleAcc); } XXH_FORCE_INLINE void XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, const unsigned char* secret) { /* * Digest on a local copy. This way, the state remains unaltered, and it can * continue ingesting more input afterwards. 
*/ memcpy(acc, state->acc, sizeof(state->acc)); if (state->bufferedSize >= XXH_STRIPE_LEN) { size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; size_t nbStripesSoFar = state->nbStripesSoFar; XXH3_consumeStripes(acc, &nbStripesSoFar, state->nbStripesPerBlock, state->buffer, nbStripes, secret, state->secretLimit, XXH3_accumulate_512, XXH3_scrambleAcc); /* last stripe */ XXH3_accumulate_512(acc, state->buffer + state->bufferedSize - XXH_STRIPE_LEN, secret + state->secretLimit - XXH_SECRET_LASTACC_START); } else { /* bufferedSize < XXH_STRIPE_LEN */ xxh_u8 lastStripe[XXH_STRIPE_LEN]; size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); XXH3_accumulate_512(acc, lastStripe, secret + state->secretLimit - XXH_SECRET_LASTACC_START); } } XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) { const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; if (state->totalLen > XXH3_MIDSIZE_MAX) { XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; XXH3_digest_long(acc, state, secret); return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * XXH_PRIME64_1); } /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ if (state->seed) return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), secret, state->secretLimit + XXH_STRIPE_LEN); } #define XXH_MIN(x, y) (((x) > (y)) ? 
(y) : (x)) XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize) { XXH_ASSERT(secretBuffer != NULL); if (customSeedSize == 0) { memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); return; } XXH_ASSERT(customSeed != NULL); { size_t const segmentSize = sizeof(XXH128_hash_t); size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize; XXH128_canonical_t scrambler; XXH64_hash_t seeds[12]; size_t segnb; XXH_ASSERT(nbSegments == 12); XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */ XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); /* * Copy customSeed to seeds[], truncating or repeating as necessary. */ { size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds)); size_t filled = toFill; memcpy(seeds, customSeed, toFill); while (filled < sizeof(seeds)) { toFill = XXH_MIN(filled, sizeof(seeds) - filled); memcpy((char*)seeds + filled, seeds, toFill); filled += toFill; } } /* generate secret */ memcpy(secretBuffer, &scrambler, sizeof(scrambler)); for (segnb=1; segnb < nbSegments; segnb++) { size_t const segmentStart = segnb * segmentSize; XXH128_canonical_t segment; XXH128_canonicalFromHash(&segment, XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) ); memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment)); } } } /* ========================================== * XXH3 128 bits (a.k.a XXH128) * ========================================== * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, * even without counting the significantly larger output size. * * For example, extra steps are taken to avoid the seed-dependent collisions * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). * * This strength naturally comes at the cost of some speed, especially on short * lengths. 
Note that longer hashes are about as fast as the 64-bit version * due to it using only a slight modification of the 64-bit loop. * * XXH128 is also more oriented towards 64-bit machines. It is still extremely * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). */ XXH_FORCE_INLINE XXH128_hash_t XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { /* A doubled version of 1to3_64b with different constants. */ XXH_ASSERT(input != NULL); XXH_ASSERT(1 <= len && len <= 3); XXH_ASSERT(secret != NULL); /* * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } */ { xxh_u8 const c1 = input[0]; xxh_u8 const c2 = input[len >> 1]; xxh_u8 const c3 = input[len - 1]; xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; XXH128_hash_t h128; h128.low64 = XXH64_avalanche(keyed_lo); h128.high64 = XXH64_avalanche(keyed_hi); return h128; } } XXH_FORCE_INLINE XXH128_hash_t XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); XXH_ASSERT(secret != NULL); XXH_ASSERT(4 <= len && len <= 8); seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; { xxh_u32 const input_lo = XXH_readLE32(input); xxh_u32 const input_hi = XXH_readLE32(input + len - 4); xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; xxh_u64 const keyed = input_64 ^ bitflip; /* Shift len to the left 
to ensure it is even, this avoids even multiplies. */ XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); m128.high64 += (m128.low64 << 1); m128.low64 ^= (m128.high64 >> 3); m128.low64 = XXH_xorshift64(m128.low64, 35); m128.low64 *= 0x9FB21C651E98DF25ULL; m128.low64 = XXH_xorshift64(m128.low64, 28); m128.high64 = XXH3_avalanche(m128.high64); return m128; } } XXH_FORCE_INLINE XXH128_hash_t XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); XXH_ASSERT(secret != NULL); XXH_ASSERT(9 <= len && len <= 16); { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; xxh_u64 const input_lo = XXH_readLE64(input); xxh_u64 input_hi = XXH_readLE64(input + len - 8); XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); /* * Put len in the middle of m128 to ensure that the length gets mixed to * both the low and high bits in the 128x64 multiply below. */ m128.low64 += (xxh_u64)(len - 1) << 54; input_hi ^= bitfliph; /* * Add the high 32 bits of input_hi to the high 32 bits of m128, then * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to * the high 64 bits of m128. * * The best approach to this operation is different on 32-bit and 64-bit. */ if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ /* * 32-bit optimized version, which is more readable. * * On 32-bit, it removes an ADC and delays a dependency between the two * halves of m128.high64, but it generates an extra mask on 64-bit. */ m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); } else { /* * 64-bit optimized (albeit more confusing) version. 
* * Uses some properties of addition and multiplication to remove the mask: * * Let: * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) * c = XXH_PRIME32_2 * * a + (b * c) * Inverse Property: x + y - x == y * a + (b * (1 + c - 1)) * Distributive Property: x * (y + z) == (x * y) + (x * z) * a + (b * 1) + (b * (c - 1)) * Identity Property: x * 1 == x * a + b + (b * (c - 1)) * * Substitute a, b, and c: * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) * * Since input_hi.hi + input_hi.lo == input_hi, we get this: * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) */ m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); } /* m128 ^= XXH_swap64(m128 >> 64); */ m128.low64 ^= XXH_swap64(m128.high64); { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); h128.high64 += m128.high64 * XXH_PRIME64_2; h128.low64 = XXH3_avalanche(h128.low64); h128.high64 = XXH3_avalanche(h128.high64); return h128; } } } /* * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN */ XXH_FORCE_INLINE XXH128_hash_t XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(len <= 16); { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); if (len) return XXH3_len_1to3_128b(input, len, secret, seed); { XXH128_hash_t h128; xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); h128.low64 = XXH64_avalanche(seed ^ bitflipl); h128.high64 = XXH64_avalanche( seed ^ bitfliph); return h128; } } } /* * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
*/ XXH_FORCE_INLINE XXH128_hash_t XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, const xxh_u8* secret, XXH64_hash_t seed) { acc.low64 += XXH3_mix16B (input_1, secret+0, seed); acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); acc.high64 += XXH3_mix16B (input_2, secret+16, seed); acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); return acc; } XXH_FORCE_INLINE XXH128_hash_t XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) { XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; XXH_ASSERT(16 < len && len <= 128); { XXH128_hash_t acc; acc.low64 = len * XXH_PRIME64_1; acc.high64 = 0; if (len > 32) { if (len > 64) { if (len > 96) { acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); } acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); } acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); } acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); { XXH128_hash_t h128; h128.low64 = acc.low64 + acc.high64; h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) + ((len - seed) * XXH_PRIME64_2); h128.low64 = XXH3_avalanche(h128.low64); h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); return h128; } } } XXH_NO_INLINE XXH128_hash_t XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) { XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); { XXH128_hash_t acc; int const nbRounds = (int)len / 32; int i; acc.low64 = len * XXH_PRIME64_1; acc.high64 = 0; for (i=0; i<4; i++) { acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16, secret + (32 * i), seed); } acc.low64 = XXH3_avalanche(acc.low64); acc.high64 = XXH3_avalanche(acc.high64); XXH_ASSERT(nbRounds >= 4); for (i=4 ; i < 
nbRounds; i++) { acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16, secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), seed); } /* last bytes */ acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 0ULL - seed); { XXH128_hash_t h128; h128.low64 = acc.low64 + acc.high64; h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) + ((len - seed) * XXH_PRIME64_2); h128.low64 = XXH3_avalanche(h128.low64); h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); return h128; } } } XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) { XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble); /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); { XXH128_hash_t h128; h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); h128.high64 = XXH3_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)len * XXH_PRIME64_2)); return h128; } } /* * It's important for performance that XXH3_hashLong is not inlined. */ XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) { (void)seed64; (void)secret; (void)secretLen; return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc); } /* * It's important for performance that XXH3_hashLong is not inlined. 
*/ XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) { (void)seed64; return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc); } XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble, XXH3_f_initCustomSecret f_initSec) { if (seed64 == 0) return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble); { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; f_initSec(secret, seed64); return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), f_acc512, f_scramble); } } /* * It's important for performance that XXH3_hashLong is not inlined. */ XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_withSeed(const void* input, size_t len, XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) { (void)secret; (void)secretLen; return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); } typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, XXH64_hash_t, const void* XXH_RESTRICT, size_t); XXH_FORCE_INLINE XXH128_hash_t XXH3_128bits_internal(const void* input, size_t len, XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, XXH3_hashLong128_f f_hl128) { XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); /* * If an action is to be taken if `secret` conditions are not respected, * it should be done here. * For now, it's a contract pre-condition. * Adding a check and a branch here would cost performance at every hash. 
*/ if (len <= 16) return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); if (len <= 128) return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); return f_hl128(input, len, seed64, secret, secretLen); } /* === Public XXH128 API === */ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) { return XXH3_128bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_default); } XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) { return XXH3_128bits_internal(input, len, 0, (const xxh_u8*)secret, secretSize, XXH3_hashLong_128b_withSecret); } XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) { return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_withSeed); } XXH_PUBLIC_API XXH128_hash_t XXH128(const void* input, size_t len, XXH64_hash_t seed) { return XXH3_128bits_withSeed(input, len, seed); } /* === XXH3 128-bit streaming === */ /* * All the functions are actually the same as for 64-bit streaming variant. * The only difference is the finalizatiom routine. 
*/ static void XXH3_128bits_reset_internal(XXH3_state_t* statePtr, XXH64_hash_t seed, const void* secret, size_t secretSize) { XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize); } XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr) { if (statePtr == NULL) return XXH_ERROR; XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) { if (statePtr == NULL) return XXH_ERROR; XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize); if (secret == NULL) return XXH_ERROR; if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; return XXH_OK; } XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) { if (statePtr == NULL) return XXH_ERROR; if (seed==0) return XXH3_128bits_reset(statePtr); if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed); XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) { return XXH3_update(state, (const xxh_u8*)input, len, XXH3_accumulate_512, XXH3_scrambleAcc); } XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) { const unsigned char* const secret = (state->extSecret == NULL) ? 
state->customSecret : state->extSecret; if (state->totalLen > XXH3_MIDSIZE_MAX) { XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; XXH3_digest_long(acc, state, secret); XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); { XXH128_hash_t h128; h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * XXH_PRIME64_1); h128.high64 = XXH3_mergeAccs(acc, secret + state->secretLimit + XXH_STRIPE_LEN - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)state->totalLen * XXH_PRIME64_2)); return h128; } } /* len <= XXH3_MIDSIZE_MAX : short code */ if (state->seed) return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), secret, state->secretLimit + XXH_STRIPE_LEN); } /* 128-bit utility functions */ #include /* memcmp, memcpy */ /* return : 1 is equal, 0 if different */ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) { /* note : XXH128_hash_t is compact, it has no padding byte */ return !(memcmp(&h1, &h2, sizeof(h1))); } /* This prototype is compatible with stdlib's qsort(). 
* return : >0 if *h128_1 > *h128_2 * <0 if *h128_1 < *h128_2 * =0 if *h128_1 == *h128_2 */ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) { XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); /* note : bets that, in most cases, hash values are different */ if (hcmp) return hcmp; return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); } /*====== Canonical representation ======*/ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) { XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); if (XXH_CPU_LITTLE_ENDIAN) { hash.high64 = XXH_swap64(hash.high64); hash.low64 = XXH_swap64(hash.low64); } memcpy(dst, &hash.high64, sizeof(hash.high64)); memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); } XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src) { XXH128_hash_t h; h.high64 = XXH_readBE64(src); h.low64 = XXH_readBE64(src->digest + 8); return h; } /* Pop our optimization override from above */ #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ # pragma GCC pop_options #endif #endif /* XXH_NO_LONG_LONG */ #endif /* XXH_IMPLEMENTATION */ #if defined (__cplusplus) } #endif