pax_global_header00006660000000000000000000000064136774100470014523gustar00rootroot0000000000000052 comment=0b35080ae2db12ef2c9719edfdd284ca2b5a26dd libgff-2.0.0/000077500000000000000000000000001367741004700127535ustar00rootroot00000000000000libgff-2.0.0/.gitignore000066400000000000000000000003711367741004700147440ustar00rootroot00000000000000# Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app libgff-2.0.0/BoostLicense.txt000066400000000000000000000025511367741004700161100ustar00rootroot00000000000000Boost Software License - Version 1.0 - August 17, 2003 Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, execute, and transmit the Software, and to prepare [[derivative work]]s of the Software, and to permit third-parties to whom the Software is furnished to do so, all subject to the following: The copyright notices in the Software and this entire statement, including the above license grant, this restriction and the following disclaimer, must be included in all copies of the Software, in whole or in part, and all derivative works of the Software, unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. libgff-2.0.0/CMakeLists.txt000066400000000000000000000041511367741004700155140ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.12) project(gff) message(STATUS "Install prefix : ${CMAKE_INSTALL_PREFIX}") set(INCLUDE_INSTALL_DIR include/) set(LIB_INSTALL_DIR lib/) include(CMakePackageConfigHelpers) set(ver_major 2) set(ver_minor 0) set(ver_patch 0) set(CPACK_PACKAGE_VERSION_MAJOR ${ver_major}) set(CPACK_PACKAGE_VERSION_MINOR ${ver_minor}) set(CPACK_PACKAGE_VERSION_PATCH ${ver_patch}) set(CPACK_PACKAGE_VERSION "${ver_major}.${ver_minor}.${ver_patch}") message("version: ${CPACK_PACKAGE_VERSION}") set(PROJECT_VERSION ${CPACK_PACKAGE_VERSION}) set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(PROJECT_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) set(PROJECT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Wall -D_FILE_OFFSET_BITS=64" ) set(GFFLib_SRCS ${PROJECT_SOURCE_DIR}/codons.cpp ${PROJECT_SOURCE_DIR}/GArgs.cpp ${PROJECT_SOURCE_DIR}/GBase.cpp ${PROJECT_SOURCE_DIR}/gdna.cpp ${PROJECT_SOURCE_DIR}/GFaSeqGet.cpp ${PROJECT_SOURCE_DIR}/GFastaIndex.cpp ${PROJECT_SOURCE_DIR}/gff.cpp #${PROJECT_SOURCE_DIR}/gff_utils.cpp ${PROJECT_SOURCE_DIR}/GStr.cpp) include_directories(${PROJECT_INCLUDE_DIR}) add_library(${PROJECT_NAME} STATIC ${GFFLib_SRCS}) add_executable(TestGFFParse ${PROJECT_SOURCE_DIR}/TestGFFParse.cpp) target_link_libraries(TestGFFParse ${PROJECT_NAME}) configure_package_config_file(libgffConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/libgffConfig.cmake INSTALL_DESTINATION ${LIB_INSTALL_DIR}/libgff/cmake PATH_VARS INCLUDE_INSTALL_DIR LIB_INSTALL_DIR) write_basic_package_version_file( 
${CMAKE_CURRENT_BINARY_DIR}/libgffConfigVersion.cmake VERSION ${ver_major}.${ver_minor}.${ver_patch} COMPATIBILITY SameMajorVersion ) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgffConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/libgffConfigVersion.cmake DESTINATION ${LIB_INSTALL_DIR}/libgff/cmake ) install(FILES ${CMAKE_BINARY_DIR}/lib${PROJECT_NAME}.a DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) install(DIRECTORY ${PROJECT_INCLUDE_DIR} DESTINATION ${CMAKE_INSTALL_PREFIX}) libgff-2.0.0/Readme.md000066400000000000000000000013071367741004700144730ustar00rootroot00000000000000LibGFF ====== This is an attempt to perform a simple "libraryfication" of the GFF/GTF parsing code that is used in [GFFRead](https://github.com/gpertea/gffread) codebase. There are not many (any?) relatively lightweight GTF/GFF parsers exposing a C++ interface, and the goal of this library is to provide this functionality without the necessity of drawing in a heavy-weight dependency like SeqAn. *Note*: This library draws _directly_ from the code in GFFRead and [GCLib](https://github.com/gpertea/gclib), and exists primarily to remove functionality (and hence code) that is unnecessary for our downstream purposes. In the future, it may be appropriate to just replace this library wholesale with GCLib. 
libgff-2.0.0/include/000077500000000000000000000000001367741004700143765ustar00rootroot00000000000000libgff-2.0.0/include/GArgs.h000066400000000000000000000076641367741004700155670ustar00rootroot00000000000000/* GArgs is a quick'n'dirty object oriented replacement for the standard getopts library available on many unix platforms; it accepts the regular single letter, single-dash style options -[ ][] but also attr=value style options: = or --[=] */ #ifndef G_ARGS_DEFINED #define G_ARGS_DEFINED #ifdef HAVE_CONFIG_H #include #endif #include struct GArgsDef { const char* longopt; char opt; //equivalent one-char option, if any bool req_value; //true if the string that follows must be a value int code; //an enum code to be associated with this option }; class GArgs { //structure for parsing arguments format definition struct fmtdef { char* longopt; char opt; //equivalent one-char option, if any bool req_value; //true if the string that follows must be a value int code; //an enum code to be associated with this option }; int fmtcount; fmtdef* fmt; //this will store format definition after parsing it struct argdata { char* opt; // this is NULL for non-dashed arguments // a single character for single dash style arguments // a string for ARG=VALUE or --long_option style arguments char* value; // is NULL for switches (dashed flags) int fmti; //index in fmt table //int code; // if GArgsDef[] constructor was used, for getOpt }; int _argc; char** _argv; //the original main() values argdata* args; //arguments table after parsing it int count; //total count of elements in 'args' array int nonOptCount; //count of non-dashed, non= arguments int nonOptPos; //current position for nonOpt arguments iterator int optPos; //current position for options iterator int errarg; //argv error position after parsing bool err_valmissing; //if the error is strictly about missing value for errarg option int parseArgs(bool nodigitopts=false); //parsing helper functions int validOpt(int c); int 
validShortOpt(char o); int validLongOpt(char* o, char* to); public: GArgs(int argc, char* argv[], const char* format, bool nodigitopts=false); /* format can be: {;|=} e.g. disable-test;PID=S= for --disable-test PID=50 (or --PID 50) S=3.5 etc. [:] e.g. p:hT for -p testing (or -ptesting) -h -T This means that the long options, if present, should be given at the beginning of the format string, before the single-dash, single-char options */ GArgs(int argc, char* argv[], const GArgsDef fmtrecs[], bool nodigitopts=false); ~GArgs(); int isError(); // returns the offending argv position or 0 if no error int getCount() { return count; } //total number of arguments given int getFmtCount() { return fmtcount; } //total number of option definitions int getNonOptCount() { return nonOptCount; } //total number of non-option arguments char* getOpt(const char* o); /* retrieve the value for option o returns NULL if option not given at all !=NULL if boolean option was given opt's value if value option was given */ char* getOpt(const char o); char* getOpt(int c); //retrieve value by enum code char* getOptName(int c); //retrieve name of by enum code int startOpt(); //init iteration through option arguments // returns number of option args char* nextOpt(); //get next option argument's string int nextCode(); //get next option argument's code int startNonOpt(void); //init iteration through non-option arguments // returns the number of non-option arguments void printError(FILE* fout, const char* usage=NULL, bool exitProgram=false); void printError(const char* usage=NULL, bool exitProgram=false); void printCmdLine(FILE* fout); char* nextNonOpt(); //get the next non-option argument }; #endif libgff-2.0.0/include/GBase.h000066400000000000000000000521421367741004700155340ustar00rootroot00000000000000#ifndef G_BASE_DEFINED #define G_BASE_DEFINED #define GCLIB_VERSION "0.11.9" #ifdef HAVE_CONFIG_H #include "config.h" #endif #if defined(__WIN32) || defined(__WIN32__) || defined(_WIN32) || 
defined(_WIN64) || defined(__MINGW64__) || defined(__WINDOWS__) #ifndef _WIN32 #define _WIN32 #endif #ifndef _WIN64 #define _WIN64 #endif #define __USE_MINGW_ANSI_STDIO 1 //#define __ISO_C_VISIBLE 1999 #endif #include #include #include #include #include #include #include #include #include #ifdef _WIN32 #include #include #define CHPATHSEP '\\' #undef off_t #define off_t int64_t #ifndef popen #define popen _popen #endif #ifndef fseeko #ifdef _fseeki64 #define fseeko(stream, offset, origin) _fseeki64(stream, offset, origin) #else #define fseeko fseek #endif #endif #ifndef ftello #ifdef _ftelli64 #define ftello(stream) _ftelli64(stream) #else #define ftello ftell #endif #endif #else #define CHPATHSEP '/' #include #endif #ifndef fseeko #define fseeko fseek #endif #ifndef ftello #define ftello ftell #endif #ifdef DEBUG #undef NDEBUG #define _DEBUG 1 #define _DEBUG_ 1 #endif typedef int32_t int32; typedef uint32_t uint32; typedef int16_t int16; typedef uint16_t uint16; typedef unsigned char uchar; typedef unsigned char byte; #ifndef MAXUINT #define MAXUINT ((unsigned int)-1) #endif #ifndef MAXINT #define MAXINT INT_MAX #endif #ifndef MAX_UINT #define MAX_UINT ((unsigned int)-1) #endif #ifndef MAX_INT #define MAX_INT INT_MAX #endif typedef int64_t int64; typedef uint64_t uint64; /****************************************************************************/ #ifndef EXIT_FAILURE #define EXIT_FAILURE 1 #endif #ifndef EXIT_SUCCESS #define EXIT_SUCCESS 0 #endif /****************************************************************************/ #define ERR_ALLOC "Error allocating memory.\n" //------------------- #define GEXIT(a) { \ fprintf(stderr, "Error: "); fprintf(stderr, a); \ GError("Exiting from line %i in file %s\n",__LINE__,__FILE__); \ } // Debug helpers #ifndef NDEBUG #define GASSERT(exp) ((exp)?((void)0):(void)GAssert(#exp,__FILE__,__LINE__)) #define GVERIFY(condition) \ if (!(condition)) { \ fprintf(stderr, "Assumption \"%s\"\nFailed in file %s: at line:%i\n", \ 
#condition,__FILE__,__LINE__); \ GEXIT(#condition);} #ifdef TRACE #define GTRACE(exp) (GMessage(exp)) #else #define GTRACE(exp) #endif #else #define GASSERT(exp) #define GTRACE(exp) #define GVERIFY(condition) #endif #define GERROR(exp) (GError(exp)) // Abolute value #define GABS(val) (((val)>=0)?(val):-(val)) // Min and Max #define GMAX(a,b) (((a)>(b))?(a):(b)) #define GMIN(a,b) (((a)>(b))?(b):(a)) // Min of three #define GMIN3(x,y,z) ((x)<(y)?GMIN(x,z):GMIN(y,z)) // Max of three #define GMAX3(x,y,z) ((x)>(y)?GMAX(x,z):GMAX(y,z)) // Return minimum and maximum of a, b #define GMINMAX(lo,hi,a,b) ((a)<(b)?((lo)=(a),(hi)=(b)):((lo)=(b),(hi)=(a))) // Clamp value x to range [lo..hi] #define GCLAMP(lo,x,hi) ((x)<(lo)?(lo):((x)>(hi)?(hi):(x))) typedef void* pointer; typedef unsigned int uint; typedef int GCompareProc(const pointer item1, const pointer item2); typedef long GFStoreProc(const pointer item1, FILE* fstorage); //for serialization typedef pointer GFLoadProc(FILE* fstorage); //for deserialization typedef void GFreeProc(pointer item); //usually just delete, //but may also support structures with embedded dynamic members #define GMALLOC(ptr,size) if (!GMalloc((pointer*)(&ptr),size)) \ GError(ERR_ALLOC) #define GCALLOC(ptr,size) if (!GCalloc((pointer*)(&ptr),size)) \ GError(ERR_ALLOC) #define GREALLOC(ptr,size) if (!GRealloc((pointer*)(&ptr),size)) \ GError(ERR_ALLOC) #define GFREE(ptr) GFree((pointer*)(&ptr)) inline char* strMin(char *arg1, char *arg2) { return (strcmp(arg1, arg2) < 0)? arg1 : arg2; } inline char* strMax(char *arg1, char *arg2) { return (strcmp(arg2, arg1) < 0)? arg1 : arg2; } inline int iround(double x) { return (int)floor(x + 0.5); } int Gmkdir(const char *path, bool recursive=true, int perms = (S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)); void Gmktempdir(char* templ); /****************************************************************************/ inline int Gintcmp(int a, int b) { //return (a>b)? 
1 : ((a==b)?0:-1); return a-b; } int Gstrcmp(const char* a, const char* b, int n=-1); //same as strcmp but doesn't crash on NULL pointers int Gstricmp(const char* a, const char* b, int n=-1); bool GstrEq(const char* a, const char* b); bool GstriEq(const char* a, const char* b); //basic swap template function template void Gswap(T& lhs, T& rhs) { T tmp=lhs; //requires copy operator lhs=rhs; rhs=tmp; } /**************** Memory management ***************************/ bool GMalloc(pointer* ptr, unsigned long size); // Allocate memory bool GCalloc(pointer* ptr, unsigned long size); // Allocate and initialize memory bool GRealloc(pointer* ptr,unsigned long size); // Resize memory void GFree(pointer* ptr); // Free memory, resets ptr to NULL //int saprintf(char **retp, const char *fmt, ...); void GError(const char* format,...); // Error routine (aborts program) void GMessage(const char* format,...);// Log message to stderr // Assert failed routine:- usually not called directly but through GASSERT void GAssert(const char* expression, const char* filename, unsigned int lineno); // ****************** basic string manipulation ************************* char *Gstrdup(const char* str, int xtracap=0); //string duplication with extra capacity added //duplicate a string by allocating a copy for it (+xtracap heap room) and returning the new pointer //caller is responsible for deallocating the returned pointer! char* Gstrdup(const char* sfrom, const char* sto); //same as GStrdup, but with an early termination (e.g. 
on delimiter) char* Gsubstr(const char* str, char* from, char* to=NULL); //extracts a substring, allocating it, including boundaries (from/to) char* replaceStr(char* &str, char* newvalue); //conversion: to Lower/Upper case // creating a new string: char* upCase(const char* str); char* loCase(const char* str); // changing string in place: char* strlower(char * str); char* strupper(char * str); //strstr but for memory zones: scans a memory region //for a substring: void* Gmemscan(void *mem, unsigned int len, void *part, unsigned int partlen); FILE* Gfopen(const char *path, char *mode=NULL); // test if a char is in a string: bool chrInStr(char c, const char* str); char* rstrchr(char* str, char ch); /* returns a pointer to the rightmost occurence of ch in str - like rindex for platforms missing it*/ char* strchrs(const char* s, const char* chrs); //strchr but with a set of chars instead of only one char* rstrfind(const char* str, const char *substr); // like rindex() but for strings; right side version of strstr() char* reverseChars(char* str, int slen=0); //in place reversal of string char* rstrstr(const char* rstart, const char *lend, const char* substr); /*the reversed, rightside equivalent of strstr: starts searching from right end (rstart), going back to left end (lend) and returns a pointer to the last (right) matching character in str */ char* strifind(const char* str, const char* substr); // case insensitive version of strstr -- finding a string within another string // returns NULL if not found //Determines if a string begins with a given prefix //(returns false when any of the params is NULL, // but true when prefix is '' (empty string)!) 
bool startsWith(const char* s, const char* prefix); bool startsiWith(const char* s, const char* prefix); //case insensitive bool endsWith(const char* s, const char* suffix); //Note: returns true if suffix is empty string, but false if it's NULL bool endsiWith(const char* s, const char* suffix); //case insensitive version //like endsWith but also remove the suffix if found //returns true if the given suffix was found and removed bool trimSuffix(char* s, const char* suffix); //case insensitive version: bool trimiSuffix(char* s, const char* suffix); // ELF hash function for strings int strhash(const char* str); //alternate hash functions: int fnv1a_hash(const char* cp); int djb_hash(const char* cp); //---- generic base GSeg : genomic segment (interval) -- // coordinates are considered 1-based (so 0 is invalid) class GSeg { public: uint start; //starte) { start=e;end=s; } else { start=s;end=e; } } //check for overlap with other segment uint len() { return end-start+1; } bool overlap(GSeg* d) { //return startstart ? (d->start<=end) : (start<=d->end); return (start<=d->end && end>=d->start); } bool overlap(GSeg& d) { //return start=d.start); } bool overlap(GSeg& d, int fuzz) { //return start=d.start); } bool overlap(uint x) { return (start<=x && x<=end); } bool overlap(uint s, uint e) { if (s>e) { Gswap(s,e); } return (start<=e && end>=s); } //return the length of overlap between two segments int overlapLen(GSeg* r) { if (startstart) { if (r->start>end) return 0; return (r->end>end) ? end-r->start+1 : r->end-r->start+1; } else { //r->start<=start if (start>r->end) return 0; return (r->endend-start+1 : end-start+1; } } int overlapLen(uint rstart, uint rend) { if (rstart>rend) { Gswap(rstart,rend); } if (startend) return 0; return (rend>end) ? 
end-rstart+1 : rend-rstart+1; } else { //rstart<=start if (start>rend) return 0; return (rendstart && end>=s->end); } bool contained(GSeg* s) { return (s->start<=start && s->end>=end); } bool equals(GSeg& d){ return (start==d.start && end==d.end); } bool equals(GSeg* d){ return (start==d->start && end==d->end); } //fuzzy coordinate matching: bool coordMatch(GSeg* s, uint fuzz=0) { //caller must check for s!=NULL if (fuzz==0) return (start==s->start && end==s->end); uint sd = (start>s->start) ? start-s->start : s->start-start; uint ed = (end>s->end) ? end-s->end : s->end-end; return (sd<=fuzz && ed<=fuzz); } void expand(int by) { //expand in both directions start-=by; end+=by; } void expandInclude(uint rstart, uint rend) { //expand to include given coordinates if (rstart>rend) { Gswap(rstart,rend); } if (rstartend) end=rend; } //comparison operators required for sorting bool operator==(GSeg& d){ return (start==d.start && end==d.end); } bool operator<(GSeg& d){ return (start==d.start)?(end=fCount) GError(GDynArray_INDEX_ERR, x, fCount) #endif #define GDynArray_MAXCOUNT UINT_MAX-1 #define GDynArray_NOIDX UINT_MAX //basic dynamic array (vector) template for simple/primitive types or structs //Warning: uses malloc so it will never call the item's default constructor when growing template class GDynArray { protected: bool byptr; //in-place copy (pointer) takeover of existing OBJ[] OBJ *fArray; uint fCount; uint fCapacity; // size of allocated memory const static uint dyn_array_defcap = 16; // initial capacity (in elements) public: GDynArray(int initcap=dyn_array_defcap):byptr(false), fArray(NULL), fCount(0), fCapacity(initcap) { // constructor GMALLOC(fArray, fCapacity*sizeof(OBJ)); } GDynArray(const GDynArray &a):byptr(false), fArray(NULL), fCount(a.fCount), fCapacity(a.fCapacity) { // copy constructor GMALLOC(fArray, sizeof(OBJ)*a.fCapacity); memcpy(fArray, a.fArray, sizeof(OBJ)* a.fCapacity); } GDynArray(OBJ* ptr, uint pcap):byptr(true), fArray(ptr), fCount(0), 
fCapacity(pcap) { //this will never deallocate the passed pointer } virtual ~GDynArray() { if (!byptr) { GFREE(fArray); } } GDynArray& operator = (const GDynArray &a) { // assignment operator if (this == &a) return *this; if (a.fCount == 0) { Clear(); return *this; } growTo(a.fCapacity); //set size memcpy(fArray, a.fArray, sizeof(OBJ)*a.fCount); return *this; } OBJ& operator[] (uint idx) {// get array item GDynArray_TEST_INDEX(idx); return fArray[idx]; } void Grow() { int delta = (fCapacity>16) ? (fCapacity>>2) : 2; if (GDynArray_MAXCOUNT-delta<=fCapacity) delta=GDynArray_MAXCOUNT-fCapacity; if (delta<=1) GError("Error at GDynArray::Grow(): max capacity reached!\n"); growTo(fCapacity + delta); } #define GDynArray_ADD(item) \ if (fCount==MAX_UINT-1) GError("Error at GDynArray: cannot add item, maximum count reached!\n"); \ if ((++fCount) > fCapacity) Grow(); \ fArray[fCount-1] = item; uint Add(OBJ* item) { // Add item to the end of array //element given by pointer if (item==NULL) return GDynArray_NOIDX; GDynArray_ADD( (*item) ); return (fCount-1); } uint Add(OBJ item) { // Add OBJ copy to the end of array GDynArray_ADD(item); return (fCount-1); } uint Push(OBJ item) { //same as Add GDynArray_ADD(item); return (fCount-1); } OBJ Pop() { //shoddy.. Do NOT call this for an empty array! if (fCount==0) return (OBJ)NULL; //a NULL cast operator is required --fCount; return fArray[fCount]; } uint Count() { return fCount; } // get size of array (elements) uint Capacity() { return fCapacity; } void growTo(uint newcap) { if (newcap==0) { Clear(); return; } if (newcap <= fCapacity) return; //never shrink! 
(use Pack() for shrinking) GREALLOC(fArray, newcap*sizeof(OBJ)); fCapacity=newcap; } void append(OBJ* arr, uint count) { //fast adding of a series of objects growTo(fCount+count); memcpy(fArray+fCount, arr, count*sizeof(OBJ)); fCount+=count; } void append(GDynArray arr) { //fast adding of a series of objects growTo(fCount+arr.fCount); memcpy(fArray+fCount, arr.fArray, arr.fCount*sizeof(OBJ)); fCount+=arr.fCount; } void Trim(int tcount=1) { //simply cut (discard) the last tcount items //new Count is now fCount-tcount //does NOT shrink capacity accordingly! if (fCount>=tcount) fCount-=tcount; } void Pack() { //shrink capacity to fCount+dyn_array_defcap if (fCapacity-fCount<=dyn_array_defcap) return; int newcap=fCount+dyn_array_defcap; GREALLOC(fArray, newcap*sizeof(OBJ)); fCapacity=newcap; } void zPack(OBJ z) { //shrink capacity to fCount+1 and adds a z terminator if (fCapacity-fCount<=1) { fArray[fCount]=z; return; } int newcap=fCount+1; GREALLOC(fArray, newcap*sizeof(OBJ)); fCapacity=newcap; fArray[fCount]=z; } inline void Shrink() { Pack(); } void Delete(uint idx) { GDynArray_TEST_INDEX(idx); --fCount; if (idx& fields, const char* delim, int maxfields=MAX_INT); //splits a string by placing 0 where any of the delim chars are found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int strsplit(char* str, GDynArray& fields, const char delim, int maxfields=MAX_INT); //splits a string by placing 0 where the delim char is found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int strsplit(char* str, GDynArray& fields, int maxfields=MAX_INT); //splits by tab or space //splits a string by placing 0 where tab or space is found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed // ************** simple line reading class for text files //GLineReader -- text line reading/buffering class class 
GLineReader { bool closeFile; //int len; //int allocated; GDynArray buf; int textlen; //length of actual text, without newline character(s) bool isEOF; FILE* file; off_t filepos; //current position bool pushed; //pushed back int lcount; //line counter (read lines) public: char* chars() { return buf(); } char* line() { return buf(); } int readcount() { return lcount; } //number of lines read void setFile(FILE* stream) { file=stream; } int blength() { return buf.Count(); } //binary/buffer length, including newline character(s) int charcount() { return buf.Count(); } //line length, including newline character(s) int tlength() { return textlen; } //text length excluding newline character(s) int linelen() { return textlen; } //line length, excluding newline character(s) //int size() { return buf.Count(); } //same as size(); bool isEof() {return isEOF; } bool eof() { return isEOF; } off_t getfpos() { return filepos; } off_t getFpos() { return filepos; } char* nextLine() { return getLine(); } char* getLine() { if (pushed) { pushed=false; return buf(); } else return getLine(file); } char* getLine(FILE* stream) { if (pushed) { pushed=false; return buf(); } else return getLine(stream, filepos); } char* getLine(FILE* stream, off_t& f_pos); //read a line from a stream and update // the given file position void pushBack() { if (lcount>0) pushed=true; } // "undo" the last getLine request // so the next call will in fact return the same line GLineReader(const char* fname):closeFile(false),buf(1024), textlen(0), isEOF(false),file(NULL),filepos(0), pushed(false), lcount(0) { FILE* f=fopen(fname, "rb"); if (f==NULL) GError("Error opening file '%s'!\n",fname); closeFile=true; file=f; } GLineReader(FILE* stream=NULL, off_t fpos=0):closeFile(false),buf(1024), textlen(0), isEOF(false),file(stream), filepos(fpos), pushed(false), lcount(0) { } ~GLineReader() { if (closeFile) fclose(file); } }; /* extended fgets() - to read one full line from a file and update the file position correctly ! 
buf will be reallocated as necessary, to fit the whole line */ char* fgetline(char* & buf, int& buflen, FILE* stream, off_t* f_pos=NULL, int* linelen=NULL); //print int/values nicely formatted in 3-digit groups char* commaprintnum(uint64 n); /*********************** File management functions *********************/ // removes the last part (file or directory name) of a full path // WARNING: this is a destructive operation for the given string! void delFileName(char* filepath); // returns a pointer to the last file or directory name in a full path const char* getFileName(const char* filepath); // returns a pointer to the file "extension" part in a filename const char* getFileExt(const char* filepath); int fileExists(const char* fname); //returns 0 if path doesn't exist // 1 if it's a directory // 2 if it's a regular file // 3 something else (but entry exists) int64 fileSize(const char* fpath); //write a formatted fasta record, fasta formatted void writeFasta(FILE *fw, const char* seqid, const char* descr, const char* seq, int linelen=60, int seqlen=0); //parses the next number found in a string at the current position //until a non-digit (and not a '.', 'e','E','-','+') is encountered; //updates the char* pointer to be after the last digit parsed bool parseNumber(char* &p, double& v); bool parseDouble(char* &p, double& v); //just an alias for parseNumber bool parseFloat(char* &p, float& v); bool strToInt(char* p, int& i); bool strToUInt(char* p, uint& i); bool parseInt(char* &p, int& i); //advance pointer p after the number bool parseUInt(char* &p, uint& i); //advance pointer p after the number bool parseHex(char* &p, uint& i); #endif /* G_BASE_DEFINED */ libgff-2.0.0/include/GFaSeqGet.h000066400000000000000000000247071367741004700163270ustar00rootroot00000000000000#ifndef GFASEQGET_H #define GFASEQGET_H #include "GFastaIndex.h" #define MAX_FASUBSEQ 0x20000000 //max 512MB sequence data held in memory at a time class GSubSeq { public: uint sqstart; //1-based coord of 
subseq start on sequence uint sqlen; //length of subseq loaded char* sq; //actual subsequence data will be stored here // (with end-of-line characters removed) /*char* xseq; //the exposed pointer to the last requested subsequence start off_t xstart; //the coordinate start for the last requested subseq off_t xlen; //the last requested subseq len*/ GSubSeq() { sqstart=0; sqlen=0; sq=NULL; /* xseq=NULL; xstart=0; xlen=0;*/ } void forget() { //forget about pointer data, so we can reuse it sq=NULL; sqstart=0; sqlen=0; } ~GSubSeq() { GFREE(sq); } // genomic, 1-based coordinates: void setup(uint sstart, int slen, int sovl=0, int qfrom=0, int qto=0, uint maxseqlen=0); //check for overlap with previous window and realloc/extend appropriately //returns offset from seq that corresponds to sstart // the window will keep extending until MAX_FASUBSEQ is reached }; // class GFaSeqGet { char* fname; //file name where the sequence resides FILE* fh; off_t fseqstart; //file offset where the sequence actually starts uint seq_len; //total sequence length, if known (when created from GFastaIndex) uint line_len; //length of each line of text uint line_blen; //binary length of each line // = line_len + number of EOL character(s) GSubSeq* lastsub; void initialParse(off_t fofs=0, bool checkall=true); const char* loadsubseq(uint cstart, int& clen); void finit(const char* fn, off_t fofs, bool validate); public: //GStr seqname; //current sequence name char* seqname; GFaSeqGet(): fname(NULL), fh(NULL), fseqstart(0), seq_len(0), line_len(0), line_blen(0), lastsub(NULL), seqname(NULL) { } GFaSeqGet(const char* fn, off_t fofs, bool validate=false):fname(NULL), fh(NULL), fseqstart(0), seq_len(0), line_len(0), line_blen(0), lastsub(NULL), seqname(NULL) { finit(fn,fofs,validate); } GFaSeqGet(const char* fn, bool validate=false):fname(NULL), fh(NULL), fseqstart(0), seq_len(0), line_len(0), line_blen(0), lastsub(NULL), seqname(NULL) { finit(fn,0,validate); } GFaSeqGet(const char* faname, uint seqlen, 
off_t fseqofs, int l_len, int l_blen); //constructor from GFastaIndex record GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false); ~GFaSeqGet() { if (fname!=NULL) { GFREE(fname); fclose(fh); } GFREE(seqname); delete lastsub; } const char* seq(uint cstart=1, int clen=0) { int cend = clen==0 ? 0 : cstart+clen-1; return getRange(cstart, cend); } const char* subseq(uint cstart, int& clen); const char* getRange(uint cstart=1, uint cend=0) { if (cend==0) cend=(seq_len>0)?seq_len : MAX_FASUBSEQ; if (cstart>cend) { Gswap(cstart, cend); } int clen=cend-cstart+1; //int rdlen=clen; return subseq(cstart, clen); } //caller is responsible for deallocating the return string char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false); //uncached, read and return allocated buffer //caller is responsible for deallocating the return string char* fetchSeq(int* retlen=NULL) { int clen=(seq_len>0) ? seq_len : MAX_FASUBSEQ; if (lastsub) { delete lastsub; lastsub=NULL; } subseq(1, clen); if (retlen) *retlen=clen; char* r=lastsub->sq; lastsub->forget(); if (clen>0) { r[clen]=0; } else { r=NULL; } return r; } void loadall(uint32 max_len=0) { //TODO: better read the whole sequence differently here - line by line //so when EOF or another '>' line is found, the reading stops! int clen=(seq_len>0) ? seq_len : ((max_len>0) ? max_len : MAX_FASUBSEQ); subseq(1, clen); } void load(uint cstart, uint cend) { //cache as much as possible if (seq_len>0 && cend>seq_len) cend=seq_len; //correct a bad request int clen=cend-cstart+1; subseq(cstart, clen); } int getsublen() { return lastsub!=NULL ? 
lastsub->sqlen : 0 ; } int getseqlen() { return seq_len; } //known when loaded with GFastaIndex off_t getseqofs() { return fseqstart; } int getLineLen() { return line_len; } int getLineBLen() { return line_blen; } //reads a subsequence starting at genomic coordinate cstart (1-based) }; //multi-fasta sequence handling class GFastaDb { public: char* fastaPath; GFastaIndex* faIdx; //could be a cdb .cidx file //int last_fetchid; const char* last_seqname; GFaSeqGet* faseq; //GCdbYank* gcdb; GFastaDb(const char* fpath=NULL, bool forceIndexFile=true):fastaPath(NULL), faIdx(NULL), last_seqname(NULL), faseq(NULL) { //gcdb=NULL; init(fpath, forceIndexFile); } void init(const char* fpath, bool writeIndexFile=true) { if (fpath==NULL || fpath[0]==0) return; //last_fetchid=-1; last_seqname=NULL; if (!fileExists(fpath)) GError("Error: file/directory %s does not exist!\n",fpath); fastaPath=Gstrdup(fpath); //GStr gseqpath(fpath); if (fileExists(fastaPath)>1) { //exists and it's not a directory char* fainame=Gstrdup(fastaPath,4); int fainamelen=strlen(fainame); //int fainame_len=strlen(fainame); if (trimSuffix(fastaPath, ".fai")) { //.fai index file given directly if (!fileExists(fastaPath)) GError("Error: cannot find fasta file for index %s !\n", fastaPath); } else { //append .fai as needed strcpy(fainame+fainamelen, ".fai"); fainamelen+=4; } //GMessage("creating GFastaIndex with fastaPath=%s, fainame=%s\n", fastaPath, fainame.chars()); faIdx=new GFastaIndex(fastaPath, fainame); char* fainamecwd=fainame; //will hold just the file name without the path char* plast=strrchr(fainamecwd, '/'); //CHPATHSEP if (plast!=NULL) { fainamecwd=plast+1; //point to the file name only } if (!faIdx->hasIndex()) { //could not load index file .fai //try current directory (Warning: might not be the correct index for that file!) 
if (plast==NULL) { if (fileExists(fainamecwd)>1) { faIdx->loadIndex(fainamecwd); } } } //tried to load index if (!faIdx->hasIndex()) { //no index file to be loaded, build the index //if (forceIndexFile) // GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath); faIdx->buildIndex(); //build index in memory only if (faIdx->getCount()==0) GError("Error: no fasta records found!\n"); if (writeIndexFile) { //GMessage("Fasta index rebuilt.\n"); FILE* fcreate=fopen(fainame, "w"); char* idxfname=fainame; if (fcreate==NULL) { GMessage("Warning: cannot create fasta index file %s! (permissions?)\n", fainame); if (fainame!=fainamecwd) { //try cwd idxfname=fainamecwd; GMessage(" Attempting to create the index in the current directory..\n"); if ((fcreate=fopen(fainamecwd, "w"))==NULL) GError("Error: cannot create fasta index file %s!\n", fainamecwd); } } if (fcreate!=NULL) { if (faIdx->storeIndex(fcreate)getCount()) GMessage("Warning: error writing the index file %s!\n", idxfname); else GMessage("FASTA index file %s created.\n", idxfname); } } //file storage of index requested } //creating FASTA index GFREE(fainame); } //multi-fasta file } GFaSeqGet* fetchFirst(const char* fname, bool checkFasta=false) { faseq=new GFaSeqGet(fname, checkFasta); faseq->loadall(); //last_fetchid=gseq_id; GFREE(last_seqname); last_seqname=Gstrdup(faseq->seqname); return faseq; } char* getFastaFile(const char* gseqname) { if (fastaPath==NULL) return NULL; int gnl=strlen(gseqname); char* s=Gstrdup(fastaPath, gnl+8); int slen=strlen(s); if (s[slen-1]!='/') {//CHPATHSEP ? 
s[slen]='/'; slen++; s[slen]='\0'; } //s.append(gseqname); strcpy(s+slen, gseqname); slen+=gnl; if (!fileExists(s)) { //s.append(".fa") strcpy(s+slen, ".fa"); slen+=3; } if (!fileExists(s)) { strcpy(s+slen, "sta"); slen+=3; } if (fileExists(s)) return Gstrdup(s); else { GMessage("Warning: cannot find genomic sequence file %s/%s{.fa,.fasta}\n",fastaPath, s); return NULL; } GFREE(s); } GFaSeqGet* fetch(const char* gseqname) { if (fastaPath==NULL) return NULL; if (last_seqname!=NULL && (strcmp(gseqname, last_seqname)==0) && faseq!=NULL) return faseq; delete faseq; faseq=NULL; //last_fetchid=-1; GFREE(last_seqname); last_seqname=NULL; //char* gseqname=GffObj::names->gseqs.getName(gseq_id); if (faIdx!=NULL) { //fastaPath was the multi-fasta file name and it must have an index GFastaRec* farec=faIdx->getRecord(gseqname); if (farec!=NULL) { faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos, farec->line_len, farec->line_blen); faseq->loadall(); //just cache the whole sequence, it's faster //last_fetchid=gseq_id; last_seqname=Gstrdup(gseqname); } else { GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname); return NULL; } } else { //directory with FASTA files named as gseqname char* sfile=getFastaFile(gseqname); if (sfile!=NULL) { faseq=new GFaSeqGet(sfile); faseq->loadall(); //last_fetchid=gseq_id; GFREE(sfile); } } //one fasta file per contig //else GMessage("Warning: fasta index not available, cannot retrieve sequence %s\n", // gseqname); return faseq; } ~GFastaDb() { GFREE(fastaPath); GFREE(last_seqname); //delete gcdb; delete faIdx; delete faseq; } }; GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, const char* seqid); #endif libgff-2.0.0/include/GFastaIndex.h000066400000000000000000000037331367741004700167120ustar00rootroot00000000000000/* * GFaIdx.h * * Created on: Aug 25, 2010 * Author: gpertea */ #ifndef GFAIDX_H_ #define GFAIDX_H_ #include "GHash.hh" #include "GList.hh" class GFastaRec { public: char* seqname; uint seqlen; off_t fpos; int line_len; 
//effective line length (without EoL) int line_blen; //length of line including EoL characters GFastaRec(uint slen=0, off_t fp=0, int llen=0, int llenb=0) { seqname=NULL; //only a pointer copy seqlen=slen; fpos=fp; line_len=llen; line_blen=llenb; } bool operator==(GFastaRec& d){ return (fpos==d.fpos); } bool operator>(GFastaRec& d){ return (fpos>d.fpos); } bool operator<(GFastaRec& d){ return (fpos records; void addRecord(const char* seqname, uint seqlen, off_t foffs, int llen, int llen_full); GFastaRec* getRecord(const char* seqname) { return records.Find(seqname); } bool hasIndex() { return haveFai; } int loadIndex(const char* finame); int buildIndex(); //build index in memory by parsing the whole fasta file int storeIndex(const char* finame); int storeIndex(FILE* fai); int getCount() { return records.Count(); } GFastaIndex(const char* fname, const char* finame=NULL):records() { if (fileExists(fname)!=2) GError("Error: fasta file %s not found!\n",fname); if (fileSize(fname)<=0) GError("Error: invalid fasta file %s !\n",fname); fa_name=Gstrdup(fname); fai_name=finame!=NULL ? 
Gstrdup(finame) : NULL; if (fileSize(fa_name)==0) { GError("Error creating GFastaIndex(%s): invalid fasta file!\n",fa_name); } haveFai=false; if (fai_name!=NULL && fileSize(fai_name)>0) { //try to load the index file if it exists loadIndex(fai_name); haveFai=(records.Count()>0); } } ~GFastaIndex() { GFREE(fa_name); GFREE(fai_name); } }; #endif /* GFAIDX_H_ */ libgff-2.0.0/include/GHash.hh000066400000000000000000000405451367741004700157210ustar00rootroot00000000000000/******************************************************************************** * Hash table class template (char* based) * *********************************************************************************/ #ifndef GHash_HH #define GHash_HH #include "GBase.h" /** * This class maintains a fast-access hash table of entities * indexed by a character string (essentially, maps strings to pointers) */ //#define HASH_DBG_PRINT 1 #define GSTR_HASH(s) strhash(s) //#define GSTR_HASH(s) djb_hash(s) //#define GSTR_HASH(s) fnv1a_hash(s) //#define GSTR_HASH(s) murmur3(s) template class GHash { protected: struct GHashEntry { char* key; // Key string bool keyalloc; // shared key flag (to free/not the key) int hash; // Hash value of key pointer data; // Data }; GHashEntry* hash; // Hash int fCapacity; // table size int fCount; // number of valid entries int fCurrentEntry; char* lastkeyptr; //pointer to last key string added //---------- Raw data retrieval (including empty entries) // Return key at position pos. 
const char* Key(uint pos) const { return hash[pos].key; } // return data OBJ* at given position OBJ* Data(uint pos) const { return (OBJ*) hash[pos].data; } // Return position of first filled slot, or >= fCapacity int First() const; // Return position of last filled slot or -1 int Last() const; // Return position of next filled slot in hash table // or a value greater than or equal to fCapacity if no filled // slot was found int Next(int pos) const; //Return position of previous filled slot in hash table //or a -1 if no filled slot was found int Prev(int pos) const; private: GHash(const GHash&); GHash &operator=(const GHash&); GFreeProc* fFreeProc; //procedure to free item data protected: public: static void DefaultFreeProc(pointer item) { delete (OBJ*)item; } public: GHash(GFreeProc* freeProc); // constructs of an empty hash GHash(bool doFree=true); // constructs of an empty hash (free the item objects) void setFreeItem(GFreeProc *freeProc) { fFreeProc=freeProc; } void setFreeItem(bool doFree) { fFreeProc=(doFree)? &DefaultFreeProc : NULL; } int Capacity() const { return fCapacity; } // table's size, including the empty slots. void Resize(int m); // Resize the table to the given size. int Count() const { return fCount; }// the total number of entries in the table. // Insert a new entry into the table given key. // If there is already an entry with that key, leave it unchanged OBJ* Add(const char* ky, OBJ* ptr=NULL); //same with Add, but frees the old element if it's a replacement OBJ* fAdd(const char* ky, OBJ* ptr=NULL); //same as Add, but the key pointer is stored directly, no string copy needed //(shared-key-Add) OBJ* shkAdd(const char* ky, OBJ* ptr); // Replace data at key. If there was no existing entry, // a new entry is inserted. OBJ* Replace(const char* ky, OBJ* ptr); // Remove a given key and its data OBJ* Remove(const char* ky); // Find data OBJ* given key. 
OBJ* Find(const char* ky, char** keyptr=NULL); bool hasKey(const char* ky); char* getLastKey() { return lastkeyptr; } OBJ* operator[](const char* ky) { return Find(ky); } void startIterate(); //iterator-like initialization char* NextKey(); //returns next valid key in the table (NULL if no more) OBJ* NextData(); //returns next valid hash[].data OBJ* NextData(char*& nextkey); //returns next valid hash[].data //or NULL if no more //nextkey is SET to the corresponding key GHashEntry* NextEntry() { //returns a pointer to a GHashEntry int pos=fCurrentEntry; while (pos GHash::GHash(GFreeProc* freeProc) { GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); fCurrentEntry=-1; fFreeProc=freeProc; lastkeyptr=NULL; for (uint i=0; i GHash::GHash(bool doFree) { GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); fCurrentEntry=-1; lastkeyptr=NULL; fFreeProc = (doFree)?&DefaultFreeProc : NULL; for (uint i=0; i void GHash::Resize(int m) { int i,n,p,x,h; GHashEntry *k; GASSERT(fCount<=fCapacity); if(m>2)>m) n>>=1; // Shrink until n/4 <= m while((n>>1)>1)); GASSERT(DEF_HASH_SIZE<=n); if(n!=fCapacity){ GASSERT(m<=n); GMALLOC(k, sizeof(GHashEntry)*n); for(i=0; i=0){ p=HASH1(h,n); GASSERT(0<=p && p OBJ* GHash::Add(const char* ky, OBJ* pdata) { int p,i,x,h,n; if(!ky) GError("GHash::insert: NULL key argument.\n"); GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::fAdd(const char* ky, OBJ* pdata) { int p,i,x,h,n; if(!ky) GError("GHash::insert: NULL key argument.\n"); GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::shkAdd(const char* ky, OBJ* pdata) { int p,i,x,h,n; if(!ky) GError("GHash::insert: NULL key argument.\n"); GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::Replace(const char* ky, OBJ* pdata){ int p,i,x,h,n; if(!ky){ GError("GHash::replace: NULL key argument.\n"); } GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::Remove(const char* ky){ int p,x,h,n; if(!ky){ 
GError("GHash::remove: NULL key argument.\n"); } OBJ* removed=NULL; if(0 bool GHash::hasKey(const char* ky) { int p,x,h,n; if(!ky){ GError("GHash::find: NULL key argument.\n"); } if(0 OBJ* GHash::Find(const char* ky, char** keyptr){ int p,x,h,n; if(!ky){ GError("GHash::find: NULL key argument.\n"); } if (fCount==0) return NULL; h=GSTR_HASH(ky); GASSERT(0<=h); p=HASH1(h,fCapacity); GASSERT(0<=p && p void GHash::startIterate() {// initialize a key iterator; call fCurrentEntry=0; } template char* GHash::NextKey() { int pos=fCurrentEntry; while (pos OBJ* GHash::NextData() { int pos=fCurrentEntry; while (pos OBJ* GHash::NextData(char* &nextkey) { int pos=fCurrentEntry; while (pos int GHash::First() const { int pos=0; while(pos int GHash::Last() const { int pos=fCapacity-1; while(0<=pos){ if(0<=hash[pos].hash) break; pos--; } GASSERT(pos<0 || 0<=hash[pos].hash); return pos; } // Find next valid entry template int GHash::Next(int pos) const { GASSERT(0<=pos && pos int GHash::Prev(int pos) const { GASSERT(0<=pos && pos= 0){ if(0<=hash[pos].hash) break; } GASSERT(pos<0 || 0<=hash[pos].hash); return pos; } // Remove all template void GHash::Clear(){ int i; for(i=0; i=0){ if (hash[i].keyalloc) GFREE((hash[i].key)); if (FREEDATA) (*fFreeProc)(hash[i].data); } } GFREE(hash); GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); //reinitialize it for (i=0; i GHash::~GHash(){ for(int i=0; i=0){ if (hash[i].keyalloc) GFREE((hash[i].key)); if (FREEDATA) (*fFreeProc)(hash[i].data); } } GFREE(hash); } class GStrSet:public GHash { protected: bool free_keys; public: GStrSet(bool shared_keys=false):GHash(false), free_keys(!shared_keys) { } void Add(const char* str) { if (free_keys) { //allocates a copy of str GHash::Add(str, NULL); } else this->shkAdd(str, NULL); } void add(const char* str) { this->Add(str); } void push(const char* str) { this->Add(str); } bool has(const char* str) { return hasKey(str); } }; #endif 
libgff-2.0.0/include/GList.hh000066400000000000000000000526441367741004700157540ustar00rootroot00000000000000//--------------------------------------------------------------------------- /* Sortable collections of objects and object pointers */ #ifndef _GList_HH #define _GList_HH #include "GVec.hh" #define GLIST_SORTED_ERR "Operation not allowed on a sorted list!\n" #define GLIST_UNSORTED_ERR "Operation not allowed on an unsorted list!\n" //------ useful macros: #define BE_UNSORTED if (fCompareProc!=NULL) { GError(GLIST_SORTED_ERR); return; } #define BE_SORTED if (fCompareProc==NULL) { GError(GLIST_UNSORTED_ERR); return; } #define SORTED (fCompareProc!=NULL) #define UNSORTED (fCompareProc==NULL) // GArray is the sortable array type, requires the comparison operator < to be defined template class GArray:public GVec { protected: bool fUnique; static int DefaultCompareProc(const pointer item1, const pointer item2) { //operator< MUST be defined for OBJ class! if (*((OBJ*)item2) < *((OBJ*)item1)) return 1; else if (*((OBJ*)item1) < *((OBJ*)item2)) return -1; else return 0; } GCompareProc* fCompareProc; public: GArray(GCompareProc* cmpFunc=NULL); GArray(bool sorted, bool unique=false); GArray(int init_capacity, bool sorted, bool unique=false); GArray(GArray& array); //copy constructor const GArray& operator=(GArray& array); //~GArray(); //assignment operator void setSorted(GCompareProc* cmpFunc); void setSorted(bool sorted) { if (sorted) { if (fCompareProc!=&DefaultCompareProc) { fCompareProc=&DefaultCompareProc; Sort(); } } else fCompareProc=NULL; } //sort the array if cmpFunc not NULL or changes int Add(OBJ* item); // specific implementation if sorted int Add(OBJ& item) { return Add(&item); } //both will CREATE a new OBJ and COPY to it // using OBJ new operator= int AddIfNew(OBJ& item, int* fidx=NULL); //requires == operator //if equal item not found, item is added and return the index of it //otherwise returns -1 and fidx is set to the equal item location int cAdd(OBJ 
item) { return Add(&item); } int cPush(OBJ item) { return Add(&item); } int Push(OBJ& item) { return Add(&item); } void Add(GArray& list); //add copies of all items from another list //this will reject identical items in sorted lists only! void setUnique(bool beUnique) { fUnique = beUnique; }; void Sort(); //explicit sort may be requested bool Sorted() { return fCompareProc!=NULL; } void Replace(int idx, OBJ& item); //Put, use operator= to copy int Unique() { return fUnique; } int IndexOf(OBJ& item); //this needs the == operator to have been defined for OBJ bool Found(OBJ& item, int& idx); // for sorted arrays only; //search by content; if found, returns true and idx will be the index //of the first item found matching for which fCompareProc returns 0 bool Exists(OBJ& item); //same as above without existing index info //unsorted only, place item at position idx: void Move(int curidx, int newidx); void Insert(int idx, OBJ* item); void Insert(int idx, OBJ item) { Insert(idx,&item); } }; //GList is a sortable collection of pointers to objects; requires operator< to be defined, or a custom compare function template class GList:public GPVec { protected: bool fUnique; GCompareProc* fCompareProc; //a pointer to a Compare function static int DefaultCompareProc(const pointer item1, const pointer item2) { //operator< MUST be defined for OBJ class! if (*((OBJ*)item2) < *((OBJ*)item1)) return 1; else if (*((OBJ*)item1) < *((OBJ*)item2)) return -1; else return 0; } public: void sortInsert(int idx, OBJ* item); //special insert in sorted lists //WARNING: the caller must know the insert index such that the sort order is preserved! GList(GCompareProc* compareProc=NULL); //free by default GList(GCompareProc* compareProc, //unsorted by default GFreeProc *freeProc, bool beUnique=false); GList(bool sorted, bool free_elements=true, bool beUnique=false); GList(int init_capacity, bool sorted, bool free_elements=true, bool beUnique=false); GList(GList& list); //copy constructor? 
GList(GList* list); //kind of a copy constructor const GList& operator=(GList& list); //void Clear(); //~GList(); void setSorted(GCompareProc* compareProc); //sorted if compareProc not NULL; sort the list if compareProc changes ! bool Sorted() { return fCompareProc!=NULL; } void setSorted(bool sorted) { if (sorted) { if (fCompareProc!=&DefaultCompareProc) { fCompareProc=&DefaultCompareProc; Sort(); } } else fCompareProc=NULL; } int Add(OBJ* item); //-- specific implementation if sorted - may become an Insert() void Add(GList& list); //add all pointers from another list OBJ* AddIfNew(OBJ* item, bool deleteIfFound=true, int* fidx=NULL); // default: delete item if Found() (and pointers are not equal)! //returns the equal (==) object if it's in the list already //or the item itself if it is unique and actually added int AddedIfNew(OBJ* item); // if Found(item) (and pointers are not equal) delete item and returns -1 // if added, returns the new item index int Unique() { return fUnique; } //this will reject identical items in sorted lists only! 
void setUnique(bool beUnique) { fUnique = beUnique; }; GCompareProc* GetCompareProc() {return fCompareProc;} int IndexOf(OBJ* item); //this has a specific implementation for sorted lists //if list is sorted, item data is located by binary search //based on the Compare function //if not, a linear search is performed, but //this needs the == operator to have been defined for OBJ void Put(int idx, OBJ* item, bool re_sort=false); bool Found(OBJ* item, int & idx); // sorted only; //search by content; if found, returns true and idx will be the index //of the first item found matching for which GTCompareProc returns 0 bool Exists(OBJ* item); //same as above without existing index info bool Exists(OBJ& item); //same as above without existing index info void Sort(); //explicit sort may be requested using this function int Remove(OBJ* item); //search for pointer, using binary search if sorted void Insert(int idx, OBJ* item); //unsorted only, place item at position idx void Move(int curidx, int newidx); }; //GList //-------------------- TEMPLATE IMPLEMENTATION------------------------------- template GArray::GArray(GArray& array):GVec(0) { //copy constructor this->fCount=array.fCount; this->fCapacity=array.fCapacity; this->fArray=NULL; if (this->fCapacity>0) { //GMALLOC(this->fArray, this->fCapacity*sizeof(OBJ)); this->fArray=new OBJ[this->fCapacity]; } this->fCount=array.fCount; fUnique=array.fUnique; fCompareProc=array.fCompareProc; // uses OBJ operator= for (int i=0;ifCount;i++) this->fArray[i]=array[i]; } template const GArray& GArray::operator=(GArray& array) { if (&array==this) return *this; GVec::Clear(); this->fCount=array.fCount; this->fUnique=array.fUnique; this->fCapacity=array.fCapacity; if (this->fCapacity>0) { //GMALLOC(this->fArray, this->fCapacity*sizeof(OBJ)); this->fArray=new OBJ[this->fCapacity]; } this->fCompareProc=array.fCompareProc; this->fCount=array.fCount; // uses OBJ operator= for (int i=0;ifCount;i++) { this->fArray[i]=array[i]; } return *this; } 
template GArray::GArray(GCompareProc* cmpFunc):GVec(0) { fCompareProc = cmpFunc; fUnique = false; //only affects sorted lists } template GArray::GArray(bool sorted, bool unique):GVec(0) { fUnique=unique; fCompareProc = sorted ? DefaultCompareProc : NULL; } template GArray::GArray(int init_capacity, bool sorted, bool unique):GVec(init_capacity) { fUnique=unique; fCompareProc=sorted ? DefaultCompareProc : NULL; } template void GArray::setSorted(GCompareProc* cmpFunc) { GCompareProc* old_proc=fCompareProc; fCompareProc=cmpFunc; if (fCompareProc!=old_proc && fCompareProc!=NULL) Sort(); //new compare method } template int GArray::IndexOf(OBJ& item) { int result=0; if (Found(item, result)) return result; else return -1; } template bool GArray::Exists(OBJ& item) { int result=0; if (Found(item, result)) return true; else return false; } template int GArray::Add(OBJ* item) { if (item==NULL) return -1; int result; if (SORTED) { if (Found(*item, result)) if (fUnique) return -1; //cannot add a duplicate! //Found sets result to the position where the item should be! 
GVec::Insert(result, *item); } else { if (fUnique && Found(*item, result)) return -1; //set behaviour result = this->fCount; if (result==this->fCapacity) GVec::Grow(); this->fArray[result] = *item; //operator=, copies the item this->fCount++; } return result; } template void GArray::Add(GArray& list) { if (list.Count()==0) return; if (SORTED) { for (int i=0;isetCapacity(this->fCapacity+list.fCount); int s=this->fCount; for (int i=0;ifArray[s+i]=list.fArray[i]; this->fCount+=list.fCount; } } //returns -1 if existing equal object exists, sets fidx to that equal item index //or returns the index where the item was added/inserted template int GArray::AddIfNew(OBJ& item, int* fidx) { int rpos; if (Found(item, rpos)) { if (fidx) *fidx=rpos; //the position where the item should be inserted: return -1; //found and not added } //not found, let's insert it if (SORTED) { //Found() set result to the position where the item should be inserted GVec::Insert(rpos, item); } else { //simply append rpos = this->fCount; if (rpos==this->fCapacity) GVec::Grow(); this->fArray[rpos] = item; //operator= copies the item this->fCount++; } if (fidx!=NULL) *fidx=rpos; return rpos; } template bool GArray::Found(OBJ& item, int& idx) { //search the list by using fCompareProc (if defined) //or == operator for a non-sortable list //for sorted lists, even when the result is false, the idx is //set to the closest matching object! int i; idx=-1; if (this->fCount==0) { idx=0; return false;} if (SORTED) { //binary search based on fCompareProc //do the simplest tests first: if ((*fCompareProc)(&(this->fArray[0]),&item)>0) { idx=0; return false; } if ((*fCompareProc)(&item, &(this->fArray[this->fCount-1]))>0) { idx=this->fCount; return false; } int l=0; int h = this->fCount - 1; int c; while (l <= h) { i = (l + h) >> 1; c = (*fCompareProc)(&(this->fArray[i]), &item); if (c < 0) l = i + 1; else { h = i - 1; if (c == 0) { //found! 
idx=i; return true; } } } //while idx = l; return false; } else {//not sorted: use linear search // needs == operator to compare user defined objects ! i=0; while (ifCount) { if (this->fArray[i]==item) { //requires operator== idx=i; return true; } i++; } return false; } } template void GArray::Insert(int idx, OBJ* item) { //idx can be [0..fCount] so an item can be actually added BE_UNSORTED; //forbid this operation on sorted data GVec::Insert(idx, item); } template void GArray::Move(int curidx, int newidx) { BE_UNSORTED; //cannot do this in a sorted list! if (curidx!=newidx || newidx>=this->fCount) GError(GVEC_INDEX_ERR, newidx); OBJ tmp=this->fArray[curidx]; //copy constructor here this->fArray[curidx]=this->fArray[newidx]; this->fArray[newidx]=tmp; } template void GArray::Replace(int idx, OBJ& item) { //TEST_INDEX(idx); if (idx<0 || idx>=this->fCount) GError(GVEC_INDEX_ERR, __FILE__,__LINE__, idx); this->fArray[idx]=item; if ( SORTED ) Sort(); //re-sort ! this could be very expensive, don't do it } template void GArray::Sort() { if (fCompareProc==NULL) { fCompareProc=DefaultCompareProc; } if (this->fArray!=NULL && this->fCount>0) this->qSort(0, this->fCount-1, fCompareProc); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //*=> GList implementation -- sortable array of pointers to OBJ template GList::GList(GList& list):GPVec(list) { //copy constructor fUnique=list.fUnique; fCompareProc=list.fCompareProc; } template GList::GList(GList* plist):GPVec(0) { //another copy constructor this->fCapacity=plist->fCapacity; this->fList=NULL; if (this->fCapacity>0) { GMALLOC(this->fList, this->fCapacity*sizeof(OBJ*)); } fUnique=plist->fUnique; fCompareProc=plist->fCompareProc; this->fFreeProc=plist->fFreeProc; this->fCount=plist->fCount; memcpy(this->fList, plist->fList, this->fCount*sizeof(OBJ*)); //for (int i=0;ifCount;i++) Add(plist->Get(i)); } template void GList::Add(GList& list) { if (list.Count()==0) return; if (SORTED) { for (int 
i=0;isetCapacity(this->fCapacity+list.fCount); memcpy( & (this->fList[this->fCount]), list.fList, list.fCount*sizeof(OBJ*)); this->fCount+=list.fCount; } } template GList::GList(GCompareProc* compareProc, GFreeProc* freeProc, bool beUnique) { fCompareProc = compareProc; this->fFreeProc = freeProc; fUnique = beUnique; //only affects sorted lists } template GList::GList(GCompareProc* compareProc) { fCompareProc = compareProc; this->fFreeProc = GPVec::DefaultFreeProc; fUnique = false; //only affects sorted lists } template GList::GList(bool sorted, bool free_elements, bool beUnique) { if (sorted) { if (free_elements) { fCompareProc=&DefaultCompareProc; this->fFreeProc = GPVec::DefaultFreeProc; fUnique=beUnique; } else { fCompareProc=&DefaultCompareProc; this->fFreeProc=NULL; fUnique=beUnique; } } else { if (free_elements) { fCompareProc=NULL; this->fFreeProc=GPVec::DefaultFreeProc; fUnique=beUnique; } else { fCompareProc=NULL; this->fFreeProc=NULL; fUnique=beUnique; } } } template GList::GList(int init_capacity, bool sorted, bool free_elements, bool beUnique):GPVec(init_capacity, free_elements) { if (sorted) { fCompareProc=&DefaultCompareProc; fUnique=beUnique; } else { fCompareProc=NULL; fUnique=beUnique; } } template const GList& GList::operator=(GList& list) { if (&list!=this) { GPVec::Clear(); fCompareProc=list.fCompareProc; this->fFreeProc=list.fFreeProc; //Attention: the object pointers are copied directly, //but the actual objects are NOT duplicated for (int i=0;i void GList::setSorted(GCompareProc* compareProc) { GCompareProc* old_proc=fCompareProc; fCompareProc=compareProc; if (fCompareProc!=old_proc && fCompareProc!=NULL) Sort(); //new compare method } template int GList::IndexOf(OBJ* item) { int result=0; if (Found(item, result)) return result; else return -1; } template bool GList::Exists(OBJ& item) { int result=0; if (Found(&item, result)) return true; else return false; } template bool GList::Exists(OBJ* item) { int result=0; if (Found(item, result)) 
return true; else return false; } template int GList::Add(OBJ* item) { int result; if (item==NULL) return -1; if (SORTED) { if (Found(item, result)) if (fUnique) return -1; //duplicates forbidden //Found sets result to the position where the item should be! sortInsert(result, item); } else { if (fUnique && Found(item,result)) return -1; //set behaviour result = this->fCount; if (result==this->fCapacity) GPVec::Grow(); this->fList[result]=item; this->fCount++; } return result; } //by default, it deletes item if it an equal is found in the list! //returns the existing equal (==) object if it's in the list already //or returns the item itself if it's unique (and adds it) template OBJ* GList::AddIfNew(OBJ* item, bool deleteIfFound, int* fidx) { int r; if (Found(item, r)) { if (deleteIfFound && (pointer)item != (pointer)(this->fList[r])) { this->deallocate_item(item); } if (fidx!=NULL) *fidx=r; return this->fList[r]; //found } //not found: if (SORTED) { //Found() set result to the position where the item should be inserted: sortInsert(r, item); } else { r = this->fCount; if (r==this->fCapacity) GPVec::Grow(); this->fList[r]=item; this->fCount++; } if (fidx!=NULL) *fidx=r; return item; } //if item is found already in the list DELETE it and return -1 //otherwise the item is added and its index is returned template int GList::AddedIfNew(OBJ* item) { int r; if (Found(item, r)) { if ((pointer)item != (pointer)(this->fList[r])) { this->deallocate_item(item); } return -1; } //not found: if (SORTED) { //Found() set r to the position where the item should be inserted: sortInsert(r, item); } else { r = this->fCount; if (r==this->fCapacity) GPVec::Grow(); this->fList[r]=item; this->fCount++; } return r; } template bool GList::Found(OBJ* item, int& idx) { //search the list by using fCompareProc (if defined) //or == operator for a non-sortable list //for sorted lists, even when the result is false, the idx is //set to the closest matching object! 
int i; idx=-1; if (this->fCount==0) { idx=0;return false;} if (SORTED) { //binary search based on fCompareProc //do the simple test first: if ((*fCompareProc)(this->fList[0],item)>0) { idx=0; return false; } if ((*fCompareProc)(item, this->fList[this->fCount-1])>0) { idx=this->fCount; return false; } int l, h, c; l = 0; h = this->fCount - 1; while (l <= h) { i = (l + h) >> 1; c = (*fCompareProc)(this->fList[i], item); if (c < 0) l = i + 1; else { h = i - 1; if (c == 0) { idx=i; return true; } } } //while idx = l; return false; } else {//not sorted: use linear search // needs == operator to compare user defined objects ! i=0; while (ifCount) { if (*this->fList[i]==*item) { idx=i; return true; } i++; } return false; } } template void GList::sortInsert(int idx, OBJ* item) { //idx must be the new position this new item must have //so the allowed range is [0..fCount] //the current fList[idx] and all the above will be shifted +1 if (idx<0 || idx>this->fCount) GError(GVEC_INDEX_ERR, idx); if (this->fCount==this->fCapacity) { GPVec::Grow(idx, item); //expand and also copy/move data and insert the new item return; } //room still left, just move data around and insert the new one if (idxfCount) //copy/move pointers only! memmove(&(this->fList[idx+1]), &(this->fList[idx]), (this->fCount-idx)*sizeof(OBJ*)); this->fList[idx]=item; this->fCount++; } template void GList::Insert(int idx, OBJ* item) { //idx can be [0..fCount] so an item can be actually added BE_UNSORTED; //cannot do that with a sorted list! GPVec::Insert(idx,item); } template void GList::Move(int curidx, int newidx) { BE_UNSORTED; //cannot do this in a sorted list! GPVec::Move(curidx,newidx); } template void GList::Put(int idx, OBJ* item, bool re_sort) { //WARNING: this will never free the replaced item! 
// this may BREAK the sort order unless the "re_sort" parameter is given if (idx<0 || idx>this->fCount) GError(GVEC_INDEX_ERR, idx); this->fList[idx]=item; if (SORTED && item!=NULL && re_sort) Sort(); //re-sort } template int GList::Remove(OBJ* item) { //removes an item if it's in our list int result=IndexOf(item); if (result>=0) GPVec::Delete(result); return result; } template void GList::Sort() { if (fCompareProc==NULL) fCompareProc = DefaultCompareProc; if (this->fList!=NULL && this->fCount>0) this->qSort(0, this->fCount-1, fCompareProc); } //--------------------------------------------------------------------------- #endif libgff-2.0.0/include/GStr.h000066400000000000000000000227521367741004700154360ustar00rootroot00000000000000//--------------------------------------------------------------------------- #ifndef GSTR_H #define GSTR_H //--------------------------------------------------------------------------- #include "GBase.h" #include #include #include // This class uses reference counting and copy-on-write semantics // All indexes are zero-based. For all functions that accept an index, a // negative index specifies an index from the right of the string. Also, // for all functions that accept a length, a length of -1 specifies the rest // of the string. 
enum enTokenizeMode { tkFullString, tkCharSet }; class GStr { friend GStr operator+(const char* s1, const GStr& s2); friend bool operator==(const char* s1, const GStr& s2); friend bool operator<(const char* s1, const GStr& s2); friend bool operator<=(const char* s1, const GStr& s2); friend bool operator>(const char* s1, const GStr& s2); friend bool operator>=(const char* s1, const GStr& s2); friend bool operator!=(const char* s1, const GStr& s2); friend void Gswap(GStr& s1, GStr& s2); public: GStr(); GStr(const GStr& s); GStr(const char* s, uint addcap=4); GStr(const int i); GStr(const double f); GStr(char c, int n = 1); ~GStr(); operator const char* () const { return my_data->chars;} //inline here char& operator[](int index); char operator[](int index) const; GStr& operator=(const GStr& s); GStr& operator=(const char* s); GStr& operator=(const int i); GStr& operator=(const double f); GStr operator+(const GStr& s) const; GStr operator+(const char* s) const; GStr operator+(const char c) const; GStr operator+(const int i) const; GStr operator+(const double f) const; bool operator==(const GStr& s) const; bool operator==(const char* s) const; bool operator<(const GStr& s) const; bool operator<(const char* s) const; bool operator<=(const GStr& s) const; bool operator<=(const char* s) const; bool operator>(const GStr& s) const; bool operator>(const char* s) const; bool operator>=(const GStr& s) const; bool operator>=(const char* s) const; bool operator!=(const GStr& s) const; bool operator!=(const char* s) const; GStr& operator+=(const GStr& s) { return append(s.chars()); } GStr& operator+=(const char* s) { return append(s); } GStr& operator+=(char c) { return append(c); } GStr& operator+=(int i) { return append(i); } GStr& operator+=(uint i) { return append(i); } GStr& operator+=(long l) { return append(l); } GStr& operator+=(unsigned long l) { return append(l); } GStr& operator+=(double f); //interface: int length() const; bool is_empty() const; bool is_space() const; 
GStr substr(int index = 0, int len = -1) const; GStr to(char c); //return the first part up to first occurence of c //or whole string if c not found GStr from(char c); //same as to, but starting from the right side GStr copy() const; GStr& format(const char *fmt,...); GStr& reverse(); GStr& appendfmt(const char *fmt,...); GStr& cut(int index = 0, int len = -1); //delete a specified length GStr& remove(int from, int to) { return cut(from, to-from+1); } //paste a string at the specified position GStr& paste(const GStr& s, int index = 0, int len=-1); GStr& paste(const char* s, int index = 0, int len = -1); GStr& replace(const char* from, const char* to=NULL); GStr& insert(const GStr& s, int index = 0); GStr& insert(const char* s, int index = 0); GStr& append(const char* s); GStr& appendQuoted(const char* s, char q='"', bool onlyIfSpaced=false); GStr& appendmem(const char* m, int len); GStr& append(const char* m, int len); //same as appendmem but stops at '\0' GStr& append(const GStr& s); GStr& append(char c); GStr& append(int i); GStr& append(long l); GStr& append(double f); GStr& append(uint i); GStr& append(unsigned long l); GStr& upper(); GStr& lower(); GStr& clear(int init_cap=0);//make empty, but can specify initial capacity //character translation or removal: GStr& tr(const char* from, const char* to=NULL); //number of occurences of a char in the string: int count(char c); void startTokenize(const char* delimiter=" \t\n", enTokenizeMode tokenizemode=tkCharSet); bool nextToken(GStr& token); int asInt(int base=10); double asReal(); double asDouble() { return asReal(); } bool asReal(double& r); bool asDouble(double& r) { return asReal(r); } bool asInt(int& r, int base=10); int index(const GStr& s, int start_index = 0) const; int index(const char* s, int start_index = 0) const; int index(char c, int start_index = 0) const; int rindex(char c, int end_index = -1) const; int rindex(const char* str, int end_index = -1) const; bool contains(const GStr& s) const; bool 
contains(const char* s) const; bool contains(char c) const; bool startsWith(const char* s) const; bool startsWith(const GStr& s) const; bool endsWith(const char* s) const; bool endsWith(const GStr& s) const; GStr split(const char* delim); GStr split(char c); /* splits "this" in two parts, at the first (leftmost) encounter of delim: 1st would stay in "this" (which this way is truncated) 2nd will go to the returned string */ GStr splitr(const char* delim); GStr splitr(char c); /* splits "this" in two parts, at the last (rightmost) encounter of delim: 1st would stay in "this" 2nd will be returned */ int peelInt() const; //extract an integer, (left to right), from a //mixed alphanumeric string, e.g. 'T24HC1234b'=> 2 int peelIntR() const; //same as above, but starts from the right side //e.g. 'T2HC1234b'=> 1234 GStr& trim(char c); GStr& trim(const char* c=" \t\n\r"); //trim both ends of characters in given set GStr& trimR(const char* c=" \t\n\r"); //trim only right end GStr& trimR(char c=' '); GStr& chomp(char c='\n') { return trimR(c); } GStr& chomp(const char* cstr); //like trimR, but given string is taken as a whole GStr& trimL(const char* c=" \t\n\r"); //trim only left end GStr& trimL(char c=' '); GStr& padR(uint len, char c=' '); //align it in len spaces to the right GStr& padL(uint len, char c=' '); //align it in len spaces to the left GStr& padC(uint len, char c=' '); //center it size_t read(FILE* stream, const char* delimiter="\n", size_t bufsize=4096); //read next token from stream, using the given string as //a marker where the block should stop const char* chars() const; const char* text() const; char* detach(); //returns pointer to the string, giving up on its memory management protected: char* fTokenDelimiter; int fLastTokenStart; enTokenizeMode fTokenizeMode; void* readbuf; //file read buffer for the read() function size_t readbufsize; //last setting for the readbuf static void invalid_args_error(const char* fname); static void invalid_index_error(const 
char* fname); struct Data {//structure holding actual //string data and reference count information Data():ref_count(0), cap(0),length(0) { chars[0] = 0; } uint ref_count; //reference count uint cap; //allocated string capacity (excluding \0 end char) uint length; //actual string length (excluding \0 end char) char chars[1]; }; static Data* new_data(uint len, uint addcap=0); //alloc a specified length string's Data static Data* new_data(const char* str, uint addcap=0); //alloc a copy of a specified string, with an additional cap void prep_data(uint len, uint addcap=0); //allocates memory for the string, if needed void replace_data(Data* data); //WARNING (dangerous): direct access to pointer; string editing cannot change the length! char* chrs(); void make_unique(); static Data null_data; //a null (empty) string Data is available here Data* my_data; //pointer to a Data object holding actual string data }; /***************************************************************************/ inline int GStr::length() const { return my_data->length; } inline const char *GStr::chars() const { return my_data->chars; } inline char *GStr::chrs() { //allows direct modification of the chars ! 
return my_data->chars; } inline const char *GStr::text() const { return my_data->chars; } inline bool operator>=(const char *s1, const GStr& s2) { return (strcmp(s1, s2.chars()) >= 0); } inline bool operator!=(const char *s1, const GStr& s2) { return (strcmp(s1, s2.chars()) != 0); } inline void Gswap(GStr& s1, GStr& s2) { GStr::Data *tmp = s1.my_data; s1.my_data = s2.my_data; s2.my_data = tmp; } #endif libgff-2.0.0/include/GVec.hh000066400000000000000000000637611367741004700155600ustar00rootroot00000000000000//--------------------------------------------------------------------------- /* Sortable collection of pointers to objects */ #ifndef _GVec_HH #define _GVec_HH #include "GBase.h" #define GVEC_INDEX_ERR "GVec error: invalid index: %d\n" #if defined(NDEBUG) || defined(NODEBUG) || defined(_NDEBUG) || defined(NO_DEBUG) #define TEST_INDEX(x) #else #define TEST_INDEX(x) \ if (x<0 || x>=fCount) GError(GVEC_INDEX_ERR, x) #endif #define GVEC_CAPACITY_ERR "GVec error: invalid capacity: %d\n" #define GVEC_COUNT_ERR "GVec error: invalid count: %d\n" #define MAXLISTSIZE INT_MAX-1 #define FREEDATA (fFreeProc!=NULL) template struct IsPrimitiveType { enum { VAL = 0 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct 
IsPrimitiveType { enum { VAL = 1 }; }; template int DefLTCompareProc(pointer p1, pointer p2) { OBJ& o1 = *((OBJ*) p1); OBJ& o2 = *((OBJ*) p2); if (o1 < o2) return -1; else return ((o2 < o1) ? 1 : 0 ); } //basic template for array of objects; //so it doesn't require comparison operators to be defined template class GVec { protected: OBJ* fArray; int fCount; int fCapacity; void qSort(int L, int R, GCompareProc* cmpFunc); public: GVec(int init_capacity=2); GVec(int init_count, const OBJ init_val); GVec(int init_count, OBJ* init_val, bool delete_initval=true); //convenience constructor for complex vectors GVec(const GVec& array); //copy constructor const GVec& operator=(const GVec& array); //copy operator virtual ~GVec(); void Insert(int idx, OBJ item) { Insert(idx, &item); } void Insert(int idx, OBJ* item); void idxInsert(int idx, OBJ& item) { Insert(idx, &item); } void Grow(); void Grow(int idx, OBJ& item); //grow and add/insert item copy void Reverse(); //WARNING: will break the sort order if SORTED! 
int Add(OBJ* item); // simply append to the end of fArray, reallocating as needed int Add(OBJ& item) { return Add(&item); } int cAdd(OBJ item) { return Add(&item); } //all these will CREATE a new OBJ and COPY to it // // using OBJ copy operator= // -- stack/queue usage: //int Push(OBJ& item) { return Add(&item); } int Push(OBJ& item) { return Add(&item); } int cPush(OBJ item) { return Add(&item); } OBJ Pop();// Stack use; removes and returns a copy of the last item OBJ Shift(); //Queue use: removes and returns a copy of the first item void Shift(int idx); //Queue use: removes first idx elements from array void Add(GVec& list); //append copies of all items from another list OBJ& Get(int idx) { TEST_INDEX(idx); return fArray[idx]; } inline OBJ& operator[](int i) { TEST_INDEX(i); return fArray[i]; } OBJ& Last() { TEST_INDEX(fCount-1); return fArray[fCount-1]; } OBJ& First() { TEST_INDEX(0); return fArray[0]; } void Clear(); void Delete(int index); void Replace(int idx, OBJ& item); //Put, use operator= to copy void Exchange(int idx1, int idx2); void Swap(int idx1, int idx2) { Exchange(idx1, idx2); } int Capacity() { return fCapacity; } //this will reject identical items in sorted lists only! 
void setCapacity(int NewCapacity); int Count() { return fCount; } void setCount(int NewCount); // will trim or expand the array as needed void setCount(int NewCount, OBJ v); void Resize(int NewCount) { setCount(NewCount); } void Resize(int NewCount, OBJ v) { setCount(NewCount, v); } //void Move(int curidx, int newidx); bool isEmpty() { return fCount==0; } bool notEmpty() { return fCount>0; } void Sort(GCompareProc* cmpFunc); void Sort(); }; //---- template for dynamic array of object pointers //---- it's faster than GVec and has item deallocation awareness template class GPVec { protected: OBJ** fList; //pointer to an array of pointers to objects int fCount; //total number of entries in list int fCapacity; //current allocated size GFreeProc* fFreeProc; //useful for deleting objects //--- void Expand(); void Grow(); void Grow(int idx, OBJ* newitem); void qSort(int L, int R, GCompareProc* cmpFunc); public: static void DefaultFreeProc(pointer item) { delete (OBJ*)item; } virtual ~GPVec(); GPVec(int init_capacity=2, bool free_elements=true); //also the default constructor GPVec(bool free_elements); GPVec(GPVec& list); //copy constructor? GPVec(GPVec* list); //kind of a copy constructor const GPVec& operator=(const GPVec& list); inline OBJ* Get(int i) { TEST_INDEX(i); return fList[i]; } //OBJ* operator[](int i) { return this->Get(i); } inline OBJ*& operator[](int i) { TEST_INDEX(i); return fList[i]; } void Reverse(); //reverse pointer array; WARNING: will break(reverse) the sort order if sorted! void freeItem(int idx); //calls fFreeProc (or DefaultFreeProc) on fList[idx] and sets NULL there, doesn't pack! //it will free even if fFreeProc is NULL! 
void setFreeItem(GFreeProc *freeProc) { fFreeProc=freeProc; } void setFreeItem(bool doFree) { if (doFree) fFreeProc=DefaultFreeProc; else fFreeProc=NULL; } // -- stack usage: int Push(OBJ* item) { return Add(item); } OBJ* Pop();// Stack use; removes and returns last item,but does NOT FREE it OBJ* Shift(); //Queue use: removes and returns first item, but does NOT FREE it void deallocate_item(OBJ*& item); //forcefully call fFreeProc or delete on item void Clear(); void Exchange(int idx1, int idx2); void Swap(int idx1, int idx2) { Exchange(idx1, idx2); } OBJ* First() { return (fCount>0)?fList[0]:NULL; } OBJ* Last() { return (fCount>0)?fList[fCount-1]:NULL;} bool isEmpty() { return fCount==0; } bool notEmpty() { return fCount>0; } int Capacity() { return fCapacity; } int Count() { return fCount; } void setCapacity(int NewCapacity); void setCount(int NewCount); //the same as setCapacity() but the new item range is filled with NULLs int Add(OBJ* item); //simply append the pointer copy void Add(GPVec& list); //add all pointers from another list void Insert(int idx, OBJ* item); void Move(int curidx, int newidx); void Put(int idx, OBJ* item); void Pack(); void Delete(int index); //also frees the item if fFreeProc!=NULL, and shifts the successor items void Forget(int idx); //simply places a NULL at fList[idx], nothing else int RemovePtr(pointer item); //always use linear search to find the pointer! calls Delete() if found int IndexOf(pointer item); //a linear search for pointer address! 
void Sort(GCompareProc* cmpFunc); void Sort(); }; //-------------------- TEMPLATE IMPLEMENTATION------------------------------- template GVec::GVec(int init_capacity) { fCount=0; fCapacity=0; fArray=NULL; setCapacity(init_capacity); //if (set_count) fCount = init_capacity; } template GVec::GVec(int init_count, const OBJ init_val) { fCount=0; fCapacity=0; fArray=NULL; setCapacity(init_count); fCount = init_count; for (int i=0;i GVec::GVec(int init_count, OBJ* init_val, bool delete_initval) { fCount=0; fCapacity=0; fArray=NULL; setCapacity(init_count); fCount = init_count; for (int i=0;i GVec::GVec(const GVec& array) { //copy constructor this->fCount=array.fCount; this->fCapacity=array.fCapacity; this->fArray=NULL; if (this->fCapacity>0) { if (IsPrimitiveType::VAL) { GMALLOC(fArray, fCapacity*sizeof(OBJ)); memcpy(fArray, array.fArray, fCount*sizeof(OBJ)); } else { fArray=new OBJ[this->fCapacity]; //]() // uses OBJ operator= for (int i=0;ifCount;i++) fArray[i]=array.fArray[i]; } } this->fCount=array.fCount; } template const GVec& GVec::operator=(const GVec& array) { if (&array==this) return *this; Clear(); fCapacity=array.fCapacity; fCount=array.fCount; if (fCapacity>0) { if (IsPrimitiveType::VAL) { GMALLOC(fArray, fCapacity*sizeof(OBJ)); memcpy(fArray, array.fArray, fCount*sizeof(OBJ)); } else { fArray=new OBJ[this->fCapacity]; // ]() // uses OBJ operator= for (int i=0;i GVec::~GVec() { this->Clear(); } template void GVec::setCapacity(int NewCapacity) { if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); //error: NewCapacity MUST be > fCount //if you want to shrink it use Resize() or setCount() if (NewCapacity!=fCapacity) { if (NewCapacity==0) { if (IsPrimitiveType::VAL) { GFREE(fArray); } else { delete[] fArray; fArray=NULL; } } else { if (IsPrimitiveType::VAL) { GREALLOC(fArray, NewCapacity*sizeof(OBJ)); //also zero init the new items? 
memset(fArray+fCount, 0, (NewCapacity-fCount)*sizeof(OBJ)); } else { OBJ* oldArray=fArray; //fArray=new OBJ[NewCapacity](); fArray=new OBJ[NewCapacity]; for (int i=0;ifCount;i++) { fArray[i] = oldArray[i]; }// we need operator= here //wouldn't be faster to use memcpy instead? //memcpy(fArray, oldArray, fCount*sizeof(OBJ)); if (oldArray) delete[] oldArray; } } fCapacity=NewCapacity; } } template void GVec::Clear() { fCount=0; if (IsPrimitiveType::VAL) { GFREE(fArray); } else { delete[] fArray; fArray=NULL; } fCapacity=0; } template void GVec::Grow() { int delta = (fCapacity>8) ? (fCapacity>>2) : 1 ; setCapacity(fCapacity + delta); } template void GVec::Reverse() { int l=0; int r=fCount-1; OBJ c; while (l void GVec::Grow(int idx, OBJ& item) { int delta = (fCapacity>8) ? (fCapacity>>2) : 1 ; int NewCapacity=fCapacity+delta; if (NewCapacity <= fCount || NewCapacity >= MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); //error: capacity not within range //if (NewCapacity!=fCapacity) { if (idx==fCount) { //append item //GREALLOC(fArray, NewCapacity*sizeof(OBJ)); setCapacity(NewCapacity); fArray[idx]=item; } else { //insert item at idx OBJ* newList; if (IsPrimitiveType::VAL) { GMALLOC(newList, NewCapacity*sizeof(OBJ)); //copy data before idx memcpy(&newList[0],&fArray[0], idx*sizeof(OBJ)); newList[idx]=item; //copy data after idx memmove(&newList[idx+1],&fArray[idx], (fCount-idx)*sizeof(OBJ)); //..shouldn't do this (zero init new unused items in the array) memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ)); //data copied: GFREE(fArray); } else { newList=new OBJ[NewCapacity]; // operator= required! for (int i=0;i int GVec::Add(OBJ* item) { if (item==NULL) return -1; if (fCount==fCapacity) Grow(); fArray[fCount] = *item; //OBJ::operator= must copy OBJ properly! 
fCount++; return fCount-1; } template void GVec::Add(GVec& list) { if (list.Count()==0) return; //simply copy setCapacity(fCapacity+list.fCount); if (IsPrimitiveType::VAL) { memcpy( &fArray[fCount], list.fArray, list.fCount*sizeof(OBJ)); } else { for (int i=0;i OBJ GVec::Pop() { if (fCount<=0) GError("Error: invalid GVec::Pop() operation!\n"); fCount--; //OBJ o(fArray[fCount]); //copy constructor //o=fList[fCount]; //fArray[fCount]=NULL; return fArray[fCount]; //copy of the last element (copy constructor called) } //Queue usage: template OBJ GVec::Shift() { if (fCount<=0) GError("Error: invalid GVec::Shift() operation!\n"); fCount--; OBJ o(fArray[0]); //copy constructor if (fCount>0) memmove(&fArray[0], &fArray[1], (fCount)*sizeof(OBJ)); //fList[fCount]=NULL; //not that it matters.. return o; } template void GVec::Shift(int idx) { if (idx<=0 || fCount-idx<=0) GError("Error: invalid GVec::Shift() operation!\n"); fCount-=idx; if (fCount>0) memmove(&fArray[0], &fArray[idx], (fCount)*sizeof(OBJ)); } template void GVec::Insert(int idx, OBJ* item) { //idx must be the new position this new item must have //so the allowed range is [0..fCount] //the old idx item all the above will be shifted to idx+1 if (idx<0 || idx>fCount) GError(GVEC_INDEX_ERR, idx); if (fCount==fCapacity) { //need to resize the array Grow(idx, *item); //expand and also copy/move data and insert the new item return; } //move data around to make room for the new item if (idx::VAL) { memmove(&fArray[idx+1],&fArray[idx], (fCount-idx)*sizeof(OBJ)); } else { for (int i=fCount; i>idx; i--) { fArray[i]=fArray[i-1]; } } } fArray[idx]=*item; fCount++; } /*template void GVec::Move(int curidx, int newidx) { //swap if (curidx!=newidx || newidx>=fCount) GError(GVEC_INDEX_ERR, newidx); OBJ tmp=fArray[curidx]; //copy constructor here fArray[curidx]=fArray[newidx]; fArray[newidx]=tmp; }*/ template void GVec::Replace(int idx, OBJ& item) { TEST_INDEX(idx); fArray[idx]=item; } template void GVec::Exchange(int idx1, int 
idx2) { TEST_INDEX(idx1); TEST_INDEX(idx2); OBJ item=fArray[idx1]; fArray[idx1]=fArray[idx2]; fArray[idx2]=item; } template void GVec::Delete(int index) { TEST_INDEX(index); fCount--; if (IsPrimitiveType::VAL) { if (index void GVec::setCount(int NewCount) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); //if (NewCount > fCapacity) setCapacity(NewCount); while(NewCount > fCapacity) Grow(); if (IsPrimitiveType::VAL && NewCount>fCount) { memset(fArray+fCount, 0, (NewCount-fCount)*sizeof(OBJ)); } fCount = NewCount; //new items will be populated by the default object constructor(!) } /* template void GVec::setCount(int NewCount, OBJ* v) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); while (NewCount > fCapacity) Grow(); if (NewCount>fCount) { for (int i=fCount;i void GVec::setCount(int NewCount, OBJ v) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); while (NewCount > fCapacity) Grow(); if (NewCount>fCount) { for (int i=fCount;i void GVec::qSort(int l, int r, GCompareProc* cmpFunc) { int i, j; OBJ p,t; do { i = l; j = r; p = this->fArray[(l + r) >> 1]; do { while (cmpFunc(&(this->fArray[i]), &p) < 0) i++; while (cmpFunc(&(this->fArray[j]), &p) > 0) j--; if (i <= j) { t = this->fArray[i]; this->fArray[i] = this->fArray[j]; this->fArray[j] = t; i++; j--; } } while (i <= j); if (l < j) qSort(l, j, cmpFunc); l = i; } while (i < r); } template void GVec::Sort(GCompareProc* cmpFunc) { if (cmpFunc==NULL) { GMessage("Warning: NULL compare function given, useless Sort() call.\n"); return; } if (this->fArray!=NULL && this->fCount>0) qSort(0, this->fCount-1, cmpFunc); } template void GVec::Sort() { GCompareProc* cmpFunc = DefLTCompareProc; Sort(cmpFunc); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //*=> GPVec implementation template GPVec::GPVec(GPVec& list) { //copy constructor fCount=list.fCount; fCapacity=list.fCapacity; fList=NULL; fFreeProc=list.fFreeProc; 
fCount=list.fCount; if (fCapacity>0) { GMALLOC(fList, fCapacity*sizeof(OBJ*)); memcpy(fList, list.fList, fCount*sizeof(OBJ*)); } } template GPVec::GPVec(GPVec* plist) { //another copy constructor fCount=0; fCapacity=plist->fCapacity; fList=NULL; fFreeProc=plist->fFreeProc; fCount=plist->fCount; if (fCapacity>0) { GMALLOC(fList, fCapacity*sizeof(OBJ*)); memcpy(fList, plist->fList, fCount*sizeof(OBJ*)); } } template const GPVec& GPVec::operator=(const GPVec& list) { if (&list!=this) { Clear(); fFreeProc=list.fFreeProc; //Attention: only the *POINTERS* are copied, // the actual objects are NOT duplicated fCount=list.fCount; fCapacity=list.fCapacity; if (fCapacity>0) { GMALLOC(fList, fCapacity*sizeof(OBJ*)); memcpy(fList, list.fList, fCount*sizeof(OBJ*)); } } return *this; } template void GPVec::Add(GPVec& list) { if (list.Count()==0) return; //simply copy the pointers! -- the objects will be shared setCapacity(fCapacity+list.fCount); memcpy( & (fList[fCount]), list.fList, list.fCount*sizeof(OBJ*)); fCount+=list.fCount; } template void GPVec::Reverse() { int l=0; int r=fCount-1; OBJ* c; while (l GPVec::GPVec(int init_capacity, bool free_elements) { fCount=0; fCapacity=0; fList=NULL; fFreeProc=(free_elements) ? DefaultFreeProc : NULL; if (init_capacity>0) setCapacity(init_capacity); } template GPVec::GPVec(bool free_elements) { fCount=0; fCapacity=0; fList=NULL; fFreeProc=(free_elements) ? 
DefaultFreeProc : NULL; } template GPVec::~GPVec() { this->Clear();//this will free the items if fFreeProc is defined } template void GPVec::setCapacity(int NewCapacity) { if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); //error: capacity not within range if (NewCapacity!=fCapacity) { if (NewCapacity==0) { GFREE(fList); } else { GREALLOC(fList, NewCapacity*sizeof(OBJ*)); } fCapacity=NewCapacity; } } template void GPVec::deallocate_item(OBJ* &item) { if (item==NULL) return; if (FREEDATA) { (*fFreeProc)(item); item=NULL; } else { delete item; item=NULL; } } template void GPVec::Clear() { if (FREEDATA) { for (int i=0; i void GPVec::Exchange(int idx1, int idx2) { TEST_INDEX(idx1); TEST_INDEX(idx2); OBJ* item=fList[idx1]; fList[idx1]=fList[idx2]; fList[idx2]=item; } template void GPVec::Expand() { if (fCount==fCapacity) Grow(); //return this; } template void GPVec::Grow() { /* int delta; if (fCapacity > 64 ) { delta = (fCapacity > 0xFFF) ? 0x100 : (fCapacity>>4); } else { delta = (fCapacity>8) ? (fCapacity>>2) : 1 ; } */ int delta = (fCapacity>8) ? (fCapacity>>2) : 1; setCapacity(fCapacity + delta); } template void GPVec::Grow(int idx, OBJ* newitem) { /* int delta; if (fCapacity > 64 ) { delta = (fCapacity > 0xFFF) ? 0x100 : (fCapacity>>4); } else { delta = (fCapacity>8) ? (fCapacity>>2) : 1 ; } */ int delta = (fCapacity>8) ? 
(fCapacity>>2) : 1 ; int NewCapacity=fCapacity+delta; if (NewCapacity <= fCount || NewCapacity > MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); //error: capacity not within range //if (NewCapacity!=fCapacity) { /*if (NewCapacity==0) { GFREE(fList); } else {//add the new item */ if (idx==fCount) { GREALLOC(fList, NewCapacity*sizeof(OBJ*)); fList[idx]=newitem; } else { OBJ** newList; GMALLOC(newList, NewCapacity*sizeof(OBJ*)); //copy data before idx memcpy(&newList[0],&fList[0], idx*sizeof(OBJ*)); newList[idx]=newitem; //copy data after idx memmove(&newList[idx+1],&fList[idx], (fCount-idx)*sizeof(OBJ*)); memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ*)); //data copied: GFREE(fList); fList=newList; } fCount++; fCapacity=NewCapacity; } template int GPVec::IndexOf(pointer item) { for (int i=0;i int GPVec::Add(OBJ* item) { int result; if (item==NULL) return -1; result = fCount; if (result==fCapacity) this->Grow(); fList[result]=item; fCount++; return fCount-1; } template void GPVec::Insert(int idx, OBJ* item) { //idx can be [0..fCount] so an item can be actually added if (idx<0 || idx>fCount) GError(GVEC_INDEX_ERR, idx); if (fCount==fCapacity) { Grow(idx, item); return; } if (idx void GPVec::Move(int curidx, int newidx) { //s //BE_UNSORTED; //cannot do that in a sorted list! if (curidx!=newidx || newidx>=fCount) GError(GVEC_INDEX_ERR, newidx); OBJ* p; p=Get(curidx); //this is a delete: fCount--; if (curidx void GPVec::Put(int idx, OBJ* item) { //WARNING: this will never free the replaced item! 
TEST_INDEX(idx); fList[idx]=item; } template void GPVec::Forget(int idx) { TEST_INDEX(idx); fList[idx]=NULL; //user should free that somewhere else } template void GPVec::freeItem(int idx) { TEST_INDEX(idx); if (fFreeProc!=NULL) { (*fFreeProc)(fList[idx]); } else this->DefaultFreeProc(fList[idx]); fList[idx]=NULL; } template void GPVec::Delete(int index) { TEST_INDEX(index); if (fFreeProc!=NULL && fList[index]!=NULL) { (*fFreeProc)(fList[index]); //freeItem } fList[index]=NULL; fCount--; if (index OBJ* GPVec::Pop() { if (fCount<=0) return NULL; fCount--; OBJ* o=fList[fCount]; fList[fCount]=NULL; return o; } //Queue usage: template OBJ* GPVec::Shift() { if (fCount<=0) return NULL; fCount--; OBJ* o=fList[0]; if (fCount>0) memmove(&fList[0], &fList[1], (fCount)*sizeof(OBJ*)); fList[fCount]=NULL; //not that it matters.. return o; } //linear search for the pointer address template int GPVec::RemovePtr(pointer item) { if (item==NULL) return -1; for (int i=0;i void GPVec::Pack() { for (int i=fCount-1; i>=0; i--) if (fList[i]==NULL) Delete(i); //shift rest of fList content accordingly } template void GPVec::setCount(int NewCount) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); if (NewCount > fCapacity) setCapacity(NewCount); if (NewCount > fCount) //pad with NULL pointers memset(& fList[fCount], 0, (NewCount - fCount) * sizeof(OBJ*)); fCount = NewCount; } template void GPVec::qSort(int L, int R, GCompareProc* cmpFunc) { int I, J; OBJ* P; OBJ* T; do { I = L; J = R; P = this->fList[(L + R) >> 1]; do { while (cmpFunc(this->fList[I], P) < 0) I++; while (cmpFunc(this->fList[J], P) > 0) J--; if (I <= J) { T = this->fList[I]; this->fList[I] = this->fList[J]; this->fList[J] = T; I++; J--; } } while (I <= J); if (L < J) qSort(L, J, cmpFunc); L = I; } while (I < R); } template void GPVec::Sort(GCompareProc* cmpFunc) { if (cmpFunc==NULL) { GMessage("Warning: NULL compare function given, useless Sort() call.\n"); return; } if (this->fList!=NULL && 
this->fCount>0) qSort(0, this->fCount-1, cmpFunc); } template void GPVec::Sort() { GCompareProc* cmpFunc = DefLTCompareProc; Sort(cmpFunc); } //--------------------------------------------------------------------------- #endif libgff-2.0.0/include/codons.h000066400000000000000000000021751367741004700160410ustar00rootroot00000000000000#ifndef CODONS_H #define CODONS_H #include "GBase.h" #include unsigned short packCodon(char n1, char n2, char n3); //assumes n1,n2,n3 are UPPERCASE! struct Codon { char nuc[3]; Codon(char* str=NULL) { if (str==NULL) { nuc[0]='N'; nuc[1]='N'; nuc[2]='N'; } else { nuc[0]=toupper(str[0]); nuc[1]=toupper(str[1]); nuc[2]=toupper(str[2]); } } Codon(char s1, char s2, char s3) { nuc[0]=toupper(s1); nuc[1]=toupper(s2); nuc[2]=toupper(s3); } char& operator[](int idx) { if (idx<0 || idx>2) GError("Error: Codon index out of bounds!\n"); return nuc[idx]; } char operator[](int idx) const { if (idx<0 || idx>2) GError("Error: Codon index out of bounds!\n"); return nuc[idx]; } char translate(); }; //simple 1st frame forward translation of a given DNA string //will allocated memory for the translation -- the caller is // responsible for freeing the returned string! 
char* translateDNA(const char* dnastr, int& aalen, int dnalen=0); char translateCodon(const char* dna); //returns the aminoacid code for the 1st codon at dna bool codonTableInit(); #endif libgff-2.0.0/include/gdna.h000066400000000000000000000006631367741004700154650ustar00rootroot00000000000000#ifndef GDNA_H #define GDNA_H #include "GBase.h" extern const char* IUPAC_DEFS; extern const char* IUPAC_COMP; char ntComplement(char c); //in-place reverse complement of a nucleotide (sub)sequence char* reverseComplement(char* seq, int slen=0); bool gDnaInit(); byte gdna2bit(char* &nt, int n=4); //pack n bases into a byte (n can be 1..4) char g2bit2base(byte v2bit); //convert the 2-bit value into 'A', 'C', 'G' or 'T' #endif libgff-2.0.0/include/gff.h000066400000000000000000001436231367741004700153220ustar00rootroot00000000000000#ifndef GFF_H #define GFF_H //#define CUFFLINKS 1 #include "GBase.h" #include "gdna.h" #include "codons.h" #include "GFaSeqGet.h" #include "GList.hh" #include "GHash.hh" #ifdef CUFFLINKS #include // for boost::crc_32_type #endif //reserved Gffnames::feats entries -- basic feature types extern int gff_fid_mRNA; // "mRNA" feature name extern int gff_fid_transcript; // *RNA, *transcript feature name extern int gff_fid_exon; extern const uint GFF_MAX_LOCUS; extern const uint GFF_MAX_EXON; extern const uint GFF_MAX_INTRON; extern const int CLASSCODE_OVL_RANK; //extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent //extern const byte gfo_flagShift_LEVEL; //extern bool gff_show_warnings; #define GFF_LINELEN 4096 #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n" enum GffExonType { exgffIntron=-1, // useless "intron" feature exgffNone=0, //not recognizable or unitialized exonic segment exgffStartCodon, //from "start_codon" feature (within CDS) exgffStopCodon, //from "stop_codon" feature (may be outside CDS, but should) exgffCDS, //from "CDS" feature exgffUTR, //from "UTR" feature exgffCDSUTR, //from a 
merge of UTR and CDS feature exgffExon, //from "exon" feature }; extern const char* exonTypes[]; const char* strExonType(char xtype); class GfList; typedef void GFFCommentParser(const char* cmline, GfList* gflst); //comment parser callback //Useful for parsing/maintaining ref seq info from comment lines like this: //##sequence-region chr1 1 24895642 class GffReader; class GffObj; //---transcript overlapping - utility functions: int classcode_rank(char c); //returns priority value for class codes char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool strictMatch=false); //returns: class code char transcriptMatch(GffObj& a, GffObj& b, int& ovlen); //generic transcript match test // -- return '=', '~' or 0 char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen); //single-exon transcript match test //--- // -- tracking exon/CDS segments from local mRNA to genome coordinates class GMapSeg:public GSeg { public: uint gstart; //genome start location uint gend; //genome end location //gendgend) { //reverse strand mapping if (gcgstart) return 0; return (gstart-gc); } else { if (gcgend) return 0; return (gc-gstart); } } }; struct GffScore { float score; int8_t precision; GffScore(float sc=0, int8_t prec=-1):score(sc),precision(prec) { } void print(FILE* outf) { if (precision<0) fprintf(outf, "."); else fprintf(outf, "%.*f", precision, score); } void sprint(char* outs) { if (precision<0) sprintf(outs, "."); else sprintf(outs, "%.*f", precision, score); } bool operator<(GffScore& v) { return this->scorescore<=v.score; } bool operator>(GffScore& v) { return this->score>v.score; } bool operator>=(GffScore& v) { return this->score>=v.score; } bool operator==(GffScore& v) { return this->score==v.score; } }; extern const GffScore GFFSCORE_NONE; class GMapSegments:public GVec { public: int dir; //-1 or +1 (reverse/forward for genome coordinates) GSeg lreg; // always 1,max local coord GSeg greg; // genomic min,max coords GMapSegments(char strand='+'):lreg(0,0),greg(0,0) { 
dir=(strand=='-') ? -1 : 1; } void Clear(char strand='+') { lreg.start=0;lreg.end=0; greg.start=0;greg.end=0; dir = (strand=='-') ? -1 : 1;; GVec::Clear(); } int add(uint s, uint e, uint gs, uint ge) { if (dir<0) { if (gsgreg.end) greg.end=gs; if (gegreg.end) greg.end=ge; if (gslreg.end) lreg.end=gm.end; if (gm.start::Add(gm); } uint gmap(uint lc) { //takes a local coordinate and returns its mapping to genomic coordinates //returns 0 if mapping cannot be performed! if (lc==0 || fCount==0 || lclreg.end) return 0; //find local segment containing this coord int i=0; while (i=fArray[i].start && lc<=fArray[i].end) return (fArray[i].gstart+dir*(lc-fArray[i].start)); ++i; } return 0; } uint lmap(uint gc) { //takes a genome coordinate and returns its mapping to local coordinates if (gc==0 || fCount==0 || gcgreg.end) return 0; //find genomic segment containing this coord int i=0; while (i exons; BEDLine(GffReader* r=NULL, const char* l=NULL); ~BEDLine() { GFREE(dupline); GFREE(line); } }; class GffLine { protected: char* _parents; //stores a copy of the Parent attribute value, //with commas replaced by \0 int _parents_len; bool parseSegmentList(GVec& segs, char* str); public: char* dupline; //duplicate of original line char* line; //this will have tabs replaced by \0 int llen; char* gseqname; char* track; char* ftype; //feature name: mRNA/gene/exon/CDS int ftype_id; char* info; //the last, attributes' field, unparsed uint fstart; uint fend; /* uint qstart; //overlap coords on query, if available uint qend; uint qlen; //query len, if given */ float score; int8_t score_decimals; char strand; union { unsigned int flags; struct { bool is_exonlike:2; //CDS,codon, UTR, exon }; struct { bool is_cds:1; //"cds" or "start/stop_codon" features bool is_exon:1; //"exon" and "utr" features bool is_transcript:1; //if current feature is *RNA or *transcript bool is_gene:1; //current feature is *gene //bool is_gff3:1; //line appears to be in GFF3 format (0=GTF) bool is_gtf_transcript:1; 
//GTF transcript line with Parents parsed from gene_id bool skipLine:1; bool gffWarnings:1; bool is_gene_segment:1; //for NCBI's D/J/V/C_gene_segment }; }; int8_t exontype; // gffExonType char phase; // '.' , '0', '1' or '2', can be also given as CDSphase attribute in TLF uint cds_start; //if TLF: CDS=start:end attribute uint cds_end; GVec exons; //if TLF: exons= attribute GVec cdss; //if TLF: CDS=segment_list attribute char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3) char* gene_id; //GTF only: value of "gene_id" attribute if present char** parents; //for GTF only parents[0] is used int num_parents; char* ID; // if a ID=.. attribute was parsed, or a GTF with 'transcript' line (transcript_id) GffLine(GffReader* reader, const char* l); //parse the line accordingly void discardParent() { GFREE(_parents); _parents_len=0; num_parents=0; GFREE(parents); parents=NULL; } static char* extractGFFAttr(char*& infostr, const char* oline, const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL, bool deleteAttr=true); char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL){ return extractGFFAttr(info, dupline, pre, caseStrict, enforce_GTF2, rlen, true); } char* getAttrValue(const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL) { return extractGFFAttr(info, dupline, pre, caseStrict, enforce_GTF2, rlen, false); } GffLine(GffLine& l): _parents(NULL), _parents_len(l._parents_len), dupline(NULL), line(NULL), llen(l.llen), gseqname(NULL), track(NULL), ftype(NULL), ftype_id(l.ftype_id), info(NULL), fstart(l.fstart), fend(l.fend), //qstart(l.fstart), qend(l.fend), qlen(l.qlen), score(l.score), score_decimals(l.score_decimals), strand(l.strand), flags(l.flags), exontype(l.exontype), phase(l.phase), cds_start(l.cds_start), cds_end(l.cds_end), exons(l.exons), cdss(l.cdss), gene_name(NULL), gene_id(NULL), parents(NULL), 
num_parents(l.num_parents), ID(NULL) { //if (l==NULL || l->line==NULL) // GError("Error: invalid GffLine(l)\n"); //memcpy((void*)this, (void*)l, sizeof(GffLine)); GMALLOC(line, llen+1); memcpy(line, l.line, llen+1); GMALLOC(dupline, llen+1); memcpy(dupline, l.dupline, llen+1); //--offsets within line[] gseqname=line+(l.gseqname-l.line); track=line+(l.track-l.line); ftype=line+(l.ftype-l.line); info=line+(l.info-l.line); if (num_parents>0 && parents) { GMALLOC(parents, num_parents*sizeof(char*)); //_parents_len=l->_parents_len; copied above _parents=NULL; //re-init, forget pointer copy GMALLOC(_parents, _parents_len); memcpy(_parents, l._parents, _parents_len); for (int i=0;i(GffAttr& d){ return (this>&d); } bool operator<(GffAttr& d){ return (this<&d); } }; class GffNameList; class GffNames; class GffNameInfo { friend class GffNameList; public: int idx; char* name; GffNameInfo(const char* n=NULL):idx(-1),name(NULL) { if (n) name=Gstrdup(n); } ~GffNameInfo() { GFREE(name); } bool operator==(GffNameInfo& d){ return (strcmp(this->name, d.name)==0); } bool operator<(GffNameInfo& d){ return (strcmp(this->name, d.name)<0); } }; class GffNameList:public GPVec { friend class GffNameInfo; friend class GffNames; protected: GHash byName;//hash with shared keys int idlast; //fList index of last added/reused name int addStatic(const char* tname) {// fast add GffNameInfo* f=new GffNameInfo(tname); idlast=this->Add(f); f->idx=idlast; byName.shkAdd(f->name,f); return idlast; } public: //GffNameList(int init_capacity=6):GList(init_capacity, false,true,true), byName(false) { GffNameList(int init_capacity=6):GPVec(init_capacity, true), byName(false) { idlast=-1; setCapacity(init_capacity); } char* lastNameUsed() { return idlast<0 ? 
NULL : Get(idlast)->name; } int lastNameId() { return idlast; } char* getName(int nid) { //retrieve name by its ID if (nid<0 || nid>=fCount) GError("GffNameList Error: invalid index (%d)\n",nid); return fList[nid]->name; } int addName(const char* tname) {//returns or create an id for the given name //check idlast first, chances are it's the same feature name checked /*if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0) return idlast;*/ GffNameInfo* f=byName.Find(tname); int fidx=-1; if (f!=NULL) fidx=f->idx; else {//add new entry f=new GffNameInfo(tname); fidx=this->Add(f); f->idx=fidx; byName.shkAdd(f->name,f); } idlast=fidx; return fidx; } int addNewName(const char* tname) { GffNameInfo* f=new GffNameInfo(tname); int fidx=this->Add(f); f->idx=fidx; byName.shkAdd(f->name,f); return fidx; } int getId(const char* tname) { //only returns a name id# if found GffNameInfo* f=byName.Find(tname); if (f==NULL) return -1; return f->idx; } int removeName() { GError("Error: removing names from GffNameList not allowed!\n"); return -1; } }; class GffNames { public: int numrefs; GffNameList tracks; GffNameList gseqs; GffNameList attrs; GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc. GffNames():tracks(),gseqs(),attrs(), feats() { numrefs=0; //the order below is critical! 
//has to match: gff_fid_mRNA, gff_fid_exon gff_fid_mRNA = feats.addStatic("mRNA");//index 0=gff_fid_mRNA gff_fid_transcript=feats.addStatic("transcript");//index 1=gff_fid_transcript gff_fid_exon=feats.addStatic("exon");//index 1=gff_fid_exon //feats.addStatic("CDS"); //index 2=gff_fid_CDS } }; void gffnames_ref(GffNames* &n); void gffnames_unref(GffNames* &n); enum GffPrintMode { pgtfAny, //print record as read, if GTF pgtfExon, //print exon only features pgtfCDS, //print CDS and exon features pgffAny, //print record as read (if isCDSonly() prints only CDS) pgffExon, pgffCDS, pgffBoth, //enforce exon printing if isCDSOnly() pgffTLF, //exon and CDS data shown as additional GFF attributes //in the transcript line (Transcript Line Format) //every line has the whole transcript data pgffBED //print a BED line with all other GFF attributes in column 13 }; class GffAttrs:public GList { public: GffAttrs():GList(false,true,true) { } void add_if_new(GffNames* names, const char* attrname, const char* attrval) { //adding a new value without checking for cds status int nid=names->attrs.getId(attrname); if (nid>=0) { //attribute name found in the dictionary for (int i=0;iattr_id) { return; } //don't update existing } else { //adding attribute name to global attr name dictionary nid=names->attrs.addNewName(attrname); } this->Add(new GffAttr(nid, attrval)); } void add_if_new(GffNames* names, const char* attrname, const char* attrval, bool is_cds) { int nid=names->attrs.getId(attrname); if (nid>=0) { //attribute name found in the dictionary for (int i=0;iattr_id && is_cds==Get(i)->cds) { return; } //don't update existing } else { //adding attribute name to global attr name dictionary nid=names->attrs.addNewName(attrname); } this->Add(new GffAttr(nid, attrval, is_cds)); } void add_or_update(GffNames* names, const char* attrname, const char* val) { //adding a new value without checking for cds status int aid=names->attrs.getId(attrname); if (aid>=0) { //attribute found in the 
dictionary for (int i=0;iattr_id) { //update the existing value for this attribute Get(i)->setValue(val); return; } } } else { //adding attribute name to global attr name dictionary aid=names->attrs.addNewName(attrname); } this->Add(new GffAttr(aid, val)); } void add_or_update(GffNames* names, const char* attrname, const char* val, bool is_cds) { int aid=names->attrs.getId(attrname); if (aid>=0) { //attribute found in the dictionary for (int i=0;iattr_id && Get(i)->cds==is_cds) { //update the existing value for this attribute Get(i)->setValue(val, is_cds); return; } } } else { //adding attribute name to global attr name dictionary aid=names->attrs.addNewName(attrname); } this->Add(new GffAttr(aid, val, is_cds)); } int haveId(int attr_id, bool is_cds=false) { for (int i=0;iattr_id && Get(i)->cds==is_cds) return i; return -1; } int haveId(const char* attrname, GffNames* names, bool is_cds=false) { int aid=names->attrs.getId(attrname); if (aid>=0) { for (int i=0;iattr_id && Get(i)->cds==is_cds) return i; } return -1; } char* getAttr(GffNames* names, const char* attrname) { int aid=names->attrs.getId(attrname); if (aid>=0) for (int i=0;iattr_id) return Get(i)->attr_val; return NULL; } char* getAttr(GffNames* names, const char* attrname, bool is_cds) { int aid=names->attrs.getId(attrname); if (aid>=0) for (int i=0;iattr_id && Get(i)->cds==is_cds) return Get(i)->attr_val; return NULL; } char* getAttr(int aid) { if (aid>=0) for (int i=0;iattr_id) return Get(i)->attr_val; return NULL; } char* getAttr(int aid, bool is_cds) { if (aid>=0) for (int i=0;iattr_id && Get(i)->cds==is_cds) return Get(i)->attr_val; return NULL; } void copyAttrs(GffAttrs* attrs, bool is_cds=false) { //deep copy attributes from another GffAttrs list // (only the ones which do not exist yet) if (attrs==NULL) return; for (int i=0;iCount();i++) { int aid=attrs->Get(i)->attr_id; if (haveId(aid, is_cds)<0) Add(new GffAttr(aid, attrs->Get(i)->attr_val, is_cds)); } } }; class GffExon : public GSeg { public: 
bool sharedAttrs; //do not free attrs on destruct! GffAttrs* attrs; //other attributes kept for this exon/CDS GffScore score; // gff score column int8_t exontype; char phase; //GFF phase column - for CDS segments only! // '.' = undefined (UTR), '0','1','2' for CDS exons void* uptr; //for associating extended user data to this exon char* getAttr(GffNames* names, const char* atrname) { if (attrs==NULL || names==NULL || atrname==NULL) return NULL; return attrs->getAttr(names, atrname); } char* getAttr(int aid) { if (attrs==NULL) return NULL; return attrs->getAttr(aid); } GffExon(bool share_attributes):GSeg(0,0), sharedAttrs(share_attributes), attrs(NULL), score(), exontype(0), phase('.'), uptr(NULL){ } GffExon(uint s=0, uint e=0, int8_t et=0, char ph='.', float sc=0, int8_t sc_prec=0):sharedAttrs(false), attrs(NULL), score(sc,sc_prec), exontype(et), phase(ph), uptr(NULL) { if (scopyAttrs(ex.attrs); } } GffExon& operator=(const GffExon& o) = default; //prevent gcc 9 warnings: //yes, I want a shallow copy here ~GffExon() { //destructor if (attrs!=NULL && !sharedAttrs) delete attrs; } }; //only for mapping to spliced coding sequence: class GffCDSeg:public GSeg { public: char phase; int exonidx; }; //one GFF mRNA object -- e.g. 
a mRNA with its exons and/or CDS segments class GffObj:public GSeg { protected: char* gffID; // ID name for mRNA (parent) feature char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3) char* geneID; //value of gene_id attribute (GTF) if present, or the ID attribute of a parent gene feature (GFF3) union { unsigned int flags; struct { bool flag_HAS_ERRORS :1; bool flag_CHILDREN_PROMOTED :1; bool flag_IS_GENE :1; bool flag_IS_TRANSCRIPT :1; bool flag_HAS_GFF_ID :1; //found transcript/RNA feature line (GFF3 or GTF2 with transcript line) bool flag_BY_EXON :1; //created by subfeature (exon/CDS) directly bool flag_CDS_ONLY :1; //transcript defined by CDS features only (GffObj::isCDS()) bool flag_CDS_NOSTART :1; //partial CDS at 5' end (no start codon) bool flag_CDS_NOSTOP :1; //partial CDS at 3' end (no stop codon) bool flag_CDS_X :1; //transcript having CDS with ribosomal shift (i.e. after merging exons) //CDS segments stored in ::cdss are incompatible with the exon segments bool flag_GENE_SEGMENT :1; //a transcript-like C/D/J/V_gene_segment (NCBI's annotation) bool flag_TRANS_SPLICED :1; bool flag_DISCONTINUOUS :1; //discontinuous feature (e.g. cDNA_match) segments linked by same ID bool flag_TARGET_ONLY :1; //Target= feature (e.g. from RepeatMasker output), lacks ID bool flag_DISCARDED :1; //it will be discarded from the final GffReader list bool flag_LST_KEEP :1; //controlled by isUsed(); if set, this GffObj will not be //deallocated when GffReader is destroyed bool flag_FINALIZED :1; //if finalize() was already called for this GffObj unsigned int gff_level :4; //hierarchical level (0..15) }; }; //-- friends: friend class GffReader; friend class GffExon; public: static GffNames* names; // dictionary storage that holds the various attribute names etc. 
int track_id; // index of track name in names->tracks int gseq_id; // index of genomic sequence name in names->gseqs int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value int subftype_id; //index of child subfeature name in names->feats (subfeatures stored in "exons") //if ftype_id==gff_fid_mRNA then this value is ignored GList exons; //for non-mRNA entries, these can be any subfeature of type subftype_id GList* cdss; //only !NULL for cases of "programmed frameshift" when CDS boundaries do not match //exons boundaries GPVec children; GffObj* parent; int udata; //user data, flags etc. void* uptr; //user pointer (to a parent object, cluster, locus etc.) GffObj* ulink; //link to another GffObj (user controlled field) //---mRNA specific fields: //bool isCDS; //just a CDS, no UTRs uint CDstart; //CDS lowest coordinate uint CDend; //CDS highest coordinate char CDphase; //initial phase for CDS start ('.','0'..'2') //CDphase is at CDend if strand=='-' static void decodeHexChars(char* dbuf, const char* s, int maxlen=1023); bool hasErrors() { return flag_HAS_ERRORS; } void hasErrors(bool v) { flag_HAS_ERRORS=v; } bool hasGffID() { return flag_HAS_GFF_ID; } void hasGffID(bool v) {flag_HAS_GFF_ID=v; } bool createdByExon() { return flag_BY_EXON; } void createdByExon(bool v) {flag_BY_EXON=v; } bool isCDSOnly() { return flag_CDS_ONLY; } void isCDSOnly(bool v) { flag_CDS_ONLY=v; } bool isXCDS() { return flag_CDS_X; } void isXCDS(bool v) { flag_CDS_X=v; } bool isFinalized() { return flag_FINALIZED; } void isFinalized(bool v) { flag_FINALIZED=v; } bool isGene() { return flag_IS_GENE; } void isGene(bool v) {flag_IS_GENE=v; } bool isDiscarded() { return flag_DISCARDED; } void isDiscarded(bool v) { flag_DISCARDED=v; } bool isUsed() { return flag_LST_KEEP; } void isUsed(bool v) {flag_LST_KEEP=v; } bool isTranscript() { return flag_IS_TRANSCRIPT; } void isTranscript(bool v) {flag_IS_TRANSCRIPT=v; } bool isGeneSegment() { return 
flag_GENE_SEGMENT; } void isGeneSegment(bool v) {flag_GENE_SEGMENT=v; } bool promotedChildren() { return flag_CHILDREN_PROMOTED; } void promotedChildren(bool v) { flag_CHILDREN_PROMOTED=v; } void setLevel(byte v) { gff_level=v; } byte getLevel() { return gff_level; } byte incLevel() { gff_level++; return gff_level; } bool isValidTranscript() { //return (ftype_id==gff_fid_mRNA && exons.Count()>0); return (isTranscript() && exons.Count()>0); } //return the index of exon containing coordinate coord, or -1 if not int whichExon(uint coord, GList* segs=NULL); int readExon(GffReader& reader, GffLine& gl); int addExon(GList& segs, GffLine& gl, int8_t exontype_override=exgffNone); //add to cdss or exons int addExon(uint segstart, uint segend, int8_t exontype=exgffNone, char phase='.', GffScore exon_score=GFFSCORE_NONE, GList* segs=NULL); protected: bool reduceExonAttrs(GList& segs); //utility segment-merging function for addExon() void expandSegment(GList&segs, int oi, uint segstart, uint segend, int8_t exontype); bool processGeneSegments(GffReader* gfr); //for genes that have _gene_segment features (NCBI annotation) void transferCDS(GffExon* cds); public: void removeExon(int idx); void removeExon(GffExon* p); char strand; //true if features are on the reverse complement strand GffScore gscore; int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths) GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature //constructor by gff line parsing: GffObj(GffReader& gfrd, BEDLine& bedline); GffObj(GffReader& gfrd, GffLine& gffline); //if gfline->Parent!=NULL then this will also add the first sub-feature // otherwise, only the main feature is created void copyAttrs(GffObj* from); void clearAttrs() { if (attrs!=NULL) { bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs); delete attrs; attrs=NULL; if (sharedattrs) exons[0]->attrs=NULL; } } GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), cdss(NULL), children(1,false), 
gscore() { //exons: sorted, free, non-unique gffID=NULL; uptr=NULL; ulink=NULL; flags=0; udata=0; parent=NULL; ftype_id=-1; subftype_id=-1; if (anid!=NULL) gffID=Gstrdup(anid); gffnames_ref(names); CDstart=0; // hasCDS <=> CDstart>0 CDend=0; CDphase=0; gseq_id=-1; track_id=-1; strand='.'; attrs=NULL; covlen=0; geneID=NULL; gene_name=NULL; } ~GffObj() { GFREE(gffID); GFREE(gene_name); GFREE(geneID); delete cdss; clearAttrs(); gffnames_unref(names); } //-------------- GffObj* finalize(GffReader* gfr); //complete parsing: must be called in order to merge adjacent/close proximity subfeatures void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false, bool CDSsrc=false); const char* getSubfName() { //returns the generic feature type of the entries in exons array //int sid=exon_ftype_id; //if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS; return names->feats.getName(subftype_id); } void setCDS(uint cd_start, uint cd_end, char phase=0); void setCDS(GffObj* t); //set CDS from another transcript bool monoFeature() { return (exons.Count()==0 || (exons.Count()==1 && //exon_ftype_id==ftype_id && exons[0]->end==this->end && exons[0]->start==this->start)); } bool hasCDS() { return (CDstart>0); } const char* getFeatureName() { return names->feats.getName(ftype_id); } void setFeatureName(const char* feature); void addAttr(const char* attrname, const char* attrvalue); int removeAttr(const char* attrname, const char* attrval=NULL); int removeAttr(int aid, const char* attrval=NULL); int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL); int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL); const char* getAttrName(int i) { if (attrs==NULL) return NULL; return names->attrs.getName(attrs->Get(i)->attr_id); } char* getAttr(const char* attrname, bool checkFirstExon=false) { if (names==NULL || attrname==NULL) return NULL; char* r=NULL; if (attrs==NULL) { if (!checkFirstExon) return NULL; } else r=attrs->getAttr(names, attrname); if 
(r!=NULL) return r; if (checkFirstExon && exons.Count()>0) { r=exons.First()->getAttr(names, attrname); } return r; } char* getExonAttr(GffExon* exon, const char* attrname) { if (exon==NULL || attrname==NULL) return NULL; return exon->getAttr(names, attrname); } char* getExonAttr(int exonidx, const char* attrname) { if (exonidx<0 || exonidx>=exons.Count() || attrname==NULL) return NULL; return exons[exonidx]->getAttr(names, attrname); } char* getAttrValue(int i) { if (attrs==NULL) return NULL; return attrs->Get(i)->attr_val; } const char* getGSeqName() { return names->gseqs.getName(gseq_id); } const char* getRefName() { return names->gseqs.getName(gseq_id); } void setRefName(const char* newname); const char* getTrackName() { return names->tracks.getName(track_id); } bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment //ignores strand! if (s>e) Gswap(s,e); for (int i=0;ioverlap(s,e)) return true; } return false; } bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment //if (gseq_id!=m.gseq_id) return false; // ignores strand and gseq_id, must check in advance for (int i=0;istart>m.exons[j]->end) continue; if (m.exons[j]->start>exons[i]->end) break; //-- overlap if we are here: return true; } } return false; } int exonOverlapIdx(GList& segs, uint s, uint e, int* ovlen=NULL, int start_idx=0); int exonOverlapLen(GffObj& m) { if (start>m.end || m.start>end) return 0; int i=0; int j=0; int ovlen=0; while (istart; uint iend=exons[i]->end; uint jstart=m.exons[j]->start; uint jend=m.exons[j]->end; if (istart>jend) { j++; continue; } if (jstart>iend) { i++; continue; } //exon overlap uint ovstart=GMAX(istart,jstart); if (iend(GffObj& d){ if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id); if (start==d.start) { if (getLevel()==d.getLevel()) { if (end==d.end) return (strcmp(gffID, d.gffID)>0); else return (end>d.end); } else return (getLevel()>d.getLevel()); } else return (start>d.start); } bool operator<(GffObj& d){ if 
(gseq_id!=d.gseq_id) return (gseq_id& cds); void updateCDSPhase(GList& segs); //for CDS-only features, updates GffExon::phase void printGTab(FILE* fout, char** extraAttrs=NULL); void printGxfExon(FILE* fout, const char* tlabel, const char* gseqname, bool iscds, GffExon* exon, bool gff3, bool cvtChars, char* dbuf, int dbuf_len); void printGxf(FILE* fout, GffPrintMode gffp=pgffExon, const char* tlabel=NULL, const char* gfparent=NULL, bool cvtChars=false); void printGtf(FILE* fout, const char* tlabel=NULL, bool cvtChars=false) { printGxf(fout, pgtfAny, tlabel, NULL, cvtChars); } void printGff(FILE* fout, const char* tlabel=NULL, const char* gfparent=NULL, bool cvtChars=false) { printGxf(fout, pgffAny, tlabel, gfparent, cvtChars); } void printTranscriptGff(FILE* fout, char* tlabel=NULL, bool showCDS=false, const char* gfparent=NULL, bool cvtChars=false) { if (isValidTranscript()) printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent, cvtChars); } void printExonList(FILE* fout); //print comma delimited list of exon intervals void printCDSList(FILE* fout); //print comma delimited list of CDS intervals void printBED(FILE* fout, bool cvtChars, char* dbuf, int dbuf_len); //print a BED-12 line + GFF3 attributes in 13th field void printSummary(FILE* fout=NULL); char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL, uint* cds_start=NULL, uint* cds_end=NULL, GMapSegments* seglst=NULL, bool cds_open=false); char* getUnspliced(GFaSeqGet* faseq, int* rlen, GMapSegments* seglst=NULL); void addPadding(int padLeft, int padRight); //change exons to include this padding on the sides void removePadding(int padLeft, int padRight); //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ? 
bool empty() { return (start==0); } }; typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2); //user callback after parsing a mapping object: // Returns: "done with it" status: // TRUE if gobj is no longer needed so it's FREEd upon return // FALSE if the user needs the gobj pointer and is responsible for // collecting and freeing all GffObj objects //GSeqStat: collect basic stats about a common underlying genomic sequence // for multiple GffObj class GSeqStat { public: int gseqid; //gseq id in the global static pool of gseqs char* gseqname; //just a pointer to the name of gseq int fcount;//number of features on this gseq uint mincoord; uint maxcoord; uint maxfeat_len; //maximum feature length on this genomic sequence GffObj* maxfeat; GSeqStat(int id=-1, char* name=NULL) { gseqid=id; gseqname=name; fcount=0; mincoord=MAXUINT; maxcoord=0; maxfeat_len=0; maxfeat=NULL; } bool operator>(GSeqStat& g) { return (gseqid>g.gseqid); } bool operator<(GSeqStat& g) { return (gseqid { public: GfList(bool sorted):GList(sorted,false,false) { } GfList():GList(false,false,false) { //GffObjs in this list are NOT deleted when the list is cleared //-- for deallocation of these objects, call freeAll() or freeUnused() as needed } void finalize(GffReader* gfr); void freeAll() { for (int i=0;iisUsed()) continue; /*//inform the children? 
for (int c=0;cchildren.Count();c++) { fList[i]->children[c]->parent=NULL; } */ delete fList[i]; fList[i]=NULL; } Clear(); } }; class CNonExon { //utility class used in subfeature promotion public: //int idx; GffObj* parent; GffExon* exon; GffLine* gffline; //CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) { CNonExon(GffObj* p, GffExon* e, GffLine& gl) { parent=p; exon=e; //idx=i; gffline=new GffLine(gl); } ~CNonExon() { delete gffline; } }; class GffReader { friend class GffObj; friend class GffLine; friend class GfList; char* linebuf; off_t fpos; int buflen; protected: union { unsigned int flags; unsigned int gff_type: 6; struct { bool is_gff3: 1; //GFF3 syntax was detected bool is_gtf:1; //GTF syntax was detected bool gtf_transcript:1; //has "transcript" features (2-level GTF) bool gtf_gene:1; //has "gene" features (3-level GTF ..Ensembl?) bool is_BED:1; //input is BED-12 format, possibly with attributes in 13th field bool is_TLF:1; //input is GFF3-like Transcript Line Format with exons= attribute //--other flags bool transcripts_Only:1; //default ; only keep recognized transcript features bool keep_Genes:1; //for transcriptsOnly, do not discard genes from gflst bool keep_Attrs:1; bool keep_AllExonAttrs:1; //when keep_Attrs, do not attempt to reduce exon attributes bool noExonAttrs:1; bool ignoreLocus:1; //discard locus features and attributes from input bool merge_CloseExons:1; bool gene2exon:1; bool sortByLoc:1; //if records should be sorted by location bool refAlphaSort:1; //if sortByLoc, reference sequences are // sorted lexically instead of their id# bool gff_warns:1; }; }; //char* lastReadNext; FILE* fh; char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader GFFCommentParser* commentParser; GffLine* gffline; BEDLine* bedline; //bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features //bool gene2exon; // for childless genes: add an exon as the entire gene span GHash discarded_ids; //for 
transcriptsOnly mode, keep track // of discarded parent IDs GHash< GPVec > phash; //transcript_id => GPVec(false) //GHash tids; //just for transcript_id uniqueness char* gfoBuildId(const char* id, const char* ctg); //void gfoRemove(const char* id, const char* ctg); GffObj* gfoAdd(GffObj* gfo); GffObj* gfoAdd(GPVec& glst, GffObj* gfo); GffObj* gfoReplace(GPVec& glst, GffObj* gfo, GffObj* toreplace); // const char* id, const char* ctg, char strand, GVec** glst, uint start, uint end bool pFind(const char* id, GPVec*& glst); GffObj* gfoFind(const char* id, GPVec* & glst, const char* ctg=NULL, char strand=0, uint start=0, uint end=0); CNonExon* subfPoolCheck(GffLine* gffline, GHash& pex, char*& subp_name); void subfPoolAdd(GHash& pex, GffObj* newgfo); GffObj* promoteFeature(CNonExon* subp, char*& subp_name, GHash& pex); #ifdef CUFFLINKS boost::crc_32_type _crc_result; #endif public: GPVec gseqtable; //table with all genomic sequences, but only current GXF gseq ID indices will have non-NULL //GffNames* names; //just a pointer to the global static Gff names repository GfList gflst; //keeps track of all GffObj records being read (when readAll() is used) GffObj* newGffRec(GffLine* gffline, GffObj* parent=NULL, GffExon* pexon=NULL, GPVec* glst=NULL, bool replace_parent=false); GffObj* newGffRec(BEDLine* bedline, GPVec* glst=NULL); //GffObj* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx); GffObj* updateGffRec(GffObj* prevgfo, GffLine* gffline); GffObj* updateParent(GffObj* newgfh, GffObj* parent); bool readExonFeature(GffObj* prevgfo, GffLine* gffline, GHash* pex=NULL); GPVec gseqStats; //populated after finalize() with only the ref seqs in this file GffReader(FILE* f=NULL, bool t_only=false, bool sort=false):linebuf(NULL), fpos(0), buflen(0), flags(0), fh(f), fname(NULL), commentParser(NULL), gffline(NULL), bedline(NULL), discarded_ids(true), phash(true), gseqtable(1,true), gflst(), gseqStats(1, false) { GMALLOC(linebuf, GFF_LINELEN); 
buflen=GFF_LINELEN-1; gffnames_ref(GffObj::names); //gff_warns=gff_show_warnings; transcripts_Only=t_only; sortByLoc=sort; noExonAttrs=true; //lastReadNext=NULL; } /* void init(FILE *f, bool t_only=false, bool sortbyloc=false, bool g2exon=false) { fname=NULL; fh=f; if (fh!=NULL) rewind(fh); fpos=0; flags=0; transcriptsOnly=t_only; gflst.sortedByLoc(sortbyloc); gene2exon=g2exon; } */ void gene2Exon(bool v) { gene2exon=v;} void enableSorting(bool sorting=true) { sortByLoc=sorting; } bool getSorting() { return sortByLoc; } void isBED(bool v=true) { is_BED=v; } //should be set before any parsing! void isTLF(bool v=true) { is_TLF=v; } //should be set before any parsing! void keepAttrs(bool keep_attrs=true, bool discardExonAttrs=true, bool preserve_exon_attrs=false) { keep_Attrs=keep_attrs; noExonAttrs=discardExonAttrs; keep_AllExonAttrs=preserve_exon_attrs; } void transcriptsOnly(bool t_only) { transcripts_Only=t_only; } bool transcriptsOnly() { return transcripts_Only; } void setIgnoreLocus(bool nolocus) { ignoreLocus=nolocus; } void keepGenes(bool keep_genes) { keep_Genes=keep_genes; } bool keepGenes() { return keep_Genes; } void mergeCloseExons(bool merge_close_exons=true) { merge_CloseExons=merge_close_exons; } void showWarnings(bool v) { gff_warns=v; //gff_show_warnings=v; } bool showWarnings() { return gff_warns; } void setRefAlphaSorted(bool v=true) { refAlphaSort=v; if (v) sortByLoc=true; } void setCommentParser(GFFCommentParser* cmParser=NULL) { commentParser=cmParser; } GffReader(const char* fn, bool t_only=false, bool sort=false):linebuf(NULL), fpos(0), buflen(0), flags(0), fh(NULL), fname(NULL), commentParser(NULL), gffline(NULL), bedline(NULL), discarded_ids(true), phash(true), gseqtable(1,true), gflst(), gseqStats(1,false) { //gff_warns=gff_show_warnings; gffnames_ref(GffObj::names); noExonAttrs=true; transcripts_Only=t_only; sortByLoc=sort; fname=Gstrdup(fn); fh=fopen(fname, "rb"); GMALLOC(linebuf, GFF_LINELEN); buflen=GFF_LINELEN-1; //lastReadNext=NULL; 
} ~GffReader() { delete gffline; gffline=NULL; fpos=0; if (fh && fh!=stdin) fclose(fh); gflst.freeUnused(); gflst.Clear(); discarded_ids.Clear(); phash.Clear(); GFREE(fname); GFREE(linebuf); //GFREE(lastReadNext); gffnames_unref(GffObj::names); } GffLine* nextGffLine(); BEDLine* nextBEDLine(); // load all subfeatures, re-group them: void readAll(); void readAll(bool keepAttr, bool mergeCloseExons=false, bool noExonAttr=true) { this->keep_Attrs=keepAttr; this->merge_CloseExons=mergeCloseExons; this->noExonAttrs=noExonAttr; readAll(); } //only for well-formed files: BED or GxF where exons are strictly grouped by their transcript_id/Parent GffObj* readNext(); //user must free the returned GffObj* ! #ifdef CUFFLINKS boost::crc_32_type current_crc_result() const { return _crc_result; } #endif }; // end of GffReader // ---------------------------------------------------------- // -- auxiliary classes for GffObj::processGeneSegments() -- class GSegMatch { //keep track of "matching" overlaps of a GeneCDSChain with multiple GeneSegment containers public: int child_idx; //index of matching _gene_segment GffObj in gene->children[] list int noncov; //number of "non-covered" bases in the GeneSegment int gsegidx; //index of _gene_segment in GVec geneSegs // (i.e. 
UTRs + implied introns if exons are missing) bool operator<(GSegMatch& o) { return (noncovcdss[] list GeneCDS(int i=-1, uint cstart=0, uint cend=0):GSeg(cstart, cend), idx(i) { } }; class GeneCDSChain: public GSeg { //keep track of CDS chains of the gene and their boundaries public: GVec cdsList; //all CDSs in this chain GArray mxs; //list of "matching" container X_gene_segment transcripts; GeneCDSChain():cdsList(),mxs() { } GeneCDSChain(int idx, uint cstart, uint cend):GSeg(cstart, cend), cdsList(),mxs(true) { addCDS(idx, cstart, cend); } void addCDS(int idx, uint cstart, uint cend) { GeneCDS cds(idx, cstart, cend); cdsList.Add(cds); expandInclude(cstart, cend); } void addMatch(int childidx, int ncov, int gsegidx) { GSegMatch segmatch(childidx, ncov, gsegidx); mxs.Add(segmatch); } bool singleExonCDSMatch(uint tstart, uint tend, int& ncov) { if (start>=tstart && end<=tend) { ncov=start-tstart + tend-end; //add all CDS-"introns" if (cdsList.Count()>1) //shouldn't really consider this a valid "match" for (int i=1;ioverlap(cdsList[0])) { if (cdsList[0].start>=t.exons[i]->start && cdsList[0].end<=t.exons[i]->end) { match=true; nc+=cdsList[0].start-t.exons[i]->start+t.exons[i]->end+cdsList[0].end; } //contained in this exon else return false; //overlap, but not contained continue; } nc+=t.exons[i]->len(); } if (!match) return false; ncov=nc; return true; } bool multiCDStoExon(GffObj &t, int& ncov) { //multi-CDS vs multi-exon t int nc=0; int e=0, c=0; int emax=t.exons.Count()-1; int cmax=cdsList.Count()-1; int mintrons=0; //matched introns while (e0 && (cdsList[c].end!=t.exons[e]->end || cdsList[c+1].start!=t.exons[e+1]->start)) return false; GSeg cintron(cdsList[c].end+1, cdsList[c+1].start-1); GSeg eintron(t.exons[e]->end+1, t.exons[e+1]->start-1); if (cintron.start>eintron.end) { nc+=t.exons[e]->len(); e++; continue; } if (eintron.start<=cintron.end) { //intron overlap if (cintron.start==eintron.start && cintron.end==eintron.end) { //intron match if (mintrons==0) { if 
(cdsList[c].startstart) return false; nc+=cdsList[c].start-t.exons[e]->start; } mintrons++; c++;e++; continue; } else return false; } c++; //should never get here, CDS shouldn't be have to catch up with e } if (mintronsend-cdsList[c].end; for(int i=e+1;ilen(); ncov=nc; return true; } bool containedBy(GffObj& t, int& ncov) { // (Warning: t may have no defined exons!) //if yes: ncov will be set to the number of non-CDS-covered bases in t if (t.exons.Count()<2) { if (t.exons.Count()==0) //no exons defined, just check boundaries return singleExonCDSMatch(t.start, t.end, ncov); else //single-exon return singleExonCDSMatch(t.exons[0]->start, t.exons[0]->end, ncov); } //single or no exon else { //multi-exon transcript if (startstart || end>t.exons.Last()->end) return false; //no containment possible; if (cdsList.Count()==1) return singleCDStoExon(t, ncov); //check intron compatibility! } return true; } }; #endif libgff-2.0.0/libgffConfig.cmake.in000066400000000000000000000005421367741004700167420ustar00rootroot00000000000000set(LIB_GFF_VERSION 2.0.0) @PACKAGE_INIT@ set_and_check(LIB_GFF_INCLUDE_DIR "@PACKAGE_INCLUDE_INSTALL_DIR@") set_and_check(LIB_GFF_LIBRARY_DIR "@PACKAGE_LIB_INSTALL_DIR@") set_and_check(LIB_GFF_STATIC_LIBRARY "@PACKAGE_LIB_INSTALL_DIR@libgff.a") #set_and_check(LIB_GFF_SYSCONFIG_DIR "@PACKAGE_SYSCONFIG_INSTALL_DIR@") check_required_components(libgff) libgff-2.0.0/src/000077500000000000000000000000001367741004700135425ustar00rootroot00000000000000libgff-2.0.0/src/GArgs.cpp000066400000000000000000000255321367741004700152600ustar00rootroot00000000000000#include "GBase.h" #include "GArgs.h" #include GArgs::GArgs(int argc, char* argv[], const char* format, bool nodigitopts) { /* format can be: {;|=} e.g. disable-test;PID=S= for --disable-test PID=50 (or --PID 50) S=3.5 etc. [:] e.g. 
p:hT for -p testing (or -ptesting) -h -T */ const char* fstr=format; fmtcount=0; count=0; nonOptCount=0; nonOptPos=0; optPos=0; errarg=0; err_valmissing=false; args=NULL; fmt=NULL; _argc=argc; _argv=argv; int fmtlen=strlen(format); //---- first parse the format string while (fstr-format < fmtlen ) { int l=strcspn(fstr, ";=:"); if (fstr[l]==0) { //end of string reached //all previous chars are just switches: GREALLOC(fmt, (fmtcount+l)*sizeof(fmtdef)); //store each switch for (int i=0; i1 && p[0]=='\'' && p[alen-1]=='\'') { p++; p[alen-2 ]='\0'; } } int GArgs::parseArgs(bool nodigitopts) { int p=1; //skip program name int f=0; while (p<_argc) { // silly patch for annnoying MacOS gdb/eclipse issues: #if defined(__APPLE__) && defined(DEBUG) dbg_dequote(_argv[p]); #endif //-- if (_argv[p][0]=='-' && (_argv[p][1]==0 || _argv[p][1]!='-')) { //single-dash argument int cpos=1; char c=_argv[p][cpos]; if (c==0 || (nodigitopts && isdigit(c)) || (c=='.' && isdigit(_argv[p][cpos+1]))) { //special case: plain argument '-' or just a negative number GREALLOC(args, (count+1)*sizeof(argdata)); args[count].opt=NULL; args[count].fmti=-1; if (c==0) { GCALLOC(args[count].value, 2); args[count].value[0]='-'; } else { //negative number given args[count].value=Gstrdup(_argv[p]); } count++; nonOptCount++; } else { //single-dash argument or switch COLLAPSED: if ((f=validShortOpt(c))>=0) { GREALLOC(args, (count+1)*sizeof(argdata)); GCALLOC(args[count].opt, 2); args[count].opt[0]=c; args[count].fmti=f; if (!fmt[f].req_value) {//switch type GCALLOC(args[count].value,1);//so getOpt() functions would not return NULL count++; // only switches can be grouped with some other switches or options if (_argv[p][cpos+1]!='\0') { cpos++; c=_argv[p][cpos]; goto COLLAPSED; } } else { //single-dash argument followed by a value if (_argv[p][cpos+1]=='\0') { if (p+1<_argc && _argv[p+1][0]!=0) { //value is the whole next argument p++; #if defined(__APPLE__) && defined(DEBUG) dbg_dequote(_argv[p]); #endif 
args[count].value=Gstrdup(_argv[p]); } else { errarg=p; err_valmissing=true; return errarg; } } else { //value immediately follows the dash-option args[count].value=Gstrdup(_argv[p]+cpos+1); } count++; } } //was validShortOpt else { //option not found in format definition! errarg=p; return errarg; } } } //-single-dash else {//not a single-dash argument char* ap=_argv[p]; bool is_longopt=false; if (*ap=='-' && ap[1]=='-') { //double-dash option is_longopt=true; ap+=2; } char* e=strchr(ap+1,'='); while (e!=NULL && *(e-1)=='\\') e=strchr(e,'='); if (e==NULL && is_longopt) { e=ap; while (*e!=0 && *e!=' ') e++; //e will be on eos or next space } if (e!=NULL && e>ap) { //this must be a long option //e is on eos, space or '=' if ((f=validLongOpt(ap,e-1))>=0) { GREALLOC(args, (count+1)*sizeof(argdata)); args[count].opt=Gstrdup(ap,e-1); args[count].fmti=f; if (fmt[f].req_value) { if (*e==0) { //value is the next argument if (p+1<_argc && _argv[p+1][0]!=0) { p++; #if defined(__APPLE__) && defined(DEBUG) dbg_dequote(_argv[p]); #endif args[count].value=Gstrdup(_argv[p]); } else { errarg=p; err_valmissing=true; return errarg; } } else { //value is in the same argument //while (*e!=0 && (*e==' ' || *e=='=')) e++; if (*e=='=') e++; if (*e==0) { errarg=p; err_valmissing=true; return errarg; } args[count].value=Gstrdup(e); } } //value required else { //no value expected GCALLOC(args[count].value,1); //do not return NULL } count++; } else { //error - this long argument not recognized errarg=p; return errarg; } } else { //just a plain non-option argument if (e==ap) { //i.e. 
just "--" errarg=p; return errarg; } GREALLOC(args, (count+1)*sizeof(argdata)); args[count].opt=NULL; //it's not an option args[count].value=Gstrdup(_argv[p]); args[count].fmti=-1; count++; nonOptCount++; } } p++;//check next arg string } //while arguments return errarg; } void GArgs::printError(FILE* fout, const char* usage, bool exitProgram) { if (errarg==0) return; if (usage) fprintf(fout, "%s\n", usage); if (err_valmissing) fprintf(fout, "Error: value required for option '%s'\n", _argv[errarg]); else fprintf(fout, "Error: invalid argument '%s'\n", _argv[errarg]); if (exitProgram) exit(1); } void GArgs::printError(const char* usage, bool exitProgram) { printError(stderr, usage, exitProgram); } void GArgs::printCmdLine(FILE* fout) { if (_argv==NULL) return; for (int i=0;i<_argc;i++) { fprintf(fout, "%s%c", _argv[i], (i==_argc-1)?'\n':' '); } } GArgs::GArgs(int argc, char* argv[], const GArgsDef fmtrecs[], bool nodigitopts) { fmtcount=0; count=0; nonOptCount=0; nonOptPos=0; optPos=0; errarg=0; err_valmissing=false; args=NULL; fmt=NULL; _argc=argc; _argv=argv; if (fmtrecs==NULL) return; const GArgsDef* frec=fmtrecs; while ((frec->longopt || frec->opt) && fmtcount<255) { fmtcount++; frec=&(fmtrecs[fmtcount]); } GCALLOC(fmt, fmtcount*sizeof(fmtdef)); for (int i=0;i=0 && fmt[args[i].fmti].code==c) return args[i].value; return NULL; } char* GArgs::getOptName(int c) { for (int i=0; i=0 && fmt[args[i].fmti].code==c) return args[i].opt; return NULL; } int GArgs::startNonOpt(){ //reset iteration through non-option arguments //returns the number of non-option arguments nonOptPos=0; return nonOptCount; } char* GArgs::nextNonOpt() { //get the next non-dashed argument //or NULL if no more for (int i=nonOptPos;i=0) { optPos=i+1; return fmt[args[i].fmti].code; } return 0; //must make sure that codes are > 0 for this to work properly } libgff-2.0.0/src/GBase.cpp000066400000000000000000000556261367741004700152450ustar00rootroot00000000000000#include "GBase.h" #include #include 
#ifndef S_ISDIR #define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR) #endif #ifndef S_ISREG #define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) #endif //#ifdef _WIN32 // int (WINAPIV * __vsnprintf)(char *, size_t, const char*, va_list) = _vsnprintf; //#endif //************************* Debug helpers ************************** // Assert failed routine void GAssert(const char* expression, const char* filename, unsigned int lineno){ char msg[4096]; sprintf(msg,"%s(%d): ASSERT(%s) failed.\n",filename,lineno,expression); fprintf(stderr,"%s",msg); #ifdef DEBUG // modify here if you [don't] want a core dump abort(); #endif exit(1); } // Error routine (prints error message and exits!) void GError(const char* format,...){ #ifdef _WIN32 char msg[4096]; va_list arguments; va_start(arguments,format); _vsnprintf(msg, 4095, format, arguments); vfprintf(stderr, format, arguments); // if a console is available msg[4095]=0; va_end(arguments); OutputDebugString(msg); MessageBox(NULL,msg,NULL,MB_OK|MB_ICONEXCLAMATION|MB_APPLMODAL); #else va_list arguments; va_start(arguments,format); vfprintf(stderr,format,arguments); va_end(arguments); #ifdef DEBUG // comment this if you do NOT want a core dump abort(); #endif #endif exit(1); } // Warning routine (just print message without exiting) void GMessage(const char* format,...){ #ifdef _WIN32 char msg[4096]; va_list arguments; va_start(arguments,format); vfprintf(stderr, format , arguments); // if a console is available _vsnprintf(msg, 4095, format, arguments); msg[4095]=0; va_end(arguments); OutputDebugString(msg); #else va_list arguments; va_start(arguments,format); vfprintf(stderr,format,arguments); va_end(arguments); #endif } /*************** Memory management routines *****************/ // Allocate memory bool GMalloc(pointer* ptr,unsigned long size){ //GASSERT(ptr); if (size!=0) *ptr=malloc(size); return *ptr!=NULL; } // Allocate cleaned memory (0 filled) bool GCalloc(pointer* ptr,unsigned long size){ GASSERT(ptr); 
*ptr=calloc(size,1); return *ptr!=NULL; } // Resize memory bool GRealloc(pointer* ptr,unsigned long size){ //GASSERT(ptr); if (size==0) { GFree(ptr); return true; } if (*ptr==NULL) {//simple malloc void *p=malloc(size); if (p != NULL) { *ptr=p; return true; } else return false; }//malloc else {//realloc void *p=realloc(*ptr,size); if (p) { *ptr=p; return true; } return false; } } // Free memory, resets ptr to NULL afterward void GFree(pointer* ptr){ GASSERT(ptr); if (*ptr) free(*ptr); *ptr=NULL; } char* Gstrdup(const char* str, int xtracap) { if (str==NULL) return NULL; char *copy=NULL; GMALLOC(copy, strlen(str)+1+xtracap); strcpy(copy,str); return copy; } char* newEmptyStr() { char* zs=NULL; GMALLOC(zs,1); zs[0]=0; return zs; } char* Gstrdup(const char* sfrom, const char* sto) { if (sfrom==NULL || sto==NULL) return NULL; char *copy=NULL; if (sfrom[0]==0 || sto dirstack(4); // stack of directories that should be created while (psep>gpath && *(psep-1)=='/') --psep; //skip double slashes *psep='\0'; int fexists=0; while ((fexists=fileExists(gpath))==0) { dirstack.Push(psep); do { --psep; } while (psep>gpath && *psep!='/'); if (psep<=gpath) { psep=NULL; break; } while (psep>gpath && *(psep-1)=='/') --psep; *psep='\0'; } if (psep) *psep='/'; while (dirstack.Count()>0) { psep=dirstack.Pop(); int mkdir_err=0; if ((mkdir_err=G_mkdir(gpath, perms))!=0) { GMessage("Warning: mkdir(%s) failed: %s\n", gpath, strerror(errno)); GFREE(gpath); umask(process_mask); return -1; } *psep='/'; } GFREE(gpath); umask(process_mask); return 0; } FILE* Gfopen(const char *path, char *mode) { FILE* f=NULL; if (mode==NULL) f=fopen(path, "rb"); else f=fopen(path, mode); if (f==NULL) GMessage("Error opening file '%s': %s\n", path, strerror(errno)); return f; } bool GstrEq(const char* a, const char* b) { if (a==NULL || b==NULL) return false; return (strcmp(a,b)==0); } bool GstriEq(const char* a, const char* b) { if (a==NULL || b==NULL) return false; return (strcasecmp(a,b)==0); } int 
Gstricmp(const char* a, const char* b, int n) { if (a==NULL || b==NULL) return a==NULL ? -1 : 1; if (n>=0) return strncasecmp(a,b,n); else return strcasecmp(a,b); } int strsplit(char* str, GDynArray& fields, const char* delim, int maxfields) { //splits by placing 0 where any of the delim chars are found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int tidx=0; bool afterdelim=true; int i=0; fields.Reset(); while (str[i]!=0 && tidx& fields, const char delim, int maxfields) { //splits by placing 0 where delim is found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int tidx=0; bool afterdelim=true; int i=0; fields.Reset(); while (str[i]!=0 && tidx& fields, int maxfields) { //splits by placing 0 where delim is found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int tidx=0; bool afterdelim=true; int i=0; fields.Reset(); while (str[i]!=0 && tidx=str) { if (*p==ch) return p; p--; } return NULL; } /* DOS/UNIX safer fgets : reads a text line from a (binary) file and update the file position accordingly and the buffer capacity accordingly. The given buf is resized to read the entire line in memory -- even when it's abnormally long */ char* fgetline(char* & buf, int& buf_cap, FILE *stream, off_t* f_pos, int* linelen) { //reads a char at a time until \n and/or \r are encountered int c=0; GDynArray arr(buf, buf_cap); off_t fpos=(f_pos!=NULL) ? 
*f_pos : 0; while ((c=getc(stream))!=EOF) { if (c=='\n' || c=='\r') { if (c=='\r') { if ((c=getc(stream))!='\n') ungetc(c,stream); else fpos++; } fpos++; break; } fpos++; arr.Push((char)c); } //while i=str) { for (i=0; i=0 && s[i]==suffix[j]) { i--; j--; } return (j==-1); } bool endsiWith(const char* s, const char* suffix) { if (suffix==NULL || s==NULL) return false; if (suffix[0]==0) return true; //special case: empty suffix int j=strlen(suffix)-1; int i=strlen(s)-1; if (i=0 && tolower(s[i])==tolower(suffix[j])) { i--; j--; } return (j==-1); } bool trimSuffix(char* s, const char* suffix) { if (suffix==NULL || s==NULL) return false; if (suffix[0]==0) return true; //special case: empty suffix int j=strlen(suffix)-1; int i=strlen(s)-1; if (i=0 && s[i]==suffix[j]) { i--; j--; } if (j==-1) { //suffix found s[i+1]='\0'; //cut here return true; } return false; } bool trimiSuffix(char* s, const char* suffix) { if (suffix==NULL || s==NULL) return false; if (suffix[0]==0) return true; //special case: empty suffix int j=strlen(suffix)-1; int i=strlen(s)-1; if (i=0 && tolower(s[i])==tolower(suffix[j])) { i--; j--; } if (j==-1) { //suffix found s[i+1]='\0'; //cut here return true; } return false; } char* reverseChars(char* str, int slen) { if (slen==0) slen=strlen(str); int l=0; int r=slen-1; char c; while (l=lend) { for (i=0;i>24; h&=0x0fffffff; } GASSERT(h<=0x0fffffff); return h; } int djb_hash(const char* cp) { int h = 5381; while (*cp) h = (int)(33 * h ^ (unsigned char) *cp++); return (h & 0x7FFFFFFF); //always positive //return h; //return absolute value of this int: //int mask = (h >> (sizeof(int) * CHAR_BIT - 1)); //return (h + mask) ^ mask; } /* Fowler/Noll/Vo (FNV) hash function, variant 1a */ int fnv1a_hash(const char* cp) { int h = 0x811c9dc5; while (*cp) { h ^= (unsigned char) *cp++; h *= 0x01000193; } //return h; return (h & 0x7FFFFFFF); } // removes the last part (file or directory name) of a full path // this is a destructive operation for the given string!!! 
// the trailing '/' is guaranteed to be there void delFileName(char* filepath) { char *p, *sep; if (filepath==NULL) return; for (p=filepath, sep=filepath;*p!='\0';p++) if (*p=='/' || *p=='\\') sep=p+1; *sep='\0'; // truncate filepath } // returns a pointer to the last file or directory name in a full path const char* getFileName(const char* filepath) { const char *p, *sep; if (filepath==NULL) return NULL; for (p=filepath, sep=filepath;*p!='\0';p++) if (*p=='/' || *p=='\\') sep=p+1; return sep; } // returns a pointer to the file "extension" part in a filename const char* getFileExt(const char* filepath) { const char *p, *dp, *sep; if (filepath==NULL) return NULL; for (p=filepath, dp=filepath, sep=filepath;*p!='\0';p++) { if (*p=='.') dp=p+1; else if (*p=='/' || *p=='\\') sep=p+1; } return (dp>sep) ? dp : NULL ; } int fileExists(const char* fname) { struct stat stFileInfo; int r=0; // Attempt to get the path attributes int fs = stat(fname,&stFileInfo); if (fs == 0) { r=3; // We were able to get the file attributes // so the path exists if (S_ISREG (stFileInfo.st_mode)) { r=2; } if (S_ISDIR (stFileInfo.st_mode)) { r=1; } } return r; } int64 fileSize(const char* fpath) { #ifdef _WIN32 WIN32_FILE_ATTRIBUTE_DATA fad; if (!GetFileAttributesEx(fpath, GetFileExInfoStandard, &fad)) return -1; // error condition, could call GetLastError to find out more LARGE_INTEGER size; size.HighPart = fad.nFileSizeHigh; size.LowPart = fad.nFileSizeLow; return size.QuadPart; #else struct stat results; if (stat(fpath, &results) == 0) // The size of the file in bytes is in return (int64)results.st_size; else //An error occurred //GMessage("Error at stat(%s)!\n", fpath); return -1; #endif } bool parseNumber(char* &p, double& v) { //skip any spaces.. while (*p==' ' || *p=='\t') p++; char* start=p; /*if (*p=='-') p++; else if (*p=='+') { p++;start++; }*/ /* while ((*p>='1' && *p<='9') || *p=='0' || *p=='.' 
|| *p=='-' || tolower(*p)=='e') p++; */ int numlen=strspn(start, "0123456789eE.-+"); p=start+numlen; //now p is on a non-digit; if (*start=='-' && p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; v=strtod(start,&endptr); *p=saved; if (endptr!=p) return false; return true; } bool parseDouble(char* &p, double& v) { return parseNumber(p,v); } bool parseFloat(char* &p, float& v) { double dv; bool parsed=parseNumber(p,dv); if (parsed) v=(float)dv; return parsed; } bool parseInt(char* &p, int& i) { //pointer p is advanced after the number while (*p==' ' || *p=='\t') p++; char* start=p; char* p0=p; if (*p=='-') p++; else if (*p=='+') { p++;start++; } char* atdigits=p; while (*p>='0' && *p<='9') p++; //now p should be past the digits if (atdigits==p) {//no digits found! p=p0; return false; } char* endptr=NULL; long l=strtol(start,&endptr,10); i=(int)l; if (endptr!=p || endptr==start || i!=l) { p=p0; return false; } return true; } bool strToInt(char* p, int& i) { while (*p==' ' || *p=='\t') p++; char* start=p; if (*p=='-') p++; else if (*p=='+') { p++;start++; } char* atdigits=p; while (*p>='0' && *p<='9') p++; //now p should be past the digits if (atdigits==p) //no digits found! 
return false; char* endptr=NULL; long l=strtol(start,&endptr,10); i=(int)l; if (endptr!=p || endptr==start || i!=l) return false; return true; } bool strToUInt(char* p, uint& i) { while (*p==' ' || *p=='\t') p++; char* start=p; if (*p=='-') return false; else if (*p=='+') { p++;start++; } while (*p>='0' && *p<='9') p++; //now p is on a non-digit; if (start==p) return false; char* endptr=NULL; unsigned long l=strtoul(start,&endptr,10); i=(uint) l; if (endptr!=p || endptr==start || i!=l) return false; return true; } bool parseUInt(char* &p, uint& i) { //pointer p is advanced after the number while (*p==' ' || *p=='\t') p++; char* p0=p; char* start=p; if (*p=='-') return false; else if (*p=='+') { p++;start++; } while (*p>='0' && *p<='9') p++; //now p is on a non-digit; if (start==p) { p=p0; return false; } char* endptr=NULL; unsigned long l=strtoul(start,&endptr,10); i=(uint) l; if (endptr!=p || endptr==start || i!=l) { p=p0; return false; } return true; } bool parseHex(char* &p, uint& i) { //skip initial spaces/prefix while (*p==' ' || *p=='\t' || *p=='0' || *p=='x') p++; char* start=p; if (*p=='-') return false; else if (*p=='+') { p++;start++; } while (isxdigit(*p)) p++; //now p is on a non-hexdigit; if (p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; unsigned long l=strtoul(start,&endptr,16); i=(uint) l; *p=saved; if (endptr!=p || i!=l) return false; return true; } //write a formatted fasta record, fasta formatted void writeFasta(FILE *fw, const char* seqid, const char* descr, const char* seq, int linelen, int seqlen) { fflush(fw); // write header line only if given! 
if (seqid!=NULL) { if (descr==NULL || descr[0]==0) fprintf(fw,">%s\n",seqid); else fprintf(fw,">%s %s\n",seqid, descr); } fflush(fw); if (seq==NULL || *seq==0) return; //nothing to print if (linelen==0) { //unlimited line length: write the whole sequence on a line if (seqlen>0) fwrite((const void*)seq, 1, seqlen,fw); else fprintf(fw,"%s",seq); fprintf(fw,"\n"); fflush(fw); return; } int ilen=0; if (seqlen>0) { //seq length given, so we know when to stop for (int i=0; i < seqlen; i++, ilen++) { if (ilen == linelen) { fputc('\n', fw); ilen = 0; } fputc(seq[i], fw); } fputc('\n', fw); } else { //seq length not given, stop when 0 encountered for (int i=0; seq[i]!=0; i++, ilen++) { if (ilen == linelen) { fputc('\n', fw); ilen = 0; } fputc(seq[i], fw); } //for fputc('\n', fw); } fflush(fw); } char* commaprintnum(uint64 n) { char retbuf[48]; int comma = ','; char *p = &retbuf[sizeof(retbuf)-1]; int i = 0; *p = '\0'; do { if(i%3 == 0 && i != 0) *--p = comma; *--p = '0' + n % 10; n /= 10; i++; } while(n != 0); return Gstrdup(p); } libgff-2.0.0/src/GFaSeqGet.cpp000066400000000000000000000267331367741004700160270ustar00rootroot00000000000000#include "GFaSeqGet.h" #include "gdna.h" #include GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, const char* seqid) { if (gfasta.fastaPath==NULL) return NULL; return gfasta.fetch(seqid); } void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto, uint maxseqlen) { if (sovl==0) { GFREE(sq); sqstart=sstart; uint max_len=(maxseqlen>0) ? maxseqlen : MAX_FASUBSEQ; sqlen = (slen==0 ? 
max_len : slen); GMALLOC(sq, sqlen); return; } //overlap -- copy the overlapping region char* newsq=NULL; GMALLOC(newsq, slen); memcpy((void*)&newsq[qto], (void*)&sq[qfrom], sovl); GFREE(sq); sq=newsq; sqstart=sstart; sqlen=slen; } void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) { fh=fopen(fn,"rb"); if (fh==NULL) { GError("Error (GFaSeqGet) opening file '%s'\n",fn); } fname=Gstrdup(fn); initialParse(fofs, validate); lastsub=new GSubSeq(); } GFaSeqGet::GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen):fname(NULL), fh(NULL), fseqstart(0), seq_len(0), line_len(0), line_blen(0), lastsub(NULL), seqname(NULL) { //for GFastaIndex use mostly -- the important difference is that //the file offset is to the sequence, not to the defline fh=fopen(faname,"rb"); if (fh==NULL) { GError("Error (GFaSeqGet) opening file '%s'\n",faname); } fname=Gstrdup(faname); line_len=l_len; line_blen=l_blen; seq_len=seqlen; if (line_blen fseqname(64); fseqname.DetachPtr(); //will not free the allocated memory fseqstart=fofs; int c=getc(fh); fseqstart++; if (c!='>') //fofs must be at the beginning of a FASTA record! 
GError("Error (GFaSeqGet): not a FASTA record?\n"); bool getName=true; while ((c=getc(fh))!=EOF) { fseqstart++; if (getName) { if (c<=32) getName=false; else //seqname.append((char)c); fseqname.Add((char)c); } if (c=='\n' || c=='\r') { break; } //end of defline } fseqname.Add('\0'); //terminate the string seqname=fseqname(); //takeover the string pointer if (c==EOF) GError(gfa_ERRPARSE); line_len=0; uint lendlen=0; while ((c=getc(fh))!=EOF) { if (c=='\n' || c=='\r') { //end of line encountered if (line_len>0) { //end of the first "sequence" line lendlen++; break; } else {// another EoL char at the end of defline fseqstart++; continue; } }// end-of-line characters line_len++; } //we are at the end of first sequence line while ((c=getc(fh))!=EOF) { if (c=='\n' || c=='\r') lendlen++; else { ungetc(c,fh); break; } } line_blen=line_len+lendlen; if (c==EOF) return; // -- you don't need to check it all if you're sure it's safe if (checkall) { //validate the rest of the FASTA records uint llen=0; //last line length uint elen=0; //length of last line ending bool waseol=true; while ((c=getc(fh))!=EOF) { if (c=='>' && waseol) { ungetc(c,fh); break; } if (c=='\n' || c=='\r') { // eol char elen++; if (waseol) continue; //2nd eol char waseol=true; elen=1; continue; } if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered //--- on a seq char here: if (waseol) {//beginning of a seq line if (elen && (llen!=line_len || elen!=lendlen)) //GError(gfa_ERRPARSE); GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\ the sequence lines have the same length (except for the last line)"); waseol=false; llen=0; elen=0; } llen++; } //while reading chars }// FASTA checking was requested fseeko(fh,fseqstart,SEEK_SET); } const char* GFaSeqGet::subseq(uint cstart, int& clen) { //cstart is 1-based genomic coordinate within current fasta sequence int maxlen=(seq_len>0)?seq_len : MAX_FASUBSEQ; //GMessage("--> call: subseq(%u, %d)\n", cstart, clen); if (clen>maxlen) { 
GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen); return NULL; } if (seq_len>0 && clen+cstart-1>seq_len) { //GMessage("Error (GFaSeqGet): end coordinate (%d) cannot be larger than sequence length %d\n", clen+cstart-1, seq_len); //Adjust request: clen=seq_len-cstart+1; } if (lastsub->sq==NULL || lastsub->sqlen==0) { lastsub->setup(cstart, clen, 0,0,0,seq_len); loadsubseq(cstart, clen); lastsub->sqlen=clen; return (const char*)lastsub->sq; } //allow extension up to MAX_FASUBSEQ uint bstart=lastsub->sqstart; uint bend=lastsub->sqstart+lastsub->sqlen-1; uint cend=cstart+clen-1; int qlen=0; //only the extra len to be allocated/appended/prepended uint qstart=cstart; //start coordinate of the new seq block of length qlen to be read from file int newlen=0; //the new total length of the buffered sequence lastsub->sq int kovl=0; int czfrom=0;//0-based offsets for copying a previously read sequence chunk int czto=0; uint newstart=cstart; if (cstart>=bstart && cend<=bend) { //new reg contained within existing buffer return (const char*) &(lastsub->sq[cstart-bstart]) ; } //extend downward uint newend=GMAX(cend, bend); if (cstartMAX_FASUBSEQ) { newlen=MAX_FASUBSEQ; newend=cstart+newlen-1; //keep newstart, set newend } qlen=bstart-cstart; if (newend>bstart) { //overlap if (newend>bend) {// new region is larger & around the old one - so we have two regions to update kovl=bend-bstart+1; czfrom=0; czto=bstart-cstart; lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc and copy the kovl subseq qlen=bstart-cstart; loadsubseq(newstart, qlen); qlen=newend-bend; int toread=qlen; loadsubseq(bend+1, qlen); clen-=(toread-qlen); lastsub->sqlen=clen; return (const char*)lastsub->sq; } //newend<=bend kovl=newend-bstart+1; } else { //no overlap with previous buffer if (newend>bend) kovl=bend-bstart+1; else kovl=newend-bstart+1; } qlen=bstart-cstart; czfrom=0; czto=qlen; } //cstart=bstart, possibly extend upwards newstart=bstart; 
newlen=(newend-newstart+1); if (newlen>MAX_FASUBSEQ) { newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend, assign newstart newlen=MAX_FASUBSEQ; if (newstart<=bend) { //overlap with old buffer kovl=bend-newstart+1; czfrom=newstart-bstart; czto=0; } else { //not overlapping old buffer kovl=0; } } //newstart reassigned else { //we can extend the buffer to include the old one qlen=newend-bend; //how much to read from file qstart=bend+1; kovl=bend-bstart+1; czfrom=0; czto=0; } } lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc but copy any overlapping region lastsub->sqlen-=qlen; //appending may result in a premature eof int toread=qlen; loadsubseq(qstart, qlen); //read the missing chunk, if any clen-=(toread-qlen); lastsub->sqlen+=qlen; return (const char*)(lastsub->sq+(cstart-newstart)); } char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) { if (cstart>cend) { Gswap(cstart, cend); } int clen=cend-cstart+1; const char* gs=subseq(cstart, clen); if (gs==NULL) return NULL; char* r=NULL; GMALLOC(r,clen+1); r[clen]=0; memcpy((void*)r,(void*)gs, clen); if (revCmpl) reverseComplement(r,clen); if (upCase) { for (int i=0;isq space allocated previously //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart] if (cstart>seq_len || lastsub->sqstart>cstart) { clen=0; //invalid request return NULL; } int eol_size=line_blen-line_len; char* seqp=lastsub->sq+(int)(cstart-lastsub->sqstart); //should be positive offset? //find the proper file offset and read the appropriate lines cstart--; //seq start offset, 0-based int lineofs = cstart % line_len; //file offset, relative to the first letter of the sequence in the file off_t f_start= ((int)(cstart/line_len))*line_blen + lineofs; uint letters_toread=clen; //actual sequence letters to read int maxlen=(seq_len>0)? 
seq_len-cstart : MAX_FASUBSEQ ; if (clen==0) letters_toread=maxlen; //read max allowed, or to the end of file uint c_end=cstart+letters_toread; //cstart+clen off_t f_end= ((int)(c_end/line_len))*line_blen + c_end % line_len; int bytes_toRead=f_end-f_start; f_start+=fseqstart; // file offset from the beginning of the file fseeko(fh, f_start, SEEK_SET); size_t actual_read=0; char* smem=NULL; GMALLOC(smem, bytes_toRead); actual_read=fread((void*)smem, 1, bytes_toRead, fh); if (actual_read==0) { //error reading any bytes from the file, or invalid request clen=0; return (const char*)seqp; } uint mp=0; //current read offset in smem uint sublen=0; //current sequence letter storage offset in seqp //copySeqOnly(seqp, smem, actualrlen); bool rdone=false; if (lineofs>0) { //read the partial first line uint reqrlen=line_len-lineofs; if (reqrlen>letters_toread) { reqrlen=letters_toread; //in case we need to read just a few chars rdone=true; } if (reqrlen>actual_read) { reqrlen=actual_read; //incomplete file read? 
rdone=true; } memcpy((void*)seqp, (void*)smem, reqrlen); if (rdone) { //eof reached prematurely GFREE(smem); clen=reqrlen; return (const char*)seqp; } letters_toread-=reqrlen; sublen+=reqrlen; mp+=reqrlen+eol_size; if (mp>actual_read) { GFREE(smem); clen=reqrlen; return (const char*)seqp; } }//loading first line //read the rest of the lines while (letters_toread>=line_len && mp+line_len=actual_read) { GFREE(smem); clen=sublen; return (const char*)seqp; } // read the last partial line, if any if (letters_toread>0) { if (mp+letters_toread>actual_read) letters_toread=actual_read-mp; if (letters_toread>0) { memcpy((void*)(&seqp[sublen]), (void*)(&smem[mp]), letters_toread); sublen+=letters_toread; } } //lastsub->sqlen+=sublen; GFREE(smem); clen=sublen; return (const char*)seqp; } libgff-2.0.0/src/GFastaIndex.cpp000066400000000000000000000133661367741004700164140ustar00rootroot00000000000000/* * GFastaIndex.cpp * * Created on: Aug 25, 2010 * Author: gpertea */ #include "GFastaIndex.h" #define ERR_FAIDXLINE "Error parsing fasta index line: \n%s\n" #define ERR_FALINELEN "Error: sequence lines in a FASTA record must have the same length!\n" void GFastaIndex::addRecord(const char* seqname, uint seqlen, off_t foffs, int llen, int llen_full) { GFastaRec* farec=records.Find(seqname); if (farec!=NULL) { GMessage("Warning: duplicate sequence ID (%s) added to the fasta index! 
Only last entry data will be kept.\n"); farec->seqlen=seqlen; farec->fpos=foffs; farec->line_len=llen; farec->line_blen=llen_full; } else { farec=new GFastaRec(seqlen,foffs,llen,llen_full); records.Add(seqname,farec); farec->seqname=records.getLastKey(); } } int GFastaIndex::loadIndex(const char* finame) { //load record info from existing fasta index if (finame==NULL) finame=fai_name; if (finame!=fai_name) { fai_name=Gstrdup(finame); } if (fai_name==NULL) GError("Error: GFastaIndex::loadIndex() called with no file name!\n"); records.Clear(); haveFai=false; FILE* fi=fopen(fai_name,"rb"); if (fi==NULL) { GMessage("Warning: cannot open fasta index file: %s!\n",fai_name); return 0; } GLineReader fl(fi); char* s=NULL; while ((s=fl.nextLine())!=NULL) { if (*s=='#') continue; char* p=strchrs(s,"\t "); if (p==NULL) GError(ERR_FAIDXLINE,s); *p=0; //s now holds the genomic sequence name p++; uint len=0; int line_len=0, line_blen=0; #ifdef _WIN32 long offset=-1; sscanf(p, "%d%ld%d%d", &len, &offset, &line_len, &line_blen); #else long long offset=-1; sscanf(p, "%d%lld%d%d", &len, &offset, &line_len, &line_blen); #endif if (len==0 || line_len==0 || line_blen==0 || line_blen0); return records.Count(); } int GFastaIndex::buildIndex() { //this parses the whole fasta file, so it could be slow for large files //builds the index in memory only if (fa_name==NULL) GError("Error: GFastaIndex::buildIndex() called with no fasta file!\n"); FILE* fa=fopen(fa_name,"rb"); if (fa==NULL) { GMessage("Warning: cannot open fasta index file: %s!\n",fa_name); return 0; } records.Clear(); GLineReader fl(fa); char* s=NULL; uint seqlen=0; int line_len=0,line_blen=0; bool newSeq=false; //set when FASTA header is encountered off_t newSeqOffset=0; //int prevOffset=0; char* seqname=NULL; int last_len=0; bool mustbeLastLine=false; //true if the line length decreases while ((s=fl.nextLine())!=NULL) { if (s[0]=='>') { if (seqname!=NULL) { if (seqlen==0) GError("Warning: empty FASTA record skipped 
(%s)!\n",seqname); else { //seqlen!=0 addRecord(seqname, seqlen,newSeqOffset, line_len, line_blen); } } char *p=s; while (*p > 32) p++; *p=0; GFREE(seqname); seqname=Gstrdup(&s[1]); newSeq=true; newSeqOffset=fl.getfpos(); last_len=0; line_len=0; line_blen=0; seqlen=0; mustbeLastLine=false; } //defline parsing else { //sequence line int llen=fl.tlength(); int lblen=fl.blength(); //fl.getFpos()-prevOffset; if (newSeq) { //first sequence line after defline line_len=llen; line_blen=lblen; } else {//next seq lines after first if (mustbeLastLine) { //could be empty line, adjust for possible spaces if (llen>0) { char *p=s; //trim spaces, tabs etc. on the last line while (*p > 32) ++p; llen=(p-s); } if (llen>0) GError(ERR_FALINELEN); } else { if (llenlast_len) GError(ERR_FALINELEN); } } seqlen+=llen; last_len=llen; newSeq=false; } //sequence line //prevOffset=fl.getfpos(); }//for each line of the fasta file if (seqlen>0) addRecord(seqname, seqlen, newSeqOffset, line_len, line_blen); GFREE(seqname); fclose(fa); return records.Count(); } int GFastaIndex::storeIndex(const char* finame) { //write the hash to a file if (records.Count()==0) GError("Error at GFastaIndex:storeIndex(): no records found!\n"); FILE* fai=fopen(finame, "w"); if (fai==NULL) GError("Error creating fasta index file: %s\n",finame); int rcount=storeIndex(fai); GFREE(fai_name); fai_name=Gstrdup(finame); return rcount; } int GFastaIndex::storeIndex(FILE* fai) { int rcount=0; GList reclist(true,false,true); //sorted, don't free members, unique records.startIterate(); GFastaRec* rec=NULL; while ((rec=records.NextData())!=NULL) { reclist.Add(rec); } //reclist has records sorted by file offset for (int i=0;iseqname,reclist[i]->seqlen,(long)reclist[i]->fpos, reclist[i]->line_len, reclist[i]->line_blen); #else int written=fprintf(fai, "%s\t%d\t%lld\t%d\t%d\n", reclist[i]->seqname, reclist[i]->seqlen, (long long)(reclist[i]->fpos), reclist[i]->line_len, reclist[i]->line_blen); #endif if (written>0) rcount++; else 
break; //couldn't write anymore } fclose(fai); haveFai=(rcount>0); return rcount; } libgff-2.0.0/src/GStr.cpp000066400000000000000000001134121367741004700151270ustar00rootroot00000000000000//--------------------------------------------------------------------------- #include "GStr.h" #include #include #include #include "GBase.h" #include #include //--------------------------------------------------------------------------- GStr::Data GStr::null_data; //========================================= GStr::Data * GStr::new_data(uint len, uint addcap) { //static method to return a new Data object (allocate length) //content is undefined, but it's null terminated if (len > 0) { Data* data; GMALLOC(data, sizeof(Data)+len+addcap); data->ref_count = 0; data->length = len; data->cap=len+addcap; data->chars[len] = '\0'; return data; } else return &null_data; } GStr::Data* GStr::new_data(const char* str, uint addcap) { //static method to return a new Data object (allocate: length+addcap) //as a copy of a given string if (str==NULL) return &null_data; int len=strlen(str); if (len+addcap > 0) { Data* data; GMALLOC(data, sizeof(Data)+len+addcap); strcpy(data->chars, str); data->ref_count = 0; data->cap=len+addcap; data->length = len; data->chars[len] = '\0'; return data; } else return &null_data; } void GStr::prep_data(uint len, uint addcap) { uint newcap=len+addcap; if (newcap > 0 && my_data->ref_count <= 1 && my_data->cap>=newcap && my_data->cap-newcap<(newcap>>1)+2) { //no need to shrink/reallocate the already allocated space my_data->length = len; my_data->chars[len]=0; return; } if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); if (len + addcap> 0) { GMALLOC(my_data, sizeof(Data)+len+addcap); my_data->ref_count = 1; my_data->length = len; my_data->cap=len+addcap; my_data->chars[len] = 0; } else my_data = &null_data; } GStr& GStr::clear(int init_cap) { make_unique(); //edit operation ahead prep_data(0, init_cap); return *this; } void GStr::replace_data(Data 
*data) { if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); if (data != &null_data) data->ref_count++; my_data = data; } void GStr::make_unique() {//make sure it's not a reference to other string if (my_data->ref_count > 1) { Data *data = new_data(my_data->length, 0); ::memcpy(data->chars, my_data->chars, my_data->length); my_data->ref_count--; my_data = data; my_data->ref_count++; } } bool operator==(const char *s1, const GStr& s2){ if (s1==NULL) return s2.is_empty(); return (strcmp(s1, s2.chars()) == 0); } bool operator<(const char *s1, const GStr& s2) { if (s1==NULL) return !s2.is_empty(); return (strcmp(s1, s2.chars()) < 0); } bool operator<=(const char *s1, const GStr& s2){ if (s1==NULL) return true; return (strcmp(s1, s2.chars()) <= 0); } bool operator>(const char *s1, const GStr& s2) { if (s1==NULL) return false; return (strcmp(s1, s2.chars()) > 0); } GStr::GStr():my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; } //detach from the string data, returning a pointer to it char* GStr::detach() { make_unique(); char *r=my_data->chars; my_data=&null_data; fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; return r; } GStr::GStr(const GStr& s): my_data(&null_data){ fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; replace_data(s.my_data); } GStr::GStr(const char *s, uint addcap): my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; my_data=new_data(s, addcap); my_data->ref_count = 1; } GStr::GStr(const int i): my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; char buf[20]; sprintf(buf,"%d",i); const int len = ::strlen(buf); prep_data(len); ::memcpy(chrs(), buf, len); } GStr::GStr(const double f): my_data(&null_data) { fTokenDelimiter=NULL; 
fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; char buf[20]; sprintf(buf,"%f",f); const int len = ::strlen(buf); prep_data(len); ::memcpy(chrs(), buf, len); } GStr::GStr(char c, int n): my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; prep_data(n); ::memset(chrs(), c, n); } GStr::~GStr() { if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); GFREE(fTokenDelimiter); GFREE(readbuf); } char& GStr::operator[](int idx){ //returns reference to char (can be l-value) if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("operator[]"); make_unique(); //because the user will probably modify this char! return chrs()[idx]; } char GStr::operator[](int idx) const { //returns char copy (cannot be l-value!) if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("operator[]"); return my_data->chars[idx]; } GStr& GStr::operator=(const GStr& s) { make_unique(); //edit operation ahead replace_data(s.my_data); return *this; } GStr& GStr::operator=(const char *s) { make_unique(); //edit operation ahead if (s==NULL) { prep_data(0); return *this; } const int len = ::strlen(s); prep_data(len); ::memcpy(my_data->chars, s, len); return *this; } GStr& GStr::operator=(const double f) { make_unique(); //edit operation ahead char buf[20]; sprintf(buf,"%f",f); const int len = ::strlen(buf); prep_data(len); ::memcpy(my_data->chars, buf, len); return *this; } GStr& GStr::operator=(const int i) { make_unique(); //edit operation ahead char buf[20]; sprintf(buf,"%d",i); const int len = ::strlen(buf); prep_data(len); ::memcpy(my_data->chars, buf, len); return *this; } bool GStr::operator==(const GStr& s) const { if (s.is_empty()) return is_empty(); return (length() == s.length()) && (memcmp(my_data->chars, s.chars(), length()) == 0); } bool GStr::operator==(const char *s) const { if (s==NULL) return is_empty(); return 
(strcmp(my_data->chars, s) == 0); } bool GStr::operator<(const GStr& s) const { if (s.is_empty()) return false; return (strcmp(my_data->chars, s.chars()) < 0); } bool GStr::operator<(const char *s) const { if (s==NULL) return false; return (strcmp(my_data->chars, s) < 0); } bool GStr::operator<=(const GStr& s) const { if (s.is_empty()) return is_empty(); return (strcmp(my_data->chars, s.chars()) <= 0); } bool GStr::operator<=(const char *s) const { if (s==NULL) return is_empty(); return (strcmp(my_data->chars, s) <= 0); } bool GStr::operator>(const GStr& s) const { if (s.is_empty()) return !is_empty(); return (strcmp(my_data->chars, s.chars()) > 0); } bool GStr::operator>(const char *s) const { if (s==NULL) return !is_empty(); return (strcmp(my_data->chars, s) > 0); } bool GStr::operator>=(const GStr& s) const { if (s.is_empty()) return true; return (strcmp(my_data->chars, s.chars()) >= 0); } bool GStr::operator>=(const char *s) const { if (s==NULL) return true; return (strcmp(my_data->chars, s) >= 0); } bool GStr::operator!=(const GStr& s) const { if (s.is_empty()) return !is_empty(); return (length() != s.length()) || (memcmp(my_data->chars, s.chars(), length()) != 0); } bool GStr::operator!=(const char *s) const { if (s==NULL) return !is_empty(); return (strcmp(my_data->chars, s) != 0); } GStr& GStr::append(int i) { char buf[20]; sprintf(buf,"%d",i); return append(buf); } GStr& GStr::append(uint i) { char buf[20]; sprintf(buf,"%u",i); return append(buf); } GStr& GStr::append(long l) { char buf[20]; sprintf(buf,"%ld",l); return append(buf); } GStr& GStr::append(unsigned long l) { char buf[20]; sprintf(buf,"%lu", l); return append(buf); } GStr& GStr::append(double f) { char buf[30]; sprintf(buf,"%f",f); return append(buf); } bool GStr::is_empty() const { //return my_data == &null_data; return (length()==0); } GStr GStr::copy() const { GStr newstring(*this); return newstring; } int GStr::index(const GStr& s, int start_index) const { return index(s.chars(), 
start_index); } bool GStr::contains(const GStr& s) const { return (index(s, 0) >= 0); } bool GStr::contains(const char *s) const { return (index(s, 0) >= 0); } bool GStr::startsWith(const char *s) const { //return (index(s, 0) == 0); return ::startsWith(my_data->chars, s); } bool GStr::startsWith(const GStr& s) const { //return (index(s, 0) == 0); return ::startsWith(my_data->chars, s.chars()); } bool GStr::endsWith(const char *s) const { //return (index(s, 0) == 0); return ::endsWith(my_data->chars, s); } bool GStr::endsWith(const GStr& s) const { //return (index(s, 0) == 0); return ::endsWith(my_data->chars, s.chars()); } bool GStr::contains(char c) const { return (index(c, 0) >= 0); } GStr& GStr::format(const char *fmt,...) { // Format as in sprintf make_unique(); //edit operation ahead char* buf; GMALLOC(buf, strlen(fmt)+1024); va_list arguments; va_start(arguments,fmt); //+1K buffer, should be enough for common expressions int len=vsprintf(buf,fmt,arguments); va_end(arguments); prep_data(len); //this also adds the '\0' at the end! //and sets the right len ::memcpy(chrs(), buf, len); GFREE(buf); return *this; } GStr& GStr::appendfmt(const char *fmt,...) 
{ // Format as in sprintf make_unique(); //edit operation ahead char* buf; GMALLOC(buf, strlen(fmt)+1024); va_list arguments; va_start(arguments,fmt); //+1K buffer, should be enough for common expressions vsprintf(buf,fmt,arguments); va_end(arguments); append(buf); GFREE(buf); return *this; } GStr& GStr::trim(char c) { int istart; int iend; for (istart=0; istartchars[istart]==c;istart++) ; if (istart==length()) { make_unique(); //edit operation ahead prep_data(0); //string was entirely trimmed return *this; } for (iend=length()-1; iend>istart && my_data->chars[iend]==c;iend--) ; int newlen=iend-istart+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, &my_data->chars[istart], newlen); replace_data(data); return *this; } GStr& GStr::trim(const char* c) { int istart; int iend; for (istart=0; istartchars[istart])!=NULL ;istart++) ; if (istart==length()) { prep_data(0); //string was entirely trimmed return *this; } for (iend=length()-1; iend>istart && strchr(c, my_data->chars[iend])!=NULL;iend--) ; int newlen=iend-istart+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, & (my_data->chars[istart]), newlen); replace_data(data); return *this; } GStr& GStr::trimR(char c) { //only trim the right end int iend; for (iend=length()-1; iend>=0 && my_data->chars[iend]==c;iend--) ; if (iend==-1) { make_unique(); prep_data(0); //string was entirely trimmed return *this; } int newlen=iend+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead my_data->length=newlen; my_data->chars[newlen]='\0'; /* Data *data = new_data(newlen); ::memcpy(data->chars, my_data->chars, newlen); replace_data(data); */ return *this; } GStr& GStr::trimR(const char* c) { int iend; for (iend=length()-1; iend>=0 && strchr(c,my_data->chars[iend])!=NULL;iend--) ; if (iend==-1) { 
make_unique(); prep_data(0); //string was entirely trimmed return *this; } int newlen=iend+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead my_data->length=newlen; my_data->chars[newlen]='\0'; /* Data *data = new_data(newlen); ::memcpy(data->chars, my_data->chars, newlen); replace_data(data); */ return *this; } GStr& GStr::chomp(const char* cstr) { int iend; if (cstr==NULL || *cstr==0) return *this; //check if this ends with cstr int cend=strlen(cstr)-1; iend=my_data->length-1; while (iend>=0 && cend>=0) { if (my_data->chars[iend]!=cstr[cend]) return *this; iend--; cend--; } if (iend==-1) { make_unique(); prep_data(0); //string will be entirely trimmed return *this; } int newlen=iend+1; make_unique(); //edit operation ahead my_data->length=newlen; my_data->chars[newlen]='\0'; //Data *data = new_data(newlen); //::memcpy(data->chars, my_data->chars, newlen); //replace_data(data); return *this; } GStr& GStr::trimL(char c) { int istart; for (istart=0; istartchars[istart]==c;istart++) ; if (istart==length()) { prep_data(0); //string was entirely trimmed return *this; } int newlen=length()-istart; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, &my_data->chars[istart], newlen); replace_data(data); return *this; } GStr& GStr::trimL(const char* c) { int istart; for (istart=0; istartchars[istart])!=NULL;istart++) ; if (istart==length()) { prep_data(0); //string was entirely trimmed return *this; } int newlen=length()-istart; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, &my_data->chars[istart], newlen); replace_data(data); return *this; } GStr& GStr::padR(uint len, char c) { //pad with c until total string length is len if (my_data->length>=len) return *this; //no room for padding make_unique(); //edit operation ahead if 
(my_data->cap>=len) { ::memset(my_data->chars, c, len-my_data->length); my_data->length=len; return *this; } Data *data = new_data(len); ::memset(data->chars,c,len-my_data->length); ::memcpy(&data->chars[len-length()], my_data->chars, my_data->length); replace_data(data); return *this; } GStr& GStr::padL(uint len, char c) { //align left the string if (my_data->length>=len) return *this; //no room for padding make_unique(); //edit operation ahead Data *data = new_data(len); ::memcpy(data->chars, my_data->chars, length()); ::memset(&data->chars[length()],c,len-length()); replace_data(data); return *this; } GStr& GStr::padC(uint len, char c) { if (my_data->length>=len) return *this; //no room for padding make_unique(); //edit operation ahead uint istart=(len-length())/2; Data *data = new_data(len); if (istart>0) ::memset(data->chars, c, istart); ::memcpy(&data->chars[istart], my_data->chars, length()); uint iend=istart+length(); if (iendchars[iend],c,len-iend); replace_data(data); return *this; } GStr operator+(const char *s1, const GStr& s2) { const int s1_length = ::strlen(s1); if (s1_length == 0) return s2; else { GStr newstring; newstring.prep_data(s1_length + s2.length()); ::memcpy(newstring.chrs(), s1, s1_length); ::memcpy(&(newstring.chrs())[s1_length], s2.chars(), s2.length()); return newstring; } } //========================================= GStr GStr::operator+(const GStr& s) const { if (length() == 0) return s; else if (s.length() == 0) return *this; else { GStr newstring; newstring.prep_data(length() + s.length()); ::memcpy(newstring.chrs(), my_data->chars, length()); ::memcpy(&(newstring.chrs())[length()], s.chars(), s.length()); return newstring; } } //========================================= GStr GStr::operator+(const char *s) const { const int s_length = ::strlen(s); if (s_length == 0) return *this; else { GStr newstring; newstring.prep_data(length() + s_length); ::memcpy(newstring.chrs(), my_data->chars, length()); 
::memcpy(&(newstring.chrs())[length()], s, s_length); return newstring; } } GStr GStr::operator+(const int i) const { char buf[20]; sprintf(buf, "%d", i); const int s_length = ::strlen(buf); GStr newstring; newstring.prep_data(length() + s_length); ::memcpy(newstring.chrs(), my_data->chars, length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } GStr GStr::operator+(const char c) const { char buf[4]; sprintf(buf, "%c", c); const int s_length = ::strlen(buf); GStr newstring; newstring.prep_data(length() + s_length); ::memcpy(newstring.chrs(), my_data->chars, length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } GStr GStr::operator+(const double f) const { char buf[30]; sprintf(buf, "%f", f); const int s_length = ::strlen(buf); GStr newstring; newstring.prep_data(length() + s_length); ::memcpy(newstring.chrs(), my_data->chars, length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } //========================================= bool GStr::is_space() const { if (my_data == &null_data) return false; for (const char *p = my_data->chars; *p; p++) if (!isspace(*p)) return false; return true; } //========================================= GStr GStr::substr(int idx, int len) const { // A negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); else if (idx>=length()) { len=0; idx=length(); } if (len) { // A length of -1 specifies the rest of the string. 
if (len < 0 || len>length()-idx) len = length() - idx; if (idx<0 || idx>=length() || len<0 ) invalid_args_error("substr()"); } GStr newstring; if (len) { newstring.prep_data(len); ::memcpy(newstring.chrs(), &my_data->chars[idx], len); } return newstring; } GStr& GStr::reverse() { make_unique(); int l=0; int r=my_data->length-1; char c; while (lchars[l]; my_data->chars[l]=my_data->chars[r]; my_data->chars[r]=c; l++;r--; } return *this; } //transform: any character from 'from' is replaced with a coresponding //char from 'to' GStr& GStr::tr(const char *rfrom, const char* rto) { if (length() == 0 || rfrom==NULL || strlen(rfrom)==0) return *this; unsigned int l=strlen(rfrom); if (rto!=NULL && rto[0]==0) rto=NULL; if (rto!=NULL && strlen(rto)!=l) invalid_args_error("tr()"); make_unique(); //edit operation ahead if (rto==NULL) { //delete all characters Data *data = new_data(length()); char* s = my_data->chars; char* p=NULL; char* dest = data->chars; do { if ((p=strpbrk(s,rfrom))!=NULL) { memcpy(dest,s,p-s); dest+=p-s; s=p+1; } else { strcpy(dest, s); dest+=strlen(s); } } while (p!=NULL); (*dest)='\0'; data->length=strlen(data->chars); replace_data(data); } else { //char substitution case - easier! 
const char* p=NULL; for (int i=0; ichars[i]))!=NULL) my_data->chars[i]=rto[p-rfrom]; } } return *this; } // search and replace all the occurences of a string with another string // or just remove the given string (if replacement is NULL) GStr& GStr::replace(const char *rfrom, const char* rto) { if (length() == 0 || rfrom==NULL || strlen(rfrom)==0) return *this; unsigned int l=strlen(rfrom); unsigned int tl= (rto==NULL)?0:strlen(rto); make_unique(); //edit operation ahead char* p; char* dest; char* newdest=NULL; char* s = my_data->chars; if (tl!=l) { //reallocation if (tl>l) { //possible enlargement GMALLOC(newdest, length()*(tl-l+1)+1); } else {//delete or replace with a shorter string GMALLOC(newdest, length() + 1); } dest=newdest; if (tl==0) {//deletion while ((p=strstr(s,rfrom))!=NULL) { //rfrom found at position p memcpy(dest,s,p-s); dest+=p-s; s+=p-s+l; //s positioned in string after rfrom } //no more occurences, copy the remaining string strcpy(dest, s); } else { //replace with another string while ((p=strstr(s,rfrom))!=NULL) { memcpy(dest,s,p-s); //copy up rto the match dest+=p-s; memcpy(dest,rto,tl); //put the replacement string dest+=tl; s+=p-s+l; } //not found any more, copy rto end of string strcpy(dest, s); } Data* data=new_data(newdest); replace_data(data); GFREE(newdest); } else { //inplace editing: no need rto reallocate while ((p=strstr(s,rfrom))!=NULL) { memcpy(p,rto,l); s+=p-s+l; } } return *this; } GStr& GStr::cut(int idx, int len) { if (len == 0) return *this; make_unique(); //edit operation ahead // A negative idx specifies an idx from the right of the string, // so the left part will be cut out if (idx < 0) idx += length(); // A length of -1 specifies the rest of the string. 
if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("cut()"); Data *data = new_data(length() - len); if (idx > 0) ::memcpy(data->chars, my_data->chars, idx); ::strcpy(&data->chars[idx], &my_data->chars[idx+len]); replace_data(data); return *this; } //========================================= GStr& GStr::paste(const GStr& s, int idx, int len) { // A negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); make_unique(); //edit operation ahead // A length of -1 specifies the rest of the string. if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("replace()"); if (len == s.length() && my_data->ref_count == 1) ::memcpy(&chrs()[idx], s.chars(), len); else { Data *data = new_data(length() - len + s.length()); if (idx > 0) ::memcpy(data->chars, my_data->chars, idx); if (s.length() > 0) ::memcpy(&data->chars[idx], s.chars(), s.length()); ::strcpy(&data->chars[idx+s.length()], &my_data->chars[idx+len]); replace_data(data); } return *this; } //========================================= GStr& GStr::paste(const char *s, int idx, int len) { // A negative idx specifies an idx from the right of the string. make_unique(); //edit operation ahead if (idx < 0) idx += length(); // A length of -1 specifies the rest of the string. 
if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("replace()"); const int s_length = ::strlen(s); if (len == s_length && my_data->ref_count == 1) ::memcpy(&chrs()[idx], s, len); else { Data *data = new_data(length() - len + s_length); if (idx > 0) ::memcpy(data->chars, my_data->chars, idx); if (s_length > 0) ::memcpy(&data->chars[idx], s, s_length); ::strcpy(&data->chars[idx+s_length], &my_data->chars[idx+len]); replace_data(data); } return *this; } //========================================= GStr& GStr::insert(const GStr& s, int idx) { make_unique(); //edit operation ahead // A negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("insert()"); if (s.length() > 0) { Data *data = new_data(length() + s.length()); if (idx > 0) ::memcpy(data->chars, my_data->chars, idx); ::memcpy(&data->chars[idx], s.chars(), s.length()); ::strcpy(&data->chars[idx+s.length()], &my_data->chars[idx]); replace_data(data); } return *this; } //========================================= GStr& GStr::insert(const char *s, int idx) { // A negative idx specifies an idx from the right of the string. 
make_unique(); //edit operation ahead if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("insert()"); const int s_length = ::strlen(s); if (s_length > 0) { Data *data = new_data(length() + s_length); if (idx > 0) ::memcpy(data->chars, my_data->chars, idx); ::memcpy(&data->chars[idx], s, s_length); ::strcpy(&data->chars[idx+s_length], &my_data->chars[idx]); replace_data(data); } return *this; } //========================================= GStr& GStr::append(char c) { make_unique(); //edit operation ahead uint newlen=my_data->length+1; if (my_data->cap==0) { prep_data(1, 6); my_data->chars[0]=c; return *this; } if (newlen>my_data->cap) { //not enough room to append this char GREALLOC(my_data, sizeof(Data)+newlen); my_data->cap=newlen; } my_data->chars[my_data->length]=c; my_data->length++; my_data->chars[my_data->length]='\0'; return *this; } GStr& GStr::append(const char* s) { make_unique(); //edit operation ahead uint len=::strlen(s); uint newlen=len+my_data->length; if (newlen<=my_data->length) return *this; if (my_data->length==0 && my_data->capchars, s, len); return *this; } if (newlen>=my_data->cap) { //not enough room to append these chars GREALLOC(my_data, sizeof(Data)+newlen+1); my_data->cap=newlen+1; } ::memcpy(my_data->chars+my_data->length, s, len); my_data->length=newlen; my_data->chars[newlen]='\0'; return *this; } GStr& GStr::appendQuoted(const char* s, char q, bool onlyIfSpaced) { if (onlyIfSpaced) { if (strpbrk(s, "\t ")==NULL) return this->append(s); } char qend=q; if (q=='[' || q=='{' || q=='<') qend=q+2; else if (q=='(') qend=')'; this->append(q); this->append(s); this->append(qend); return *this; } GStr& GStr::append(const char* s, int len) { make_unique(); //edit operation ahead //uint len=::strlen(s); uint newlen=len+my_data->length; if (newlen<=my_data->length) return *this; if (my_data->length==0 && my_data->capchars, s, len); return *this; } if (newlen>=my_data->cap) { //not enough room to append these chars 
GREALLOC(my_data, sizeof(Data)+newlen+1); my_data->cap=newlen+1; } //strncpy(my_data->chars+my_data->length, s, len); newlen=my_data->length; for (int i=0;s[i]!='\0' && ichars[my_data->length+i]=s[i]; ++newlen; } my_data->length=newlen; my_data->chars[newlen]='\0'; return *this; } GStr& GStr::appendmem(const char* m, int len) { if (len<=0) return *this; make_unique(); //edit operation ahead uint newlen=len+my_data->length; //if (newlength<=my_data->length) return *this; if (my_data->length==0) { prep_data(len); ::memcpy(my_data->chars, m, len); return *this; } //faster solution with realloc if (newlen>=my_data->cap) { //not enough room to append these chars GREALLOC(my_data, sizeof(Data)+newlen+1); my_data->cap=newlen+1; } ::memcpy(my_data->chars + my_data->length, m, len); my_data->length=newlen; my_data->chars[newlen]='\0'; return *this; } GStr& GStr::append(const GStr& s) { return appendmem(s.chars(), s.length()); } GStr& GStr::upper() { make_unique(); //edit operation ahead for (char *p = chrs(); *p; p++) *p = (char) toupper(*p); return *this; } //========================================= GStr& GStr::lower() { make_unique(); for (char *p = chrs(); *p; p++) *p = (char) tolower(*p); return *this; } //========================================= int GStr::index(const char *s, int start_index) const { // A negative index specifies an index from the right of the string. if (strlen(s)>(size_t)length()) return -1; if (start_index < 0) start_index += length(); if (start_index < 0 || start_index >= length()) invalid_index_error("index()"); const char* idx = strstr(&my_data->chars[start_index], s); if (!idx) return -1; else return idx - my_data->chars; } //========================================= int GStr::index(char c, int start_index) const { // A negative index specifies an index from the right of the string. 
if (length()==0) return -1; if (start_index < 0) start_index += length(); if (start_index < 0 || start_index >= length()) invalid_index_error("index()"); if (c == '\0') return -1; const char *idx=(char *) ::memchr(&my_data->chars[start_index], c, length()-start_index); if (idx==NULL) return -1; else return idx - my_data->chars; } int GStr::rindex(char c, int end_index) const { if (c == 0 || length()==0 || end_index>=length()) return -1; if (end_index<0) end_index=my_data->length-1; for (int i=end_index;i>=0;i--) { if (my_data->chars[i]==c) return i; } return -1; } int GStr::rindex(const char* str, int end_index) const { if (str==NULL || *str == '\0' || length()==0 || end_index>=length()) return -1; int slen=strlen(str); if (end_index<0) end_index=my_data->length-1; //end_index is the index of the right-side boundary //the scanning starts at the end if (end_index>=0 && end_index=0;i--) { if (memcmp((void*)(my_data->chars+i),(void*)str, slen)==0) return i; } return -1; } GStr GStr::split(const char* delim) { /* splits "this" in two parts, at the first (left) encounter of delim: 1st would stay in "this", 2nd part will be returned as a new string! */ GStr result; int i=index(delim); if (i>=0){ result=substr(i+strlen(delim)); cut(i); return result; } return result; } GStr GStr::split(char c) { /* splits "this" in two parts, at the first (left) encounter of delim: 1st would stay in "this", 2nd part will be returned as a new string! 
*/ GStr result; int i=index(c); if (i>=0){ result=substr(i+1); cut(i); return result; } return result; } GStr GStr::splitr(const char* delim) { GStr result; int i=rindex(delim); if (i>=0){ result=substr(i+strlen(delim)); cut(i); return result; } return result; } GStr GStr::splitr(char c) { GStr result; int i=rindex(c); if (i>=0){ result=substr(i+1); cut(i); return result; } return result; } void GStr::startTokenize(const char* delimiter, enTokenizeMode tokenizemode) { GFREE(fTokenDelimiter); if (delimiter) { GMALLOC(fTokenDelimiter,strlen(delimiter)+1); strcpy(fTokenDelimiter, delimiter); } fLastTokenStart=0; fTokenizeMode=tokenizemode; } bool GStr::nextToken(GStr& token) { if (fTokenDelimiter==NULL) { GError("GStr:: no token delimiter; use StartTokenize first\n"); } if (fLastTokenStart>=length()) {//no more GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } int dlen=strlen(fTokenDelimiter); char* delpos=NULL; //delimiter position int tlen=0; if (fTokenizeMode==tkFullString) { //exact string as a delimiter delpos=(char*)strstr(my_data->chars+fLastTokenStart,fTokenDelimiter); if (delpos==NULL) delpos=(char*)(my_data->chars+length()); //empty records may be returned if (my_data->chars+fLastTokenStart == delpos) { //empty token fLastTokenStart=(delpos-my_data->chars)+dlen; token=""; return true; } else { tlen=delpos-(my_data->chars+fLastTokenStart); token.prep_data(tlen); ::memcpy(token.chrs(), &my_data->chars[fLastTokenStart], tlen); fLastTokenStart=(delpos-my_data->chars)+dlen; return true; } } else { //tkCharSet - any character is a delimiter //empty records are never returned ! 
if (fLastTokenStart==0) {//skip any starting delimiters delpos=(char*)my_data->chars; while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL) delpos++; if (*delpos!='\0') fLastTokenStart = delpos-my_data->chars; else { //only delimiters here,no tokens GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } } //now fLastTokenStart is on a non-delimiter char //GMessage("String at fLastTokenStart=%d is %s\n", fLastTokenStart, delpos); char* token_end=NULL; delpos=(char*)strpbrk(my_data->chars+fLastTokenStart,fTokenDelimiter); if (delpos==NULL) delpos=(char*)(my_data->chars+length()); token_end=delpos-1; while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL) delpos++; //skip any other delimiters in the set! //now we know that delpos is on the beginning of next token tlen=(token_end-my_data->chars)-fLastTokenStart+1; if (tlen==0) { GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } token.prep_data(tlen); ::memcpy(token.chrs(), &my_data->chars[fLastTokenStart], tlen); fLastTokenStart=delpos-my_data->chars; return true; } //return true; } size_t GStr::read(FILE* stream, const char* delimiter, size_t bufsize) { //read up to (and including) the given delimiter string //if delimiter is NULL or zero length, it will read the whole file if (readbuf==NULL) { GMALLOC(readbuf, bufsize); readbufsize=bufsize; } else if (bufsize!=readbufsize) { GFREE(readbuf); if (bufsize>0) { GMALLOC(readbuf, bufsize); } readbufsize=bufsize; } if (bufsize==0) { prep_data(0); return 0; //clear the string and free the buffer } size_t numread; size_t acc_len=0; //accumulated length int dlen=0; if (delimiter!=NULL && delimiter[0]!=0) dlen=strlen(delimiter); void* p=NULL; Data* data = &null_data; do { numread=fread(readbuf, 1, bufsize, stream); if (numread) { if (dlen>0) p=Gmemscan(readbuf, bufsize, (void*) delimiter, dlen); if (p!=NULL) {//found the delimiter //position the stream after it int l = (char*)p-(char*)readbuf; fseek(stream, l+dlen-numread, SEEK_CUR); 
numread=l+dlen; } else {//not found, go back if not eof if (numread==bufsize) { if (dlen>0) { fseek(stream, -dlen, SEEK_CUR); //check if this works! numread-=dlen; } } } if (data==&null_data) { data=new_data(numread); ::memcpy(data->chars, readbuf, numread); acc_len+=numread; } else { GREALLOC(data, sizeof(Data)+acc_len+numread); memcpy(&data->chars[acc_len], readbuf, numread); acc_len+=numread; data->length=acc_len; data->chars[acc_len]='\0'; } } //if something read } while (p==NULL && numread!=0); replace_data(data); return acc_len; } int GStr::asInt(int base /*=10 */) { return strtol(text(), NULL, base); } bool GStr::asInt(int& r, int base) { errno=0; char*endptr; long val=strtol(text(), &endptr, base); if (errno!=0) return false; if (endptr == text()) return false; /* If we got here, strtol() successfully parsed a number */ r=val; return true; } double GStr::asReal() { return strtod(text(), NULL); } bool GStr::asReal(double& r) { errno=0; char* endptr; double val=strtod(text(), &endptr); if (errno!=0) return false; if (endptr == text()) return false; //no digits to parse r=val; return true; } int GStr::peelInt() const { if (is_empty()) return 0; char buf[24]; bool started=false; int j=0; int i; for (i=0;ichars[i])) j++; //set coord else break; //finished } else if (isdigit(my_data->chars[i])) { j++; started=true; } } if (j>0) { strncpy(buf, &my_data->chars[i-j], j); buf[j]='\0'; return strtol(buf, NULL, 10); } return 0; } int GStr::peelIntR() const { if (is_empty()) return 0; char buf[24]; bool started=false; int j=0; int i; for (i=length()-1;i>=0;i--) { if (started) { if (isdigit(my_data->chars[i])) j++; //set length else break; //finished } else if (isdigit(my_data->chars[i])) { j++; started=true; } } if (j>0) { strncpy(buf, &my_data->chars[i+1], j); buf[j]='\0'; return strtol(buf, NULL, 10); } return 0; } GStr GStr::to(char c) { //return the first part up to first occurence of c int i=index(c); if (i>=0) return substr(0,i); else return (*this); } //or whole 
string if c not found GStr GStr::from(char c) { //same as to, but starting from the right side int i=rindex(c); if (i>=0) return substr(i+1); else return (*this); } int GStr::count(char c){ //return the number of occurences of char c within the string int result=0; for (int i=0;ichars[i]==c) result++; return result; } //========================================= void GStr::invalid_args_error(const char *fname) { GError("GStr:: %s - invalid arguments\n", fname); } //**************************************************************************** void GStr::invalid_index_error(const char *fname) { GError("GStr:: %s - invalid index\n", fname); } //**************************************************************************** libgff-2.0.0/src/TestGFFParse.cpp000066400000000000000000000015301367741004700165020ustar00rootroot00000000000000#include #include #include #include "gff.h" int main(int argc, char* argv[]) { if (argc == 1 or argc > 2) { std::cerr << "Usage: TestGFFParse input\n"; std::exit(1); } GffReader reader(argv[1], true, false); reader.readAll(true); std::cerr << "had count of " << reader.gflst.Count() << "\n"; size_t nfeat = reader.gflst.Count(); for (size_t i=0; i < nfeat; ++i) { GffObj* f = reader.gflst[i]; if (f->isTranscript()) { std::cout << f->getID() << '\t' << f->getGeneID() << '\t'; if (f->attrs) { for (size_t j=0; j < f->attrs->Count(); ++j) { std::cout << f->getAttrName(j) << "\t" << f->getAttrValue(j) << "\t"; } } std::cout << "\n"; } } std::exit(0); } libgff-2.0.0/src/codons.cpp000066400000000000000000000113451367741004700155370ustar00rootroot00000000000000#include "codons.h" static char codonTable[32768]; //32K table for fasta codon decoding // codons are encoded as triplets of 5-bit-encoded nucleotides // (so any codon can be encoded/decoded as a unique 15-bit value) static char codonData[]={ //long list of 3+1 characters (codon+translation) 'A','A','A','K', 'A','A','C','N', 'A','A','G','K', 'A','A','R','K', 'A','A','T','N', 'A','A','Y','N', 
'A','C','A','T', 'A','C','B','T', 'A','C','C','T', 'A','C','D','T', 'A','C','G','T', 'A','C','H','T', 'A','C','K','T', 'A','C','M','T', 'A','C','N','T', 'A','C','R','T', 'A','C','S','T', 'A','C','T','T', 'A','C','V','T', 'A','C','W','T', 'A','C','Y','T', 'A','G','A','R', 'A','G','C','S', 'A','G','G','R', 'A','G','R','R', 'A','G','T','S', 'A','G','Y','S', 'A','T','A','I', 'A','T','C','I', 'A','T','G','M', 'A','T','H','I', 'A','T','M','I', 'A','T','T','I', 'A','T','W','I', 'A','T','Y','I', 'C','A','A','Q', 'C','A','C','H', 'C','A','G','Q', 'C','A','R','Q', 'C','A','T','H', 'C','A','Y','H', 'C','C','A','P', 'C','C','B','P', 'C','C','C','P', 'C','C','D','P', 'C','C','G','P', 'C','C','H','P', 'C','C','K','P', 'C','C','M','P', 'C','C','N','P', 'C','C','R','P', 'C','C','S','P', 'C','C','T','P', 'C','C','V','P', 'C','C','W','P', 'C','C','Y','P', 'C','G','A','R', 'C','G','B','R', 'C','G','C','R', 'C','G','D','R', 'C','G','G','R', 'C','G','H','R', 'C','G','K','R', 'C','G','M','R', 'C','G','N','R', 'C','G','R','R', 'C','G','S','R', 'C','G','T','R', 'C','G','V','R', 'C','G','W','R', 'C','G','Y','R', 'C','T','A','L', 'C','T','B','L', 'C','T','C','L', 'C','T','D','L', 'C','T','G','L', 'C','T','H','L', 'C','T','K','L', 'C','T','M','L', 'C','T','N','L', 'C','T','R','L', 'C','T','S','L', 'C','T','T','L', 'C','T','V','L', 'C','T','W','L', 'C','T','Y','L', 'G','A','A','E', 'G','A','C','D', 'G','A','G','E', 'G','A','R','E', 'G','A','T','D', 'G','A','Y','D', 'G','C','A','A', 'G','C','B','A', 'G','C','C','A', 'G','C','D','A', 'G','C','G','A', 'G','C','H','A', 'G','C','K','A', 'G','C','M','A', 'G','C','N','A', 'G','C','R','A', 'G','C','S','A', 'G','C','T','A', 'G','C','V','A', 'G','C','W','A', 'G','C','Y','A', 'G','G','A','G', 'G','G','B','G', 'G','G','C','G', 'G','G','D','G', 'G','G','G','G', 'G','G','H','G', 'G','G','K','G', 'G','G','M','G', 'G','G','N','G', 'G','G','R','G', 'G','G','S','G', 'G','G','T','G', 'G','G','V','G', 'G','G','W','G', 'G','G','Y','G', 'G','T','A','V', 
'G','T','B','V', 'G','T','C','V', 'G','T','D','V', 'G','T','G','V', 'G','T','H','V', 'G','T','K','V', 'G','T','M','V', 'G','T','N','V', 'G','T','R','V', 'G','T','S','V', 'G','T','T','V', 'G','T','V','V', 'G','T','W','V', 'G','T','Y','V', 'M','G','A','R', 'M','G','G','R', 'M','G','R','R', 'N','N','N','X', 'R','A','Y','B', 'S','A','R','Z', 'T','A','A','.', 'T','A','C','Y', 'T','A','G','.', 'T','A','R','.', 'T','A','T','Y', 'T','A','Y','Y', 'T','C','A','S', 'T','C','B','S', 'T','C','C','S', 'T','C','D','S', 'T','C','G','S', 'T','C','H','S', 'T','C','K','S', 'T','C','M','S', 'T','C','N','S', 'T','C','R','S', 'T','C','S','S', 'T','C','T','S', 'T','C','V','S', 'T','C','W','S', 'T','C','Y','S', 'T','G','A','.', 'T','G','C','C', 'T','G','G','W', 'T','G','T','C', 'T','G','Y','C', 'T','R','A','.', 'T','T','A','L', 'T','T','C','F', 'T','T','G','L', 'T','T','R','L', 'T','T','T','F', 'T','T','Y','F', 'X','X','X','X', 'Y','T','A','L', 'Y','T','G','L', 'Y','T','R','L' }; static bool isCodonTableReady=codonTableInit(); unsigned short packCodon(char n1, char n2, char n3) { //assumes they are uppercase already! 
byte b1=n1-'A'; byte b2=n2-'A'; byte b3=n3-'A'; b1 |= (b2 << 5); b2 = (b2 >> 3) | (b3 << 2); return ( ((unsigned short)b2) << 8) + b1; } bool codonTableInit() { memset((void*)codonTable, 'X', 32768); int cdsize=sizeof(codonData); for (int i=0;i const char* IUPAC_2BIT ="AACCTTGGTTAAAAAACCCCGGAAAAAACCAAAAAA"; const char* IUPAC_2BITN ="001133223300000011112200000011000000"; const char* IUPAC_DEFS ="AaCcTtGgUuMmRrWwSsYyKkVvHhDdBbNnXx-*"; const char* IUPAC_COMP ="TtGgAaCcAaKkYyWwSsRrMmBbDdHhVvNnXx-*"; #define A_2BIT 0 // 00 #define C_2BIT 1 // 01 #define G_2BIT 2 // 10 #define T_2BIT 3 // 11 static byte ntCompTable[256]; static byte nt2bit[256]; //maps any character to a 2bit base value (with N = A) static char v_2bit2nt[4] = {'A','C','G','T'}; //---------------------- static bool gdna_Ready=gDnaInit(); //---------------------- byte gdna2bit(char* &nt, int n) { // Pack n bases into a byte (n can be 1..4) byte out = 0; while (n && *nt) { n--; out <<= 2; out += nt2bit[(int)*nt]; nt++; } #ifdef GDEBUG if (n) { GError("Error: attempt to read 6-mer beyond the end of the string!\n"); } #endif return out; } char ntComplement(char c) { return ntCompTable[(int)c]; } char g2bit2base(byte v2bit) { return v_2bit2nt[v2bit & 0x03 ]; } //in place reverse complement of nucleotide (sub)sequence char* reverseComplement(char* seq, int slen) { if (slen==0) slen=strlen(seq); //reverseChars(seq,len); int l=0; int r=slen-1; char c; while (l5MB mouse intron const int GFF_MIN_INTRON = 4; //for mergeCloseExons option //bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings() int gff_fid_mRNA=0; //mRNA (has CDS) int gff_fid_transcript=1; // generic "transcript" feature int gff_fid_exon=2; // generic "exon"-like feature (exon,CDS,UTR,start/stop codon) int gff_fid_CDS=3; // CDS feature (CDS, start/stop codon) const char* exonTypes[]={ "None", "StartCodon", "StopCodon", "CDS", "UTR", "CDS+UTR", "exon" }; const GffScore GFFSCORE_NONE; //const uint gfo_flag_LEVEL_MSK = 
0x00FF0000; //const byte gfo_flagShift_LEVEL = 16; void gffnames_ref(GffNames* &n) { if (n==NULL) n=new GffNames(); n->numrefs++; } void gffnames_unref(GffNames* &n) { if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n"); n->numrefs--; if (n->numrefs==0) { delete n; n=NULL; } } const int CLASSCODE_OVL_RANK = 15; int classcode_rank(char c) { switch (c) { case '=': return 0; //intron chain match or full exon chain match if strict matching is enabled case '~': return 1; //intron chain match when strict matching is enabled case 'c': return 2; //containment, perfect partial match (transfrag < reference) case 'k': return 6; // reverse containment (reference < transfrag) case 'm': return 6; // full span overlap with all reference introns either matching or retained case 'n': return 6; // partial overlap transfrag with at least one intron retention case 'j': return 6; // multi-exon transfrag with at least one junction match case 'e': return 12; // single exon transfrag partially overlapping an intron of reference (possible pre-mRNA fragment) case 'o': return 14; // other generic exon overlap //**** >15 = no-overlaps (not on the same strand) from here on ***** case 's': return 16; //"shadow" - an intron overlaps with a ref intron on the opposite strand (wrong strand mapping?) case 'x': return 18; // generic overlap on opposite strand (usually wrong strand mapping) case 'i': return 20; // intra-intron (transfrag fully contained within a reference intron) case 'y': return 30; // no exon overlap: ref exons fall within transfrag introns! 
case 'p': return 90; //polymerase run case 'r': return 92; //repeats case 'u': return 94; //intergenic case 0 : return 100; default: return 96; } } const char* strExonType(char xtype) { static const char* extbl[7]={"None", "start_codon", "stop_codon", "CDS", "UTR", "CDS_UTR", "exon"}; if (xtype>0 && xtype<7) return extbl[(int)xtype]; else return "NULL"; } int gfo_cmpByLoc(const pointer p1, const pointer p2) { GffObj& g1=*((GffObj*)p1); GffObj& g2=*((GffObj*)p2); if (g1.gseq_id==g2.gseq_id) { if (g1.start!=g2.start) return (int)(g1.start-g2.start); else if (g1.getLevel()!=g2.getLevel()) return (int)(g1.getLevel()-g2.getLevel()); else if (g1.end!=g2.end) return (int)(g1.end-g2.end); else return strcmp(g1.getID(), g2.getID()); } else //return (int)(g1.gseq_id-g2.gseq_id); // input order ! return strcmp(g1.getGSeqName(), g2.getGSeqName()); //lexicographic ! } //comparator for ordering by reference sequence (chromosome) index int gfo_cmpRefByID(const pointer p1, const pointer p2) { GffObj& g1=*((GffObj*)p1); GffObj& g2=*((GffObj*)p2); if (g1.gseq_id==g2.gseq_id) { if (g1.start!=g2.start) return (int)(g1.start-g2.start); else if (g1.getLevel()!=g2.getLevel()) return (int)(g1.getLevel()-g2.getLevel()); else if (g1.end!=g2.end) return (int)(g1.end-g2.end); else return strcmp(g1.getID(), g2.getID()); } else return (g1.gseq_id-g2.gseq_id); // sort refs by their id# order } char* GffLine::extractGFFAttr(char* & infostr, const char* oline, const char* attr, bool caseStrict, bool enforce_GTF2, int* rlen, bool deleteAttr) { //parse a key attribute and remove it from the info string //(only works for attributes that have values following them after ' ' or '=') static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required for GTF) at line:\n%s\n"; int attrlen=strlen(attr); char cend=attr[attrlen-1]; //char* pos = (caseStrict) ? 
strstr(info, attr) : strifind(info, attr); //must make sure attr is not found in quoted text char* pos=infostr; char prevch=0; bool in_str=false; bool notfound=true; int (*strcmpfn)(const char*, const char*, int) = caseStrict ? Gstrcmp : Gstricmp; while (notfound && *pos) { char ch=*pos; if (ch=='"') { in_str=!in_str; pos++; prevch=ch; continue; } if (!in_str && (prevch==0 || prevch==' ' || prevch == ';') && strcmpfn(attr, pos, attrlen)==0) { //attr match found //check for word boundary on right char* epos=pos+attrlen; if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') { notfound=false; break; } //not a perfect match, move on pos=epos; prevch=*(pos-1); continue; } //not a match or in_str prevch=ch; pos++; } if (notfound) return NULL; char* vp=pos+attrlen; while (*vp==' ') vp++; if (*vp==';' || *vp==0) { GMessage("Warning: cannot parse value of GFF attribute \"%s\" at line:\n%s\n", attr, oline); return NULL; } bool dq_enclosed=false; //value string enclosed by double quotes if (*vp=='"') { dq_enclosed=true; vp++; } if (enforce_GTF2 && !dq_enclosed) GError(GTF2_ERR, attr, oline); char* vend=vp; if (dq_enclosed) { while (*vend!='"' && *vend!=';' && *vend!=0) vend++; } else { while (*vend!=';' && *vend!=0) vend++; } if (enforce_GTF2 && *vend!='"') GError(GTF2_ERR, attr, oline); char *r=Gstrdup(vp, vend-1); if (rlen) *rlen = vend-vp; if (deleteAttr) {//-- remove this attribute from infostr while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++; if (*vend==0) vend--; for (char *src=vend, *dest=pos;;src++,dest++) { *dest=*src; //shift the rest of infostr (copy over) if (*src==0) break; } } return r; } BEDLine::BEDLine(GffReader* reader, const char* l): skip(true), dupline(NULL), line(NULL), llen(0), gseqname(NULL), fstart(0), fend(0), strand(0), ID(NULL), info(NULL), cds_start(0), cds_end(0), cds_phase(0), exons(1) { if (reader==NULL || l==NULL) return; llen=strlen(l); GMALLOC(line,llen+1); memcpy(line, l, llen+1); GMALLOC(dupline, llen+1); memcpy(dupline, 
l, llen+1); char* t[14]; int i=0; int tidx=1; t[0]=line; if (startsWith(line, "browser ") || startsWith(line, "track ")) return; while (line[i]!=0) { if (line[i]=='\t') { line[i]=0; t[tidx]=line+i+1; tidx++; //our custom BED-13+ format, with GFF3 attributes in 13th column if (tidx>12) { info=t[12]; break; } } i++; } /* if (tidx<6) { // require BED-6+ lines GMessage("Warning: 6+ BED columns expected, instead found:\n%s\n", l); return; } */ gseqname=t[0]; char* p=t[1]; if (!parseUInt(p,fstart)) { GMessage("Warning: invalid BED start coordinate at line:\n%s\n",l); return; } ++fstart; //BED start is 0 based p=t[2]; if (!parseUInt(p,fend)) { GMessage("Warning: invalid BED end coordinate at line:\n%s\n",l); return; } if (fend5) { strand=*t[5]; if (strand!='-' && strand !='.' && strand !='+') { GMessage("Warning: unrecognized BED strand at line:\n%s\n",l); return; } } else strand='.'; //if (tidx>12) ID=t[12]; // else ID=t[3]; ID=t[3]; //now parse the exons, if any if (tidx>11) { int numexons=0; p=t[9]; if (!parseInt(p, numexons) || numexons<=0) { GMessage("Warning: invalid BED block count at line:\n%s\n",l); return; } char** blen; char** bstart; GMALLOC(blen, numexons * sizeof(char*)); GMALLOC(bstart, numexons * sizeof(char*)); i=0; int b=1; blen[0]=t[10]; while (t[10][i]!=0 && b<=numexons) { if (t[10][i]==',') { t[10][i]=0; if (b0 at line:\n%s\n",exonstart, l); return; } exonstart+=fstart; uint exonend=exonstart+exonlen-1; if ((uint)exonstart>fend || exonend>fend) { GMessage("Warning: BED exon %d-%d is outside record boundary at line:\n%s\n",exonstart,exonend, l); return; } ex.start=exonstart;ex.end=exonend; exons.Add(ex); } GFREE(blen); GFREE(bstart); } else { //take it as single-exon transcript GSeg v(fstart, fend); exons.Add(v); } if (info!=NULL) { char* cdstr=GffLine::extractGFFAttr(info, dupline, "CDS="); if (cdstr) { char* p=strchr(cdstr, ':'); if (p!=NULL) { *p='\0'; ++p; } if (strToUInt(cdstr, cds_start) && cds_start>=fstart-1) { ++cds_start; if (!strToUInt(p, 
cds_end) || cds_end>fend) { GMessage("Warning: invalid CDS (%d-%d) discarded for line:\n%s\n", cds_start, cds_end, dupline); cds_start=0; cds_end=0; //invalid CDS coordinates } } char* cdstr_phase=NULL; if (cds_start>0 && (cdstr_phase=GffLine::extractGFFAttr(info, dupline, "CDSphase="))!=NULL) { cds_phase=cdstr_phase[0]; GFREE(cdstr_phase); } GFREE(cdstr); } } if (cds_start==0 && cds_end==0 && tidx>7) { //check if columns 7,8 can be reasonably assumed to be CDS start-end coordinates if (strToUInt(t[6], cds_start) && strToUInt(t[7], cds_end) && cds_end>cds_start) { if (cds_start>=fstart-1 && cds_end<=fend) cds_start++; else { cds_start=0; cds_end=0; } } } skip=false; } bool GffLine::parseSegmentList(GVec& segs, char* str) { bool segs_valid=false; char* p=strchr(str, '-'); if (p!=NULL && p>str) { GDynArray ss; strsplit(str, ss, ','); GSeg seg; segs_valid=true; for (uint i=0;i(int)fend){ segs_valid=false; break; } if (!strToInt(p, xend) || xend<(int)fstart || xend>(int)fend) { segs_valid=false; break; } if (xstart>xend) { seg.start=(uint)xend;seg.end=(uint)xstart; } else { seg.start=(uint)xstart;seg.end=(uint)xend; } segs.Add(seg); } //parse all CDS segments if (segs_valid) { if (segs.Count()>1) segs.Sort(); } else segs.Clear(); } return segs_valid; } GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len(0), dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL), ftype(NULL), ftype_id(-1), info(NULL), fstart(0), fend(0), //qstart(0), qend(0), qlen(0), score(0), score_decimals(-1), strand(0), flags(0), exontype(exgffNone), phase(0), cds_start(0), cds_end(0), exons(), cdss(), gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(0), ID(NULL) { llen=strlen(l); GMALLOC(line,llen+1); memcpy(line, l, llen+1); GMALLOC(dupline, llen+1); memcpy(dupline, l, llen+1); skipLine=true; //clear only if we make it to the end of this function char* t[9]; int i=0; int tidx=1; t[0]=line; char fnamelc[128]; while (line[i]!=0) { if (line[i]=='\t') { 
line[i]=0; t[tidx]=line+i+1; tidx++; if (tidx>8) break; } i++; } if (tidx<8) { // ignore non-GFF lines return; } gffWarnings=reader->gff_warns; gseqname=t[0]; track=t[1]; ftype=t[2]; info=t[8]; char* p=t[3]; if (!parseUInt(p,fstart)) { //chromosome_band entries in Flybase GMessage("Warning: invalid start coordinate at line:\n%s\n",l); return; } p=t[4]; if (!parseUInt(p,fend)) { GMessage("Warning: invalid end coordinate at line:\n%s\n",l); return; } if (fendfeats.addName(ftype); } else if (endsWith(fnamelc, "_gene_segment")) { is_transcript=true; is_t_data=true; is_gene_segment=true; } else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) { is_gene=true; is_t_data=true; //because its name will be attached to parented transcripts } char* Parent=NULL; /* Rejecting non-transcript lines early if only transcripts are requested ?! It would be faster to do this here but there are GFF cases when we reject an unusual parent feature here (e.g. protein with CDS children) and then their exon/CDS children show up and get assigned to an implicit parent mRNA The solution is to still load this parent as GffObj for now and BAN it later so its children get dismissed/discarded as well. 
*/ if (reader->ignoreLocus) { if (strcmp(ftype, "locus")==0) return; if (is_transcript || is_gene) { char* locus=NULL; if (reader->is_gff3 || reader->gff_type==0) locus=extractAttr("locus="); else locus=extractAttr("locus"); if (locus!=NULL) { GFREE(locus); } } } char *gtf_tid=NULL; char *gtf_gid=NULL; if (reader->is_gff3 || reader->gff_type==0) { ID=extractAttr("ID=",true); Parent=extractAttr("Parent=",true); if (reader->gff_type==0) { if (ID!=NULL || Parent!=NULL) reader->is_gff3=true; else { //check if it looks like a GTF gtf_tid=extractAttr("transcript_id", true, true); if (gtf_tid==NULL) { gtf_gid=extractAttr("gene_id", true, true); if (gtf_gid==NULL) return; //cannot determine file type yet } reader->is_gtf=true; } } } if (reader->is_gff3) { //parse as GFF3 //if (ID==NULL && Parent==NULL) return; //silently ignore unidentified/unlinked features if (ID!=NULL) { //has ID attr so it's likely to be a parent feature //look for explicit gene name gene_name=getAttrValue("gene_name="); if (gene_name==NULL) { gene_name=getAttrValue("geneName="); if (gene_name==NULL) { gene_name=getAttrValue("gene_sym="); if (gene_name==NULL) { gene_name=getAttrValue("gene="); } } } gene_id=getAttrValue("geneID="); if (gene_id==NULL) { gene_id=getAttrValue("gene_id="); } /* if (is_gene) { //--WARNING: this might be mislabeled (e.g. TAIR: "mRNA_TE_gene") //---special case: keep the Name and ID attributes of the gene feature //if (gene_name==NULL) // gene_name=extractAttr("Name="); if (gene_id==NULL) //the ID is also gene_id in this case gene_id=Gstrdup(ID); //skip=false; //return; //-- we don't care about gene parents.. 
unless it's a mislabeled "gene" feature } //gene feature (probably) */ //--parse exons for TLF char* segstr=extractAttr("exons="); bool exons_valid=false; if (segstr) { exons_valid=parseSegmentList(exons, segstr); char* exoncountstr=extractAttr("exonCount="); if (exoncountstr) { int exoncount=0; if (!strToInt(exoncountstr, exoncount) || exoncount!=(int)exons.Count()) GMessage("Warning: exonCount attribute value doesn't match the exons attribute!\n"); GFREE(exoncountstr); } GFREE(segstr); } if (exons_valid) { bool validCDS=false; segstr=extractAttr("CDS="); if (segstr) { char* p=strchr(segstr, ':'); if (p!=NULL) { // CDS=start:end format *p='\0'; ++p; validCDS=true; if (validCDS && strToUInt(segstr, cds_start) && cds_start>=fstart) { if (!strToUInt(p, cds_end) || cds_end>fend) { validCDS=false; } } if (!validCDS || (int)cds_start<=0 || (int)cds_end<=0) { GMessage("Warning: invalid CDS (%d-%d) discarded for line:\n%s\n", cds_start, cds_end, dupline); cds_start=0; cds_end=0; } } //CDS=start:end format else { //CDS = list of start-end segments, just like the exons validCDS=parseSegmentList(cdss, segstr); if (validCDS && cdss.Count()>0) { if (cds_start==0) cds_start=cdss.First().start; if (cds_end==0) cds_end=cdss.Last().end; } } GFREE(segstr); } if (validCDS) { char* cds_phase=NULL; if ((cds_phase=extractAttr("CDSphase="))!=NULL) { phase=cds_phase[0]; GFREE(cds_phase); } } //CDS found }//has valid exons }// has GFF3 ID if (Parent!=NULL) { //keep Parent attr //parse multiple parents num_parents=1; p=Parent; int last_delim_pos=-1; while (*p!=';' && *p!=0) { if (*p==',' && *(p+1)!=0 && *(p+1)!=';') { num_parents++; last_delim_pos=(p-Parent); } p++; } _parents_len=p-Parent+1; _parents=Parent; GMALLOC(parents, num_parents*sizeof(char*)); parents[0]=_parents; int i=1; if (last_delim_pos>0) { for (p=_parents+1;p<=_parents+last_delim_pos;p++) { if (*p==',') { char* ep=p-1; while (*ep==' ' && ep>_parents) ep--; *(ep+1)=0; //end the string there parents[i]=p+1; i++; } } } } 
//has Parent field //special case for gene_id: for genes, this is the ID if (is_gene && gene_id==NULL && ID!=NULL) { gene_id=Gstrdup(ID); } //parse other potentially useful GFF3 attributes /* if ((p=strstr(info,"Target="))!=NULL) { //has Target attr p+=7; while (*p!=';' && *p!=0 && *p!=' ') p++; if (*p!=' ') { GError("Error parsing target coordinates from GFF line:\n%s\n",l); } if (!parseUInt(p,qstart)) GError("Error parsing target start coordinate from GFF line:\n%s\n",l); if (*p!=' ') { GError("Error parsing next target coordinate from GFF line:\n%s\n",l); } p++; if (!parseUInt(p,qend)) GError("Error parsing target end coordinate from GFF line:\n%s\n",l); } if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr p+=5; if (!parseUInt(p,qstart)) GError("Error parsing target start coordinate from GFF line:\n%s\n",l); if (*p!='-') { GError("Error parsing next target coordinate from GFF line:\n%s\n",l); } p++; if (!parseUInt(p,qend)) GError("Error parsing target end coordinate from GFF line:\n%s\n",l); if (*p=='|' || *p==':') { p++; if (!parseUInt(p,qlen)) GError("Error parsing target length from GFF Qreg|: \n%s\n",l); } }//has Qreg attr if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) { p+=5; if (!parseUInt(p,qlen)) GError("Error parsing target length from GFF Qlen:\n%s\n",l); } */ } //GFF3 else { // ----------------- GTF syntax ------------------ if (reader->transcripts_Only && !is_t_data) { return; //alwasys skip unrecognized non-transcript features in GTF } if (is_gene) { reader->gtf_gene=true; ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true); //Ensemble GTF might lack this gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true); if (ID==NULL) { //no transcript_id -- this should not be valid GTF2 format, but Ensembl (Gencode?) //has being known to add "gene" features with only gene_id in their GTF if (gene_id!=NULL) { //likely a gene feature line (Ensembl!) 
ID=Gstrdup(gene_id); //take over as ID (for defective GTF lacking transcript_id) } } // else if (strcmp(gene_id, ID)==0) //GENCODE v20 gene feature ? } else if (is_transcript) { ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true); //gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID if (ID==NULL) { //something is wrong here, cannot parse the GTF ID GMessage("Warning: invalid GTF record, transcript_id not found:\n%s\n", l); return; } gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true); if (gene_id!=NULL) Parent=Gstrdup(gene_id); reader->gtf_transcript=true; is_gtf_transcript=1; } else { //must be an exon type Parent = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true); gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true); // for GTF this is the only attribute accepted as geneID //old pre-GTF2 formats like Jigsaw's (legacy support) if (Parent==NULL && exontype==exgffExon) { if (startsWith(track,"jigsaw")) { is_cds=true; strcpy(track,"jigsaw"); p=strchr(info,';'); if (p==NULL) { Parent=Gstrdup(info); info=NULL; } else { Parent=Gstrdup(info,p-1); info=p+1; } } } if (Parent==NULL) { //something is wrong here couldn't parse the transcript ID for this feature GMessage("Warning: invalid GTF record, transcript_id not found:\n%s\n", l); return; } } //more GTF attribute parsing if (is_gene && gene_id==NULL && ID!=NULL) gene_id=Gstrdup(ID); gene_name=getAttrValue("gene_name"); if (gene_name==NULL) { gene_name=getAttrValue("gene_sym"); if (gene_name==NULL) { gene_name=getAttrValue("gene"); if (gene_name==NULL) gene_name=getAttrValue("genesymbol"); } } //*** IMPORTANT: prepare GTF for easy parseAttr by adding '=' character after the attribute name // for ALL attributes p=info; bool noed=true; //not edited after the last delim bool nsp=false; //non-space found after last delim while (*p!=0) { if (*p==' ') { if (nsp && noed) { *p='='; noed=false; p++; continue; } 
} else nsp=true; //non-space if (*p==';') { noed=true; nsp=false; } p++; } //-- GTF prepare parents[] if Parent found if (Parent!=NULL) { //GTF transcript_id found as a parent _parents=Parent; num_parents=1; _parents_len=strlen(Parent)+1; GMALLOC(parents, sizeof(char*)); parents[0]=_parents; } } //GTF if (ID==NULL && parents==NULL) { if (gffWarnings) GMessage("Warning: discarding unrecognized feature (no ID or Parent):\n%s\n",dupline); return; //skip } skipLine=false; } //FIXME - this should only be used AFTER finalize() was called, and must have cdss=NULL of course void GffObj::setCDS(uint cd_start, uint cd_end, char phase) { if (cd_startstart) { GMessage("Warning: setCDS() called for %s with an out of bounds CDS start %d!\n", gffID, cd_start); return; } if (cd_end>this->end) { GMessage("Warning: setCDS() called for %s with an out of bounds CDS end %d!\n", gffID, cd_end); return; } this->CDstart=cd_start; this->CDend=cd_end; this->CDphase=phase; isTranscript(true); subftype_id=gff_fid_exon; if (monoFeature()) { if (exons.Count()==0) addExon(this->start, this->end, exgffExon); else exons[0]->exontype=exgffExon; } } void GffObj::setCDS(GffObj* t) { //copy the cdss as well uint cd_start=t->CDstart; uint cd_end=t->CDend; uint phase=t->CDphase; if (cd_startstart) { GMessage("Warning: setCDS() called for %s with an out of bounds CDS start %d!\n", gffID, cd_start); return; } if (cd_end>this->end) { GMessage("Warning: setCDS() called for %s with an out of bounds CDS end %d!\n", gffID, cd_end); return; } this->CDstart=cd_start; this->CDend=cd_end; this->CDphase=phase; isTranscript(true); subftype_id=gff_fid_exon; if (monoFeature()) { if (exons.Count()==0) addExon(this->start, this->end, exgffExon); else exons[0]->exontype=exgffExon; } if (t->cdss!=NULL) { if (this->cdss!=NULL) delete cdss; cdss=new GList(true, true, false); for (int i=0;icdss->Count();i++) { cdss->Add(new GffExon(*(t->cdss->Get(i)))); } } } int GffObj::readExon(GffReader& reader, GffLine& gl) { // -- this 
should only be called before ::finalize()! //should make sure to get the right subftype_id! if (!isTranscript() && gl.exontype>exgffNone) { //subfeature recognized as exon-like, so this should be considered a transcript! isTranscript(true); } if (isTranscript()) { if (subftype_id<0) {//exon_ftype_id=gff_fid_exon; if (gl.exontype>exgffNone) subftype_id=gff_fid_exon; else subftype_id=names->feats.addName(gl.ftype); } //any recognized exon-like segment gets the generic "exon" type (also applies to CDS) if (gl.exontype==exgffNone && !gl.is_transcript) { //extraneous mRNA feature, discard if (reader.gff_warns) GMessage("Warning: discarding unrecognized transcript subfeature '%s' of %s\n", gl.ftype, gffID); return -1; } } else { //non-mRNA parent feature, check this subf type int subf_id=names->feats.addName(gl.ftype); if (subftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added) subftype_id=subf_id; else { if (subftype_id!=subf_id) { if (subftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) { //the existing exon was just a dummy one created by default, discard it? exons.Clear(); covlen=0; subftype_id=subf_id; //allow the new subfeature to completely takeover } else { //multiple subfeatures, prefer those exon-like if (reader.gff_warns) GMessage("Warning: multiple subfeatures (%s and %s) found for %s, discarding ", names->feats.getName(subf_id), names->feats.getName(subftype_id),gffID); if (gl.exontype>exgffNone) { //new feature is an exon, discard previously parsed subfeatures if (reader.gff_warns) GMessage("%s.\n", names->feats.getName(subftype_id)); subftype_id=subf_id; exons.Clear(); covlen=0; } else { //discard new feature if (reader.gff_warns) GMessage("Warning: skipping subfeature %s.\n", names->feats.getName(subf_id)); return -1; //skip this 2nd subfeature type for this parent! 
} } } //incoming subfeature is of different type } //new subfeature type } //non-mRNA parent int eidx=-1; GList* segs=NULL; //either cds or &exons if (gl.is_cds) { if (cdss==NULL) cdss=new GList(true, true, false); segs=cdss; } else { segs=&exons; } eidx=addExon(*segs, gl); if (eidx<0) { GMessage("Warning: addExon() failed for GFF line:\n%s\n",gl.dupline); return eidx; //this should never happen! } if (reader.keep_Attrs) { if (reader.noExonAttrs) { parseAttrs(attrs, gl.info, true); } else { //need all exon-level attributes parseAttrs((*segs)[eidx]->attrs, gl.info, true, gl.is_cds); } } return eidx; } int GffObj::addExon(GList& segs, GffLine& gl, int8_t exontype_override) { int ex_type=(exontype_override!=exgffNone) ? exontype_override : gl.exontype; GffScore exon_score(gl.score, gl.score_decimals); int eidx=addExon(gl.fstart, gl.fend, ex_type, gl.phase, exon_score, &segs); if (&segs==cdss && isGene() && gl.ID!=NULL && eidx>=0) { //special NCBI cases where CDS can be treated as discontiguous features, grouped by their ID //-- used for genes with X_gene_segment features //char* cds_id=Gstrdup(gl.ID); //segs[eidx]->uptr=cds_id; segs[eidx]->uptr=gl.ID; gl.ID=NULL; } return eidx; } int GffObj::exonOverlapIdx(GList& segs, uint s, uint e, int* ovlen, int start_idx) { //return the exons' index for the overlapping OR ADJACENT exon //ovlen, if given, will return the overlap length //if (s>e) Gswap(s,e); for (int i=start_idx;istart>e+1) break; if (s-1>segs[i]->end) continue; //-- overlap/adjacent if we are here: if (ovlen!=NULL) { int ovlend= (segs[i]->end>e) ? e : segs[i]->end; *ovlen= ovlend - ((s>segs[i]->start)? s : segs[i]->start)+1; } return i; } //for each exon *ovlen=0; return -1; } void GffObj::transferCDS(GffExon* cds) { //direct adding of a cds to the cdss pointer, without checking if (cdss==NULL) cdss=new GList(true, true, false); cdss->Add(cds); //now the caller must forget this exon! 
if (CDstart==0 || CDstart>cds->start) CDstart=cds->start; } int GffObj::addExon(uint segstart, uint segend, int8_t exontype, char phase, GffScore exon_score, GList* segs) { if (segstart>segend) { Gswap(segstart, segend); } if (segs==NULL) segs=&exons; if (exontype!=exgffNone) { //check for overlaps between exon/CDS-type segments //addExonSegment(gl.fstart, gl.fend, gl.score, gl.phase, gl.is_cds, exontype_override); int ovlen=0; int oi=-1; while ((oi=exonOverlapIdx(*segs, segstart, segend, &ovlen, oi+1))>=0) { //note: ovlen==0 for adjacent segments if ((*segs)[oi]->exontype>exgffNone && (*segs)[oi]->start<=segstart && (*segs)[oi]->end>=segend) { //existing feature contains this segment, so we do NOT need to add it //-- unless its the annoying NCBI exception: gene with multiple alternate // _gene_segment CDS features! if (!(this->isGene() && exontype==exgffCDS && (*segs)[oi]->exontype==exgffCDS )) return oi; } if (ovlen==0 || !(exontype==exgffCDS && (*segs)[oi]->exontype==exgffCDS)) { //always merge adjacent features //but NEVER merge two overlapping CDS (CDS programmed ribosomal shift aware) int8_t segtype=((*segs)[oi]->exontype==exgffCDS || exontype==exgffCDS) ? 
exgffCDS : exgffExon; //if expanded upward, may overlap the segment(s) above expandSegment(*segs, oi, segstart, segend, segtype); return oi; } } } //exon overlap/adjacent check //new exon/CDS, not merged in a previous one GffExon* enew=new GffExon(segstart, segend, exontype, phase, exon_score.score, exon_score.precision); int eidx=segs->Add(enew); if (eidx<0) { //this would actually be possible if the object is a "Gene" and "exons" are in fact isoforms delete enew; hasErrors(true); return -1; } if (start>segs->First()->start) start=segs->First()->start; if (endLast()->end) end=segs->Last()->end; if (isFinalized() && segs==&exons) { covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1; } return eidx; } void GffObj::expandSegment(GList& segs, int oi, uint segstart, uint segend, int8_t exontype) { //oi is the index of the *first* overlapping segment found that must be enlarged covlen-=segs[oi]->len(); if (segstartstart) segs[oi]->start=segstart; //if (qs && qsqstart) exons[oi]->qstart=qs; if (segend>segs[oi]->end) segs[oi]->end=segend; //if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe; //warning: score cannot be properly adjusted! e.g. 
if it's a p-value it's just going to get worse //if (sc!=0) segs[oi]->score=sc; //covlen+=exons[oi]->len(); //if (exons[oi]->exontype< exontype) -- always true segs[oi]->exontype = exontype; //if (exontype==exgffCDS) exons[oi]->phase=fr; //we must check if any more exons are also overlapping this int ni=oi+1; //next exon index after oi while (nistart<=segend+1) { // next segment overlaps OR adjacent to newly enlarged segment if (segs[ni]->exontype>0 && (segs[ni]->start==segend+1 || segs[ni]->exontype!=exgffCDS || exontype!=exgffCDS)) { if (segs[ni]->startstart) { segs[oi]->start=segs[ni]->start; if (strand=='+') segs[oi]->phase=segs[ni]->phase; } if (segs[ni]->end>segs[oi]->end) { segs[oi]->end=segs[ni]->end; if (strand=='-') segs[oi]->phase=segs[ni]->phase; } segs.Delete(ni); } else ++ni; } //until no more overlapping/adjacent segments found // -- make sure any other related boundaries are updated: if (isFinalized()) { if (&segs==&exons) { start=exons.First()->start; end=exons.Last()->end; //recalculate covlen covlen=0; for (int i=0;ilen(); } } else { if (start>segs.First()->start) start=segs.First()->start; if (endend) end=segs.Last()->end; } } void GffObj::removeExon(int idx) { if (idx<0 || idx>=exons.Count()) return; int segstart=exons[idx]->start; int segend=exons[idx]->end; exons.Delete(idx); if (isFinalized()) { covlen -= (int)(segend-segstart)+1; start=exons.First()->start; end=exons.Last()->end; if (isCDSOnly()) { CDstart=start; CDend=end; } } } void GffObj::removeExon(GffExon* p) { for (int idx=0;idxstart; int segend=exons[idx]->end; exons.Delete(idx); covlen -= (int)(segend-segstart)+1; if (exons.Count() > 0) { start=exons.First()->start; end=exons.Last()->end; if (isCDSOnly()) { CDstart=start; CDend=end; } } return; } } } GffObj::GffObj(GffReader& gfrd, BEDLine& bedline):GSeg(0,0), exons(true,true,false), cdss(NULL), gscore() { uptr=NULL; ulink=NULL; parent=NULL; udata=0; flags=0; CDstart=0; CDend=0; CDphase=0; attrs=NULL; gffID=NULL; track_id=-1; 
gseq_id=-1; //ftype_id=-1; //subftype_id=-1; strand='.'; gffnames_ref(names); //qlen=0;qstart=0;qend=0; covlen=0; geneID=NULL; gene_name=NULL; ftype_id=gff_fid_transcript; subftype_id=gff_fid_exon; start=bedline.fstart; end=bedline.fend; gseq_id=names->gseqs.addName(bedline.gseqname); track_id=names->tracks.addName("BED"); strand=bedline.strand; //setup flags from gffline isGene(false); isTranscript(true); gffID=Gstrdup(bedline.ID); for (int i=0;iaddExon(bedline.exons[i].start, bedline.exons[i].end, exgffExon); if (eidx<0 && gfrd.showWarnings()) GMessage("Warning: failed adding segment %d-%d for %s (discarded)!\n", bedline.exons[i].start, bedline.exons[i].end, gffID); } if (bedline.cds_start>0) { CDstart=bedline.cds_start; CDend=bedline.cds_end; if (CDstart>0 && bedline.cds_phase) CDphase=bedline.cds_phase; } if (gfrd.keep_Attrs && bedline.info!=NULL) this->parseAttrs(attrs, bedline.info); } GffObj::GffObj(GffReader &gfrd, GffLine& gffline): GSeg(0,0), exons(true,true,false), cdss(NULL), children(1,false), gscore() { uptr=NULL; ulink=NULL; parent=NULL; udata=0; flags=0; CDstart=0; CDend=0; CDphase=0; geneID=NULL; gene_name=NULL; attrs=NULL; gffID=NULL; track_id=-1; gseq_id=-1; //ftype_id=-1; subftype_id=-1; strand='.'; gffnames_ref(names); //qlen=0;qstart=0;qend=0; covlen=0; ftype_id=gffline.ftype_id; start=gffline.fstart; end=gffline.fend; gseq_id=names->gseqs.addName(gffline.gseqname); track_id=names->tracks.addName(gffline.track); strand=gffline.strand; /* qcov=0; qlen=gffline.qlen; qstart=gffline.qstart; qend=gffline.qend; */ //setup flags from gffline isCDSOnly(gffline.is_cds); //for now isGene(gffline.is_gene); isTranscript(gffline.is_transcript || gffline.exontype!=exgffNone); //fromGff3(gffline.is_gff3); isGeneSegment(gffline.is_gene_segment); if (gffline.parents!=NULL && !gffline.is_transcript) { //GTF style -- create a GffObj directly by subfeature //(also possible orphan GFF3 exon line, or an exon given before its parent (chado)) if 
(gffline.exontype!=exgffNone) { //recognized exon-like feature ftype_id=gff_fid_transcript; //so this is some sort of transcript subftype_id=gff_fid_exon; //subfeatures MUST be exons //typical GTF2 without "transcript" line // or malformed GFF3 with orphan or premature exon feature (found before the transcript line) gffID=Gstrdup(gffline.parents[0]); this->createdByExon(true); if (gfrd.is_gff3 && gfrd.showWarnings()) GMessage("Warning: exon feature found before transcript ID %s\n",gffID); //this is the first exon/segment of the transcript readExon(gfrd, gffline); } else {//unrecognized (non-exon) subfeature //make this GffObj of the same feature type ftype_id=names->feats.addName(gffline.ftype); if (gffline.ID!=NULL) { //unrecognized non-exon feature ? use the ID instead this->hasGffID(true); gffID=Gstrdup(gffline.ID); if (gfrd.keep_Attrs) this->parseAttrs(attrs, gffline.info); } else { //no ID, just Parent GMessage("Warning: unrecognized parented feature without ID found before its parent:\n%s\n", gffline.dupline); gffID=Gstrdup(gffline.parents[0]); this->createdByExon(true); readExon(gfrd, gffline); } } //unrecognized (non-exon) feature } //non-transcript parented subfeature given directly else { //non-parented feature OR a recognizable transcript //create a parent feature in its own right gscore.score=gffline.score; gscore.precision=gffline.score_decimals; if (gffline.ID==NULL || gffline.ID[0]==0) GError("Error: no valid ID found for GFF record\n"); this->hasGffID(true); gffID=Gstrdup(gffline.ID); //there must be an ID here //if (gffline.is_transcript) ftype_id=gff_fid_mRNA; //else if (gffline.is_transcript) { subftype_id=gff_fid_exon; if (ftype_id<0) ftype_id=names->feats.addName(gffline.ftype); if (gfrd.is_gff3) { if (gffline.exons.Count()>0) { //for compact GFF-like transcript line format (TLF), exons were already found as attributes for (int i=0;iaddExon(gffline.exons[i].start, gffline.exons[i].end, exgffExon, '.', gscore); if (eidx<0 && gfrd.showWarnings()) 
GMessage("Warning: failed adding exon %d-%d for %s (discarded)!\n", gffline.exons[i].start, gffline.exons[i].end, gffID); } } if (gffline.cds_start>0) { CDstart=gffline.cds_start; CDend=gffline.cds_end; } if (gffline.phase!=0) CDphase=gffline.phase; if (gffline.cdss.Count()>0) { //for compact GFF-like transcript line format (TLF), CDS might be already found as attributes if (cdss==NULL) cdss=new GList(true, true, false); for (int i=0;iaddExon(gffline.cdss[i].start, gffline.cdss[i].end, exgffCDS, 0, GFFSCORE_NONE, cdss); if (eidx<0 && gfrd.showWarnings()) GMessage("Warning: failed adding CDS segment %d-%d for %s (discarded)!\n", gffline.cdss[i].start, gffline.cdss[i].end, gffID); } } } } //is_transcript if (gfrd.keep_Attrs) this->parseAttrs(attrs, gffline.info); if (gfrd.is_gff3 && gffline.parents==NULL && gffline.exontype!=exgffNone) { //special case with bacterial genes just given as a CDS/exon, without parent! this->createdByExon(true); if (ftype_id<0) ftype_id=gff_fid_mRNA; readExon(gfrd, gffline); } if (ftype_id<0) ftype_id=names->feats.addName(gffline.ftype); }//no parent OR recognizable transcript if (gffline.gene_name!=NULL) { gene_name=Gstrdup(gffline.gene_name); } if (gffline.gene_id) { //only for gene features or GTF2 gene_id attribute geneID=Gstrdup(gffline.gene_id); } /*//we cannot assume parents[0] is a gene! for NCBI miRNA, parent can be a primary_transcript feature! 
else if (gffline.is_transcript && gffline.parents!=NULL) { geneID=Gstrdup(gffline.parents[0]); } */ } BEDLine* GffReader::nextBEDLine() { if (bedline!=NULL) return bedline; //caller should free gffline after processing while (bedline==NULL) { int llen=0; buflen=GFF_LINELEN-1; char* l=fgetline(linebuf, buflen, fh, &fpos, &llen); if (l==NULL) return NULL; int ns=0; //first nonspace position while (l[ns]!=0 && isspace(l[ns])) ns++; if (l[ns]=='#' || llen<7) continue; bedline=new BEDLine(this, l); if (bedline->skip) { delete bedline; bedline=NULL; continue; } } return bedline; } GffLine* GffReader::nextGffLine() { if (gffline!=NULL) return gffline; //caller should free gffline after processing while (gffline==NULL) { int llen=0; buflen=GFF_LINELEN-1; char* l=fgetline(linebuf, buflen, fh, &fpos, &llen); if (l==NULL) { return NULL; //end of file } #ifdef CUFFLINKS _crc_result.process_bytes( linebuf, llen ); #endif int ns=0; //first nonspace position bool commentLine=false; while (l[ns]!=0 && isspace(l[ns])) ns++; if (l[ns]=='#') { commentLine=true; if (llen<10) { if (commentParser!=NULL) (*commentParser)(l, &gflst); continue; } } gffline=new GffLine(this, l); if (gffline->skipLine) { if (commentLine && commentParser!=NULL) (*commentParser)(gffline->dupline, &gflst); delete gffline; gffline=NULL; continue; } if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID //this might not be needed, already checked in the GffLine constructor if (gff_warns) GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n"); delete gffline; gffline=NULL; //continue; } } return gffline; } char* GffReader::gfoBuildId(const char* id, const char* ctg) { //caller must free the returned pointer char* buf=NULL; int idlen=strlen(id); GMALLOC(buf, idlen+strlen(ctg)+2); strcpy(buf, id); buf[idlen]='~'; strcpy(buf+idlen+1, ctg); return buf; } GffObj* GffReader::gfoAdd(GffObj* gfo) { GPVec* glst=phash.Find(gfo->gffID); if (glst==NULL) glst=new GPVec(false); int 
i=glst->Add(gfo); phash.Add(gfo->gffID, glst); return glst->Get(i); } GffObj* GffReader::gfoAdd(GPVec& glst, GffObj* gfo) { int i=glst.Add(gfo); return glst[i]; } GffObj* GffReader::gfoReplace(GPVec& glst, GffObj* gfo, GffObj* toreplace) { for (int i=0;i*& glst) { glst = phash.Find(id); return (glst!=NULL); } GffObj* GffReader::gfoFind(const char* id, GPVec*& glst, const char* ctg, char strand, uint start, uint end) { GPVec* gl=NULL; if (glst) { gl=glst; } else { gl = phash.Find(id); } GffObj* gh=NULL; if (gl) { for (int i=0;iCount();i++) { GffObj& gfo = *(gl->Get(i)); if (ctg!=NULL && strcmp(ctg, gfo.getGSeqName())!=0) continue; if (strand && gfo.strand!='.' && strand != gfo.strand) continue; if (start>0) { if (abs((int)start-(int)gfo.start)> (int)GFF_MAX_LOCUS) continue; if (end>0 && (gfo.start>end || gfo.endchildren.Add(newgfo); if (newgfo->parent==NULL) newgfo->parent=parent; newgfo->setLevel(parent->getLevel()+1); //if (parent->isGene()) { if (parent->gene_name!=NULL && newgfo->gene_name==NULL) newgfo->gene_name=Gstrdup(parent->gene_name); if (parent->geneID!=NULL && newgfo->geneID==NULL) newgfo->geneID=Gstrdup(parent->geneID); //} return newgfo; } GffObj* GffReader::newGffRec(GffLine* gffline, GffObj* parent, GffExon* pexon, GPVec* glst, bool replace_parent) { GffObj* newgfo=new GffObj(*this, *gffline); GffObj* r=NULL; gflst.Add(newgfo); //tag non-transcripts to be discarded later if (this->transcripts_Only && this->is_gff3 && gffline->ID!=NULL && gffline->exontype==exgffNone && !gffline->is_gene && !gffline->is_transcript) { //unrecognized non-exon entity, should be discarded newgfo->isDiscarded(true); this->discarded_ids.Add(gffline->ID, new int(1)); } if (replace_parent && glst) { r=gfoReplace(*glst, newgfo, parent); updateParent(r, parent); } else { //regular case of new GffObj creation r=(glst) ? 
gfoAdd(*glst, newgfo) : gfoAdd(newgfo); if (parent!=NULL) { updateParent(r, parent); if (pexon!=NULL) parent->removeExon(pexon); } } return r; } GffObj* GffReader::newGffRec(BEDLine* bedline, GPVec* glst) { GffObj* newgfo=new GffObj(*this, *bedline); GffObj* r=NULL; gflst.Add(newgfo); r=(glst) ? gfoAdd(*glst, newgfo) : gfoAdd(newgfo); return r; } GffObj* GffReader::updateGffRec(GffObj* prevgfo, GffLine* gffline) { if (prevgfo==NULL) return NULL; //prevgfo->gffobj->createdByExon(false); if (gffline->ftype_id>=0) prevgfo->ftype_id=gffline->ftype_id; else prevgfo->ftype_id=prevgfo->names->feats.addName(gffline->ftype); prevgfo->start=gffline->fstart; prevgfo->end=gffline->fend; prevgfo->isGene(gffline->is_gene); prevgfo->isTranscript(gffline->is_transcript || gffline->exontype!=exgffNone); prevgfo->hasGffID(gffline->ID!=NULL); if (keep_Attrs) { if (prevgfo->attrs!=NULL) prevgfo->attrs->Clear(); prevgfo->parseAttrs(prevgfo->attrs, gffline->info); } return prevgfo; } bool GffReader::readExonFeature(GffObj* prevgfo, GffLine* gffline, GHash* pex) { //this should only be called before prevgfo->finalize()! bool r=true; if (gffline->strand!=prevgfo->strand) { if (prevgfo->strand=='.') { prevgfo->strand=gffline->strand; } else { GMessage("Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n", prevgfo->gffID, prevgfo->strand, gffline->fstart, gffline->fend, gffline->strand, prevgfo->getGSeqName()); return true; } } int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end : ((gffline->fendstart)? 
prevgfo->start-gffline->fend : 0 ); if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID); //validation_errors = true; r=false; if (!gff_warns) exit(1); } int eidx=prevgfo->readExon(*this, *gffline); if (pex!=NULL && eidx>=0) { //if (eidx==0 && gffline->exontype>0) prevgfo->isTranscript(true); if (gffline->ID!=NULL && gffline->exontype==exgffNone) subfPoolAdd(*pex, prevgfo); } return r; } CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash& pex, char*& subp_name) { CNonExon* subp=NULL; subp_name=NULL; for (int i=0;inum_parents;i++) { if (transcripts_Only && discarded_ids.Find(gffline->parents[i])!=NULL) continue; subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name subp=pex.Find(subp_name); if (subp!=NULL) return subp; GFREE(subp_name); } return NULL; } void GffReader::subfPoolAdd(GHash& pex, GffObj* newgfo) { //this might become a parent feature later if (newgfo->exons.Count()>0) { char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname); pex.Add(xbuf, new CNonExon(newgfo, newgfo->exons[0], *gffline)); GFREE(xbuf); } } GffObj* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash& pex) { GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene) //if (prevp!=gflst[subp->idx]) // GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID); subp->gffline->discardParent(); GffObj* gfoh=newGffRec(subp->gffline, prevp, subp->exon); pex.Remove(subp_name); //no longer a potential parent, moved it to phash already prevp->promotedChildren(true); return gfoh; //returns the holder of newly promoted feature } //In the rare cases where the GFF/GTF stream is properly formatted // i.e. when all sub-features are grouped with (and preceded by) their parent! 
GffObj* GffReader::readNext() { //user must free the returned GffObj* GffObj* gfo=NULL; //GSeg tseg(0,0); //transcript boundaries char* lastID=NULL; if (is_BED) { if (nextBEDLine()) { gfo=new GffObj(*this, *bedline); //tseg.start=gfo->start; //tseg.end=gfo->end; delete bedline; bedline=NULL; } //else return NULL; } else { //GFF parsing while (nextGffLine()!=NULL) { char* tid=gffline->ID; if (gffline->is_exon) tid=gffline->parents[0]; else //not an exon if (!(gffline->is_transcript || gffline->is_gene)) tid=NULL; //WARNING: only parsing transcript && gene records here //if (tid==NULL || gffline->num_parents>1) { if (tid==NULL) { //not a suitable transcript ID found, skip this line delete gffline; gffline=NULL; continue; } bool sameID=(lastID!=NULL && strcmp(lastID, tid)==0); if (sameID) { if (gfo==NULL) GError("Error: same transcript ID but GffObj not initialized?!(%s)\n", tid); //TODO: if gffline->is_transcript: trans-splicing! if (!gffline->is_exon) { GMessage("Warning: skipping unexpected non-exon record with previously seen ID:\n%s\n", gffline->dupline); delete gffline; gffline=NULL; continue; } readExonFeature(gfo, gffline); //also takes care of adding CDS segments } else { //new transcript if (gfo==NULL) { //start gathering this transcript's data now gfo=new GffObj(*this, *gffline); //GFREE(lastID); lastID=Gstrdup(tid); /*if (gffline->is_transcript) { tseg.start=gffline->fstart; tseg.end=gffline->fend; }*/ } else { //this gffline is for the next transcript! 
//return what we've got so far //return gfo; break; } } //transcript ID change //gffline processed, move on delete gffline; gffline=NULL; } //while nextgffline() } //GFF records GFREE(lastID); //gfo populated with all its sub-features (or eof reached) if (gfo!=NULL) { gfo->finalize(this); } return gfo; } //Usually we have to parse the whole file because exons and other subfeatures can be scattered, unordered in the input // (thanks to the annoyingly loose GFF3 specs) //Trans-splicing and fusions shall only be accepted in proper GFF3 format, i.e. multiple transcript features //with the same ID but NOT overlapping/continuous // *** BUT (exception): proximal xRNA features with the same ID, on the same strand, will be merged // and the segments will be treated like exons (e.g. TRNAR15 (rna1940) in RefSeq) void GffReader::readAll() { bool validation_errors = false; if (is_BED) { while (nextBEDLine()) { GPVec* prevgflst=NULL; GffObj* prevseen=gfoFind(bedline->ID, prevgflst, bedline->gseqname, bedline->strand, bedline->fstart); if (prevseen) { //duplicate ID -- but this could also be a discontinuous feature according to GFF3 specs //e.g. a trans-spliced transcript - but segments should not overlap if (prevseen->overlap(bedline->fstart, bedline->fend)) { //overlapping feature with same ID is going too far GMessage("Error: overlapping duplicate BED feature (ID=%s)\n", bedline->ID); //validation_errors = true; if (gff_warns) { //validation intent: just skip the feature, allow the user to see other errors delete bedline; bedline=NULL; continue; } else exit(1); } //create a separate entry (true discontinuous feature?) prevseen=newGffRec(bedline, prevgflst); if (gff_warns) { GMessage("Warning: duplicate BED feature ID %s (%d-%d) (discontinuous feature?)\n", bedline->ID, bedline->fstart, bedline->fend); } } else { newGffRec(bedline, prevgflst); } delete bedline; bedline=NULL; } } else { //regular GFF/GTF or perhaps TLF? 
//loc_debug=false; GHash pex; //keep track of any parented (i.e. exon-like) features that have an ID //and thus could become promoted to parent features while (nextGffLine()!=NULL) { GffObj* prevseen=NULL; GPVec* prevgflst=NULL; if (gffline->ID && gffline->exontype==exgffNone) { //parent-like feature ID (mRNA, gene, etc.) not recognized as an exon feature //check if this ID was previously seen on the same chromosome/strand within GFF_MAX_LOCUS distance prevseen=gfoFind(gffline->ID, prevgflst, gffline->gseqname, gffline->strand, gffline->fstart); if (prevseen) { //same ID seen in the same locus/region if (prevseen->createdByExon()) { if (gff_warns && (prevseen->startfstart || prevseen->end>gffline->fend)) GMessage("Warning: invalid coordinates for %s parent feature (ID=%s)\n", gffline->ftype, gffline->ID); //an exon of this ID was given before //this line has the main attributes for this ID updateGffRec(prevseen, gffline); } else { //possibly a duplicate ID -- but this could also be a discontinuous feature according to GFF3 specs //e.g. a trans-spliced transcript - though segments should not overlap! 
bool gtf_gene_dupID=(prevseen->isGene() && gffline->is_gtf_transcript); if (prevseen->overlap(gffline->fstart, gffline->fend) && !gtf_gene_dupID) { //in some GTFs a gene ID may actually be the same with the parented transcript ID (thanks) //overlapping feature with same ID is going too far GMessage("Error: discarding overlapping duplicate %s feature (%d-%d) with ID=%s\n", gffline->ftype, gffline->fstart, gffline->fend, gffline->ID); //validation_errors = true; if (gff_warns) { //validation intent: just skip the feature, allow the user to see other errors delete gffline; gffline=NULL; continue; } //else exit(1); } if (gtf_gene_dupID) { //special GTF case where parent gene_id matches transcript_id (sigh) prevseen=newGffRec(gffline, prevseen, NULL, prevgflst, true); } else { //create a separate entry (true discontinuous feature) prevseen=newGffRec(gffline, prevseen->parent, NULL, prevgflst); if (gff_warns) { GMessage("Warning: duplicate feature ID %s (%d-%d) (discontinuous feature?)\n", gffline->ID, gffline->fstart, gffline->fend); } } } //duplicate ID in the same locus } //ID seen previously in the same locus } //parent-like ID feature (non-exon) if (gffline->parents==NULL) { //top level feature (transcript, gene), no parents (or parents can be ignored) if (!prevseen) newGffRec(gffline, NULL, NULL, prevgflst); } else { //--- it's a child feature (exon/CDS or even a mRNA with a gene as parent) //updates all the declared parents with this child bool found_parent=false; if (gffline->is_gtf_transcript && prevseen && prevseen->parent) { found_parent=true; //parent already found in special GTF case } else { GffObj* newgfo=prevseen; GPVec* newgflst=NULL; GVec kparents; //kept parents (non-discarded) GVec< GPVec* > kgflst(false); GPVec* gflst0=NULL; for (int i=0;inum_parents;i++) { newgflst=NULL; //if (transcriptsOnly && ( if (discarded_ids.Find(gffline->parents[i])!=NULL) continue; if (!pFind(gffline->parents[i], newgflst)) continue; //skipping discarded parent feature 
kparents.Add(i); if (i==0) gflst0=newgflst; kgflst.Add(newgflst); } if (gffline->num_parents>0 && kparents.Count()==0) { kparents.cAdd(0); kgflst.Add(gflst0); } for (int k=0;kis_transcript || gffline->exontype==exgffNone) {//likely a transcript //parentgfo=gfoFind(gffline->parents[i], newgflst, gffline->gseqname, // gffline->strand, gffline->fstart, gffline->fend); if (newgflst!=NULL && newgflst->Count()>0) parentgfo = newgflst->Get(0); } else { //for exon-like entities we only need a parent to be in locus distance, //on the same strand parentgfo=gfoFind(gffline->parents[i], newgflst, gffline->gseqname, gffline->strand, gffline->fstart); } if (parentgfo!=NULL) { //parent GffObj parsed earlier found_parent=true; if ((parentgfo->isGene() || parentgfo->isTranscript()) && (gffline->is_transcript || gffline->exontype==exgffNone)) { //not an exon, but could be a transcript parented by a gene // *or* by another transcript (! miRNA -> primary_transcript) if (newgfo) { updateParent(newgfo, parentgfo); } else { newgfo=newGffRec(gffline, parentgfo); } } else { //potential exon subfeature? bool addingExon=false; if (transcripts_Only) { if (gffline->exontype>0) addingExon=true; } else { //always discard silly "intron" features if (! 
(gffline->exontype==exgffIntron && (parentgfo->isTranscript() || parentgfo->exons.Count()>0))) addingExon=true; } if (addingExon) if (!readExonFeature(parentgfo, gffline, &pex)) validation_errors=true; } } //overlapping parent feature found } //for each parsed parent Id if (!found_parent) { //new GTF-like record starting directly here as a subfeature //or it could be some chado GFF3 barf with exons coming BEFORE their parent :( //or it could also be a stray transcript without a parent gene defined previously //check if this feature isn't parented by a previously stored "child" subfeature char* subp_name=NULL; CNonExon* subp=NULL; if (!gffline->is_transcript) { //don't bother with this check for obvious transcripts if (pex.Count()>0) subp=subfPoolCheck(gffline, pex, subp_name); if (subp!=NULL) { //found a subfeature that is the parent of this (!) //promote that subfeature to a full GffObj GffObj* gfoh=promoteFeature(subp, subp_name, pex); //add current gffline as an exon of the newly promoted subfeature if (!readExonFeature(gfoh, gffline, &pex)) validation_errors=true; } } if (subp==NULL) { //no parent subfeature seen before //loc_debug=true; GffObj* ngfo=prevseen; if (ngfo==NULL) { //if it's an exon type, create directly the parent with this exon //but if it's recognized as a transcript, the object itself is created ngfo=newGffRec(gffline, NULL, NULL, newgflst); } if (!ngfo->isTranscript() && gffline->ID!=NULL && gffline->exontype==0) subfPoolAdd(pex, ngfo); //even those with errors will be added here! 
} GFREE(subp_name); } //no previous parent found } } //parented feature //-- delete gffline; gffline=NULL; }//while gff lines } if (gflst.Count()>0) { gflst.finalize(this); //force sorting by locus if so constructed } // all gff records are now loaded in GList gflst // so we can free the hash phash.Clear(); //tids.Clear(); if (validation_errors) { exit(1); } } void GfList::finalize(GffReader* gfr) { //if set, enforce sort by locus GList discarded(false,true,false); for (int i=0;ifinalize(gfr); if (fList[i]->isDiscarded()) { discarded.Add(fList[i]); //inform parent that thiis child is removed if (fList[i]->parent!=NULL) { GPVec& pchildren=fList[i]->parent->children; for (int c=0;cchildren.Count()>0) { //inform children that the parent was removed for (int c=0;cchildren.Count();c++) { fList[i]->children[c]->parent=NULL; if (gfr->keep_Attrs) //inherit the attributes of discarded parent (e.g. pseudo=true; ) fList[i]->children[c]->copyAttrs(fList[i]); } } this->Forget(i); } } if (discarded.Count()>0) { this->Pack(); } if (gfr->sortByLoc) { this->setSorted(false); if (gfr->refAlphaSort) this->setSorted((GCompareProc*)gfo_cmpByLoc); else this->setSorted((GCompareProc*)gfo_cmpRefByID); } } bool GffObj::reduceExonAttrs(GList& segs) { bool attrs_discarded=false; for (int a=0;aattrs->Count();a++) { int attr_id=segs[0]->attrs->Get(a)->attr_id; char* attr_name=names->attrs.getName(attr_id); char* attr_val =segs[0]->attrs->Get(a)->attr_val; bool sameExonAttr=true; bool discardAll=(GstrEq("exon_id", attr_name) || GstrEq("exon_number", attr_name)); if (!discardAll) for (int i=1;igetAttr(attr_id); if (ov==NULL || (strcmp(ov,attr_val)!=0)) { sameExonAttr=false; break; } } if (sameExonAttr) { //delete this attribute from exon level attrs_discarded=true; if (!discardAll) { //add the attribute to transcript level //rename it if it exists and is different for the transcript! 
char* t_val=NULL; bool same_aval=false; if (this->attrs!=NULL && (t_val=this->attrs->getAttr(attr_id))!=NULL) { //same attribute name already exists for the transcript! //write it using CDS_ or exon_ prefix same_aval=(strcmp(attr_val, t_val)==0); if (!same_aval) { //add renamed attribute const char* prefix = (&segs==cdss) ? "CDS_" : "exon_"; char* new_attr_name=NULL; GMALLOC(new_attr_name, strlen(prefix)+strlen(attr_name)+1); new_attr_name[0]=0; strcat(new_attr_name, prefix); strcat(new_attr_name, attr_name); this->attrs->add_or_update(names, new_attr_name, attr_val); GFREE(new_attr_name); } } else { //no such attribute exists for the transcript, copy it from the exon this->addAttr(attr_name, attr_val); } } for (int i=1;iattrs->freeItem(a); } //sameExonAttr } if (attrs_discarded) segs[0]->attrs->Pack(); return attrs_discarded; } //return the segs index of segment containing coord: int GffObj::whichExon(uint coord, GList* segs) { //segs MUST be sorted by GSeg order (start coordinate) if (segs==NULL) segs=&exons; if (segs->Count()==0) return -1; if (coordFirst()->start || coord>segs->Last()->end) return -1; if (segs->Count()<6) { //simple scan for (int i=0;iCount();i++) if ((*segs)[i]->overlap(coord)) return i; return -1; } else { //use quick search int i=0; int l=0; //lower boundary int h=segs->Count()-1; //higher boundary while (l<=h) { i = (l+h) >> 1; //range midpoint if (coord > segs->Get(i)->end) l=i+1; else { //coord <= segs->Get(i)->end if (coord >= segs->Get(i)->start) { return i; } //here: coord < segs->Get(i)->start h = i-1; } } } return -1; } bool GffObj::processGeneSegments(GffReader* gfr) { /* procedure: 1)store the info about any X_gene_segment entries in a GVec (just storing their index in gene->children[] list) 2)for each CDS, group them by ID in GHash (and a GPVec for storage) 3)for each GeneCDSChain, collect _gene_segments having a containment-relationship and rank them by lowest noncov 4)for each GeneCDSChain, pick best _gene_segment match (if any) 
and transfer CDSs to it */ GVec geneSegs; //X_gene_segment features (children transcripts of this gene) GHash cdsChainById(false); // hash of CDS chains: CDS feature grouped by ID GPVec cdsChains; // CDS chains storage if (cdss==NULL || cdss->Count()==0 || children.Count()==0) return false; //we shouldn't be here //check if we have any _gene_segment children for this gene for (int i=0;iflag_GENE_SEGMENT) { if (children[i]->hasCDS() || children[i]->cdss!=NULL) { GMessage("Warning: will not transfer CDS from %s to gene_segment %s which already has its own\n", gffID, children[i]->gffID); continue; } geneSegs.Add(i); } if (geneSegs.Count()==0) { if (gfr->gff_warns) GMessage("Warning: gene %s has CDS and transcripts but no suitable _gene_segment features\n",gffID); return false; //nothing to do } //group CDSs into CDS chains by their ID: for (int i=0;iCount();i++) { char* id=(char*)(cdss->Get(i)->uptr); if (id==NULL) continue; //should never happen GeneCDSChain *gcc=cdsChainById.Find(id); if (gcc!=NULL) gcc->addCDS(i, cdss->Get(i)->start, cdss->Get(i)->end); else { //new CDS chain: gcc=new GeneCDSChain(i, cdss->Get(i)->start, cdss->Get(i)->end); cdsChains.Add(gcc); cdsChainById.shkAdd(id, gcc); } } for (int i=0;iCount();i++) { GFREE(cdss->Get(i)->uptr); //no CDS ID no longer needed } //collect _gene_segment containers for each CDS chain int cds_moved=0; for (int i=0;itransferCDS(cdss->Get(gc.cdsList[c].idx)); cdss->Forget(gc.cdsList[c].idx); cds_moved++; } // also remove it from the list of gene_segments to be mapped geneSegs.Delete(gc.mxs.First().gsegidx); //assigned, should no longer be checked against other CDS chains if (t->isFinalized()) t->finalize(gfr); } if (cds_moved>0) cdss->Pack(); if (cdss->Count()==0) { delete cdss; cdss=NULL; if (exons.Count()==0) isTranscript(false); } return true; } GffObj* GffObj::finalize(GffReader* gfr) { if (this->createdByExon() && this->end-this->start<10 && this->exons.Count()<=1) { //? 
misleading exon-like feature parented by an exon or CDS mistakenly // interpreted as a standalone transcript // example: GENCODE gff3 feature "stop_codon_redefined_as_selenocysteine" which is // parented by a CDS ! if (cdss==NULL || cdss->Count()<=1) { if (gfr->showWarnings()) { GMessage("Warning: discarding suspicious '%s' record (ID=%s)\n",this->getFeatureName(),gffID); } isDiscarded(true); } } if (!isDiscarded()) { bool noExons=(exons.Count()==0 && (cdss==NULL || cdss->Count()==0)); if (noExons) { if (isTranscript() || (isGene() && children.Count()==0 && gfr->gene2exon)) { //add exon feature to an exonless transcript/gene addExon(this->start, this->end, exgffExon); //effectively this becomes a transcript (even childless genes if gene2exon) isTranscript(true); } } else { //it has exons or CDSs if (cdss!=NULL && isGene() && children.Count()>0) { //check for X_gene_segment processing processGeneSegments(gfr);//distribute the cdss to children _gene_segments } // _gene_segment processing } } if (cdss!=NULL && isGene()) //in case we stored IDs for gene_segment features for (int i=0;iCount();i++) { GFREE(cdss->Get(i)->uptr); } if (gfr->transcripts_Only && !isTranscript() && !(gfr->keep_Genes && isGene())) { //discard non-transcripts, unless it's a gene and keepGenes was specified isDiscarded(true); } isFinalized(true); if (isDiscarded()) { //just in case we have cds with uptr in use (X_gene_segment), free them uptr=NULL; udata=0; return this; } if (isTranscript()) { isCDSOnly(cdss!=NULL && exons.Count()==0 && cdss->Count()>0); subftype_id=isCDSOnly() ? gff_fid_CDS : gff_fid_exon; } if (cdss!=NULL && cdss->Count()>0) { CDstart=cdss->First()->start; CDend=cdss->Last()->end; CDphase=(strand=='-')? cdss->Last()->phase : cdss->First()->phase; bool updatePhase=(CDphase=='.' 
|| CDphase==0); if (!updatePhase) for (int i=0;iCount();++i) if ((*cdss)[i]->phase<'0') { updatePhase=true; break; } if (updatePhase) updateCDSPhase(*cdss); //there are GFFs out there which only provide UTR and CDS records instead of full exons //so make sure we add all CDS segments to exons, if they are not already there for (int i=0;iCount();++i) { int eidx=addExon((*cdss)[i]->start, (*cdss)[i]->end, exgffExon, 0, (*cdss)[i]->score); if (eidx<0) GError("Error: could not reconcile CDS %d-%d with exons of transcript %s\n", (*cdss)[i]->start, (*cdss)[i]->end, gffID); } } else if (CDstart==0) {//no CDS, no phase CDphase=0; CDend=0; } //-- attribute reduction for some records which // repeat the exact same attr=value for every exon bool reduceAttributes=(gfr->keep_Attrs && !gfr->noExonAttrs && !gfr->keep_AllExonAttrs && exons.Count()>0 && exons[0]->attrs!=NULL); if (reduceAttributes) { //for each attribute of the 1st exon, if it has the //same value for all other exons, move it to transcript level //bool reduced=reduceExonAttrs(exons); reduceExonAttrs(exons); //if (gfr->showWarnings() && reduced) // GMessage("Info: duplicate exon attributes reduced for %s\n", gffID); //do the same for CDS segments, if any if (cdss!=NULL && cdss->Count()>0 && (*cdss)[0]->attrs!=NULL) { //reduced= reduceExonAttrs(*cdss); //if (gfr->showWarnings() && reduced) // GMessage("Info: duplicate CDS attributes reduced for %s\n", gffID); } } //merge close exons if requested if (exons.Count()>0 && isTranscript()) { if (gfr->merge_CloseExons) { for (int i=0;iend; while (nistart-mend-1); //<0 = overlap, 0 = adjacent, >0 = bases apart if (dist>GFF_MIN_INTRON) break; //no merging with next segment if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) { GMessage("Warning: merging adjacent/overlapping segments (distance=%d) of %s on %s (%d-%d, %d-%d)\n", dist, gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end); 
} mend=exons[ni]->end; exons[i]->end=mend; if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL || exons[i]->attrs->Count()attrs->Count())) { //use the other exon attributes, if it has more delete(exons[i]->attrs); exons[i]->attrs=exons[ni]->attrs; exons[ni]->attrs=NULL; } exons.Delete(ni); } //check for merge with next exon } //for each exon } //merge close exons if (isCDSOnly() && exons.Count()!=cdss->Count()) isCDSOnly(false); } //-- check features vs their exons' span if (isTranscript()) { if (exons.Count()>0) { if (gfr->gff_warns && (this->start!=exons.First()->start || this->end!=exons.Last()->end) ) GMessage("Warning: adjusted transcript %s boundaries according to terminal exons.\n", gffID); this->start=exons.First()->start; this->end=exons.Last()->end; } } else { //non-transcripts just have to be at least as wide as their sub-features if (exons.Count()>0) { bool adj=false; if (this->start>exons.First()->start) { this->start=exons.First()->start; adj=true; } if (this->endend) { /* the right boundary is taken from the last sub-feature, mirroring the transcript branch above */ this->end=exons.Last()->end; adj=true; } if (gfr->gff_warns && adj) GMessage("Warning: adjusted %s %s boundaries according to terminal sub-features.\n", this->getFeatureName(), gffID); } } //-- update covlen covlen=0; for (int i=0;ilen(); //-- check if CDS segments are different from exons and thus worth keeping separately in cdss if (cdss!=NULL && cdss->Count()>0) { bool cds_exComp=true; //CDSs are exon-compatible (no need to keep them separately) if (cdss->Count()==1) { //check that the CDS segment is within a single exon int start_eidx=-1; int end_eidx=-1; for (int i=0;istart, // exons[i]->end); if (CDstart>=exons[i]->start && CDstart<=exons[i]->end) { start_eidx=i; } /* CDend lies inside exon i only when BOTH bounds hold; with || the test was true for every exon */ if (CDend>=exons[i]->start && CDend<=exons[i]->end ) { end_eidx=i; } if (start_eidx>=0 && end_eidx>=0) break; } cds_exComp=(start_eidx==end_eidx && start_eidx>=0); if (!cds_exComp) GMessage("Warning: transcript %s has incorrect CDS segment definition (%d-%d)!\n", gffID, CDstart, CDend); cds_exComp=true; //just to
free cdss, even though it's wrong } else { if (cdss->Count()>exons.Count()) { cds_exComp=false; } else { //2 or more CDS segments //CDSs should be intron compatible with exons, and CDS ends should be within exons int imax=exons.Count()-1; int jmax=cdss->Count()-1; int i=0; int j=0; //find which exon has CDstart for (i=0;i<=imax;++i) if (CDstart>=exons[i]->start && CDstart<=exons[i]->end) break; if (i>imax) cds_exComp=false; else { //check the introns now while (iend!=(*cdss)[j]->end || exons[i+1]->start!=(*cdss)[j+1]->start) { cds_exComp=false; break; } ++i; ++j; } //now j must be the last segment of cdss and CDend must be within exon[i] if (cds_exComp) if (j!=jmax || CDend>exons[i]->end || CDendstart) cds_exComp=false; } } } //multiple CDS segments if (cds_exComp) { if (isCDSOnly() && cdss->Count()==exons.Count()) for (int i=0;iCount();i++) exons[i]->phase=cdss->Get(i)->phase; if (gfr->keep_Attrs && !gfr->noExonAttrs) { int eidx=whichExon((*cdss)[0]->start, &exons); if (eidx<0) GError("Error finding CDS coordinate inside exons (?) 
for %s\n", gffID); for (int i=0;iCount();i++) { if (isCDSOnly()) //eidx should be the same with i exons[eidx]->phase=cdss->Get(i)->phase; if ((*cdss)[i]->attrs!=NULL && (*cdss)[i]->attrs->Count()>0) { if (exons[eidx]->attrs==NULL) exons[eidx]->attrs=new GffAttrs(); exons[eidx]->attrs->copyAttrs((*cdss)[i]->attrs, true); if (exons[eidx]->attrs->Count()==0) { delete exons[eidx]->attrs; exons[eidx]->attrs=NULL; } } ++eidx; } } delete cdss; cdss=NULL; //this->isXCDS(false); } else this->isXCDS(true); }//cdss check //--- collect stats for the reference genomic sequence if (gfr->gseqtable.Count()<=gseq_id) { gfr->gseqtable.setCount(gseq_id+1); } GSeqStat* gsd=gfr->gseqtable[gseq_id]; if (gsd==NULL) { gsd=new GSeqStat(gseq_id,names->gseqs.getName(gseq_id)); //gfr->gseqtable.Put(gseq_id, gsd); gfr->gseqtable[gseq_id]=gsd; gfr->gseqStats.Add(gsd); } gsd->fcount++; if (startmincoord) gsd->mincoord=start; if (end>gsd->maxcoord) gsd->maxcoord=end; if (this->len()>gsd->maxfeat_len) { gsd->maxfeat_len=this->len(); gsd->maxfeat=this; } uptr=NULL; udata=0; return this; } void GffObj::printExonList(FILE* fout) { //print comma delimited list of exon intervals for (int i=0;i0) fprintf(fout, ","); fprintf(fout, "%d-%d",exons[i]->start, exons[i]->end); } } void GffObj::printCDSList(FILE* fout) { //print comma delimited list of CDS intervals if (!hasCDS()) return; GVec cds; this->getCDSegs(cds); //also uses/prepares the CDS phase for each CDS segment for (int i=0;i0) fprintf(fout, ","); fprintf(fout, "%d-%d", cds[i].start, cds[i].end); } } void BED_addAttribute(FILE* fout, int& acc, const char* format,... ) { ++acc; if (acc==1) fprintf(fout, "\t"); else fprintf(fout, ";"); va_list arguments; va_start(arguments,format); vfprintf(fout,format,arguments); va_end(arguments); } void GffObj::printBED(FILE* fout, bool cvtChars, char* dbuf, int dbuf_len) { //print a BED-12 line + GFF3 attributes in 13th field int cd_start=CDstart>0? CDstart-1 : start-1; int cd_end=CDend>0 ? 
CDend : end; char cdphase=(CDphase>0) ? CDphase : '0'; fprintf(fout, "%s\t%d\t%d\t%s\t%d\t%c\t%d\t%d\t%c,0,0", getGSeqName(), start-1, end, getID(), 100, strand, cd_start, cd_end, cdphase); if (exons.Count()>0) { int i; fprintf(fout, "\t%d\t", exons.Count()); for (i=0;ilen()); fprintf(fout, "\t"); for (i=0;istart-start); } else { //no-exon feature(!), shouldn't happen fprintf(fout, "\t1\t%d,\t0,", len()); } //now add the GFF3 attributes in the 13th field int numattrs=0; if (CDstart>0) BED_addAttribute(fout, numattrs,"CDS=%d:%d",CDstart-1, CDend); if (CDphase>0) BED_addAttribute(fout, numattrs,"CDSphase=%c", CDphase); if (geneID!=NULL) BED_addAttribute(fout, numattrs, "geneID=%s",geneID); /* go through BED_addAttribute like every other attribute so that the leading tab is emitted (and numattrs advanced) when gene_name is the first attribute written */ if (gene_name!=NULL) BED_addAttribute(fout, numattrs, "gene_name=%s",gene_name); if (attrs!=NULL) { for (int i=0;iCount();i++) { const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id); const char* attrval=attrs->Get(i)->attr_val; if (attrval==NULL || attrval[0]=='\0') { BED_addAttribute(fout, numattrs,"%s",attrname); continue; } if (cvtChars) { decodeHexChars(dbuf, attrval, dbuf_len-1); BED_addAttribute(fout, numattrs, "%s=%s", attrname, dbuf); } else BED_addAttribute(fout, numattrs,"%s=%s", attrname, attrs->Get(i)->attr_val); } } fprintf(fout, "\n"); } void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon, bool CDSsrc) { if (names==NULL) GError(ERR_NULL_GFNAMES, "parseAttrs()"); if (atrlist==NULL) { atrlist=new GffAttrs(); } bool exon2transcript=(isExon && atrlist==this->attrs); char* endinfo=info+strlen(info); char* start=info; char* pch=start; while (startadd_or_update(this->names, start, ech, CDSsrc); //overwrite previous attr with the same name } start=pch; } //while info characters if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; } } void GffObj::addAttr(const char* attrname, const char* attrvalue) { if (this->attrs==NULL) this->attrs=new GffAttrs(); //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue)); this->attrs->add_or_update(names,
attrname, attrvalue); } void GffObj::copyAttrs(GffObj* from) { //typically from is the parent gene, and this is a transcript if (from==NULL || from->attrs==NULL || from->attrs->Count()==0) return; if (this->attrs==NULL) { this->attrs=new GffAttrs(); } //special RefSeq case int desc_attr_id=names->attrs.getId("description"); //from gene int prod_attr_id=names->attrs.getId("product"); //from transcript (this) char* prod = (prod_attr_id>=0) ? this->attrs->getAttr(prod_attr_id) : NULL; for (int i=0;iattrs->Count();++i) { //this->attrs->add_no_update(names, from->attrs->Get(i)->attr_id, from->attrs->Get(i)->attr_val); int aid=from->attrs->Get(i)->attr_id; //special case for GenBank refseq genes vs transcripts: if (prod && aid==desc_attr_id && strcmp(from->attrs->getAttr(desc_attr_id), prod)==0) continue; //skip description if product already there and the same bool haveit=false; for (int ai=0;aiattrs->Count();++ai) { //do we have it already? if (aid==this->attrs->Get(ai)->attr_id) { haveit=true; break; //skip this, don't replace } } if (!haveit) this->attrs->Add(new GffAttr(aid, from->attrs->Get(i)->attr_val)); } } void GffObj::setFeatureName(const char* feature) { //change the feature name/type for a transcript int fid=names->feats.addName(feature); if (monoFeature() && exons.Count()>0) this->subftype_id=fid; this->ftype_id=fid; } void GffObj::setRefName(const char* newname) { //change the feature name/type for a transcript int rid=names->gseqs.addName(newname); this->gseq_id=rid; } int GffObj::removeAttr(const char* attrname, const char* attrval) { if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0; int aid=this->names->attrs.getId(attrname); if (aid<0) return 0; int delcount=0; //could be more than one ? 
for (int i=0;iattrs->Count();i++) { if (aid==this->attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, this->attrs->Get(i)->attr_val)==0) { delcount++; this->attrs->freeItem(i); } } } if (delcount>0) this->attrs->Pack(); return delcount; } int GffObj::removeAttr(int aid, const char* attrval) { if (this->attrs==NULL || aid<0) return 0; int delcount=0; //could be more than one ? for (int i=0;iattrs->Count();i++) { if (aid==this->attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, this->attrs->Get(i)->attr_val)==0) { delcount++; this->attrs->freeItem(i); } } } if (delcount>0) this->attrs->Pack(); return delcount; } int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) { if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0; int aid=this->names->attrs.getId(attrname); if (aid<0) return 0; int delcount=0; //could be more than one for (int i=0;iCount();i++) { if (aid==exon.attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) { delcount++; exon.attrs->freeItem(i); } } } if (delcount>0) exon.attrs->Pack(); return delcount; } int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) { if (exon.attrs==NULL || aid<0) return 0; int delcount=0; //could be more than one for (int i=0;iCount();i++) { if (aid==exon.attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) { delcount++; exon.attrs->freeItem(i); } } } if (delcount>0) exon.attrs->Pack(); return delcount; } char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GMapSegments* seglst) { if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. 
) called!\n"); return NULL; } //restore normal coordinates: if (exons.Count()==0) return NULL; int fspan=end-start+1; const char* gsubseq=faseq->subseq(start, fspan); if (gsubseq==NULL) { GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end); } char* unspliced=NULL; int seqstart=exons.First()->start; int seqend=exons.Last()->end; int unsplicedlen = 0; if (seglst) seglst->Clear(strand); unsplicedlen += seqend - seqstart + 1; GMALLOC(unspliced, unsplicedlen+1); //allocate more here //uint seqstart, seqend; int s = 0; //resulting nucleotide counter if (strand=='-') { if (seglst!=NULL) seglst->add(s+1,s+1+seqend-seqstart, seqstart, seqend); for (int i=seqend;i>=seqstart;i--) { unspliced[s] = ntComplement(gsubseq[i-start]); s++; }//for each nt } // - strand else { // + strand if (seglst!=NULL) seglst->add(s+1,s+1+seqend-seqstart, seqstart, seqend); for (int i=seqstart;i<=seqend;i++) { unspliced[s]=gsubseq[i-start]; s++; }//for each nt } // + strand //assert(s <= unsplicedlen); unspliced[s]=0; if (rlen!=NULL) *rlen=s; return unspliced; } void GffObj::addPadding(int padLeft, int padRight) { this->start-=padLeft; this->end+=padRight; if (exons.Count()>0) { exons[0]->start-=padLeft; exons.Last()->end+=padRight; } covlen+=padLeft+padRight; } void GffObj::removePadding(int padLeft, int padRight) { this->start+=padLeft; this->end-=padRight; if (exons.Count()>0) { exons[0]->start+=padLeft; exons.Last()->end-=padRight; } covlen-=padLeft+padRight; } char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end, GMapSegments* seglst, bool cds_open) { //cds_open only makes sense when CDSonly is true by overriding CDS 3'end such that the end of //the sequence beyond the 3' CDS end is also returned (the 3' UTR is appended to the CDS) if (CDSonly && CDstart==0) { GMessage("Warning: getSpliced(CDSOnly) requested for transcript with no CDS (%s)!\n", gffID); //should never happen return NULL; } if (faseq==NULL) { GMessage("Warning: 
getSpliced() called with uninitialized GFaSeqGet object!\n"); //should never happen return NULL; } GList* xsegs=&exons; if (CDSonly && this->cdss!=NULL) xsegs=this->cdss; if (xsegs->Count()==0) return NULL; int fspan=end-start+1; const char* gsubseq=faseq->subseq(start, fspan); if (gsubseq==NULL) { GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end); } if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust int endadj=end-start+1-fspan; uint prevend=end; end-=endadj; if (CDend>end) CDend=end; if (xsegs->Last()->end>end) { xsegs->Last()->end=end; //this could be trouble if exon start is also > end if (xsegs->Last()->start>xsegs->Last()->end) { GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n", prevend,getGSeqName(), getID()); } covlen-=endadj; } } char* spliced=NULL; GMALLOC(spliced, covlen+1); //IMPORTANT: covlen must be correct here! uint g_start=0, g_end=0; int cdsadj=0; if (CDphase=='1' || CDphase=='2') { cdsadj=CDphase-'0'; } uint CDS_start=CDstart; uint CDS_stop=CDend; if (cdsadj>0) { if (strand=='-') CDS_stop-=cdsadj; else CDS_start+=cdsadj; } if (CDSonly) { g_start=CDS_start; g_end=CDS_stop; if (g_end-g_start<3) GMessage("Warning: CDS %d-%d too short for %s, check your data.\n", g_start, g_end, gffID); } else { //all exon content, not just CDS g_start=xsegs->First()->start; g_end=xsegs->Last()->end; cds_open=false; //override mistaken user request } if (seglst!=NULL) seglst->Clear(strand); int s=0; //resulting nucleotide counter if (strand=='-') { if (cds_open) {// appending 3'UTR g_start=xsegs->First()->start; //CDS_start=g_start; } for (int x=xsegs->Count()-1;x>=0;x--) { uint sgstart=xsegs->Get(x)->start; uint sgend=xsegs->Get(x)->end; if (g_endsgend) continue; if (g_start>=sgstart && g_start<=sgend) sgstart=g_start; //3' end within this segment if (g_end>=sgstart && g_end<=sgend) sgend=g_end; //5' end within this segment if (seglst!=NULL) 
seglst->add(s+1,s+1+sgend-sgstart,sgend,sgstart); for (uint i=sgend;i>=sgstart;i--) { spliced[s] = ntComplement(gsubseq[i-start]); s++; }//for each nt //--update local CDS start-end coordinates if (cds_start!=NULL && CDS_stop>=sgstart && CDS_stop<=sgend) { //CDS start in this segment *cds_start=s-(CDS_stop-sgstart); } if (cds_end!=NULL && CDS_start>=sgstart && CDS_start<=sgend) { //CDS stop in this segment *cds_end=s-(CDS_start-sgstart); } } //for each exon } // - strand else { // + strand if (cds_open) { // appending 3'UTR g_end=xsegs->Last()->end; //CDS_stop=g_end; } for (int x=0;xCount();x++) { uint sgstart=xsegs->Get(x)->start; uint sgend=xsegs->Get(x)->end; if (g_endsgend) continue; if (g_start>=sgstart && g_start<=sgend) sgstart=g_start; //seqstart within this segment if (g_end>=sgstart && g_end<=sgend) sgend=g_end; //seqend within this segment if (seglst!=NULL) seglst->add(s+1,s+1+sgend-sgstart, sgstart, sgend); for (uint i=sgstart;i<=sgend;i++) { spliced[s]=gsubseq[i-start]; s++; }//for each nt //--update local CDS start-end coordinates if (cds_start!=NULL && CDS_start>=sgstart && CDS_start<=sgend) { //CDS start in this segment *cds_start=s-(sgend-CDS_start); } if (cds_end!=NULL && CDS_stop>=sgstart && CDS_stop<=sgend) { //CDS stop in this segment *cds_end=s-(sgend-CDS_stop); } } //for each exon } // + strand spliced[s]=0; if (rlen!=NULL) *rlen=s; return spliced; } void GffObj::printSummary(FILE* fout) { if (fout==NULL) fout=stdout; fprintf(fout, "%s\t%c\t%d\t%d\t", gffID, strand, start, end); gscore.print(fout); fprintf(fout, "\n"); } //TODO we should also have an escapeChars function for some situations //when we want to write a GFF3 strictly compliant to the dang specification void GffObj::decodeHexChars(char* dbuf, const char* s, int maxlen) { int dlen=0; dbuf[0]=0; if (s==NULL) return; for (const char* p=s;(*p)!=0 && dlen'Z') a^=0x20; //toupper() if (a>'9') a=10+(a-'A'); else a-='0'; int b=*(++p); if (b>'Z') b^=0x20; if (b>'9') b=10+(b-'A'); else 
b-='0'; char c=(char)((a<<4)+b); if (c=='%') { dbuf[dlen]='p'; ++dlen; dbuf[dlen]='r'; ++dlen; c='c'; } else if (c==';') c='.'; else if (c<='\t') c=' '; if (c>=' ') { dbuf[dlen]=c; ++dlen; continue; } } dbuf[dlen]=*p; ++dlen; } dbuf[dlen]=0; } void GffObj::printGTab(FILE* fout, char** extraAttrs) { fprintf(fout, "%s\t%c\t%d\t%d\t%s\t", this->getGSeqName(), this->strand, this->start, this->end, this->getID()); if (exons.Count()) printExonList(fout); else fprintf(fout, "."); if (extraAttrs!=NULL) { //print a list of "attr=value;" pairs here as the last column //for requested attributes bool t1=true; for (int i=0;extraAttrs[i]!=NULL;++i) { const char* v=this->getAttr(extraAttrs[i]); if (v==NULL) continue; if (t1) { fprintf(fout, "\t"); t1=false; } fprintf(fout, "%s=%s;", extraAttrs[i], v); } } fprintf(fout,"\n"); } void GffObj::printGxfExon(FILE* fout, const char* tlabel, const char* gseqname, bool iscds, GffExon* exon, bool gff3, bool cvtChars, char* dbuf, int dbuf_len) { //strcpy(dbuf,"."); //if (exon->score>0) sprintf(dbuf,"%.2f", exon->score); exon->score.sprint(dbuf); if (exon->phase==0 || !iscds) exon->phase='.'; const char* ftype=iscds ? 
"CDS" : getSubfName(); const char* attrname=NULL; const char* attrval=NULL; if (gff3) { fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s", gseqname, tlabel, ftype, exon->start, exon->end, dbuf, strand, exon->phase, gffID); if (exon->attrs!=NULL) { for (int i=0;iattrs->Count();i++) { if (exon->attrs->Get(i)->cds!=iscds) continue; attrname=names->attrs.getName(exon->attrs->Get(i)->attr_id); if (cvtChars) { decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, dbuf_len-1); fprintf(fout,";%s=%s", attrname, dbuf); } else { fprintf(fout,";%s=%s", attrname, exon->attrs->Get(i)->attr_val); } } } fprintf(fout, "\n"); } //GFF3 else {//GTF fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";", gseqname, tlabel, ftype, exon->start, exon->end, dbuf, strand, exon->phase, gffID); if (geneID) fprintf(fout," gene_id \"%s\";",geneID); if (gene_name!=NULL) { fprintf(fout," gene_name \"%s\";",gene_name); } if (exon->attrs!=NULL) { bool trId=false; bool gId=false; for (int i=0;iattrs->Count();i++) { if (exon->attrs->Get(i)->attr_val==NULL) continue; if (exon->attrs->Get(i)->cds!=iscds) continue; attrname=names->attrs.getName(exon->attrs->Get(i)->attr_id); if (strcmp(attrname, "transcriptID")==0) { if (trId) continue; trId=true; } if (strcmp(attrname, "transcript_id")==0 && !trId) { attrname="transcriptID"; trId=true; } if (strcmp(attrname, "geneID")==0) { if (gId) continue; gId=true; } if (strcmp(attrname, "gene_id")==0 && !gId) { attrname="geneID"; gId=true; } if (Gstricmp(attrname, "gene_name")==0 && gene_name!=NULL) { continue; } fprintf(fout, " %s ",attrname); if (cvtChars) { decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, dbuf_len-1); attrval=dbuf; } else { attrval=exon->attrs->Get(i)->attr_val; } if (attrval[0]=='"') fprintf(fout, "%s;",attrval); else fprintf(fout, "\"%s\";",attrval); } } //for GTF, also append the GffObj attributes to each exon line // - do not do this when the transcript line is also printed! 
/* if (attrs!=NULL) { for (int i=0;iCount();i++) { if (attrs->Get(i)->attr_val==NULL) continue; attrname=names->attrs.getName(attrs->Get(i)->attr_id); fprintf(fout, " %s ",attrname); if (cvtChars) { decodeHexChars(dbuf, attrs->Get(i)->attr_val, dbuf_len-1); attrval=dbuf; } else { attrval=attrs->Get(i)->attr_val; } if (attrval[0]=='"') fprintf(fout, "%s;",attrval); else fprintf(fout, "\"%s\";",attrval); } } */ fprintf(fout, "\n"); }//GTF } void GffObj::printGxf(FILE* fout, GffPrintMode gffp, const char* tlabel, const char* gfparent, bool cvtChars) { const int DBUF_LEN=1024; //there should not be attribute values longer than 1K! char dbuf[DBUF_LEN]; if (tlabel==NULL) { tlabel=track_id>=0 ? names->tracks.Get(track_id)->name : (char*)"gffobj" ; } if (gffp==pgffBED) { printBED(fout, cvtChars, dbuf, DBUF_LEN); return; } const char* gseqname=names->gseqs.Get(gseq_id)->name; bool gff3 = (gffp>=pgffAny && gffp<=pgffTLF); bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth); bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth); //if (gscore>0.0) sprintf(dbuf,"%.2f", gscore); // else strcpy(dbuf,"."); gscore.sprint(dbuf); if (gffp<=pgtfCDS && gffp>=pgtfAny) { //GTF output fprintf(fout, "%s\t%s\ttranscript\t%d\t%d\t%s\t%c\t.\ttranscript_id \"%s\"", gseqname, tlabel, start, end, dbuf, strand, gffID); char* gid=NULL; if (geneID!=NULL) { gid=geneID; } else { gid=getAttr("gene_id"); if (gid==NULL) gid=gffID; //last resort, write gid the same with gffID } if (gid!=NULL) fprintf(fout, "; gene_id \"%s\"",gid); if (gene_name!=NULL && getAttr("gene_name")==NULL && getAttr("GENE_NAME")==NULL) fprintf(fout, "; gene_name \"%s\"",gene_name); if (attrs!=NULL) { bool trId=false; //bool gId=false; for (int i=0;iCount();i++) { const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id); const char* attrval=attrs->Get(i)->attr_val; if (attrval==NULL || attrval[0]=='\0') continue; if (strcmp(attrname, 
"transcriptID")==0) { if (trId) continue; trId=true; } if (strcmp(attrname, "transcript_id")==0 && !trId) { attrname="transcriptID"; trId=true; } if (Gstrcmp(attrname, "geneID")==0 && gid!=NULL && strcmp(attrval, gid)==0) continue; if (strcmp(attrname, "gene_id")==0) continue; if (cvtChars) { decodeHexChars(dbuf, attrval, DBUF_LEN-1); fprintf(fout,"; %s \"%s\"", attrname, dbuf); } else fprintf(fout,"; %s \"%s\"", attrname, attrs->Get(i)->attr_val); } } fprintf(fout,";\n"); } else if (gff3) { //print GFF3 transcript line: uint pstart, pend; if (gffp==pgffCDS) { pstart=CDstart; pend=CDend; } else { pstart=start;pend=end; } //const char* ftype=isTranscript() ? "mRNA" : getFeatureName(); const char* ftype=getFeatureName(); fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s", gseqname, tlabel, ftype, pstart, pend, dbuf, strand, gffID); bool parentPrint=false; if (gfparent!=NULL && gffp!=pgffTLF) { //parent override - also prevents printing gene_name and gene_id fprintf(fout, ";Parent=%s",gfparent); parentPrint=true; } else if (parent!=NULL && !parent->isDiscarded() && gffp!=pgffTLF) { fprintf(fout, ";Parent=%s",parent->getID()); if (parent->isGene()) parentPrint=true; } if (gffp==pgffTLF) { fprintf(fout, ";exonCount=%d",exons.Count()); if (exons.Count()>0) fprintf(fout, ";exons=%d-%d", exons[0]->start, exons[0]->end); for (int i=1;istart, exons[i]->end); } } if (CDstart>0 && (gffp==pgffTLF || !showCDS)) { if (cdss==NULL) fprintf(fout,";CDS=%d:%d",CDstart,CDend); else { fprintf(fout, ";CDS="); for (int i=0;iCount();++i) { if (i>0) fprintf(fout, ","); fprintf(fout, "%d-%d", (*cdss)[i]->start, (*cdss)[i]->end); } } } if (CDphase>0 && (gffp==pgffTLF || !showCDS)) fprintf(fout,";CDSphase=%c", CDphase); char* g_id=NULL; if (geneID!=NULL && !parentPrint && getAttr("geneID")==NULL && ((g_id=getAttr("gene_id"))==NULL || strcmp(g_id, geneID)!=0)) fprintf(fout, ";geneID=%s",geneID); if (gene_name!=NULL && !parentPrint && getAttr("gene_name")==NULL && getAttr("GENE_NAME")==NULL) 
fprintf(fout, ";gene_name=%s",gene_name); if (attrs!=NULL) { for (int i=0;iCount();i++) { const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id); const char* attrval=attrs->Get(i)->attr_val; if (attrval==NULL || attrval[0]=='\0') continue; //fprintf(fout,";%s",attrname); if (cvtChars) { decodeHexChars(dbuf, attrval, DBUF_LEN-1); fprintf(fout,";%s=%s", attrname, dbuf); } else fprintf(fout,";%s=%s", attrname, attrs->Get(i)->attr_val); } } fprintf(fout,"\n"); }// gff3 transcript line if (gffp==pgffTLF) return; bool is_cds_only = (gffp==pgffBoth) ? false : isCDSOnly(); if (showExon) { //print exons for (int i=0;i0) { GVec cds; getCDSegs(cds); //also uses/prepares the CDS phase for each CDS segment for (int i=0;i& segs) { int cdsacc=0; if (CDphase=='1' || CDphase=='2') { cdsacc+= 3-(CDphase-'0'); } else CDphase='0'; if (strand=='-') { //reverse strand for (int i=segs.Count()-1;i>=0;i--) { segs[i]->phase='0'+ (3-cdsacc%3)%3; cdsacc+=segs[i]->end-segs[i]->start+1; } } else { //forward strand for (int i=0;iphase='0'+ (3-cdsacc%3)%3; cdsacc+=segs[i]->end-segs[i]->start+1; } } } void GffObj::getCDSegs(GVec& cds) { //like updateCDSPhase() above, also updates phase for each segment GffExon cdseg(true); cds.Clear(); if (cdss!=NULL) { //copy directly from cdss list for (int i=0;iCount();i++) { cdseg=(*cdss->Get(i)); cdseg.sharedAttrs=true; cds.Add(cdseg); } return; } int cdsacc=0; if (CDphase=='1' || CDphase=='2') { cdsacc+= 3-(CDphase-'0'); } if (strand=='-') { for (int x=exons.Count()-1;x>=0;x--) { uint sgstart=exons[x]->start; uint sgend=exons[x]->end; if (CDendsgend) continue; if (CDstart>=sgstart && CDstart<=sgend) sgstart=CDstart; //cdstart within this segment if (CDend>=sgstart && CDend<=sgend) sgend=CDend; //cdend within this segment cdseg.start=sgstart; cdseg.end=sgend; //cdseg.phase='0'+(cdsacc>0 ? 
(3-cdsacc%3)%3 : 0); cdseg.phase='0'+ (3-cdsacc%3)%3; cdsacc+=sgend-sgstart+1; cdseg.attrs=exons[x]->attrs; cdseg.sharedAttrs=true; cds.Add(cdseg); } //for each exon cds.Reverse(); } // - strand else { // + strand for (int x=0;xstart; uint sgend=exons[x]->end; if (CDendsgend) continue; if (CDstart>=sgstart && CDstart<=sgend) sgstart=CDstart; //seqstart within this segment if (CDend>=sgstart && CDend<=sgend) sgend=CDend; //seqend within this segment cdseg.start=sgstart; cdseg.end=sgend; //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0); cdseg.phase='0' + (3-cdsacc%3)%3 ; cdsacc+=sgend-sgstart+1; cdseg.attrs=exons[x]->attrs; cdseg.sharedAttrs=true; cds.Add(cdseg); } //for each exon } // + strand } //-- transcript match/overlap classification functions char transcriptMatch(GffObj& a, GffObj& b, int& ovlen) { //return '=' if exact exon match, '~' if intron-chain match (or 80% overlap for single-exon) // or 0 otherwise int imax=a.exons.Count()-1; int jmax=b.exons.Count()-1; ovlen=0; if (imax!=jmax) return false; //different number of exons, cannot match if (imax==0) //single-exon mRNAs return (singleExonTMatch(a,b,ovlen)); if ( a.exons[imax]->startend || b.exons[jmax]->startend ) return 0; //intron chains do not overlap at all //check intron overlaps ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1; ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start; for (int i=1;i<=imax;i++) { if (ilen(); if ((a.exons[i-1]->end!=b.exons[i-1]->end) || (a.exons[i]->start!=b.exons[i]->start)) { return 0; //intron mismatch } } //--- full intron chain match: if (a.exons[0]->start==b.exons[0]->start && a.exons.Last()->end==b.exons.Last()->end) return '='; return '~'; } char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen) { //return '=' if exact match, '~' if the overlap is >=80% of the longer sequence length // return 0 if there is no overlap GSeg mseg(m.start, m.end); ovlen=mseg.overlapLen(r.start,r.end); if (ovlen<=0) return 0; // fuzzy matching for single-exon transcripts: // matching = 
overlap is at least 80% of the length of the longer transcript // *OR* in case of reverse containment (reference contained in m) // it's also considered "matching" if the overlap is at least 80% of // the reference len AND at least 70% of the query len if (m.start==r.start && m.end==r.end) return '='; if (m.covlen>r.covlen) { if ( (ovlen >= m.covlen*0.8) || (ovlen >= r.covlen*0.8 && ovlen >= m.covlen* 0.7 ) ) //allow also some fuzzy reverse containment return '~'; } else { if (ovlen >= r.covlen*0.8) return '~'; } return 0; } //formerly in gffcompare char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool strictMatch) { ovlen=0; //total actual exonic overlap if (!m.overlap(r.start,r.end)) return 0; int jmax=r.exons.Count()-1; //int iovlen=0; //total m.exons overlap with ref introns char rcode=0; if (m.exons.Count()==1) { //single-exon transfrag GSeg mseg(m.start, m.end); if (jmax==0) { //also single-exon ref //ovlen=mseg.overlapLen(r.start,r.end); char eqcode=0; if ((eqcode=singleExonTMatch(m, r, ovlen))>0) { if (strictMatch) return eqcode; else return '='; } if (m.covlen= m.covlen*0.8) return 'c'; } // fuzzy containment else if (ovlen >= r.covlen*0.8 ) return 'k'; // fuzzy reverse containment return 'o'; //just plain overlapping } //-- single-exon qry overlaping multi-exon ref //check full pre-mRNA case (all introns retained): code 'm' if (m.start<=r.exons[0]->end && m.end>=r.exons[jmax]->start) return 'm'; for (int j=0;j<=jmax;j++) { //check if it's ~contained by an exon int exovlen=mseg.overlapLen(r.exons[j]); if (exovlen>0) { ovlen+=exovlen; if (m.start>r.exons[j]->start-4 && m.endend+4) { return 'c'; //close enough to be considered contained in this exon } } if (j==jmax) break; //last exon here, no intron to check //check if it fully covers an intron (retained intron) if (m.startend && m.end>r.exons[j+1]->start) return 'n'; //check if it's fully contained by an intron if (m.endstart && m.start>r.exons[j]->end) return 'i'; // check if it's a potential pre-mRNA 
transcript // (if overlaps this intron at least 10 bases) uint introvl=mseg.overlapLen(r.exons[j]->end+1, r.exons[j+1]->start-1); //iovlen+=introvl; if (introvl>=10 && mseg.len()>introvl+10) { rcode='e'; } } //for each ref exon if (rcode>0) return rcode; return 'o'; //plain overlap, uncategorized } //single-exon transfrag //-- multi-exon transfrag -- int imax=m.exons.Count()-1;// imax>0 here if (jmax==0) { //single-exon reference overlap //any exon overlap? GSeg rseg(r.start, r.end); for (int i=0;i<=imax;i++) { //check if it's ~contained by an exon int exovlen=rseg.overlapLen(m.exons[i]); if (exovlen>0) { ovlen+=exovlen; if (r.start>m.exons[i]->start-4 && r.endend+4) { return 'k'; //reference contained in this assembled exon } } if (i==imax) break; if (r.endstart && r.start>m.exons[i]->end) return 'y'; //ref contained in this transfrag intron } return 'o'; } // * check if transfrag contained by a ref intron for (int j=0;jstart && m.start>r.exons[j]->end) return 'i'; } if (m.exons[imax]->startend) { //qry intron chain ends before ref intron chain starts //check if last qry exon plugs the 1st ref intron if (m.exons[imax]->start<=r.exons[0]->end && m.exons[imax]->end>=r.exons[1]->start) return 'n'; return 'o'; //only terminal exons overlap } else if (r.exons[jmax]->startend) { //qry intron chain starts after ref intron chain ends //check if first qry exon plugs the last ref intron if (m.exons[0]->start<=r.exons[jmax-1]->end && m.exons[0]->end>=r.exons[jmax]->start) return 'n'; return 'o'; //only terminal exons overlap } //check intron chain overlap (match, containment, intron retention etc.) 
int i=1; //index of exon to the right of current qry intron int j=1; //index of exon to the right of current ref intron bool intron_conflict=false; //overlapping introns have at least a mismatching splice site //from here on we check all qry introns against ref introns bool junct_match=false; //true if at least a junction match is found bool ichain_match=false; //if there is intron (sub-)chain match, to be updated by any mismatch bool intron_ovl=false; //if any intron overlap is found bool intron_retention=false; //if any ref intron is covered by a qry exon //intron chain (partial) match exon-index boundaries: int imfirst=0; //index of exon after first intron match in query (valid>0) int jmfirst=0; //index of exon after first intron match in reference (valid>0) int imlast=0; //index of exon after last intron match in query int jmlast=0; //index of exon after last intron match in reference //--keep track of the last overlapping introns in both qry and ref: //int q_last_iovl=0; //int r_last_iovl=0; //check for intron matches while (i<=imax && j<=jmax) { uint mstart=m.exons[i-1]->end; //qry intron start-end uint mend=m.exons[i]->start; uint rstart=r.exons[j-1]->end; //ref intron start-end uint rend=r.exons[j]->start; if (rendoverlap(mstart+1, mend-1)) intron_conflict=true; //next ref exon overlaps this qry intron if (!intron_retention && rstart>=m.exons[i-1]->start && rend<=m.exons[i-1]->end) intron_retention=true; //this ref intron is covered by previous qry exons[i-1] if (intron_ovl) ichain_match=false; j++; continue; } //no intron overlap, skipping ref intron if (rstart>mend) { //qry intron ends before ref intron starts //if qry intron overlaps the exon on the left, we have an intron conflict if (!intron_conflict && r.exons[j-1]->overlap(mstart+1, mend-1)) intron_conflict=true; if (!intron_retention && rstart>=m.exons[i]->start && rend<=m.exons[i]->end) intron_retention=true; if (intron_ovl) ichain_match=false; i++; continue; } //no intron overlap, skipping qry 
intron intron_ovl=true; //q_last_iovl=i; //keep track of the last overlapping introns in both qry and ref //r_last_iovl=j; //overlapping introns, test junction matching bool smatch=(mstart==rstart); bool ematch=(mend==rend); if (smatch || ematch) junct_match=true; if (smatch && ematch) { //perfect match for this intron if (jmfirst==0) { ichain_match=true; jmfirst=j; imfirst=i; } if (ichain_match) { imlast=i; jmlast=j; } i++; j++; continue; } //intron overlapping but not fully matching intron_conflict=true; ichain_match=false; if (mend>rend) j++; else i++; } //while checking intron overlaps /*** additional checking needed for intron retention when there is no ichain_match or overlap ? if (!intron_retention && r_last_iovlend; //ref intron start-end uint rend=r.exons[j]->start; if (rendstart) { i++; continue; } if (rstart>m.exons[i]->end) continue; //overlap between ref intron and m.exons[i] if (rstart>=m.exons[i]->start && rend<=m.exons[i]->end) { intron_retention=true; break; } } } ***/ // --- when qry intron chain is contained within ref intron chain // qry terminal exons may poke (overhang) into ref's other introns int l_iovh=0; // overhang of q left boundary beyond the end of ref intron on the left int r_iovh=0; // same type of overhang through the ref intron on the right int qry_intron_poking=0; // --- when ref intron chain is contained within qry intron chain, // terminal exons of ref may poke (overhang) into qry other introns int l_jovh=0; // overhang of q left boundary beyond the end of ref intron to the left int r_jovh=0; // same type of overhang through the ref intron on the right int ref_intron_poking=0; if (ichain_match) { //intron (sub-)chain compatible so far (but there could still be conflicts) if (imfirst==1 && imlast==imax) { // qry full intron chain match if (jmfirst==1 && jmlast==jmax) {//identical intron chains if (strictMatch) return (r.exons[0]->start==m.exons[0]->start && r.exons.Last()->end && m.exons.Last()->end) ? 
'=' : '~'; else return '='; } // -- a partial intron chain match if (jmfirst>1) { //find if m.start falls within any ref intron before jmfirst for (int j=jmfirst-1;j>0;--j) if (m.startstart) { if (m.start>r.exons[j-1]->end) { //m.start within this ref intron l_iovh = r.exons[j]->start - m.start; break; } else { intron_retention=true; ichain_match=false; } } } if (jmlast r.exons[j]->end) { if (m.end < r.exons[j+1]->start) { //m.end within this ref intron r_iovh = m.end - r.exons[j]->end; break; } else { intron_retention=true; ichain_match=false; } } } if (ichain_match && l_iovh<4 && r_iovh<4) return 'c'; qry_intron_poking=GMAX(l_iovh, r_iovh); } else if ((jmfirst==1 && jmlast==jmax)) {//ref intron chain match //check if the reference j-chain is contained in qry i-chain //check for ref ends poking into qry introns if (imfirst>1) { for (int i=imfirst-1;i>0;--i) if (m.exons[i]->start>r.start) { if (r.start>m.exons[i-1]->end) { l_jovh = m.exons[i]->start - r.start; break; } else { ichain_match = false; } } } if (imlast m.exons[i]->end) { if (r.end < m.exons[i+1]->start) { r_jovh = r.end - m.exons[i]->end; break; } else { ichain_match = false; } } } if (ichain_match && l_jovh<4 && r_jovh<4) return 'k'; //reverse containment ref_intron_poking=GMAX(l_jovh, r_jovh); } } //'=', 'c' and 'k' were checked and assigned, check for 'm' and 'n' before falling back to 'j' if (intron_retention) { //ref is boundary contained with qry intron chain ? that's not required for 'm' //GMessage("r_jovh=%d, r_iovh=%d, l_jovh=%d, l_iovh=%d\n", r_jovh, r_iovh, l_jovh, l_iovh); //GMessage("m.start=%d, r.exons[0]->end=%d, m.end=%d, r.exons[jmax]->start=%d\n", // m.start, r.exons[0]->end, m.end, r.exons[jmax]->start); //if (ref_intron_poking>0 && ) //we just need to have no intron poking going on if (!intron_conflict && ref_intron_poking<4 && qry_intron_poking<4) return 'm'; else return 'n'; } if (junct_match) return 'j'; //we could have 'o' or 'y' here //any real exon overlaps? 
ovlen=m.exonOverlapLen(r); if (ovlen>4) return 'o'; return 'y'; //all reference exons are within transfrag introns! }