cassiopee-0.1.13/0000755000004100000410000000000012632711744013606 5ustar www-datawww-datacassiopee-0.1.13/bin/0000755000004100000410000000000012632711744014356 5ustar www-datawww-datacassiopee-0.1.13/bin/cassie.rb0000644000004100000410000000634712632711744016164 0ustar www-datawww-data#!/usr/bin/env ruby require File.join(File.dirname(__FILE__), '../lib/cassiopee') require 'optparse' require 'logger' options = {} optparse = OptionParser.new do|opts| # Set a banner, displayed at the top # of the help screen. opts.banner = "Usage: cassie.rb [options]" options[:verbose] = false opts.on( '-v', '--verbose', 'Output more information' ) do options[:verbose] = true end options[:filter] = nil opts.on( '-f', '--filter FILTER', 'Filter matches between min and max positions ex. 100-150' ) do |filter| options[:filter] = filter end options[:file] = nil opts.on( '-i', '--index FILE', 'File to index' ) do |file| options[:file] = file end options[:fpattern] = nil opts.on( '--fpattern FILE', 'File with pattern' ) do |file| options[:fpattern] = file end options[:pattern] = nil opts.on( '-p', '--pattern PATTERN', 'Search pattern' ) do |file| options[:pattern] = file end options[:store] = nil opts.on( '-s', '--store FILE', 'Store index to file' ) do |file| options[:store] = file end options[:name] = nil opts.on( '-n', '--name NAME', 'name of index, default [crawler]' ) do |name| options[:name] = name end options[:exact] = false opts.on( '-x', '--exact', 'Do exact search (default)' ) do options[:exact] = true end options[:error] = 0 opts.on( '-m', '--hamming ERROR', 'Maximum number of error to search with Hamming distance' ) do |error| options[:error] = error end opts.on( '-e', '--edit ERROR', 'Maximum number of error to search with edit(levenshtein) distance' ) do |error| options[:error] = error * (-1) end opts.on( '-h', '--help', 'Display this screen' ) do puts opts exit end end optparse.parse! if(options[:file]==nil) puts "Error, input file is missing, use -h option for usage" exit elif(options[:verbose]) puts "Input sequence: " << options[:file].to_s end if(options[:fpattern]==nil && options[:pattern]==nil) puts "Error, pattern is missing, use -h option for usage" exit end if(options[:error]==0) options[:exact] = true end crawler = Cassiopee::Crawler.new crawler.setLogLevel(Logger::INFO) if(options[:store]) crawler.use_store = true end if(options[:name]!=nil) crawler.file_suffix = options[:name] end if(options[:filter]!=nil) positions = options[:filter].split('-') crawler.filter_position(positions[0],positions[1]) end # String to index crawler.indexFile(options[:file]) matches = nil if(options[:fpattern]==nil) pattern = options[:pattern] else pattern = '' file = File.new(options[:fpattern], "r") while (line = file.gets) input = line.downcase.chomp pattern << input end file.close if(pattern.length==0) puts "Error pattern file is empty" exit end end if(options[:verbose]) puts "Search pattern " << pattern end if(options[:exact]) puts "Search exact" unless !options[:verbose] matches = crawler.searchExact(pattern) else puts "Search approximate" unless !options[:verbose] matches = crawler.searchApproximate(pattern,options[:errors]) end # Go through matches while((match = crawler.next())!=nil) puts "Match: " << match.inspect end cassiopee-0.1.13/bin/demo-mt.rb0000644000004100000410000000133312632711744016245 0ustar www-datawww-data#!/usr/bin/env ruby $:.unshift '../lib' require 'cassiopee-mt' require 'logger' # Instanciate a new crawler crawler = CassiopeeMt::CrawlerMt.new crawler.setLogLevel(Logger::INFO) crawler.maxthread=3 #crawler.use_store = true # String to index crawler.indexString('iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiisallou salluiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii') # Search pattern in indexed string crawler.searchExact('llo') # Go through matches while((match = crawler.next())!=nil) puts "got an exact match " << match.inspect end crawler.clear() crawler.searchApproximate('llo',1) # Go through matches while((match = crawler.next())!=nil) puts "got an approximate match " << match.inspect end cassiopee-0.1.13/bin/demo.rb0000644000004100000410000000141312632711744015626 0ustar www-datawww-data#!/usr/bin/env ruby $:.unshift '../lib' require 'cassiopee' require 'logger' # Instanciate a new crawler crawler = Cassiopee::Crawler.new #crawler.use_store = true # String to index crawler.indexString('sallou sallu') # Search pattern in indexed string crawler.searchExact('llo') # Search it again, using already loaded indexed data crawler.searchExact('llo') test= "my string" # Extend to use match algorithms test.extend(Cassiopee) test.computeDistance('test',0,0) puts "Hamming: " << test.computeHamming("my strigg",1).to_s puts "Levenshtein: " << test.computeLevenshtein("mystriigg",3).to_s # Approcimate search, edit distance = 1 crawler.searchApproximate("llu",-2) # Go through matches while((match = crawler.next())!=nil) puts "got a match " << match.inspect end cassiopee-0.1.13/tests/0000755000004100000410000000000012632711744014750 5ustar www-datawww-datacassiopee-0.1.13/tests/amb.map0000644000004100000410000000001412632711744016201 0ustar www-datawww-datau=a,c v=c,g cassiopee-0.1.13/tests/test-suite.rb0000644000004100000410000000641212632711744017406 0ustar www-datawww-data$:.unshift '../lib' require 'cassiopee' require 'cassiopee-mt' #require File.join(File.dirname(__FILE__), '../lib/cassiopee') #require File.join(File.dirname(__FILE__), '../lib/cassiopee-mt') require 'logger' require 'test/unit' class TestCrawler < Test::Unit::TestCase def test_exactsearch crawler = Cassiopee::Crawler.new #crawler.setLogLevel(Logger::DEBUG) crawler.indexString('my sample example') matches = crawler.searchExact('ampl') assert_equal(2,matches.length) # Minus 1, because first element is len of match #match = crawler.next() #assert_equal(2,match[2].length-1) end def test_exactsearch2 crawler = Cassiopee::Crawler.new crawler.indexString('my sample example') matches = crawler.searchExact('xample') assert_equal(1,matches.length) end def test_ambiguous crawler = Cassiopee::Crawler.new crawler.loadAmbiguityFile(File.join(File.dirname(__FILE__), 'amb.map')) crawler.indexString('aaaaaaaaaaacgttttttt') matches = crawler.searchExact('aucgt') assert_equal(1,matches.length) end def test_hammingsearch crawler = Cassiopee::Crawler.new crawler.indexString('my sample example') matches = crawler.searchApproximate('ebampl',1) assert_equal(1,matches.length) end def test_levenshteinsearch crawler = Cassiopee::Crawler.new crawler.indexString('my sample example') matches = crawler.searchApproximate('ebampl',-1) assert_equal(1,matches.length) end def test_levenshteinsearch2 crawler = Cassiopee::Crawler.new crawler.indexString('aaaaacgtttttt') matches = crawler.searchApproximate('ac',-2) matches.each do |match| puts match.to_s end assert_equal(1,matches.length) end def test_directmethod crawler = Cassiopee::Crawler.new crawler.method = Cassiopee::Crawler::METHOD_DIRECT crawler.indexString('my sample example') matches = crawler.searchApproximate('ebampl',1) assert_equal(1,matches.length) end def test_suffixmethod crawler = Cassiopee::Crawler.new crawler.method = Cassiopee::Crawler::METHOD_SUFFIX crawler.indexString('my sample example') matches = crawler.searchApproximate('ebampl',1) assert_equal(1,matches.length) end def test_multithreadsearch crawler = CassiopeeMt::CrawlerMt.new crawler.maxthread=3 crawler.indexString('iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimy sample exampleiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii') matches = crawler.searchExact('exam') assert_equal(1,matches.length) end def test_cache crawler = Cassiopee::Crawler.new crawler.indexString('my sample example') matches = crawler.searchApproximate('ebampl',-1) cache = Cassiopee::CrawlerCache.new cache.method = 2 cache.min_position = 0 cache.max_position = 0 cache.errors = 1 cache.saveCache(matches) cache = Cassiopee::CrawlerCache.new cache.method = 2 cache.min_position = 0 cache.max_position = 0 cache.errors = 1 cachematches = cache.loadCache assert_equal(1,cachematches.length) cache = Cassiopee::CrawlerCache.new cache.method = 2 cache.min_position = 0 cache.max_position = 0 cache.errors = 2 cachematches = cache.loadCache assert_equal(0,cachematches.length) end end cassiopee-0.1.13/Changelog0000644000004100000410000000164312632711744015424 0ustar www-datawww-datav0.1.13: fix typo v0.1.12: fix match not found when match includes last character v0.1.11: fix match to text 1.2 v0.1.9: move demos to bin, to be dir compliant v0.1.8: remove rubygems dependency in code. v0.1.7: Fix in ambiguity management v0.1.6: Fix ambiguity and cache support in CassiopeeMT v0.1.5 : add CrawlerCache for basic cache management (previous result only), several fixes v0.1.4 : fix 0.1.3 error on index load, add filter_position management in mt v0.1.3 : 09/11 Olivier Sallou add CrawlerMT in cassiopee-mt for multi thread support to speed up the search v0.1.2 : 09/11 Olivier Sallou add possibility to reload an "index" instead of using index method again fix comment mngt (comments attribute) add filter methods add to_pos method to display results per position add optimal methods add ambiguity support v0.1.1 : fix #1, add filter option 08/09/11 Olivier Sallou v0.1.0 : First version Olivier Sallou cassiopee-0.1.13/README0000644000004100000410000000021312632711744014462 0ustar www-datawww-dataSearch an exact or approximate word (hamming or edit distance) in a string. Support index cache with incremental update for later searches cassiopee-0.1.13/lib/0000755000004100000410000000000012632711744014354 5ustar www-datawww-datacassiopee-0.1.13/lib/cassiopee-mt.rb0000644000004100000410000000550112632711744017273 0ustar www-datawww-datarequire 'digest/md5' require 'logger' require 'zlib' require File.join(File.dirname(__FILE__), 'cassiopee') include Cassiopee # Module managing multi threads to search in strings, extending Cassiopee module CassiopeeMt # Multi threaded search using a Crawler per thread # Filtering is used to split the input data according to maxthread # Matches of each thread are merge to matches of CrawlerMT class CrawlerMt < Crawler MINSEQSIZE=10 # Max number fo threads to use attr_accessor :maxthread @th = [] def initialize super @th = [] @matches = Array.new end def setParams(crawler,threadId) crawler.setLogLevel($log.level) crawler.file_suffix = @file_suffix crawler.loadIndex() crawler.method = @method crawler.comments = @comments crawler.useAmbiguity = @useAmbiguity crawler.ambiguous = @ambiguous crawler.useCache = @useCache #crawler.file_suffix = @file_suffix+"."+threadId.to_s end def searchExact(pattern) len = @sequence.length if(@min_position>0) min = @min_position else min = 0 end if(@max_position>0) max = @max_position else max= @sequence.length end len = max - min if(len0) min = @min_position else min = 0 end if(@max_position>0) max = @max_position else max = @sequence.length end len = max - min if(lenhamming.to_i) return -1 end end end return nberr end # Calculate number of substitution between string and pattern # Extend a String # Return -1 if max is reached def computeHamming(pattern,hamming) nberr = 0 (0..(self.length-1)).each do |c| if(pattern[c] != self[c]) nberr = nberr+1 if(nberr>hamming.to_i) return -1 end end end return nberr end # Calculate the edit distance between string and pattern # Extend a String # Return -1 if max is reached def computeLevenshtein(pattern,edit) distance = Text::Levenshtein.distance(self, pattern) if(distance>edit) return -1 end return distance end # Compute Levenshtein distance but using a mapping matrix of alphabet ambiguity # Code comes from Text gem, Text::Levenshtein.distance, adapted for ambiguity comparison def computeLevenshteinAmbiguous(pattern, edit, ambiguous) prepare = if "ruby".respond_to?(:encoding) lambda { |str| str.encode(Encoding::UTF_8).unpack("U*") } else rule = $KCODE.match(/^U/i) ? "U*" : "C*" lambda { |str| str.unpack(rule) } end s, t = [self, pattern].map(&prepare) n = s.length m = t.length return m if (0 == n) return n if (0 == m) d = (0..m).to_a x = nil (0...n).each do |i| e = i+1 (0...m).each do |j| cost = (isAmbiguousEqual(s[i],t[j],ambiguous)) ? 0 : 1 x = [ d[j+1] + 1, # insertion e + 1, # deletion d[j] + cost # substitution ].min d[j] = e e = x end d[m] = x end if(x>edit) return -1 end return x end # checks if 2 chars are equal with ambiguity rules # * ambigous is a Hash of char/Array of char mapping def isAmbiguousEqual(a,b,ambiguous) if(ambiguous==nil || (ambiguous[a.chr]==nil && ambiguous[b.chr]==nil )) if(a==b) return true else return false end end if(a==b || (ambiguous[a.chr]!=nil && ambiguous[a.chr].index(b.chr)!=nil) || (ambiguous[b.chr]!=nil && ambiguous[b.chr].index(a.chr)!=nil)) return true else return false end end # Class maning cache of results class CrawlerCache FILE_CACHE_EXT = ".sfc" # Suffix files name/path attr_accessor :file_suffix # search exact: 0 # hamming : 1 # edit : 2 attr_accessor :method # filter attr_accessor :min_position attr_accessor :max_position # max errors attr_accessor :errors attr_accessor :cache $log = Logger.new(STDOUT) $log.level = Logger::INFO def setLogger(userlogger) $log = userlogger end def initialize @file_suffix = "crawler" end # Loads cache from file def loadCache return Array.new unless File.exists?(@file_suffix+FILE_CACHE_EXT) begin file = Zlib::GzipReader.open(@file_suffix+FILE_CACHE_EXT) rescue Zlib::GzipFile::Error file = File.open(@file_suffix+FILE_CACHE_EXT, 'r') ensure obj = Marshal.load file.read file.close if(method!=obj.method || min_positionobj.max_position || errors>obj.errors) return Array.new end return filterCache(obj) end end # Save self to cache, with cache object set from obj def saveCache(obj) self.cache = obj marshal_dump = Marshal.dump(self) sfxpos = File.new(@file_suffix+FILE_CACHE_EXT,'w') sfxpos = Zlib::GzipWriter.new(sfxpos) sfxpos.write marshal_dump sfxpos.close end def clearCache File.delete(@file_suffix+FILE_CACHE_EXT) unless !File.exists?(@file_suffix+FILE_CACHE_EXT) end private # filter cache according to settings # obj: cache object def filterCache(cacheobject) realmatches = Array.new if(cacheobject==nil) return realmatches end cacheobject.cache.each do |obj| if(obj[1]>self.errors) next end realpos = Array.new realpos << obj[2][0] (1..obj[2].length-1).each do |i| curpos= obj[2][i] if((curpos<=max_position || max_position==0) && curpos>=min_position) realpos << curpos end end if(realpos.length<=1) next end realmatches << Array[obj[0],obj[1],realpos] end return realmatches end end # Base class to index and search through a string class Crawler # Use alphabet ambiguity (dna/rna) in search, automatically set with loadAmbiguityFile attr_accessor :useAmbiguity # Suffix files name/path attr_accessor :file_suffix # Max number fo threads to use (not yet used) attr_accessor :maxthread # Use persistent suffix file ? attr_accessor :use_store # Array of comment characters to skip lines in input sequence file attr_accessor :comments # Manage basic cache to store previous match attr_accessor :useCache # Ambiguity map (Hash) attr_accessor :ambiguous # Method for search FORCE or SUFFIX # * SUFFIX loads all suffixes and search through them afterwards, interesting for multiple searches (suffixes are reused) # * FORCE checks matches while crossing the suffixes. Does not keep parsed data for later search # FORCE method does not yet support optimal filters attr_accessor :method METHOD_DIRECT = 0 METHOD_SUFFIX = 1 @min_position = 0 @max_position = 0 # Previous position filter @prev_min_position = 0 @prev_max_position = 0 @pattern = nil FILE_SUFFIX_EXT = ".sfx" FILE_SUFFIX_POS = ".sfp" SUFFIXLEN = 'suffix_length' $maxthread = 1 @cache = nil $log = Logger.new(STDOUT) $log.level = Logger::INFO def initialize @useAmbiguity = false @ambiguous = nil @useCache = false @file_suffix = "crawler" @method = 0 @prev_min_position = 0 @prev_max_position = 0 @suffix = nil @suffixmd5 = nil @position = 0 @suffixes = Hash.new @matches = Array.new @curmatch = 0 @use_store = false @sequence = nil @comments = Array["#"] @cache = Cassiopee::CrawlerCache.new end def filterLength filterOptimal(0) end def filterCost filterOptimal(1) end # Clear suffixes in memory # If using use_store, clear the store too def clear @suffixes = Hash.new @matches.clear @pattern = nil @prev_max_position = 0 @prev_min_position = 0 @cache.clearCache() File.delete(@file_suffix+FILE_SUFFIX_POS) unless !File.exists?(@file_suffix+FILE_SUFFIX_POS) end # Set Logger level def setLogLevel(level) $log.level = level end # Index an input file # Clear existing indexes def indexFile(f) # Parse file, map letters to reduced alphabet # Later on, use binary map instead of ascii map # Take all suffix, order by length, link to position map on other file # Store md5 for easier compare? + 20 bytes per suffix @sequence = readSequence(f) clear() @min_position = 0 @max_position = 0 end # Index an input string # Clear existing indexes def indexString(s) @sequence = s File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data| data.puts(@sequence) end clear() @min_position = 0 @max_position = 0 end # Load ambiguity rules from a file # File format should be: # * A=B,C # D=E,F # ... def loadAmbiguityFile(f) if(!File.exists?(f)) $log.error("File "<< f << "does not exists") exit(1) end @ambiguous = Hash.new file = File.new(f, "r") while (line = file.gets) definition = line.downcase.chomp ambdef = definition.split('=') ambequal = ambdef[1].split(',') @ambiguous[ambdef[0]] = ambequal end @useAmbiguity = true $log.debug("loaded ambiguity rules: " << @ambiguous.inspect()) file.close end # Load sequence from a previous index command def loadIndex seq = '' begin file = File.new(@file_suffix+FILE_SUFFIX_EXT, "r") while (line = file.gets) input = line.downcase.chomp seq << input end file.close rescue => err $log.error("Exception: #{err}") exit() end @sequence = seq clear() @min_position = 0 @max_position = 0 end # Filter matches to be between min and max start position # If not using use_store, search speed is improved but existing indexes are cleared # If max=0, then max is string length # Must be called after index creation or load def filter_position(min,max) if(!use_store) clear() end @prev_min_position = @min_position @prev_max_position = @max_position @min_position = min @max_position = max end # Search exact match def searchExact(s) if(@useAmbiguity) return searchApproximate(s,0) end s = s.downcase updateCache(0,0) @matches = @cache.loadCache() if(@matches.length>0) return cache?(@matches) end #@matches.clear @pattern = Digest::MD5.hexdigest(s) parseSuffixes(@sequence,s.length,s.length,0,s) return @matches unless(method == METHOD_SUFFIX) # Search required length, compare (compare md5?) # MD5 = 128 bits, easier to compare for large strings matchsize = @pattern.length @suffixes.each do |md5val,posArray| if (isMatchEqual?(md5val)) match = Array[md5val, 0, posArray] $log.debug "Match: " << match.inspect @matches << match end end return cache?(@matches) end # Search an approximate string # # * support insertion, deletion, substitution # * If edit > 0, use Hamming # * Else use Levenshtein def searchApproximate(s,edit) if(edit==0 && !@useAmbiguity) return searchExact(s) end allowederrors = edit if(edit>=0) useHamming = true minmatchsize = s.length maxmatchsize = s.length updateCache(1,edit) @matches = @cache.loadCache() else useHamming = false edit = edit * (-1) minmatchsize = s.length - edit maxmatchsize = s.length + edit updateCache(2,edit) @matches = @cache.loadCache() end if(@matches.length>0) return @matches end s = s.downcase #@matches.clear @pattern = Digest::MD5.hexdigest(s) parseSuffixes(@sequence,minmatchsize,maxmatchsize,allowederrors,s) return cache?(@matches) unless(method == METHOD_SUFFIX) @suffixes.each do |md5val,posArray| if(md5val == SUFFIXLEN) next end if (md5val == @pattern) filteredPosArray = filter(posArray) match = Array[md5val, 0, filteredPosArray] $log.debug "Match: " << match.inspect @matches << match else if(posArray[0]>= minmatchsize && posArray[0] <= maxmatchsize) # Get string seq = extractSuffix(posArray[1],posArray[0]) errors = isApproximateEqual?(seq,s,useHamming,edit) if(errors>=0) filteredPosArray = filter(posArray) match = Array[md5val, errors, filteredPosArray] $log.debug "Match: " << match.inspect @matches << match end end end end return cache?(@matches) end # Filter the array of positions with defined position filter def filter(posArray) $log.debug("filter the position with " << @min_position.to_s << " and " << @max_position.to_s) if(@min_position==0 && @max_position==0) return posArray end filteredArray = Array.new i = 0 posArray.each do |pos| if(i==0) # First elt of array is match length filteredArray << pos end if(i>0 && pos>=@min_position && pos<=@max_position) filteredArray << pos end i +=1 end return filteredArray end # Extract un suffix from suffix file based on md5 match def extractSuffix(start,len) sequence = '' begin file = File.new(@file_suffix+FILE_SUFFIX_EXT, "r") file.pos = start sequence = file.read(len) file.close rescue => err puts "Exception: #{err}" return nil end return sequence end # Iterates over matches def next if(@curmatch<@matches.length) @curmatch = @curmatch + 1 return @matches[@curmatch-1] else @curmatch = 0 return nil end end def to_pos positions = Hash.new @matches.each do |match| # match = Array[md5val, errors, posArray] i=0 len = 0 match[2].each do |pos| if(i==0) len = pos else if(positions.has_key?(pos)) posmatch = positions[pos] posmatch << Array[len,match[1]] else posmatch = Array.new posmatch << Array[len,match[1]] positions[pos] = posmatch end end i += 1 end end return positions.sort end def to_s puts '{ matches: "' << @matches.length << '" }' end private # If cache is used, store results for later retrieval, else return matches directly def cache?(results) if(@useCache) @cache.saveCache(results) end return results end # Update cache object with current object parameters # * method: 0 -> exact, 1 -> hamming, 2 -> edit def updateCache(method,errors) @cache.file_suffix = @file_suffix @cache.min_position = @min_position @cache.max_position = @max_position @cache.method = method @cache.errors = errors end # check if md5 is equal to pattern def isMatchEqual?(s) if(@pattern == s) return true end return false end # check if string is approximatly equal to pattern # s: string to compare # pattern: base pattern used # useHamming: use Hamming or edit distance # edit : allowed errors def isApproximateEqual?(s,pattern,useHamming,edit) errors = -1 s.extend(Cassiopee) if(useHamming) if(@useAmbiguity && @ambiguous!=nil) errors = s.computeHammingAmbiguous(pattern,edit,@ambiguous) else errors = s.computeHamming(pattern,edit) end else if(@useAmbiguity && @ambiguous!=nil) errors = s.computeLevenshteinAmbiguous(pattern,edit,@ambigous) else errors = s.computeLevenshtein(pattern,edit) end end end # Parse input string # # * creates a suffix file # * creates a suffix position file def parseSuffixes(s,minlen,maxlen,edit=0,pat=nil) # Controls if(minlen<=0) minlen = 1 end if(maxlen>@sequence.length) maxlen = @sequence.length end if(!use_store) minpos = @min_position if(@max_position==0) maxpos = @sequence.length else maxpos = @max_position end else minpos = 0 maxpos = @sequence.length - minlen end suffixlen = nil $log.info('Start indexing') loaded = false # Hash in memory already contain suffixes for searched lengths if(@suffixes != nil && !@suffixes.empty?) suffixlen = @suffixes[SUFFIXLEN] if(suffixlen!=nil && !suffixlen.empty?) loaded = true (maxlen).downto(minlen) do |len| if(suffixlen.index(len)==nil) loaded = false break end end end end if(@use_store && loaded) $log.debug('already in memory, skip file loading') end # If not already in memory if(@use_store && !loaded) @suffixes = loadSuffixes(@file_suffix+FILE_SUFFIX_POS) suffixlen = @suffixes[SUFFIXLEN] end nbSuffix = 0 changed = false # Load suffix between maxlen and minlen (maxlen).downto(minlen) do |i| $log.debug('parse for length ' << i.to_s) if(suffixlen!=nil && suffixlen.index(i)!=nil) $log.debug('length '<@sequence.length) next end @suffix = s[j,i] @suffixmd5 = Digest::MD5.hexdigest(@suffix) @position = j progress = (@position * 100).div(@sequence.length) if((progress % 10) == 0 && progress > prev_progress) prev_progress = progress $log.debug("progress: " << progress.to_s) end if(method==METHOD_DIRECT) if(edit==0 && !@useAmbiguity) if(isMatchEqual?(@suffixmd5)) errors = 0 else errors = -1 end else if(edit>=0) useHamming = true allowederrors = edit else useHamming = false allowederrors = edit * (-1) end errors = isApproximateEqual?(@suffix,pat,useHamming,allowederrors) end if(errors>=0) match = Array[@suffixmd5, errors, Array[i,j]] $log.debug "Match: " << match.inspect @matches << match end else nbSuffix += addSuffix(@suffixmd5, @position,i) end end $log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s) unless method==METHOD_DIRECT end if(@use_store && changed) $log.info("Store suffixes") marshal_dump = Marshal.dump(@suffixes) sfxpos = File.new(@file_suffix+FILE_SUFFIX_POS,'w') sfxpos = Zlib::GzipWriter.new(sfxpos) sfxpos.write marshal_dump sfxpos.close end $log.info('End of indexing') end # Add a suffix in Hashmap def addSuffix(md5val,position,len) if(@suffixes.has_key?(md5val)) # Add position @suffixes[md5val] << position else # Add position, write new suffix # First elt is size of elt @suffixes[md5val] = Array[len, position] if(@suffixes.has_key?(SUFFIXLEN)) @suffixes[SUFFIXLEN] << len else @suffixes[SUFFIXLEN] = Array[len] end end return 1 end # read input string, and concat content def readSequence(s) $log.debug('read input sequence') counter = 1 sequence = '' begin file = File.new(s, "r") File.delete(@file_suffix+FILE_SUFFIX_EXT) unless !File.exists?(@file_suffix+FILE_SUFFIX_EXT) File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data| while (line = file.gets) counter = counter + 1 input = line.downcase.chomp skip = false comments.each do |c| if(input[0] == c[0]) # Line start with a comment char, skip it $log.debug("skip line") skip = true break end end if(!skip) sequence << input data.puts input end end end file.close rescue => err puts "Exception: #{err}" err end $log.debug('data file created') return sequence end # Load suffix position file in memory def loadSuffixes(file_name) return Hash.new unless File.exists?(@file_suffix+FILE_SUFFIX_POS) begin file = Zlib::GzipReader.open(file_name) rescue Zlib::GzipFile::Error file = File.open(file_name, 'r') ensure obj = Marshal.load file.read file.close return obj end end # Filter @matches to keep only the longest or the error less matches for a same start position def filterOptimal(type) positions = Hash.new @matches.each do |match| # match = Array[md5val, errors, posArray] i=0 len = 0 match[2].each do |pos| if(i==0) len = pos else if(positions.has_key?(pos)) posmatch = positions[pos] posmatch << Array[len,match[1],match[0]] #positions[pos] << posmatch else posmatch = Array.new posmatch << Array[len,match[1],match[0]] positions[pos] = posmatch end end i += 1 end end matchtoremove = Array.new positions.each do |pos,posmatch| optimal = nil match = nil count = 0 newoptimal = nil newmatch = nil (0..posmatch.length-1).each do |i| solution = posmatch[i] if(i==0) if(type==0) # length optimal = solution[0] else # cost optimal = solution[1] end match = solution[2].to_s #count += 1 next end newmatch = solution[2].to_s if(type==0) # length newoptimal = solution[0] if(newoptimal.to_i>optimal.to_i) optimal = newoptimal matchtoremove << match match = newmatch else matchtoremove << newmatch end else # cost newoptimal = solution[1] if(newoptimal=' - !ruby/object:Gem::Version version: 1.2.0 type: :runtime prerelease: false version_requirements: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: 1.2.0 - !ruby/object:Gem::Dependency name: rake requirement: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' - !ruby/object:Gem::Dependency name: rspec requirement: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' description: Cassiopee index one String and provide methods to search exact match or approximate matches with Hamming and/or edit distance. email: olivier.sallou@gmail.com executables: [] extensions: [] extra_rdoc_files: [] files: - README - Changelog - LICENSE - bin/demo.rb - bin/demo-mt.rb - lib/cassiopee.rb - lib/cassiopee-mt.rb - bin/cassie.rb - tests/test-suite.rb - tests/amb.map homepage: https://github.com/osallou/cassiopee licenses: - LGPL-3 post_install_message: rdoc_options: [] require_paths: - lib required_ruby_version: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' required_rubygems_version: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' requirements: [] rubyforge_project: rubygems_version: 1.8.25 signing_key: specification_version: 3 summary: Cassiopee index strings and provide exact or approximate search. test_files: - tests/test-suite.rb cassiopee-0.1.13/LICENSE0000644000004100000410000000004512632711744014612 0ustar www-datawww-dataSoftware is distributed under LGPL v3