tRNAscan-SE-2.0/0000755000543100007160000000000013210665317012636 5ustar pchanlowelabtRNAscan-SE-2.0/config.h.in0000644000543100007160000000673113100451013014650 0ustar pchanlowelab/* config.h.in. Generated from configure.ac by autoheader. */ /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP systems. This function is required for `alloca.c' support on those systems. */ #undef CRAY_STACKSEG_END /* Define to 1 if using `alloca.c'. */ #undef C_ALLOCA /* Define to 1 if you have `alloca', as a function or macro. */ #undef HAVE_ALLOCA /* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix). */ #undef HAVE_ALLOCA_H /* Define to 1 if you have the `bzero' function. */ #undef HAVE_BZERO /* Define to 1 if you have the <inttypes.h> header file. */ #undef HAVE_INTTYPES_H /* Define to 1 if you have the <limits.h> header file. */ #undef HAVE_LIMITS_H /* Define to 1 if your system has a GNU libc compatible `malloc' function, and to 0 otherwise. */ #undef HAVE_MALLOC /* Define to 1 if you have the <memory.h> header file. */ #undef HAVE_MEMORY_H /* Define to 1 if you have the `memset' function. */ #undef HAVE_MEMSET /* Define to 1 if your system has a GNU libc compatible `realloc' function, and to 0 otherwise. */ #undef HAVE_REALLOC /* Define to 1 if you have the `regcomp' function. */ #undef HAVE_REGCOMP /* Define to 1 if you have the `re_comp' function. */ #undef HAVE_RE_COMP /* Define to 1 if you have the `sqrt' function. */ #undef HAVE_SQRT /* Define to 1 if you have the <stddef.h> header file. */ #undef HAVE_STDDEF_H /* Define to 1 if you have the <stdint.h> header file. */ #undef HAVE_STDINT_H /* Define to 1 if you have the <stdlib.h> header file. */ #undef HAVE_STDLIB_H /* Define to 1 if you have the `strcasecmp' function. */ #undef HAVE_STRCASECMP /* Define to 1 if you have the `strchr' function. */ #undef HAVE_STRCHR /* Define to 1 if you have the <strings.h> header file. */ #undef HAVE_STRINGS_H /* Define to 1 if you have the <string.h> header file. */ #undef HAVE_STRING_H /* Define to 1 if you have the `strstr' function. 
*/ #undef HAVE_STRSTR /* Define to 1 if you have the <sys/stat.h> header file. */ #undef HAVE_SYS_STAT_H /* Define to 1 if you have the <sys/types.h> header file. */ #undef HAVE_SYS_TYPES_H /* Define to 1 if you have the <unistd.h> header file. */ #undef HAVE_UNISTD_H /* Define to 1 if the system has the type `_Bool'. */ #undef HAVE__BOOL /* Name of package */ #undef PACKAGE /* Define to the address where bug reports for this package should be sent. */ #undef PACKAGE_BUGREPORT /* Define to the full name of this package. */ #undef PACKAGE_NAME /* Define to the full name and version of this package. */ #undef PACKAGE_STRING /* Define to the one symbol short name of this package. */ #undef PACKAGE_TARNAME /* Define to the home page for this package. */ #undef PACKAGE_URL /* Define to the version of this package. */ #undef PACKAGE_VERSION /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be automatically deduced at runtime. STACK_DIRECTION > 0 => grows toward higher addresses STACK_DIRECTION < 0 => grows toward lower addresses STACK_DIRECTION = 0 => direction of growth unknown */ #undef STACK_DIRECTION /* Define to 1 if you have the ANSI C header files. */ #undef STDC_HEADERS /* Version number of package */ #undef VERSION /* Define to rpl_malloc if the replacement function should be used. */ #undef malloc /* Define to rpl_realloc if the replacement function should be used. */ #undef realloc /* Define to `unsigned int' if <sys/types.h> does not define. */ #undef size_t tRNAscan-SE-2.0/EukHighConfidenceFilter.in0000644000543100007160000006723613276407662017665 0ustar pchanlowelab#! 
@PERL@ -w
#
# --------------------------------------------------------------------
# EukHighConfidenceFilter v1.0
#
# Annotate eukaryotic tRNA high confidence set from tRNAscan-SE 2.0 predictions
#
# Copyright (C) 2017 Patricia Chan and Todd Lowe
#
# Baskin School of Engineering, University of California, Santa Cruz
# lowe@soe.ucsc.edu
# http://lowelab.ucsc.edu/
# --------------------------------------------------------------------
#
# Processing order (see the main section below):
#   1. set_options / set_file_names  - parse the command line, derive paths
#   2. filtering                     - pseudogene, secondary and tertiary filters
#   3. print_results                 - tagged .out file, .ss file, summary log

use strict;
use lib "@libdir@/tRNAscan-SE";
use Getopt::Long;
use tRNAscanSE::tRNA;
use tRNAscanSE::ArraytRNA;

# Placeholder for an empty cell in %ac_list. Any value that can never be a
# real anticodon works; it is only compared against, never printed.
use constant AC_BLANK => '&nbsp';

# Width of one "%5s: %-6s" anticodon cell in the summary table.
use constant AC_CELL_WIDTH => 13;

# Isotypes in the order they are listed in the summary file.
our @isotypes = ('Ala', 'Gly', 'Pro', 'Thr', 'Val', 'Ser', 'Arg', 'Leu', 'Phe','Asn', 'Lys', 'Asp', 'Glu', 'His', 'Gln', 'Ile', 'Met', 'Tyr', 'Sup', 'Cys', 'Trp', 'SeC');

# Anticodons per isotype, laid out as table rows for the summary file.
# Rows are organised by first anticodon base (A, G, C, T); AC_BLANK keeps
# the remaining anticodons in their column.
our %ac_list = (
    'Ala' => [qw/AGC GGC CGC TGC/],
    'Gly' => [qw/ACC GCC CCC TCC/],
    'Pro' => [qw/AGG GGG CGG TGG/],
    'Thr' => [qw/AGT GGT CGT TGT/],
    'Val' => [qw/AAC GAC CAC TAC/],
    'Ser' => [qw/AGA GGA CGA TGA ACT GCT/, AC_BLANK, AC_BLANK],
    'Arg' => [qw/ACG GCG CCG TCG/, AC_BLANK, AC_BLANK, qw/CCT TCT/],
    'Leu' => [qw/AAG GAG CAG TAG/, AC_BLANK, AC_BLANK, qw/CAA TAA/],
    'Phe' => [qw/AAA GAA/, AC_BLANK, AC_BLANK],
    'Asn' => [qw/ATT GTT/, AC_BLANK, AC_BLANK],
    'Lys' => [AC_BLANK, AC_BLANK, qw/CTT TTT/],
    'Asp' => [qw/ATC GTC/, AC_BLANK, AC_BLANK],
    'Glu' => [AC_BLANK, AC_BLANK, qw/CTC TTC/],
    'His' => [qw/ATG GTG/, AC_BLANK, AC_BLANK],
    'Gln' => [AC_BLANK, AC_BLANK, qw/CTG TTG/],
    'Tyr' => [qw/ATA GTA/, AC_BLANK, AC_BLANK],
    'Sup' => [AC_BLANK, AC_BLANK, qw/CTA TTA/],
    'Ile' => [qw/AAT GAT/, AC_BLANK, qw/TAT/],
    'Met' => [AC_BLANK, AC_BLANK, qw/CAT/, AC_BLANK],
    'Cys' => [qw/ACA GCA/, AC_BLANK, AC_BLANK],
    'Trp' => [AC_BLANK, AC_BLANK, qw/CCA/, AC_BLANK],
    'SeC' => [AC_BLANK, AC_BLANK, AC_BLANK, qw/TCA/],
);

# Anticodon -> isotype for every anticodon this script knows about.
our %aa_list = (
    'AGC'=>'Ala', 'GGC'=>'Ala', 'CGC'=>'Ala', 'TGC'=>'Ala',
    'ACC'=>'Gly', 'GCC'=>'Gly', 'CCC'=>'Gly', 'TCC'=>'Gly',
    'AGG'=>'Pro', 'GGG'=>'Pro', 'CGG'=>'Pro', 'TGG'=>'Pro',
    'AGT'=>'Thr', 'GGT'=>'Thr', 'CGT'=>'Thr', 'TGT'=>'Thr',
    'AAC'=>'Val', 'GAC'=>'Val', 'CAC'=>'Val', 'TAC'=>'Val',
    'AGA'=>'Ser', 'GGA'=>'Ser', 'CGA'=>'Ser', 'TGA'=>'Ser', 'ACT'=>'Ser', 'GCT'=>'Ser',
    'ACG'=>'Arg', 'GCG'=>'Arg', 'CCG'=>'Arg', 'TCG'=>'Arg', 'CCT'=>'Arg', 'TCT'=>'Arg',
    'AAG'=>'Leu', 'GAG'=>'Leu', 'CAG'=>'Leu', 'TAG'=>'Leu', 'CAA'=>'Leu', 'TAA'=>'Leu',
    'AAA'=>'Phe', 'GAA'=>'Phe',
    'ATT'=>'Asn', 'GTT'=>'Asn',
    'CTT'=>'Lys', 'TTT'=>'Lys',
    'ATC'=>'Asp', 'GTC'=>'Asp',
    'CTC'=>'Glu', 'TTC'=>'Glu',
    'ATG'=>'His', 'GTG'=>'His',
    'CTG'=>'Gln', 'TTG'=>'Gln',
    'ATA'=>'Tyr', 'GTA'=>'Tyr',
    'CTA'=>'Sup', 'TTA'=>'Sup',
    'AAT'=>'Ile', 'GAT'=>'Ile', 'TAT'=>'Ile',
    'CAT'=>'Met',
    'ACA'=>'Cys', 'GCA'=>'Cys',
    'CCA'=>'Trp',
    'TCA'=>'SeC',
);

# Subset of %aa_list that euk_anticodon_filter accepts without further
# checks; anticodons missing here are tagged "unexpected anticodon" unless
# they outnumber their A<->G first-base counterpart.
our %euk_aa_list = (
    'AGC'=>'Ala', 'CGC'=>'Ala', 'TGC'=>'Ala',
    'GCC'=>'Gly', 'CCC'=>'Gly', 'TCC'=>'Gly',
    'AGG'=>'Pro', 'CGG'=>'Pro', 'TGG'=>'Pro',
    'AGT'=>'Thr', 'CGT'=>'Thr', 'TGT'=>'Thr',
    'AAC'=>'Val', 'CAC'=>'Val', 'TAC'=>'Val',
    'AGA'=>'Ser', 'CGA'=>'Ser', 'TGA'=>'Ser', 'GCT'=>'Ser',
    'ACG'=>'Arg', 'CCG'=>'Arg', 'TCG'=>'Arg', 'CCT'=>'Arg', 'TCT'=>'Arg',
    'AAG'=>'Leu', 'CAG'=>'Leu', 'TAG'=>'Leu', 'CAA'=>'Leu', 'TAA'=>'Leu',
    'GAA'=>'Phe',
    'GTT'=>'Asn',
    'CTT'=>'Lys', 'TTT'=>'Lys',
    'GTC'=>'Asp',
    'CTC'=>'Glu', 'TTC'=>'Glu',
    'GTG'=>'His',
    'CTG'=>'Gln', 'TTG'=>'Gln',
    'GTA'=>'Tyr',
    'CTA'=>'Sup', 'TTA'=>'Sup',
    'AAT'=>'Ile', 'GAT'=>'Ile', 'TAT'=>'Ile',
    'CAT'=>'Met',
    'GCA'=>'Cys',
    'CCA'=>'Trp',
    'TCA'=>'SeC',
);

# These must stay package variables: GetOptions is called without
# destination references and therefore stores into $opt_<name>.
our ($opt_result, $opt_ss, $opt_output, $opt_prefix, $opt_remove, $opt_cmscore1, $opt_ssscore1, $opt_isoscore1, $opt_isoscore2, $opt_isomaxscore2, $opt_help);

# An anticodon is only subject to tertiary filtering while more than this
# many unfiltered tRNAs carry it.
our $ANTICODON_COUNT_CUTOFF = 40;

our %file_names = ();                               # logical name -> path
our $tRNAs = tRNAscanSE::ArraytRNA->new();          # hits that passed the pseudogene filter
our %tRNA_counts = ();                              # counters reported in the summary

set_options();
set_file_names();
my ($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count) = filtering();
print_results($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count);

exit;

# Set option defaults, parse the command line and die with the usage text
# when help is requested, a required option is missing, or a score cutoff
# is below -1 (-1 itself means "do not use this cutoff").
sub set_options {
    $opt_result = "";
    $opt_ss = "";
    $opt_output = "";
    $opt_prefix = "";
    $opt_remove = 0;
    $opt_cmscore1 = 50;
    $opt_ssscore1 = 10;
    $opt_isoscore1 = 70;
    $opt_isoscore2 = 70;
    $opt_isomaxscore2 = 95;

    Getopt::Long::GetOptions("result|i=s", "ss|s=s", "output|o=s", "prefix|p=s", "remove|r",
                             "cmscore1|c1=f", "ssscore1|m1=f", "isoscore1|e1=f", "isoscore2|e2=f",
                             "isomaxscore2|x=f", "help|h");

    if ($opt_help || $opt_result eq "" || $opt_ss eq "" || $opt_output eq "" || $opt_prefix eq "" ||
        $opt_cmscore1 < -1 || $opt_ssscore1 < -1 || $opt_isoscore1 < -1 || $opt_isoscore2 < -1 || $opt_isomaxscore2 < -1) {
        die "Usage: EukHighConfidenceFilter [options]\n",
            "Options\n",
            "--result -i tRNAscan-SE output file used as input\n",
            "--ss -s tRNAscan-SE secondary structure file used as input\n",
            "--output -o Directory where output files will be written\n",
            "--prefix -p Prefix for output file name\n",
            "--remove -r Remove filtered tRNA hits (default: filtered tRNA hits are only tagged)\n",
            "--cmscore1 -c1 Domain-specific model score cutoff for secondary filtering (default = 50; -1 if not used for filtering)\n",
            "--ssscore1 -m1 Secondary structure score cutoff for secondary filtering (default = 10; -1 if not used for filtering)\n",
            "--isoscore1 -e1 Isotype-specific model score cutoff for secondary filtering (default = 70; -1 if not used for filtering)\n",
            "--isoscore2 -e2 Isotype-specific model starting score cutoff for tertiary filtering (default = 70; -1 if not used for filtering)\n",
            "--isomaxscore2 -x Maximum isotype-specific model score cutoff for tertiary filtering (default = 95)\n",
            "--help -h Print this help\n\n";
    }
}

# Create the output directory and record every input/output path in
# %file_names.
sub set_file_names {
    # List form keeps the user-supplied directory name away from the shell.
    system("mkdir", "-p", $opt_output) == 0
        or die "Fail to create output directory $opt_output\n";

    $file_names{tRNAscan_out} = $opt_result;
    $file_names{tRNAscan_ss} = $opt_ss;
    $file_names{output_tRNAscan_out} = $opt_output."/".$opt_prefix.".out";
    $file_names{output_tRNAscan_ss} = $opt_output."/".$opt_prefix.".ss";
    $file_names{log} = $opt_output."/".$opt_prefix.".log";
}

# Run the filters in order and leave $tRNAs sorted by tRNAscan id, which
# the bsearch_id lookups in the writers rely on.
# Returns (per-anticodon before/after counts, per-isotype final cutoffs,
# anticodon counts after filtering), all as hash references. The first two
# are empty when tertiary filtering is disabled.
sub filtering {
    my $sec_pass_filtered_ac = {};
    my $iso_score_cutoff = {};

    pseudogene_filter();
    secondary_filter();
    if ($opt_isoscore2 != -1) {
        ($sec_pass_filtered_ac, $iso_score_cutoff) = tertiary_filter();
    }
    $tRNAs->sort_array("tRNAscan_id");
    my $ac_count = get_ac_count();

    return ($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count);
}

# Write the three output files.
sub print_results {
    my ($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count) = @_;

    write_out_file($ac_count);
    write_ss_file();
    write_summary($sec_pass_filtered_ac, $iso_score_cutoff);
}

# True when the sequence name is one of the mitochondrial names that are
# excluded from every output.
sub is_mito_seqname {
    my ($seqname) = @_;
    return ($seqname eq "chrM" or $seqname eq "M" or $seqname eq "chrMT" or $seqname eq "MT") ? 1 : 0;
}

# Progress indicator on STDERR: a dot every 1000 records, a line break
# every 50000.
sub print_progress {
    my ($ct) = @_;
    print STDERR "." if ($ct % 1000 == 0);
    print STDERR "\n" if ($ct % 50000 == 0);
}

# Read the tRNAscan-SE output table, skip mitochondrial sequences and hits
# whose note contains "pseudo", and store every other hit in $tRNAs.
# Column positions are taken from the two header lines ("Sequence ..." and
# "Name ..."). Dies when the isotype CM or note column is missing.
sub pseudogene_filter {
    my %header = ();
    my $ct = 0;

    $tRNA_counts{total} = 0;
    $tRNA_counts{pseudo_filter} = 0;

    print "Status: Filtering pseudogenes\n";

    open(my $fh_in, '<', $file_names{tRNAscan_out}) or die "Fail to open $file_names{tRNAscan_out}\n";
    while (my $line = <$fh_in>) {
        $ct++;
        print_progress($ct);

        chomp($line);
        # Normalise the header cells so that they split into the same
        # number of columns as the data rows.
        if ($line =~ /^Name/) {
            $line =~ s/tRNA #/tRNA#/;
        }
        if ($line =~ /^Sequence/) {
            $line =~ s/Intron Bounds/Intron\tBound/;
        }

        my @columns = map { trim($_) } split(/\t/, $line, -1);

        if ($columns[0] =~ /^Sequence/) {
            for (my $i = 0; $i < scalar(@columns); $i++) {
                if    ($columns[$i] eq "Sequence") { $header{seqname} = $i; }
                elsif ($columns[$i] eq "Anti")     { $header{anticodon} = $i; }
                elsif ($columns[$i] eq "Intron")   { $header{intron_start} = $i; $header{intron_end} = $i+1; }
                elsif ($columns[$i] eq "Inf")      { $header{score} = $i; }
                elsif ($columns[$i] eq "Cove")     { $header{score} = $i; }
                elsif ($columns[$i] eq "HMM")      { $header{hmm_score} = $i; }
                elsif ($columns[$i] eq "2'Str")    { $header{ss_score} = $i; }
                elsif ($columns[$i] eq "Hit")      { $header{hit_origin} = $i; }
                elsif ($columns[$i] eq "Type")     { $header{isotype_type} = $i; }
            }
        }
        elsif ($columns[0] =~ /^Name/) {
            for (my $i = 0; $i < scalar(@columns); $i++) {
                if    ($columns[$i] eq "tRNA#")  { $header{trna_id} = $i; }
                # Only the first "Begin" is the tRNA start; a later one
                # belongs to the intron bounds.
                elsif ($columns[$i] eq "Begin" and !defined $header{start}) { $header{start} = $i; $header{end} = $i+1; }
                elsif ($columns[$i] eq "Type")   { $header{isotype} = $i; }
                elsif ($columns[$i] eq "CM")     { $header{isotype_cm} = $i; $header{isotype_score} = $i+1; }
                elsif ($columns[$i] eq "Note")   { $header{note} = $i; }
                elsif ($columns[$i] eq "Count")  { $header{intron_count} = $i; }
            }
        }
        elsif ($columns[0] =~ /^-----/) {
            # separator line under the headers: nothing to record
        }
        else {
            if (!defined $header{isotype_cm}) {
                die "Error: This filter requires isotype-specific model scan result in the tRNAscan-SE v2 output file.\n";
            }
            if (!defined $header{note}) {
                die "Error: This filter requires tRNAscan-SE v2 output file.\n";
            }

            # Mitochondrial sequences are not counted at all.
            next if (is_mito_seqname($columns[$header{seqname}]));

            $tRNA_counts{total}++;
            if ($columns[$header{note}] =~ /pseudo/) {
                $tRNA_counts{pseudo_filter}++;
                next;
            }

            my $tRNA = tRNAscanSE::tRNA->new;
            $tRNA->seqname($columns[$header{seqname}]);
            $tRNA->tRNAscan_id($columns[$header{seqname}].".trna".$columns[$header{trna_id}]);

            # Begin > End marks a hit on the reverse strand.
            my $startpos = $columns[$header{start}];
            my $endpos = $columns[$header{end}];
            if ($startpos < $endpos) {
                $tRNA->start($startpos);
                $tRNA->end($endpos);
                $tRNA->strand("+");
            }
            else {
                $tRNA->end($startpos);
                $tRNA->start($endpos);
                $tRNA->strand("-");
            }

            $tRNA->category("cyto");
            $tRNA->isotype($columns[$header{isotype}]);
            $tRNA->anticodon($columns[$header{anticodon}]);
            $tRNA->score($columns[$header{score}]);
            $tRNA->hmm_score($columns[$header{hmm_score}]);
            $tRNA->ss_score($columns[$header{ss_score}]);
            # Final id format: <seqname>.trna<n>-<isotype><anticodon>
            $tRNA->tRNAscan_id($tRNA->tRNAscan_id()."-".$tRNA->isotype().$tRNA->anticodon());

            my $isotype_type = "cytosolic";
            if (defined $header{isotype_type}) {
                $isotype_type = $columns[$header{isotype_type}];
            }
            $tRNA->add_model_hit($isotype_type, $columns[$header{isotype_cm}], $columns[$header{isotype_score}], "");

            $tRNAs->put($tRNA);
        }
    }
    close($fh_in);
    print STDERR "\n";
}

# Decide whether an unfiltered tRNA carries an anticodon that is unexpected
# for eukaryotes. Returns "unexpected anticodon" (and increments the
# ac_filter counter) or "" when the anticodon is acceptable.
# An anticodon missing from %euk_aa_list is still accepted when swapping
# its first base A<->G gives a eukaryotic anticodon that is less frequent
# in this genome than the anticodon itself.
sub euk_anticodon_filter {
    my ($ac_count, $tRNA) = @_;
    my $tag = "";

    return $tag if ($tRNA->is_pseudo());

    if ($tRNA->isotype() eq "Sup") {
        $tag = "unexpected anticodon";
        $tRNA_counts{ac_filter}++;
    }
    elsif (!defined $euk_aa_list{$tRNA->anticodon()}) {
        my $anticodon = $tRNA->anticodon();
        my $alt_anticodon = $anticodon;
        if (substr($anticodon, 0, 1) eq "A") {
            $alt_anticodon = "G".substr($anticodon, 1);
        }
        elsif (substr($anticodon, 0, 1) eq "G") {
            $alt_anticodon = "A".substr($anticodon, 1);
        }

        my $tolerated = (defined $aa_list{$anticodon}
                         && defined $aa_list{$alt_anticodon}
                         && defined $euk_aa_list{$alt_anticodon}
                         && $ac_count->{$anticodon} > $ac_count->{$alt_anticodon});
        if (!$tolerated) {
            $tag = "unexpected anticodon";
            $tRNA_counts{ac_filter}++;
        }
    }

    return $tag;
}

# Mark tRNAs as filtered (pseudo flag 1) when they fail the first enabled
# cutoff in this order: isotype-specific model score (or a "mito" best
# model), domain-specific model score, secondary structure score. SeC
# tRNAs are exempt from the secondary structure cutoff.
sub secondary_filter {
    print "Status: Secondary filtering\n";

    $tRNA_counts{secondary_filter} = 0;

    for (my $i = 0; $i < $tRNAs->get_count(); $i++) {
        my $tRNA = $tRNAs->get($i);
        my ($type, $model, $iso_score, $iso_ss) = $tRNA->get_highest_score_model();

        if ($opt_isoscore1 > -1 and !$tRNA->is_pseudo() and
            ($type eq "mito" or ($type eq "cytosolic" and $iso_score < $opt_isoscore1))) {
            $tRNA->pseudo(1);
            $tRNA_counts{secondary_filter}++;
        }
        elsif ($opt_cmscore1 > -1 and !$tRNA->is_pseudo() and $tRNA->score() < $opt_cmscore1) {
            $tRNA->pseudo(1);
            $tRNA_counts{secondary_filter}++;
        }
        elsif ($opt_ssscore1 > -1 and !$tRNA->is_pseudo() and $tRNA->ss_score() < $opt_ssscore1 and
               $tRNA->isotype() ne "SeC") {
            $tRNA->pseudo(1);
            $tRNA_counts{secondary_filter}++;
        }
    }
}

# Count unfiltered cytosolic tRNAs per anticodon. Every anticodon in
# %aa_list is present in the result (0 when unused); anticodons outside
# %aa_list appear only when a tRNA carries them.
# Returns a hash reference.
sub get_ac_count {
    my %ac_count = ();

    for my $ac (sort keys %aa_list) {
        $ac_count{$ac} = 0;
    }

    for (my $i = 0; $i < $tRNAs->get_count(); $i++) {
        my $tRNA = $tRNAs->get($i);
        if ($tRNA->is_cytosolic() and !$tRNA->is_pseudo()) {
            $ac_count{$tRNA->anticodon()} += 1;
        }
    }

    return \%ac_count;
}

# Count unfiltered cytosolic tRNAs per isotype. Every isotype in @isotypes
# is present in the result. Returns a hash reference.
sub get_isotype_count {
    my %isotype_count = ();

    for my $isotype (sort @isotypes) {
        $isotype_count{$isotype} = 0;
    }

    for (my $i = 0; $i < $tRNAs->get_count(); $i++) {
        my $tRNA = $tRNAs->get($i);
        if ($tRNA->is_cytosolic() and !$tRNA->is_pseudo()) {
            $isotype_count{$tRNA->isotype()} += 1;
        }
    }

    return \%isotype_count;
}

# For every isotype that has an anticodon with more than
# $ANTICODON_COUNT_CUTOFF unfiltered tRNAs, raise the isotype-specific
# score cutoff from --isoscore2 one point at a time (up to --isomaxscore2)
# until each of its anticodons drops to the count cutoff, then mark the
# tRNAs below the final cutoff as filtered (pseudo flag 2).
# Leaves $tRNAs sorted by isotype.
# Returns (\%sec_pass_filtered_ac, \%iso_score_cutoff):
#   anticodon -> [count before, count after], isotype -> final cutoff.
sub tertiary_filter {
    print "Status: Tertiary filtering\n";

    $tRNA_counts{tertiary_filter} = 0;

    my %sec_pass_filtered_ac = ();
    my %iso_score_cutoff = ();
    my %filtering_isotypes = ();
    my $tRNA_index = 0;
    my $isotype_tRNAs = tRNAscanSE::ArraytRNA->new();

    # Select the isotypes that own an over-represented anticodon. The keys
    # are visited by descending count, so the first one at or below the
    # cutoff ends the scan.
    my $ac_count = get_ac_count();
    foreach my $ac (sort {$ac_count->{$b} <=> $ac_count->{$a}} keys %$ac_count) {
        last if ($ac_count->{$ac} <= $ANTICODON_COUNT_CUTOFF);
        # Anticodons outside %aa_list have no isotype to filter.
        next if (!defined $aa_list{$ac});
        $filtering_isotypes{$aa_list{$ac}} = 1;
    }

    # Both the array and the isotype list are walked in ascending order,
    # so $tRNA_index only ever moves forward.
    $tRNAs->sort_array("isotype");

    foreach my $isotype (sort keys %filtering_isotypes) {
        $iso_score_cutoff{$isotype} = $opt_isoscore2;
        $isotype_tRNAs->clear();
        my $start_index = -1;
        my $end_index = -1;
        my $past_isotype = 0;

        # Collect the index range of this isotype and a working copy
        # (id, anticodon, best isotype-model score) of its unfiltered tRNAs.
        while (!$past_isotype and $tRNA_index < $tRNAs->get_count()) {
            my $tRNA = $tRNAs->get($tRNA_index);
            my $current_isotype = $tRNA->isotype();
            if ($current_isotype lt $isotype) {
                $tRNA_index++;
            }
            elsif ($current_isotype eq $isotype) {
                if ($start_index == -1) {
                    $start_index = $tRNA_index;
                }
                $end_index = $tRNA_index;
                if (!$tRNA->is_pseudo()) {
                    my ($type, $model, $iso_score, $iso_ss) = $tRNA->get_highest_score_model();
                    my $trna = tRNAscanSE::tRNA->new;
                    $trna->tRNAscan_id($tRNA->tRNAscan_id());
                    $trna->anticodon($tRNA->anticodon());
                    $trna->score($iso_score);
                    $isotype_tRNAs->put($trna);
                }
                $tRNA_index++;
            }
            else {
                $past_isotype = 1;
            }
        }

        # Proceed whenever tRNAs of this isotype were found. Requiring a
        # later isotype to have been seen would skip the isotype whose
        # tRNAs run to the end of the sorted array.
        next if ($start_index == -1);

        # Raise the cutoff on the working copies until every anticodon is
        # at or below the count cutoff or the maximum score is exceeded.
        foreach my $ac (@{$ac_list{$isotype}}) {
            next if ($ac eq AC_BLANK);
            my $local_ac_count = $ac_count->{$ac};
            while ($local_ac_count > $ANTICODON_COUNT_CUTOFF and $iso_score_cutoff{$isotype} <= $opt_isomaxscore2) {
                $local_ac_count = iso_score_filter($isotype_tRNAs, $ac, $iso_score_cutoff{$isotype}, $local_ac_count);
                $iso_score_cutoff{$isotype}++;
            }
            # The loop increments once past the cutoff that was last applied.
            if ($ac_count->{$ac} > $local_ac_count) {
                $iso_score_cutoff{$isotype}--;
            }
        }

        # Record the pre-filter count of each anticodon of the isotype.
        foreach my $ac (@{$ac_list{$isotype}}) {
            next if ($ac eq AC_BLANK);
            $sec_pass_filtered_ac{$ac} = [ $ac_count->{$ac} ];
        }

        # Apply the final cutoff to the real tRNA objects. One pass is
        # enough: the test does not depend on the anticodon.
        for (my $i = $start_index; $i <= $end_index; $i++) {
            my $tRNA = $tRNAs->get($i);
            my ($type, $model, $iso_score, $iso_ss) = $tRNA->get_highest_score_model();
            if ($tRNA->isotype() eq $isotype and !$tRNA->is_pseudo()) {
                if ($type eq "mito" or ($type eq "cytosolic" and $iso_score < $iso_score_cutoff{$isotype})) {
                    $tRNA->pseudo(2);
                    $tRNA_counts{tertiary_filter}++;
                }
            }
        }
    }

    # Second slot of each pair: the count after filtering.
    $ac_count = get_ac_count();
    foreach my $ac (sort keys %sec_pass_filtered_ac) {
        $sec_pass_filtered_ac{$ac}->[1] = $ac_count->{$ac};
    }

    return (\%sec_pass_filtered_ac, \%iso_score_cutoff);
}

# Flag the working-copy tRNAs of anticodon $ac that score below
# $isotype_score_cutoff and return $local_ac_count minus the number of
# tRNAs newly flagged.
sub iso_score_filter {
    my ($isotype_tRNAs, $ac, $isotype_score_cutoff, $local_ac_count) = @_;
    my $count = $local_ac_count;

    for (my $i = 0; $i < $isotype_tRNAs->get_count(); $i++) {
        my $tRNA = $isotype_tRNAs->get($i);
        if ($tRNA->anticodon() eq $ac and !$tRNA->is_pseudo()) {
            if ($tRNA->score() < $isotype_score_cutoff) {
                $tRNA->pseudo(1);
                $count--;
            }
        }
    }

    return $count;
}

# Copy the tRNAscan-SE output table to the output directory, appending a
# filter tag to the note column of each hit. Hits that are not in $tRNAs
# (input pseudogenes) and filtered hits are dropped when --remove is
# given; mitochondrial sequences are always dropped.
# Requires $tRNAs to be sorted by tRNAscan id. Sets the iso_filter,
# undet_filter and ac_filter counters.
sub write_out_file {
    my ($ac_count) = @_;
    my %header = ();
    my $ct = 0;

    $tRNA_counts{iso_filter} = 0;
    $tRNA_counts{undet_filter} = 0;
    $tRNA_counts{ac_filter} = 0;

    print "Status: Writing output file $file_names{output_tRNAscan_out}\n";

    open(my $fh_in, '<', $file_names{tRNAscan_out}) or die "Fail to open $file_names{tRNAscan_out}\n";
    open(my $fh_out, '>', $file_names{output_tRNAscan_out}) or die "Fail to open $file_names{output_tRNAscan_out}\n";

    while (my $line = <$fh_in>) {
        $ct++;
        print_progress($ct);

        chomp($line);
        if ($line =~ /^Name/) {
            $line =~ s/tRNA #/tRNA#/;
        }

        my @columns = map { trim($_) } split(/\t/, $line, -1);

        if ($columns[0] =~ /^Sequence/ || $columns[0] =~ /^Name/ || $columns[0] =~ /^-----/) {
            # Header lines are passed through.
            print {$fh_out} $line."\n";

            if ($columns[0] =~ /^Sequence/) {
                for (my $i = 0; $i < scalar(@columns); $i++) {
                    if    ($columns[$i] eq "Sequence") { $header{seqname} = $i; }
                    elsif ($columns[$i] eq "Anti")     { $header{anticodon} = $i; }
                    elsif ($columns[$i] eq "Type")     { $header{isotype_type} = $i; }
                }
            }
            elsif ($columns[0] =~ /^Name/) {
                for (my $i = 0; $i < scalar(@columns); $i++) {
                    if    ($columns[$i] eq "tRNA#") { $header{trna_id} = $i; }
                    elsif ($columns[$i] eq "Type")  { $header{isotype} = $i; }
                    elsif ($columns[$i] eq "CM")    { $header{isotype_cm} = $i; $header{isotype_score} = $i+1; }
                    elsif ($columns[$i] eq "Note")  { $header{note} = $i; }
                }
            }
            next;
        }

        my $include = 0;
        my $tag = "";
        my $tRNAscan_id = $columns[$header{seqname}].".trna".$columns[$header{trna_id}]."-".
                          $columns[$header{isotype}].$columns[$header{anticodon}];
        my $index = $tRNAs->bsearch_id($tRNAscan_id, "tRNAscan_id");

        if ($index == -1) {
            # Not stored by pseudogene_filter: mitochondrial or pseudogene.
            if (!is_mito_seqname($columns[$header{seqname}]) and !$opt_remove) {
                $include = 1;
            }
        }
        else {
            my $tRNA = $tRNAs->get($index);
            if ($tRNA->is_pseudo() and !$opt_remove) {
                $include = 1;
                if ($tRNA->pseudo() == 1) {
                    $tag = "secondary filtered";
                }
                elsif ($tRNA->pseudo() == 2) {
                    $tag = "tertiary filtered";
                }
            }
            elsif (!$tRNA->is_pseudo()) {
                $include = 1;
                if ($columns[$header{note}] =~ /IPD/ and $columns[$header{isotype}] ne "Sup") {
                    $tRNA_counts{iso_filter}++;
                    $tag = "isotype mismatch";
                }
                elsif ($columns[$header{isotype}] eq "Undet") {
                    $tRNA_counts{undet_filter}++;
                    $tag = "undetermined isotype";
                }
                else {
                    $tag = euk_anticodon_filter($ac_count, $tRNA);
                    if ($tag eq "") {
                        $tag = "high confidence set";
                    }
                }
            }
        }

        if ($include) {
            for (my $i = 0; $i < $header{note}; $i++) {
                print {$fh_out} $columns[$i]."\t";
            }
            # The tag is appended to an existing note, comma separated.
            if ($columns[$header{note}] ne "") {
                print {$fh_out} $columns[$header{note}];
                if ($tag ne "") {
                    print {$fh_out} ",".$tag;
                }
            }
            else {
                print {$fh_out} $tag;
            }
            for (my $i = $header{note} + 1; $i < scalar(@columns); $i++) {
                print {$fh_out} "\t".$columns[$i];
            }
            print {$fh_out} "\n";
        }
    }

    close($fh_in);
    close($fh_out) or die "Fail to write $file_names{output_tRNAscan_out}\n";
    print STDERR "\n";
}

# Copy the secondary structure file to the output directory. The lines of
# one record are accumulated in $print_line and written when the "Str:"
# line is reached. Records of filtered hits get "Filtered" prefixed to
# their score line, or are dropped when --remove is given; mitochondrial
# sequences are always dropped.
# Requires $tRNAs to be sorted by tRNAscan id.
sub write_ss_file {
    my $tRNAscan_id = "";
    my $seqname = "";
    my $print_line = "";
    my $include = 0;
    my $tag = "";
    my $ct = 0;

    print "Status: Writing secondary structure file $file_names{output_tRNAscan_ss}\n";

    open(my $fh_in, '<', $file_names{tRNAscan_ss}) or die "Error: Fail to open $file_names{tRNAscan_ss}\n";
    open(my $fh_out, '>', $file_names{output_tRNAscan_ss}) or die "Fail to open $file_names{output_tRNAscan_ss}\n";

    while (my $line = <$fh_in>) {
        if ($line =~ /^(\S+)\s+\(\d+\-\d+\)\s+Length:\s\d+\sbp/) {
            # First line of a record: <seqname>.trna<n> (start-end) Length: ...
            $tRNAscan_id = $1;
            $seqname = substr($tRNAscan_id, 0, rindex($tRNAscan_id, "."));
            $print_line = $line;
            $include = 0;
            $tag = "";
            $ct++;
            print_progress($ct);
        }
        elsif ($line =~ /^Type:\s(\S+)\s+Anticodon:\s(\S+)\sat\s.+\s\(.+\)\s+Score:\s\S+/) {
            # Complete the id to <seqname>.trna<n>-<isotype><anticodon>.
            $tRNAscan_id .= "-".$1.$2;
            $print_line .= $line;
            my $index = $tRNAs->bsearch_id($tRNAscan_id, "tRNAscan_id");
            if ($index == -1) {
                if (!$opt_remove) {
                    $include = 1;
                    $tag = "Filtered";
                }
            }
            else {
                my $tRNA = $tRNAs->get($index);
                if ($tRNA->is_pseudo() and !$opt_remove) {
                    $include = 1;
                    $tag = "Filtered";
                }
                elsif (!$tRNA->is_pseudo()) {
                    $include = 1;
                }
            }
        }
        elsif ($line =~ /^HMM Sc=\S+\s+Sec struct Sc=\S+/) {
            $print_line .= ($tag ne "") ? $tag.": ".$line : $line;
        }
        elsif (($line =~ /^Possible pseudogene:\s+HMM Sc=\S+\s+Sec struct Sc=\S+/) or
               ($line =~ /^Possible truncation, pseudogene:\s+HMM Sc=\S+\s+Sec struct Sc=\S+/)) {
            $print_line .= ($tag ne "") ? $tag.", ".$line : $line;
        }
        elsif ($line =~ /^Possible intron: \d+-\d+ \(\d+-\d+\)/) {
            $print_line .= $line;
        }
        elsif ($line =~ /\s\*\s+\|\s+\*\s+\|\s+\*\s+\|/) {
            # Position ruler ("*" and "|" marks); matched independently of
            # how many blanks separate the marks.
            $print_line .= $line;
        }
        elsif ($line =~ /^Seq:\s\S+$/) {
            $print_line .= $line;
        }
        elsif ($line =~ /^Str:\s\S+$/) {
            # Last line of a record.
            $print_line .= $line;
            if (!is_mito_seqname($seqname) and $include) {
                print {$fh_out} $print_line."\n";
            }
        }
    }

    close($fh_in);
    close($fh_out) or die "Fail to write $file_names{output_tRNAscan_ss}\n";
    print STDERR "\n";
}

# Write the summary log: inputs and cutoffs, output paths, filter
# statistics, the tertiary filter table (only when that filter removed
# something) and the isotype / anticodon counts after filtering.
sub write_summary {
    my ($sec_pass_filtered_ac, $iso_score_cutoff) = @_;
    my $rule = "------------------------------------------------------\n";

    print "Status: Writing summary file $file_names{log}\n";

    open(my $log, '>', $file_names{log}) or die "Fail to open $file_names{log}\n";

    print {$log} "EukHighConfidenceFilter v1.0 Summary\n",
                 "Completed Time: ".localtime()."\n\n";

    print {$log} "Inputs\n",
                 $rule,
                 "tRNAscan-SE output file: ".$file_names{tRNAscan_out}."\n",
                 "tRNAscan-SE ss file: ".$file_names{tRNAscan_ss}."\n";
    if ($opt_cmscore1 == -1) {
        print {$log} "Secondary filtering domain-specific model score: Disabled\n";
    }
    else {
        print {$log} "Secondary filtering domain-specific model score cutoff: $opt_cmscore1\n";
    }
    if ($opt_ssscore1 == -1) {
        print {$log} "Secondary filtering secondary structure score: Disabled\n";
    }
    else {
        print {$log} "Secondary filtering secondary structure score cutoff: $opt_ssscore1\n";
    }
    if ($opt_isoscore1 == -1) {
        print {$log} "Secondary filtering isotype-specific model score: Disabled\n";
    }
    else {
        print {$log} "Secondary filtering isotype-specific model score cutoff: $opt_isoscore1\n";
    }
    if ($opt_isoscore2 == -1) {
        print {$log} "Tertiary filtering isotype-specific model score: Disabled\n";
    }
    else {
        print {$log} "Tertiary filtering isotype-specific model starting score cutoff: $opt_isoscore2\n";
        print {$log} "Tertiary filtering maximum isotype-specific model score cutoff: $opt_isomaxscore2\n";
    }
    if ($opt_remove) {
        print {$log} "Remove filtered hits: Yes\n";
    }
    else {
        print {$log} "Remove filtered hits: No\n";
    }
    print {$log} "\n";

    print {$log} "Outputs\n",
                 $rule,
                 "tRNAscan-SE output file: ".$file_names{output_tRNAscan_out}."\n",
                 "tRNAscan-SE ss file: ".$file_names{output_tRNAscan_ss}."\n",
                 "Summary file: ".$file_names{log}."\n\n";

    my $remaining_hits = $tRNA_counts{total} - $tRNA_counts{pseudo_filter};
    print {$log} "Summary statistics\n",
                 $rule;
    print {$log} "Total tRNA predictions: ".$tRNA_counts{total}."\n";
    print {$log} "Possible pseudogenes: ".$tRNA_counts{pseudo_filter}."\n";
    if ($opt_isoscore1 > -1 or $opt_ssscore1 > -1 or $opt_cmscore1 > -1) {
        print {$log} "tRNAs with scores lower than secondary cutoffs: ".$tRNA_counts{secondary_filter}."\n";
        $remaining_hits -= $tRNA_counts{secondary_filter};
    }
    if ($opt_isoscore2 > -1) {
        $remaining_hits -= $tRNA_counts{tertiary_filter};
        print {$log} "tRNAs failed tertiary filter: ".$tRNA_counts{tertiary_filter}."\n\n";
    }
    print {$log} "tRNAs after filtering: ".$remaining_hits."\n\n";

    print {$log} "tRNAs with mismatched isotype: ".$tRNA_counts{iso_filter}."\n";
    print {$log} "tRNAs with undetermined isotype: ".$tRNA_counts{undet_filter}."\n";
    print {$log} "tRNAs with unexpected anticodon: ".$tRNA_counts{ac_filter}."\n\n";
    print {$log} "High confidence set: ".
                 ($remaining_hits - $tRNA_counts{iso_filter} - $tRNA_counts{undet_filter} - $tRNA_counts{ac_filter})."\n\n";

    if ($opt_isoscore2 > -1 and $tRNA_counts{tertiary_filter} > 0) {
        print {$log} "Tertiary filter - anticodon counts\n",
                     $rule."\n";
        print {$log} "Isotype Anticodon BeforeFiltering AfterFiltering FinalScoreCutoff\n";
        foreach my $isotype (sort keys %$iso_score_cutoff) {
            foreach my $ac (@{$ac_list{$isotype}}) {
                if ($ac ne AC_BLANK and defined $euk_aa_list{$ac}) {
                    printf {$log} "%-7s %-9s %-15d %-14d %-16d\n",
                        $isotype, $ac, $sec_pass_filtered_ac->{$ac}->[0], $sec_pass_filtered_ac->{$ac}->[1],
                        $iso_score_cutoff->{$isotype};
                }
            }
        }
        print {$log} "\n";
    }

    # Isotype totals are the sums of their anticodon counts.
    my $ac_count = get_ac_count();
    my %isotype_count = ();
    foreach my $ac (sort keys %aa_list) {
        $isotype_count{$aa_list{$ac}} += $ac_count->{$ac};
    }

    print {$log} "Isotype / Anticodon Counts After Filtering\n",
                 $rule."\n";

    foreach my $aa (@isotypes) {
        my $iso_count = defined $isotype_count{$aa} ? $isotype_count{$aa} : 0;
        my $label = ($aa eq "SeC") ? "SelCys"
                  : ($aa eq "Sup") ? "Supres"
                  :                  $aa;
        printf {$log} "%-8s: %d\t", $label, $iso_count;

        foreach my $ac (@{$ac_list{$aa}}) {
            if ($ac eq AC_BLANK) {
                # Pad an empty cell to the width of a filled one so the
                # following anticodons stay in their column.
                print {$log} " " x AC_CELL_WIDTH;
            }
            else {
                my $count = defined $ac_count->{$ac} ? $ac_count->{$ac} : "";
                printf {$log} "%5s: %-6s", $ac, $count;
            }
        }
        print {$log} "\n";
    }
    print {$log} "\n";

    close($log) or die "Fail to write $file_names{log}\n";
}

# Return a copy of the string without leading and trailing whitespace.
sub trim {
    my $string = shift;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}
tRNAscan-SE-2.0/INSTALL0000644000543100007160000000243313075030430013660 0ustar pchanlowelab Installation of tRNAscan-SE: a tool for finding transfer RNAs ----------------------------------------------------------------- Starting from a source distribution, trnascan-se-2.0.0.tar.gz: uncompress: gunzip trnascan-se-2.0.0.tar.gz unpack: tar xf trnascan-se-2.0.0.tar move into new directory: cd trnascan-se-2.0.0 configure: ./configure build: make automated install: make install To specify a directory for installation, for example, your home directory "/home/xyz", run configure as ./configure --prefix=/home/xyz For more information about special parameters for running configure, please get it at ./configure --help tRNAscan-SE is designed to run on POSIX-compatible platforms, including UNIX, Linux and MacOS/X. The POSIX standard essentially includes all operating systems except Microsoft Windows. Non-ANSI compilers may need one or more -MDEFS defined. Dependencies ----------------------------------------------------------------- tRNAscan-SE 2.0 requires separate installation of Infernal 1.1.2. The source code and prebuilt binaries can be obtained at http://eddylab.org/infernal/. The installation directory of Infernal should be the same as the one where tRNAscan-SE 2.0 is installed. 
tRNAscan-SE-2.0/Demo/0000755000543100007160000000000013265405366013530 5ustar pchanlowelabtRNAscan-SE-2.0/Demo/Example1-tRNAs.out0000644000543100007160000000116313100451013016675 0ustar pchanlowelabSequence tRNA Bounds tRNA Anti Intron Bounds Inf HMM 2'Str Hit Isotype Isotype Name tRNA # Begin End Type Codon Begin End Score Score Score Origin CM Score Note -------- ------ ----- ------ ---- ----- ----- ---- ------ ----- ----- ------ ------- ------- ------ CELF22B7 1 12619 12738 Leu CAA 12657 12692 74.2 51.20 23.00 Inf Leu 119.9 CELF22B7 2 19480 19561 Ser AGA 0 0 81.6 47.50 34.10 Inf Ser 125.0 CELF22B7 3 26367 26439 Phe GAA 0 0 82.5 56.60 25.90 Inf Phe 112.1 CELF22B7 4 26992 26920 Phe GAA 0 0 82.5 56.60 25.90 Inf Phe 112.1 CELF22B7 5 23765 23694 Pro CGG 0 0 71.5 48.20 23.30 Inf Pro 113.0 tRNAscan-SE-2.0/Demo/Example2.fa0000644000543100007160000000122613100451013015470 0ustar pchanlowelab>MySeq1 GTTTCTGCGTGAGGCCCTATAGCTCAGGGGTtAGAGCACTGGTCTTGTAA ACCAGGGGtCGCGAGTTCAAATCTCGCTGGGGCCTTGCGAAACTACTTTC >MySeq2 AGAATGTCTCTGTGGCGCAATGGACGAGCGCGCTGGACTTCTAATCCAGA GGTTCTGGGTTCGAGTCCCGGCAGAGATGATCACCT >MySeq3 TTCAAATCGAAATGGCACTATGGCCGAGTGGTtAAGGCGAGAGACTCGAA tggaataaaaagttcggctATCTCTTGGGCTCTGCCCGCGCTGGTTCAAA TCCTGCTGGTGTCGTTTAATTTTTTTTAAATAAC >MySeq4 GCCAGGGAGAGATGGCCGAGCGGTCCAAGGCGCTGGTTTAAGGCAACCAG TAGCTTCGGGGGCGTGGGTTCGAATCCCACTCTCTTCA >MySeq5 TAGCCCGGATGATCCTCAGTGGTCTGGGGTGCAGGCTTCAAACCTGTAGC TGTCTAGCGACAGAGTGGTTCAATTCCACCTTTCGGGCGAGATAA >MySeq6 TTAGCAGACACGGTGGCCGAGTGGTTTAAGGCATGAGACACTTGATCTCA AACGGTTCTAACCGAACGCAGGTTCGAATCCTGCCCGTGTCA tRNAscan-SE-2.0/Demo/Example2-tRNAs.out0000644000543100007160000000122413100451013016674 0ustar pchanlowelabSequence tRNA Bounds tRNA Anti Intron Bounds Inf HMM 2'Str Hit Isotype Isotype Name tRNA # Begin End Type Codon Begin End Score Score Score Origin CM Score Note -------- ------ ----- ------ ---- ----- ----- ---- ------ ----- ----- ------ ------- ------- ------ MySeq1 1 13 85 Thr TGT 0 0 78.0 54.80 23.20 Inf Thr 93.1 MySeq2 1 6 79 Arg 
TCT 0 0 75.1 56.60 18.50 Inf Arg 89.3 MySeq3 1 14 114 Ser CGA 51 69 71.8 49.10 22.70 Inf Ser 118.3 MySeq4 1 6 88 Leu AAG 0 0 65.0 43.90 21.10 Inf Leu 92.2 MySeq5 1 3 89 SeC TCA 0 0 146.9 0.00 0.00 Inf SeC 146.9 MySeq6 1 7 92 Lys CTT 0 0 72.1 40.60 31.50 Inf Leu 75.7 ISM (-73.90) tRNAscan-SE-2.0/Demo/Example1-tRNAs.iso0000644000543100007160000000140413100451013016656 0ustar pchanlowelabtRNAscanID Anticodon_predicted_isotype Ala Arg Asn Asp Cys Gln Glu Gly His Ile Leu Lys Met Phe Pro SeC Ser Thr Trp Tyr Val iMet CELF22B7.trna1 Leu 37.4 55.3 10.1 12.7 51.2 39.7 -8.4 45.3 58.7 58.5 119.9 27.1 8.8 5.0 16.6 -7.6 69.0 52.1 41.5 45.5 28.8 -999 CELF22B7.trna2 Ser 30.7 40.4 5.2 4.8 56.8 31.0 -0.1 28.9 45.0 36.2 72.1 5.5 0.4 -11.4 -999 -17.2 125.0 43.9 35.2 43.0 11.5 -999 CELF22B7.trna3 Phe 54.4 63.8 67.7 32.4 71.8 43.0 6.1 37.0 49.7 73.6 -999 71.1 85.0 112.1 43.3 -999 -999 84.2 71.9 81.3 60.8 -17.7 CELF22B7.trna4 Phe 54.4 63.8 67.7 32.4 71.8 43.0 6.1 37.0 49.7 73.6 -999 71.1 85.0 112.1 43.3 -999 -999 84.2 71.9 81.3 60.8 -17.7 CELF22B7.trna5 Pro 74.8 50.4 13.0 54.6 48.0 54.4 49.8 60.0 59.9 32.0 19.9 4.4 19.2 6.6 113.0 -999 -5.2 50.9 53.7 22.5 61.2 -999 tRNAscan-SE-2.0/Demo/Example2-tRNAs.iso0000644000543100007160000000157113100451013016664 0ustar pchanlowelabtRNAscanID Anticodon_predicted_isotype Ala Arg Asn Asp Cys Gln Glu Gly His Ile Leu Lys Met Phe Pro SeC Ser Thr Trp Tyr Val iMet MySeq1.trna1 Thr 62.6 63.2 48.1 34.4 77.1 62.4 21.9 36.3 60.9 66.0 28.5 46.4 65.2 39.8 47.1 -999 32.5 93.1 59.9 58.6 61.3 -7.1 MySeq2.trna1 Arg 42.5 89.3 67.8 29.2 48.4 45.1 30.0 38.1 47.4 54.0 11.1 51.0 62.4 24.8 22.6 -999 26.2 65.8 61.6 44.4 49.2 25.7 MySeq3.trna1 Ser 24.2 45.4 8.9 0.1 58.6 43.5 8.7 31.9 47.1 43.5 74.6 7.8 0.3 -2.7 -11.5 -999 118.3 45.3 28.2 51.0 19.0 -999 MySeq4.trna1 Leu 40.8 32.4 -12.6 -5.6 34.6 23.7 -999 26.8 50.6 30.5 92.2 -11.2 3.2 -11.9 12.8 -999 67.7 44.5 37.5 29.8 17.6 -999 MySeq5.trna1 SeC -999 -999 5.4 -2.1 -3.4 -10.0 -999 -2.0 -2.5 -5.8 -5.7 -2.0 -999 -999 -12.7 
146.9 -14.6 -0.5 -999 -4.4 -999 -999 MySeq6.trna1 Lys 15.0 50.3 21.2 6.0 40.4 34.1 -9.2 23.3 69.9 55.8 75.7 1.8 14.6 -2.1 2.5 -999 72.4 51.9 36.6 54.8 19.2 -999 tRNAscan-SE-2.0/Demo/Example1.fa0000644000543100007160000012020213100451013015463 0ustar pchanlowelab> CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7. GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT AACACATATTGACCATTTGGTTTGTTCAAATCAGAACAAATCTTAGCGAG CATAAAGTTAGATGCGATTCCAGCAGAACATGTTAATCCCGTGAGTTGTT CAACTCGAAATCGAATTTCTCGAACAGCCTCCTCTCGTCCAGTTCCGAAC TCCACATGGTCGTAGTAGATTTTCCGCGATTTTTCGCATTTTGGACAGAT CGATTCTTCGATTTTCAAGTCTTCCAAAGTATTTTCATTCTCGTCGAAAC GGGGTAACCAACATGGACAATCTCCGCCGAATCTGTGACGCTTGAAGGTT TCTAGTAAGCAAATAGTTTTTTGTTAATAATCAAATCTAAATCACTAACT TTTTTCTGTATTACTTGCCACATAGTCTGTCAAATCTATAAATGCCTCAT CCAATGACATCATTCCAACATCCGAATCGTATTCCATGAAAATTTGTGAA AATTGGCGACTGACTTTAGTGTATTTAGGGTAATTTCCTGGAACAATCGT TAGACTCGGACAAAGTTTATTTGAGATGAAGCCAGGCATTCCAGCACGGA CTCCAAAACGGCGAGCCAAGTAGTTGGATGTGCTCTGAAAGAATAGATTT AAAGCTTTTCCGAAATCGAAAATTTACTTTTCAAATTGAGTTAGGTGCTT ACCAGCATTGCCGATGAGCCTACGGCCATAGGAACTGTTCTCAGTGCAGG ATTATCTCTCATTTCAACTGCGGCAAAATAAGCATCCATATCTATACAAA CACAGTCTCTTGATAAATCTCTAGATGATTCCAGTTTCATCTCAAGATTC TCCATCTGAAAATTGGAAATTTGCTCTAAGTAAATTTATTAGCTTTTAAA 
ATACATACCAGTATCTCAGATTTTTGTCTTTCTTCTCTGGTTGCTGTTTG CAAACGATTTTTGATTTCTAAAACTTTCTCTTCAATTCTTGATTGTTGTT TTTTGGAAAACGACGAGTAGGACGCTGATGTGTTTTCTTCGATGACTTTC GTGATTTTCTCCTTATCCAGTCCATTCATTCCAGCTTTATTATCGTTGAA AGTCAGCATTTTTCACGAAAAAGAGCCGGATTTTGAAATTCGCGGAGAAA CTCCACTGATTGTGAGTGTGCAAATGCGCGTAATGGTGTACTTACGTACA TCGGCAACACATTTGACGACATATCAGAAAGCGGCGCCAAATTAGAAGTT GAGTGCGCGAGAAAAAACTACGCGTTAACCGCCAATTTTCACTTCCCCAC AGATCTGTCTCGAGATTCTCGAGTCATTTTTCAAGTTTATTTGTTTGTCA GCGGTTGTTTTATTGAAGATTTGTAAAATTTATAACAAAATGTGCAATAG TCTATTAAACCTCGTGAGATATTTGAAGAAACTTTCCCCGTTTTAAATAT TTCGTGTATTCGTGGAGATCGCGGGAATGTTTTGCCTGTTTCCGTAAAAT TCCTCTATTTCTTTTATTTTTGCTTGCAATTTTCGATTCATTTCAGAAGT TTCCACATTCGCAAAACGAATGGACGTCTCTTATTACGATGGTCCCAAGG ATGAAGTCGCCGAAGCAATGCTGAAAAGCGCGGTGACGGCCATGAGATTG GGACAATACGAGGATGGAAAAGGACGCTTAGAGGAGATAATGGAGTTCGG AACCTCAAATTTTCAACTACTTGGTACAATCTACATGTATTACGGAAGAG TGTGCAGGCATTTGAACCATGATGCCAAGGCCTTGGAGTTTTTCGAACAT GAGTTGAACATGTTCAAGTAAGTGAATCACAAAAATGAGCTGGACATTCT ATAACCTTAATTTTTCAGATTGATCTTCAACTACCCAGAAGCATGTGATT CCACACGTCGCATCGTCGAGCAGGCACTCAAAATGGGAAAGTTCCCCAAG GCTCGACGGTTTGCTGAGGATCTCATTGATTACACCAGCAATAAGAAGAA CGGAGAGAAGTATATCGGTCAAGCTCGAATTTTGTTCGCTTCCGTGTGCC TCGAAGGATGTGAAAGAGACGTCGAGAGTAATCAAGATGAGAAGAAGAAG CTTTTGTCAATATGTGCTGAACAGATTGCAGCCGTGAAATTGTTCAACGA GAATAATACGGAAGGAGCTGTGTCTGAGACCAAAATCATGTTACTTGAGG CGAAATGCTTGTCACTAGACGAAAAATACGAGGAATCGCGTCGCAAGTAT CAAGAATGCATCGATTTTGCCATCAAAACAGACCAGTTTGAAGCAGTTCA CATCGCCTATTACGACAAGGCTCTATATGCTGAGACAGATCTTCTTTTCT TTATTATCAGAGATCTCAGGTAATTTTTAGTTTTAACGATTAATAAAAAT ATCAATTCTTTATTCACAGAAGTGCTCTCTTCTACGCCACGAAATTCGGA AAAGAGCGAGATGTAGTCAAATATAAGTCGAAGCTATCCGAAGAGATGCT GAGAAATGGCGAATTCCACGAAGCATATCTCTACGGATTGGAAGCGCTTG TATCGATTCGGAAGCTTGGATTGAACGAATACATTGGAGATGTGTTGCTT ACAATCGCAAAGTGCCTCATTGCACTTGGAAAAAGACGCCAAGCTGCTTA TTTTATCATCTTGGGGAGTGTTCTGACCATCAACCAAAACAGTTTCAAAC TGTTCTACGAGCAGATCGACGTGGCGATGAATCAAGAGAGAAGCGAAACG GCAACTGATCAAGATGTATGCCTCGCAATTGATTCGTCTCCTGATCCGAC ATCCTCGAATGACATGATTAATAAGTTCGTCGTCGAACTGGAGCACGCAA 
CAAATGTGGAAACCTGGGAAATGATTGTCAACGGAATCATTGACGACCAG AAGAAACCAGTGGCGATCGAAAAGAAAGAGAACGAGGAACCCGTAGACAT GATGGATCTCATTTTCAGTATGAGCTCACGTATGGATGATCAAAGAACTG AACTGCCTGCTGCCAGATTCATTCCGCGTAAGAAATGTTATAAATAAGCG TATAAGTATGCAAATTTATATTTTTCCAGCTCGTCCAGTGTCATCGGCAT CGAAAAAGACTACAAAGAGTCACAGAATCCTCCCTGGACTCCGTGCCAAT TGGACAAAAGTGCAGTCGATGAAGTTCGATGGTCACACAATGAATAGGAT CCTGAAGAGGTCGAAGAAAAGCAAATCGTCATTGGATTCTACAAATTCGA TGCAGGGCGATGATACTCGAAGCGATGATGTGACAATGACGTCCAAATAG GACCATTATTTTTTCTGTCAAATAATACAATCAAACTTTCTTTATTTATT TTTTTTTTACTTTCTTTCAGTAAATATTATTATCATTTTAGTGGTTCTTT TATTTTATTTGCTGGTCAGAAAAGCTGATTTTTTCAATTAGCGAAAATCC ATCAAGTCATATTTCTATAGACTCTTTACTACATACGTTGATGACTTTCG TGATTTTCTCCTTATCCGGTTCATCCATTCCAGCTTTATTATTGTTGAAA GTAAATAAGCATTGTTTCCCGAAAAAGAGCCGGATTTTGAAATTCGCGGA GAAAAAGTTGAAAATTGAAAAATCCAAACGATGCTCCGATGTTCCGTCCG AAATGTGTGTTTTACCGTGACGGTGTTTGCAGACAGCTTGAAAAAACATT TATTTTTTTATTTTAATTTTATTAATTTATTATTTATTTTAGAATGTTCT ATATTTTAAAATGTGAATTTGTTTCAGGGTACTCGGAATGTTTGTCTTAA ACCGTTCAAGCGGGCTCATTCATCGATCTGTACCTTTATTAGCTCAAGTA TCCACGCCTACGACTTCCACAACAAAATTAGGTTAATAATTCTCCATTTG GTGATAAACCAATTCTTTTCTGCTTTTTTAAAACATTATGTTACAGCTCA ACTTCACACAACGCATGCACTAAGCAAAGAAGATTATTATAAGACTTTGG GTGTCGACAAAAAATCTGATGCAAAAGCAATCAAAAAGGCTTATTTCCAG GTAATATAAGTTTTTATCGAATACTTTGTAAGTATAAATACGTTATTTCA GCTTGCCAAGAAATACCATCCAGATGTAAACAAAACAAAAGAAGCGCAGA CGAAATTTCAAGAGATTTCTGAAGCATATGAGGTATTTTCAACAAACAAT AGAGCAGGCTCGAATCAAAAATATTAAGGTACTTTCCGATGACACAAAAC GTCAAGAATATGATGCATACGGAAGCGGAGGTGGCCCAGCTGGTGGAAGA GGTGGTGCTGGAGGTTTCCACCACCATGGAAATGTTGATGTTAACGAAAT TTTCAGAAGAGCATTTGGTGGAGGGGGTGGAATGGTGAGTTCTCCTTATG AATTCTTCTGAATATTATATTAATTATATAATTTTTATTGTAATTAATAA AAACTACAGTTTTATTTATTTTTTCGCTGATTCCAGGGTGGCTTTAATTT TGATAATTTTGCCCAAAGTGCTTTCGGACATTCTGCTGCTCAGGAAATGG TTATGGATATTTCGTTCGAAGAAGCTGTCCGAGGAGCCACCAAAAATGTT TCTGTAAACGTAGTTGAAGATTGTCTGAAATGTCACGGAACTCAAGTTGA ACCAGGTCACAAAAAGACGTCGTGTCCGTATTGTAACGGAACTGGAGCAG TTTCTCAACGTCTTCAGGGTGGTTTCTTCTATCAAACAACTTGTAATCGA TGCAGAGGAAGTGGACATTATAATAAGGTAGAGTTATTGATTTTTCTTTT 
ATTGTAGCTTTTAATTTTTTCTTCAGAATCCTTGTCAAGAATGTGAAGGT GAAGGTCAAACCGTTCAACGACGTCAAGTATCATTCAATGTGCCAGCTGG AACTAATAATGGAGATAGTTTGAAGTTCCAAGTGGGGAAAAATCAATTAT TTGTTCGTTTCAACGTTGCACCATCTTTGAAATTCCGACGTGAGAAAGAT GATATTCACTGTGACGTAGATATTTCTCTGGCTCAAGCTGTTCTTGGTGG TACTGTAAAGGTTCCTGGAATTAATGGAGATACATATGTTCATATTCCGG CAGGAACTGGCAGTCACACTAAAATGAGGTAATAAATCTCCAAAAATTGG AACTTAAATATTCATTAAACAATTTTCAGATTAACAGGAAAAGGAGTAAA ACGATTGCATTCTTACGGAAATGGAGATCAATATATGCATATTAAAGTAA CGGTTCCGAAATATTTGACAGCCGAACAGAAACAAATTATGTTGGCTTGG GCTGCGACGGAACAGCTGAAAGATGGAACCATCAAAGGATTGGAGAAAAA TCAGAAAACCGAGGAGAAGGAGACGAAGAAAAATGAGGAAAAGAAGTCTG AAGGTGAAAAAATATGATTTCAATTTGAAACAAAGGTTTTATCAAAAAGT TGTCTTTAAAAAATCGAGAAATTACCAAAAAAAATTCATTAATTTTTTTT TTTTAAACCGACATTATTGGTTGTAAAGATTCAAAAAGTTTGTAAAATTT TAAAGTTGTTTGAAAAATACCTGAAAGTCTTGTTTTTTTTTTTGCATTCT GATGCACTTGTAATCTCAATTTTTCCCCAAAACTGATTTTTGATTCTTTT CAATCCAAAATTACTAATTTATAGGTGCATCAGAATCACAAAAACGGAGA AGTGAGCCAGTAGCTGAGAATGCAGAAACTATTGACGAAAATCAAGAAAA CGAGGGATTTTTCGAAAAAATTAAACGAAAAATTTTCGGGTAAATATATA AATTTCTAAACAAAAATTAAATATTTTTGTTTCATTTGTATTAATTGAAA TTTTCCAGATAAGTCCGAGGAGAAGTCCGAGTCAAAAGAAGAGCCAAAAA ACGAAGAATCTAGTGAAACCCCCGAGAAAAAAGCTGCAGAATCCTAATAT TTGTCGCAATTTATCGATTACTGTCCCAATCTTTCCAGTTTTGTTCCCTT GAAATTTGTTTTGTCTCATTTGTAGTTCTATTAGAGTGATTATAGTGTCT TTTTTGTTACACTTGTTTTTTTTTTCTATTTTCAATTAAACCGAAACAAA TTTATCAAATTTTATTCAGAAAGAGATAACACAACTAGCAATGATATGAA GAGTTTTATGGATGAAACTGGATTTTTTTAGAAAATGGTCTGTAACTTGG CTGAACATGAAAATATCAAAAACCTCAAAATTGATAAAATGTGGTAAAAT GTTTTTCAGTTTTACTACACAACAAAGATTTTTTTATGAAATTAGATGGA TGTTGAGTTATGACTGGTAGAACAGAAAAGTTTTCAAAAATTTTTGTGGA TTTTGGTAAAATGGCTTTGACTCAATGTAACGCAACGAATAATTGGTATT TAAAAATGACAAAGATTTTAAAAGAAAGAAAAAACATTTCTGATCATTAT GTCAGTTGAACATTTGTTTCCTAGGCATTCATGTTTCAAGATACGCCAGT TTGAAGATACGCAGTTTAGTGAGTTTAGGAAGTTATATTCTCATTACAAG CACAGGTACATGAGAAAACAGCAGTCAAAGAAGGGCGGTATTTTTTAATT TGGAATTATATTTTGCTCCTTATAATATTGGGAACACATCTGTGCTGCTA AATTATTGTTTTGTTAATTTAAACCGAAACAAATTAATCAAAATTTATTG TAAAATAAAAATACAACTAGCAAAATATTTTATAGGACAAAATATGAAAA 
ACGGAATGTCGTGTGAAGATGGAATTGAAAAGATATGAAATCATTGTGAT CTGTTCAGGCAAACTTTATAAAAAGCAAGGTTTTCGGGATAGTTTGAAAG AGCAGATCACCTGGATGTGAAAATACGGTGACATTTTTTTGGAAAAGGGA AAGAAAATGAAAAAAAATCGATAATTTATAGAAATTTTAGCCCGTTTTTA ATTAGGGAAAGGGCGGTGAGTTAAGGATGTGAAATAATTGAAATTTCAAA ATGGAAAAGTGGTTTTTTTTGCACAGAATTTTTCATTTTTCATCTTGAAG GTTATATTAAACACGCGATGACTGTATTTAGAACCCATTAAAATAATGGA GAAAAATGTCTCAAAACTGAAATACTGATTTGAGAAAACCATTTTCCCTA TTTGAAAACTAAAAAAGATGACAAAATATAAAAATGGAAAATCTATTTGA ATGCACGATTGAGAGAATATCTTGAATGACAAAATCTACAACTTTGCATA TCAATTGCAGACGATGACGTGGAAATGGTAGCAGCATCGGCAGTTTGATC AATATATCGAATACATCGGATATCAGTTGGTATGAATGGTATGATGTGTT TGGAAACATAGAGATTCTCCACTATAAAAGCTTCTCGGGACATGTTACGA GCTCGCTTTTGCATTAAATTTGCGTAGAGTTCTGATACAAGGACTACCTG GAAATATTTCAATTTTCCGTTGATCAACATAAAAAAGCAAAAAAAAATGC TTTTCAAAAAAGGCAAAAGTTTTTTAAAAGTTTCAACTATATGCATATTT TTGACCACAAAACCAAAACAAAACAAACTCAATAATATAAGTAAAGAAAA CATTATGGAAATCATTTTCTCGCAAGGTCTTGTAAAACTAAATCGCACTT GAAACTGACTATTTATATATATGAGAAAAATGAAGGAGTCCTCCAAAAAT GATGCAACTCAAAAACGAAACATCAAAACGTATTTTTCAACATTTCGCAT TTCGCATTAGAGACATTTCGCATAGAACGGATTTATGTGGAAATTTATGG ACGTTTTCGTCCTTTTTAGATACATGAGCACATGTTCTCCTGTTTTATTT AGAGAGATGACTTGCTTTCAAAATTAAAAAAAAAAACTGCTTATACATAC CTTTCCTCCTACAACCGCCAATGCAGATCTAGCGTCTTGAATTTTTCTTC CGAAATAATGAATCTTTCGAATATACTGTACTCCTACCAAATCAATGCAC ATTGTAGTTATTGCAAGACCTAAAAATATTTTGAATTTTTGTTTTTTTTT CATTGAAAATTTACCTAAAATGATATAGAGCAATATGATATACATGTATC CGTCCCTTCTGGGCATCAAGTCGCCAAACCCGACCTAAAAATATTAATTT CAAAGAGTTCAACAAACTGGAACAAATTACAGTAGTCATTGTAATGAAGG ACCAGTAGAATGAAGTGAAGAAAGACCACGGCTCTAATTTTGACATTAGG ACACCGCCAAACGCTGTATATACTATCAGAATAGCTAATACCAGGAATGC AGGAATTCTGGAAAGACAAAAATTGAGATGAGTGAATTAAGTTTTGAAAG GTAATTTTCAAAATCCATTAAAAATGTATTAAAAATATTTTACAGTTTTC CAACGACCAAAAAATCTGTCGAACATATACAAAAAACCAAAGCCAGCTTA CAATGGAACCCCTAGTTTTTGTGATTGATCGATTGTTTCATAAATGTGTT TTATAACATTACGAACAAAAAACGCTTCTTGACTGGCAAAAACGTTTACC TTTTCTCCTCGATATTCATATCATGCCCCATTCCATGACTGTGACAGTGC TCACAAACGTGCTCTCTCCGTTCTTTTCGATGTCGTGACAATATGAGATA TTTTAATTTCAAATAGTTTCCATACAACCAAACAAGATGTTCAGATAGGA 
ATTTACCTGAAAATTATTAAAATTATAATACATTTTCTACATTTTCCCTT TAAAAACAAATTTTAGATGCGATTAGCTGATCGCAGTAGCTATCAGATGT AACTGATGTTTTTTTAAAAACTTCACGTACTACAGATTTTCTAGTAAAAG CATATAAGTGCCTGCCCACTTACCCAAGTCAGCGATGGTAACCAGTGTTA GAGGTATTCCAAGCAAGGAGAACAATATACACCATATCCGTCCAATGTTT GTCACTGGAACTGGATTACCGTATCCGATGGTAGTGACGACGGTTACGGC AAAGAAAATGGACGATGAAAATGTCCATGTCTCCGTTGCTGCGTTCTTTT TTACCTGAAAATAAATTAAATTATTTTTGAGGTATTTTGAGGTAGTATTA TTGTATTAAAAGTTTGAATAATTTTGGTTGAATTTGGAATAGGTTGAAAT CAAACTTTTCAAGTAAAAAACAAAGTTTCAATAGAAACAGTTTGAGAGAA ATGATAAATAGCTGACCGGAAATGAGTACGAAAATCTACTTTTTAAACTT TGAGTATGCTCTGAAATATTTTCCTGTTATTTCCTTTTTCCATATTCTGG CTTTCAATTATCTTTAAAAAATTTTAAAGACACATCTTGGTTAAGTGTTT GGTGAAACTATAAATCTAAAACTATTTTTAAAGGAAAATCCTTGTCCCTT TGGATTAAAAAGAACAAGAATAATTTTGAAATGTTTAATTACAGACACAG ACAATTGCTCAGTTAGCGCATAAAAAAATAAGCAGACAATGAACCAACAG AACAAAAAACTTGAAAGGGTCTCTCTATCTGACAAGTTAGAGTTTTGTCA TCTCCCGCGGGAACAGCTGAGCTTTCTCCACTTGTTTTTGTCTAATCTCT CCACATTTTTCATGATCTATTGTTGTTTCTATCTTTTGATAGTCACAAGG GACCTCTTGTGTTTTCCGTTTCTATTTCACACCGCCAAACAGACTCAAAG AATGGAAGGTTTTGTGGAAAAATAGGAAAAAGATGAATTGGTGAGGATTA TCAAGGATGGGGGAAAAGAGGTCGATAAGGAAATAGTTGGGTTAGAGAAG AACAAATAATAGTCCGGGTAGTTTGGATGGTTCAGATGTTACTTCTATAT TGATAAATTATAAGTTGTGTCGATGCAAAATGTTAATTTAAATAGCCGAT CGTAGTTTGAAGCAATCAGTTTACTGGTCAATTTCCGGCACATTTCGGCC AATTTTTGCCAAAATATTATATAAGTTTAGTTAGAAAGTTTTCAAAGCTG GAAATAAATTCGGCAAGACTTGGCCTAGATTTCCAAGTTTAGCTAAGCTT TGATTAAACTTCTGTCAATTTTTCATGAAAACTGTTCGTATTTTTTTATA GGGTCGAAATCGGGATGCCTGGAGTTGTGCCCGGATTGAGAAAAAAATTT GCAAATAGCCATCTCAATTACTCAATTACAAATTGCTTGCAATACATTTT CCGTACCTTCTGGTCGTCACATGCAATCTTTACAACAAAAAGTCCTGTCA AACTCACCTCGTTACTCGTCAAGAAATACTTCTCAAATGCGACAAACAGC TGATCGGACATATTGTGCATGTGCCGTTCGGCTAGTGACTCCCATTCGTA TCGTTCTGCAGACAATTGTGTTCAGGAAATTTTGTGGATTTCCATTTTTT GCCAGTTTGGTCGAATTATGTGTTGTAGGTATTGTCAAACAAGTTTCGTA AATTTGGCAATGTGCCAAAACTTTCAAAAAAAAAAAACAACGCTTTTAAA GTGTTTTAAAATACTTGGTCAATCTGAATTAATATAATTGCATGTAAATC CCCACTACTGATATTAAAAAGTTATGCCAATCTAACTTACTTGTCTCATT TCCAGCAGCCAAACGAATTAAGTCGTCAACAAACTCATTTTGACGAGTGT 
AGATCAGTTTAAGCTGAAAACTATTATTATTTTGAAAATTTCATATAATA TTTCGAAATATTCATAGAAAGAAAACAATTGTGTCCACAAAAATGCTGTT GTAAATATATAATTAGGTGCACTCATGTGCTGCCAGGTTGTTCTTTGTTG AAAAATTCGATTTCAGGATGCCATAAAGTAAATGGGAAAATATAATGGAA TCTAACGCGACATTGTTTAAAATACTTTGGCCGCTTGTGACGCCACACTG TTAAGTTTTAAAAATATATAAATTGATCCAAAAAAATTGAAACTAACTAT AAGCTTGACGAATAAAATCTAAGAGCCTAATATAAAAAAGAAGCTTTTTC TCATGCTAGGTACTAAAAAAGGCGCCCGAAAAGCACCTGGCCACCGCAGC CTTTTTTCCAGAAAATCGGAAGTCGTTCGCAAAGAAAATGGAAATGCTTG AGCCAACTGAGCCCATCTGCCACGTCAATGAAATAGGCAGATGGCTGAAT CGAACGGAAAAGAAGAAGGATGTGACGAGTGAATGAATGGCTTTTTGCGT TGAAAAAGTACCACTTTTGGTAGAATAAGATGGGCGAGTGGGAGTGAAGG AGTAGAAATAATAGAAATTGAAAATTGGAAACTTTTTCATAATTTTAACT AGATTAATTTTAGATGTCAGAGATAAATCAAAATGTGCTTCAAAATAAAA TAGTCTGAAATAGTCTTAATCCTTTTTATGCAAAATAAAAATTCGAACTT CCCCGAAATTCAAGTGGCATAATTGTTGCAATTTACACAGTAGTTTTGTG ATTTTTGCGCCAAAAGACAAATTATTATCAAGTGTGAAAAAAGTGTGCGC CTTTGAAGAGTACTGTAGTTCTAAACTCTTGTTGCTGCAGGGTTTTTCAA AGTTTTTGTCATTTTTTTAATGTTTATCTTTATATTTTTAATTCATACAT GTATTTTAAAAATATATTTTTAACAAAAACTGTGAAAAATCTAAATAGAT TTCGCAGCAATGAGAGTTTGCAGTTACAGTTATTATATATCTTTAAAGGC ACACACCTTTTTGAAATTAACAAACAATATCGAGTCGAGACCGCGTACCG TATTAATATCGCAAAACTTAGAGTTTTATTTGTTTTTATCTTGATGAAAG GCTTTTCAGGAGAACGTTTTTTTTTCAAGAAGAATTATCACCTCTCTCAC GAATTATCAACTCGTTCTCATTTTATTTCATTTTTAAGTGTACATACACA TCGCTGTTAGTTACCATCGCACGGATGGCCGAGTGGTCTAAGGCGCCAGA CTCAAGCGAAATGCTTGCCTCATGCTCGAGGTCGACTGGGTGTTCTGGTA CTCGTATGGGTGCGTGGGTTCGAATCCCACTTCGTGCAGAATTTTTTATG TTTTCAACCTTGACCAAAAGTCATATTCATCCAATTCGGTACCCTATTTC TACGTTATCTTGTTCCTCAAACGATGGCCTTATGTTTTGATCTTCAACAC TGCATAAGGAAATGTATACCAAAGACGAAGGTCGTTTTGATTTTTCAGTA ACTGGCAGCAATGATCCTTAAATCAGACTCCCACGCCCCTTCCGTTGAAA ATAGGGCTTCGATCGCCAAACTAATCTTATAAACACTAGAAAAGTTTTTG TTAAAAATCCGTATAAAAAGGGAATAATGTGATGAAAAAAGGCATCAAAT TTCAAATTTGAACATTTCAAGCTTAACTCGAACTTCAGCTTTCTCGCGAA AAAATGAGTCAAATCTTCCAAATTCTTGTGATCTTCGCCATTTTGTCAGC ATTGCAAGTGAATGGGTAGGTGAAGTACAGTACTTTATGGCATCAAATGA AATATACTCAGATTCCTCTTTCCAACTTATTCGTCAGGATATGACTACGA TTGTTATGGATACGGTAACAATGGTTACGGTAATGGTGGATACGGCTATG 
GAAATGGTGGCGGATACTATGGAGGATACAATGGATACTAAATAAAATAA ATGAATACTTTTTGACGGATTTTTCATTGAAAATTTATTTAATAAATGTT AAGCATTCATAAGTCTTGATGTAGCCTACCCTGAAGTATGATTTCGGACG GAATTATTAGGTGTAGGTCGTTTCCTACGTTTGCCTTATATGCAGGCAGG CACGCCTTCGCGGAGATCAGGGGTGTATTTTTGCTATCGGGCAATCAAAT GAAATATAGATTTGAAAAATTTTGTATTTTTTTTTTCTCTAAATGTTTTC AATTTTCAAGCTCAAAACATTTCAAGCCTAGAAGGAATTGTTTCAAATTT CTTCTCTCAAGTTGTGGAACTATTTCCAATATAATATATTTATTCCAAAT TACTCCCATGTGAACTATAGAACACGTATTGTCTCATCATTTCACTTTTA TAGTTTCTGTATTACAACACTTGAATGAATGCTTGGTGTAAACAATTTAT CAATGTGATAAACAATTCGATTTGACGAATTTTTCCGAATTCTCAGGATG ACCGAGTTTCGATGGTCAGTGGGAGTAAATGAATTGGGCAAATTGGTAAT GGAAATCTAGTTTTTATGGTGAGTCATGGCAAAGAAATCAAACATTTAAA AATTACAAGTGATAAACCATGAGTAAACTTAATCAGGCAAGCTGGCGTTA TCAAGTTGTTCAGTTGTTATAATAAAAATTACAAAAAAGATTTTCAAACA ATGTAGATTTTTTCCACTATCAATATTCGGTAACTCAAAATAAGTCTCCG TAGCATAGGACCTGGCAGCCTACACCTACATCTACATCTACATCTACACC TACCCACACTACAAGTACACCTAAATTAATGGTTCAAATGAGTCAAACTA ACCTGTTGTTCCTTCATCATTTGCTCGTGCGGCTGCTCAACACTATAAAA TATCAATGCTCCAATTACTGTATATGTACACGTTAGAAGTACAAGTGCAA CATGAGGTAGGACGAGTTTAGCAAACTGAAATAGGAAAATCACTTCAATT AAAATATTAGAATAAATACTAAGTTTTCAATTTTTAATTAATTAATTTCA ACGCGTTTGAATAATTTTGTTAAACTGTCGAGGAAAAAAATCTCATAGTC AGCGATCGAATTCTCACGAGTTCTGCCAATTTTTCCTTTTTTTTTAGATT CTTGTTTCCAATAGATTACCTTCCTGATTCCAGTCTCCTCTTCTTCTTCA TCTTCATCTTTACTTTCATCATCACTTTCATCGTCGATATCTTCATGGCC ACTCTGAAAAGTTGTATTAACTTTTTAAAAATTCAACCAAAAATAAAAAT AAACAACTGTGTGAATCAAATGAAAAACTTTTAGTTAAAATTCTATAACA TTGAAAATTGTATGAAACTAACAATAAAATCACACAGTTGACTATTTTCT TCAACGATCCATTTTTGCTAATGATTATAAAATGGTTAGCATAAAAATCC TTGTAAAATCAATAAAAACACACAAAATGGGTATATGGGAAAGGGGAATA TAAATGAAAAATTCCATGCTTCTAATTCCAAAACCTACTTAATAAACTCA CAATTATGGCTGAAATCAAGCTGATTTCTAGCTCAACTTCCTCATTTTCA ATGAAAATTTTTGCAGATTTTTTCGATTTTTTTATTTTCCGATTTGCGTA CGGACATTGATGAGGGTTATCTGGTGCTTCCTGTTTTACAAGAGTTTTAC AAAATAGGGAAAAGAGCAAAAGGCACAGTAAATCTTGTAAATTTAGAATT AATGAAAACAAACCTTATCACAACTGTGAATTAAATGACTATCTTCATAA TATTCAGCTCTTCCAAGTCCAGCTCTTGTTACTTCTTCACCGAATAAAGC TTCCTGAAAATAAAGGGAAAAGGGCATGCAAACGTTTTTGGAAAATCATG 
CAGTTTTTATTTTTATTTTCTAAGATGCATTTTATTGTCCACGGTGTTAA CACCTGGTAACTCTATCTATATATGTTTTTTTTTGAAATTTCTTATACCT TTAGCTCTGGTATACCTAATTTTTTTAAATATTAATTTTAAAAATTACAT GCATTACTAACAGAGATAATTATTCTGATCATTAAAATTTTGTGATTGCT CTGGTCCTTGTAGATAAAAATAGACAAAATAATTACCCTCCTCAAAACTC TCAACTCTTCATCTGTTGCCACGTCATCTCCGTAAATCAAGTTCTCAAAA ATTTTAATATTATGATCAATTGAGCCATTATTATTTTCATAATGATCTGG TTTTGTATAGCCACGTTTTTCATTTATAAAAAACTTTGCGATTCCAAAAA ATTCCATTATGAAGAATTTGGAGACCGTGCTGAAAACATCCATCAAAATT GAGTTAAAGAAAATCGAAATAATTAAATGAAATTGAGTTGAACGTAGGAT ATTTATTAATTTGTTTTAAAATCAGTTTAAAACCATTTTTGACGAATCTC AAAGCATTCAAAAATAAATAACAGAACATGGGAAAAATTGTATGAGTTTA TATAATACGTTTTCTCATGGTACTATTGATTGATCAGTGACAGAGTTTTC GATTCAAATTATTATGTCGCCTTGATCAATGAACTGAATTTTCTGAAATC GTATGTCATACTATTTACTAAAACTTCACAAGTCATAAAAATCAAAATAT CAAAAAAAAAGGATCTAGAGAACTTTCAATTTTTCTCCGTTTCAAAAGTG TTTGCCAAAAACCGAAATGGGAGTACATTTCTGAAGATTCCTTTTTTCTT TTTTGTTCAAAAAAAAGTTGTTAAGCTGGAAAAAAACTTTGGTTTGAAGA GATGGAAGTTGTCTTGGACACACTCACATTTTTGATGGACAGCCTATCAG CGAACCACTTTTGGATTCTTGGCGAGAAATTTTTGAATTTTCGAGTCAAT TTTCATGTCTAACTTGAGTATATTTATGACAGATATCACATTTTTCGAAA ACAACAAAACAGAAATAAAAATTTATGAAAAACAGATTTTTTTCCGGATT TCAAAATTGTAAACTAGAATTTAGATTTTTTCTATTAGTAATGCATACAA CTTTTTGGAAGGAAAAATACTAGAATTTTAGTAAAATTTGTTTTTTTTTA ATTTTCTAATTTCTACAATTTTCCAAAACCATGCAAGAATCCATTCAATG TCACCCAAAAAAGCAAAAGTCAAACAAGAAAAATCAATTAAAAAAAAGTT GAACCCTTTCAATTTATACACTCACCCTTCTCTGCCGTCTATGAAATCCT TCTGGAATTTCAGTGACTCGATCATCCATCTCATGCCGTTCAAGTTCTTC ATAGCTGTCATATTCAACTCCTACGTGGTGGTATGTTGAAGCTGCAACAG AAAAATTGAACATCAAGTTATTTCTTACAGCTTGAACCTTCAGCTCGCGC AGATTAAAAATTGGTAACTCCGTTGCAAAAAAGATGGAGTTGCGTTTTTG GGAAGTTTTCAAGATTTCTTACTAAGAAGTTTGGTACTTCGGAGTTATGT TGAGTATAATAGGGCAACTACCCAAGTTTATTCTACCTTTAACAATCCTA ACACTACTTACCGTAATCATCGTTATTACGATGAAGCAATGCTTCATGTG GATTATCACCTTCCTCTTCCATAAGTAGACTGCCACCATCACCACTAGAG TCAACTCGTTGATATCCACGAGATGATGAAGTCATCGCTGATGACGTGGC ATCTGAAAACTCAATTAATTGATGTTAAATTAATTGTATTATAATATTAA GTTCAACTTGAATTTTCTGAATGTATGGAAATTGAGCTTTTTTTTTAGCA AATCGCCATTTTGAAAACTCTCATTTTCAAATAGGGTCTAAAAGAGACAA 
AAAGGCAATACCTGTTCATAAGACACTGTTTGAAATTTTCAGTGAAAATT TTGGCAATATGCCAAAATTGCAAAAAACTACAAGTTATGATATAAGATGC TACGAGAAAAAATAAAAATGTAAATTTCAATCTACATCTTCTAAAAATTG TAAGAACATCATGCAAAACCTGCAACAAAACCTTGAAAAATTGAAAGAAA AAAGAAAGGTGAGGTAATCTGAGTCAAGGTGGAATGATGGAGGTTGGTGG GAAGAGGAAGGAAATTCAAAACCGCATAGACACCAAATATTGCAATGTTT GACAGAGGGCGACGAGGAGAGGAAAAAAACGAGATGAGCTCACGGGCGGA AAAAAGTTTTTTTGGAAGCGCTAAACTGATTAGTCAGTGGCGGTGGAGGA AGACCATTGGTATTGGATGGATAGCCCCCAATTTCGATGATTGACTTTTA CACATATACACACCAAGCTTTTGTTTAGAAAAAAGAAACAAAGAATTGTG TGTGTTGTAGGATGCCATTTTGAGGCGTGTGATTTGATAAATGAAGACTT CAACACCACTACTCCCTTGAGTCCAGATGAATTGAAGAACACGAAAAAAC GATTTATAAACAGATGGTTTCGTGTGGTTTCTGTGTTTTAAAAAGGCTGA CCAAAAAACTTCTGTTTTACAGGACGATGTGAGGGCTCTTCCAAGCTCGT TCTTTCCTCGCGCCTTACCGCAACTACCCATTAGTTTATCTGTGAGTTTA TTGAAATACAGTAACCTAGCTTCACCGTATTTCCTCTAATAGTATTTCAC CACACCACTTTACTTTAAAAGGTTATATCTTGGTGAACTTCAAAGATATG AAAAATTATAAAGTATGAATCGACTAAAAAACGAATATTCTTCCAACTTG AAATTTTTGAAAAGAACAAACGACAGCTGAGATATAAGTTGTTAAAGTTG AACAATGGGGTCCAACACTAGTAGAGGATCATATTACTCTTTGTAAAACT TTTTCGAATAGTGGAAAACTTAGTCGTAAATACACAAAATTAGACTCTAG TGACTCATAAATTTCGATCAAAATCTTTGAAATGTAGAATTTTTTGCCAG CAAGTTCTGTGAGCCACAACAATCAAAGAAGAAACCATGGAAAATGTTGA ATCGGTCAAAGGAAATCGCGGAAAAAGTAGTGGAAAAGGGGCAAAAAGTG AAGAAGAACTAGGCAAGATTAATGAGCCGCCTTTTTGGCTCTGAAGAGCC ATGTTTCCCTTGTGAGCATCATTTCTTCCGCGGCGCAAAGAATACGGAAG AATTGGAACAAACTTTTGATATTTGTTAAATATTTGCCCTTTTGGAAAAA ATCGTTGTGGTTGTGAAGGAAAAAACTACAGACAAAACATTTGTAAAAGT CGAAAATTGCTTGAAAATTTGCGAAATCGGTGAAACAATCAGAATTTCTG GCTTTGCCACCAATACCTAACGAGACCCTGCGTTTAGGGGCGGAGCATTT TATTAAGCCATGGAGCGCGATTACACCTTGGAGTGCTTTTCGAAAAAAAC CAAATTCAAATAATTTTTTAAAAAATCGAGAAATTATAAGTTTTAATTTT AACTTTGCTCTATTGATATGGGTTATCTAGAAGCTAGACTCTCTATAAAA AATCGAATAAAAACTAACGCGAATAAATCTTGAAAAGGTCGGAGTGCTCC CGAGAAAAGTTTTTGTGCTCTTTTCGTCGTAAGACTTCAAAAAAAGAGGC ATTTGGTTTGAAAAATGGCTCTGATCTCTTCGGTTTTTGCCCTCTTTTTG TTTAGAATATTTCATCTGGAACAATATTTGGAATCACGGAAAGAAATTTG CAGATGGTATTGTTTATAACTTTTATATATTTATGGAAAATCACTCTTTT GAATAGATTGTTCCACAAAACCTCACAAAATACTGAAAAAAGGAGAAATG 
GAAGACAGATATACAACGATATAACTTTTTTTGAATATAGGTTAATAAAA AATATTTGTGGGGGAATATTTGAAGATATTGGCATATTGAAAAAAAAAAA GAAGAAAGCGGCTACCTTTGAGTTTTATGGACAATGGAAATGGGAAAAGA GAAGGAATTTCATAGGTAGCTAGGAAAAGGTTCTAGTATATGGAACTACG TTTTGGGTATTATAGAGTTCAATAATATGAAAATCAATTTAATGATGGCA CAGTTTACACCCTATTCCCGGCAGTCGAACACAAACTTGTGCAAAAAATA GTATGTCATTTTTGTTCTAATTTTTCCGAGCAGTCATGTCCGAGTGGTTA AGGAGATTGACTAGAAATCAATTGGGCTCTGCCCGCGTAGGTTCGAATCC TGCTGACTGCGTGTACTTTTTATTTTCTTTCAATTCATAGAGTAATGCAA ATTCTATTAAAATCGGATACAGTATGTTATGAAAAAAAAGTGTTCTTCTC GAAAGTATTTGGCCTCGCTTCTTCAGGCTGCTGGTTCTAGAGCAGCAGTC AACTCAATTGTTTTCCACTTTTCCTTCTCATTTTCATTTTTTCATCGCTC CTTTTTCGTTCCTCACACTCCTCTAATTTTCCCTTTTTTACTCATTTTTG GAAAAAAAAAAGAAAAGGAGGTCTTATTTTGTAATCCTTCGGAAATTCGA AAGCTTGAAGAAAAACTAATAATGATGGAACTGTGTGGATTATTGAACGA GAAAAAGTTTTAAAGGTGCATACAGTAATTTGTGTGTGGTCTCGCCGCGA TCTATGATATACCAGCAAATACGGGATTATTACTTCAATATTGATGGAGA ATGTACATAGAGGGAACGGGGTTGGGTATCGCTGTCAGAAAAATACTGAA AAACCAAAAAAAAAATCCAAAAAAATTGTTTTTCATGATTATAACCTAAA CTTCATGAGTAAAAGTATCAAACTATCTCATTCTGAACAAGTTGAAAACC TCGGGACCACCACACCAAGAAGTGAGAAGCGCCCTCTTTTTGGCCATTGT TCCCGGGGGACAGTTTTGCGATTTGACATAAAAATCGCGAGAATAGTGTG TGAGATGAAGAAACGCAGAGTAGTAGAAGTAGGGGTATACGATGTTAGGC AAAGAAAAGGAGGGAGCTGGAGAAATGTGTCTGAAGAGCACTGACTGACA AAAATGTGCAAAATGTCGGAAGGAGGATGGGTTTTCATGTTTTCGGTGTA AAACTGACATAAGAAAAACATAAGAATATTGGTAGAAACTGGTAGGGGTT GTTAGGTTTTTCAATCTTTGAGAAATTGACGAAAAGTTGAAGTCTTCCGA ACTTTTTTGTGAAATTTAGGAGAAAAGATCAATTGAACGTCGGATTGAGA ATTTAAAACTAGATAAGTTTCAACTCAAGCGACGTTTGAAAAGATATGAA GTTTTAAATGAAAAATAAACGAACATTTTAAACAGGCAAACAATGCCTGC CTACGTGCTTACAAGAAAATTGAAACTAACAAGTAGGGCTACCAAATCAG ACGTACGATTTCACTGGATTCCGGACGATCCCGCCCCTTGTTTGTTTCCA ATTTACAAGAATATAAATGGTTAGGTTTTCTACGTGCCTACACGTCAACT TCATGACTGCCTGCCTACTTGCCTACTATATGGATACCGTGGCGAGGTCT TCTGGTTTTGCAAGCGTGCAGGCAAGCAATAGTTTTTTTTCTGTATAAAC ATTCTGGAAAATTTAACTGACCGTGGTTTATCAAATACTTCAATACTTCA AAAATACAGTTTAAAACATAATTTTTCTTTTTTTTCAACTTCTAAACTAA CCTGTTGTAGATTCATCAATAGCCGGAATATCTGATTCTATTGAAGGTTC GAACGGCTTAGACGGCGTAGAAGCTGGAACTTTTCAGTCTATTTTCAGTT 
TAAAAAACTTTAACTAACCTTTTTTGAAAAGTCGTAAATATTTTGCTCCA GTTCCACGTGGCATGATAGCTGGGGAGCTTTTTCAATTTTGTTTCCGATT TCAAAACAATTTTCAAAAAACACTGGTAAAGTGCACTTCCTGCTCAATAT CAGCCAGACAGAAACCAGAAGAACACAACTATTTCTGTCTATTTTTCTGA GAGAAAAAGAGCATTTTGAGCAATTTTGAGCCAGAGAGTCCCAAAACACA TTTTAATTTGAAGGATCCTTGCTCCTGAGCCCCCAGTTTTACGGGTCAAG TTAGGCATAATTGAATATGAGGAGGCGCAGAAAAAAAGAGAAGAAAATAA ATGAAAATTAATTTTATGGAGCTAATTTTTGAAAAAGAAGCAGGAACTAA TGTTTTAATTGTTCGTAGTTTGACTTGATGTGTGGTGAATAGTGTTTTAG TGTTTCACTGATCCTTCAAAGTAGATGTTGCACCATGGTCGAGGGCGCTT TTCAACCGCTTTGCTCTGTGGACCACAGTTATTTGAACTTTTCTAATTAT TTTTTTTTGTTTTTTAGTTTAAATCAAAATAGTGTCACTATTTTACTTTT TTTTATATGAACAAAACACTGGGAGTCGGGGTTCCAAATTTTATGAATTC ATATCACATCACAGCGATTTTTGTATTTTCAAAAAAAAACAAAATTTCAG ATCATTCTCCGGAGGTGGACAGTAAATGTTCTCAAATCAATAAAATTTAA TTCGATTTGTGATCCCAACCATCAATATGCATAAATATTTCCCAGATCAT TCGAAGAAATCATTCGAAAATGCGTCTTCTTATTCTCGCCCTCGCCTCTT TTTCCGTTCTTGTGAGCTCTTTCATCGTAAGTTCTTCTTTTGACTAAAAC TGTGTGAAAACCGTGGAAATACGTGTTTATCAGAGTTAGTGACGAAATTT TACCTTTTAATGGTATGATGCCTCGTAAATTTAATCAAACCTCTGTAGAA AACTCGTGTCATGTCAAGCGATTCGAAATCGATTTCTGAATGTGAAACGG AAACCCGTGTCACTTTGATGTCCAAACGCAGGAATAATTCCTTTGAAATT CAGATTCGTACTCATGATCTGCAAAATGCACCAAAAGCAAGAAGATTCCT GAAAAGTTTTGATCAACCTGAAGTAGTTGATCTTACTAAGGATCCCACTA CGTTCCAGGAGTTTACCAAATTCTATGTCCAACTTCTAATCGACGTAATC CAAAAAAAAGACACCAGAAGCCTTCATGGCACACAATAATTACCTGGATC TTGAAATGTTGTTCAATGAATGTAATGCTCAACATTACGAAAATTTAGGT TGGTTTCTGTAATAATATTTCAAAGTCTTTTTTAATAAAATTTTCAGGAA TGTTCTTCAAATGGCTTCAACAATTTGCTGCTCATCACGAAGATCCAAAG ATCACAATTAAAGATTTCAAGGGAGATAAAACATCAGGAGAAATGATTCT TCATGTTAATACCTTCACTTTATTCAAGTACAGACAAGAATTTGACATGA AAGTTAGCGCTTTGAATGTGGGTTATTGAATCAATATTTTGAAGACAAAA CTAATTTTTCAGGACAATGGACTTGGGTGGAAGATTTTGTTTGTTGACCG CAGTCTTGGGTGTAATTGATTCATTTTTTGAAACCGAATAAACTAATTTG TTAAATTATTAAATTAGGCAATGGGACAGTCTGGAAGTTGCTAGAATCTA CCGATATTTATTAGATATTTATTACTTACTTGCTACTATGTTTCACAAAA CAATATCAGAGAAAAAAAATCAAAACTGAGAAAAAACGAAACAACTTACC GAACAATAGTTAGCATTGCTTTGTACAAAAAAAACTGTGAGGCTATTGGA GCACTCCTCTGATTGATCCGGTCGTGAGTATTGATTTTGTTTGTCAATTT 
GTCTAGGACGCAATTGATTCATCGTTTGAAAAAAAAGCTCGTTCATTCAA AATCATTGATTATTTATTCCATGCAGAAAACTGTTTTATAATTCTCGATA AAAATGTTATATTTTAAACTGTAAATATTTATTTTTCCACATGATATGGT CTAACCGTGGGCGTAATATTTTCGGTTTAGAACGGCTGAATCGAATCTTT TGGACTGCAAAGACGGATTGGATTTTGAAACAAACTACGGTGTTTTTGTG ACAGTATTAGTCAACGAATGGTAATTAGGCATATGTTTTTGGAACAACTC ATAATTTCTTGTTGGAAAACAAAAGTGGCAAAGTATAATTATTTAAAACT CTGACAAATCTTTTGTCTTTTTTCTCTGGTACTCCTGTTTTTAACAGTGT CCTTATTATGCTCCTATTACTCTTGTGTTGTGCACGTTCTTTTGAAATCT ATATTGCAGAGCGGGAATGAATTTAAATAAGAATAATTCAACTTGGTCAT TCAAACAATAAAACGCCTACCATTAATATATCCAAAATAAGTTCTTTATT TTTGCAAGGTATTCAAATATATCAATAACTCGAGCCTGAAGAAATCCACC GCTAGAACATAAAATTAACGATAAATATTAAGGCAAAATGTTTGGGCCGA ACCGGGAATCGAACCCGGGACCTCTCGCACCCGAAGCGAGAATCATACCT CTAGACCATCCGGCCACTGCCCGGCTGGAGAAAAATGGGGGACTTTGAGC GTTACAACATTCTCGAGGCCCTCTCGAGTCGAGGGGGACCGAGACGGAGA GTGCTAGGAAAACCAAGCATATTTGGTTTTTGAGTATATAGTCGTACGAT TGAGAGGAAGCAAAGATAGATTTATAGCTTTTAATTTATTTTTTCTAGGA TTTTACGTAACATTAATTTCAATACTGATTTCGAGATTTGTCTTTTGATA GATTTACAACATGGTTATCATCCGGTATACTATTGTATAAAAGATGGTCC TAAATTCTATGAGATTATTTTATATTCTCTCTCAACGAGATGCTTCTTCC AAAAATTTCAATTCTTCTATATATTCTCGTCGTTCTTCAAGAAACAGCGG CTGTCCCGGTGCTCTTTTTCGCTCCGGAAGAGCTGTTCCGTTTGAGCGTG TCGTTGGACAACAGGTGAATTTTTTATTTCATTTTGAGTGAGCCATTGTA ACATTGGATCATAAGAGTCACGTGGAAAGTAGAACAATCTCAACCCACAA GTTAAAGCCTAATATTATTCCAATGCCTCTAGTAAGATTTTTATAAAAAT AGAAACCTCGCCATATTAGCCAGACACGAGATTTTTACGATTTCTTCGTC AAATATACGGTACCCAATCTCGACACGTCAATTTTTCAATAAATGCAAAA AGATGTGCGCCTTCAATGAGTACTGTAACTTCAAACTTTTGTTGCTGCCG AATCGACTTAGTTTTGTGAAAATATATGTATTCATGTTTTAAACAACTCA GAATTAACCCAAAAATTTTAACAAAAACTTTTTTTTAAGCTATGAAAAAT CAATTTGAATTCGGCAGCAACAAATGTTTTAAATTACAGTATTTTTTAAA AGCTTACGCCATTTTGCATTCATTGAACATTTGTCGTGTCGAGGCCGGGT ATATTTTTGACAGAAAAAAAACAAAATTTCAGGTTCGTTGTTTTTTGTTG TCAAAATTATAATGTGTTTTAATATCATATCAAAAAATTAGTCATTTGAT TTTCGATAGATCAAGCTTCAGTAGAGGTTTCTAATCATTCGGAAATATAC ACAAAATCTTGAATTTACACTTCCGATCAGTTGGAGAATATCATTGATCT ATCAATAATGTTAGATTTTGTAGTTGATATTTTCACAAAACTCTCAAAAC TTTTTAACATAAAAAGTTGGCATGGTGCGTCTTATAATTTTTGAATAACT 
TATTAAATTTCTAAAATTAATATAGAGATTGGCTCCCACATGTCGAGAAG TTGAATTATTTGTTCCTTGTCTATTCACTTCTAACCGTGATTGTGTTATC GACAAAATATGTGTTCATGAGAAACCACTTCCCCTTTCATCGATTTCTGT GTCAAGCCCGAGTCCAAATACAAAACGGGAAGGTCATCCCGAGTTCATGA GCCAAATTGCGCCGCCTAGAAGAAATGTGAGAGTTGAGCACAGCATTCAA CAATTTCTAATCAGCTTTCAGGACTTTTTGCGATTCGGAAGAGCCGGAAT GGCTTCTGGAGTTGGTGGAGGATCGGAAGGAGGACCTGATGATGTGAAGA ATTCGTATATTCGGGTCAATGGGGAGCCAGAGATTGTTTATCAATAAAAT AATATGGATTCGATGCGTCCGATTATTTTTTTTTACTTGATATTACATTC TCGAAACTATTGTAATGTGTGGGGAATCGTTATAAATAAAATATCTTCTT GTTAAACAATTATTCCATGAAATATGAAGTTATAAAGATATAATCCAGAT ACGAAACTTTGAGATTTTCGTCCGAAAAGTACGGTAGTGGGTCTCGACAC GACAAATGTTAGTTGTGCGCCTTTGAAGATTACTATCATTTAATTTTTTG ATTAAATCGTTTTATTCTAAATGCTTAGTATTTACACAAATATATTTCAA TCGGAATTTTCAAAGTATTTTCCTAGAGGAAACTACTTTGAAAATTCAAA AATTCAGCAACAACGAGAGTTTGTAATTACGGTAATCTTCAAAGGCGCAC ACCTACTCGTATTTAACAAAAATTTGTCGTGTCAAGACCGGGTTCCGCTC TTTTTTGCAGTAGTACAGAGAATATTCCGAAAGAAGTTTGAAATTAAATT AATTTATTATACCGACAAATCATCAAATGTGATGTATTTCAATGAGTACA ATTTTTCGAAAGAAAAACAGATTGAGGATAAAACTTGAGTGATGAGATAA CCGTAATATGGAGAATTATATCAGTGTCAAGAAGGCACATTGTTCAGTTT CATATTTACAGATGTTTGGGATTAAATGAAGATTCGGTATGCATCGACGA TCAGAACAATGAACGAGTGAGTTGAAAGACCTGGAATGTTTAAAATTAAT TTTTAAATTAAAATTCAATTTTCTGCCAACCTTTTGTTGAATGATATCAG TTCTTGGAGAAGACCATCTTTCAATTGCTTCAAATTCACGACGTCTTTTC ACCAGTTGTAAGAAGGCGGTCAACGACCCGTCAACGTTTGCGTAGTGTGT GCGGCAGGAGAAGAGACTGCAAAACAGAGATATTTCATGAAGACTCAAAT GAACAGCGGAAAATGTGCCTCGATAGCTCAGTTGGGAGAGCGTACGACTG AAGATCGTAAGGTCACCAGTTCGATCCTGGTTCGGGGCAATTCTTTTTCG ATTTTTTGAAACTTACCAAATTCCATCATACGAGAGTCCTTGGTGCAGAA TTTCGTGAACCAAATCAAATTCTGCTTCTTGCCGGTTTAGAAGATCTGGT GCCAAAATGATGTCGAATTTTTTTCCGCCAAGGAACTTCATTGCCTCTTC AATTGTACCACATGAAACCTTGGTCTTGATCATTGGAATATTATTTCTCT TAAGTGTTGGACGACAGTAAAGCTCCAAACTAGTCTTATCCATTGTGTGC ATTGCAATTTCTTCTGCTCCATTTTCGAAAGCATATACTGATGGGAGTCC AGTTACGAATCCGATTTCCAAAACAGATTTTCCATCGAAAAAGTCAGTTT CCATCTCAACATTGACAATATTATCAATAGTGTGGCAAATTGTGTTGACA CCTTCCCAGTCATCATGATGATCTGAAAATAAGACATTTTCAAAATTAAA TCAAAAATTAAAAAAGCAATGCCCCGAACCAGGATCGAACTGGTGACCTT 
ACGATCTTCAGTCGTACGCTCTCCCAACTGAGCTATCGAGGCATGCCGAA ACTGACGTTTCTGATCAAACAGGAAAAATTTGAAATTGAACCGTACCTTG GTGACTATTTTTTCTCATTATAGGCAATTCCTCTGCCACATGATTTTTCG GGTTTGAATTTCTGGTTGCCACGTACTTGATGCAACGGCCGGAATCCAAA CTGAGTTCACAGTTGACTGGAGCAACGGAAGCAATTGTTGGGATGGTAAG GACCATTTTTGTGTGATGAAAGGATATGAGAAGAACGCTGAAACCGAGCC GGAGCTATTATTTATTTTGAAAAAAATGACCCGAAACTAAAGAGACAGGT GGTTCTTTGTAGTTTTAGATAGTAAATAAAGTAATGGGTAGGGGGCAAAA ACAAAGAAGATAAGATACGCTTTAGGATGGAGTGAATCTTGATCGCATGA CGTGAATGGCAGAAAGTTGAAATTTTGAAAAAAAAGGAAGGAAAAAGTTT GTACCGAAAACTAGAGAATTTCCAAAGGGTTTGTTTGCATTCGACCATTC AGTTGCTTTTAAAGATTGTCCCGCAAAAGAGAGAGGGGCGGAGTGAATGG AAAAGTCACGAGGGCGGGTTGTGTGCAGAACAACGGCAGGAAAATCATAA ACACGAACATAGAGAGAGCTATTGTACTTGGAAGAATCAAAGAACGAACT GGAGTACTGGAATTGAGTGTGGAATTTGTATGTCTAGGTCAGGAAAGTGA GATCTAAAGCTAAAATAAAAGTGAAAGTCCATCTGAGTTGCAGGAGAATA ATTGAGATATTAGTCGAGCTTGTTCTTCAACTCTAAAAACTAATTGAATA AGTTTAAAGTAAAATGGAGCTCTTGAGAAAAAGCGAAAATGCCATGCTGC TTTAATACCACCCTAACCCCCTTCTATTGACTCCGCCTACTTTTTGTCTT CGCTAGATGAAGTAGATGAAGACGCCGGAAGCAATTGGAGATGTTAGACG TTTAGAGATGATAGAATCTATTCATCCAAGTGAGAGATCAAACAAGACAC TGAAGAAGAAAAGAAGATTGGCGGTAAAAGGGATGATCTTCAAAGAACAG AGAGCTCCGACACTTTTTGTTTTTCATCTCTTTTTTCACTGGTTGGTGGT CAGCGAAGAGGAATAAGAAGAAGAATGATGGCAAGAAAAAGAGAACAGCT GATCGATTCTTGTTCCTGAATAGATCAATGACGTACCATGAAAGAGTGTG AGTAGGTGGATAGTTGATGAAATTGATGGCAGAGGGTAGATCATTGTAGA AAAGAACGATCAACTTGAAGTAAACACAAAGTAATACGATAGAGATGATT CCTTAGCAAAAATAGTACCGGTGTCTACTGAACAGATCACGATTAATCTT TTTTATAGAGAACTTCTGTTGCTTAGAGACATTGTTTTTTAGAAGAAGTT GAAATATCAATTGAGATTCTCGTCAACTTCCTGATGAACACAAATAACTG GGCATTCACCAAAGGGCACCGCGATCAGTTCAGGGTTTCACCTAGACCGG GTTATTCAAAACTTCGAAAAATTCGGATAACCTGTAATTTGTCGATTTTC CGAACTGGCCGATAATCAACGTGCCGAACGATAATCACCGCTCATTCCTG ATTGAATACACGAGAAGCTGCAAATTAAAATACCTAAATTTCGAGGTTTC CAGAGTTTAGCCAACATCTAAGCTTAGACAAAAAAAAGCCTTTGCGAGTG TTTCTGGAATGGATTATTTTCTCTCAGCTTCAGTAAAAGATCCTTGGTTA CTATTTCTTGACCATCTCACGTTTAACTCCCAAAAATCATCGAACACAAC TAATTTCTCAACAAACCTATTTACTTTATTTCGCCTTGCTTTGTTATTTA TTTCACTAAAAAACGAACCTCAATGATTCACGTGTGATCGGTGACCAATG 
AGGCTGGCTGGCACGTGTACCAGGCGAAAATTTTCAGTGAGAAACACAGA GAACAGTGTATTCGGTAAATAAAATTAAACTTATTAAACTTTTTTTGCTG CTGATTCACGTGAATAATACACCAATGTTTGAGGTGAGCAGACTGATGAT ATCATTATTAATAGCAAGCTAATTACTAATACATTTTTAGACGTAATTCA ATCTAACTATTTCTATAGGAATATTTTTTCAACTTGCGAAATAGTTCCAT AATAATAAAACAACTAGAATTGGGCCATCAAGTAGTTACAAAAACTGTGA CTGCCATAAAAGGATTCAGGTAGGTAGGTGGCGCCATGCGTTTCAATTGA ATATTACTGAGTCAACTTCACATCTACAAAAATCACAGCATGATCATATC TAGTAAAGAAATCATAATAACAGCTGAGAGTGTTCAGATTTGCATAAATG ATGACATCAACATCTGATCTAGTCCAAACAATCACACTTTCATTTCCTTC ATTCTATGAATAATCTTCTCTTTAAAAAGAACCTGTTCTTCAGGACCTCC ATCTATATTGTCGTCTTCTCGTATGTTCCCAAATGTTATGATAATACTAT CTTCTATGACATCTTTTGGTGTGGGTGCATAATGACGTGGCAACCGTACT CAGTGTGCACCAAACTACGAAAACAACGAACAAACAAAAGTGAGTGTTAT CTTATTATCGCATAACGAATTCTTGTCGAATGGCATGATATGGGGACCTG GAAATTTAATTTGAAAATCTATCCATCAACTGAAAGAAAAAAATGAATCA GAGGGATTTGCGAAATTTCAAAATCAAAGAATGACCACGTGTCTTATTCA ATCAAATCAGTGGACGTTTCTCATAGAATGCTGCAGAGTAGTTACGAAGG AAATGTCTCAGTTTCTACCGCCAGCCTATCACAGGGATGGGTCTCACCAC AACGGGCTTCGCAAAAATGGGTCTTGCAAGGATGGGTCCCGCAACAATCG GTTCCGTAACAATAAGTCTCGTAAAAATGGGTCTCACAGCGAAGAACCGC TTAAATCGTGTAACACGAGGCCCTTGCTTCCACGCCAAAATGCGGAAAAG AAGTACTCGGCATTCAGAGGCAACTTCCTGCCATTTTATGGCAACTTCCT GCCAAACTTTTGCACATTCCTGCCAACATTTAGATGTTGCTGAGCGTTTT TTCCCACTTGCTGCCACTACGTGGCGACTTCCTGCCAACAACTCGGCAAC TTCCTGCCAGCAATTTGGCAACTTATTTTAGTGGCACTAAATAAGTTGCC AAAGTGTGGCAGGAAGTTGTCTCTGAATGCCGAGTTGGCAGGCACGTAGG CATTTAAGGGGGAAGGTGCCTGTCTGCCCTAGAAGACTTACTAAATAACT ATGAAAAGCATTAACATTTAGCGGCCACGCATAACACTATAAACCATTTC AGTTTGCTTACAGAGAATAATCAAGAAAAGCGGAACGTTATCTTGAAACT ATTGAACTATCTCCATGTGATGCGTTTTCTAGCACTTGACTTAATTTATC ATTCCGTTTCTGTCGCTTGTGATAAGAAAATTCTCAAATTTAATCGCCAA TTTAATTTTTTCGCAGATGGTAATCTGTTTGGACTTTCGTTCCAGTTCGG TTCACGTGAAACTCATGTTCAAGTTTTTATTTTTTAATTTTCAGATGAGA GCTACGTAATCATATTTTTGTTTTGTTTGCATTGGTATGGTTCTTGATCT ACTCGTATGGGGGTAAAAACGTGTGTCTAGTTGACATCAGTCGAGTGATA AGATGGAGATGAATAGTTTGGGAAGAAGATGGGGATCGGAAAAAAGCTCA AGAAATGGGTTTCAACTTTTTGAAGTTTTAATTTGGCAGCGTGAACATTT GCAAGGTATATGAAACTTCAAGAAGGAGAAGTGTAGGAAACAATGTGGAG 
ACGCTGTTATAAATTACGTTTTGAAAAAGTCTCCGGTTTTCCGTCAAGCA ACTTAGATATACATATGTATACTGCTTCACTTTGAAAAATCAATATGTCG CAAAATAAAATAAAAATAAGCCGCGAAGAAAAATAAAAATAAGCCGCGTA GGCCCGTAACCGTCTTTCTACCTTACTTATCGGAGTACGCCTAACTCGTG CCTACGTGCCTACTGCCAAAACAAATTTGGCACGAGACAGGCCTTTTCAC ATGTAGTATTTTCGTAACATGGATGCGTTATCCGATATATGGAAACTTTA GAGCTTTTATTCATTTAAAGTCAGTTTCTAATTCTGATCAGTATCGCCTT GTGCTGAACAACGAATTGTTAATTACATAACCACATCTCGTACTTCTGAT CTATTGATACTTCTTTTTTGTTTTGTTTCCATTTTTAAAAAGTTGCTCGT GTTTTTTGTACTTTTCCGTTTTCATAGTCTTTACCATCAGTTGTCATGGA AATTGTTTTCAAAGTTTGTGGATTTCAACCCGGAAGTCTATCAATTGTTT GCCACTCTGTTTTGTCAATGTATTTCTAGGTGCACCTTGCAGACATGTAC TCTGCGACCGAACCTGTCACCTGACAAAGTACCGTAACTTATTGTTTTCA CAAACACTTAAAGATGCCTGACCTGGAACGACGATCACTATCATCAAACG GTAATGATCGAAATTAGTTGAAAACATGATTTGACCGCAGATCGAACTGA TGGCGCCAGGGAAACACCGATAGAAATGTTTTTCAAAAGTCACATGTGCT AAAAATAACTTTTATTTGGTCTTTCTTTTTCCTTTTGTCACATCAACCAT TACATGGGCGGTCAATATCTGTCTTCTTATCTATTCAGTCTCACGTTAAA TACACGTGTCAATTTATGTAAAAGTCGGGCGGTCCTGACTTTTGATGATC ATGACATGTTTTCGATCAGTCGGAAACTGGGGGGAAAAAGAGAAAGACAG AGACGTGTTTACCACTATGCAGCTGCAATTCGGTTTTCTCTCGAACATTC CATGACGAGAAGAGACGCAGAACATTCAATCAAGAGGGGTCTTTGAAGAT CATCGTGTGCTTCTTTTATGTGTGTTTGTGTACGTGTTTGTGTGTTACTA AAAGTTGACTGAGGGACGGGAGACAGAGAGAGCTCAATGAGCAATGACGG ATGAGGGATTGGTTCTTAAAGTTGGTTATTTGTGGAACTTCAAGGTAGTT GTGATGTCTAAAGATTAAGGTTAAGAAACCAAATATTTTAAGTTTAAGAT AAATAAATAAAAACTTTTCAGGCATCGTCCGTAATTATCGGTAATTCGGA ATGATTTTAGATCTCAAGACCAGAAAAATTTTTGCGAAATATTCAAAATT TAATCGATAAAAATTTTAGGTGGAAAATTCAATTTCAATTTTTAAAAATT TTGGACCGGAAATTGAACTGCAAACTCTACGAAATGGCCGATTGCACCAT GTTGTTCGGACATTTTTAATTAAAAATTAGTATCAAAATTTTTTTCAAAT GAATTTAAAAAAATTACCTGATTTATTTTAAAATCCCATTAGTCTCAGCT AGCACGTTTTAAAAAGTACTCAGAACTGTTCTGAAAATTGTAAATTACTG GTAAACAAAATTCGACAATTCCATATTCTACTAGTGGCAACTTTCAAATC ATTTTCGAGCATTTTCTTAAATATTTCAGCTAAACAATGGGATTTTCGGA TTAATCAAATAATTTTCTATATTTTGACATTAATTTTCGATGAAAAATAT AGGAACTGCATGGTGCAGTGTGTCATTTTGCAGAATCGCCGAAAATAACT GTCGCACATTCTTGATTGAAACTTCATTTTTTTAATTGGCGCGAAATTCA AATTTTAATTTTTAAATATTTTAGGCGGAAACCTCAGACTTCAACTTTCG 
AACCATTTTGACTGAAAAATCATACTTTAATTTTCAGAAATTAATGAAAT CAGCATTATTGTAGATGTTTCGGCGCACACCGTTGGCCTGTGGGTTGGCC GGTAGGCATATAAGTGCCTATATGAGATGTTGATCTAGAAATTAAAAATA GACAAATTTCTAATCTAGGATATTTAACTGTTCAGTTTTGTTTGTCATAT TTTCTGCTTTCAAATGTTTTCAATTTCTTCTCCTCTTTTCATATTTTTCA TTTGCACTTCACATCAAAGTTTTATTGTCAGAGGACCTAGTAAATAAATA CATTTTTCATTCTCTCACATGCTGGTGTTTGATGTTCAGTTAAAAATGTT TTTGATTCTGCAGGAAAAAAAAGAAGAAAAAGTCAACATACAGTACCGGT TCATTTGATTTCATCGTTTTGGAATTACAGAAAAATACTGGAACTAAATA AATAAATTTAAAAACAAAATAGAAAAAAATCAGATTTTCAAAACAATATA ATTCCTATTAGCATTATAAACTATTTTAAATGGAGCTATTCGTGAATTGT CTTTTGAATTTGCAGCAAGTATCCAAAGATCACAAAGAGCTGTTCGACCA CTATTCTTTGGATCTTCTTCATCCCACATTCCACGATAAACACATTCTGG CCTGAAAAATAGGTTTTTTTTAAATGAACATTATTGTTCCGATTTTTATA CCTCCTCTCATCATTTGAACAATTCATAAGTTGGAAAACAAAATAATTGA CACCCATTTCTTTCATTATTTTGTGAACTTCCGCGATTGGTTTTTTACTA AACATAGAATACACTTTCAGAGTTCGTTCACTGAAACAAACAAGGTTTAA TACTTTCTAGAGAAAATTAACAAATTACCGTATTCCAACATGTTCATAAT GAGGGTGATTGACAATTGGCCTAAGAGTTGTGAGCTTCACATTTGCCATT ACTGGCATTGTTCCAGCGAACACAGCGTCTGAAAAAAGTTATTTTGAAGC TAATATGGAAGGCAGGCATGTCTTCAGATGTACGTGCCTGCCTACCACTT CGGTGGTAAGTTTGTATATTACCTTGTTTGGTGTTATGTTGAATCCAGTC AAACAACATTTCTTGATCAGGATTACTGTATTCTCCCTTGACATTCAACT GTTGACGGATATTTGGAATTCCTGAAGGATGAATAAATATTTAGTAATGC AGTGAAAGAATTCAAACCTCTATAAAATAATATTGCGATCACTCCAACAA GAGCAGAAACTCGTATAGTTTTTGATATTCGATCTCCCCCCAGCAGCTTG GAGTTTGCAAAAAGTGCAGCAACAATACATAAATGAGGTGTCATGAATAG TTTGAGACGCATTATCAGGAATGCCATGACAGTTGAACAGCAGAGTTGGA CCACATTATATAGAATCTGAAATGCCAAATTGTAACTTTTTAGTTTCAAA AGTTATAGTATTGAAGCTGAAAACAATAATCTTAAAATTTTAAAATTTTA AACAAAGCTGTATTTTATTTTTCTATTTTTTTAATAATATTTTTTAAATA ACCACCGATCTGCCAGATTCAAAATAGTTGTTTTAAAATTTGGAAATTAC CTCTCCATTTTCTCCAATTTCTTCACTATTCCTCCATAATAAATTCGTAT TTTTCACAAAGTTAAAAACGAAAGTGACAAGAGAAATGAGGGCAAGTGGA ATAAGAAGTGTTCCACACAATTTCTCAATTGTTGAATATTGAATAAAATC GAATTCAGCTGAACAAGTATACAGTCTCGTGTGAAAGTTGGCAAAACTTG TGAATTTTGAACGAAGAATATCAAAGATATGAGCCTGAAATCATTAATTG AAACACAATACAATAAAAACGATAACCAACATCATCTTCAATTCCAAGTC CTTTTGACAATCCAATTTTAAGCCCAAGAGTAATTGATGCAAAGATGATA 
GCCAGGAAAAGAACATAAGCTGGACGGAATTTAAGATTTGAGAGTAGAGG AGAGATGTATATGATCATCTGGAAGCTTTTTGAAATTTATTTTTATCTAG TTTTTTTTTTTTTGGATTTGAGTTTTTACCCGAAAATTGTATAAGAGACA TAAACTATTTTTAGTTTCTCTATTCAATAGAAAACTTACCCCCAATGCCA AAATACTTGGAAAATACAATGCAGTAATCATCATCTCGTTACCAAAAAGC AGCAAAAATCCAATGAGAAATGAAATGATATGAGAATGAATTACTGTTTT AGCAGTAGAGAATGGAATCAGATCCAGTGAAAACGCTAAGAATATTGAAC AAATTTGTGTGAAAAATGCAAATTGGGTGAATTGCCAGAATAAAAGAGCT GGAACTGCCATGGAAGTTAAAAGAAGAATCATTGAGTGACCAGACTTTTT ATATCTGGAAAAATGAGTTTTGAGAACATCATATGCCAAAGAAAATTAGC ATTGCTACGTCTCGCATTTTCGTCATTTTTTTTGTCATAACTTACTTGAT AACAAATGTCAAAATCGCAATATGCCCGATAATAAACGGAAAAGCAAAGC TCTCCCGAAGAGGAGGTGTCCATTGAACTCGAGTTGCTTCTCCGTGATTA AATGCAAAACATAATACTGACAAGAATCCTCCGAATATTGAATCACTGTA ATATATATATTTAGTTAAAAGTAATAATTTTTGTGTCGCTCACCTAACAA GAACACCCAAATAAAAAATGGATGAAGCTACAGTTCCAGCCACAATGAAA ACTCCGGTTATATAGAAATAATGAGGATTTCCAATTCCTTCACAACTTTC AACAGGCCGAAGTTCTCCTCGATTAACTTGCCAGCATAGTTCGATTTGCC AATTCGCTGATTTGGCAAATGCTCGGAATGGGCGGTACAGAAATGCAAGG ATTACCTAAAATAAGCTTGGAAGGATTTTAGGTATGGAGGAAATTGAGAG AAAATGTAATACAGGTGAATGTATTCTTGTGGTAATTAGAAGTTTGCCAA AGGTTTTTCCGACGGAAAACATTACAATGTGAAAACTTTGCGACTCACCT CTGGATATAAATTAAATCGATTTAGTGTGTTAATCTCGTGTCCATGTTCT GTCACAGTATCATGTGTTATCTCCTGAACACCCTCAAGAAATGATGGTGC ATTGATTATTGTTTTATAGTATGAATAGTAGAGACCCTGAAAAAACAAGA AATTTTGAATTTGTATTTTTAAAGAAAACTGTACCATTTCCGTTCGATAT GCCATCTCGCGTTCAAAGTCCGCCAAATGGGAGAAATGTTTGTCGTTTTC GAATAATGTGTAGACATGTTGGTAGTTGATGTATCCAACGAGGAGTCCTG AAAATAAAAAAAACTGTATTGAAATTCGTTTGAGCGTAAAATAATTTGTC GCAAGTTTGATAAAGATCCGAGAAGACTATTTCTGTAACTTGCATGATTG TCTGCCAACTAATTATTTTCTGAATTTTTCTGTCAGGTCACATGCCATAT CCGGGTGGAACAGGACCACTTTAGAGATTGATACGGCTCTAGTTTAATGA AGAGCAAAAAATCTGCAGGTAGGTCAGTAGGTAGGTGTTGTAAGCAGGCA GACATTTTGAACCCTACATGGACACCCCATTTCAAATAACTACTATTAAA ACATAGTTTTCTTAAAAAAGTATTTAATTTGTTTCCCCACCCAATTGCTC TTAATAATCCTCATTGGTTCTCCTAGTTTTACTCTTTTGTTCCCATTTCC CTTTTTTTGCAATTCAAAACACATCCAGTTTTGAGAGAGAGGTCTCTCTC TCTGTCCCTCTCTACCTCCCCCTCTCTCTGTAATTATCGACTTTGGGGAC GAAATGTCAGTTCATTTGTGGAAATAGTTTATGAATCGGAGAGACTTTAG 
ACACTTTGAACAAACCGTGTCGTCGTCAGTGACGAGACACAAACACAGGA AGTTTCGTTGAATATGTTGATGTTTCTGTTGACGTTTCGTTCCCTAGTTT TTAGAGATTGAGAGCATCTAAGGATTAAGGTTCAATGTTTCAAGATTTAA AGTTTTGAAACATAAGTAACAGAGTAAATGATATGATTTAGATAATTTTC TTATTTTTTAATCTGGCAAGCACGCTCAACTAACAAAACACGAATCCGAC AATCAGTCAAACATCTTAAACTTTTTAAAAAATTGTTCATTCTTTATAAG AGCGAATTTCAAATTTAAAAAAAACTTTAATTAAGCTTCAGGTCAAGCAA TTAGGCGTTATTATTAATTCTGGCAAGTTTCCGTTTTTCAGATATAATCA TTTCAATTTCGATTCTTTCTTCAAAGTGTCTGGAAAAAATGCTCTTTTTT AATAATTTCGCCGAATCTAATAGTTCTAAAATTTTATGTTGAAACGATCA ATTCTATAACAGTATATTCAAAAATAACCTCACTAAACTTGAATTTTTTC CAAAAAAAAGCATTCAAAGTGAGCAAATAGTTTTGGTAATACAGGTGGCT ACAAATTTTCTGTCAAAATGTTCAATACACAAAGTGTGAGCAAGAGCAGA ACCAGTTTTCAAACATATTGCTCTCAGTTCTCACTTTCATTTTTGTTTGT ATTAGAGGCTCATTGAGCAATAGCAACTTGAACTTTACTTACTGTTTGTA ATAGCTTAACTGTTCACATTTTTACTAAAACTTTGCAACCTATAGGTATA CCTAATAATTGGGTTTTTCAATTTTGTATGAGAAATCACATCCCGAAACT GAATGGAAACTTTCTGATTAAAATGGATATCACTTCAAACATAGTGTCTT TTAATATTGTCTAAGCCTGTATGACAGTGATAATTTCTAATAAAGAGGCG CAGAGAAATTAGATAATGACCGATAATGATGAGATGAATGAATGTAATGA TGTGGGAAAATAGATAATAGAATAAGAAGTGTGGGCATAATGATTTAGAT GATAGAAGCGTTCAGTCTAGGAATCATTTCAAGTCAATATCACTAATTAT CTCATTTATTTTTGTTCCTTTGATTTATTCGTTTTGGTAGGGCCGTTTCA TCTTAAAGCGTATATTCAAAACAATTAAAAAATCGTTTTTGAAGTCTTCC AAGTAAAAATAAATATCTGCTTTGTGCCTATATTGCGCACCTATCATTTA ATTTCTTAAATGGGGCGTAGCAATTTTGGACTTCTGCTTCAATATCTTCA AAATGATCCCAATAGACCGAATTTCATAATGTGACTCCTCGAAAATTTCT TATGAAGATACAACATTTTAACACTGTTTTCTTTCATAGTGTCCAACGCC TGCCTTTTTCCTACACTAGTTTTTTTTTCAATATCCTTGCAAGTGCACCT GCAAGAAAAGTTTAACAAAAGTTCTTAAACTTAAAAAAGTAGGTGGTAGG CAGGCACGTAGGTGCGCAGGTATGTAGCCAATAGATTCTCAAAATATAAT TATAAAAACTCTGCATTTCTTTTTATTCATCCTAAAAGCGCATTCTACTC AAAACCCAGTCACATGCTCTATTCACAAAAGTCAATTTTTTTTCATCTGC TCTCCGCATAAACTCTTGCTTCCAATTTCCAGATGCATTTAATGTCACGT GCGTTCACTTTTTTCCCTTCAAGCTTCCAGAACAAAAAAGTCTTGTGATG TTTCGCTCAGAAATTTGTTTGAATAGATGATAATTGGATTTCGGTTGAAT TATTTTTGCCATCGCATGTCCTTTCTTCAATTCTCAATTTCCAGTTTTCG TGTGATTCTCCAAAAGTTTCAAACATCATCCGAATTTTCTTGTTTTTCTA TTATGTCACTGATTTCCTATTTATTTTTCATATTTTTAAAGTTTTTTTTA 
AGGGAAAATGAAAACGCCTGCAATTTTATTACTCCTATTCTTATTTATGA TGAGTTTGGCACAAAGTGATGAGGAACAAAGTGGAAAGGAACCTCCCGAA AAAGATGATGTTCTAGTTATTCTGGTTGGTTTTTGCCAATGTTACCATGT CAGAAATGCATGCATGCCTACTTTCGGCACATCTTCGTTTTCTTTTCCCG TATTCTATGTTTTAATTTTCAGGCAGAAGACAATTCGACACTATTAAACA ATAACCAAACAATGTATGACCCTTCTTCCGAAGAAAAAGAAAGTGACAAA CCAAAATTCAATTTATTGGAGACATATTTACCACTTTTCATATTTGTACT CGATCTACCAACAGAGGATAGAGAAACTTTGAAAGCCTATGTAAAAGATA AAACTTATGAAAAACTGAAATATGTAATCGATTCAAAAGTTAAAATGAAT CGAAATGAGAAAAATGTCTTAGGAAGAATCAATGAATCCTTGGTACAATC AATCGATAATTTGGAACATTTTGATGAGATTACTGTAGATGTGGTGAATG GTTATCACGTAAGTTTGACATTTCAATGGAAAATCATCTCTCTTTAATCT TTAAAATGTTTTATTTTCAAAAAAAACCTTAATATGACAATACCAAATTT TCCTTCTATAACTAAATTTCTAGAATGCCTCCGTATACAACGCAATCCGT TCAGTTTGGTTGAAACAATTACGTCCATTTCTGCCGCTAATTGATCAAAT GGCCATTCAGCAGTATTTTGCACAAAGGTTTCAAACATCAAGCGTCTATC AAAAACTATGTTTATTTTCCAGAAAATCTTCTGAAACTGAGAAAATGTCA ATTTGGGCAAGATTATGGGAAAATCTGAGTGTTTGGATGTATGGAAAGAA AAAATTAAACGAATGTTATGCAATGGAGCCAAAATGTGTTCGACACGCAT TGGAATTGCTTAATTTTGAACAAAGGATTGATTTGGATTTAGCTGCTTAT GAGAACAAGTTTGACAGTGTTGATGGAATTATTAGGGAAAGAGTGAGTCA ATTTAGATATTTAAAAAATGTCAAAAGCCCCAAAACGAATTAATTTTCAG CTTGGCGAAAATCGGAAAAGCAGTGAACTCGACGAATGGTTGCACAGAAA TCGCCCACCAAAAGCGCTACAAGCTATTTTAGATGAGCGAAGTGACATTG AGGAGGAGTAAGATATTTATATTGAATAGATAAACGTTAGATTTTTAGAG CTCTTCAACGCCTTCGAGATAACGGAGTTCTGAGCTCACTCAAACATTAT TACAAAGCAGTTATAGAGTCAAGAAGTCATGAAGAGCAAGAGGTAACCAA TGTTTTAGATAGTTCTTGTCAATTTTCTTACGCATTTTTCGATATTCTGC TAATGGGTGCAAAGTTCTAAATTGTTTTTAATTGTAGCTTTCTATATTAA ACTTTCAGGACATTCGTCATTTCTTCGACATAATGAATGACACATTTGCT CGTTGCTTTGACCCACTACGAGGCCAGTATCATGATTTCTAGAAAAACCC TCTTTTTGACTTCTTCCTCCAT tRNAscan-SE-2.0/Demo/Example2-tRNAs.ss0000644000543100007160000000452113100451013016515 0ustar pchanlowelabMySeq1.trna1 (13-85) Length: 73 bp Type: Thr Anticodon: TGT at 34-36 (46-48) Score: 78.0 HMM Sc=54.80 Sec struct Sc=23.20 * | * | * | * | * | * | * | Seq: GGCCCTATAGCTCAGGGGTtAGAGCACTGGTCTTGTAAACCAGGGGtCGCGAGTTCAAATCTCGCTGGGGCCT Str: 
>>>>>>>..>>>>........<<<<.>>>>>.......<<<<<.....>>>>>.......<<<<<<<<<<<<. MySeq2.trna1 (6-79) Length: 74 bp Type: Arg Anticodon: TCT at 35-37 (40-42) Score: 75.1 HMM Sc=56.60 Sec struct Sc=18.50 * | * | * | * | * | * | * | Seq: GTCTCTGTGGCGCAATGGAcgAGCGCGCTGGACTTCTAATCCAGAGGtTCTGGGTTCGAGTCCCGGCAGAGATG Str: >>>>>>>..>>>>.........<<<<.>>>>>.......<<<<<.....>>>>>.......<<<<<<<<<<<<. MySeq3.trna1 (14-114) Length: 101 bp Type: Ser Anticodon: CGA at 34-36 (47-49) Score: 71.8 Possible intron: 38-56 (51-69) HMM Sc=49.10 Sec struct Sc=22.70 * | * | * | * | * | * | * | * | * | * | Seq: GGCACTATGGCCGAGTGGTtAAGGCGAGAGACTCGAAtggaataaaaagttcggctATCTCTTGGGCTCTGCCCGCGCTGGTTCAAATCCTGCTGGTGTCG Str: >>>>>>>..>>>..........<<<.>>>>>..........................<<<<<.>>>>...<<<<..>>.>>.......<<.<<<<<<<<<. MySeq4.trna1 (6-88) Length: 83 bp Type: Leu Anticodon: AAG at 35-37 (40-42) Score: 65.0 HMM Sc=43.90 Sec struct Sc=21.10 * | * | * | * | * | * | * | * | Seq: GGAGAGATGGCCGAGCGGTccAAGGCGCTGGTTTAAGGcAACCAGTaGCTTCGGGGGCGTGGGTTCGAATCCCACTCTCTTCA Str: >>>>>>>..>>>...........<<<.>>>>>........<<<<<...>>>><<<<..>>>>>.......<<<<<<<<<<<<. MySeq5.trna1 (3-89) Length: 87 bp Type: SeC Anticodon: TCA at 36-38 (38-40) Score: 146.9 HMM Sc=0.00 Sec struct Sc=0.00 * | * | * | * | * | * | * | * | * Seq: GCCCGGATGATCCTCAGTGGTCTGGGGTGCAGGCTTCAAACCTGTAGCTGTCTAGCGACAGAGTGGTTCAATTCCACCTTTCGGGCG Str: >>>>>>>.>..>>>>>>....<<<<<<>>>>>>.......<<<<<<.>>>>>....<<<<<.>>>>.......<<<<<.<<<<<<<. MySeq6.trna1 (7-92) Length: 86 bp Type: Lys Anticodon: CTT at 35-37 (41-43) Score: 72.1 ISM: HMM Sc=40.60 Sec struct Sc=31.50 * | * | * | * | * | * | * | * | * Seq: GACACGGTGGCCGAGTGGTttAAGGCATGAGACACTTGATCTCAAACGGTTCTAACCGAaCGCAGGTTCGAATCCTGCCCGTGTCA Str: >>>>>>>..>>>...........<<<.>>>>>.......<<<<<..>>>>....<<<<...>>>>>.......<<<<<<<<<<<<. 
tRNAscan-SE-2.0/Demo/Example1-tRNAs.stats0000644000543100007160000000745513100451013017236 0ustar pchanlowelab tRNAscan-SE v.2.0rc1 (April 2017) scan results (on host pismo.soe.ucsc.edu) Started: Mon Apr 10 01:02:12 PDT 2017 ------------------------------------------------------------ Sequence file(s) to search: Example1.fa Search Mode: Eukaryotic Results written to: Example1-tRNAs.out Output format: Tabular Searching with: Infernal First Pass->Infernal Isotype-specific model scan: Yes Covariance model: /home/pchan/lib/tRNAscan-SE/models/TRNAinf-euk.cm /home/pchan/lib/tRNAscan-SE/models/TRNAinf-euk-SeC.cm Infernal first pass cutoff score: 10 Temporary directory: /tmp tRNA secondary structure predictions saved to: Example1-tRNAs.ss tRNA predictions saved to: Example1-tRNAs.bed Isotype specific predictions saved to: Example1-tRNAs.iso Search log saved in: Example1-tRNAs.log Search statistics saved in: Example1-tRNAs.stats Reporting HMM/2' structure score breakdown ------------------------------------------------------------ First-pass Stats: --------------- Sequences read: 1 Seqs w/at least 1 hit: 1 Bases read: 40222 (x2 for both strands) Bases in tRNAs: 420 tRNAs predicted: 5 Av. 
tRNA length: 84 Script CPU time: 0.02 s Scan CPU time: 0.77 s Scan speed: 104.5 Kbp/sec First pass search(es) ended: Mon Apr 10 01:02:13 PDT 2017 Infernal Stats: ----------- Candidate tRNAs read: 5 Infernal-confirmed tRNAs: 5 Bases scanned by Infernal: 520 % seq scanned by Infernal: 0.6 % Script CPU time: 0.07 s Infernal CPU time: 8.38 s Scan speed: 62.1 bp/sec Infernal analysis of tRNAs ended: Mon Apr 10 01:02:25 PDT 2017 Overall scan speed: 8706.1 bp/sec tRNAs decoding Standard 20 AA: 5 Selenocysteine tRNAs (TCA): 0 Possible suppressor tRNAs (CTA,TTA): 0 tRNAs with undetermined/unknown isotypes: 0 tRNAs with mismatch isotypes: 0 Predicted pseudogenes: 0 ------- Total tRNAs: 5 tRNAs with introns: 1 | Leu-CAA: 1 | Isotype / Anticodon Counts: Ala : 0 (0) AGC: GGC: CGC: TGC: Gly : 0 (0) ACC: GCC: CCC: TCC: Pro : 1 (1) AGG: GGG: CGG: 1 TGG: Thr : 0 (0) AGT: GGT: CGT: TGT: Val : 0 (0) AAC: GAC: CAC: TAC: Ser : 1 (1) AGA: 1 GGA: CGA: TGA: ACT: GCT: Arg : 0 (0) ACG: GCG: CCG: TCG: CCT: TCT: Leu : 1 (1) AAG: GAG: CAG: TAG: CAA: 1 TAA: Phe : 2 (2) AAA: GAA: 2 Asn : 0 (0) ATT: GTT: Lys : 0 (0) CTT: TTT: Asp : 0 (0) ATC: GTC: Glu : 0 (0) CTC: TTC: His : 0 (0) ATG: GTG: Gln : 0 (0) CTG: TTG: Ile : 0 (0) AAT: GAT: TAT: Met : 0 (0) CAT: Tyr : 0 (0) ATA: GTA: Supres: 0 (0) CTA: TTA: Cys : 0 (0) ACA: GCA: Trp : 0 (0) CCA: SelCys: 0 (0) TCA: tRNAscan-SE-2.0/Demo/Example2-tRNAs.stats0000644000543100007160000000745113100451013017233 0ustar pchanlowelab tRNAscan-SE v.2.0rc1 (April 2017) scan results (on host pismo.soe.ucsc.edu) Started: Mon Apr 10 01:02:38 PDT 2017 ------------------------------------------------------------ Sequence file(s) to search: Example2.fa Search Mode: Eukaryotic Results written to: Example2-tRNAs.out Output format: Tabular Searching with: Infernal First Pass->Infernal Isotype-specific model scan: Yes Covariance model: /home/pchan/lib/tRNAscan-SE/models/TRNAinf-euk.cm /home/pchan/lib/tRNAscan-SE/models/TRNAinf-euk-SeC.cm Infernal first pass cutoff score: 10 
Temporary directory: /tmp tRNA secondary structure predictions saved to: Example2-tRNAs.ss tRNA predictions saved to: Example2-tRNAs.bed Isotype specific predictions saved to: Example2-tRNAs.iso Search log saved in: Example2-tRNAs.log Search statistics saved in: Example2-tRNAs.stats Reporting HMM/2' structure score breakdown ------------------------------------------------------------ First-pass Stats: --------------- Sequences read: 6 Seqs w/at least 1 hit: 6 Bases read: 595 (x2 for both strands) Bases in tRNAs: 504 tRNAs predicted: 6 Av. tRNA length: 84 Script CPU time: 0.01 s Scan CPU time: 2.74 s Scan speed: 0.4 Kbp/sec First pass search(es) ended: Mon Apr 10 01:02:42 PDT 2017 Infernal Stats: ----------- Candidate tRNAs read: 6 Infernal-confirmed tRNAs: 6 Bases scanned by Infernal: 575 % seq scanned by Infernal: 48.3 % Script CPU time: 0.21 s Infernal CPU time: 16.21 s Scan speed: 35.5 bp/sec Infernal analysis of tRNAs ended: Mon Apr 10 01:03:10 PDT 2017 Overall scan speed: 62.1 bp/sec tRNAs decoding Standard 20 AA: 5 Selenocysteine tRNAs (TCA): 1 Possible suppressor tRNAs (CTA,TTA): 0 tRNAs with undetermined/unknown isotypes: 0 tRNAs with mismatch isotypes: 0 Predicted pseudogenes: 0 ------- Total tRNAs: 6 tRNAs with introns: 1 | Ser-CGA: 1 | Isotype / Anticodon Counts: Ala : 0 (0) AGC: GGC: CGC: TGC: Gly : 0 (0) ACC: GCC: CCC: TCC: Pro : 0 (0) AGG: GGG: CGG: TGG: Thr : 1 (1) AGT: GGT: CGT: TGT: 1 Val : 0 (0) AAC: GAC: CAC: TAC: Ser : 1 (1) AGA: GGA: CGA: 1 TGA: ACT: GCT: Arg : 1 (1) ACG: GCG: CCG: TCG: CCT: TCT: 1 Leu : 1 (2) AAG: 1 GAG: CAG: TAG: CAA: TAA: Phe : 0 (0) AAA: GAA: Asn : 0 (0) ATT: GTT: Lys : 1 (0) CTT: 1 TTT: Asp : 0 (0) ATC: GTC: Glu : 0 (0) CTC: TTC: His : 0 (0) ATG: GTG: Gln : 0 (0) CTG: TTG: Ile : 0 (0) AAT: GAT: TAT: Met : 0 (0) CAT: Tyr : 0 (0) ATA: GTA: Supres: 0 (0) CTA: TTA: Cys : 0 (0) ACA: GCA: Trp : 0 (0) CCA: SelCys: 2 (0) TCA: 2 tRNAscan-SE-2.0/Demo/Example1-tRNAs.ss0000644000543100007160000000402413100451013016512 0ustar 
pchanlowelabCELF22B7.trna1 (12619-12738) Length: 120 bp Type: Leu Anticodon: CAA at 35-37 (12653-12655) Score: 74.2 Possible intron: 39-74 (12657-12692) HMM Sc=51.20 Sec struct Sc=23.00 * | * | * | * | * | * | * | * | * | * | * | * Seq: GCACGGATGGCCGAGTGGTctAAGGCGCCAGACTCAAGcgaaatgcttgcctcatgctcgaggtcgactgggtgTTCTGGTACTCGTATGGGTGCGTGGGTTCGAATCCCACTTCGTGCA Str: >>>>>>>..>>>...........<<<.>>>>>...........................................<<<<<.>>>>....<<<<..>>>>>.......<<<<<<<<<<<<. CELF22B7.trna2 (19480-19561) Length: 82 bp Type: Ser Anticodon: AGA at 34-36 (19513-19515) Score: 81.6 HMM Sc=47.50 Sec struct Sc=34.10 * | * | * | * | * | * | * | * | Seq: GCAGTCATGTCCGAGTGGTtAAGGAGATTGACTAGAAATCAATTGGGCTCTGCCCGCGTAGGTTCGAATCCTGCTGACTGCG Str: >>>>>>>..>>>..........<<<.>>>>>.......<<<<<.>>>>...<<<<..>>>>>.......<<<<<<<<<<<<. CELF22B7.trna3 (26367-26439) Length: 73 bp Type: Phe Anticodon: GAA at 34-36 (26400-26402) Score: 82.5 HMM Sc=56.60 Sec struct Sc=25.90 * | * | * | * | * | * | * | Seq: GCCTCGATAGCTCAGTTGGGAGAGCGTACGACTGAAGATCGTAAGGtCACCAGTTCGATCCTGGTTCGGGGCA Str: >>>>>>>..>>>>........<<<<.>>>>>.......<<<<<.....>>>>>.......<<<<<<<<<<<<. CELF22B7.trna4 (26992-26920) Length: 73 bp Type: Phe Anticodon: GAA at 34-36 (26959-26957) Score: 82.5 HMM Sc=56.60 Sec struct Sc=25.90 * | * | * | * | * | * | * | Seq: GCCTCGATAGCTCAGTTGGGAGAGCGTACGACTGAAGATCGTAAGGtCACCAGTTCGATCCTGGTTCGGGGCA Str: >>>>>>>..>>>>........<<<<.>>>>>.......<<<<<.....>>>>>.......<<<<<<<<<<<<. CELF22B7.trna5 (23765-23694) Length: 72 bp Type: Pro Anticodon: CGG at 33-35 (23733-23731) Score: 71.5 HMM Sc=48.20 Sec struct Sc=23.30 * | * | * | * | * | * | * | Seq: GGCCGGATGGTCTAGAGGTATGATTCTCGCTTCGGGTGCGAGAGGtCCCGGGTTCGATTCCCGGTTCGGCCC Str: >>>>>>>..>>>.........<<<.>>>>>.......<<<<<.....>>>>>.......<<<<<<<<<<<<. 
tRNAscan-SE-2.0/Demo/Example1-tRNAs.bed0000644000543100007160000000055613100451013016625 0ustar pchanlowelabCELF22B7 12618 12738 CELF22B7.tRNA1-LeuCAA 742 + 12618 12738 0 2 38,46, 0,74, CELF22B7 19479 19561 CELF22B7.tRNA2-SerAGA 816 + 19479 19561 0 1 82, 0, CELF22B7 23693 23765 CELF22B7.tRNA5-ProCGG 715 - 23693 23765 0 1 72, 0, CELF22B7 26366 26439 CELF22B7.tRNA3-PheGAA 825 + 26366 26439 0 1 73, 0, CELF22B7 26919 26992 CELF22B7.tRNA4-PheGAA 825 - 26919 26992 0 1 73, 0, tRNAscan-SE-2.0/LICENSE0000644000543100007160000010451313100451013013627 0ustar pchanlowelab GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. 
Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. 
To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. 
If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. 
For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". 
c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <http://www.gnu.org/licenses/>. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <http://www.gnu.org/philosophy/why-not-lgpl.html>. tRNAscan-SE-2.0/README0000644000543100007160000001024513665337775013542 0ustar pchanlowelab------------------------------------------------------------- tRNAscan-SE: An improved tool for transfer RNA detection Patricia Chan, Brian Lin, and Todd Lowe School of Engineering, University of California, Santa Cruz, CA -------------------------------------------------------------- Current release: 2.0.6 (May 2020) tRNAscan-SE was written in the PERL (version 5) script language. Input consists of DNA or RNA sequences in FASTA format. tRNA predictions are output in standard tabular or ACeDB format. tRNAscan-SE does no tRNA detection itself, but instead combines the strengths of three independent tRNA prediction programs by negotiating the flow of information between them, performing a limited amount of post-processing, and outputting the results in one of several formats. tRNAscan-SE pioneers the large-scale use of covariance models to annotate tRNA genes in genomes. A covariance model is an implementation of a stochastic context-free grammar, able to integrate both primary sequence and secondary structure information, and is trained on an aligned, structurally annotated set of RNAs.
Any given sequence can be searched for tRNAs by alignment to a tRNA covariance model. tRNAscan-SE 2.0 combines the use of the latest Infernal v1.1 (1) as the covariance model search engine and covariance models specifically trained and built using tRNA sequences from available genomes in the three domains of life for gene prediction. The method replaces the original use of COVE (2) with two prefilters - tRNAscan 1.3 (3) and an implementation of an algorithm described by Pavesi and colleagues (4) for searching eukaryotic pol III tRNA promoters (our implementation referred to as EufindtRNA), which is still available as a backward compatible option. Predicted tRNA genes will then be assessed using a set of isotype-specific covariance models. Comparative analysis among these models enables better annotation, particularly of atypical tRNAs, some of which may produce "recoding" events due to mutations in the anticodon. The new tRNAscan-SE also enables better recognition of tRNA-derived SINEs that are abundant in many eukaryotic genomes by using a post quality filter. This distribution includes the PERL script tRNAscan-SE, the covariance models, all the files necessary to compile and run the complete COVE package (version 2.4.4), all the files necessary to compile and run the modified version of tRNAscan (version 1.4), and all the files needed to compile and run eufindtRNA 1.0 (the cove programs, tRNAscan 1.4, and eufindtRNA are included for use with the tRNAscan-SE program, but may also be run as stand-alone programs). Installation of the PERL (Practical Extraction and Report Language, Larry Wall) interpreter package version 5.0 or later is required to run the tRNAscan-SE PERL script. Users also need to download and install Infernal before installing and using tRNAscan-SE. The Infernal source package can be obtained at http://eddylab.org/infernal/.
You can obtain a copy of this software from http://lowelab.ucsc.edu/software/tRNAscan-SE.tar.gz If you use this software, please cite the Nucleic Acids Research paper describing the program & its analysis of several genomes (5). If you have any questions, bug reports, or suggestions, please e-mail Todd Lowe lowe@soe.ucsc.edu Department of Biomolecular Engineering University of California 1156 High Street Santa Cruz, CA 95064 References 1. Nawrocki, E.P. and Eddy, S.R. (2013) "Infernal 1.1: 100-fold Faster RNA Homology Searches", Bioinformatics, 29, 2933-2935. 2. Eddy, S.R. and Durbin, R. (1994) "RNA sequence analysis using covariance models", Nucl. Acids Res., 22, 2079-2088. 3. Fichant, G.A. and Burks, C. (1991) "Identifying potential tRNA genes in genomic DNA sequences", J. Mol. Biol., 220, 659-671. 4. Pavesi, A., Conterio, F., Bolchi, A., Dieci, G., Ottonello, S. (1994) "Identification of new eukaryotic tRNA genes in genomic DNA databases by a multistep weight matrix analysis of transcriptional control regions", Nucl. Acids Res., 22, 1247-1256. 5. Lowe, T.M. & Eddy, S.R. (1997) "tRNAscan-SE: A program for improved detection of transfer RNA genes in genomic sequence", Nucl. Acids Res., 25, 955-964. tRNAscan-SE-2.0/configure.ac0000644000543100007160000000271513707665137015144 0ustar pchanlowelab## configure.ac ## tRNAscan-SE ## ## Settings for generating a configure file AC_PREREQ([2.69]) AC_INIT([tRNAscan-SE],[2.0.0],[pchan@soe.ucsc.edu]) AC_CONFIG_HEADERS([config.h]) AC_ARG_ENABLE(debugging, AS_HELP_STRING([--enable-debugging],[enable debugging, default: no]), [ case "${enableval}" in yes) enable_debugging=yes ;; no) enable_debugging=no ;; *) AC_MSG_ERROR([bad value ${enableval} for --enable-debugging]) ;; esac], [enable_debugging=no]) # Checks for programs.
AC_PROG_CC if test "$enable_debugging" != "no"; then CFLAGS="-g -Wall" else CFLAGS="-O3" fi # standard install program AC_PROG_INSTALL # Check for perl/get path AC_PATH_PROG(PERL, perl) # Check perl version AC_DEFUN([AX_PROG_PERL_VERSION], [AC_CACHE_CHECK([for Perl version $1 or later], [ax_cv_prog_perl_version], [AS_IF(["$PERL" -e 'require $1;' >/dev/null 2>&1], [$2], [$3])])]) AX_PROG_PERL_VERSION([5.0.0],[AC_MSG_RESULT(yes)],[AC_MSG_ERROR([not compatible with this version of perl], [1])]) # Checks for header files. AC_FUNC_ALLOCA AC_CHECK_HEADERS([limits.h memory.h stddef.h stdlib.h string.h unistd.h]) # Checks for typedefs, structures, and compiler characteristics. AC_CHECK_HEADER_STDBOOL AC_TYPE_SIZE_T # Checks for library functions. AC_FUNC_MALLOC AC_FUNC_REALLOC AC_CHECK_FUNCS([bzero memset re_comp regcomp sqrt strcasecmp strchr strstr]) # use automake AM_INIT_AUTOMAKE([foreign]) # use Makefile.in to generate Makefile AC_CONFIG_FILES([Makefile]) AC_OUTPUT tRNAscan-SE-2.0/src/0000755000543100007160000000000014044111062013412 5ustar pchanlowelabtRNAscan-SE-2.0/src/fast-dbviterbi.c0000644000543100007160000003622111021467304016474 0ustar pchanlowelab/* fast-dbviterbi.c * SRE, Fri Sep 30 15:09:06 1994 * * Search with a covariance model * Fast "banded search" version of dbviterbi.c; subsequence * lengths are bounded by probabilistically determined bounds. * ********** * * To optimize memory access patterns, the score storage is implemented * as a two-matrix version. amx is the * main storage. bmx is a smaller auxiliary matrix with a different * access pattern, holding scores of BEGIN state alignments; it * is used when calculating BIFURC scores. * * amx is [j = 0..1] [y = 0..statenum] [diff = 0..j] * diff == 0 is for off-diagonal boundary conditions (this is why diff is shifted +1) * diff == 1 is for the diagonal, i==j * We only need to keep two j rows in memory (current and previous). 
* Note that this is yet *another* memory access pattern and it's different * from dbviterbi.c!!! * * bmx is [y = 0..statenum] [j = 0..N] [ diff = 0..j] * a j,diff matrix exists only where y is a BEGIN state * * The 2.0 implementation allows variable storage per node rather * than storing and calculating a fixed max number of states per node, * which should save up to 2x in both time and space. * * An optimization is made which requires END states to be explicitly * added, so statenum (the number of states in the integer model) * is *inclusive* of ENDs. */ #include #include #include #include #include "squid.h" #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif static int allocate_mx(struct istate_s *icm, int statenum, int window, int ****ret_amx, int ****ret_bmx); static int init_mx (struct istate_s *icm, int statenum, int N, int ***amx, int ***bmx); static int recurse_mx (struct istate_s *icm, int statenum, int *minb, int *maxb, char *seq, int seqlen, int window, int ***amx, int ***bmx, int ithresh, int (*gotone_f)(int, int, double)); static void free_mx (int ***amx, int ***bmx, int statenum, int window); /* Function: FastViterbiScan() * * Purpose: Scanning version of the Viterbi alignment algorithm, * for finding matches in a long sequence. * * Args: icm - the model to align sequence to (int log-odds) * statenum - length of model in states (inclusive of END) * minb - minimum length bounds for states * maxb - maximum length bounds for states * seq - sequence to align model to * window - scanning window size (nucleotides) * thresh - scores above this are reported through gotone_f() * gotone_f - function which gets told about a match * * Return: 1 on success, 0 on failure. 
*/ int FastViterbiScan(struct istate_s *icm, int statenum, int *minb, int *maxb, char *seq, int window, double thresh, int (*gotone_f)(int, int, double)) { int ***amx; /* the main score matrix */ int ***bmx; /* the BEGIN score matrix */ int N; /* length of sequence */ int ithresh; /* thresh, converted and scaled to int */ N = strlen(seq); seq--; /* convert to 1..N. Ugh! */ ithresh = (int) (thresh * INTPRECISION); if (! allocate_mx(icm, statenum, window, &amx, &bmx)) return 0; #ifdef DEBUG printf("allocated matrices\n"); #endif if (! init_mx(icm, statenum, window, amx, bmx)) return 0; #ifdef DEBUG printf("matrices initialized\n"); #endif if (! recurse_mx(icm, statenum, minb, maxb, seq, N, window, amx, bmx, ithresh, gotone_f)) return 0; #ifdef DEBUG printf("recursion finished\n"); #endif /* terminate scanning hit reporting */ ReportScanHit(-1,-1, 0.0, gotone_f); free_mx(amx, bmx, statenum, window); return 1; } /* Function: allocate_mx() * * Purpose: Malloc space for the score matrices. * amx and atr are indexed as j, i, y. * bmx and btr are indexed as k, j, i. * In the two sequence dimensions j, i they are * diagonal (+1 off diagonal) matrices with * rows j = 0..N, i = 1..j+1. * In the node dimension k bmx and btr are k = 0..M. * In the state dimension y amx and atr are y = 0..numstates. * * Args: icm - the int, log-odds, state-based model * statenum - number of states in model * window - length of scanning window * ret_amx - RETURN: main score matrix * ret_bmx - RETURN: BEGIN score matrix * * Return: Ptr to allocated scoring matrix, or * dies and exits. */ static int allocate_mx(struct istate_s *icm, int statenum, int window, int ****ret_amx, int ****ret_bmx) { int ***amx; int ***bmx; int diag, j, y; /* Main matrix, amx: fastest varying index is y (j,i,y) * we only keep two rows for j, 0 and 1. 
*/ /* malloc for j = 0..1 rows */ if ((amx = (int ***) malloc (2 * sizeof(int **))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); for (j = 0; j <= 1; j++) /* loop over rows j = 0..1 */ { /* malloc for diag = 0..window cols */ if ((amx[j] = (int **) malloc ((window + 1) * sizeof(int *))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); /* loop over cols diag = 0..window */ for (diag = 0; diag <= window; diag++) /* malloc for y = 0..statenum-1 decks */ if ((amx[j][diag] = (int *) malloc ((statenum) * sizeof (int ))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); } /* B auxiliary matrix: fastest varying index is diag (y,j,diag) * bmx keeps score decks for BEGIN states */ /* 0..statenum-1 decks */ if ((bmx = (int ***) malloc (statenum * sizeof(int **))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); for (y = 0; y < statenum; y++) { bmx[y] = NULL; /* we keep score info for BEGIN states */ if (icm[y].statetype == uBEGIN_ST) { /* j= 0..window-1 rows */ if ((bmx[y] = (int **) malloc ((window) * sizeof(int *))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); /* diff = 0..window columns */ for (j = 0; j < window; j++) if ((bmx[y][j] = (int *) malloc ((window+1) * sizeof(int ))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); } } *ret_amx = amx; *ret_bmx = bmx; return 1; } /* Function: free_mx() * * Purpose: Free the space allocated to the scoring and traceback matrices. * Precisely mirrors the allocations above in allocate_cvmx(). 
* * Return: (void) */ static void free_mx(int ***amx, int ***bmx, int statenum, int window) { int diag, j, y; /* Free the main matrix, amx: * amx[j][i][y] = [0..1] [0..window] [0..statenum-1] */ for (j = 0; j <= 1; j++) { for (diag = 0; diag <= window; diag++) free(amx[j][diag]); free(amx[j]); } free(amx); /* Free the auxiliary matrix, bmx * bmx[y][j][i] = [0..statenum-1] [0..window] [0..window] */ for (y = 0; y < statenum; y++) { if (bmx[y] != NULL) { for (j = 0; j < window; j++) free(bmx[y][j]); free(bmx[y]); } } free(bmx); } /* Function: init_mx() * * Purpose: Initialization of the scoring matrices. We initialize the off-diagonal, * the diagonal, and the "floor" (end states) of the cube. * * Return: 1 on success, 0 on failure. */ static int init_mx(struct istate_s *icm, /* integer model */ int statenum, /* number of states in icm */ int window, /* size of scanning window on sequence */ int ***amx, int ***bmx) { int diag, j, y; /* counters for indices over the cvmx */ int ynext; /* index of next state k+1 */ int *beam; /* z-axis vector of numbers in amx */ /* Init the whole amx to -Infinity. We do this with memcpy, trying * to be fast. We fill in j=0,diag=0 by hand, then memcpy() the other * columns. */ for (y = 0; y < statenum; y++) amx[0][0][y] = amx[1][0][y] = NEGINFINITY; for (diag = 1; diag <= window; diag++) { memcpy(amx[0][diag], amx[0][0], statenum * sizeof(int)); memcpy(amx[1][diag], amx[0][0], statenum * sizeof(int)); } /* Init the whole bmx to -Inf. We know state 0 is a begin (it's ROOT), so we * start there, and memcpy rows as needed. */ for (diag = 0; diag <= window; diag++) bmx[0][0][diag] = NEGINFINITY; for (j = 1; j < window; j++) memcpy(bmx[0][j], bmx[0][0], (window+1) * sizeof(int)); for (y = 1; y < statenum; y++) if (bmx[y] != NULL) for (j = 0; j < window; j++) memcpy(bmx[y][j], bmx[0][0], (window+1) * sizeof(int)); /* Init the off-diagonal (j = 0..window-1; diag == 0) with -log P scores. 
* End state = 0; * del, bifurc states are calc'ed * begin states same as del's * THIS IS WASTEFUL AND SHOULD BE CHANGED. */ for (j = 0; j < window; j++) for (y = statenum-1; y >= 0; y--) { /* Set the alignment of END states to the off-diagonal (diag = 0) * to be zero, and never touch them again. */ if (icm[y].statetype == uEND_ST) amx[j%2][0][y] = 0; else if (icm[y].statetype == uBIFURC_ST) amx[j%2][0][y] = bmx[icm[y].tmx[0]][j][0] + bmx[icm[y].tmx[1]][j][0]; else if (icm[y].statetype == uDEL_ST || icm[y].statetype == uBEGIN_ST) { /* only calc DEL-DEL and BEGIN-DEL transitions. Since * we optimized the state transition tables, removing * the unused ones, we don't know where the number * for "to DEL" is! But we can find it, because it'll * be the connection to a non-infinite score */ beam = amx[j%2][0] + y + icm[y].offset; for (ynext = 0; ynext < icm[y].connectnum; ynext++) { if (*beam != NEGINFINITY) amx[j%2][0][y] = *beam + icm[y].tmx[ynext]; beam++; } } /* make a copy into bmx if y is a BEGIN */ if (icm[y].statetype == uBEGIN_ST) bmx[y][j][0] = amx[j%2][0][y]; } return 1; } /* Function: recurse_mx() * * Purpose: Carry out the fill stage of the dynamic programming * algorithm. After each j row is filled in, check the score * of best full alignment ending at this row; if greater * than threshold (ithresh), report it. * * Returns: 1 on success, 0 on failure. 
*/
/* Implementation notes for recurse_mx(), derived from the code below:
 *   - amx is addressed amx[j % 2][diff][state]; only two j rows are kept
 *     and they are reused alternately.
 *   - bmx is addressed bmx[state][j % window][diff]; it is written only
 *     for BEGIN states and read back when a BIFURC state is scored.
 *   - For each state y only diff values in minb[y]..maxb[y] (and not
 *     exceeding j) are visited.
 *   - After each j, the best bmx[0][j % window][diff] over
 *     diff = 1..window is compared with ithresh and, if larger, passed to
 *     ReportScanHit() together with the coordinates j-bestdiff+1 .. j.
 *   - The function has no failure path; it always returns 1.
 */
static int
recurse_mx(struct istate_s *icm,      /* integer, state-form model */
           int      statenum,         /* number of states in icm   */
           int     *minb,             /* per-state smallest diff to visit */
           int     *maxb,             /* per-state largest diff to visit  */
           char    *seq,              /* sequence, 1..seqlen       */
           int      seqlen,           /* length of seq             */
           int      window,           /* length of scanning window on seq */
           int   ***amx,              /* main scoring matrix       */
           int   ***bmx,              /* bifurc scoring matrix     */
           int      ithresh,          /* reporting threshold       */
           int    (*gotone_f)(int, int, double)) /* callback handed on to ReportScanHit() */
{
  int i, j, y;                  /* indices for 3 dimensions                    */
  int aj;                       /* 0 or 1, index for j in A matrix             */
  int bj;                       /* 0..window-1, index for j in B matrix        */
  int diff;                     /* loop counter for difference: diff = j-i + 1 */
  int symi, symj;               /* symbol indices for seq[i], seq[j]           */
  int sc;                       /* tmp for a score                             */
  int ynext;                    /* index of next state y                       */
  int bestdiff, bestscore;      /* best ROOT score in the finished row, and its diff */
  int *beam;                    /* ptr to a beam (z-axis vector)               */
  int leftdiff;                 /* diff coord of BEGIN_L of a bifurc           */
  int leftj;                    /* j coord of BEGIN_L of a bifurc              */
  int **left_p;                 /* pointer into whole 2D deck of BEGINL's of a bifurc */
  int *right_p;                 /* ptr into row of BEGIN_R's of a bifurc       */
  int *scp;                     /* score pointer: ptr into beam of scores being calc'ed */
  struct istate_s *st;          /* state pointer: ptr at current state in icm  */
  int *tmx;                     /* walks st->tmx in step with beam             */
  int emitsc;                   /* emission score added after the best transition is found */

  for (j = 1; j <= seqlen; j++)
    {
      aj = j % 2;               /* 0 or 1 index in amx */
      bj = j % window;          /* 0..window-1 index in bmx */
      symj = SymbolIndex(seq[j]);

      /* States are visited from last to first, as in init_mx() */
      for (y = statenum-1; y >= 0; y--)
        {
          st = &icm[y];
          for (diff = minb[y]; diff <= maxb[y] && diff <= j; diff++)
            {
              i = j - diff + 1;
              symi = SymbolIndex(seq[i]);
              scp = &amx[aj][diff][y];

              if (st->statetype != uBIFURC_ST) /* a normal (non-BIFURC) state */
                {
                  /* Connect the "beam" pointer to the appropriate
                   * starting place in the ynext scores we're connecting
                   * y to. Note that `continue` inside this switch skips
                   * to the next diff value.
                   */
                  switch (st->statetype) {
                  case uBEGIN_ST:
                  case uDEL_ST:
                    /* emits nothing: same j row, same diff */
                    beam = amx[aj][diff];
                    emitsc = 0;
                    break;
                  case uMATP_ST: /* !aj toggles from 0 to 1 and vice versa */
                    /* emits both i and j, so needs at least two symbols */
                    if (diff == 1) continue;
                    beam = amx[!aj][diff-2];
                    emitsc = st->emit[symi * ALPHASIZE + symj];
                    break;
                  case uMATR_ST:
                  case uINSR_ST:
                    /* emits j: previous j row, diff shortened by one */
                    beam = amx[!aj][diff-1];
                    emitsc = st->emit[symj];
                    break;
                  case uMATL_ST:
                  case uINSL_ST:
                    /* emits i: same j row, diff shortened by one */
                    beam = amx[aj][diff-1];
                    emitsc = st->emit[symi];
                    break;
                  case uEND_ST:
                    /* END cells are not recomputed here */
                    continue;
                  default: Die("no such state type %d", st->statetype);
                  }
                  beam += y + st->offset;
                  tmx = st->tmx;

                  /* Init for ynext == 0 case */
                  *scp = *beam + *tmx;

                  /* Calculate remaining cases. The sum is only formed
                   * when the raw connected score already beats the
                   * current best -- presumably this relies on transition
                   * scores never being positive; TODO confirm.
                   */
                  for (ynext = 1; ynext < st->connectnum; ynext++)
                    {
                      beam++;
                      tmx++;
                      if (*beam > *scp)
                        {
                          sc = *beam + *tmx;
                          if (sc > *scp)
                            *scp = sc;
                        }
                    }

                  /* Add emission scores now */
                  *scp += emitsc;

                  /* Make a copy into bmx if this is a BEGIN state */
                  if (st->statetype == uBEGIN_ST)
                    bmx[y][bj][diff] = *scp;
                } /* end block of normal state stuff */

              else /* a BIFURC state */
                {
                  /* tmx[0] and tmx[1] hold the state indices of the two
                   * branches. The left branch is walked backwards in
                   * both diff and j while the right branch pointer
                   * advances one cell per step.
                   */
                  leftdiff = diff;
                  leftj = bj;
                  right_p = bmx[st->tmx[1]][leftj];
                  left_p = bmx[st->tmx[0]];

                  /* init w/ case that left branch emits it all */
                  *scp = left_p[leftj][leftdiff] + *right_p;
                  while (leftdiff > 0)
                    {
                      leftdiff--;
                      leftj = leftj ? leftj-1 : window-1; /* scan window wraparound */
                      right_p++;
                      sc = left_p[leftj][leftdiff] + *right_p;
                      if (sc > *scp)
                        *scp = sc;
                    }
                }
            } /* end loop over diff */
        } /* end loop over states */

      /* We've completed a row. Now we can examine the scores in diff,
       * aj, ROOT_ST to decide whether to report this row. If we do,
       * we report the 1..seqlen i, j coords of the matching subsequence
       * in seq, as well as the score converted to double-precision bits.
       *
       * NOTE(review): the division below is evaluated before the cast to
       * double. If INTPRECISION is an integer constant the quotient is
       * truncated first -- confirm how INTPRECISION is defined.
       */
      bestdiff = 1;
      bestscore = bmx[0][bj][1];
      for (diff = 2; diff <= window; diff++)
        if (bmx[0][bj][diff] > bestscore)
          {
            bestscore = bmx[0][bj][diff];
            bestdiff = diff;
          }
      if (bestscore > ithresh)
        if (! ReportScanHit(j - bestdiff + 1, j, (double)(bestscore / INTPRECISION), gotone_f))
          Warn("caller ignored report of a match!");
    } /* end loop over j */

  return 1;
}
tRNAscan-SE-2.0/src/trnascan.c0000644000543100007160000017556414044141217015416 0ustar pchanlowelab/* * Copyright, 1991, The Regents of the University of California.
* This software was produced by the Los Alamos National Laboratory, * which is operated by the University of California for the United * States Department of Energy under contract W-7405-ENG-36. The * U. S. Government is licensed to use, reproduce, and distribute * this software. Permission is granted to the public to copy and * use this software without charge, provided that this Notice and * any statement of authorship are reproduced on all copies. * Neither the Government nor the University makes any warranty, * express or implied, or assumes any liability or responsibility * for the use of this software. */ #define PROGRAM "trnascan" #define RELEASE "1.4 (Feb 96)" #define DERIV_VERSION "1.3 (Oct 91)" #define BRIEF "'Identification of tRNA genes in genomic DNA'" #define CITATION "Fichant and Burks, J. Mol. Biol. (1991) 220:659-671." #define MODIF "(modified & optimized for use in tRNAscan-SE package by T. Lowe 2/96)" /* Modified by T. Lowe 11/95 */ /* Changes: 1) Search parameters named in #define constants 2) Print statements added to help trace progress of search - VERBOSE constant must be defined Trace statements sent to "tscan.verb.out" 3) Bug that caused program to crash on any non-ACGT sequence characters fixed 4) fgetseq() modified to correctly read in fasta sequences 5) compstrand() modified to increase efficiency & getseqsize() added to allow input sequences of any length (memory allowing) 6) Numerous calls to strlen(sequence) eliminated for efficiency 7) Calls to myindex() function eliminated - replaced by in-line switch statements for efficiency 8) basepairings() function rewritten for efficiency 9) program ANSI-fied to allow compilation with gcc 10) Fixed bug: indexing out of 'ntab' array bounds causing unpredictable side-effects ** Modifications result in over 200-fold speed increase ** -T. 
Lowe 9/2000 added "-i" option to allow alternate start of sequence nuc numbering */ /* #define NO_AMBIG -use this option to eliminate conservative * calling of 'N's as base pairing matches * in tRNAs (this gives more false positives) */ static char banner[] = "trnascan: scan a sequence for tRNAs"; char usage[] = "\n\ Usage: trnascan [-options] \n\ where supported options are:\n\ -s : use original tRNAscan 1.3 parameters (default)\n\ -r : use relaxed search parameters (used with tRNAscan-SE)\n\ -a : use alternate (user-set) search parameters\n\ -c : suppress credits\n\ -o : write results to \n\ -i : start sequence numbering at (def=1)\n\ -h : print (this) short help message\n\n"; #include #include #include #include #include #ifdef MEMDEBUG #include "dbmalloc.h" #endif #define TRUE 1 #define FALSE 0 #ifndef TSCANDIR #define TSCANDIR "/usr/local/lib/trnascan" #endif #define MAXLINE 1000 /* max input seq line length */ #define BUF_SIZE 100 /* extra room added onto allocated sequence */ /* over that determined by getseqsize() */ #define MAX(x,y) (((x) > (y)) ? 
(x) : (y)) /* #define VERBOSE */ /* Original version 1.3 default parameters */ #define ST_SG_CUTOFF 5 /* general score (SG) cutoff */ #define ST_TPC_SIG_THRESH 0.40 /* TPC signal sequence matrix score */ /* cutoff */ #define ST_D_SIG_THRESH 0.40 /* D signal sequence matrix score cutoff */ #define ST_TPC_INV 2 /* Number of TPC matrix invariant bases */ /* allowed NOT to be invariant */ #define ST_TPC_INCSG 5 /* Number of base pairs required in TPC */ /* stem to increment general score */ #define ST_TPC_KEEP 4 /* Number of base pairs required in TPC */ /* stem to keep trna as a candidate */ #define ST_D_INV 1 /* Number of D matrix invariant bases */ /* allowed NOT to be invariant */ #define ST_LOOK_FOR_ACLOOP_SG 4 /* Minimum SG required to begin looking */ /* for anticodon loop */ #define ST_ACLOOP_MIN 4 /* Minimum base pairs required in */ /* anticodon loop */ #define ST_AA_INCSG 7 /* Number of base pairs in Amino acyl */ /* stem needed to increment SG */ #define ST_AA_KEEP 6 /* Number of base pairs in Amino acyl */ /* stem needed to keep tRNA candidate */ /* "Relaxed" parameters - used by default with */ /* tRNAscan-SE program. Makes tRNAscan into */ /* rough pre-filter for Covariance tRNA prediction */ /* program by S. 
Eddy */ #define RX_SG_CUTOFF 5 #define RX_TPC_SIG_THRESH 0.40 #define RX_D_SIG_THRESH 0.30 #define RX_TPC_INV 2 #define RX_TPC_INCSG 4 #define RX_TPC_KEEP 2 #define RX_D_INV 2 #define RX_LOOK_FOR_ACLOOP_SG 3 #define RX_ACLOOP_MIN 3 #define RX_AA_INCSG 5 #define RX_AA_KEEP 4 /* "Alternate" params - for experimenting */ /* with other param values */ #define ALT_SG_CUTOFF 4 #define ALT_TPC_SIG_THRESH 0.40 #define ALT_D_SIG_THRESH 0.30 #define ALT_TPC_INV 2 #define ALT_TPC_INCSG 4 #define ALT_TPC_KEEP 2 #define ALT_D_INV 2 #define ALT_LOOK_FOR_ACLOOP_SG 3 #define ALT_ACLOOP_MIN 3 #define ALT_AA_INCSG 6 #define ALT_AA_KEEP 4 #define MIN_VAR_LOOP 28 /* Minimum variable loop size, assumes min */ /* intron length = 8bp */ #define MAX_INTRON_LEN 60 /* Maximum allowable intron length */ #define MIN_SEQ_LEN 70 /* Minimum length of sequence that will be */ /* for tRNAs */ #define STRICT_PARAMS 1; #define RELAXED_PARAMS 2; #define ALT_PARAMS 3; typedef struct pset { float tpc_sig_thresh, d_sig_thresh; int sg_cutoff, tpc_inv, tpc_incsg, tpc_keep, d_inv, look_for_acloop_sg, acloop_min, aa_incsg, aa_keep; } Param_set_type; Param_set_type ps; void set_search_params (Param_set_type *ps, int params); /* Subroutine that accomplishes the end of the test for the presence of a tRNA gene */ void following_search(long int pos, /* first position of the found T-Psi-C signal */ long int pos1, /* first position of the found D signal */ char *ptr1, /* pointer to the first position of the D signal */ char *ptr3, /* ptr3=ptr1+2 pointer to first position of the D arm */ int lpair, int nloop, /*nloop=0 test of the sequence, */ /*nloop=1 test of the complementary sequence */ char *sequence, long int seqlen, FILE *fpo, /*pointer to the output file */ FILE *fpverb, char *name, int score, /* Value of the general score SG */ int match2, /* integer testing the presence of D */ /* arm with 3 base-pairings */ int *ntrna, int *npred, int *match, long int sqoffset /* offset nucleotide numbering by this much 
(set with -i param) */ ); /* Subroutine to read the consensus matrix */ void lectval(FILE *fp, /* pointer to the consensus matrix file */ float (*table_cons)[4], /* table containing the frequency of each base at each position of the signal */ int (*table_inv)[2], /* table containing the position and the nature of the invariant bases found in the signal. Code for the bases: A=0, C=1, G=2 and T=3. */ int *lsig, /*lsig=length of the signal */ int *ktot, /* ktot= number of invariant bases */ float *maxtot /* maxtot= sum of the maximum frequencies */ ); /* Subroutine reading the sequence */ /* Modified to correctly read FASTA sequence files */ int fgetseq(char *name, /* string w/name of the sequence */ char *sequence, /* character string containing the sequence */ long int *seqlen, /* length of sequence */ FILE *fpi); /* input file pointer */ /* Subroutine reading & returning the sequence length */ int getseqsize(FILE *fpi); /* input file pointer */ /* Subroutine looking for the presence of a given signal, returns 1 if a signal is found and 0 otherwise. 
It also return the table 'weight' containing the frequencies of the oberved bases in the windowed sequence and the number 'ninv' of invariant bases found in the windowed sequence*/

int readsignal(char *ptr,               /* pointer to the sequence */
               int (*table_inv)[2],     /* table containing the position and
                                           nature of the invariant bases found
                                           in the consensus matrix */
               int *lsig,               /* lsig= length of the signal */
               int *ktot,               /* ktot= number of invariant bases in
                                           the consensus matrix */
               float *weight,           /* table containing the frequencies of
                                           the observed base at each position
                                           of the windowed sequence tested */
               float (*table_cons)[4],  /* table corresponding to the
                                           consensus matrix */
               int *ninv,               /* ninv= number of invariant */
                                        /* bases in the windowed sequence */
               int threshold_inv);      /* Number of invariant bases */
                                        /* allowed not to be invariant */

/* Subroutine that calculates the similarity score on the potential
   signal previously retained by the subroutine readsignal. This
   subroutine returns 1 if the computed score is greater than or equal
   to the defined threshold and 0 otherwise. It also returns the value
   of the computed score (score) */

int scoring(float *weight,        /* table containing the frequencies of the
                                     observed base at each position of the
                                     potential signal */
            int lsig,             /* length of the signal */
            float max,            /* sum of the maximum frequencies found in */
                                  /* the consensus matrix */
            int ktot,             /* number of invariant bases found in the
                                     consensus matrix */
            float *score,         /* value of the computed score on the
                                     potential signal */
            float ThresholdValue, /* defined threshold for the similarity score */
            int ninv              /* number of invariant bases found in the
                                     potential signal */
            );

/* Subroutine looking for base-pairings between two parts of the
   sequence. It returns the number of base-pairings found (ncomp) */

void basepairing(char *ptr,   /* pointer to the sequence */
                 int npair,   /* number of base-pairings forming a given arm */
                 int lpair,   /* number of nucleotides found between the first
                                 position of the first part of the sequence
                                 involved in the stem and the last position of
                                 the second part of the sequence involved in
                                 the stem */
                 int *ncomp   /* number of base-pairings observed between the
                                 two parts of the sequence tested */
                 );

/* Subroutine that complements the sequence in place. The caller (main)
   receives the complementary sequence through the same pointer */

void compstrand(char **sequence,  /* pointer to the sequence string */
                long int seqlen   /* sequence length */
                );

/* Subroutine that codes the anticodon signal sequence as a number
   between 1 and 65. It returns this number through num. */

void codage(char *anticodon,  /* anticodon signal sequence */
            int length1,      /* length of the anticodon signal, length1=3 */
            int *num          /* number associated with the anticodon signal
                                 sequence */
            );

/* Subroutine that determines the tRNA gene family */

void corresaa(int num,          /* Number coding the anticodon signal sequence */
              char *type_trna   /* tRNA gene family */
              );

/* Subroutine that prints the results of the search */

void printresult(FILE *fpo,      /* output file pointer */
                 FILE *fpverb,   /* verbose trace output file pointer */
                 char *name,     /* character string for the name of the
                                    sequence */
                 long int pos1,  /* first position of the D signal */
                 long int pos,   /* first position of the T-Psi-C signal */
                 int lpair,      /* number of nucleotides between the */
                                 /* first position of the D arm and the last one */
                 int lpair1,     /* number of nucleotides between the first
                                    position of the aminoacyl arm and the last
                                    one */
                 int lpair2,     /* number of nucleotides between the first
                                    position of the anticodon arm and the last
                                    one */
                 int nloop,      /* nloop=0, scanning of the direct strand;
                                    nloop=1 scanning of the complementary
                                    strand */
                 int *ntrna,     /* number of tRNA genes predicted in the
                                    sequence */
                 char *chaine2, /* character string for the predicted
tRNA gene sequence */ char *sequence, /* character string containing the sequence tested */ long int length, /* length of the sequence */ int *match, /* match=1 if at least one tRNA gene has been found on the direct strand and 0 otherwise */ int ncomp, /* number of base-pairings in the anticodon arm of the predicted tRNA gene */ char *type_trna, /* character string for the tRNA gene family */ char *anticodon, /* character string for the anticodon signal sequence */ long int sqoffset /* offset nucleotide numbering by this much (set with -i param) */ ); int main(int argc, char **argv) { /* pointers to the different files fpi=input file, fpo=output file, fpcons1= T-Psi-C matrix file, fpcons2= D matrix file */ extern Param_set_type ps; /* search parameters */ FILE *fpi,*fpo,*fpcons1,*fpcons2, *fpverb; /* lsig1= length of the T-Psi-C signal, lsig2= length of the D signal ktot1= number of invariant bases in the T-Psi-C signal ktot2= number of invariant bases in the D signal */ int lsig1=0, lsig2=0, ktot1=0, ktot2=0; long int pos,pos1, begin; /* start position of the T-Psi-C signal (pos) and D signal (pos1) and of the search for the D signal (begin) */ int npair=0, lpair=0; /* variables used to test the presence of a stem, see definition further in the program */ float table_cons1[30][4]; /* table containing the frequency of each base at each position of the T-Psi-C signal */ float table_cons2[30][4]; /* table containing the frequency of each base at each position of the D signal */ float maxtot1=0, maxtot2=0; /* sum of the maximum frequencies found in the T-Psi-C matrix (maxtot1) and in the D matrix (maxtot2) */ float weight1[30]; /* table containing the frequency of the observed base at each position of the windowed sequence tested (T-Psi-C matrix) */ float weight2[30]; /* table containing the frequency of the observed base at each position of the windowed sequence tested (D matrix) */ int table_inv1[30][2]; /* table containing the position and the nature of the 
invariant bases found in the T-Psi-C matrix */ int table_inv2[30][2]; /* table containing the position and the nature of the invariant bases found in the D matrix */ int ntab[5][2]; /* table containing for each potential D arm found in the windowed sequence, the number of base-pairings and the number of nucleotides that separates the first position of the D arm and the last one */ char *sequence; /* character string containing the sequence */ long int seqlen; /* keeps the length of the current sequence */ long int sqoffset=0; /* start numbering nucleotides by this offset (def=0) */ char *ptr,*ptr1,*ptr3, *ptrstart; /* pointers to the sequence, their definitions are given at the place they are used in the program */ char name[80]; /* character string containing the name of the sequence */ char tscan_dir[120]; /* holds name of directory of consensus files */ int ntrna; /* number of tRNA genes predicted in the sequence and its complement */ int npred=0; /* Total number of tRNA genes predicted in the sequences of the input file */ int nseq=0; /* Number of sequences tested (number of sequences of the input file) */ long int lseq=0; /* Number of nucleotides tested */ int score; /* variable corresponding to the general score SG */ int ninv=0; /* number of invariant bases oberved in the windowed sequence for each signal */ int ncomp=0; /* number of base-pairings observed in the sequence tested for the presence of one of the four arms */ int threshold_inv; /* number of invariant bases that are allowed not to be invariant */ int match; /* if at least one tRNA gene is predicted of the direct strand, match=1, if no tRNA gene is found on the direct strand, match=0 */ int first_score=0,match2,h, nloop; /* ThresholdValue1= threshold value of the similarity score used to retain a T-Psi-C signal ThresholdValue2= threshold value of the similarity score used to retain a D signal Changing the threshold value of the similarity score, MODIFY '0.4' in the following line. 
*/ float ThresholdValue1, ThresholdValue2; float score1=0, score2=0; int i,j; int params; int suppress_credits; /* flag for display of credits */ char *seqfile, /* name of input sequence file */ *outfile, /* name of output file (if not sent to stdout) */ *verbfile; /* name of file for verbose output */ int optc; extern char *optarg; /* for getopt() */ extern int optind; /* for getopt() */ /*********************************************** * Parse command line ***********************************************/ suppress_credits = FALSE; params = STRICT_PARAMS; outfile = NULL; verbfile = NULL; while ((optc = getopt(argc, argv, "csraho:v:i:")) != -1) switch (optc) { case 'c': suppress_credits = TRUE; break; case 's': params = STRICT_PARAMS; break; case 'r': params = RELAXED_PARAMS; break; case 'a': params = ALT_PARAMS; break; case 'o': outfile = optarg; break; case 'v': verbfile = optarg; break; case 'i': sqoffset = atoi(optarg)-1; break; case 'h': printf("%s\n version %s\n%s\n", banner, RELEASE, usage); exit(0); default: fprintf(stderr,"unrecognized option %c\n", optc); exit(1); } if (argc - optind != 1) { fprintf(stderr,"%s\n", usage); exit(1); } seqfile = argv[argc-1]; /* open the sequence file */ if ((fpi=fopen(seqfile,"r")) == NULL) { fprintf(stderr,"tRNAscan1.4: FATAL: Cannot open the input sequence file %s\n",seqfile); exit(1); } /* open the output file */ if (outfile == NULL) { fpo = stdout; } else if ((fpo=fopen(outfile,"w")) == NULL) { fprintf(stderr,"tRNAscan1.4: FATAL: Cannot open the output file %s\n",outfile); exit(1); } /* open Verbose output file */ if (verbfile != NULL) if ((fpverb=fopen(verbfile,"w")) == NULL) { fprintf(stderr,"tRNAscan1.4: Cannot open verbose output file %s\n",verbfile); exit(1); } /* open the consensus matrix file for the T-Psi-C signal */ strcpy(tscan_dir,TSCANDIR); if (((fpcons1=fopen("TPCsignal","r")) == NULL) && ((fpcons1=fopen(strcat(tscan_dir,"/TPCsignal"),"r")) == NULL)) { fprintf(stderr,"tRNAscan1.4: main cannot open TPCsignal 
consensus file\n"); exit(1); } /* open the consensus matrix file for the D signal */ strcpy(tscan_dir,TSCANDIR); if (((fpcons2=fopen("Dsignal","r")) == NULL) && ((fpcons2=fopen(strcat(tscan_dir,"/Dsignal"),"r")) == NULL)) { fprintf(stderr,"tRNAscan1.4: main cannot open Dsignal consensus file\n"); exit(1); } /* Set search parameters */ set_search_params(&ps,params); ThresholdValue1 = ps.tpc_sig_thresh; ThresholdValue2 = ps.d_sig_thresh; /* Credits */ if (!suppress_credits) { printf("\n %s %s -- derived from version %s\n", PROGRAM, RELEASE, DERIV_VERSION); printf("\n Please cite: %s\n %s\n", BRIEF,CITATION); printf(" %s\n\n",MODIF); } /* reading the two consensus file */ lectval(fpcons1,table_cons1,table_inv1,&lsig1,&ktot1,&maxtot1); lectval(fpcons2,table_cons2,table_inv2,&lsig2,&ktot2,&maxtot2); /* main loop for reading & analyzing one sequence at a time */ while (!feof(fpi)) { /* find out sequence size before allocating memory & reading in */ seqlen = getseqsize(fpi); sequence= (char *) calloc (seqlen+BUF_SIZE,sizeof(char)); if (sequence == NULL ) { fprintf(stderr,"tRNAscan1.4: Can't malloc for sequence\n"); exit(-1); } /* reading the name of the sequence and the sequence from the input file */ if (fgetseq(name,sequence,&seqlen,fpi) == 0) break; nseq++; lseq += seqlen; match=0; for (nloop=0; nloop <=1; nloop++) { /* if the sequence is shorter than 70 bp long it is skipped */ if (seqlen < MIN_SEQ_LEN) break; ntrna=0; /* search of tRNA genes starts pos= first position of the T-Psi-C signal ptr= pointer to the first position of the T-Psi-C signal */ for (pos=44, ptr=(sequence+43); pos= 3 (ktot1=4) change in the threshold of the T-Psi-C signal for the increment of SG, MODIFY the following line */ if (ninv >= ktot1-1) { score++; #ifdef VERBOSE fprintf(fpverb,"+ TPC invariant bp: %d. SG++\n",ninv); } else { fprintf(fpverb,"X TPC invariant bp: %d. 
NO SG++\n",ninv); #endif } /* Computation of the score on the found T-Psi-C signal, signal retains if the computed score is >= ThresholdValue1 */ if(scoring(weight1,lsig1,maxtot1,ktot1,&score1,ThresholdValue1, ninv)) { #ifdef VERBOSE fprintf(fpverb,"+ TPC signal over thresh: %f\n",score1); #endif /* npair= number of base-pairings in the T-Psi-C stem lpair= number of nucleotides between the first position of the T-Psi-C arm and the last position of the T-Psi-C arm */ npair = 5; lpair = 16; /* Test for the presence of the T-Psi-C stem */ basepairing(ptr+1,npair,lpair,&ncomp); /* If the number of base-pairings (ncomp) equal 5, SG (score) incremented by 1. Change in the threshold of base-pairings in the T-Psi-C arm for the increment of SG, MODIFY the value '5' in the following line */ if (ncomp >= ps.tpc_incsg) { score++; #ifdef VERBOSE fprintf(fpverb,"+ TPC parings (%d) add to SG=%d\n",ncomp,score); } else { fprintf(fpverb,"X TPC parings (%d) NO add to SG=%d\n",ncomp,score); #endif } /* Change in the threshold to retain a stem as a potential T-Psi-C arm, MODIFY the value '4' in the following line */ #ifdef VERBOSE if (ncomp < ps.tpc_keep) { fprintf(fpverb,"X TPC parings retain loop: %d\n",ncomp); } else { fprintf(fpverb,"+ TPC parings retain loop: %d\n",ncomp); #else if (ncomp >= ps.tpc_keep) { #endif /* For the same found T-Psi-C region, different potential D region can be found because of the possible different lengthes of the D loop. As the search for the tRNA gene is applied for each potential D arm, the value of SG computed on the T-Psi-C region is stored in 'first_score' */ first_score=score; /*Change in the number of invariant bases to be present in the D signal, MODIFY the value of threshold_inv here */ threshold_inv= ps.d_inv; /* Search for the presence of a D region between -120 and -37 nucleotides upstream from the found T-Psi-C region. 37 nucleotides is the observed minimum length assuming no insertion in the D and variable loop. 
120 nucleotides allows for an intron of up to 60 nucleotides. Change in the length of the intron, MODIFY the value '120' in the definition of begin and ptrstart. ptr1= pointer to the first position of the D signal pos1= first position of the D signal begin= starting position for the search of the D signal ptrsart= pointer to the starting position for the search of the D signal */ /* If the sequence does not have 127 nucleotides upstream of the T-Psi-C signal, the search for the D signal starts at position 8 */ if (pos <=127) { begin = 8; ptrstart = sequence+7; } else { begin = pos-(MAX_INTRON_LEN+60); ptrstart = ptr-(MAX_INTRON_LEN+60); } for(pos1=begin,ptr1=ptrstart; pos1<=(pos-37);pos1++, ptr1++) { /* Search for the D signal */ #ifdef VERBOSE if(!readsignal(ptr1,table_inv2,&lsig2,&ktot2,weight2, table_cons2,&ninv,threshold_inv)) { fprintf(fpverb,"X D invariant bp: %d\n",ninv); } else { fprintf(fpverb,"+ D invariant bp: %d\n",ninv); #else if(readsignal(ptr1,table_inv2,&lsig2,&ktot2,weight2, table_cons2,&ninv,threshold_inv)) { #endif /* Number of invariant bases found in the windowed sequence equal to the number of invariant bases of the matrix (ktot2=3), then SG (score) is incremented by 1 Change in the increment of SG for the D signal, MODIFY the following line */ if (ninv >= ktot2) { score++; #ifdef VERBOSE fprintf(fpverb,"+ D invariant (%d) inc SG\n",ninv); } else { fprintf(fpverb,"X D invariant (%d) NO add to SG\n",ninv); #endif } /* Computation of the score on the found D signal, signal retains if the computed score is >= TresholdValue2 */ #ifdef VERBOSE if(!scoring(weight2,lsig2,maxtot2,ktot2,&score2, ThresholdValue2,ninv)) { fprintf(fpverb,"X D signal threshold: %f\n",score2); } else { fprintf(fpverb,"+ D signal threshold: %f\n",score2); #else if(scoring(weight2,lsig2,maxtot2,ktot2,&score2, ThresholdValue2,ninv)) { #endif /* ptr3= pointer pointing on the first position of the D arm npair= number of base-pairings in the D stem lpair= number of nucleotides 
between the first position of the D arm and the last one. As the D loop presents some variation in length, lpair can take different values: lpair=14 smallest length of the D loop lpair=18 greatest length of the D loop */ ptr3=ptr1+2; npair = 3; for (i=0; i < 5; i++) for (j=0; j < 2; j++) ntab[i][j]=0; for (lpair=14,h=0; lpair <=18; lpair++,h++) { /* Search for the presence of the D arm */ basepairing(ptr3,npair,lpair,&ncomp); /* For each potential stem found, the number of base-pairings (ncomp) and lpair are stored in ntab */ ntab[h][0]=ncomp; ntab[h][1]=lpair; } match2=0; for (h=0; h < 5;h++) { /* If stems with 3 base-pairings are found for the D arm, the following steps of the algorithm are applied on each of these potential stems. The 2 base-pairings stems are discarded */ if (ntab[h][0] == 3) { lpair=ntab[h][1]; match2=1; following_search(pos,pos1,ptr1,ptr3,lpair,nloop, sequence,seqlen,fpo,fpverb,name,score,match2,&ntrna,&npred,&match,sqoffset); } } /* If no stems with 3 base-pairings have been found, then the stems with 2 base-pairings are used in the following steps */ if (!match2) { for (h=0; h < 5;h++) { if (ntab[h][0] == 2) { lpair=ntab[h][1]; following_search(pos,pos1,ptr1,ptr3,lpair,nloop, sequence,seqlen,fpo,fpverb,name,score,match2,&ntrna,&npred,&match,sqoffset); } } } } score=first_score; } } } } } } /* The sequence is complemented and the algorithm is applied on the complementary strand */ compstrand(&sequence,seqlen); #ifdef VERBOSE if (nloop == 0) fprintf(fpverb,"\n== Trying COMPLEMENTARY strand\n"); #endif } free(sequence); /* free up mem to ready for next sequence */ } fprintf(fpo,"number of sequences= %d\n", nseq); fprintf(fpo,"number of bases tested (one strand)=%ld\n", lseq); lseq = 2* lseq; fprintf(fpo,"number of bases tested (both strands)= %ld\n", lseq); fprintf(fpo,"number of predicted tRNA=%d\n", npred); exit(0); } /* Subroutine that accomplishes the end of the test for the presence of a tRNA gene */ void following_search(long int pos, /* 
first position of the found T-Psi-C signal */ long int pos1, /* first position of the found D signal */ char *ptr1, /* pointer to the first position of the D signal */ char *ptr3, /* ptr3=ptr1+2 pointer to first position of the D arm */ int lpair, int nloop, /*nloop=0 test of the sequence, */ /*nloop=1 test of the complementary sequence */ char *sequence, long int seqlen, FILE *fpo, /*pointer to the output file */ FILE *fpverb, char *name, int score, /* Value of the general score SG */ int match2, /* integer testing the presence of D */ /* arm with 3 base-pairings */ int *ntrna, int *npred, int *match, long int sqoffset /* offset nucleotide numbering by this much (set with -i param) */ ) { extern Param_set_type ps; char chaine2[300]; /* character string containing the predicted tRNA gene sequence */ char anticodon[4]; /* character string containing the anticodon signal sequence */ char type_trna[4]; /* character string containing the tRNA gene family */ char *ptr2,*ptr4,*ptr5; /*pointers to the sequence, see their definition below*/ int score2, score1; /* variables for the general score SG */ int npair1=0; /* number of base-pairings in the aminoacyl arm */ int lpair1=0; /* number of nucleotides between the first position of the aminoacyl arm and the last one */ int lpair2=0; /* number of nucleotides between the first position of the anticodon arm and the last one */ int npair2=0; /* number of base-pairings in the anticodon arm */ int pos6=0; /* number of nucleotides found between the first position of the anticodon arm and the first position of the T-Psi-C signal. */ int pos4=0; /* number of nucleotides present in the variable loop.*/ int length1=3; /* length of the anticodon signal */ int num=0; /* variable that codes the anticodon signal sequence (comprised between 1 and 65). 
*/ int match1; /* match1=1 if a tRNA without intron has been predicted and 0 otherwise*/ int ncomp=0; /* number of base-pairings observed in the sequence tested for the presence of a given arm */ int i; #ifdef VERBOSE fprintf(fpverb,"IN following search...\n"); #endif score1=score; /* If match2=1, the found D arm present 3 base-pairings, the general score SG (score1) is incremented by 1 Change in the threshold of base-pairings in the D arm for the increment of SG, MODIFY the following line */ if (match2) { score1++; #ifdef VERBOSE fprintf(fpverb,"+ D arm found 3 bp\n"); } else { fprintf(fpverb,"X D arm found less than 3 bp\n"); #endif } npair1=7; /* Number of base-pairings in the aminoacyl arm */ lpair1=pos-pos1+8+23; /* Number of nucleotides between the first position of the aminoacyl arm and the last one */ ptr2=(ptr1-7); /* pointer to the first position of the amino acyl arm */ /* Test for the presence of the aminoacyl arm */ basepairing(ptr2,npair1,lpair1,&ncomp); /* If the number of base-pairings (ncomp) equal 7, the general score SG (score1) is incremented by 1 . 
Change in the threshold of base-pairings in the aminoacyl arm for the increment of SG, MODIFY the value '7' in the following line */ if (ncomp >= ps.aa_incsg) { score1++; #ifdef VERBOSE fprintf(fpverb,"+ AA arm found %d base pairings\n",ncomp); } else { fprintf(fpverb,"X AA arm found %d base pairings\n",ncomp); #endif } /* Change in the threshold to retain a stem as a potential aminoacyl arm MODIFY the value '6' in the following line */ if (ncomp >= ps.aa_keep) { i = 0; while (i< 300) { chaine2[i]='\0'; i++; } /* If the general score SG is >= 4, the algorithm looks for the presence of an anticodon stem, otherwise the algorithm is initiated again on the following windowed sequence */ if (score1 >= ps.look_for_acloop_sg) { #ifdef VERBOSE fprintf(fpverb,"Looking for anticodon stem\n"); #endif ptr4=ptr3+lpair+2; /* pointer to the first position of the anticodon arm */ match1=0; npair2=5; /* Number of base-pairings in the anticodon arm */ lpair2=16; /* Number of nucleotides between the first position of the anticodon arm and the last position */ /* Test for the presence of an anticodon arm */ basepairing(ptr4,npair2,lpair2,&ncomp); /* If 4 or 5 base-pairings are observed, the windowed sequence is retained as a potential anticodon arm */ if (ncomp >= ps.acloop_min) { if (nloop == 0) *match=1; i=0; while (i< 4) { anticodon[i]='\0'; type_trna[i]='\0'; i++; } /* Test of the presence of the residue T at the position preceding the anticodon signal. If the residue T is present, the general score is incremented by 1.*/ if ((*(ptr4+6)) == 't') { score1++; #ifdef VERBOSE fprintf(fpverb,"+ Invariant T found in anticodon. SG++\n"); } else { fprintf(fpverb,"X Invariant T NOT found in anticodon. No SG inc\n"); #endif } /* If SG >= 5, the windowed sequence is retained as a potential tRNA gene. 
The tRNA gene predicted at that level is without intron */ #ifdef VERBOSE if (score1 < ps.sg_cutoff) { fprintf(fpverb,"X Under SG threshold: %d\n",score1); } else { fprintf(fpverb,"+ Over SG threshold: %d\n",score1); #else if (score1 >= ps.sg_cutoff) { #endif /* Identification of the tRNA gene family */ strncpy(anticodon,ptr4+7,3); codage(anticodon,length1,&num); corresaa(num,type_trna); strncpy(chaine2,(ptr1-7),lpair1+2); (*ntrna)++; (*npred)++; match1=1; /* match1=1 indicates that a tRNA gene without intron has been predicted */ /* The results are printed in the output file */ printresult(fpo,fpverb,name,pos1,pos,lpair,lpair1,lpair2,nloop,ntrna, chaine2,sequence,seqlen,match,ncomp,type_trna,anticodon,sqoffset); } } /* If no anticodon arm has been found for a value of lpair2=16, i. e. an anticodon loop of 7 nucleotides (without intron), the algorithm searchs for the presence of an anticodon arm assuming the presence of an intron in the anticodon loop */ /* pos6 corresponds to the number of nucleotides found between the first position of the anticodon arm and the first position of the T-Psi-C signal. */ #ifdef VERBOSE fprintf(fpverb,"+ Looking for anticodon stem WITH intron...\n"); #endif pos6=pos-(pos1+lpair+3); /* The search for the presence of an anticodon stem is initiated only if an intron of at least 8 bases long is expected to be present. The value 28 for pos6 corresponds to the number of nucleotides found for the smallest length of the variable loop and for an intron length of 8 nucleotides. Change in the smallest length of the intron, MODIFY the value of pos6 in the following line and also the start value of lpair2 (lpair2=24) in the loop (lpair2 is the number of nucleotides found between the first position of the anticodon arm and the last position of the anticodon arm. lpair2=16 when no intron in the anticodon loop). */ if ((!match1) && (pos6 >=MIN_VAR_LOOP)) { for (lpair2=MIN_VAR_LOOP-4;lpair2= 3. 
*/ pos4=pos-pos1-lpair-lpair2-4; /* ptr5 = pointer to the position preceding the intron. */ ptr5=ptr4+10; /* The anticodon arm is retained if 4 or 5 base-pairings are formed and if pos4 >= 3 */ if ((ncomp >=4) && (pos4 >= 3)) { /* Test for the presence of the residues A or G at the position preceding the intron */ if(((*ptr5) == 'a') || ((*ptr5) == 'g')) { if (nloop == 0) (*match)=1; i=0; while (i<4) { anticodon[i]='\0'; type_trna[i]='\0'; i++; } /* Test for the presence of the residue T at the position preceding the anticodon signal. If the residue T is present, the general score is incremented by 1.*/ if ((*(ptr4+6)) == 't') { score2++; #ifdef VERBOSE fprintf(fpverb,"+ Found invariant T. SG= %d\n",score2); } else { fprintf(fpverb,"X Did NOT find invariant T. SG= %d\n",score2); #endif } /* If SG >= 5, the windowed sequence is retained as a potential tRNA gene. The tRNA gene predicted at that level has one intron in the anticodon loop. */ if (score2 >= ps.sg_cutoff) { /* Identification of the tRNA gene family */ strncpy(anticodon,ptr4+7,3); codage(anticodon,length1,&num); corresaa(num, type_trna); strncpy(chaine2,(ptr1-7),lpair1+2); (*ntrna)++; (*npred)++; /* The results are printed in the output file */ printresult(fpo,fpverb,name,pos1,pos,lpair,lpair1,lpair2,nloop, ntrna,chaine2,sequence,seqlen,match,ncomp,type_trna,anticodon,sqoffset); } } } } } } } } /* Subroutine to read the consensus matrix */ void lectval(FILE *fp, /* pointer to the consensus matrix file */ float (*table_cons)[4], /* table containing the frequency of each base at each position of the signal */ int (*table_inv)[2], /* table containing the position and the nature of the invariant bases found in the signal. Code for the bases: A=0, C=1, G=2 and T=3. 
*/ int *lsig, /*lsig=length of the signal */ int *ktot, /* ktot= number of invariant bases */ float *maxtot /* maxtot= sum of the maximum frequencies */ ) { int i=0,j, k=0,l,m; float max=0; int ret = 0; for (l=0; l< 30; l++) for (m=0; m<4; m++) table_cons[l][m]=0; for (l=0; l< 30; l++) for (m=0; m < 2; m++) table_inv[l][m]=0; while(feof(fp) == 0) { for(j=0;j<4;j++) { ret = fscanf(fp,"%f",&table_cons[i][j]); } i++; } *lsig=i-1; for (i=0; i<*lsig; i++) { for (j=0; j<4; j++) { if (table_cons[i][j] == 1.0) { k++; table_inv[k][0]=i; table_inv[k][1]=j; } } } for (i=0; i<*lsig; i++) { max=table_cons[i][0]; for (j=1; j<4; j++) max= MAX(max,table_cons[i][j]); *maxtot += max; } *ktot=k; } /* Subroutine reading the sequence */ /* Modified to correctly read FASTA sequence files */ int fgetseq(char *name, /* string w/name of the sequence */ char *sequence, /* character string containing the sequence */ long int *seqlen, /* length of sequence */ FILE *fpi) /* input file pointer */ { char line[MAXLINE]; /* character string used to read a line */ char *ptr; /* pointer to the sequence */ long int i,j,c; char *ptrRet; line[0]='\0'; *sequence='\0'; *seqlen = 0; /* Test the first character to choose between the two formats available */ if (fgets(line, MAXLINE, fpi) == NULL) { return 0; } else if (line[0] == ';') { /* File in the non-GenBank format */ if (line[1] != ' ') { for (i=0, ptr= &(line[1]); *ptr != ' ' && *ptr !='\n';i++) name[i]= *ptr++; name[i] = '\0'; while ((c = getc(fpi)) == ';') ptrRet = fgets(line, MAXLINE, fpi); ungetc(c, fpi); ptr = sequence; while ((c= getc(fpi)) != ';' && c != EOF) if (isalpha(c)) { *ptr = tolower(c); ptr++; (*seqlen)++; } if (c != EOF) ungetc(c, fpi); *ptr= '\0'; } else if (line[1] == ' ') { /* Intelligenetics format */ while ((c = getc(fpi)) == ';') ptrRet = fgets(line,MAXLINE, fpi); ungetc(c, fpi); ptrRet = fgets(line, MAXLINE, fpi); for (i=0, ptr= &(line[0]); *ptr != ' ' && *ptr !='\n';i++) name[i]= *ptr++; name[i] = '\0'; ptr = sequence; while 
((c= getc(fpi)) != ';' && c != EOF) if (isalpha(c)){ *ptr = tolower(c); ptr++; (*seqlen)++; } if (c != EOF) ungetc(c, fpi); *ptr= '\0'; } } else if (line[0] == '>') { /* Fasta Format */ for (i=1; line[i] == ' '; i++) ; for (j = 0, ptr = &line[i]; *ptr != ' '; ptr++, j++) name[j] = *ptr; name[j] = '\0'; ptr = sequence; while ((c= getc(fpi)) != '>' && c != EOF) if (isalpha(c)) { *ptr = tolower(c); *seqlen += 1; ptr++; } if (c != EOF) ungetc(c, fpi); *ptr= '\0'; } else { /* File in GenBank format */ while (strncmp(line, "LOCUS", 5) != 0) if (fgets(line, MAXLINE, fpi) == NULL) exit(1); for (i = 0, ptr = &line[12]; *ptr != ' '; ptr++, i++) name[i] = *ptr; name[i] = '\0'; while (strncmp(line, "ORIGIN", 6) != 0) if (fgets(line, MAXLINE, fpi) == NULL) exit(1); ptr = sequence; *seqlen=0; ptrRet = fgets(line, MAXLINE, fpi); while (strncmp(line, "//", 2) != 0) { for (i = 0; line[i] != '\n'; i++) if (isalpha(line[i])) { *ptr++ = tolower(line[i]); (*seqlen)++; } ptrRet = fgets(line, MAXLINE, fpi); } *ptr = '\0'; } return (1); } /* Subroutine reading & returning the sequence length */ int getseqsize(FILE *fpi /* input file pointer */ ) { char line[MAXLINE]; /* character string used to read a line */ long int i,c, seqlen, fpi_save_pos; char* ptrRet; line[0]='\0'; seqlen = 0; fpi_save_pos = ftell(fpi); /* save current position in file */ /* Test the first character to choose between the two formats available */ if (fgets(line, MAXLINE, fpi) == NULL) { return 0; } else if (line[0] == ';') { /* File in the non-GenBank format */ if (line[1] != ' ') { while ((c = getc(fpi)) == ';') ptrRet = fgets(line, MAXLINE, fpi); ungetc(c, fpi); while ((c= getc(fpi)) != ';' && c != EOF) if (isalpha(c)) { seqlen++; } } else if (line[1] == ' ') { /* Intelligenetics format */ while ((c = getc(fpi)) == ';') ptrRet = fgets(line,MAXLINE, fpi); ungetc(c, fpi); ptrRet = fgets(line, MAXLINE, fpi); while ((c= getc(fpi)) != ';' && c != EOF) if (isalpha(c)){ seqlen++; } } } else if (line[0] == '>') { /* Fasta 
Format */ while ((c= getc(fpi)) != '>' && c != EOF) if (isalpha(c)) seqlen++; } else { /* File in GenBank format */ while (strncmp(line, "LOCUS", 5) != 0) if (fgets(line, MAXLINE, fpi) == NULL) exit(1); while (strncmp(line, "ORIGIN", 6) != 0) if (fgets(line, MAXLINE, fpi) == NULL) exit(1); ptrRet = fgets(line, MAXLINE, fpi); while (strncmp(line, "//", 2) != 0) { for (i = 0; line[i] != '\n'; i++) if (isalpha(line[i])) { seqlen++; } ptrRet = fgets(line, MAXLINE, fpi); } } fseek(fpi,fpi_save_pos,0); /* reposition file pointer */ return seqlen; } /* Subroutine that returns the position in the string s where the string t begins or -1 if s does not contain t */ /* Calls to this function eliminated for efficiency T. Lowe 11/95 */ int myindex (char *s, char *t) { int i, j, k; for (i=0; s[i] != '\0'; i++) { for (j=i, k=0; t[k] != '\0' && s[j] == t[k]; j++, k++) ; if (t[k] == '\0') return(i); } return(-1); } /* Subroutine looking for the presence of a given signal, returns 1 if a signal is found and 0 otherwise. 
It also return the table 'weight' containing the frequencies of the oberved bases in the windowed sequence and the number 'ninv' of invariant bases found in the windowed sequence*/ int readsignal(char *ptr, /* pointer to the sequence */ int (*table_inv)[2], /* table containing the position and nature of the invariant bases found in the consensus matrix */ int *lsig, /* lsig= length of the signal */ int *ktot, /* ktot= number of invariant bases in the consensus matrix */ float *weight, /* table containing the frequencies of the observed base at each position of the windowed sequence tested */ float (*table_cons)[4], /* table corresponding to the consensus matrix */ int *ninv, /* ninv= number of invariant */ /* bases in the windowed sequence */ int threshold_inv) /* Number of invariant bases */ /* allowed not to b e invariant */ { extern Param_set_type ps; int k = 1, i=0, j,match1; int l; int tab[30]; (*ninv)=0; for (l=0; l< 30; l++) { weight[l]=0; tab[l]=0; } /* If the consensus matrix contains some invariant bases, the subroutine calculates the number of invariant bases that are found in the windowed sequence (ninv) */ if (*ktot) { for (k=1; k <= (*ktot); k++) { /* (original code commented out) temp[0]= *(ptr+table_inv[k][0]); j = myindex(base,temp); */ switch (*(ptr+table_inv[k][0])) { case 'a': j=0; break; case 'c': j=1; break; case 'g': j=2; break; case 't': j=3; break; default: j=-1; } /* trap for non-ATGC chars, assume match for ambiguous bases (j= -1)*/ #ifdef NO_AMBIG if (j == table_inv[k][1]) #else if ((j == table_inv[k][1]) || (j == -1)) #endif (*ninv)++; } } /* If the number of invariant bases found in the windowed sequence is < to the threshold allowed, the windowed sequence is discarded as a potential signal. 
*/ if ((*ninv) < (*ktot)-threshold_inv) return(0); /* If the number of invariant bases is >= to the threshold, the table 'weight' is constructed */ if((*ninv) >= (*ktot)-threshold_inv) { match1=1; while(match1 && (i<*lsig)) { /* (original code commented out) temp[0]= *(ptr+i); tab[i]= myindex(base,temp); */ switch (*(ptr+i)) { case 'a': tab[i]=0; break; case 'c': tab[i]=1; break; case 'g': tab[i]=2; break; case 't': tab[i]=3; break; default: tab[i]=-1; } /* trap for non-ATGC chars, assume match for ambig bases (tab[i]= -1) */ if (tab[i] == -1) #ifdef NO_AMBIG weight[i] = 0; #else weight[i] = 1; #endif else weight[i]=table_cons[i][tab[i]]; if((ktot == 0) && (weight[i] == 0)) { match1=0; return(0); } else { i++; } } } return(1); } /* Subroutine that calculates the similarity score on the potential signal previously retained by the subroutine readsignal. This subroutine returns 1 it the computed score is greater or equal to the defined threshold and 0 otherwise. It returns also the value of the computed score (score) */ int scoring(float *weight, /* table containing the frequencies of the observed base at each position of the potential signal */ int lsig, /* length of the signal */ float max, /* sum of the maximum frequencies found in */ /* the consensus matrix */ int ktot, /* number of invariant bases found in the */ /* consensus matrix */ float *score, /* value of the computed score on the */ /* potential signal */ float ThresholdValue, /* defined threshold for the */ /* similarity score */ int ninv /* number of invariant bases found */ /* in the potential signal */ ) { float tot; int i; /* Computation of the value of the score on the potential signal */ tot=0; for(i=0; i< lsig; i++) tot += weight[i]; tot -= ninv; max -= ktot; *score = tot / max; /* Comparison of the computed score with the defined threshold value */ if (*score >= ThresholdValue) return(1); else { return(0); } } /* Subroutine looking for base-pairings between two parts of the sequence. 
It returns the number of base-pairings found (ncomp) */ /* rewritten to improve efficiency & eliminate use of myindex() calls */ void basepairing(char *ptr, /* pointer to the sequence */ int npair, /* number of base-pairings forming a given arm */ int lpair, /* number of nucleotides found between the first position of the first part of the sequence involved in the stem and the last position of the second part of the sequence involved in the stem */ int *ncomp) /* number of base-pairings observed between the two parts of the sequence tested */ { int n; /* loop counter */ *ncomp=0; #ifdef NO_AMBIG for(n=0; n=1;i--) { switch (anticodon[i-1]) { case 'a': j=0; break; case 'c': j=1; break; case 'g': j=2; break; case 't': j=3; break; default: j=-1; } if (j == -1) { match=1; i=1; } *num=(*num)+j*iba; iba=4*iba; } if (match) *num=65; } /* Subroutine that determines the tRNA gene family */ void corresaa(int num, /* Number coding the anticodon signal sequence */ char *type_trna /* tRNA gene family */ ) { /*The table 'amino_acid' gives the correspondence between the number (num) coding the anticodon signal and the amino acid that is added to the protein by the tRNA. For example, if the anticodon is AAA then num=1 and the table will associate the amino acid Phe to that anticodon. The tRNA gene in that case will be a Phe tRNA gene. 
*/ static char *amino_acid[]= {"Phe","Val","Leu","Ile","Cys","Trp","Arg","Ser", "Ser","Ala","Pro","Thr","Tyr","Asp","His","Asn", "Leu","Val","Leu","Met","Trp","Gly","Arg","Arg", "Ser","Ala","Pro","Thr","Sup","Glu","Gln","Lys", "Phe","Val","Leu","Ile","Cys","Gly","Arg","Ser", "Ser","Ala","Pro","Thr","Tyr","Asp","His","Asn", "Leu","Val","Leu","Ile","Sup","Gly","Arg","Arg", "Ser","Ala","Pro","Thr","Sup","Glu","Gln","Lys", "Ind"}; strncpy(type_trna,amino_acid[num-1],3); } /* Subroutine that prints the results of the search */ void printresult(FILE *fpo, /* output file pointer */ FILE *fpverb, /* character string for the name of the sequence */ char *name, long int pos1, /* first position of the D signal */ long int pos, /* first position of the T-Psi-C signal */ int lpair, /* number of nucleotides between the */ /* first position of the D arm and the last one */ int lpair1, /* number of nucleotides between the first position of the aminoacyl arm and the last one */ int lpair2, /* number of nucleotides between the first position of the anticodon arm and the last one */ int nloop, /* nloop=0, scanning of the direct strand; nloop=1 scanning of the complementary strand */ int *ntrna, /* number of tRNA genes predicted in the sequence */ char *chaine2, /* character string for the predicted tRNA gene sequence */ char *sequence, /* character string containing the sequence tested */ long int length, /* length of the sequence */ int *match, /* match=1 if at least one tRNA gene has been found on the direct strand and 0 otherwise */ int ncomp, /* number of base-pairings in the anticodon arm of the predicted tRNA gene */ char *type_trna, /* character string for the tRNA gene family */ char *anticodon, /* character string for the anticodon signal sequence */ long int sqoffset /* offset nucleotide numbering by this much (set with -i param) */ ) { long int pos2; /* first position of the predicted tRNA gene when it is found on the complementary strand */ long int posstart; /* first 
position of the intron */ long int posend; /* last position of the intron */ /* If lpair2 > 16, treatment of the tRNA gene with one intron */ /* Results for the tRNA genes predicted on the direct strand */ #ifdef VERBOSE fprintf(fpverb,"*** tRNA found\n\n"); #endif if ((nloop) == 0) { if((*ntrna) == 1) fprintf(fpo,"sequence name= %s\n", name); fprintf(fpo,"start position= %ld end position= %ld\n",pos1-7+sqoffset,pos1-6+lpair1+sqoffset); fprintf(fpo,"potential tRNA sequence= %s\n",chaine2); fprintf(fpo,"D signal= %ld %ld TpsyC signal= %ld %ld\n", pos1+sqoffset,pos1+7+sqoffset, pos+sqoffset, pos+14+sqoffset); fprintf(fpo,"amino-acyl stem= %ld-%ld;%ld-%ld\n",pos1-7+sqoffset,pos1-1+sqoffset,pos1-13+lpair1+sqoffset, pos1-7+lpair1+sqoffset); fprintf(fpo,"D stem= %ld-%ld;%ld-%ld\n",pos1+2+sqoffset,pos1+4+sqoffset,pos1+lpair+sqoffset, pos1+lpair+2+sqoffset); if(lpair2 > 16) { fprintf(fpo,"anticodon stem= %ld-%ld;%ld-%ld\n",pos1+lpair+4+sqoffset,pos1+lpair+8+sqoffset, pos1+lpair+lpair2+sqoffset,pos1+lpair+lpair2+4+sqoffset); } else { fprintf(fpo,"anticodon stem= %ld-%ld;%ld-%ld\n",pos1+lpair+4+sqoffset,pos1+lpair+8+sqoffset, pos1+lpair+16+sqoffset,pos1+lpair+20+sqoffset); } fprintf(fpo,"TpsyC stem= %ld-%ld;%ld-%ld\n",pos+1+sqoffset,pos+5+sqoffset,pos+13+sqoffset,pos+17+sqoffset); if (strcmp(type_trna,"Ind") != 0) { fprintf(fpo,"tRNA predict as a tRNA- %s : anticodon %s\n", type_trna, anticodon); } else { fprintf(fpo,"anticodon includes unknown bases\n"); } if (lpair2 > 16) { posstart=pos1+lpair+15; posend= pos1+lpair+lpair2-2; fprintf(fpo,"potential intron between positions %ld %ld\n",posstart+sqoffset, posend+sqoffset); } fprintf(fpo,"number of base pairing in the anticodon stem= %d\n",ncomp); fprintf(fpo,"\n"); } else { /* Results on the complementary strand */ /* If no tRNA gene is predicted on the direct strand, then for the first tRNA gene predicted on the complementary strand the two following lines are printed in the output file */ if( (!(*match)) && ((*ntrna) == 1)) 
{ fprintf(fpo,"sequence name= %s\n", name); fprintf(fpo,"complementary strand\n"); } /* If tRNA genes are predicted on the direct strand, then for the first tRNA gene predicted on the complemetary strand the following line is printed in the output file */ else if ((*ntrna) == 1) { fprintf(fpo,"complementary strand\n"); } pos2= length-pos1+8; fprintf(fpo,"start position= %ld end position= %ld\n",pos2+sqoffset,pos2-lpair1-1+sqoffset); fprintf(fpo,"potential tRNA sequence= %s\n",chaine2); fprintf(fpo,"D signal= %ld %ld TpsyC signal= %ld %ld\n",length-pos1+1+sqoffset, length-pos1-6+sqoffset,length-pos+1+sqoffset,length-pos-13+sqoffset); fprintf(fpo,"amino-acyl stem= %ld-%ld;%ld-%ld\n",pos2+sqoffset,pos2-6+sqoffset, pos2-lpair1+6+sqoffset, pos2-lpair1+sqoffset); fprintf(fpo,"D stem= %ld-%ld;%ld-%ld\n",length-pos1-1+sqoffset,length-pos1-3+sqoffset, length-pos1-lpair+1+sqoffset,length-pos1-lpair-1+sqoffset); if (lpair2 > 16) { posstart=pos1+lpair+15; posend=pos1+lpair+lpair2-2; fprintf(fpo,"anticodon stem= %ld-%ld;%ld-%ld\n",length-pos1-lpair-3+sqoffset, length-pos1-lpair-7+sqoffset,length-posend-1+sqoffset,length-posend-5+sqoffset); } else { fprintf(fpo,"anticodon stem= %ld-%ld;%ld-%ld\n",length-pos1-lpair-3+sqoffset, length-pos1-lpair-7+sqoffset,length-pos1-lpair-lpair2+1+sqoffset, length-pos1-lpair-lpair2-3+sqoffset); } fprintf(fpo,"TpsyC stem= %ld-%ld;%ld-%ld\n",length-pos+sqoffset,length-pos-4+sqoffset, length-pos-12+sqoffset,length-pos-16+sqoffset); if (strcmp(type_trna,"Ind") != 0) { fprintf(fpo,"tRNA predict as a tRNA- %s : anticodon %s\n", type_trna, anticodon); } else { fprintf(fpo,"anticodon includes unknown bases\n"); } if (lpair2 > 16) { posstart=pos1+lpair+15; posend=pos1+lpair+lpair2-2; fprintf(fpo,"potential intron between positions %ld %ld\n", length-posstart+1+sqoffset ,length-posend+1+sqoffset); } fprintf(fpo,"number of base pairing in the anticodon stem=%d\n",ncomp); fprintf(fpo,"\n"); } } void set_search_params (Param_set_type *ps, int params) { if 
(params == 1) { ps->tpc_sig_thresh = ST_TPC_SIG_THRESH; ps->d_sig_thresh = ST_D_SIG_THRESH; ps->sg_cutoff = ST_SG_CUTOFF; ps->tpc_inv = ST_TPC_INV; ps->tpc_incsg = ST_TPC_INCSG; ps->tpc_keep = ST_TPC_KEEP; ps->d_inv = ST_D_INV; ps->look_for_acloop_sg = ST_LOOK_FOR_ACLOOP_SG; ps->acloop_min = ST_ACLOOP_MIN; ps->aa_incsg = ST_AA_INCSG; ps->aa_keep = ST_AA_KEEP; } else if (params == 2) { ps->tpc_sig_thresh = RX_TPC_SIG_THRESH; ps->d_sig_thresh = RX_D_SIG_THRESH; ps->sg_cutoff = RX_SG_CUTOFF; ps->tpc_inv = RX_TPC_INV; ps->tpc_incsg = RX_TPC_INCSG; ps->tpc_keep = RX_TPC_KEEP; ps->d_inv = RX_D_INV; ps->look_for_acloop_sg = RX_LOOK_FOR_ACLOOP_SG; ps->acloop_min = RX_ACLOOP_MIN; ps->aa_incsg = RX_AA_INCSG; ps->aa_keep = RX_AA_KEEP; } else if (params == 3) { ps->tpc_sig_thresh = ALT_TPC_SIG_THRESH; ps->d_sig_thresh = ALT_D_SIG_THRESH; ps->sg_cutoff = ALT_SG_CUTOFF; ps->tpc_inv = ALT_TPC_INV; ps->tpc_incsg = ALT_TPC_INCSG; ps->tpc_keep = ALT_TPC_KEEP; ps->d_inv = ALT_D_INV; ps->look_for_acloop_sg = ALT_LOOK_FOR_ACLOOP_SG; ps->acloop_min = ALT_ACLOOP_MIN; ps->aa_incsg = ALT_AA_INCSG; ps->aa_keep = ALT_AA_KEEP; } else { fprintf (stderr,"tRNAscan1.4: FATAL: Unable to select search parameter set.\n"); exit(1); } } tRNAscan-SE-2.0/src/build_main.c0000644000543100007160000002771711021467304015704 0ustar pchanlowelab/* build_main.c * SRE, Mon Sep 6 09:18:35 1993 * * coveb - construct a covariance model from aligned sequences */ #include #include #include #include #include #ifdef NEED_GETOPTH #include #endif #include "structs.h" #include "funcs.h" #include "squid.h" #include "version.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif #define OPTIONS "ap:g:ho:P:" static void mtr2rf(struct trace_s *mtr, int alen, char **ret_rf); static char usage[] = "\ Usage: coveb [-options] \n\ where options are:\n\ -a : annotate all pairs, not just canonical Watson-Crick\n\ -g : -P1 or -P2 only - columns above this fractional gap \n\ occurrence are assigned to INS (default 0.7)\n\ -h : print out 
short help info\n\ -o : save structure-annotated alignment to \n\ -p : use prior probability info from \n\ Construction plans:\n\ (default) : Maximum likelihood (slow)\n\ -P1 : fast heuristic (MIXY based)\n\ -P2 : use specified consensus structure (CS line)\n\ -P3 : use both specified consensus and reference info\n"; static char banner[] = "coveb: construct covariance model from aligned sequences"; int main(int argc, char **argv) { char **aseqs; /* training sequences */ AINFO ainfo; /* misc. associated alignment info */ int nseq; /* number of seqs */ char *seqfile; /* sequence file */ int format; /* format of sequence file */ char *cmfile; /* OUTPUT: saved cm */ FILE *cmfp; /* OUTPUT: fp to cmfile */ struct cm_s *cm; /* model */ struct prior_s *prior; /* prior prob. distributions */ int idx; /* index for sequences */ double secinfo; /* secondary structure info content */ struct trace_s *mtr; /* master traceback for alignment */ struct trace_s *tr; /* a traceback for indiv seq */ struct trmem_s *pool; /* memory pool for traceback */ char **ss; /* secondary structures */ double worstscore; double bestscore; double sqsum; double tot_score; double score; int leftcount, rightcount; int apos; enum plan_e { PLAN_ML, PLAN_MIXY, PLAN_CS, PLAN_CSRF } plan; char *prifile; /* file to get prior from */ FILE *prifp; /* open priorfile */ char *structfile; /* file to save structure-annotation to */ FILE *structfp; /* open structfile */ double gapthresh; /* heuristic INS assignment parameter */ int watsoncrick; /* TRUE to annotate canonical pairs only */ int optc; /* for getopt() */ extern char *optarg; /* for getopt() */ extern int optind; /* for getopt() */ #ifdef MEMDEBUG /* for Cahill's dbmalloc */ unsigned long histid1, histid2, orig_size, current_size; #endif /*********************************************** * Parse command line ***********************************************/ prifile = NULL; /* forces use of default prior in prior.h */ gapthresh = .70; structfile = NULL; 
watsoncrick = TRUE; plan = PLAN_ML; while ((optc = getopt(argc, argv, OPTIONS)) != -1) switch (optc) { case 'a': watsoncrick = FALSE; break; case 'g': gapthresh = atof(optarg); break; case 'o': structfile = optarg; break; case 'p': prifile = optarg; break; case 'P': switch (*optarg) { case '1': plan = PLAN_MIXY; break; case '2': plan = PLAN_CS; break; case '3': plan = PLAN_CSRF; break; default: Die("No such construction plan.\n%s", usage); } break; case 'h': printf("%s\n version %s (%s)\n%s", banner, RELEASE, RELEASEDATE, usage); exit(0); default: Die("Error: unrecognized option %c\n", optc); } if (argc - optind != 2) Die("Wrong number of command line arguments.\n%s\n", usage); cmfile = argv[argc-2]; seqfile = argv[argc-1]; #ifdef MEMDEBUG orig_size = malloc_size(&histid1); #endif /*********************************************** * Get sequence data and prior ***********************************************/ if (! SeqfileFormat(seqfile, &format, NULL)) Die("Can't determine format of file %s", seqfile); /* read the training seqs from file */ if (! ReadAlignment(seqfile, format, &aseqs, &nseq, &ainfo)) Die("Failed to read aligned sequence file %s", seqfile); /* convert seq to all upper case */ for (idx = 0; idx < nseq; idx++) s2upper(aseqs[idx]); if (prifile == NULL) { if (! DefaultPrior(&prior)) Die("Failed to copy prior probability distribution information"); } else { if ((prifp = fopen(prifile, "r")) == NULL) Die("Failed to open prior probability info file %s", prifile); if (! 
ReadPrior(prifp, &prior)) Die("Failed to read prior probability info file %s", prifile); fclose(prifp); } /*********************************************** * Print banner ***********************************************/ puts(banner); printf(" release %s, %s\n\n", RELEASE, RELEASEDATE); printf("---------------------------------------------------\n"); printf("Training alignment: %s (%d sequences)\n", seqfile, nseq); printf("Prior distributions: "); if (prifile == NULL) printf("default (plus-one, Laplace)\n"); else printf("from file %s\n", prifile); printf("Construction plan: "); switch (plan) { case PLAN_ML: printf("maximum likelihood\n"); break; case PLAN_MIXY: printf("fast (MIXY-based) heuristic\n"); break; case PLAN_CS: printf("specified consensus structure\n"); break; case PLAN_CSRF: printf("specified structure and match columns\n"); break; } if (plan == PLAN_MIXY || plan == PLAN_CS) printf("INS if gap freq >: %.2f\n", gapthresh); printf("---------------------------------------------------\n"); /*********************************************** * Create the starting model ***********************************************/ #ifdef MEMDEBUG printf("Checking malloc chain "); malloc_chain_check(0); printf("... done.\n"); #endif switch (plan) { case PLAN_ML: if (! Maxmodelmaker(aseqs, &ainfo, nseq, -1, prior, &secinfo, &cm, &mtr)) Die("Maxmodelmaker failed to create starting model from alignment"); break; case PLAN_MIXY: if (! Fastmodelmaker(aseqs, &ainfo, nseq, prior, gapthresh, &secinfo, &cm, &mtr)) Die("Fastmodelmaker failed to create starting model from alignment"); break; case PLAN_CS: EasyModelmaker(aseqs, &ainfo, nseq, prior, gapthresh, FALSE, &cm, &mtr); break; case PLAN_CSRF: EasyModelmaker(aseqs, &ainfo, nseq, prior, gapthresh, TRUE, &cm, &mtr); break; default: Die("That must be a secret plan, pal, because I've never heard of it."); } if (! VerifyCM(cm)) Die("Bad covariance model. 
Bad, bad covariance model...\n"); /* Use master traceback to reconstruct individual traces. * Use individual traces to a) find individual secondary structures * b) calculate avg, high, low scores * Some duplication of effort here, because the modelmakers * already had to construct tracebacks and threw them away. */ if ((ss = (char **) malloc (sizeof(char *) * nseq)) == NULL) Die("malloc failed"); worstscore = HUGE_VAL; bestscore = -HUGE_VAL; tot_score = sqsum = 0.0; for (idx = 0; idx < nseq; idx++) { Transmogrify(mtr, aseqs[idx], &tr, &pool); score = TraceScore(cm, aseqs[idx], tr); tot_score += score; sqsum += score * score; if (score > bestscore) bestscore = score; if (score < worstscore) worstscore = score; if (ainfo.sqinfo[idx].flags & SQINFO_SS) free(ainfo.sqinfo[idx].ss); Trace2KHS(tr, aseqs[idx], ainfo.alen, watsoncrick, &(ss[idx])); MakeDealignedString(aseqs[idx], ainfo.alen, ss[idx], &(ainfo.sqinfo[idx].ss)); ainfo.sqinfo[idx].flags |= SQINFO_SS; FreeTrace(tr, pool); } /* Construct a consensus structure string and reference * line as annotation. * Secondary structure strings, ss, are currently aligned to * the aseqs. Calculate an aligned consensus structure from * them. */ if (plan == PLAN_MIXY || plan == PLAN_ML) { if (ainfo.flags & AINFO_CS) free(ainfo.cs); if ((ainfo.cs = (char *) malloc (sizeof(char) * (ainfo.alen+1))) == NULL) Die("malloc failed"); for (apos = 0; apos < ainfo.alen; apos++) { leftcount = rightcount = 0; for (idx = 0; idx < nseq; idx++) if (ss[idx][apos] == '<') rightcount++; else if (ss[idx][apos] == '>') leftcount++; if (rightcount >= nseq / 2) ainfo.cs[apos] = '<'; else if (leftcount >= nseq / 2) ainfo.cs[apos] = '>'; else ainfo.cs[apos] = '.'; } ainfo.cs[ainfo.alen] = '\0'; ainfo.flags |= AINFO_CS; } /* Construct a reference line to indicate which columns were assigned * as match states. 
*/ if (plan != PLAN_CSRF) { if (ainfo.flags & AINFO_RF) free(ainfo.rf); mtr2rf(mtr, ainfo.alen, &(ainfo.rf)); ainfo.flags |= AINFO_RF; } if ((cmfp = fopen(cmfile, "w")) == NULL) Die("Failed to open %s for writing", cmfile); if (! WriteCM(cmfp, cm)) Die("Failed to save the model to %s", cmfile); fclose(cmfp); if (structfile != NULL) { if ((structfp = fopen(structfile, "w")) == NULL) Die("Failed to open structure annotation alignment file %s", structfile); if (! WriteSELEX(structfp, aseqs, nseq, &ainfo, 60)) Die("Failed to write annotated alignment to %s", structfile); fclose(structfp); printf("Structure annotated alignment file written to %s\n", structfile); } printf("Constructed a covariance model (%d nodes)\n", cm->nodes); printf("Average score: %10.2f bits\n", tot_score / (double) nseq); printf("Minimum score: %10.2f bits\n", worstscore); printf("Maximum score: %10.2f bits\n", bestscore); printf("Std. deviation: %10.2f bits\n", sqrt((sqsum - (tot_score * tot_score / (double) nseq)) / ((double) nseq - 1.0))); printf("\nCM written to file %s\n", cmfile); FreeCM(cm); FreeTrace(mtr, NULL); free(prior); FreeAlignment(aseqs, nseq, &ainfo); Free2DArray(ss, nseq); #ifdef MEMDEBUG current_size = malloc_size(&histid2); if (current_size != orig_size) malloc_list(2, histid1, histid2); else fprintf(stderr, "No memory leaks, sir.\n"); #endif return 0; } /* Function: mtr2rf() * * Purpose: Make an #=RF line from a master traceback, to indicate * which columns were used as match columns in building * a model. Keep in mind that master traces use node type * indices, rather than state type indices like traces * are supposed to. 
*/ static void mtr2rf(struct trace_s *mtr, int alen, char **ret_rf) { struct tracestack_s *dolist; struct trace_s *curr; char *rf; rf = (char *) MallocOrDie(sizeof(char) * (alen+1)); memset(rf, ' ', alen); rf[alen] = '\0'; dolist = InitTracestack(); PushTracestack(dolist, mtr->nxtl); while ((curr = PopTracestack(dolist)) != NULL) { if ( curr->type == MATP_NODE ) rf[curr->emitl] = rf[curr->emitr] = '.'; else if ( curr->type == MATL_NODE ) rf[curr->emitl] = '.'; else if ( curr->type == MATR_NODE ) rf[curr->emitr] = '.'; if (curr->nxtr) PushTracestack(dolist, curr->nxtr); if (curr->nxtl) PushTracestack(dolist, curr->nxtl); } FreeTracestack(dolist); *ret_rf = rf; } tRNAscan-SE-2.0/src/funcs.h0000644000543100007160000001645411021467306014722 0ustar pchanlowelab/* funcs.h * Declarations and prototypes of functions * SRE, Fri Sep 3 09:19:50 1993 * */ #include #include "structs.h" /* * from align.c */ extern int Trace2ali(char *seq, struct trace_s *tr, int watsoncrick, struct align_s **ret_ali); extern int Traces2Alignment(char **rseqs, SQINFO *sqinfo, struct trace_s **tr, int nseq, struct cm_s *cm, int watsoncrick, char ***ret_aseqs, AINFO *ainfo); /* from dbviterbi.c */ extern int ViterbiScan(struct istate_s *icm, int statenum, char *seq, int window, double thresh, int (*gotone_f)(int, int, double)); /* from debug.c */ extern char *UstatetypeName(int ustatetype); extern char *StatetypeName(int statetype); extern char *NodetypeName(int nodetype); extern void PrintViterbiAMX(FILE *fp, struct istate_s *icm, int statenum, char *seq, int N, int ***amx); extern void PrintTrace(FILE *fp, struct trace_s *tr); extern void PrintAli(FILE *fp, struct align_s *ali); extern void PrintICM(FILE *fp, struct cm_s *cm, struct istate_s *icm, int nstates); /* * from emit.c * * Generate sequences from a CVHMM */ extern int EmitSequence(struct cm_s *cm, int watsoncrick, struct align_s **ret_ali, char **ret_khseq, char **ret_seq); extern int EmitBestSequence(struct cm_s *cm, int watsoncrick, 
struct align_s **ret_ali, char **ret_khseq, char **ret_seq); /* * from fastmodelmaker.c */ extern int Fastmodelmaker(char **aseqs, AINFO *ainfo, int nseq, struct prior_s *prior, double gapthresh, double *ret_secinfo, struct cm_s **ret_cm, struct trace_s **ret_mtr); /* from fast-dbviterbi.c */ extern int FastViterbiScan(struct istate_s *icm, int statenum, int *minb, int *maxb, char *seq, int window, double thresh, int (*gotone_f)(int, int, double)); /* * from konings.c * * ASCII representation of structures and structural alignments */ extern int Align2kh(struct align_s *ali, char **ret_aseq, char **ret_khseq); extern int PrintAliLandscape(FILE *fp, struct cm_s *cm, struct align_s *ali); extern void Trace2KHS(struct trace_s *tr, char *seq, int rlen, int watsoncrick, char **ret_ss); extern int KHS2ct(char *ss, int len, int allow_pknots, int **ret_ct); extern int IsRNAComplement(char sym1, char sym2, int allow_gu); /* * from lengthdist.c */ extern void LengthDistribution(struct pstate_s *pcm, int statenum, int N, double ***ret_lmx); extern void LengthBounds(double **lmx, int statenum, int N, double epsilon, int **ret_min, int **ret_max); /* * from maxmodelmaker.c */ extern int Maxmodelmaker(char **aseqs, AINFO *ainfo, int nseq, double gapthresh, struct prior_s *prior, double *ret_ssinfo, struct cm_s **ret_cm, struct trace_s **ret_mtr); /* * from misc.c */ extern int SymbolIndex(char sym); extern int PrepareSequence(char *seq); /* * from model.c */ extern struct cm_s *AllocCM(int nodes); extern void FreeCM(struct cm_s *cm); extern void NormalizeCM(struct cm_s *cm); extern int VerifyCM(struct cm_s *cm); extern int RearrangeCM(struct cm_s *cm, double *rfreq, struct istate_s **ret_icm, int *ret_statenum); extern int MakePCM(struct cm_s *cm, struct pstate_s **ret_pcm, int *ret_statenum); extern void NormalizePCM(struct pstate_s *pcm, int M); /* from modelmaking.c */ extern void NumberMasterTrace(struct trace_s *mtr, int *ret_nodes); extern void TopofyNewCM(struct cm_s *cm, 
struct trace_s *mtr); extern void Transmogrify(struct trace_s *mtr, char *aseq, struct trace_s **ret_tr, struct trmem_s **ret_pool); extern void EasyModelmaker(char **aseq, AINFO *ainfo, int nseq, struct prior_s *prior, double gapthresh, int use_rf, struct cm_s **ret_cm, struct trace_s **ret_mtr); /* * from prior.c */ extern int DefaultPrior(struct prior_s **ret_prior); extern int WritePrior(FILE *fp, struct prior_s *prior); extern int ReadPrior(FILE *fp, struct prior_s **ret_prior); extern void NormalizePrior(struct prior_s *prior); /* * from probify.c */ extern void ProbifyCM(struct cm_s *cm, struct prior_s *prior); extern void ProbifyTransitionMatrix(double tmx[STATETYPES][STATETYPES],int from_node, int to_node, struct prior_s *prior); extern void ProbifySingletEmission(double emvec[ALPHASIZE], int statetype, struct prior_s *prior); extern void ProbifyPairEmission(double emx[ALPHASIZE][ALPHASIZE], struct prior_s *prior); /* * from save.c */ extern int WriteCM(FILE *fp, struct cm_s *cm); extern int WriteBinaryCM(FILE *fp, struct cm_s *cm); extern int ReadCM(char *filename, struct cm_s **ret_cm); /* from scorestack.c */ extern int ReportScanHit(int left, int right, double score, int (*print_hit)(int,int,double)); /* from smallviterbi.c */ extern int SmallViterbiAlign(struct istate_s *icm, int statenum, char *seq, double *ret_score, struct trace_s **ret_trace); /* * from structs.c * * Implementation of data structures: trees, stacks, and linked lists */ extern int StatetypeIndex(int type); extern int UniqueStatetype(int nodetype, int stidx); extern struct m2ali_s *Init_m2ali(void); extern void Push_m2ali(struct m2ali_s *stack, int nodeidx, int type, struct align_s *after); extern int Pop_m2ali(struct m2ali_s *stack, int *ret_nodeidx, int *ret_type, struct align_s **ret_after); extern void Free_m2ali( struct m2ali_s *stack ); extern struct t2ali_s *Init_t2ali(void); extern void Push_t2ali(struct t2ali_s *stack, struct trace_s *tracenode, struct align_s *after); 
extern int Pop_t2ali(struct t2ali_s *stack, struct trace_s **ret_tracenode, struct align_s **ret_after); extern void Free_t2ali( struct t2ali_s *stack ); extern struct align_s *Init_align(void); extern struct align_s *Insafter_align(int pos, char sym, char ss, int nodeidx, int type, struct align_s *after); extern void Delafter_align(struct align_s *after); extern void Free_align(struct align_s *head); extern struct intstack_s *InitIntStack(void); extern void PushIntStack(struct intstack_s *stack, int data); extern int PopIntStack(struct intstack_s *stack, int *ret_data); extern int FreeIntStack( struct intstack_s *stack ); /* * from trace.c */ extern void InitTrace(struct trace_s **ret_new, struct trmem_s **ret_pool); extern struct trace_s *AttachTrace(struct trace_s *parent, struct trmem_s *pool, int emitl, int emitr, int nodeidx, int type); extern void FreeTrace(struct trace_s *tr, struct trmem_s *pool); extern void DeleteTracenode(struct trace_s *oldtr, struct trmem_s *pool); extern void InitTracepool(struct trmem_s **ret_pool); extern struct trace_s *PopTracepool(struct trmem_s *pool); extern void FreeTracepool(struct trmem_s *pool); extern struct tracestack_s *InitTracestack(void); extern void PushTracestack(struct tracestack_s *stack, struct trace_s *node); extern struct trace_s *PopTracestack(struct tracestack_s *stack); extern void FreeTracestack(struct tracestack_s *stack); extern int TraceCount(struct cm_s *cm, char *seq, double weight, struct trace_s *tr); extern int TraceCountPrior(struct cm_s *cm, struct prior_s *prior, char *seq, double weight, struct trace_s *tr); extern double TraceScore(struct cm_s *cm, char *seq, struct trace_s *tr); /* * from viterbi.c */ extern int ViterbiAlign(struct istate_s *cm, int statenum, char *seq, double *ret_score, struct trace_s **ret_trace); tRNAscan-SE-2.0/src/stack.c0000644000543100007160000000441611021467306014677 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 
1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* stack.c * SRE, Thu Mar 3 10:08:48 1994 * * Implementation of generic stack structures. */ #include #include "squid.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif /************************************************************ * intstack_s implementation. * * Functions: InitIntStack() - returns ptr to new stack * PushIntStack() - (void) * PopIntStack() - returns 1 on success, 0 if stack empty * FreeIntStack() - returns number of elements free'd, or 0 if * stack was empty. * * Implementation of the pushdown stack for storing single * integers. *************************************************************/ struct intstack_s * InitIntStack(void) { struct intstack_s *stack; if ((stack = (struct intstack_s *) malloc (sizeof(struct intstack_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); stack->nxt = NULL; return stack; } void PushIntStack(struct intstack_s *stack, int data) { struct intstack_s *new; if ((new = (struct intstack_s *) malloc (sizeof(struct intstack_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); new->data = data; new->nxt = stack->nxt; stack->nxt = new; } int PopIntStack(struct intstack_s *stack, int *ret_data) { struct intstack_s *old; if (stack->nxt == NULL) return 0; old = stack->nxt; stack->nxt = old->nxt; *ret_data = old->data; free(old); return 1; } void ReverseIntStack(struct intstack_s *stack) { struct intstack_s *old; struct intstack_s *new; old = stack->nxt; stack->nxt = NULL; while (old != NULL) { new = old; /* remove one from top of old stack */ old = old->nxt; new->nxt = stack->nxt; /* push it onto new stack */ stack->nxt = new; } } int FreeIntStack( struct intstack_s *stack ) { int data; int count = 0; while (PopIntStack(stack, &data)) count++; free(stack); return count; } 
tRNAscan-SE-2.0/src/selex.c0000644000543100007160000006061011021467305014707 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* selex.c * * Fri Dec 4 17:43:24 1992, SRE: * Reading and writing aligned sequences to/from disk files. * Implements a new, broader specification of SELEX format * and supercedes alignio.c. * * SRE, Tue Nov 9 17:40:50 1993: * major revision. #= special comments and aliinfo_s optional * alignment info support added. Support for #=CS (consensus * secondary structure), #=SS (individual secondary structure), * #=RF (reference coordinate system), #=SQ (per-sequence header info), * and #=AU ("author") added. * * SRE, Mon Jan 30 14:41:49 1995: * #=SA side chain % surface accessibility annotation supported * * SELEX format is documented in Docs/formats.tex. **************************************************************************** */ #include #include #include #include #include #include "squid.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif static void homogenize_gapsym(char *s, char gapsym); static int copy_alignment_line(char *aseq, int apos, int name_rcol, char *buffer, int lcol, int rcol, char gapsym); static char commentsyms[] = "%#"; /* Function: ReadSELEX() * * Read multiple aligned sequences from the file seqfile. * Store aligned sequences in aseqs, names in names, and * the number of sequences in num. * * Memory is allocated for aseqs and names, and they must be * free'd by the caller. * * If optional information is desired, a non-NULL ainfo * pointer is passed. * * Returns 1 on success. Returns 0 on failure and sets * squid_errno to indicate the cause of the failure. 
*/ int ReadSELEX(char *seqfile, char ***ret_aseqs, int *ret_num, AINFO *ainfo) { FILE *fp; /* ptr to opened seqfile */ char **aseqs; /* aligned seqs */ int num; /* number of seqs read */ char buffer[LINEBUFLEN]; /* input buffer for lines */ char bufcpy[LINEBUFLEN]; /* strtok'able copy of buffer */ struct block_struc { /** alignment data for a block: */ int lcol; /* furthest left aligned sym */ int rcol; /* furthest right aligned sym */ } *blocks; int blocknum; /* number of blocks in file */ char *nptr; /* ptr to start of name on line */ char *sptr; /* ptr into sequence on line */ int currnum; /* num. seqs in given block */ int currblock; /* index for blocks */ int i; /* loop counter */ int seqidx; /* counter for seqs */ int alen; /* length of alignment */ int warn_names; /* becomes TRUE if names don't match between blocks */ int headnum; /* seqidx in per-sequence header info */ int currlen; int count; /*************************************************** * First pass across file. * Count seqs, get names, determine column info * Determine what sorts of info are active in this file. ***************************************************/ ainfo->flags = 0; /* open the file for reading */ fp = fopen(seqfile, "r"); if (fp == NULL) { squid_errno = SQERR_NOFILE; return 0; } /* get first line of the block * (non-comment, non-blank) */ do { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { squid_errno = SQERR_NODATA; return 0; } strcpy(bufcpy, buffer); if (*buffer == '#') { if (strncmp(buffer, "#=CS", 4) == 0) ainfo->flags |= AINFO_CS; else if (strncmp(buffer, "#=RF", 4) == 0) ainfo->flags |= AINFO_RF; } } while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || (strchr(commentsyms, *nptr) != NULL)); blocknum = 0; warn_names = FALSE; while (!feof(fp)) { /* allocate for info about this block. 
*/ if (blocknum == 0) blocks = (struct block_struc *) malloc (sizeof(struct block_struc)); else blocks = (struct block_struc *) realloc (blocks, (blocknum+1) * sizeof(struct block_struc)); if (blocks == NULL) { squid_errno = SQERR_MEM; return 0; } blocks[blocknum].lcol = LINEBUFLEN+1; blocks[blocknum].rcol = -1; currnum = 0; while (nptr != NULL) /* becomes NULL when this block ends. */ { /* First block only: save names */ if (blocknum == 0) { if (currnum == 0) ainfo->sqinfo = (SQINFO *) malloc (sizeof(SQINFO)); else ainfo->sqinfo = (SQINFO *) realloc (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO)); if (ainfo->sqinfo == NULL) { squid_errno = SQERR_MEM; return 0; } ainfo->sqinfo[currnum].flags = 0; SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME); } else /* in each additional block: check names */ { if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0) warn_names = TRUE; } currnum++; /* check rcol, lcol */ if ((sptr = strtok(NULL, WHITESPACE)) != NULL) { /* is this the furthest left we've seen word 2 in this block? 
*/ if (sptr - bufcpy < blocks[blocknum].lcol) blocks[blocknum].lcol = sptr - bufcpy; /* look for right side in buffer */ for (sptr = buffer + strlen(buffer) - 1; strchr(WHITESPACE, *sptr) != NULL; sptr --) /* do nothing */ ; if (sptr - buffer > blocks[blocknum].rcol) blocks[blocknum].rcol = sptr - buffer; } /* get the next line; blank line means end of block */ do { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; } strcpy(bufcpy, buffer); if (strncmp(buffer, "#=SS", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS; else if (strncmp(buffer, "#=SA", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA; else if (strncmp(buffer, "#=CS", 4) == 0) ainfo->flags |= AINFO_CS; else if (strncmp(buffer, "#=RF", 4) == 0) ainfo->flags |= AINFO_RF; if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; } while (strchr(commentsyms, *nptr) != NULL); } /* check that number of sequences matches expected */ if (blocknum == 0) num = currnum; else if (currnum != num) { squid_errno = SQERR_FORMAT; return 0; } blocknum++; /* get first line of next block * (non-comment, non-blank) */ do { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; } strcpy(bufcpy, buffer); } while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || (strchr(commentsyms, *nptr) != NULL)); } /*************************************************** * Get ready for second pass: * figure out the length of the alignment * malloc space * rewind the file ***************************************************/ alen = 0; for (currblock = 0; currblock < blocknum; currblock++) alen += blocks[currblock].rcol - blocks[currblock].lcol + 1; rewind(fp); /* allocations */ if ((aseqs = (char **) malloc (num * sizeof(char *))) == NULL) { squid_errno = SQERR_MEM; return 0; } if ((ainfo->flags & AINFO_CS) && (ainfo->cs = (char *) malloc ((alen+1) * sizeof(char))) == NULL) { squid_errno = SQERR_MEM; return 0; } if ((ainfo->flags & AINFO_RF) && (ainfo->rf = (char *) malloc ((alen+1) * sizeof(char))) == NULL) { 
squid_errno = SQERR_MEM; return 0; } for (i = 0; i < num; i++) { if ((aseqs[i] = (char *) malloc ((alen+1) * sizeof(char))) == NULL) { squid_errno = SQERR_MEM; return 0; } if ((ainfo->sqinfo[i].flags & SQINFO_SS) && (ainfo->sqinfo[i].ss = (char *) malloc ((alen+1) * sizeof(char))) == NULL) { squid_errno = SQERR_MEM; return 0; } if ((ainfo->sqinfo[i].flags & SQINFO_SA) && (ainfo->sqinfo[i].sa = (char *) malloc ((alen+1) * sizeof(char))) == NULL) { squid_errno = SQERR_MEM; return 0; } } ainfo->alen = alen; ainfo->flags |= AINFO_ALEN; /*************************************************** * Second pass across file. Parse header; assemble sequences ***************************************************/ /* We've now made a complete first pass over the file. We know how * many blocks it contains, we know the number of seqs in the first * block, and we know every block has the same number of blocks; * so we can be a bit more cavalier about error-checking as we * make the second pass. */ /* Look for header */ headnum = 0; for (;;) { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { squid_errno = SQERR_NODATA; return 0; } strcpy(bufcpy, buffer); if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */ if (strcmp(nptr, "#=AU") == 0 && /* "author" info */ (sptr = strtok(NULL, "\n")) != NULL) { strncpy(ainfo->au,sptr,63); ainfo->au[63] = '\0'; ainfo->flags |= AINFO_AUTH; } else if (strcmp(nptr, "#=SQ") == 0) /* per-sequence header info */ { /* first field is the name */ if ((sptr = strtok(NULL, WHITESPACE)) == NULL) { squid_errno = SQERR_FORMAT; return 0; } if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE; /* second field is the weight */ if ((sptr = strtok(NULL, WHITESPACE)) == NULL) { squid_errno = SQERR_FORMAT; return 0; } SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_WGT); /* third field is database source id */ if ((sptr = strtok(NULL, WHITESPACE)) == NULL) { squid_errno = SQERR_FORMAT; return 0; } 
SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID); /* fourth field is database accession number */ if ((sptr = strtok(NULL, WHITESPACE)) == NULL) { squid_errno = SQERR_FORMAT; return 0; } SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC); /* fifth field is start..stop::olen */ if ((sptr = strtok(NULL, ".:")) == NULL) { squid_errno = SQERR_FORMAT; return 0; } SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START); if ((sptr = strtok(NULL, ".:")) == NULL) { squid_errno = SQERR_FORMAT; return 0; } SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP); if ((sptr = strtok(NULL, ":\t ")) == NULL) { squid_errno = SQERR_FORMAT; return 0; } SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN); /* rest of line is optional description */ if ((sptr = strtok(NULL, "\n")) != NULL) SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC); headnum++; } else if (strcmp(nptr, "#=CS") == 0) break; else if (strcmp(nptr, "#=RF") == 0) break; else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */ } currlen = 0; for (currblock = 0 ; currblock < blocknum; currblock++) { /* parse the block */ seqidx = 0; while (nptr != NULL) { /* Consensus structure */ if (strcmp(nptr, "#=CS") == 0) { if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) { squid_errno = SQERR_FORMAT; return 0; } } /* Reference coordinates */ else if (strcmp(nptr, "#=RF") == 0) { if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) ' ')) { squid_errno = SQERR_FORMAT; return 0; } } /* Individual secondary structure */ else if (strcmp(nptr, "#=SS") == 0) { if (! 
copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) { squid_errno = SQERR_FORMAT; return 0; } } /* Side chain % surface accessibility code */ else if (strcmp(nptr, "#=SA") == 0) { if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) { squid_errno = SQERR_FORMAT; return 0; } } /* Aligned sequence; avoid unparsed machine comments */ else if (strncmp(nptr, "#=", 2) != 0) { if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) { squid_errno = SQERR_FORMAT; return 0; } seqidx++; } /* get next line */ for (;;) { nptr = NULL; if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* EOF */ strcpy(bufcpy, buffer); if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */ if (strncmp(buffer, "#=", 2) == 0) break; /* machine comment */ if (strchr(commentsyms, *nptr) == NULL) break; /* data */ } } /* end of a block */ currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1; /* get line 1 of next block */ for (;;) { if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */ strcpy(bufcpy, buffer); if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */ if (strncmp(buffer, "#=", 2) == 0) break; /* machine comment */ if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */ } } /* end of the file */ /* Lengths in sqinfo are for raw sequence (ungapped), * and SS, SA are 0..rlen-1 not 0..alen-1. * Only the seqs with structures come out of here with lengths set. */ for (seqidx = 0; seqidx < num; seqidx++) { int apos, rpos; /* secondary structures */ if (ainfo->sqinfo[seqidx].flags & SQINFO_SS) { for (apos = rpos = 0; apos < alen; apos++) if (! 
isgap(aseqs[seqidx][apos])) { ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos]; rpos++; } ainfo->sqinfo[seqidx].ss[rpos] = '\0'; homogenize_gapsym(ainfo->sqinfo[seqidx].ss, (char) '.'); } /* Surface accessibility */ if (ainfo->sqinfo[seqidx].flags & SQINFO_SA) { for (apos = rpos = 0; apos < alen; apos++) if (! isgap(aseqs[seqidx][apos])) { ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos]; rpos++; } ainfo->sqinfo[seqidx].sa[rpos] = '\0'; homogenize_gapsym(ainfo->sqinfo[seqidx].sa, (char) '.'); } } /* NULL-terminate all the strings */ if (ainfo->flags & AINFO_RF) ainfo->rf[alen] = '\0'; if (ainfo->flags & AINFO_CS) { ainfo->cs[alen] = '\0'; homogenize_gapsym(ainfo->cs, (char) '.'); } for (seqidx = 0; seqidx < num; seqidx++) { aseqs[seqidx][alen] = '\0'; homogenize_gapsym(aseqs[seqidx], (char) '.'); } /* find raw sequence lengths for sqinfo */ for (seqidx = 0; seqidx < num; seqidx++) { count = 0; for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++) if (!isgap(*sptr)) count++; ainfo->sqinfo[seqidx].len = count; ainfo->sqinfo[seqidx].flags |= SQINFO_LEN; } /*************************************************** * Garbage collection and return ***************************************************/ fclose(fp); free(blocks); if (warn_names) Warn("sequences may be in different orders in blocks of %s?", seqfile); *ret_num = num; *ret_aseqs = aseqs; return 1; } /* Function: WriteSELEX() * * Write aligned sequences to an open file pointer, * breaking into multiple blocks if the sequences are * long. Number of symbols written per line is set by cpl. * The alignment must be flushed (all aseqs the same length). * * cpl cannot exceed 255. * * May also write optional information from ainfo; * ainfo may be NULL. * * Returns 1 on success. Returns 0 on failure, and sets * squid_errno to indicate the cause. 
*/ int WriteSELEX(FILE *fp, char **aseqs, int num, struct aliinfo_s *ainfo, int cpl) { int idx; /* counter for sequences */ int namelen; /* maximum name length used */ int len; /* tmp variable for name lengths */ char buffer[256]; /* buffer for writing seq */ int alen; int currpos; char **ss; /* aligned secondary structure strings */ char **sa; /* aligned accessibility strings */ alen = (ainfo->flags & AINFO_ALEN) ? ainfo->alen : strlen(aseqs[0]); /* calculate max namelen used */ namelen = 0; for (idx = 0; idx < num; idx++) if ((len = strlen(ainfo->sqinfo[idx].name)) > namelen) namelen = len; if (namelen < 6) namelen = 6; /* Make aligned secondary structure strings */ ss = (char **) MallocOrDie(sizeof(char *) * num); sa = (char **) MallocOrDie(sizeof(char *) * num); for (idx = 0; idx < num; idx++) { if (ainfo->sqinfo[idx].flags & SQINFO_SS) MakeAlignedString(aseqs[idx], alen, ainfo->sqinfo[idx].ss, &(ss[idx])); if (ainfo->sqinfo[idx].flags & SQINFO_SA) MakeAlignedString(aseqs[idx], alen, ainfo->sqinfo[idx].sa, &(sa[idx])); } /* Write header info */ if (ainfo->flags & AINFO_AUTH) fprintf(fp, "#=AU %s\n", ainfo->au); if ((ainfo->sqinfo[0].flags & SQINFO_WGT) || (ainfo->sqinfo[0].flags & SQINFO_ID) || (ainfo->sqinfo[0].flags & SQINFO_ACC) || (ainfo->sqinfo[0].flags & SQINFO_START) || (ainfo->sqinfo[0].flags & SQINFO_STOP) || (ainfo->sqinfo[0].flags & SQINFO_OLEN) || (ainfo->sqinfo[0].flags & SQINFO_DESC)) for (idx = 0; idx < num; idx++) fprintf(fp, "#=SQ %-*.*s %6.4f %s %s %d..%d::%d %s\n", namelen, namelen, ainfo->sqinfo[idx].name, (ainfo->sqinfo[idx].flags & SQINFO_WGT) ? ainfo->sqinfo[idx].weight : 1.0, (ainfo->sqinfo[idx].flags & SQINFO_ID) ? ainfo->sqinfo[idx].id : "-", (ainfo->sqinfo[idx].flags & SQINFO_ACC) ? ainfo->sqinfo[idx].id : "-", (ainfo->sqinfo[idx].flags & SQINFO_START) ? ainfo->sqinfo[idx].start : 0, (ainfo->sqinfo[idx].flags & SQINFO_STOP) ? ainfo->sqinfo[idx].stop : 0, (ainfo->sqinfo[idx].flags & SQINFO_OLEN) ? 
ainfo->sqinfo[idx].olen : 0, (ainfo->sqinfo[idx].flags & SQINFO_DESC) ? ainfo->sqinfo[idx].desc : "-"); fprintf(fp, "\n"); /* main loop: write seqs in blocks. */ for (currpos = 0; currpos < alen; currpos += cpl) { /* Reference coord system */ if (ainfo->flags & AINFO_RF) { strncpy(buffer, ainfo->rf + currpos, cpl); buffer[cpl] = '\0'; fprintf(fp, "%-*.*s %s\n", namelen, namelen, "#=RF", buffer); } /* Consensus secondary structure */ if (ainfo->flags & AINFO_CS) { strncpy(buffer, ainfo->cs + currpos, cpl); buffer[cpl] = '\0'; fprintf(fp, "%-*.*s %s\n", namelen, namelen, "#=CS", buffer); } for (idx = 0; idx < num; idx++) { /* Aligned sequence */ strncpy(buffer, aseqs[idx] + currpos, cpl); buffer[cpl] = '\0'; fprintf(fp, "%-*.*s %s\n", namelen, namelen, ainfo->sqinfo[idx].name, buffer); /* Individual secondary structure */ if (ainfo->sqinfo[idx].flags & SQINFO_SS) { strncpy(buffer, ss[idx] + currpos, cpl); buffer[cpl] = '\0'; fprintf(fp, "%-*.*s %s\n", namelen, namelen, "#=SS", buffer); } /* Surface accessibility */ if (ainfo->sqinfo[idx].flags & SQINFO_SA) { strncpy(buffer, sa[idx] + currpos, cpl); buffer[cpl] = '\0'; fprintf(fp, "%-*.*s %s\n", namelen, namelen, "#=SA", buffer); } } /* put blank line between blocks */ fprintf(fp, "\n"); } /* Garbage collection */ for (idx = 0; idx < num; idx++) if (ainfo->sqinfo[idx].flags & SQINFO_SS) free(ss[idx]); free(ss); return 1; } /* Function: homogenize_gapsym() * * Purpose: Make gap symbols homogeneous. */ static void homogenize_gapsym(char *s, char gapsym) { for (; *s != '\0'; s++) if (isgap(*s)) *s = gapsym; } /* Function: copy_alignment_line() * * Purpose: Given a line from an alignment file, and bounds lcol,rcol * on what part of it may be sequence, save the alignment into * aseq starting at position apos. * * name_rcol is set to the rightmost column this aseqs's name * occupies; if name_rcol >= lcol, we have a special case in * which the name intrudes into the sequence zone. 
*/ static int copy_alignment_line(char *aseq, int apos, int name_rcol, char *buffer, int lcol, int rcol, char gapsym) { char *s1, *s2; int i; s1 = aseq + apos; s2 = buffer; /* be careful that buffer doesn't end before lcol! */ for (i = 0; i < lcol; i++) if (*s2) s2++; for (i = lcol; i <= rcol; i++) { if (*s2 == '\t') { Warn("TAB characters will corrupt a SELEX alignment! Please remove them first."); return 0; } if (name_rcol >= i) /* name intrusion special case: pad left w/ gaps */ *s1 = gapsym; /* short buffer special case: pad right w/ gaps */ else if (*s2 == '\0' || *s2 == '\n') *s1 = gapsym; else /* normal case: copy buffer into aseq */ *s1 = *s2; s1++; if (*s2) s2++; } return 1; } /* Function: DealignAseqs() * * Given an array of (num) aligned sequences aseqs, * strip the gaps, represented by ' ' space characters. * Store the raw sequences in a new allocated array. * * Caller is responsible for free'ing the memory allocated to * rseqs. * * Returns 1 on success. Returns 0 and sets squid_errno on * failure. */ int DealignAseqs(char **aseqs, int num, char ***ret_rseqs) { char **rseqs; /* de-aligned sequence array */ int idx; /* counter for sequences */ int depos; /* position counter for dealigned seq*/ int apos; /* position counter for aligned seq */ int seqlen; /* length of aligned seq */ /* alloc space */ if ((rseqs = (char **) malloc (num * sizeof(char *))) == NULL) { squid_errno = SQERR_MEM; return 0; } /* main loop */ for (idx = 0; idx < num; idx++) { seqlen = strlen(aseqs[idx]); /* alloc space */ if ((rseqs[idx] = (char *) malloc ((seqlen + 1) * sizeof(char))) == NULL) { squid_errno = SQERR_MEM; return 0; } /* strip gaps */ depos = 0; for (apos = 0; aseqs[idx][apos] != '\0'; apos++) if (!isgap(aseqs[idx][apos])) { rseqs[idx][depos] = aseqs[idx][apos]; depos++; } rseqs[idx][depos] = '\0'; } *ret_rseqs = rseqs; return 1; } /* Function: IsSELEXFormat() * * Return TRUE if filename may be in SELEX format. 
* * Accuracy is sacrificed for speed; a TRUE return does * *not* guarantee that the file will pass the stricter * error-checking of ReadSELEX(). All it checks is that * the first 500 non-comment lines of a file are * blank, or if there's a second "word" on the line * it looks like sequence (i.e., it's not kOtherSeq). * * Returns TRUE or FALSE. */ int IsSELEXFormat(char *filename) { FILE *fp; /* ptr to open sequence file */ char buffer[LINEBUFLEN]; char *sptr; /* ptr to first word */ int linenum; if ((fp = fopen(filename, "r")) == NULL) { squid_errno = SQERR_NOFILE; return 0; } linenum = 0; while (linenum < 500 && fgets(buffer, LINEBUFLEN, fp) != NULL) { linenum++; /* dead giveaways for extended SELEX */ if (strncmp(buffer, "#=AU", 4) == 0) goto DONE; else if (strncmp(buffer, "#=SQ", 4) == 0) goto DONE; else if (strncmp(buffer, "#=SS", 4) == 0) goto DONE; else if (strncmp(buffer, "#=CS", 4) == 0) goto DONE; else if (strncmp(buffer, "#=RF", 4) == 0) goto DONE; /* a comment? */ if (strchr(commentsyms, *buffer) != NULL) continue; /* a blank line? */ if ((sptr = strtok(buffer, WHITESPACE)) == NULL) continue; /* a one-word line (name only) is possible, though rare */ if ((sptr = strtok(NULL, "\n")) == NULL) continue; if (Seqtype(sptr) == kOtherSeq) {fclose(fp); return 0;} } DONE: fclose(fp); return 1; } /* Function: TruncateNames() * * Make sure all names are a single word. * - if they are blank, make a name up (use the number of the sequence) * - if it's already one word, leave it alone * - if it's more than one word, put a terminator '\0' after the * first word * * Used to check an array of names before writing a SELEX-format file. * * Returns 1 on success. Returns 0 on failure and sets squid_errno * to indicate the cause. 
*/ int TruncateNames(char **names, int N) { int idx; char newname[32]; for (idx = 0; idx < N; idx++) if (names[idx] == NULL || strtok(names[idx], " \t\n") == NULL) { (void) sprintf(newname, "%d", idx); if (names[idx] != NULL) free(names[idx]); names[idx] = Strdup(newname); } return 1; } tRNAscan-SE-2.0/src/getopt.c0000644000543100007160000001467311021467312015077 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ #include #include #include #include "squid.h" /* Function: Getopt() * * Purpose: Portable command line option parsing with abbreviated * option switches. Replaces UNIX getopt(). Using UNIX getopt() * hinders portability to non-UNIX platforms, and getopt() * is also limited to single letter options. * * Getopt() implements a superset of UNIX getopt(). * All of getopt()'s single-character switch behavior * is emulated, and "--" by itself terminates the options. * Additionally, Getopt() provides extended switches * like "--youroptionhere", and Getopt() type checks * arguments. * * Extended options must start with "--", as in "--option1". * Normal options must start with "-", as in "-o". * Normal options may be concatenated, as in "-a -b" == "-ab". * * See bottom of this .c file after #fdef GETOPT_TESTDRIVER * for an example of calling Getopt(). * * Args: argc - from main(). number of elems in argv. * argv - from main(). argv[0] is the name of the command. * opt - array of opt_s structures, defining option switches * nopts - number of switches in opt * usage - a (possibly long) string to print if usage error. * ret_optind - RETURN: the index in argv[] of the next * valid command-line token. * ret_optname- RETURN: ptr to the name of option switch * seen, or NULL if no option was seen. 
* ret_optarg - RETURN: ptr to the optional argument, if any; * NULL if option takes no argument. * * Return: 1 if a valid option was parsed. * 0 if no option was found, and command-line parsing is complete. * Die()'s here if an error is detected. */ int Getopt(int argc, char **argv, struct opt_s *opt, int nopts, char *usage, int *ret_optind, char **ret_optname, char **ret_optarg) { int i; int arglen; int nmatch; static int optind = 1; /* init to 1 on first call */ static char *optptr = NULL; /* ptr to next valid switch */ int opti; /* Check to see if we've run out of options. */ if (optind >= argc || argv[optind][0] != '-') { *ret_optind = optind; *ret_optarg = NULL; *ret_optname = NULL; return 0; } /* Check to see if we're being told that this is the end * of the options. */ if (strcmp(argv[optind], "--") == 0) { optind++; *ret_optind = optind; *ret_optname = NULL; *ret_optarg = NULL; return 0; } /* We have a real option. Find which one it is. * We handle single letter switches "-o" separately * from full switches "--option", based on the "-" vs. "--" * prefix -- single letter switches can be concatenated. 
*/ /* full option */ if (optptr == NULL && strncmp(argv[optind], "--", 2) == 0) { optptr = NULL; /* full options can't concantenate */ arglen = strlen(argv[optind]); nmatch = 0; for (i = 0; i < nopts; i++) if (opt[i].single == FALSE && strncmp(opt[i].name, argv[optind], arglen) == 0) { nmatch++; opti = i; } if (nmatch > 1) Die("Option \"%s\" is ambiguous; please be more specific.\n%s", argv[optind], usage); if (nmatch == 0) Die("No such option \"%s\".\n%s", argv[optind], usage); *ret_optname = opt[opti].name; /* Set the argument, if there is one */ if (opt[opti].argtype != ARG_NONE) { if (optind+1 >= argc) Die("Option %s requires an argument\n%s", opt[opti].name, usage); *ret_optarg = argv[optind+1]; optind+=2; } else /* ARG_NONE */ { *ret_optarg = NULL; optind++; } } else /* else, a single letter option "-o" */ { /* find the option */ if (optptr == NULL) optptr = argv[optind]+1; for (opti = -1, i = 0; i < nopts; i++) if (opt[i].single == TRUE && *optptr == opt[i].name[1]) { opti = i; break; } if (opti == -1) Die("No such option \"%c\".\n%s", *optptr, usage); *ret_optname = opt[opti].name; /* set the argument, if there is one */ if (opt[opti].argtype != ARG_NONE) { if (*(optptr+1) != '\0') /* attached argument */ { *ret_optarg = optptr+1; optind++; } else if (optind+1 < argc) /* unattached argument */ { *ret_optarg = argv[optind+1]; optind+=2; } else Die("Option %s requires an argument\n%s", opt[opti].name, usage); optptr = NULL; /* can't concatenate after an argument */ } else /* ARG_NONE */ { *ret_optarg = NULL; if (*(optptr+1) != '\0') /* concatenation */ optptr++; else { optind++; /* move to next field */ optptr = NULL; } } } /* Type check the argument, if there is one */ if (opt[opti].argtype != ARG_NONE) { if (opt[opti].argtype == ARG_INT && ! IsInt(*ret_optarg)) Die("Option %s requires an integer argument\n%s", opt[opti].name, usage); else if (opt[opti].argtype == ARG_FLOAT && ! 
IsReal(*ret_optarg)) Die("Option %s requires a numerical argument\n%s", opt[opti].name, usage); else if (opt[opti].argtype == ARG_CHAR && strlen(*ret_optarg) != 1) Die("Option %s requires a single-character argument\n%s", opt[opti].name, usage); /* ARG_STRING is always ok, no type check necessary */ } *ret_optind = optind; return 1; } #ifdef GETOPT_TESTDRIVER struct opt_s OPTIONS[] = { { "--test1", FALSE, ARG_INT }, { "--test2", FALSE, ARG_FLOAT }, { "--test3", FALSE, ARG_STRING }, { "--test4", FALSE, ARG_CHAR }, { "-a", TRUE, ARG_NONE }, { "-b", TRUE, ARG_INT }, }; #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) int main(int argc, char **argv) { int optind; char *optarg; char *optname; while (Getopt(argc, argv, OPTIONS, NOPTIONS, "Usage/help here", &optind, &optname, &optarg)) { printf("index: %d name: %s argument: %s\n", optind, optname, optarg); } } #endif /*GETOPT_TESTDRIVER*/ tRNAscan-SE-2.0/src/sstofa.in0000644000543100007160000000235713100451015015245 0ustar pchanlowelab#! 
@PERL@

# sstofa - convert a tRNAscan-SE secondary structure report to FASTA.
#
# Reads the structure file named on the command line and writes one FASTA
# record to standard output for every "Seq:" line.  The record name and
# description are assembled from the "<name> (<from>-<to>) Length: <n> bp",
# "Type: ... Anticodon: ... Score: ..." and "pseudogene" lines seen since
# the previous record.

if ($#ARGV < 0) {
    die "\nConvert secondary structure file to fasta format\n",
        "\nUsage: sstofa \n\n";
}

$ss_file = shift;

# Three-argument open, so the file name is never parsed as an open mode.
open(SSFILE, '<', $ss_file) || die "Couldn't find $ss_file\n";
open(SEQFILE, ">-")         || die "Couldn't open standard output\n";

while ($line = <SSFILE>) {
    if ($line =~ /^(\S+)\s+(\(\d+\-\d+\))\s+Length:\s(\d+)\sbp/) {
        # Header line: sequence name, "(start-end)" bounds, length.
        $SeqName = $1;
        $bounds  = $2;
        $SeqLen  = $3;
    }
    elsif ($line =~ /^Type:\s(\S+)\s+Anticodon:\s(\S+).+Score:\s(\S+)/) {
        # Isotype and anticodon are appended to the name to keep it unique.
        $isotype  = $1;
        $ac       = $2;
        $SeqName .= "-".$isotype.$ac;
        $score    = $3;
        $SeqDescription = "$bounds $isotype ($ac) $SeqLen bp Sc: $score";
    }
    elsif ($line =~ /pseudogene/) {
        $SeqDescription .= " Pseudo";
    }
    elsif ($line =~ /^Seq:\s(\S+)$/) {
        # The sequence line completes a record: emit it, then reset state.
        $Seq = $1;
        write_fasta($SeqName, $SeqDescription, length($Seq), uc($Seq), *SEQFILE);

        $SeqName        = "";
        $bounds         = "";
        $SeqLen         = 0;
        $isotype        = "";
        $ac             = "";
        $score          = 0.0;
        $SeqDescription = "";
        $Seq            = "";
    }
}

close(SSFILE);

# End Main

# write_fasta($name, $description, $length, $sequence, FILEHANDLE)
#
# Print one FASTA record to FILEHANDLE: a ">name description" header line,
# then $sequence in lines of at most 60 characters, covering offsets
# 0 .. $length-1.  FILEHANDLE may be a typeglob (*FH) or a handle name.
# Always returns 1.
sub write_fasta {
    my ($name, $description, $length, $sequence, $fh) = @_;
    my ($pos, $line);

    print {$fh} ">$name $description\n";
    for ($pos = 0; $pos < $length; $pos += 60) {
        $line = substr($sequence, $pos, 60);
        print {$fh} $line, "\n";
    }
    1;
}
tRNAscan-SE-2.0/src/debug.c0000644000543100007160000001234311704460714014662 0ustar pchanlowelab/* debug.c
 * Fri Jan 28 14:10:59 1994
 * 
 * Code specifically for debugging the package.
 */

#include <stdio.h>
#include "structs.h"
#include "funcs.h"

/* Function: UstatetypeName()
 * 
 * Purpose:  "Ustatetypes" -- unique state types -- are used in the
 *           integer-style models of the alignment algorithms.
 *           Given such a flag, return a string representation of
 *           the unique statetype.
*/ char * UstatetypeName(int ustatetype) { switch (ustatetype) { case uDEL_ST: return "uDEL_ST"; case uMATP_ST: return "uMATP_ST"; case uMATL_ST: return "uMATL_ST"; case uMATR_ST: return "uMATR_ST"; case uINSL_ST: return "uINSL_ST"; case uINSR_ST: return "uINSR_ST"; case uBEGIN_ST: return "uBEGIN_ST"; case uEND_ST: return "uEND_ST"; case uBIFURC_ST: return "uBIFURC_ST"; default: return "Unknown state type"; } } /* Function: StatetypeName() * * Purpose: Given a statetype integer, return a string representation * for that statetype. */ char * StatetypeName(int statetype) { switch (statetype) { case DEL_ST: return "DEL/BEG/BIF/END"; case MATP_ST: return "MATP_ST"; case MATL_ST: return "MATL_ST"; case MATR_ST: return "MATR_ST"; case INSL_ST: return "INSL_ST"; case INSR_ST: return "INSR_ST"; default: return "Unknown State"; } } /* Function: NodetypeName() * * Purpose: Given a node type integer, return a printable name * for the node type. */ char * NodetypeName(int nodetype) { switch (nodetype) { case BIFURC_NODE: return "BIF/END NODE"; case MATP_NODE: return "MATP_NODE"; case MATL_NODE: return "MATL_NODE"; case MATR_NODE: return "MATR_NODE"; case BEGINL_NODE: return "BEGINL_NODE"; case BEGINR_NODE: return "BEGINR_NODE"; case ROOT_NODE: return "ROOT_NODE"; default: return "Unknown Node"; } } /* Function: PrintViterbiAMX() * * Purpose: Print out a normal main matrix from the original * Viterbi alignment algorithm. 
* */ void PrintViterbiAMX(FILE *fp, /* usually stderr/stdout */ struct istate_s *icm, /* integer model */ int statenum, /* length of model in states */ char *seq, /* sequence, 1..N */ int N, /* length of seq */ int ***amx) /* 'A' matrix */ { int diff, j, y; /* indices for three dimensions */ for (y = 0; y < statenum; y++) { fprintf(fp, "### A Matrix for state %d, type %d (%s), from node %d\n", y, icm[y].statetype, UstatetypeName(icm[y].statetype), icm[y].nodeidx); fprintf(fp, " "); for (diff = 0; diff <= N; diff++) fprintf(fp, "%6d ", diff); fprintf(fp, "\n"); for (j = 0; j <= N; j++) { fprintf(fp, "%c %3d ", ((j > 0) ? seq[j] : '*'), j); for (diff = 0; diff <= j; diff++) fprintf(fp, "%6d ", amx[j][diff][y]); fprintf(fp, "\n"); } fprintf(fp, "\n\n"); } } /* Function: PrintTrace() * * Purpose: Debugging tool. Print a traceback tree. */ void PrintTrace(FILE *fp, struct trace_s *tr) { struct tracestack_s *stack; struct trace_s *currtr; stack = InitTracestack(); PushTracestack(stack, tr->nxtl); fprintf(fp, " address emitl emitr nodeidx type nxtl nxtr prv\n"); while ((currtr = PopTracestack(stack)) != NULL) { fprintf(fp, "(%p) %3d %3d %3d %3d %p %p %p %s\n", currtr, currtr->emitl, currtr->emitr, currtr->nodeidx, currtr->type, currtr->nxtl, currtr->nxtr, currtr->prv, UstatetypeName(currtr->type)); if (currtr->nxtr != NULL) PushTracestack(stack, currtr->nxtr); if (currtr->nxtl != NULL) PushTracestack(stack, currtr->nxtl); } FreeTracestack(stack); } /* Function: PrintAli() * * Purpose: Debugging tool. Print out an align_s structure */ void PrintAli(FILE *fp, struct align_s *ali) { struct align_s *curr; for (curr = ali; curr != NULL; curr = curr->nxt) fprintf(fp, "%4d %c %c %4d %s\n", curr->pos, curr->sym, curr->ss, curr->nodeidx, UstatetypeName(curr->type)); } /* Function: PrintICM() * * Purpose: Print an integer-version CM, as used by the alignment * algorithms. 
*/ void PrintICM(FILE *fp, struct cm_s *cm, struct istate_s *icm, int nstates) { int y; int x; for (y = 0; y < nstates; y++) { fprintf(fp, "node %d (%s) state %d (%s)\n", icm[y].nodeidx, NodetypeName(cm->nd[icm[y].nodeidx].type), y, UstatetypeName(icm[y].statetype)); fprintf(fp, " connectnum %d offset %d (connections start at %d)\n", icm[y].connectnum, icm[y].offset, y + icm[y].offset); fprintf(fp, " Transitions: "); for (x = 0; x < icm[y].connectnum; x++) fprintf(fp, "%d ", icm[y].tmx[x]); fputs("\n", fp); fprintf(fp, " Emissions: "); switch (icm[y].statetype) { case uMATP_ST: for (x = 0; x < ALPHASIZE * ALPHASIZE; x++) fprintf(fp, "%d ", icm[y].emit[x]); fputs("\n", fp); break; case uMATR_ST: case uMATL_ST: case uINSR_ST: case uINSL_ST: for (x = 0; x < ALPHASIZE; x++) fprintf(fp, "%d ", icm[y].emit[x]); fputs("\n", fp); break; default: fputs("NONE\n", fp); break; } } } tRNAscan-SE-2.0/src/sre_ctype.c0000644000543100007160000000122511021467305015561 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* sre_ctype.c * * For portability. Some systems have functions tolower, toupper * as macros (for instance, MIPS M-2000 RISC/os!) */ #include #include "squid.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif int sre_tolower(int c) { if (isupper(c)) return tolower(c); else return c; } int sre_toupper(int c) { if (islower(c)) return toupper(c); else return c; } tRNAscan-SE-2.0/src/shuffle_main.c0000644000543100007160000000553611021467311016232 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. 
* */ /* main for shuffle * * shuffle - generate shuffled sequences * Mon Feb 26 16:56:08 1996 */ #include #include #include #include "squid.h" struct opt_s OPTIONS[] = { { "-h", TRUE, ARG_NONE }, /* help */ { "-n", TRUE, ARG_INT }, /* number of shuffled seqs per input seq */ { "--seed", FALSE, ARG_INT }, /* set the random number seed */ }; #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) char usage[] = "Usage: shuffle [-options] \n\ Generate shuffled copies of input sequences.\n\ Available options:\n\ -h : help; print version and usage info\n\ -n : make shuffles per input seq (default 1)\n\ --seed : set random number seed to \n\ "; int main(int argc, char **argv) { char *seqfile; /* name of sequence file */ SQFILE *dbfp; /* open sequence file */ int fmt; /* format of seqfile */ char *seq; /* sequence */ SQINFO sqinfo; /* additional sequence info */ char *shuff; /* shuffled sequence */ int num; /* number to generate */ int seed; /* random number generator seed */ int i; char *optname; /* option name */ char *optarg; /* option argument (or NULL) */ int optind; /* index of next argv[] */ /*********************************************** * Parse command line ***********************************************/ num = 1; seed = (int) time ((time_t *) NULL); while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, &optind, &optname, &optarg)) { if (strcmp(optname, "-n") == 0) { num = atoi(optarg); } else if (strcmp(optname, "--seed") == 0) { seed = atoi(optarg); } else if (strcmp(optname, "-h") == 0) { printf("shuffle %s, %s\n%s\n", squid_version, squid_date, usage); exit(EXIT_SUCCESS); } } if (argc - optind != 1) Die("%s\n", usage); seqfile = argv[optind]; sre_srandom(seed); if (! 
SeqfileFormat(seqfile, &fmt, NULL)) Die("Failed to determine format of file %s", seqfile); if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) Die("Failed to open sequence file %s for reading", seqfile); while (ReadSeq(dbfp, fmt, &seq, &sqinfo)) { shuff = (char *) MallocOrDie ((sqinfo.len + 1) * sizeof(char)); for (i = 0; i < num; i++) { StrShuffle(shuff, seq); WriteSeq(stdout, kPearson, shuff, &sqinfo); } free(shuff); FreeSequence(seq, &sqinfo); } SeqfileClose(dbfp); return 0; } tRNAscan-SE-2.0/src/fasta2gsi.in0000644000543100007160000000550313100451015015625 0ustar pchanlowelab#! @PERL@ # Usage: fasta2gsi # Creates seqfile.gsi # # Create a .gsi sequence database index file. # # GSI allows multiple files per index, but fasta2gsi.pl # creates a GSI index for a single FASTA file. # # Part of the SQUID sequence analysis library. # Copyright (C) 1992-1996 Sean R. Eddy $seqfile = shift; $gsifile = $seqfile.".gsi"; $tmpfile = $seqfile.".tmpgsi"; # Library of Perl functions for creating GSI index files. # # GSI definition: # 1 + + total records. # Each record = 38 bytes. # # one header record : <"GSI" (32)> # file records : # key records : # # Part of the SQUID sequence analysis library. # Copyright (C) 1992-1996 Sean R. Eddy # The following numbers MUST match their counterparts in squid.h # $sqd_fmt_genbank = 2; $sqd_fmt_embl = 4; $sqd_fmt_fasta = 7; $sqd_fmt_pir = 12; # Function: GSI_WriteHeader(GSIFILE, $filenum, $keynum) # # Write the header of an open GSI file. # sub GSI_WriteHeader { local(*GSIFILE, $filenum, $keynum) = @_; local($header); $header = pack("a32 n N", "GSI", $filenum, $keynum); print GSIFILE $header; 1; } # Function: GSI_WriteFileRecord(GSIFILE, $filename, $idx, $fmt) # # Write a file record to an open GSI file. 
#
sub GSI_WriteFileRecord {
    # The handle is taken into a lexical rather than local(*GSIFILE):
    # callers pass a handle that is itself named GSIFILE, and localizing
    # that same glob before assigning to it is fragile.
    my ($fh, $filename, $idx, $fmt) = @_;

    # a32 n N = 32-byte name, 16-bit index, 32-bit format code
    # (38 bytes, big-endian).
    my $record = pack("a32 n N", $filename, $idx, $fmt);
    print {$fh} $record;
    1;
}

# Function: GSI_WriteKeyRecord(GSIFILE, $key, $filenum, $offset)
#
# Write a key record to an open GSI file.
# GSIFILE may be a typeglob (*FH) or a handle name.  Returns 1.
#
sub GSI_WriteKeyRecord {
    my ($fh, $key, $filenum, $offset) = @_;

    # Same 38-byte layout as a file record: key, file number, byte offset.
    my $record = pack("a32 n N", $key, $filenum, $offset);
    print {$fh} $record;
    1;
}

# First pass. Create an unsorted flat text file.
#
$curr_offset = 0;
$recnum = 0;
print "Calculating offsets for $seqfile...\n";
open(TMPFILE, '>', $tmpfile) || die "Couldn't open $tmpfile for writing: $!\n";
open(SEQFILE, '<', $seqfile) || die "Couldn't open $seqfile for reading: $!\n";
while (<SEQFILE>) {
    # $curr_offset still holds the position at which this line started.
    if (($key) = /^>\s*(\S+)/) {
        print TMPFILE "$key 1 $curr_offset\n";
        $recnum++;
    }
    $curr_offset = tell(SEQFILE);
}
close(SEQFILE);
close(TMPFILE) || die "Couldn't finish writing $tmpfile: $!\n";

# Sort the temporary file alphabetically on the key.
# List-form system(): the file name never passes through a shell.
print "Sorting the intermediate index file...\n";
system('sort', '-o', $tmpfile, $tmpfile) == 0
    || die "sort failed on $tmpfile\n";

# Second pass. Convert flat text file to binary GSI.
#
print "Writing the final binary GSI file...\n";
open(GSIFILE, '>', $gsifile) || die "Couldn't open $gsifile for writing: $!\n";
binmode(GSIFILE);
# NOTE(review): GSI_WriteHeader is defined earlier in this file and still
# receives the handle through local(*GSIFILE) = @_; confirm that it really
# writes to the handle opened above.
&GSI_WriteHeader(GSIFILE, 1, $recnum);
GSI_WriteFileRecord(*GSIFILE, $seqfile, 1, $sqd_fmt_fasta);

open(TMPFILE, '<', $tmpfile) || die "Couldn't open $tmpfile for reading: $!\n";
while (<TMPFILE>) {
    ($key, $filenum, $offset) = split;
    GSI_WriteKeyRecord(*GSIFILE, $key, $filenum, $offset);
}
close(TMPFILE);
close(GSIFILE) || die "Couldn't finish writing $gsifile: $!\n";
unlink $tmpfile;

print "Complete.\n";
print "$gsifile indexes $recnum sequence names.\n";
tRNAscan-SE-2.0/src/sqfuncs.h0000644000543100007160000001633211021467306015261 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis
 * Copyright (C) 1992-1996 Sean R. Eddy
 *
 * This source code is distributed under terms of the
 * GNU General Public License. See the files COPYING
 * and GNULICENSE for further details.
 *
 */

#ifndef SQFUNCSH_INCLUDED
#define SQFUNCSH_INCLUDED
/* sqfuncs.h
 * SRE, Mon Jul 12 12:20:00 1993
 * 
 * Prototypes for squid library functions;
 * also makes a good reference list for what the package contains.
 * Slowly being added to as I ANSI-fy squid.
*/ /* * from aligneval.c */ extern float ComparePairAlignments(char *known1, char *known2, char *calc1, char *calc2); extern float CompareRefPairAlignments(int *ref, char *known1, char *known2, char *calc1, char *calc2); extern float CompareMultAlignments(char **kseqs, char **tseqs, int N); extern float CompareRefMultAlignments(int *ref, char **kseqs, char **tseqs, int N); extern void PairwiseIdentity(char *s1, char *s2, int allow_ragged, float *ret_id1, float *ret_id2, float *ret_idtot); /* * from alignio.c */ extern void FreeAlignment(char **aseqs, int nseq, struct aliinfo_s *ainfo); extern int MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s); extern int MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s); extern int WritePairwiseAlignment(FILE *ofp, char *aseq1, char *name1, int spos1, char *aseq2, char *name2, int spos2, int **pam, int indent); extern int MingapAlignment(char **aseqs, int num, struct aliinfo_s *ainfo); extern int RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex, char ***ret_aseqs, AINFO *ainfo); /* from cluster.c */ extern int Cluster(float **mx, int N, enum clust_strategy mode, struct phylo_s **ret_tree); extern struct phylo_s *AllocPhylo(int N); extern void FreePhylo(struct phylo_s *tree, int N); extern int MakeDiffMx(char **aseqs, int num, int alen, int allow_ragged, float ***ret_dmx); extern int MakeIdentityMx(char **aseqs, int num, int alen, int allow_ragged, float ***ret_dix); /* * from dayhoff.c */ extern int ParsePAMFile(FILE *fp, int ***ret_pam, float *ret_scale); extern void ScalePAM(int **pam, int scale); /* from getopt.c */ extern int Getopt(int argc, char **argv, struct opt_s *opt, int nopts, char *usage, int *ret_optind, char **ret_optname, char **ret_optarg); /* from interleaved.c */ extern int ReadInterleaved(char *seqfile, int (*parse_header)(FILE *, AINFO *, int *), int (*is_dataline)(char *, char *), char ***ret_aseqs, int *ret_num, AINFO *ainfo); extern int 
ReadAlignment(char *seqfile, int format, char ***ret_aseqs, int *ret_num, struct aliinfo_s *ainfo); /* * from msf.c */ extern int WriteMSF(FILE *fp, char **aseqs, int num, struct aliinfo_s *ainfo); extern void FlushAlignment(char **aseqs, int num, int *ret_alen); /* from revcomp.c */ extern char *revcomp(char *comp, char *seq); /* * from selex.c */ extern int ReadSELEX(char *seqfile, char ***ret_aseqs, int *ret_num, struct aliinfo_s *ret_aliinfo); extern int WriteSELEX(FILE *fp, char **aseqs, int num, struct aliinfo_s *ainfo, int cpl); extern int DealignAseqs(char **aseqs, int num, char ***ret_rseqs); extern int IsSELEXFormat(char *filename); extern int TruncateNames(char **names, int N); /* OBSOLETE? */ extern FILE *EnvFileOpen(char *fname, char *env); /* * from seqencode.c */ extern int seqcmp(char *s1, char *s2, int allow); extern int seqncmp(char *s1, char *s2, int n, int allow); extern int seqencode(char *codeseq,char *str); extern int coded_revcomp(char *comp, char *seq); extern int seqdecode(char *str, char *codeseq); extern int seqndecode(char *str, char *codeseq, int n); /* * from sqerror.c */ extern int Die(char *format, ...); extern int Warn(char *format, ...); /* * from sqio.c */ extern void FreeSequence(char *seq, SQINFO *sqinfo); extern int SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag); extern void SeqinfoCopy(SQINFO *sq1, SQINFO *sq2); extern void ToDNA(char *seq); extern void ToRNA(char *seq); extern int ReadMultipleRseqs(char *seqfile, int fformat, char ***ret_rseqs, SQINFO **ret_sqinfo, int *ret_num); extern SQFILE *SeqfileOpen(char *filename, int format, char *env); extern void SeqfilePosition(SQFILE *sqfp, long offset); extern void SeqfileClose(SQFILE *sqfp); extern int ReadSeq(SQFILE *fp, int format, char **ret_seq, SQINFO *sqinfo); extern int GCGBinaryToSequence(char *seq, int len); extern int GCGchecksum(char *seq, int seqlen); extern int GCGMultchecksum(char **seqs, int nseq); extern int SeqfileFormat(char *filename, int 
*ret_format, char *env); extern int WriteSeq(FILE *outf, int outfmt, char *seq, SQINFO *sqinfo); extern int Seqtype(char *seq); extern char *SeqFormatString(int code); extern GSIFILE *GSIOpen(char *gsifile); extern int GSIGetOffset(GSIFILE *gsi, char *key, char *sqfile, long *ret_offset); extern void GSIClose(GSIFILE *gsi); /* from sre_ctype.c */ extern int sre_tolower(int c); extern int sre_toupper(int c); /* from sre_math.c */ extern float Gaussrandom(float mean, float stddev); extern int Linefit(float *x, float *y, int N, float *ret_a, float *ret_b, float *ret_r); extern float Gammln(float xx); extern int DNorm(double *vec, int n); extern int FNorm(float *vec, int n); extern void DScale(double *vec, int n, double scale); extern void FScale(float *vec, int n, float scale); extern void DSet(double *vec, int n, double value); extern void FSet(float *vec, int n, float value); extern double DSum(double *vec, int n); extern float FSum(float *vec, int n); extern float sre_random(void); extern void sre_srandom(int seed); extern int DChoose(double *p, int n); extern int FChoose(float *p, int n); extern double DLogSum(double *logp, int n); extern float FLogSum(float *logp, int n); /* from sre_string.c */ #ifdef NOSTR extern char *strstr(char *s, char *subs); #endif extern char *Strdup(char *s); extern int Strinsert(char *s1, char c, int pos); extern int Strdelete(char *s1, int pos); extern void s2lower(char *s); extern void s2upper(char *s); extern void *MallocOrDie(size_t size); extern void *ReallocOrDie(void *p, size_t size); extern int Strparse(char *rexp, char *s, char **buf, int ntok, ...); extern void StrShuffle(char *s1, char *s2); /* from stack.c */ extern struct intstack_s *InitIntStack(void); extern void PushIntStack(struct intstack_s *stack, int data); extern int PopIntStack(struct intstack_s *stack, int *ret_data); extern void ReverseIntStack(struct intstack_s *stack); extern int FreeIntStack( struct intstack_s *stack ); /* * from translate.c */ extern char 
*Translate(char *seq, char **code); /* * from types.c */ extern int IsInt(char *s); extern int IsReal(char *s); /* * from weight.c */ extern int SonnhammerWeights(char **aseq, int nseq, int alen, float **ret_weights); extern int VoronoiWeights(char **aseq, int nseq, int alen, float **ret_weights); extern void FilterAlignment(char **aseq, int nseq, AINFO *ainfo, float cutoff, char ***ret_anew, int *ret_nnew, AINFO **ret_newinfo); extern void SampleAlignment(char **aseq, int nseq, AINFO *ainfo, int sample, char ***ret_anew, int *ret_nnew, AINFO **ret_newinfo); #endif /* SQFUNCSH_INCLUDED */ tRNAscan-SE-2.0/src/emit_main.c0000644000543100007160000001111311021467304015522 0ustar pchanlowelab/* emit_main.c * main() for emitting sequences from a stored model * written as a debugging aid * * 1.0: SRE, Tue Jun 15 09:32:43 1993 * 2.0: SRE, Fri Sep 10 08:06:37 1993 */ #include #include #include #include #ifdef NEED_GETOPTH #include #endif #include "structs.h" #include "funcs.h" #include "version.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif #define OPTIONS "abls:L" static char usage[] = "\ Usage: covee [-options] \n\ where options are:\n\ -a : annotate all pairs, not just canonical ones\n\ -b : emit single most probable sequence\n\ -l : print as mountain landscape\n\ -s : set seed for random()\n\ EXPERIMENTAL OPTIONS:\n\ -L : calculate expected length distributions for states\n"; static char banner[] = "covee: emit sequences from a covariance model"; int main(int argc, char **argv) { char *cmfile; /* file to read model from */ struct cm_s *cm; /* model */ int i; /* counter for sequences */ char *seq; /* generated sequence */ char *khseq; /* generated structure */ struct align_s *ali; /* generated "alignment" */ int emitnum; /* number of sequences to emit */ int seed; /* seed for random number generator */ int do_best; /* TRUE if generating only best seq */ int do_landscape; /* TRUE if printing as landscape */ int watsoncrick; /* TRUE to annotate only canonical pairs */ int 
do_lengths; /* TRUE to do length distributions */ int optc; extern char *optarg; /* for getopt() */ extern int optind; /* for getopt() */ /*********************************************** * Parse command line ***********************************************/ emitnum = 20; seed = (int) time (0); do_best = FALSE; do_landscape = FALSE; watsoncrick = TRUE; do_lengths = FALSE; while ((optc = getopt(argc, argv, OPTIONS)) != -1) switch (optc) { case 'a': watsoncrick = FALSE; break; case 'b': do_best = TRUE; break; case 'l': do_landscape = TRUE; break; case 's': seed = atoi(optarg); break; case 'L': do_lengths = TRUE; break; default: Die("Error: unrecognized option %c\n", optc); } if (argc - optind != 1) Die("Wrong number of arguments.\n%s", usage); cmfile = argv[optind]; sre_srandom((unsigned) seed); /*********************************************** * Read in the model ***********************************************/ if (! ReadCM(cmfile, &cm)) Die("Failed to read model from file %s\n", cmfile); /*********************************************** * Generate sequences from model and print them. ***********************************************/ puts(banner); printf(" version %s, %s\n\n", RELEASE, RELEASEDATE); if (do_lengths) { struct pstate_s *pcm; int statenum; double **lmx; int *min, *max; int y; MakePCM(cm, &pcm, &statenum); NormalizePCM(pcm, statenum); LengthDistribution(pcm, statenum, 200, &lmx); LengthBounds(lmx, statenum, 200, 1.0e-6, &min, &max); for (y = 0; y < statenum; y++) printf("%4d %4d %4d (%4d) %8.8f %s\n", y, min[y], max[y], max[y]-min[y]+1, (float) (max[y]-min[y]+1) / 200.0, UstatetypeName(pcm[y].statetype)); Free2DArray(lmx, statenum); free(min); free(max); free(pcm); } else if (do_best) { if (! EmitBestSequence(cm, watsoncrick, &ali, &khseq, &seq)) Die("EmitBestSequence() failed"); if (do_landscape) { if (! 
PrintAliLandscape(stdout, cm, ali)) Warn("PrintAliLandscape failed\n"); } else { printf("%s\n", seq); printf("%s\n", khseq); puts(""); } free(ali); free(khseq); free(seq); } else { for (i = 0; i < emitnum; i++) { if (! EmitSequence(cm, watsoncrick, &ali, &khseq, &seq)) Die("failed to generate a sequence from the model."); if (do_landscape) PrintAliLandscape(stdout, cm, ali); else { printf("seq %2d: %s\n", i, seq); printf(" %s\n", khseq); puts(""); } free(ali); free(khseq); free(seq); } } /*********************************************** * Cleanup and exit ***********************************************/ FreeCM(cm); return 0; } tRNAscan-SE-2.0/src/maspar.h0000644000543100007160000000051611021467306015057 0ustar pchanlowelab/* Header file specifically for MasPar parallelized version */ /* These define's affect mpcovels. */ #define NYPROC 64 /* how many rows of processors (nyproc) */ #define BLOCKSIZE 128 /* size of sequence blocks sent to DPU */ #define VPENUM 3 /* # of virtual PE's per PE. 
N = NYPROC * VPENUM -1 */ tRNAscan-SE-2.0/src/mpcovels_main.c0000644000543100007160000002741211021467305016426 0ustar pchanlowelab/* mpcovels_main.c * Sun Aug 21 12:25:52 1994 * * main() for MasPar version of covels * */ #include #include #include #include #include #ifdef NEED_GETOPTH #include #endif #include "structs.h" #include "funcs.h" #include "squid.h" #include "version.h" #include "maspar.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif #define OPTIONS "cg:ho:t:A:D:" static char usage[] = "\ Usage: mpcovels [-options] \n\ where options are:\n\ -c : do complementary strand too\n\ -g : set background expected GC content (0.5 default)\n\ -h : print short help and version info\n\ -o : save hits in \n\ -t : set score reporting threshold\n\ CRASH PROTECTION OPTIONS:\n\ -A : maintain file of names of active seqs\n\ -D : save names of finished seqs here\n"; static char banner[] = "mpcovels - scan sequences for matches to an RNA covariance model"; extern int MPViterbiScan(struct istate_s *fe_icm, int *fe_statenum, int *fe_threshold, char *fe_buffer); /* Because we do some back-and-forth communication between the * DPU and the front end, I need to make the following variables * available externally to the functions that the DPU calls back to. 
*/ static SQFILE *fp; /* open sequence file */ static int format; /* format of sequence file */ static char **seq; /* array of NYPROC sequences */ static SQINFO *sqinfo; /* array of NYPROC sequence info structures */ static char **sptr; /* ptrs into NYPROC active sequences */ static char *buffer; /* NYPROC blocks of processed seq to go to PE's */ static int moreseqs; /* TRUE if there's still more seqs in the file */ static int morebases; /* TRUE if there's still more bases in **seq */ static int ithresh; /* scaled integer score threshold */ static FILE *ofp; /* output file for scores */ static int do_revcomp; /* TRUE to do reverse complements too */ static int inrev[NYPROC]; /* TRUE if we're doing a revcomp in this row now */ static char *activefile; /* name of active seq file to save to, or NULL */ static FILE *donefp; /* open finished seq name file, or NULL */ int main(int argc, char **argv) { char *seqfile; /* sequence file */ char *cmfile; /* file containing covariance model */ struct cm_s *cm; /* model */ struct istate_s *icm; /* integer log-odds model */ int statenum; /* number of states in icm */ int y; /* counter for rows */ double rfreq[ALPHASIZE]; /* expected background symbol frequencies */ char *outfile; /* save file for scores */ char *donefile; /* save file for finished seq names */ float thresh; /* threshold score for reporting a match */ double gcfrac; /* fraction GC expected background */ int optc; extern char *optarg; /* for getopt() */ extern int optind; /* for getopt() */ #ifdef MEMDEBUG unsigned long histid1, histid2, orig_size, curr_size; #endif /*********************************************** * Parse command line ***********************************************/ thresh = 0.0; do_revcomp = FALSE; outfile = NULL; activefile = NULL; donefile = NULL; gcfrac = 0.5; while ((optc = getopt(argc, argv, OPTIONS)) != -1) switch (optc) { case 'c': do_revcomp = TRUE; break; case 'g': gcfrac = (double) atof(optarg);break; case 'o': outfile = optarg; break; 
case 't': thresh = (float) atof(optarg); break; case 'A': activefile = optarg; break; case 'D': donefile = optarg; break; case 'h': printf("%s\n version %s (%s)\n%s\n", banner, RELEASE, RELEASEDATE, usage); exit(0); default: Die("unrecognized option %c\n", optc); } if (argc - optind != 2) Die("%s\n", usage); cmfile = argv[optind]; optind++; seqfile = argv[optind]; /* The random model probabilities */ rfreq[1] = rfreq[2] = gcfrac / 2.0; rfreq[0] = rfreq[3] = (1.0 - gcfrac) / 2.0; ofp = stdout; if (outfile != NULL && (ofp = fopen(outfile, "w")) == NULL) Die("Failed to open output file %s", outfile); donefp = NULL; if (donefile != NULL && (donefp = fopen(donefile, "w")) == NULL) Die("Failed to open finished sequence names file %s", donefile); /*********************************************** * Print banner ***********************************************/ puts(banner); printf(" version %s, %s\n\n", RELEASE, RELEASEDATE); printf("---------------------------------------------------\n"); printf("Database to search/score: %s\n", seqfile); printf("Model: %s\n", cmfile); printf("Reporting threshold: %.2f\n", thresh); if (outfile != NULL) printf("Scores saved to file: %s\n", outfile); printf("Reverse complement too? %s\n", do_revcomp ? "yes" : "no"); printf("---------------------------------------------------\n"); puts(""); /*********************************************** * Get the model, open the sequence database ***********************************************/ if (! ReadCM(cmfile, &cm)) Die("Failed to read model from file %s", cmfile); if (! RearrangeCM(cm, rfreq, &icm, &statenum)) Die("Failed to convert CM to integer log-odds form\n"); if (! 
SeqfileFormat(seqfile, &format, NULL)) switch (squid_errno) { case SQERR_NOFILE: Die("Sequence file %s could not be opened for reading", seqfile); case SQERR_FORMAT: default: Die("Failed to determine format of sequence file %s", seqfile); } if ((fp = SeqfileOpen(seqfile, format, NULL)) == NULL) Die("Failed to open sequence file %s for reading", seqfile); ithresh = (int) (thresh * INTPRECISION); /* Load the first set of sequences */ if ((seq = (char **) malloc (sizeof(char *) * NYPROC)) == NULL || (sqinfo= (SQINFO *) malloc (sizeof(SQINFO) * NYPROC)) == NULL || (sptr = (char **) malloc (sizeof(char *) * NYPROC)) == NULL || (buffer= (char *) malloc (sizeof(char) * (NYPROC * BLOCKSIZE))) == NULL) Die("malloc failed"); moreseqs = TRUE; morebases = TRUE; for (y = 0; y < NYPROC; y++) { if (! moreseqs || ! ReadSeq(fp, format, &seq[y], &(sqinfo[y]))) { moreseqs = FALSE; seq[y] = NULL; } if (seq[y] != NULL) s2upper(seq[y]); sptr[y] = seq[y]; inrev[y] = FALSE; } /* Call the parallel scanning function. * It will load the model up, then immediately call us * back and ask us to prepare the first block of sequence data. * We tell it where the sequence data is by passing the address of buffer. */ callRequest(MPViterbiScan, 16, icm, &statenum, &ithresh, buffer); /* Cleanup and exit */ free(seq); free(sqinfo); free(sptr); free(buffer); SeqfileClose(fp); fclose(ofp); if (donefp != NULL) fclose(donefp); return 0; } /* Function: NextSequenceBlock() * * Purpose: Prepare a sequence buffer for the DPU. * The buffer consists of NYPROC blocks of size BLOCKSIZE. * The values are 0,1,2,3 for A,C,G,U, or 4 for no seq. * If we return 1, the DPU blockIn()'s this buffer. * If we return 0, the DPU concludes that the search is complete. * * Note: * There's some intricacy below, so be careful what you muck with. * These next two functions are written carefully to make sure that * sqinfo[y] is still valid for the hits that are reported to * DPUReportsHit(). 
This has the following consequences: * 1) NextSequenceBlock() must make sure that there's only one * sequence in a BLOCKSIZE. It must wait for the next block * to start sending a new sequence. * 2) Moreover, it's got to make sure that the terminating '\0' * was part of the block -- so the DPU can clean up and report * the final hit on that sequence *before* we free it and its * info. Hence, the delayed FreeSequence() call here. * * This results in some inefficiency, since we can't start new sequences * until the next block. */ int NextSequenceBlock(void) { int changed; /* TRUE if active sequence list changed */ int y, pos; char *rev; /* Load new sequences if we have to. */ morebases = FALSE; changed = FALSE; for (y = 0; y < NYPROC; y++) { /* where we have a seq, but now we're * done with it... * free the old seq, load a new one */ if (seq[y] != NULL && sptr[y] == NULL) { if (do_revcomp && !inrev[y]) { if ((rev = (char *) malloc (sizeof(char) * (sqinfo[y].len + 1))) == NULL) Die("malloc failed"); if (revcomp(rev, seq[y]) == NULL) Die("revcomp failed"); free(seq[y]); seq[y] = sptr[y] = rev; inrev[y] = TRUE; changed = TRUE; } else { if (donefp != NULL) /* save name to finished seq file */ { fprintf(donefp, "%s\n", sqinfo[y].name); fflush(donefp); } FreeSequence(seq[y], &sqinfo[y]); if (! moreseqs || ! ReadSeq(fp, format, &seq[y], &sqinfo[y])) { moreseqs = FALSE; seq[y] = NULL; } if (seq[y] != NULL) s2upper(seq[y]); sptr[y] = seq[y]; inrev[y] = FALSE; changed = TRUE; } } if (seq[y] != NULL) morebases = TRUE; } if (morebases == FALSE) return 0; /* Construct a new block of sequence data for the DPU to blockIn(). 
*/ for (y = 0; y < NYPROC; y++) { for (pos = 0; pos < BLOCKSIZE; pos++) { if (seq[y] == NULL || sptr[y] == NULL) buffer[y * BLOCKSIZE + pos] = 4; else if (*(sptr[y]) == '\0') { buffer[y * BLOCKSIZE + pos] = 4; sptr[y] = NULL; /* signal that we're done w/ this seq */ } else { buffer[y * BLOCKSIZE + pos] = SymbolIndex(*(sptr[y])); sptr[y] ++; } } } /* crash protection: maintain list of active names */ if (changed == TRUE && activefile != NULL) { FILE *actfp; if ((actfp = fopen(activefile, "w")) == NULL) Die("failed to open active sequence name list file %s", activefile); for (y = 0; y < NYPROC; y++) fprintf(actfp, "%s\n", sqinfo[y].name); fclose(actfp); } return 1; } /* Function: DPUReportsHit() * * Purpose: The DPU is reporting one or more hits (up to NYPROC). * The easiest thing to do is to blockIn() from *all* * NYPROC rows, and find what rows actually have real * hits here. */ int DPUReportsHit(int *dpu_seeme, int *dpu_i, int *dpu_j, int *dpu_score) { int seeme[NYPROC]; /* TRUE if we're reporting in this row */ int start[NYPROC]; int end[NYPROC]; int score[NYPROC]; int y; blockIn(dpu_seeme, seeme, 0, 0, 1, NYPROC, sizeof(int)); blockIn(dpu_i, start, 0, 0, 1, NYPROC, sizeof(int)); blockIn(dpu_j, end, 0, 0, 1, NYPROC, sizeof(int)); blockIn(dpu_score, score, 0, 0, 1, NYPROC, sizeof(int)); for (y = 0; y < NYPROC; y++) if (seeme[y]) { if (inrev[y]) fprintf(ofp, "%6.2f %5d %5d : %s %s\n", (float) score[y] / INTPRECISION, sqinfo[y].len - start[y], sqinfo[y].len - end[y], sqinfo[y].name, (sqinfo[y].flags & SQINFO_DESC) ? sqinfo[y].desc : ""); else fprintf(ofp, "%6.2f %5d %5d : %s %s\n", (float) score[y] / INTPRECISION, start[y] + 1, end[y] + 1, sqinfo[y].name, (sqinfo[y].flags & SQINFO_DESC) ? 
sqinfo[y].desc : ""); fflush(ofp); } return 1; } tRNAscan-SE-2.0/src/structs.h0000644000543100007160000002103211021467306015277 0ustar pchanlowelab#ifndef STRUCTSH_INCLUDED #define STRUCTSH_INCLUDED /* structs.h - declarations of data structures * SRE, Tue Aug 31 15:17:12 1993 */ #include "squid.h" /* Alphabet information. * The package is designed to be configurable for protein analysis * just by changing these define's. Dunno if it would be *useful* * to apply it to protein work -- but the possibility's there. */ #define ALPHATYPE kRNA #define ALPHASIZE 4 extern char *ALPHABET; /* defined at top of misc.c */ /* * Node types. * These are used for clarity in the code, not to make it easy * to change them: the program makes assumptions about the order * they come in, so DON'T CHANGE THESE. * * Specifically, the following assumptions are made: * - that they come in *exactly* this order (static state transition array in prior.h) * - that BIFURC, MATP, MATL, MATR are all less than 4 * (second index, prior.h; also some loops in maxmodelmaker.c) * - ROOT is last (indexing of scores of mmx model construction matrix, maxmodelmaker.c) */ #define BIFURC_NODE 0 #define MATP_NODE 1 #define MATL_NODE 2 #define MATR_NODE 3 #define BEGINL_NODE 4 #define BEGINR_NODE 5 #define ROOT_NODE 6 #define NODETYPES 7 /* number of different node types */ #define END_NODE BIFURC_NODE /* * State types. * These are used for clarity in the code, not to make it easy * to change them: the program makes assumptions about the order * they come in, so DON'T CHANGE THESE. */ #define DEL_ST 0 #define MATP_ST 1 #define MATL_ST 2 #define MATR_ST 3 #define INSL_ST 4 #define INSR_ST 5 #define STATETYPES 6 /* MATP nodes contain 6 states */ #define BEGIN_ST DEL_ST #define BIFURC_ST DEL_ST #define END_ST DEL_ST /* Unique identifiers for state types, used as flags not indexes * in the alignment algorithms. 
*/ #define uDEL_ST (1<<0) #define uMATP_ST (1<<1) #define uMATL_ST (1<<2) #define uMATR_ST (1<<3) #define uINSL_ST (1<<4) #define uINSR_ST (1<<5) #define uBEGIN_ST (1<<6) #define uEND_ST (1<<7) #define uBIFURC_ST (1<<8) /* Structure: node_s * * Purpose: Contains all the information necessary to describe a node. */ struct node_s { int type; double tmx[STATETYPES][STATETYPES]; /* up to 49 transition probs */ double mp_emit[ALPHASIZE][ALPHASIZE]; /* 4x4 MATP emission probs */ double il_emit[ALPHASIZE]; /* 4 INSL emission probs */ double ir_emit[ALPHASIZE]; /* 4 INSR emission probs */ double ml_emit[ALPHASIZE]; /* 4 MATL emission probs */ double mr_emit[ALPHASIZE]; /* 4 MATR emission probs */ int nxt; /* connection to left child */ int nxt2; /* connection to right child */ }; /* Structure: cm_s * * Purpose: A covariance model. */ struct cm_s { int nodes; /* number of nodes */ struct node_s *nd; /* array of nodes 0..nodes-1 */ }; /* Structure: istate_s * * In the alignment algorithms, a CM is converted to an array of states, * each represented by one of these structures. Each state contains * probability info as integers instead of floating point. * * The order of the state transition vector is different than in * the CM. INSL and INSR are first: INSL, INSR, DEL, MATP, MATL, MATR. */ struct istate_s { int nodeidx; /* index of node this state belongs to */ int statetype; /* unique id for type of this state (uMATP_ST, etc.) */ int offset; /* offset in state array to first INS state */ int connectnum; /* number of elements in tmx */ int tmx[STATETYPES]; /* rearranged transition vector, int log-odds */ int emit[ALPHASIZE*ALPHASIZE]; /* int lod emission vector (4 or 16) or NULL */ }; struct pstate_s { int nodeidx; /* index of node this state belongs to */ int statetype; /* unique id for type of this state (uMATP_ST, etc.) 
*/ int offset; /* offset in state array to first INS state */ int connectnum; /* number of elements in tmx */ int bifr; /* (uBIF_ST only) index of right connection */ double tmx[STATETYPES]; /* rearranged transition vector */ double emit[ALPHASIZE*ALPHASIZE]; /* emission vector (4 or 16) or NULL */ }; /* Structure: prior_s * * Purpose: Contains the prior probability distributions for * state transitions and symbol emissions, as well * as the alpha "confidence" values applied during * regularization, and alphabet information. */ struct prior_s { double tprior[7][4][STATETYPES][STATETYPES]; /* state transitions */ double matp_prior[ALPHASIZE][ALPHASIZE]; /* MATP_ST emissions */ double matl_prior[ALPHASIZE]; /* MATL_ST emissions */ double matr_prior[ALPHASIZE]; /* MATR_ST emissions */ double insl_prior[ALPHASIZE]; /* INSL_ST emissions */ double insr_prior[ALPHASIZE]; /* INSR_ST emissions */ double talpha[STATETYPES]; /* alpha's for state transitions */ double emalpha[STATETYPES]; /* alpha's for symbol emissions */ double rfreq[ALPHASIZE]; /* background symbol freqs for random model */ }; /* Structure: trace_s * * Binary tree structure for storing a traceback of an alignment; * also used for tracebacks of model constructions. */ struct trace_s { int emitl; /* i position (1..N) or 0 if nothing */ int emitr; /* j position (1..N) or 0 if nothing */ int nodeidx; /* index of node responsible for this alignment */ int type; /* type of substate (uMATP_ST, etc.) used (unique) */ struct trace_s *nxtl; /* ptr to left (or only) branch, or NULL for end */ struct trace_s *nxtr; /* ptr to right branch, BIFURC only, else NULL */ struct trace_s *prv; /* ptr to parent */ }; /* Structure: trmem_s * * It's expensive in malloc()'s to build trace trees. This structure * allows trace.c to cut down malloc overhead, by keeping a pool * of trace_s structures. 
*/ struct trmem_s { int next; /* index of next trace_s to use in pool */ int num; /* how many trace_s total in pool */ struct trace_s *pool; /* alloced array of trace_s structs */ struct tracestack_s *used; /* old (fully used) pools, waiting to be freed */ }; #define TMEM_BLOCK 256 /* how many trace_s to alloc per malloc() call */ /* Structure: tracestack_s * * Formerly a pushdown stack used for traversing a binary tree of trace_s structures. * Reimplemented as an array for malloc efficiency. */ struct tracestack_s { int next; /* index of next trace_s pointer to use */ int num; /* number of trace_s pointers alloc'ed */ struct trace_s **list; /* array of trace_s pointers */ }; #define TSTACK_BLOCK 64 /* A struct align_s implements a linked list describing the alignment * of a model to a sequence. Note that this is the inverse of what * trace_s trees are for; align_s is a linear representation of * the alignment (from the sequence's point of view, if you will) */ struct align_s { int pos; /* pos in seq emitted (0..N-1; -1 if none) */ char sym; /* symbol emitted (ACGU, . if none) */ char ss; /* secondary structure character, <>. */ int nodeidx; /* index of model state aligned to this position */ int type; /* type of substate reponsible for this emission (unique) */ struct align_s *nxt; }; /* A struct m2ali_s implements a pushdown stack used for traversing * a model and producing an align_s alignment list. */ struct m2ali_s { int nodeidx; /* index of position in model (0..M) */ int type; /* subtype of position in model */ struct align_s *after; /* position in align_s list */ struct m2ali_s *nxt; }; /* A struct t2ali_s implements a pushdown stack for traversing a * traceback tree and producing an align_s alignment list. 
*/ struct t2ali_s { struct trace_s *tracenode; struct align_s *after; struct t2ali_s *nxt; }; /* some stuff used when we store sums of log scores as integers, * for speed and precision */ #define INTPRECISION 1000.0 /* pick up three decimal places in our ints */ #define NEGINFINITY -999999 /* -999.999 is small enough for -Inf */ #define POSINFINITY 999999 /* +999.999 is large enough for +Inf */ #define ILOG2(a) (((a) > 0.0) ? (log(a) / 0.69314718 * INTPRECISION) : NEGINFINITY) #endif /* STRUCTSH_INCLUDED */ tRNAscan-SE-2.0/src/konings.c0000644000543100007160000002124011021467304015232 0ustar pchanlowelab/* konings.c * 1.0: SRE, Wed Jul 7 08:53:37 1993 * adapted for 2.0: SRE, Thu Sep 9 13:38:13 1993 * * Representation of secondary structure and secondary structural * alignments using Danielle Konings' string notation, and Hogeweg's * mountain notation. * * See: Konings and Hogeweg, J. Mol. Biol. 207:597-614 1989 */ #include #include #include #include #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif /* Function: Align2kh() * * Purpose: Convert an alignment (align_s linked list) into * a secondary structure string. The symbols used * are > and < for the two sides of a MATP; '.' * for other symbols. * * Also may return the "aligned" sequence; this * sequence is only aligned in the sense that deleted * consensus positions are represented as '.' * and insert positions are lower case. * * Either ret_aseq or ret_khseq may be passed as NULL * if you don't want one of them. */ int Align2kh(struct align_s *ali, char **ret_aseq, char **ret_khseq) { struct align_s *curr; /* ptr into alignment list */ char *aseq; /* RETURN: aligned seq */ char *khseq; /* RETURN: structure string */ int len; /* length of alignment, khseq */ int pos; /* position in khseq */ /* Count the length of the list and malloc for khseq. 
*/ len = 0; for (curr = ali->nxt; curr->nxt != NULL; curr = curr->nxt) len++; if ((khseq = (char *) malloc ((len+1) * sizeof(char))) == NULL || (aseq = (char *) malloc ((len+1) * sizeof(char))) == NULL) Die("Memory allocation failed, line %d of %s", __LINE__, __FILE__); /* This used to be harder. Now align_s already has a field for * secondary structure annotation, and we just have to copy it. */ pos = 0; for (curr = ali->nxt; curr->nxt != NULL; curr = curr->nxt) { switch (curr->type) { case uBEGIN_ST: case uBIFURC_ST: break; /* neither should appear in an align_s! */ case uDEL_ST: khseq[pos] = ' '; aseq[pos] = '-'; break; case uINSL_ST: case uINSR_ST: khseq[pos] = curr->ss; aseq[pos] = sre_tolower(curr->sym); break; case uMATL_ST: case uMATR_ST: case uMATP_ST: khseq[pos] = curr->ss; aseq[pos] = sre_toupper(curr->sym); break; default: Die("unrecognized state type %d in Align2kh()", curr->type); } pos++; } khseq[pos] = '\0'; aseq[pos] = '\0'; if (ret_khseq == NULL) free(khseq); else *ret_khseq = khseq; if (ret_aseq == NULL) free(aseq); else *ret_aseq = aseq; return 1; } /* Function: PrintAliLandscape() * * Purpose: Print an alignment of sequence to model in the form * of a Konings/Hogeweg "mountain". 
*/
int
PrintAliLandscape(FILE *fp, struct cm_s *cm, struct align_s *ali)
{
  int             altitude;	/* current height on the mountain   */
  struct align_s *curr;		/* ptr to current ali element       */
  struct align_s *prev;		/* ptr to previous ali element      */
  int             i;

  /* Note: cm is not referenced in this function; it is kept in the
   * signature so existing callers are unaffected.
   */
  altitude = 0;
  prev     = NULL;
  for (curr = ali->nxt; curr->nxt != NULL; curr = curr->nxt)
    {
      /* Left margin: 1-based sequence position (if any) and the symbol. */
      if (curr->pos >= 0)
	fprintf(fp, "%4d %c ", curr->pos+1, curr->sym);
      else
	fprintf(fp, " %c ", curr->sym);

      /* Indent by the current altitude. */
      for (i = 0; i < altitude; i++)
	fputc(' ', fp);

      switch (curr->type) {
      case uBEGIN_ST:
      case uBIFURC_ST: break;
      case uDEL_ST:    fputs(" DEL ", fp);   break;
      case uINSL_ST:   fputs("` INSL", fp);  break;
      case uINSR_ST:   fputs("\' INSR", fp); break;
      case uMATL_ST:   fputs("\\ MATL", fp); break;
      case uMATR_ST:   fputs("/ MATR", fp);  break;

      case uMATP_ST:
	/* A MATP whose node index is higher than the previous element's is
	 * treated as the first side of a pair (climb); otherwise as the
	 * second side (descend).
	 */
	if (prev == NULL || curr->nodeidx > prev->nodeidx)
	  {
	    fputs(" v MATP", fp);
	    altitude++;
	  }
	else
	  {
	    fputs("^ MATP", fp);
	    altitude--;
	  }
	break;

      default:
	Die("unrecognized state type %d in PrintAliLandscape()", curr->type);
      }

      /* Finish the row on the same stream as the rest of the output.
       * This used to be a bare printf(), which sent the node index and the
       * newline to stdout even when fp was some other file.
       */
      fprintf(fp, " %d\n", curr->nodeidx);
      prev = curr;
    }
  return 1;
}


/* Function: Trace2KHS()
 *
 * Purpose:  From a traceback tree of seq, produce a
 *           secondary structure string. ">" and "<" are
 *           used for pairwise emissions; "." for single-stranded stuff.
 *           Note that structure is defined by pairwise emissions,
 *           not by Watson-Crick-isms and stacking rules.
 *
 * Args:     tr          - traceback structure
 *           seq         - sequence, 0..rlen-1
 *           rlen        - length of seq and returned ss string
 *           watsoncrick - TRUE to annotate canonical pairs only
 *           ret_ss      - RETURN: alloc'ed secondary structure string
 *
 * Return:   ret_ss contains a string 0..rlen-1 containing the
 *           secondary structure. Must be free'd by caller.
*/ void Trace2KHS(struct trace_s *tr, char *seq, int rlen, int watsoncrick, char **ret_ss) { struct tracestack_s *dolist; struct trace_s *curr; char *ss; if ((ss = (char *) malloc (sizeof(char) * rlen+1)) == NULL) Die("malloc failed"); memset(ss, '.', rlen); ss[rlen] = '\0'; dolist = InitTracestack(); PushTracestack(dolist, tr->nxtl); while ((curr = PopTracestack(dolist)) != NULL) { if ( curr->type == uMATP_ST ) { if (! watsoncrick || IsRNAComplement(seq[curr->emitl], seq[curr->emitr], TRUE)) { ss[curr->emitl] = '>'; ss[curr->emitr] = '<'; } } if (curr->nxtr) PushTracestack(dolist, curr->nxtr); if (curr->nxtl) PushTracestack(dolist, curr->nxtl); } FreeTracestack(dolist); *ret_ss = ss; } /* Function: IsRNAComplement() * * Purpose: Returns TRUE if sym1, sym2 are Watson-Crick complementary. * If allow_gu is TRUE, GU pairs also return TRUE. */ int IsRNAComplement(char sym1, char sym2, int allow_gu) { sym1 = toupper(sym1); sym2 = toupper(sym2); if (sym1 == 'T') sym1 = 'U'; if (sym2 == 'T') sym2 = 'U'; if ((sym1 == 'A' && sym2 == 'U') || (sym1 == 'C' && sym2 == 'G') || (sym1 == 'G' && sym2 == 'C') || (sym1 == 'U' && sym2 == 'A') || (allow_gu && sym1 == 'G' && sym2 == 'U') || (allow_gu && sym1 == 'U' && sym2 == 'G')) return TRUE; else return FALSE; } /* Function: KHS2ct() * * Purpose: Convert a secondary structure string to an array of integers * representing what position each position is base-paired * to (0..len-1), or -1 if none. This is off-by-one from a * Zuker .ct file representation. * * The .ct representation can accomodate pseudoknots but the * secondary structure string cannot easily; the string contains * "Aa", "Bb", etc. pairs as a limited representation of * pseudoknots. The string contains "><" for base pairs. * Other symbols are ignored. If allow_pseudoknots is FALSE, * the pseudoknot symbols will be ignored and these positions * will be treated as single stranded. * * Return: ret_ct is allocated here and must be free'd by caller. 
* Returns 1 on success, 0 if ss is somehow inconsistent. */ int KHS2ct(char *ss, int len, int allow_pseudoknots, int **ret_ct) { struct intstack_s *dolist[27]; int *ct; int i; int pos, pair; int status = 1; /* success or failure return status */ for (i = 0; i < 27; i++) dolist[i] = InitIntStack(); if ((ct = (int *) malloc (len * sizeof(int))) == NULL) Die("malloc failed"); for (pos = 0; pos < len; pos++) ct[pos] = -1; for (pos = 0; ss[pos] != '\0'; pos++) { if (ss[pos] > 127) status = 0; /* bulletproof against SGI buggy ctype.h */ else if (ss[pos] == '>') /* left side of a pair: push onto stack 0 */ PushIntStack(dolist[0], pos); else if (ss[pos] == '<') /* right side of a pair; resolve pair */ { if (! PopIntStack(dolist[0], &pair)) { status = 0; } else { ct[pos] = pair; ct[pair] = pos; } } /* same stuff for pseudoknots */ else if (allow_pseudoknots && isupper((int) ss[pos])) PushIntStack(dolist[ss[pos] - 'A' + 1], pos); else if (allow_pseudoknots && islower((int) ss[pos])) { if (! PopIntStack(dolist[ss[pos] - 'a' + 1], &pair)) { status = 0; } else { ct[pos] = pair; ct[pair] = pos; } } else if (allow_pseudoknots && !isgap(ss[pos])) status = 0; /* bad character */ } for (i = 0; i < 27; i++) if ( FreeIntStack(dolist[i]) > 0) status = 0; *ret_ct = ct; return status; } tRNAscan-SE-2.0/src/lengthdist.c0000644000543100007160000001246211021467304015735 0ustar pchanlowelab/* lengthdist.c * SRE, Fri Sep 30 09:51:14 1994 * * Calculate length distributions expected at each state. */ #include #include #include "squid.h" #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif #ifdef DEBUG #include #endif /* Function: LengthDistribution() * * Purpose: Given a covariance model, calculate the * length distribution that we expect each state * to be aligned to. Uses a full forward * (summed probabilities) calculation. * * Args: cm - covariance model (probability form) * N - maximum length to look at. 
* ret_lmx - RETURN: (0..M-1) by (0..N) matrix of probabilities * for states 0..M-1 emitting lengths of (0..N). * * Return: (void) * ret_lmx is alloc'ed here. Free2DArray(*ret_lmx, M) */ void LengthDistribution(struct pstate_s *pcm, int statenum, int N, double ***ret_lmx) { double **lmx; int y, len; int ynext; int mid; /* Allocate the matrix for storing probability * distributions. */ lmx = (double **) MallocOrDie(statenum * sizeof(double *)); for (y = 0; y < statenum; y++) lmx[y] = (double *) MallocOrDie ( (N+1) * sizeof(double)); /* Set whole thing to zero. */ for (y = 0; y < statenum; y++) for (len = 0; len <= N; len++) lmx[y][len] = 0.0; /* Initialize at length = 0. */ for (y = statenum-1; y >= 0; y--) switch (pcm[y].statetype) { case uEND_ST: lmx[y][0] = 1.0; break; case uBIFURC_ST: lmx[y][0] = lmx[y+1][0] * lmx[pcm[y].bifr][0]; break; case uDEL_ST: case uBEGIN_ST: for (ynext = 0; ynext < pcm[y].connectnum; ynext++) lmx[y][0] += pcm[y].tmx[ynext] * lmx[y + pcm[y].offset + ynext][0]; break; } /* Recurse for lengths 1..N. 
*/ for (len = 1; len <= N; len++) for (y = statenum-1; y >= 0; y--) switch (pcm[y].statetype) { case uEND_ST: break; case uBIFURC_ST: for (mid = 0; mid <= len; mid++) lmx[y][len] += lmx[y+1][mid] * lmx[pcm[y].bifr][len-mid]; break; case uDEL_ST: case uBEGIN_ST: for (ynext = 0; ynext < pcm[y].connectnum; ynext++) lmx[y][len] += pcm[y].tmx[ynext] * lmx[y + pcm[y].offset + ynext][len]; break; case uMATP_ST: if (len > 1) for (ynext = 0; ynext < pcm[y].connectnum; ynext++) lmx[y][len] += pcm[y].tmx[ynext] * lmx[y + pcm[y].offset + ynext][len-2]; break; case uMATR_ST: case uMATL_ST: case uINSL_ST: case uINSR_ST: for (ynext = 0; ynext < pcm[y].connectnum; ynext++) lmx[y][len] += pcm[y].tmx[ynext] * lmx[y + pcm[y].offset + ynext][len-1]; break; default: Die("unrecognized state type %d", pcm[y].statetype); } *ret_lmx = lmx; return; } /* Function: LengthBounds() * * Purpose: Takes the probability distributions produced by * LengthDistribution() and produces a set of * minimum and maximum length bounds, within which * lies a specified amount of the probability. * * The algorithm is simple. Find the peak of the * probability distribution and include it. Then * look left and right; choose whichever one is * higher P, and include it. Continue until the * included probability exceeds the target. * * Args: lmx: probability distributions from LengthDistribution() * statenum: # of rows in lmx * N: max length (lmx is [0..statenum-1[0..N]) * epsilon: target probability is 1.0 - epsilon. * ret_min: RETURN: [0..statenum-1] array of minimum * lengths for each state * ret_max: RETURN: [0..statenum-1] array of maximum * lengths for each state * * Return: (void) * ret_min, ret_max alloced here. free(). 
*/ void LengthBounds(double **lmx, int statenum, int N, double epsilon, int **ret_min, int **ret_max) { int *min, *max; int y, len; int cmin, cmax; /* current min and max */ double best; double p_remain; min = (int *) MallocOrDie (sizeof(int) * statenum); max = (int *) MallocOrDie (sizeof(int) * statenum); for (y = 0; y < statenum; y++) { /* danger! assuming that N was large enough! */ DNorm(lmx[y], (N+1)); /* step 1. find peak */ best = lmx[y][0]; cmin = 0; for (len = 1; len <= N; len++) if (lmx[y][len] > best) { best = lmx[y][len]; cmin = len; } /* that's where we start */ cmax = cmin; p_remain = 1.0 - lmx[y][cmin]; /* extend to find bounds */ while (p_remain > epsilon) { if (cmin == 0 && cmax == N) break; /* Die("state %d distribution not within set bound of %d; %f remains\n", y, N, p_remain); */ else if (cmin == 0) /* must look right */ p_remain -= lmx[y][++cmax]; else if (cmax == N) /* must look left */ p_remain -= lmx[y][--cmin]; else /* look left and right */ { if (lmx[y][cmin-1] > lmx[y][cmax+1]) p_remain -= lmx[y][--cmin]; else p_remain -= lmx[y][++cmax]; } } min[y] = cmin; max[y] = cmax; } *ret_min = min; *ret_max = max; } tRNAscan-SE-2.0/src/maxmodelmaker.c0000644000543100007160000017513111021467304016421 0ustar pchanlowelab/* maxmodelmaker.c - maximum likelihood construction of a covariance model * Tue Aug 31 14:54:35 1993 * * Given a multiple sequence alignment, construct the model * which generates that alignment with maximal likelihood. * Uses a dynamic programming algorithm to assign a score * to the optimal subtree with a "root" node MATP, MATR, MATL, * BEGINL, BEGINR, or BIFURC aligned to matrix cell i,j, * where i and j are column and row positions in a multiple * sequence alignment. The alignment of ROOT to i=0,j=N-1 is * the score of the best model; a traceback from this point * creates the optimal tree structure. 
The tree structure is * explicitly aligned to the multiple sequence alignment, * so it is trivial to estimate the parameters of the model. * * * */ #include #include #include #include "version.h" #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif #define MAXINSERT 6 /* maximum INSL or INSR path length between MATP nodes */ /* Structure maxmx_s * * One per cell of the 2D diagonal matrix of the alignment * against itself. */ struct maxmx_s { int sc[NODETYPES-1]; /* scores of assigning each possible node type */ /*** Traceback info: */ short matp_i2; /* matp assignment connects to node ftype at i2,j2 */ short matp_j2; char matp_ftype; short matl_i2; /* matl assignment connects to node ftype at i2,j */ char matl_ftype; short matr_j2; /* matr assignment connects to node ftype at i,j2 */ char matr_ftype; char begl_ftype; /* begl assignment connects to node ftype at i,j */ short begr_i2; /* begr assignment connects to node ftype at i2, j */ char begr_ftype; short bifurc_mid; /* best bifurcation is into i,mid, mid+1,j */ }; static struct maxmx_s **alloc_maxmx(int alen); static void init_maxmx(struct maxmx_s **mmx, int nseq, int alen, struct prior_s *prior, int *mscore, double *gapcount); static void recurse_maxmx(int **aseqsT, float *weights, int alen, int nseq, struct prior_s *prior, int *mscore, double *gapcount, double gapthresh, struct maxmx_s **mmx); static void trace_maxmx(struct maxmx_s **mmx, int alen, struct trace_s **ret_mtr); static void transpose_alignment(char **aseqs, int alen, int nseq, int ***ret_aseqsT); static void singlet_emissions(int **aseqsT, float *weights, int alen,int nseq, struct prior_s *prior, int **ret_mscore, double **ret_gapcount); static int pair_emissioncost(int *coli, int *colj,float *weights, int nseq, struct prior_s *prior); static void frommatp_transtable(int **aseqsT, float *weights, int nseq, int i, int j, int i2, int j2, int *accum_insl, int *accum_insr, double trans[STATETYPES][STATETYPES]); static 
void frommatl_transtable(int **aseqsT, float *weights, int nseq, int i, int j, int i2, int *accum_insl, double trans[STATETYPES][STATETYPES]); static void frommatr_transtable(int **aseqsT, float *weights, int nseq, int i, int j, int j2, int *accum_insr, double trans[STATETYPES][STATETYPES]); static void frombeginr_transtable(int **aseqsT, float *weights, int nseq, int j, int i2, int *accum_insl, double trans[STATETYPES][STATETYPES]); static void frombeginl_transtable(int **aseqsT, float *weights, int nseq, int i, int j, double trans[STATETYPES][STATETYPES]); static void fromroot_transtable(int **aseqsT, float *weights, int nseq, int i2,int j2, int *accum_insl, int *accum_insr, double trans[STATETYPES][STATETYPES]); static int assign_cell(int i, int j, int symi, int symj); static void to_matp_transtable(double master_table[STATETYPES][STATETYPES], double trans[STATETYPES][STATETYPES]); static void to_matr_transtable(double master_table[STATETYPES][STATETYPES], double trans[STATETYPES][STATETYPES]); static void to_matl_transtable(double master_table[STATETYPES][STATETYPES], double trans[STATETYPES][STATETYPES]); static void to_bifurc_transtable(double master_table[STATETYPES][STATETYPES], double trans[STATETYPES][STATETYPES]); static int dot_score(double *cvec, double *pvec, int veclen); #ifdef DEBUG static void print_mmx(struct maxmx_s **mmx, int alen); static void print_assignments(struct trace_s *mtr, int nseq, int alen, double *gapcount); #endif /* MACROS: copy_transtable() * copy_singlet() * copy_pairwise() * * For speed, we use memcpy() to do the operations, relying * on the fact that C stores 2D arrays in 1D. 
*/ #define copy_transtable(tomx, frommx) memcpy((void *) tomx, (void *) frommx, sizeof(double) * STATETYPES * STATETYPES) #define copy_singlet(tovec, fromvec) memcpy((void *) tovec, (void *) fromvec, sizeof(double) * ALPHASIZE) #define copy_pairwise(tomx, frommx) memcpy((void *) tomx, (void *) frommx, sizeof(double) * ALPHASIZE * ALPHASIZE) /* MACROS: zero_transtable() * zero_singlet() * zero_pairwise() * * For speed, we use memset() to initialize, relying * on the fact that C stores 2D arrays in 1D. Are doubles * really eight 0 bytes when they're == 0.0? */ #define zero_transtable(mx) memset(mx, (char) 0, sizeof(double) * STATETYPES * STATETYPES) #define zero_singlet(vec) memset(vec, (char) 0, sizeof(double) * ALPHASIZE) #define zero_pairwise(mx) memset(mx, (char) 0, sizeof(double) * ALPHASIZE * ALPHASIZE) /* Function: Maxmodelmaker() * * Purpose: Create a maximally likely model structure from a multiple * sequence alignment. * * Args: aseqs - flushed sequence alignment; each seq is 0..alen-1 * ainfo - information about the alignment * nseq - number of sequences * gapthresh - heuristic: fractional occupancy <= a column must be a MAT of some kind * prior - prior probability distributions, and alphabet info * ret_ssinfo - RETURN: total info content of the alignment (may pass NULL) * ret_cm - RETURN: the new covariance model (may pass NULL) * ret_mtr - RETURN: master traceback for aseqs (may pass NULL) * * Return: 1 on success, 0 on failure. * ret_cm is alloc'ed here and must be free'd by the caller. 
*/ int Maxmodelmaker(char **aseqs, AINFO *ainfo, int nseq, double gapthresh, struct prior_s *prior, double *ret_ssinfo, struct cm_s **ret_cm, struct trace_s **ret_mtr) { int **aseqsT; /* transpose of alignment, [1..alen+1][0..nseq-1] */ int *mscore; /* emission scores of single columns as MATR or MATL */ double *gapcount; /* weighted count of gap occurrence in each column */ struct maxmx_s **mmx; /* saved score and traceback pointer matrix */ struct cm_s *cm; /* RETURN: new model */ int nodes; /* size of new model in nodes */ struct trace_s *mtr; /* consensus master traceback */ struct trace_s *tr; /* individual "fake" traceback */ struct trmem_s *pool; /* memory pool for traceback */ int idx; /* counter for sequences */ float *wt; /* array of 0..nseq-1 weights */ double ssinfo; /* information content */ /* Set up an array of sequence weights */ if ((wt = (float *) malloc (sizeof(float) * nseq)) == NULL) Die("malloc failed"); for (idx = 0; idx < nseq; idx++) wt[idx] = (ainfo->sqinfo[idx].flags & SQINFO_WGT) ? ainfo->sqinfo[idx].weight : 1.0; /* Transpose (and shift by 1 column index) aseqs[0..nseq-1][0..alen-1] to * aseqsT[1..alen+1][0..nseq-1]. */ transpose_alignment(aseqs, ainfo->alen, nseq, &aseqsT); /* Pre-calculate expected singlet emission scores for each column; * also, pick up a weighted count of gap occurrences in each column. */ singlet_emissions(aseqsT, wt, ainfo->alen, nseq, prior, &mscore, &gapcount); /* Allocate the scoring matrix. It is j=0..alen rows by i=1..j+1 columns; * i.e., a lower left diagonal matrix for 1..alen, with an extra off-diagonal * (j+1,j) for boundary conditions. */ mmx = alloc_maxmx(ainfo->alen); /* Initialize the off-diagonal (j+1,j) and the diagonal (j,j), to * set our boundary conditions. */ init_maxmx(mmx, nseq, ainfo->alen, prior, mscore, gapcount); /* The heart of the calculation: recursively calculate scores for all * subsequences i..j, and save traceback pointers. 
*/ recurse_maxmx(aseqsT, wt, ainfo->alen, nseq, prior, mscore, gapcount, gapthresh, mmx); #ifdef DEBUG print_mmx(mmx,ainfo->alen); #endif /* Now we know the info content */ ssinfo = (double) (mmx[ainfo->alen][0].sc[MATP_NODE] / INTPRECISION); /* Traceback, constructing a consensus tree. */ trace_maxmx(mmx, ainfo->alen, &mtr); /* PrintTrace(stdout, mtr); */ /* Count nodes in the consensus tree and number them */ NumberMasterTrace(mtr, &nodes); /* Create a new model */ if ((cm = AllocCM(nodes)) == NULL) Die("failed to allocate for new model of %d nodes\n", nodes); TopofyNewCM(cm, mtr); /* For each sequence, construct an individual "fake" traceback * using the master, and count it into the new model. */ for (idx = 0; idx < nseq; idx++) { Transmogrify(mtr, aseqs[idx], &tr, &pool); if (! TraceCount(cm, aseqs[idx], wt[idx], tr)) Die("TraceCount() failed"); FreeTrace(tr, pool); } #ifdef DEBUG printf("Checking model after trace counts\n"); if (! VerifyCM(cm)) Die("Verification fails -- baaaad model\n"); #endif /* And, finally, convert the new model to probabilities. * There. Wasn't that simple? */ ProbifyCM(cm, prior); #ifdef DEBUG printf("Checking model after probify\n"); if (! VerifyCM(cm)) Die("Verification fails -- baaaad model\n"); #endif free(mscore); free(gapcount); Free2DArray(mmx, ainfo->alen+1); Free2DArray(aseqsT, ainfo->alen+2); free(wt); if (ret_mtr != NULL) *ret_mtr = mtr; else FreeTrace(mtr, NULL); if (ret_ssinfo != NULL) *ret_ssinfo = ssinfo; if (ret_cm != NULL) *ret_cm = cm; else FreeCM(cm); return 1; } /* Function: alloc_maxmx() * * Purpose: allocate the storage matrix. It is a lower left diagonal * matrix with inverted indexing, mmx[j][i], i <= j+1. * * Args: alen - width of alignment * * Return: mmx: allocated storage matrix. Can be free'd by Free2DArray(mmx, alen+1). 
*/
static struct maxmx_s **
alloc_maxmx(int alen)
{
  struct maxmx_s **mmx;
  int              i, j, y;

  /* Array of alen+1 row pointers. The element size used to be
   * sizeof(struct maxmx_s), i.e. a whole cell per pointer, which
   * over-allocated the pointer array many times over.
   */
  if ((mmx = (struct maxmx_s **) malloc ((alen+1) * sizeof(struct maxmx_s *))) == NULL)
    Die("malloc failed");

  /* Row j holds columns i = 0..j+1, so it needs j+2 cells. */
  for (j = 0; j <= alen; j++)
    if ((mmx[j] = (struct maxmx_s *) malloc ((j+2) * sizeof(struct maxmx_s))) == NULL)
      Die("malloc failed");

  /* Set the whole matrix so that scores are NEGINFINITY and
   * each cell's traceback pointers point at itself.
   */
  for (j = 0; j <= alen; j++)
    for (i = 0; i <= j+1; i++)
      {
	for (y = 0; y < NODETYPES-1; y++)
	  mmx[j][i].sc[y] = NEGINFINITY;

	mmx[j][i].matp_i2    = (short) i;
	mmx[j][i].matp_j2    = (short) j;
	mmx[j][i].matp_ftype = (char) MATP_NODE;

	mmx[j][i].matl_i2    = (short) i;
	mmx[j][i].matl_ftype = (char) MATL_NODE;

	mmx[j][i].matr_j2    = (short) j;
	mmx[j][i].matr_ftype = (char) MATR_NODE;

	mmx[j][i].begl_ftype = (char) BEGINL_NODE;

	mmx[j][i].begr_i2    = (short) i;
	mmx[j][i].begr_ftype = (char) BEGINR_NODE;

	mmx[j][i].bifurc_mid = (short) i;
      }
  return mmx;
}


/* Function: init_maxmx()
 *
 * Purpose:  Initialize the scoring matrix. The offdiagonal j+1,j and the
 *           diagonal j,j are initialized.
 *
 *           In the offdiagonal, we use BIFURC to represent END, and set
 *           its score to zero.
 *
 *           In the diagonal, it doesn't matter whether we use MATL or
 *           MATR to generate single symbols; we use MATL for an implementation-
 *           specific reason (if we use MATR, we need an extra row for i=0,j=-1)
 *           MATL's get calculated scores and traceback pointers to
 *           (j+1,j,END). BEGINL, BEGINR are also calculated here.
 *           MATR, MATP, BIFURC are left at NEGINFINITY.
 *
 * Args:     mmx:      saved score/traceback pointer matrix
 *           nseq:     number of sequences in alignment
 *           alen:     number of columns in alignment
 *           prior:    prior probability distributions
 *           mscore:   singlet emission costs for each column
 *           gapcount: weighted counts of gaps in each column
 *
 * Return:   (void). maxmx() is initialized.
*/
static void
init_maxmx(struct maxmx_s **mmx, int nseq, int alen, struct prior_s *prior,
           int *mscore, double *gapcount)
{
  double          tprob[STATETYPES][STATETYPES]; /* counts in, ProbifyTransitionMatrix() output out */
  struct maxmx_s *cell;                          /* the diagonal cell (col,col) being filled        */
  double          present;                       /* nseq minus the gap count of this column         */
  double          gaps;                          /* gap count of this column                        */
  int             col;
  int             fy, ty;

  /* Off-diagonal (col+1,col): the END alignment (stored under END_NODE)
   * costs nothing. Every other score there keeps its allocation default.
   */
  for (col = 0; col <= alen; col++)
    mmx[col][col+1].sc[END_NODE] = 0;

  /* Diagonal (col,col): MATL first, MATR copies it, then BEGINL and
   * BEGINR are derived from the MATL score.
   */
  for (col = 1; col <= alen; col++)
    {
      cell    = &mmx[col][col];
      gaps    = gapcount[col];
      present = (double) nseq - gapcount[col];

      /* MATL -> END: symbols leave from MATL, gaps leave from DEL. */
      for (fy = 0; fy < STATETYPES; fy++)
        for (ty = 0; ty < STATETYPES; ty++)
          tprob[fy][ty] = 0.0;
      tprob[MATL_ST][END_ST] = present;
      tprob[DEL_ST][END_ST]  = gaps;
      ProbifyTransitionMatrix(tprob, MATL_NODE, END_NODE, prior);

      /* Emission score of the column plus the count-weighted log
       * transition terms; traceback points at (col+1,col,END).
       */
      cell->sc[MATL_NODE] = mscore[col] +
        (int) (INTPRECISION *
               ((log(tprob[MATL_ST][END_ST]) * present) +
                (log(tprob[DEL_ST][END_ST])  * gaps)));
      cell->matl_i2    = col+1;
      cell->matl_ftype = END_NODE;

      /* On the diagonal MATR scores exactly like MATL. */
      cell->sc[MATR_NODE] = cell->sc[MATL_NODE];
      cell->matr_j2       = col-1;
      cell->matr_ftype    = END_NODE;

      /* BEGINL -> MATL */
      for (fy = 0; fy < STATETYPES; fy++)
        for (ty = 0; ty < STATETYPES; ty++)
          tprob[fy][ty] = 0.0;
      tprob[BEGIN_ST][MATL_ST] = present;
      tprob[BEGIN_ST][DEL_ST]  = gaps;
      ProbifyTransitionMatrix(tprob, BEGINL_NODE, MATL_NODE, prior);

      cell->sc[BEGINL_NODE] = cell->sc[MATL_NODE] +
        (int) (INTPRECISION *
               ((log(tprob[BEGIN_ST][MATL_ST]) * present) +
                (log(tprob[BEGIN_ST][DEL_ST])  * gaps)));
      cell->begl_ftype = MATL_NODE;

      /* BEGINR -> MATL */
      for (fy = 0; fy < STATETYPES; fy++)
        for (ty = 0; ty < STATETYPES; ty++)
          tprob[fy][ty] = 0.0;
      tprob[BEGIN_ST][DEL_ST]  = gaps;
      tprob[BEGIN_ST][MATL_ST] = present;
      ProbifyTransitionMatrix(tprob, BEGINR_NODE, MATL_NODE, prior);

      cell->sc[BEGINR_NODE] = cell->sc[MATL_NODE] +
        (int) (INTPRECISION *
               ((log(tprob[BEGIN_ST][MATL_ST]) * present) +
                (log(tprob[BEGIN_ST][DEL_ST])  * gaps)));
      cell->begr_i2    = col;
      cell->begr_ftype = MATL_NODE;
    }
}

/* Function: recurse_maxmx()
 *
 * Purpose:  Recursion calculations of the maximum likelihood CM construction
 *           algorithm.
 *
 * Args:     aseqsT    - transposed alignment; [1..alen+1][0..nseq-1]
 *           weights   - weights assigned to each sequence, usually 1.0
 *           alen      - number of columns in alignment
 *           nseq      - number of aseqs
 *           prior     - structure containing prior probability distributions
 *           mscore    - singlet match emission costs
 *           gapcount  - weighted counts of gaps in each column
 *           gapthresh - gap threshold; multiplied by nseq inside the routine
 *                       before it is compared against gapcount[] entries
 *           mmx       - scoring matrix we fill in; mmx[j][i], lower diagonal
 *
 * Return:   (void). mmx is filled with scores and traceback pointers.
*/ static void recurse_maxmx(int **aseqsT, float *weights, int alen, int nseq, struct prior_s *prior, int *mscore, double *gapcount, double gapthresh, struct maxmx_s **mmx) { int **insl_accum; /* accumulated INSL counts for all starting positions, all seqs */ int **insr_accum; /* accumulated INSR counts for all starting positions, all seqs */ double tmaster[STATETYPES][STATETYPES]; /* master copy of state transition table, counts */ double tcounts[STATETYPES][STATETYPES]; /* state transition table, counts */ double tmx[STATETYPES][STATETYPES]; /* state transition probabilities post-regularization */ int tonode; /* node type to connect to */ int i,j; /* current cell column, row */ int i2, j2; /* i', j': cell to connect to */ int idx; /* counter for sequences */ int sc; /* temp variable holding a score */ int mid; /* midpoint for a bifurcation */ if ((insr_accum = (int **) malloc ((alen+1) * sizeof(int *))) == NULL || (insl_accum = (int **) malloc ((alen+2) * sizeof(int *))) == NULL) Die("malloc failed"); for (i = 0; i <= alen; i++) if ((insr_accum[i] = (int *) malloc (nseq * sizeof(int))) == NULL) Die("malloc failed"); for (i = 0; i <= alen+1; i++) if ((insl_accum[i] = (int *) malloc (nseq * sizeof(int))) == NULL) Die("malloc failed"); gapthresh *= (double) nseq; /* scale gapthresh to be comparable to counts in gapcount array */ /* Initialize insr_accum. (Vertical, j insertion accumulator) * insr_accum contains, for each sequence, a count of how many symbols * must be inserted to get from any row fromj to the current row (exclusive * of fromj and the current column). * insr_accum is therefore [0..alen-1][0..nseq-1] */ for (j2 = 1; j2 <= alen; j2++) for (idx = 0; idx < nseq; idx++) insr_accum[j2][idx] = 0; for (idx = 0; idx < nseq; idx++) if (aseqsT[1][idx] >= 0) insr_accum[0][idx] = 1; for (j = 2; j <= alen; j++) { /* Initialize insl_accum (horizontal, i insertion) array each time we start * a new row. 
insl_accum contains, for each sequence, how many symbols * must be inserted to get from any column i2 to the current column, * exclusive. insl_accum is therefore [1..alen+1][0..nseq] */ for (i2 = 1; i2 <= j+1; i2++) for (idx = 0; idx < nseq; idx++) insl_accum[i2][idx] = 0; for (idx = 0; idx < nseq; idx++) if (aseqsT[j][idx] >= 0) insl_accum[j+1][idx]++; for (i = j-1; i > 0; i--) { /* BIFURC: explain i,j as sum of i,mid,BEGINL + mid+1,j,BEGINR * i <= mid <= j */ for (mid = i; mid <= j; mid++) { sc = mmx[mid][i].sc[BEGINL_NODE] + mmx[j][mid+1].sc[BEGINR_NODE]; if (sc > mmx[j][i].sc[BIFURC_NODE]) { mmx[j][i].sc[BIFURC_NODE] = sc; mmx[j][i].bifurc_mid = mid; } } /* MATP: Score subsequence i,j, given that i,j are emitted by MATP * Look at all possible connections i'j': i < i' < j, i < j' < j, * i' <= j'+1 * Could be i < i' <= j' < j: the extra i' <= j'+1 condition * allows for checking all the ways of generating i'..j' * as entirely insertion, and we may just as well define a * default... */ for (j2 = j-1; j2 >= i && j - j2 - 1 <= MAXINSERT ; j2--) /* for (j2 = j-1; j2 >= i; j2--) */ { for (i2 = i+1; i2 <= j2+1 && i2 - i - 1 <= MAXINSERT; i2++) /* for (i2 = i+1; i2 <= j2+1; i2++) */ { if (mmx[j2][i2].sc[MATP_NODE] < mmx[j][i].sc[MATP_NODE] && mmx[j2][i2].sc[MATL_NODE] < mmx[j][i].sc[MATP_NODE] && mmx[j2][i2].sc[MATR_NODE] < mmx[j][i].sc[MATP_NODE] && mmx[j2][i2].sc[BIFURC_NODE] < mmx[j][i].sc[MATP_NODE]) continue; frommatp_transtable(aseqsT, weights, nseq, i,j, i2, j2, insl_accum[i2], insr_accum[j2], tmaster); for (tonode = 0; tonode < 4; tonode++) { if (i2 > j2 && tonode != BIFURC_NODE) continue; if (i2 == j2 && tonode == MATP_NODE) continue; if (mmx[j2][i2].sc[tonode] < mmx[j][i].sc[MATP_NODE]) continue; switch (tonode) { case MATP_NODE: to_matp_transtable(tmaster, tcounts); break; case MATL_NODE: to_matl_transtable(tmaster, tcounts); break; case MATR_NODE: to_matr_transtable(tmaster, tcounts); break; case BIFURC_NODE: to_bifurc_transtable(tmaster, tcounts); break; 
default: Die("Gotcha. MATP, MATL, MATR, BIFURC nodes must be numbered 0..3"); } copy_transtable(tmx, tcounts); ProbifyTransitionMatrix(tmx, MATP_NODE, tonode, prior); sc = dot_score((double *) tcounts, (double *) tmx, STATETYPES*STATETYPES) + mmx[j2][i2].sc[tonode]; if (sc > mmx[j][i].sc[MATP_NODE]) { mmx[j][i].sc[MATP_NODE] = sc; mmx[j][i].matp_i2 = i2; mmx[j][i].matp_j2 = j2; mmx[j][i].matp_ftype = tonode; } } if (gapcount[i2] <= gapthresh) break; } if (gapcount[j2] <= gapthresh) break; } mmx[j][i].sc[MATP_NODE] += pair_emissioncost(aseqsT[i], aseqsT[j], weights, nseq, prior); /* MATR: i,j is accounted for by emitting j and connecting * to some i,j2. */ for (j2 = j-1; j2 >= i-1; j2--) { if (mmx[j2][i].sc[MATP_NODE] < mmx[j][i].sc[MATR_NODE] && mmx[j2][i].sc[MATL_NODE] < mmx[j][i].sc[MATR_NODE] && mmx[j2][i].sc[MATR_NODE] < mmx[j][i].sc[MATR_NODE] && mmx[j2][i].sc[BIFURC_NODE] < mmx[j][i].sc[MATR_NODE]) continue; frommatr_transtable(aseqsT, weights, nseq, i, j, j2, insr_accum[j2], tmaster); for (tonode = 0; tonode < 4; tonode++) { if (i > j2 && tonode != BIFURC_NODE) continue; if (i == j2 && tonode == MATP_NODE) continue; if (mmx[j2][i].sc[tonode] < mmx[j][i].sc[MATR_NODE]) continue; switch (tonode) { case MATP_NODE: to_matp_transtable(tmaster, tcounts); break; case MATL_NODE: to_matl_transtable(tmaster, tcounts); break; case MATR_NODE: to_matr_transtable(tmaster, tcounts); break; case BIFURC_NODE: to_bifurc_transtable(tmaster, tcounts); break; default: Die("Gotcha. 
MATP, MATL, MATR, BIFURC nodes must be numbered 0..3"); } copy_transtable(tmx, tcounts); ProbifyTransitionMatrix(tmx, MATR_NODE, tonode, prior); sc = dot_score((double *) tcounts, (double *) tmx, STATETYPES*STATETYPES) + mmx[j2][i].sc[tonode]; if (sc > mmx[j][i].sc[MATR_NODE]) { mmx[j][i].sc[MATR_NODE] = sc; mmx[j][i].matr_j2 = j2; mmx[j][i].matr_ftype = tonode; } } if (gapcount[j2] <= gapthresh) break; } mmx[j][i].sc[MATR_NODE] += mscore[j]; /* MATL: account for i,j by emitting i and connecting to some (i2,j) */ for (i2 = i+1; i2 <= j+1; i2++) { if (mmx[j][i2].sc[MATP_NODE] < mmx[j][i].sc[MATL_NODE] && mmx[j][i2].sc[MATL_NODE] < mmx[j][i].sc[MATL_NODE] && mmx[j][i2].sc[MATR_NODE] < mmx[j][i].sc[MATL_NODE] && mmx[j][i2].sc[BIFURC_NODE] < mmx[j][i].sc[MATL_NODE]) continue; frommatl_transtable(aseqsT, weights, nseq, i, j, i2, insl_accum[i2], tmaster); for (tonode = 0; tonode < 4; tonode++) { if (i2 > j && tonode != BIFURC_NODE) continue; if (i2 == j && tonode == MATP_NODE) continue; if (mmx[j][i2].sc[tonode] < mmx[j][i].sc[MATL_NODE]) continue; switch (tonode) { case MATP_NODE: to_matp_transtable(tmaster, tcounts); break; case MATL_NODE: to_matl_transtable(tmaster, tcounts); break; case MATR_NODE: to_matr_transtable(tmaster, tcounts); break; case BIFURC_NODE: to_bifurc_transtable(tmaster, tcounts); break; default: Die("Gotcha. MATP, MATL, MATR, BIFURC nodes must be numbered 0..3"); } copy_transtable(tmx, tcounts); ProbifyTransitionMatrix(tmx, MATL_NODE, tonode, prior); sc = dot_score((double *) tcounts, (double *) tmx, STATETYPES*STATETYPES) + mmx[j][i2].sc[tonode]; if (sc > mmx[j][i].sc[MATL_NODE]) { mmx[j][i].sc[MATL_NODE] = sc; mmx[j][i].matl_i2 = i2; mmx[j][i].matl_ftype = tonode; } } if (gapcount[i2] <= gapthresh) break; } mmx[j][i].sc[MATL_NODE] += mscore[i]; /* bump insl_accum: add column i to horizontal accumulator as insertion. 
*/ for (i2 = i+1; i2 <= j+1; i2++) for (idx = 0; idx < nseq; idx++) if (aseqsT[i][idx] >= 0) insl_accum[i2][idx]++; /* BEGINR: has an INSL state, so it can connect to any * (i2,j) *inclusive* of (i,j) -- that's why we just bumped * the insl_accum insert counters */ for (i2 = i; i2 <= j+1; i2++) { if (mmx[j][i2].sc[MATP_NODE] < mmx[j][i].sc[BEGINR_NODE] && mmx[j][i2].sc[MATL_NODE] < mmx[j][i].sc[BEGINR_NODE] && mmx[j][i2].sc[MATR_NODE] < mmx[j][i].sc[BEGINR_NODE] && mmx[j][i2].sc[BIFURC_NODE] < mmx[j][i].sc[BEGINR_NODE]) continue; frombeginr_transtable(aseqsT, weights, nseq, j, i2, insl_accum[i2], tmaster); for (tonode = 0; tonode < 4; tonode++) { if (i2 > j && tonode != BIFURC_NODE) continue; if (i2 == j && tonode == MATP_NODE) continue; if (mmx[j][i2].sc[tonode] < mmx[j][i].sc[BEGINR_NODE]) continue; switch (tonode) { case MATP_NODE: to_matp_transtable(tmaster, tcounts); break; case MATL_NODE: to_matl_transtable(tmaster, tcounts); break; case MATR_NODE: to_matr_transtable(tmaster, tcounts); break; case BIFURC_NODE: to_bifurc_transtable(tmaster, tcounts); break; default: Die("Gotcha. 
MATP, MATL, MATR, BIFURC nodes must be numbered 0..3"); } copy_transtable(tmx, tcounts); ProbifyTransitionMatrix(tmx, BEGINR_NODE, tonode, prior); sc = dot_score((double *) tcounts, (double *) tmx, STATETYPES*STATETYPES) + mmx[j][i2].sc[tonode]; if (sc > mmx[j][i].sc[BEGINR_NODE]) { mmx[j][i].sc[BEGINR_NODE] = sc; mmx[j][i].begr_i2 = i2; mmx[j][i].begr_ftype = tonode; } } if (gapcount[i2] <= gapthresh) break; } /* BEGINL: has no inserts, so must connect to (i,j) */ frombeginl_transtable(aseqsT, weights, nseq, i, j, tmaster); for (tonode = 0; tonode < 4; tonode++) { if (mmx[j][i].sc[MATP_NODE] < mmx[j][i].sc[BEGINL_NODE] && mmx[j][i].sc[MATL_NODE] < mmx[j][i].sc[BEGINL_NODE] && mmx[j][i].sc[MATR_NODE] < mmx[j][i].sc[BEGINL_NODE] && mmx[j][i].sc[BIFURC_NODE] < mmx[j][i].sc[BEGINL_NODE]) continue; switch (tonode) { case MATP_NODE: to_matp_transtable(tmaster, tcounts); break; case MATL_NODE: to_matl_transtable(tmaster, tcounts); break; case MATR_NODE: to_matr_transtable(tmaster, tcounts); break; case BIFURC_NODE: to_bifurc_transtable(tmaster, tcounts); break; default: Die("Gotcha. MATP, MATL, MATR, BIFURC nodes must be numbered 0..3"); } copy_transtable(tmx, tcounts); ProbifyTransitionMatrix(tmx, BEGINL_NODE, tonode, prior); sc = dot_score((double *) tcounts, (double *) tmx, STATETYPES*STATETYPES) + mmx[j][i].sc[tonode]; if (sc > mmx[j][i].sc[BEGINL_NODE]) { mmx[j][i].sc[BEGINL_NODE] = sc; mmx[j][i].begl_ftype = tonode; } } } /* end loop over columns i */ /* bump insr_accum: add row j to vertical accumulator as insertion. */ for (j2 = 0; j2 < j; j2++) for (idx = 0; idx < nseq; idx++) if (aseqsT[j][idx] >= 0) insr_accum[j2][idx]++; } /* end loop over rows j */ /* Termination. ROOT can connect anywhere. * We hack here: ROOT alignment info is stored in the cell mmx[alen][0] * (otherwise, the i==0 column is unused), and score/traceback info is kept as if * for MATP_NODE (because ROOT and MATP have similar traceback requirements, * since they permit inserts on both sides.) 
* */ for (j2 = alen; j2 >= 0; j2--) { for (i2 = 1; i2 <= j2+1; i2++) { if (mmx[j2][i2].sc[MATP_NODE] < mmx[alen][0].sc[MATP_NODE] && mmx[j2][i2].sc[MATL_NODE] < mmx[alen][0].sc[MATP_NODE] && mmx[j2][i2].sc[MATR_NODE] < mmx[alen][0].sc[MATP_NODE] && mmx[j2][i2].sc[BIFURC_NODE] < mmx[alen][0].sc[MATP_NODE]) continue; fromroot_transtable(aseqsT, weights, nseq, i2, j2, insl_accum[i2], insr_accum[j2], tmaster); for (tonode = 0; tonode < 4; tonode++) { if (i2 > j2 && tonode != BIFURC_NODE) continue; if (i2 == j2 && tonode == MATP_NODE) continue; if (mmx[j2][i2].sc[tonode] < mmx[alen][0].sc[MATP_NODE]) continue; switch (tonode) { case MATP_NODE: to_matp_transtable(tmaster, tcounts); break; case MATL_NODE: to_matl_transtable(tmaster, tcounts); break; case MATR_NODE: to_matr_transtable(tmaster, tcounts); break; case BIFURC_NODE: to_bifurc_transtable(tmaster, tcounts); break; default: Die("Gotcha. MATP, MATL, MATR, BIFURC nodes must be numbered 0..3"); } copy_transtable(tmx, tcounts); ProbifyTransitionMatrix(tmx, ROOT_NODE, tonode, prior); sc = dot_score((double *) tcounts, (double *) tmx, STATETYPES*STATETYPES) + mmx[j2][i2].sc[tonode]; if (sc > mmx[alen][0].sc[MATP_NODE]) { mmx[alen][0].sc[MATP_NODE] = sc; mmx[alen][0].matp_i2 = i2; mmx[alen][0].matp_j2 = j2; mmx[alen][0].matp_ftype = tonode; } } if (gapcount[i2] <= gapthresh) break; } if (gapcount[j2] <= gapthresh) break; } Free2DArray(insr_accum, alen+1); Free2DArray(insl_accum, alen+2); } /* Function: trace_maxmx() * * Purpose: Traceback of the filled matrix. Constructs a consensus * tree structure. The trace_s structures are only partially * used: emitl and emitr hold 0..alen-1 column coords of emitted * columns (even if not responsible for the emission); nodeidx * is unused; type holds a *node* type, not a state type. * * Note that the alignment of ROOT_NODE has been stored in * mmx[alen][0] like a MATP_NODE, due to some convenient * hacking by the recursion routine. 
* * The mmx[] scoring matrix traceback pointers are 1..alen coords. * They have to be converted to 0..alen-1 for the traceback tree. * * Args: mmx - filled scoring matrix: mmx[j][i], lower diagonal * alen - width of alignment * ret_mtr - RETURN: master (consensus) traceback * * Return: (void). ret_mtr must be free'd by the caller. */ static void trace_maxmx(struct maxmx_s **mmx, int alen, struct trace_s **ret_mtr) { struct trace_s *mtr; /* master (consensus) traceback */ struct trace_s *curr_mtr; /* current node on traceback tree */ struct tracestack_s *dolist; /* pushdown stack for traversing mtr */ int i,j; /* coords to connect to, next trace ssegment */ int nxti, nxtj; InitTrace(&mtr, NULL); dolist = InitTracestack(); /* Initialization. First attach a root node. Then, trace first segment, * attach it, and start the tracestack to-do list with it. */ curr_mtr = AttachTrace(mtr, NULL, 0, alen-1, 0, ROOT_NODE); PushTracestack(dolist, AttachTrace(curr_mtr, NULL, mmx[alen][0].matp_i2 -1, mmx[alen][0].matp_j2 -1, 0, mmx[alen][0].matp_ftype)); while ((curr_mtr = PopTracestack(dolist)) != NULL) { i = curr_mtr->emitl + 1; /* i,j are 1..alen */ j = curr_mtr->emitr + 1; /* avoid dummy END node on trace tree leaves, and * avoid tracing back from off-diagonal */ if (curr_mtr->nxtl == NULL || i > j) continue; switch (curr_mtr->type) { case MATP_NODE: nxti = mmx[j][i].matp_i2 - 1; nxtj = mmx[j][i].matp_j2 - 1; if (nxti <= nxtj) PushTracestack(dolist, AttachTrace(curr_mtr, NULL, nxti, nxtj, 0, mmx[j][i].matp_ftype)); else { curr_mtr->nxtl->emitl = nxti; curr_mtr->nxtl->emitr = nxtj; } break; case MATL_NODE: nxti = mmx[j][i].matl_i2 - 1; nxtj = j-1; if (nxti <= nxtj) PushTracestack(dolist, AttachTrace(curr_mtr, NULL, nxti, nxtj, 0, mmx[j][i].matl_ftype)); else { curr_mtr->nxtl->emitl = nxti; curr_mtr->nxtl->emitr = nxtj; } break; case MATR_NODE: nxti = i-1; nxtj = mmx[j][i].matr_j2 -1; if (nxti <= nxtj) PushTracestack(dolist, AttachTrace(curr_mtr, NULL, nxti, nxtj, 0, 
mmx[j][i].matr_ftype)); else { curr_mtr->nxtl->emitl = nxti; curr_mtr->nxtl->emitr = nxtj; } break; case BIFURC_NODE: /* BIFURC must attach right side first */ PushTracestack(dolist, AttachTrace(curr_mtr, NULL, mmx[j][i].bifurc_mid, j -1, 0, BEGINR_NODE)); PushTracestack(dolist, AttachTrace(curr_mtr, NULL, i -1, mmx[j][i].bifurc_mid -1, 0, BEGINL_NODE)); break; case BEGINL_NODE: nxti = i-1; nxtj = j-1; if (nxti <= nxtj) PushTracestack(dolist, AttachTrace(curr_mtr, NULL, nxti, nxtj, 0, mmx[j][i].begl_ftype)); else { curr_mtr->nxtl->emitl = nxti; curr_mtr->nxtl->emitr = nxtj; } break; case BEGINR_NODE: nxti = mmx[j][i].begr_i2 - 1; nxtj = j-1; if (nxti <= nxtj) PushTracestack(dolist, AttachTrace(curr_mtr, NULL, nxti, nxtj, 0, mmx[j][i].begr_ftype)); else { curr_mtr->nxtl->emitl = nxti; curr_mtr->nxtl->emitr = nxtj; } break; default: Die("Invalid node type %d", curr_mtr->type); } } FreeTracestack(dolist); *ret_mtr = mtr; } /* Function: transpose_alignment() * * Purpose: Alignments are indexed [seqidx][position]; it turns out to * be more convenient here to index them as [position][seqidx], * because of memory access patterns. This transpose also * lets us switch to a 1..alen indexing scheme for the alignment * columns, from 0..alen-1; this is important for implementing * boundary conditions properly in the scoring matrix. And * finally, we store the symbols as indices to save time in * further lookups: -1 for gaps, 0..3 for ACGU (or 0..19 for aminos) * * Args: aseqs - flushed sequence alignment, each seq indexed 0..alen-1 * alen - number of columns in alignment * nseq - number of sequences * prior - contains alphabet info * ret_aseqsT - RETURN: transposed alignment, [1..alen][0..nseq-1] * * Return: (void). ret_aseqsT is malloc'ed here and must be free'd * by caller. 
*/
static void
transpose_alignment(char **aseqs, int alen, int nseq, int ***ret_aseqsT)
{
  int **cols;     /* transposed alignment being built: cols[column][sequence] */
  int   c;        /* column counter                                           */
  int   s;        /* sequence counter                                         */
  char  sym;      /* residue character under inspection                       */

  /* alen+2 columns: the alignment columns 1..alen plus one guard column
   * on either side.
   */
  cols = (int **) malloc ((alen+2) * sizeof(int *));
  if (cols == NULL)
    Die("malloc failed");
  for (c = 0; c <= alen+1; c++)
    {
      cols[c] = (int *) malloc (nseq * sizeof(int));
      if (cols[c] == NULL)
        Die("malloc failed");
    }

  for (s = 0; s < nseq; s++)
    {
      /* "guard" columns 0 and alen+1 look like gaps */
      cols[0][s]      = -1;
      cols[alen+1][s] = -1;

      /* real columns: gaps become -1, residues become their symbol index */
      for (c = 0; c < alen; c++)
        {
          sym = aseqs[s][c];
          if (isgap(sym))
            cols[c+1][s] = -1;
          else
            cols[c+1][s] = SymbolIndex(sym);
        }
    }

  *ret_aseqsT = cols;
}

/* Function: singlet_emissions()
 *
 * Purpose:  Count emission statistics for all the columns of
 *           a multiple alignment; calculate and return an
 *           array of emission scores for each column emitted
 *           by MATL. (The caller can assume that MATR is scored
 *           the same way.)
 *
 * Args:     aseqsT       - [1..alen][0..nseqs-1] transpose of sequence alignment
 *           weights      - weights on sequences (usually just 1.0 for each)
 *           alen         - number of columns in alignment
 *           nseq         - number of sequences
 *           prior        - prior probability distributions, and alphabet info
 *           ret_mscore   - RETURN: array of singlet emission costs
 *           ret_gapcount - RETURN: weighted counts of gaps in each column
 *
 * Return:   (void). ret_mscore is passed back; it is malloc'ed here
 *           and must be free'd by the caller.
*/
static void
singlet_emissions(int **aseqsT, float *weights, int alen, int nseq, struct prior_s *prior,
                  int **ret_mscore, double **ret_gapcount)
{
  double **counts;            /* weighted symbol counts, counts[column][symbol] */
  double  *gaps;              /* weighted gap counts, [0..alen+1]               */
  int     *score;             /* emission score per column, [1..alen] used      */
  double   probs[ALPHASIZE];  /* work vector handed to ProbifySingletEmission() */
  int      col;
  int      seq;
  int      a;                 /* alphabet index                                 */
  int      sym;               /* symbol index of one residue, negative for gap  */

  /* Allocations */
  if ((counts = (double **) malloc ((alen+1) * sizeof(double *))) == NULL ||
      (score  = (int *)     malloc ((alen+1) * sizeof(int)))      == NULL ||
      (gaps   = (double *)  malloc ((alen+2) * sizeof(double)))   == NULL)
    Die("malloc failed");
  for (col = 0; col <= alen; col++)
    if ((counts[col] = (double *) malloc (ALPHASIZE * sizeof(double))) == NULL)
      Die("malloc failed");

  /* The two boundary entries of the gap array carry no gaps. */
  gaps[0]      = 0.0;
  gaps[alen+1] = 0.0;

  /* Each column is counted and then scored straight away; no column
   * depends on the counts of another.
   */
  for (col = 1; col <= alen; col++)
    {
      /* Count symbol and gap occurrences in this column */
      gaps[col] = 0.0;
      for (a = 0; a < ALPHASIZE; a++)
        counts[col][a] = 0.0;
      for (seq = 0; seq < nseq; seq++)
        {
          sym = aseqsT[col][seq];
          if (sym >= 0) counts[col][sym] += weights[seq];
          else          gaps[col]        += weights[seq];
        }

      /* Hand a copy of the counts to ProbifySingletEmission(), then score
       * the counts against the resulting vector.
       */
      for (a = 0; a < ALPHASIZE; a++)
        probs[a] = counts[col][a];
      ProbifySingletEmission(probs, uMATL_ST, prior);

      score[col]  = dot_score(counts[col], probs, ALPHASIZE);
      score[col] += (nseq - gaps[col]) * (int) (INTPRECISION * log((float) ALPHASIZE));
    }

  Free2DArray(counts, alen+1);
  *ret_gapcount = gaps;
  *ret_mscore   = score;
}

/* Function: pair_emissioncost()
 *
 * Purpose:  Count emission statistics for a given MATP_NODE-assigned
 *           i,j, and assign a cost.
 *
 * Args:     coli    - ptr to column i from aseqsT transposed alignment, [0..nseq-1]
 *           colj    - ptr to column j from aseqsT transposed alignment, [0..nseq-1]
 *           weights - weights on sequences (usually just 1.0 for each)
 *           nseq    - number of sequences
 *           prior   - prior probability distributions
 *
 * Return:   The cost of emitting the (i,j) column pair (integer).
*/
static int
pair_emissioncost(int *coli, int *colj, float *weights, int nseq, struct prior_s *prior)
{
  double matp_count[ALPHASIZE][ALPHASIZE];  /* weighted counts of (i,j) symbol pairs */
  double matl_count[ALPHASIZE];             /* singlet counts, see note below        */
  double matr_count[ALPHASIZE];             /* singlet counts, see note below        */
  double matp_emit[ALPHASIZE][ALPHASIZE];
  double matl_emit[ALPHASIZE];
  double matr_emit[ALPHASIZE];
  int    total;                             /* accumulated score                     */
  int    a, b;                              /* alphabet indices                      */
  int    seq;
  int    si, sj;                            /* symbols of one sequence at i and at j */

  /* Zero the counter arrays */
  for (a = 0; a < ALPHASIZE; a++)
    {
      matr_count[a] = 0.0;
      matl_count[a] = 0.0;
      for (b = 0; b < ALPHASIZE; b++)
        matp_count[a][b] = 0.0;
    }

  /* Count pairs, singlets.
   * NOTE(review): a lone symbol in column j is tallied under matl_count and
   * a lone symbol in column i under matr_count, which looks swapped relative
   * to assign_cell() (symbol at i only -> MATL_ST). Behaviour is kept as
   * found; confirm whether the naming or the counting is intended.
   */
  for (seq = 0; seq < nseq; seq++)
    {
      si = coli[seq];
      sj = colj[seq];

      if (si == -1 && sj == -1) continue;      /* both gaps: nothing emitted */

      if      (si == -1) matl_count[sj]     += weights[seq];
      else if (sj == -1) matr_count[si]     += weights[seq];
      else               matp_count[si][sj] += weights[seq];
    }

  /* Create probability matrices */
  copy_singlet(matl_emit, matl_count);
  copy_singlet(matr_emit, matr_count);
  copy_pairwise(matp_emit, matp_count);
  ProbifySingletEmission(matl_emit, uMATL_ST, prior);
  ProbifySingletEmission(matr_emit, uMATR_ST, prior);
  ProbifyPairEmission(matp_emit, prior);

  /* convert probs to log odds: scale by the alphabet size (singlets) or
   * its square (pairs)
   */
  for (a = 0; a < ALPHASIZE; a++)
    {
      matl_emit[a] *= (double) ALPHASIZE;
      matr_emit[a] *= (double) ALPHASIZE;
      for (b = 0; b < ALPHASIZE; b++)
        matp_emit[a][b] *= (double) (ALPHASIZE*ALPHASIZE);
    }

  /* Score is the sum of dot products of counts and probability matrices */
  total  = dot_score((double *) matl_count, (double *) matl_emit, ALPHASIZE);
  total += dot_score((double *) matr_count, (double *) matr_emit, ALPHASIZE);
  total += dot_score((double *) matp_count, (double *) matp_emit, ALPHASIZE*ALPHASIZE);
  return total;
}

/* Function: frommatp_transtable()
 *
 * Purpose:  Given a starting cell i,j and an ending cell
 *           i',j' (i < i' < j' < j), calculate the
 *           6x6 transition table between these pairs,
 *           assuming both (i,j) and (i',j') are assigned
 *           to MATP.
Transitions to other types of nodes * at i',j' (MATR, MATL, BIFURC) are easily derived from this one. * * Args: aseqsT - transpose of alignment, [1..alen][0..nseq-1] * weights - weights on sequences (usually just 1.0 for each) * nseq - number of sequences * i,j - i,j * i2,j2 - i',j' * accum_insl - (0..nseq-1) array of number of inserted symbols * between i,i' for each sequence * accum_insr - (0..nseq-1) array of number of inserted symbols * between j',j for each sequence * trans[][] - filled in: state transition matrix (counts) * * Return: (void) * trans[][] is filled in. */ static void frommatp_transtable(int **aseqsT, float *weights, int nseq, int i, int j, int i2, int j2, int *accum_insl, int *accum_insr, double trans[STATETYPES][STATETYPES]) { int fy, ty; /* from state index, to state index */ int idx; /* counter for sequences */ /* Zero the counter array */ for (fy = 0; fy < STATETYPES; fy++) for (ty = 0; ty < STATETYPES; ty++) trans[fy][ty] = 0.0; /* For each sequence: * assign fy, based on symbols vs. gaps at i,j * assign ty, based on symbols vs. gaps at i',j' * use accum_insr and accum_insl and bump counters appropriately. 
*/ for (idx = 0; idx < nseq; idx++) { fy = assign_cell(i,j,aseqsT[i][idx], aseqsT[j][idx]); ty = assign_cell(i2,j2,aseqsT[i2][idx], aseqsT[j2][idx]); if (accum_insl[idx] == 0 && accum_insr[idx] == 0) { trans[fy][ty] += weights[idx]; } else if (accum_insl[idx] > 0) { trans[fy][INSL_ST] += weights[idx]; trans[INSL_ST][INSL_ST] += (accum_insl[idx]-1) * weights[idx]; if (accum_insr[idx] > 0) { trans[INSL_ST][INSR_ST] += weights[idx]; trans[INSR_ST][INSR_ST] += (accum_insr[idx]-1) * weights[idx]; trans[INSR_ST][ty] += weights[idx]; } else { trans[INSL_ST][ty] += weights[idx]; } } else if (accum_insr[idx] > 0) { trans[fy][INSR_ST] += weights[idx]; trans[INSR_ST][INSR_ST] += (accum_insr[idx]-1) * weights[idx]; trans[INSR_ST][ty] += weights[idx]; } } /* end loop over all sequences */ } /* Function: frommatl_transtable() * * Purpose: Given an starting cell i,j assigned to MATL_NODE * and an ending cell i',j (i < i' < j), calculate the * 6x6 transition table between these pairs, * assuming (i',j) is assigned to MATP. * Transitions to other types of nodes * at i',j (MATR, MATL, BIFURC) are easily derived from this one. * * Args: aseqsT - transpose of alignment, [1..alen][0..nseq-1] * weights - weights on sequences (usually just 1.0 for each) * nseq - number of sequences * i,j - i,j * i2 - i' * accum_insl - (0..nseq-1) array of number of inserted symbols * between i,i' for each sequence * trans[][] - filled in: state transition matrix (counts) * * Return: (void) * trans[][] is filled in. */ static void frommatl_transtable(int **aseqsT, float *weights, int nseq, int i, int j, int i2, int *accum_insl, double trans[STATETYPES][STATETYPES]) { int fy, ty; /* from state index, to state index */ int idx; /* counter for sequences */ /* Zero the counter array */ for (fy = 0; fy < STATETYPES; fy++) for (ty = 0; ty < STATETYPES; ty++) trans[fy][ty] = 0.0; /* For each sequence: * assign fy, based on symbols vs. gaps at i * assign ty, based on symbols vs. 
gaps at i',j * use accum_insl and bump counters appropriately. */ for (idx = 0; idx < nseq; idx++) { fy = (aseqsT[i][idx] == -1) ? DEL_ST : MATL_ST; ty = assign_cell(i2, j, aseqsT[i2][idx], aseqsT[j][idx]); if (accum_insl[idx] == 0) { trans[fy][ty] += weights[idx]; } else { trans[fy][INSL_ST] += weights[idx]; trans[INSL_ST][INSL_ST] += (accum_insl[idx]-1) * weights[idx]; trans[INSL_ST][ty] += weights[idx]; } } /* end loop over all sequences */ } /* Function: frommatr_transtable() * * Purpose: Given an starting cell i,j assigned to MATR_NODE * and an ending cell i,j' (i < j' < j), calculate the * 6x6 transition table between these pairs, * assuming (i,j') is assigned to MATP. * Transitions to other types of nodes * at i,j' (MATR, MATL, BIFURC) are easily derived from this one. * * Args: aseqsT - transpose of alignment, [1..alen][0..nseq-1] * weights - weights on sequences (usually just 1.0 for each) * nseq - number of sequences * i,j - i,j * j2 - j' * accum_insr - (0..nseq-1) array of number of inserted symbols * between j',j for each sequence * trans[][] - filled in: state transition matrix (counts) * * Return: (void) * trans[][] is filled in. */ static void frommatr_transtable(int **aseqsT, float *weights, int nseq, int i, int j, int j2, int *accum_insr, double trans[STATETYPES][STATETYPES]) { int fy, ty; /* from state index, to state index */ int idx; /* counter for sequences */ /* Zero the counter array */ for (fy = 0; fy < STATETYPES; fy++) for (ty = 0; ty < STATETYPES; ty++) trans[fy][ty] = 0.0; /* For each sequence: * assign fy, based on symbols vs. gaps at j * assign ty, based on symbols vs. gaps at i,j' * use accum_insr and bump counters appropriately. */ for (idx = 0; idx < nseq; idx++) { fy = (aseqsT[j][idx] == -1) ? 
DEL_ST : MATR_ST; ty = assign_cell(i, j2, aseqsT[i][idx], aseqsT[j2][idx]); if (accum_insr[idx] == 0) { trans[fy][ty] += weights[idx]; } else { trans[fy][INSR_ST] += weights[idx]; trans[INSR_ST][INSR_ST] += (accum_insr[idx]-1) * weights[idx]; trans[INSR_ST][ty] += weights[idx]; } } /* end loop over all sequences */ } /* Function: frombeginr_transtable() * * Purpose: Given an starting cell i,j assigned to BEGINR_NODE * and an ending cell i',j (i < i' < j), calculate the * 6x6 transition table between these pairs, * assuming (i',j) is assigned to MATP. * Transitions to other types of nodes * at i',j (MATR, MATL, BIFURC) are easily derived from this one. * * Args: aseqsT - transpose of alignment, [1..alen][0..nseq-1] * weights - weights on sequences (usually just 1.0 for each) * nseq - number of sequences * i,j - i,j * i2 - i' * accum_insl - (0..nseq-1) array of number of inserted symbols * between j',j for each sequence * trans[][] - filled in: state transition matrix (counts) * * Return: (void) * trans[][] is filled in. */ static void frombeginr_transtable(int **aseqsT, float *weights, int nseq, int j, int i2, int *accum_insl, double trans[STATETYPES][STATETYPES]) { int fy, ty; /* from state index, to state index */ int idx; /* counter for sequences */ /* Zero the counter array */ for (fy = 0; fy < STATETYPES; fy++) for (ty = 0; ty < STATETYPES; ty++) trans[fy][ty] = 0.0; /* For each sequence: * assign ty, based on symbols vs. gaps at i',j * use accum_insr and bump counters appropriately. 
*/ fy = BEGIN_ST; for (idx = 0; idx < nseq; idx++) { ty = assign_cell(i2, j, aseqsT[i2][idx], aseqsT[j][idx]); if (accum_insl[idx] == 0) { trans[fy][ty] += weights[idx]; } else { trans[fy][INSL_ST] += weights[idx]; trans[INSL_ST][INSL_ST] += (accum_insl[idx]-1) * weights[idx]; trans[INSL_ST][ty] += weights[idx]; } } /* end loop over all sequences */ } /* Function: frombeginl_transtable() * * Purpose: Given an starting cell i,j assigned to BEGINR_NODE (which must * connect to i,j itself), * calculate the 6x6 transition table between the BEGINR * and the other states in the cell, assuming i,j is * assigned to MATP. * Transitions to other types of nodes * at i,j (MATR, MATL, BIFURC) are easily derived from this one. * * Args: aseqsT - transpose of alignment, [1..alen][0..nseq-1] * weights - weights on sequences (usually just 1.0 for each) * nseq - number of sequences * i,j - i,j * trans[][] - filled in: state transition matrix (counts) * * Return: (void) * trans[][] is filled in. */ static void frombeginl_transtable(int **aseqsT, float *weights, int nseq, int i, int j, double trans[STATETYPES][STATETYPES]) { int fy, ty; /* from state index, to state index */ int idx; /* counter for sequences */ /* Zero the counter array */ for (fy = 0; fy < STATETYPES; fy++) for (ty = 0; ty < STATETYPES; ty++) trans[fy][ty] = 0.0; /* For each sequence: * assign ty, based on symbols vs. gaps at i',j * use accum_insr and bump counters appropriately. */ fy = BEGIN_ST; for (idx = 0; idx < nseq; idx++) { ty = assign_cell(i, j, aseqsT[i][idx], aseqsT[j][idx]); trans[fy][ty] += weights[idx]; } } /* Function: fromroot_transtable() * * Purpose: Given an ending cell i2,j2, calculate the * 6x6 transition table between 1,alen,ROOT and this pair, * assuming (i',j') is assigned * to MATP. Transitions to other types of nodes * at i',j' (MATR, MATL, BIFURC) are easily derived from this one. 
* * Args: aseqsT - transpose of alignment, [1..alen][0..nseq-1] * weights - weights on sequences (usually just 1.0 for each) * nseq - number of sequences * i2,j2 - i',j' * accum_insl - (0..nseq-1) array of number of inserted symbols * between i,i' for each sequence * accum_insr - (0..nseq-1) array of number of inserted symbols * between j',j for each sequence * trans[][] - filled in: state transition matrix (counts) * * Return: (void) * trans[][] is filled in. */ static void fromroot_transtable(int **aseqsT, float *weights, int nseq, int i2, int j2, int *accum_insl, int *accum_insr, double trans[STATETYPES][STATETYPES]) { int fy, ty; /* from state index, to state index */ int idx; /* counter for sequences */ /* Zero the counter array */ for (fy = 0; fy < STATETYPES; fy++) for (ty = 0; ty < STATETYPES; ty++) trans[fy][ty] = 0.0; /* For each sequence: * assign ty, based on symbols vs. gaps at i',j' * use accum_insr and accum_insl and bump counters appropriately. */ for (idx = 0; idx < nseq; idx++) { ty = assign_cell(i2,j2,aseqsT[i2][idx], aseqsT[j2][idx]); if (accum_insl[idx] == 0 && accum_insr[idx] == 0) { trans[BEGIN_ST][ty] += weights[idx]; } else if (accum_insl[idx] > 0) { trans[BEGIN_ST][INSL_ST] += weights[idx]; trans[INSL_ST][INSL_ST] += (accum_insl[idx]-1) * weights[idx]; if (accum_insr[idx] > 0) { trans[INSL_ST][INSR_ST] += weights[idx]; trans[INSR_ST][INSR_ST] += (accum_insr[idx]-1) * weights[idx]; trans[INSR_ST][ty] += weights[idx]; } else { trans[INSL_ST][ty] += weights[idx]; } } else if (accum_insr[idx] > 0) { trans[BEGIN_ST][INSR_ST] += weights[idx]; trans[INSR_ST][INSR_ST] += (accum_insr[idx]-1) * weights[idx]; trans[INSR_ST][ty] += weights[idx]; } } /* end loop over all sequences */ } /* Function: assign_cell() * * Purpose: Given that we assign a cell i,j to MATP for the * whole alignment, return the actual assignment * for the given sequence seq. 
This will be MATP * if seq has symbols in both columns i,j; * MATL if j is a gap; MATR if i is a gap; * DEL if both i,j are gaps. * * If i == j, assign_cell returns MATP or DEL even though * MATP is not a valid assignment; the to_matr or to_matl functions * straighten this out later. * * Args: i,j : coordinates (columns in alignment) * symi: symbol in position i * symj: symbol in position j * * Return: DEL_ST, MATP_ST, MATR_ST, or MATL_ST */ static int assign_cell(int i, int j, int symi, int symj) { if (i > j) return END_ST; else if (symi >= 0) { if (symj >= 0) return MATP_ST; else return MATL_ST; } else if (symj >= 0) return MATR_ST; else return DEL_ST; } /* Function: to_matp_transtable() * * Purpose: Given a master state transition tables containing * counts, create a table specific for transitions * to an (i',j') assigned to MATP_NODE. * * Args: master_table[][] - the master table containing counts * trans[][] - the new table, returned containing counts * * Return: (void) * trans[][] is filled. */ static void to_matp_transtable(double master_table[STATETYPES][STATETYPES], double trans[STATETYPES][STATETYPES]) { int fy,ty; /* indices for from state, to state */ /* Just copy the table. */ for (fy = 0; fy < STATETYPES; fy++) for (ty = 0; ty < STATETYPES; ty++) trans[fy][ty] = master_table[fy][ty]; } /* Function: to_matr_transtable() * * Purpose: Given a master state transition tables containing * counts, create a table specific for transitions * to an (i',j') assigned to MATR_NODE. * * Args: master_table[][] - the master table containing counts * trans[][] - the new table, returned containing counts * * Return: (void) * trans[][] is filled. 
*/ static void to_matr_transtable(double master_table[STATETYPES][STATETYPES], double trans[STATETYPES][STATETYPES]) { int fy; /* indices for from state, to state */ for (fy = 0; fy < STATETYPES; fy++) { trans[fy][DEL_ST] = master_table[fy][DEL_ST] + master_table[fy][MATL_ST]; trans[fy][MATP_ST] = 0.0; trans[fy][MATL_ST] = 0.0; trans[fy][MATR_ST] = master_table[fy][MATR_ST] + master_table[fy][MATP_ST]; trans[fy][INSL_ST] = master_table[fy][INSL_ST]; trans[fy][INSR_ST] = master_table[fy][INSR_ST]; } } /* Function: to_matl_transtable() * * Purpose: Given a master state transition tables containing * counts, create a table specific for transitions * to an (i',j') assigned to MATL_NODE. * * Args: master_table[][] - the master table containing counts * trans[][] - the new table, returned containing counts * * Return: (void) * trans[][] is filled. */ static void to_matl_transtable(double master_table[STATETYPES][STATETYPES], double trans[STATETYPES][STATETYPES]) { int fy; /* indices for from state, to state */ for (fy = 0; fy < STATETYPES; fy++) { trans[fy][DEL_ST] = master_table[fy][DEL_ST] + master_table[fy][MATR_ST]; trans[fy][MATP_ST] = 0.0; trans[fy][MATL_ST] = master_table[fy][MATL_ST] + master_table[fy][MATP_ST]; trans[fy][MATR_ST] = 0.0; trans[fy][INSL_ST] = master_table[fy][INSL_ST]; trans[fy][INSR_ST] = master_table[fy][INSR_ST]; } } /* Function: to_bifurc_transtable() * * Purpose: Given a master state transition tables containing * counts, create a table specific for transitions * to an (i',j') assigned to BIFURC_NODE. * * Args: master_table[][] - the master table containing counts * trans[][] - the new table, returned containing counts * * Return: (void) * trans[][] is filled. 
*/ static void to_bifurc_transtable(double master_table[STATETYPES][STATETYPES], double trans[STATETYPES][STATETYPES]) { int fy; /* indices for from state, to state */ for (fy = 0; fy < STATETYPES; fy++) { trans[fy][BIFURC_ST] = master_table[fy][DEL_ST] + master_table[fy][MATR_ST] + master_table[fy][MATL_ST] + master_table[fy][MATP_ST]; trans[fy][MATP_ST] = 0.0; trans[fy][MATL_ST] = 0.0; trans[fy][MATR_ST] = 0.0; trans[fy][INSL_ST] = master_table[fy][INSL_ST]; trans[fy][INSR_ST] = master_table[fy][INSR_ST]; } } /* Function: dot_score() * * Purpose: Calculate the dot product of counts by probabilities * for s set of state transitions: i.e. a total score * * Args: cmx: count vector * pmx: probability vector * * Return: score, the dot product, as an integer */ static int dot_score(double *cvec, double *pvec, int veclen) { int i; double score = 0.0; for (i = 0; i < veclen; i++, cvec++, pvec++) if (*pvec > 0.0) score += *cvec * log(*pvec); return ((int) INTPRECISION * score); } #ifdef DEBUG static void print_mmx(struct maxmx_s **mmx, int alen) { int i,j,y; for (j = 0; j <= alen; j++) { for (y = 0; y < NODETYPES-1; y++) { for (i = 0; i <= j+1; i++) printf("%5d ", mmx[j][i].sc[y] * 10 / (int) INTPRECISION); printf("\n"); } puts(""); } } /* Function: print_assignments() * * Purpose: Using a master max likelihood tree constructed from an * alignment, print a line for each column of the alignment: * (column) (fractional occupancy) (assignment) * * The master tree, mtr, is a traceback tree with some of * the fields misused. emitl, emitr contain 0..alen-1 column * coordinates (even if only one of them is emitted by * this tree node). nodeidx is unused. type contains * a node type, not a state type. * * Code partly borrowed from Trace2ali(). 
*/
static void
print_assignments(struct trace_s *mtr, int nseq, int alen, double *gapcount)
{
  struct align_s *ali;          /* linear list of alignment                */
  struct t2ali_s *stack;        /* stack used to traverse the traceback tr */
  struct trace_s *currtr;
  struct align_s *currali;
  struct align_s *newafter;
  struct align_s *oldafter;
  int             currpos;

  /* First, we generate a linear linked align_s list from the tree.
   * The align_s structure fields are used as follows:
   *    pos     = 0..alen-1 position in alignment
   *    sym     = unused
   *    nodeidx = unused
   *    type    = node type (e.g. MATP_NODE)
   */

  /* Initialize the linked list for the alignment of sequence to model */
  ali = Init_align();

  /* Initialize the pushdown stack for traversal of the traceback */
  stack = Init_t2ali();
  /* Traversal starts at the child of mtr, not at mtr itself. */
  Push_t2ali(stack, mtr->nxtl, ali);

  while (Pop_t2ali(stack, &currtr, &oldafter))
    {
      if (currtr->nxtl == NULL) continue; /* ignore END nodes */

      if (currtr->nxtr != NULL) /* BIFURC node */
        {
          /* deal with right branch; insert a dummy */
          /* The dummy is given pos -1 and the node's own type; it is
           * never removed from the list afterwards.
           */
          newafter = Insafter_align(-1, '-', ' ', 0, currtr->type, oldafter);
          Push_t2ali(stack, currtr->nxtr, newafter);
          /* deal with left branch */
          Push_t2ali(stack, currtr->nxtl, oldafter);
        }
      else{
        switch (currtr->type) {
        case BEGINL_NODE:
        case BEGINR_NODE:
        case ROOT_NODE:
          /* Emits nothing: just descend. */
          Push_t2ali(stack, currtr->nxtl, oldafter);
          break;

        case MATP_NODE:
          /* Both entries go directly after oldafter, right one first,
           * so the left column ends up in front of the right column
           * and the subtree is spliced in between (after newafter).
           */
          (void) Insafter_align(currtr->emitr, ' ', ' ', 0, currtr->type, oldafter);
          newafter = Insafter_align(currtr->emitl, ' ', ' ', 0, currtr->type, oldafter);
          Push_t2ali(stack, currtr->nxtl, newafter);
          break;

        case MATL_NODE:
          newafter = Insafter_align(currtr->emitl, ' ', ' ', 0, currtr->type, oldafter);
          Push_t2ali(stack, currtr->nxtl, newafter);
          break;

        case MATR_NODE:
          (void) Insafter_align(currtr->emitr, ' ', ' ', 0, currtr->type, oldafter);
          Push_t2ali(stack, currtr->nxtl, oldafter);
          break;

        case END_NODE:
          break;

        default:
          Die("no such node type %d", currtr->type);
        }
      }
    }
  Free_t2ali(stack);

  /* Second, walk the linear list and print one line per column.
   * Columns that lie between list entries are reported as INS.
   *
   * NOTE(review): currpos is incremented once per list entry, and
   * the BIFURC dummies (pos -1) are list entries too; each dummy
   * therefore advances currpos without printing -- confirm this is
   * intended.
   * NOTE(review): the header comment describes pos as 0..alen-1
   * while currpos runs from 1 and gapcount[] is indexed up to alen;
   * confirm the indexing convention of gapcount with the caller.
   */
  currpos = 0;
  for (currali = ali->nxt; currali != NULL; currali = currali->nxt)
    {
      currpos++;
      while (currpos < currali->pos)
        {
          printf("%4d %.3f INS\n", currpos,
                 ((double) nseq - gapcount[currpos]) / (double) nseq);
          currpos++;
        }

      /* Entries of any other type (e.g. the BIFURC dummies) print nothing. */
      switch (currali->type) {
      case BEGINL_NODE:
      case BEGINR_NODE:
      case ROOT_NODE:
        break;

      case MATL_NODE:
      case MATR_NODE:
        printf("%4d %.3f MAT singlet\n", currali->pos,
               ((double) nseq - gapcount[currali->pos]) / (double) nseq);
        break;

      case MATP_NODE:
        printf("%4d %.3f MAT pairwise\n", currali->pos,
               ((double) nseq - gapcount[currali->pos]) / (double) nseq);
        break;
      }
    }

  /* Remaining columns up to alen are reported as INS.
   * NOTE(review): currpos is not advanced after the last list entry,
   * so the first line printed here repeats whatever currpos that
   * entry left behind -- check for a duplicated/off-by-one line.
   */
  while (currpos <= alen)
    {
      printf("%4d %.3f INS\n", currpos,
             ((double) nseq - gapcount[currpos]) / (double) nseq);
      currpos++;
    }

  Free_align(ali);
}

#endif /* DEBUG */
tRNAscan-SE-2.0/src/prior.h0000644000543100007160000004406211021467306014733 0ustar pchanlowelab#ifndef PRIORH_INCLUDED
#define PRIORH_INCLUDED

/* prior.h -- prior probability distributions, for regularization
 * SRE, Wed Sep 1 15:47:42 1993
 * 
 * There are 24 different node-node transitions; 6 kinds of node
 * we can come from (BIFURC_NODES are the exception, always with p=1.0
 * transitions), and four kinds of node we can go to (MATP, MATL,
 * MATR, BIFURC; END is treated the same as BIFURC).
 * 
 * There are 256 different kinds of state transitions, total.
 * For ease of indexing, we keep them in a [7][4][6][6] sparse
 * array, indexed [from node][to node][from statetype][to statetype].
 * Thus there are 1008 numbers, 752 are 0's. Any given row
 * tprior[fn][tn][fs] sums to 1.0 * the number of possible
 * transitions.
 * 
 * The tables are currently set up to do Laplace corrections; i.e.,
 * a "plus-one" prior.
 * 
 * Christ. This really _is_ the best way to do it, I think.
 */

/* DO NOT CHANGE THE ORDER OF THE DEFINITIONS IN THIS FILE.
 * The prior.h header files are sometimes parsed at run-time,
 * and they're expected to be in this format.
*/ #include "structs.h" /* A C G T */ static double def_rfreq[ALPHASIZE] = { 0.25, 0.25, 0.25, 0.25 }; static double def_talpha[STATETYPES] = /* DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ { 1.00, 1.00, 1.00, 1.00, 1.00, 1.00 }; static double def_emalpha[STATETYPES] = /* DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ { 1.00, 1.00, 1.00, 1.00, 1.00, 1.00 }; static double def_matp_prior[ALPHASIZE][ALPHASIZE] = { { 1.00, 1.00, 1.00, 1.00 }, { 1.00, 1.00, 1.00, 1.00 }, { 1.00, 1.00, 1.00, 1.00 }, { 1.00, 1.00, 1.00, 1.00 }, }; /* A C G T */ static double def_matl_prior[ALPHASIZE] = { 1.00, 1.00, 1.00, 1.00 }; static double def_matr_prior[ALPHASIZE] = { 1.00, 1.00, 1.00, 1.00 }; static double def_insl_prior[ALPHASIZE] = { 1.00, 1.00, 1.00, 1.00 }; static double def_insr_prior[ALPHASIZE] = { 1.00, 1.00, 1.00, 1.00 }; static double def_tprior[7][4][STATETYPES][STATETYPES] = { { /* BIFURC_NODE --> BIFURC_NODE or END: never happens */ /* fs: */ /* ts: BIFURC_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* BIFURC_NODE --> MATP_NODE: never happens. */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* BIFURC_NODE --> MATL_NODE: never happens. 
*/ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* BIFURC_NODE --> MATR_NODE: never happens */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, }, { /* MATP_NODE --> BIFURC_NODE or END */ /* fs: */ /* ts: BIFURC_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 0.00, 0.00, 1.00, 1.00 }, /* MATP_ST */ { 1.00, 0.00, 0.00, 0.00, 1.00, 1.00 }, /* MATL_ST */ { 1.00, 0.00, 0.00, 0.00, 1.00, 1.00 }, /* MATR_ST */ { 1.00, 0.00, 0.00, 0.00, 1.00, 1.00 }, /* INSL_ST */ { 1.00, 0.00, 0.00, 0.00, 1.00, 1.00 }, /* INSR_ST */ { 1.00, 0.00, 0.00, 0.00, 0.00, 1.00 }, }, /* MATP_NODE --> MATP_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 1.00, 1.00, 1.00, 1.00, 1.00 }, /* MATP_ST */ { 1.00, 1.00, 1.00, 1.00, 1.00, 1.00 }, /* MATL_ST */ { 1.00, 1.00, 1.00, 1.00, 1.00, 1.00 }, /* MATR_ST */ { 1.00, 1.00, 1.00, 1.00, 1.00, 1.00 }, /* INSL_ST */ { 1.00, 1.00, 1.00, 1.00, 1.00, 1.00 }, /* INSR_ST */ { 1.00, 1.00, 1.00, 1.00, 0.00, 1.00 }, }, /* MATP_NODE --> MATL_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 1.00, 0.00, 1.00, 1.00 }, /* MATP_ST */ { 1.00, 0.00, 1.00, 0.00, 1.00, 1.00 }, /* MATL_ST */ { 1.00, 0.00, 1.00, 0.00, 1.00, 1.00 }, /* MATR_ST */ { 1.00, 0.00, 
1.00, 0.00, 1.00, 1.00 }, /* INSL_ST */ { 1.00, 0.00, 1.00, 0.00, 1.00, 1.00 }, /* INSR_ST */ { 1.00, 0.00, 1.00, 0.00, 0.00, 1.00 }, }, /* MATP_NODE --> MATR_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 0.00, 1.00, 1.00, 1.00 }, /* MATP_ST */ { 1.00, 0.00, 0.00, 1.00, 1.00, 1.00 }, /* MATL_ST */ { 1.00, 0.00, 0.00, 1.00, 1.00, 1.00 }, /* MATR_ST */ { 1.00, 0.00, 0.00, 1.00, 1.00, 1.00 }, /* INSL_ST */ { 1.00, 0.00, 0.00, 1.00, 1.00, 1.00 }, /* INSR_ST */ { 1.00, 0.00, 0.00, 1.00, 0.00, 1.00 }, }, }, { /* MATL_NODE --> BIFURC_NODE or END */ /* fs: */ /* ts: BIFURC_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 0.00, 0.00, 1.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 1.00, 0.00, 0.00, 0.00, 1.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 0.00, 0.00, 0.00, 1.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* MATL_NODE --> MATP_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 1.00, 1.00, 1.00, 1.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 1.00, 1.00, 1.00, 1.00, 1.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 1.00, 1.00, 1.00, 1.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* MATL_NODE --> MATL_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 1.00, 0.00, 1.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 1.00, 0.00, 1.00, 0.00, 1.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 0.00, 1.00, 0.00, 1.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* MATL_NODE --> MATR_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 0.00, 1.00, 1.00, 
0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 1.00, 0.00, 0.00, 1.00, 1.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 0.00, 0.00, 1.00, 1.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, }, { /* MATR_NODE --> BIFURC_NODE or END */ /* fs: */ /* ts: BIFURC_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 0.00, 0.00, 0.00, 1.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 1.00, 0.00, 0.00, 0.00, 0.00, 1.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 1.00, 0.00, 0.00, 0.00, 0.00, 1.00 }, }, /* MATR_NODE --> MATP_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 1.00, 1.00, 1.00, 0.00, 1.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 1.00, 1.00, 1.00, 1.00, 0.00, 1.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 1.00, 1.00, 1.00, 1.00, 0.00, 1.00 }, }, /* MATR_NODE --> MATL_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 1.00, 0.00, 0.00, 1.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 1.00, 0.00, 1.00, 0.00, 0.00, 1.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 1.00, 0.00, 1.00, 0.00, 0.00, 1.00 }, }, /* MATR_NODE --> MATR_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 0.00, 1.00, 0.00, 1.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 1.00, 0.00, 0.00, 1.00, 0.00, 1.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 1.00, 0.00, 0.00, 1.00, 0.00, 1.00 }, }, 
}, { /* BEGINL_NODE --> BIFURC_NODE or END */ /* fs: */ /* ts: BIFURC_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* BEGIN */ { { 1.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* BEGINL_NODE --> MATP_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* BEGIN */ { { 1.00, 1.00, 1.00, 1.00, 0.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* BEGINL_NODE --> MATL_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* BEGIN */ { { 1.00, 0.00, 1.00, 0.00, 0.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* BEGINL_NODE --> MATR_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* BEGIN */ { { 1.00, 0.00, 0.00, 1.00, 0.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, }, { /* BEGINR_NODE --> BIFURC_NODE or END */ /* fs: */ /* ts: BIFURC_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 0.00, 0.00, 1.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, 
/* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 0.00, 0.00, 0.00, 1.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* BEGINR_NODE --> MATP_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 1.00, 1.00, 1.00, 1.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 1.00, 1.00, 1.00, 1.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* BEGINR_NODE --> MATL_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 1.00, 0.00, 1.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 1.00, 1.00, 1.00, 1.00, 1.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, /* BEGINR_NODE --> MATR_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* DEL_ST */ { { 1.00, 0.00, 0.00, 1.00, 1.00, 0.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 0.00, 0.00, 1.00, 1.00, 0.00 }, /* INSR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, }, }, { /* ROOT_NODE --> BIFURC_NODE or END */ /* fs: */ /* ts: BIFURC_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* BEGIN */ { { 1.00, 0.00, 0.00, 0.00, 1.00, 1.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 0.00, 0.00, 0.00, 1.00, 1.00 }, /* INSR_ST */ { 1.00, 0.00, 0.00, 0.00, 0.00, 1.00 }, }, /* ROOT_NODE --> MATP_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* BEGIN */ { { 
1.00, 1.00, 1.00, 1.00, 1.00, 1.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 1.00, 1.00, 1.00, 1.00, 1.00 }, /* INSR_ST */ { 1.00, 1.00, 1.00, 1.00, 0.00, 1.00 }, }, /* ROOT_NODE --> MATL_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* BEGIN */ { { 1.00, 0.00, 1.00, 0.00, 1.00, 1.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 0.00, 1.00, 0.00, 1.00, 1.00 }, /* INSR_ST */ { 1.00, 0.00, 1.00, 0.00, 0.00, 1.00 }, }, /* ROOT_NODE --> MATR_NODE */ /* fs: */ /* ts: DEL_ST MATP_ST MATL_ST MATR_ST INSL_ST INSR_ST */ /* BEGIN */ { { 1.00, 0.00, 0.00, 1.00, 1.00, 1.00 }, /* MATP_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATL_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* MATR_ST */ { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 }, /* INSL_ST */ { 1.00, 0.00, 0.00, 1.00, 1.00, 1.00 }, /* INSR_ST */ { 1.00, 0.00, 0.00, 1.00, 0.00, 1.00 }, }, }, }; #endif /* PRIORH_INCLUDED */ tRNAscan-SE-2.0/src/align.c0000644000543100007160000003517611021467303014670 0ustar pchanlowelab/* align.c * SRE, Tue Jun 29 14:05:48 1993 * 2.0: Thu Sep 30 14:43:05 1993 * * Code for producing a multiple sequence alignment from tracebacks. * * */ #include #include #include #include #include "squid.h" #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif /* Function: create_master() * * Purpose: Produce a "master copy" of the linear order in * which a model's states produce a sequence. Used * for reference during alignment construction. * * This linked list is created with Init_align() * and must be free'd by Free_align(). Each structure * in the list represents a possible emission from * a match state, and contains the index of the responsible * state. 
The sym field is used as an internal flag * (temporary dummy structures are used for bifurc's, and * flagged in the sym field for later removal). The * pos and substate fields are not used or meaningful. ret_len * is the maximum number of symbols that can be * emitted by non-insert states. */ static int create_master(struct cm_s *cm, struct align_s **ret_master, int *ret_len) { struct m2ali_s *stack; /* stack used to traverse the model cv */ struct align_s *master; /* RETURN: maximal match emission alignment */ struct align_s *curr; int oldidx; int oldtype; struct align_s *oldafter; int newidx; int newtype; struct align_s *newafter; int len; /* Initialize the linked list of state order, master */ master = Init_align(); /* Initialize pushdown stack for traversing the model. */ stack = Init_m2ali(); newidx = (cm->nodes > 1) ? 1 : -1; newtype = uMATP_ST; /* assume 'worst' case, all nodes generate pairs */ Push_m2ali(stack, newidx, newtype, master); /* While there are still active nodes on the stack, pop one off * and deal with it. */ len = 0; while (Pop_m2ali(stack, &oldidx, &oldtype, &oldafter)) { if (oldidx == -1) continue; /* END */ if (cm->nd[oldidx].type == BIFURC_NODE) { /* deal with right branch. 
* Gotta insert a dummy temporarily */ newafter = Insafter_align(-1, '-', ' ', oldidx, uBIFURC_ST, oldafter); Push_m2ali(stack, cm->nd[oldidx].nxt2, uBEGIN_ST, newafter); len++; /* deal with left branch */ Push_m2ali(stack, cm->nd[oldidx].nxt, uBEGIN_ST, oldafter); } else if (cm->nd[oldidx].type == BEGINL_NODE || cm->nd[oldidx].type == BEGINL_NODE || cm->nd[oldidx].type == ROOT_NODE) { /* BEGIN's aren't reponsible for any MAT states */ Push_m2ali(stack, cm->nd[oldidx].nxt, uBEGIN_ST, oldafter); } else { (void) Insafter_align(-1, '.', ' ', oldidx, uMATP_ST, oldafter); newafter = Insafter_align(-1, '.', ' ', oldidx, uMATP_ST, oldafter); len += 2; Push_m2ali(stack, cm->nd[oldidx].nxt, uBEGIN_ST, newafter); } } /* Remove the temporary dummies used to BIFURC */ for (curr = master->nxt; curr->nxt != NULL; curr = curr->nxt) if (curr->nxt->sym == '-') { Delafter_align(curr); len--; } Free_m2ali(stack); *ret_len = len; *ret_master = master; #ifdef DEBUG /* print_align(master); */ #endif return 1; } /* Function: Trace2ali() * * Purpose: Given a traceback (tree-structure alignment of a model * to a sequence), construct a linear linked list representation * (align_s) of the sequence alignment to the model. * * Args: seq - 0..len-1 sequence to align * tr - traceback containing tree-wise alignment * watsoncrick - if TRUE, only canonical pairs get structure annotation * ret_ali - RETURN: linear list alignment */ int Trace2ali(char *seq, struct trace_s *tr, int watsoncrick, struct align_s **ret_ali) { struct align_s *ali; /* RETURN: linear list of alignment */ struct t2ali_s *stack; /* stack used to traverse the traceback tr */ struct trace_s *currtr; struct align_s *newafter; struct align_s *oldafter; struct align_s *curr; char ssl, ssr;/* symbols <.> for secondary structure rep. 
*/ /* Initialize the linked list for the alignment of sequence to model */ ali = Init_align(); /* Initialize the pushdown stack for traversal of the traceback */ stack = Init_t2ali(); Push_t2ali(stack, tr, ali); while (Pop_t2ali(stack, &currtr, &oldafter)) { switch (currtr->type) { case END_ST: break; /* ignore END states */ case uBIFURC_ST: /* deal with right branch; insert a dummy */ newafter = Insafter_align(-1, '*', ' ', currtr->nodeidx, currtr->type, oldafter); Push_t2ali(stack, currtr->nxtr, newafter); /* deal with left branch */ Push_t2ali(stack, currtr->nxtl, oldafter); break; case uBEGIN_ST: Push_t2ali(stack, currtr->nxtl, oldafter); break; case uDEL_ST: Insafter_align(-1, '-', '.', currtr->nodeidx, currtr->type, oldafter); Push_t2ali(stack, currtr->nxtl, oldafter); break; case uMATP_ST: if (! watsoncrick || IsRNAComplement(seq[currtr->emitr], seq[currtr->emitl], TRUE)) { ssr = '<'; ssl = '>'; } else { ssr = '.'; ssl = '.'; } (void) Insafter_align(currtr->emitr, seq[currtr->emitr], ssr, currtr->nodeidx, currtr->type, oldafter); newafter = Insafter_align(currtr->emitl, seq[currtr->emitl], ssl, currtr->nodeidx, currtr->type, oldafter); Push_t2ali(stack, currtr->nxtl, newafter); break; case uINSL_ST: case uMATL_ST: newafter = Insafter_align(currtr->emitl, seq[currtr->emitl], '.', currtr->nodeidx, currtr->type, oldafter); Push_t2ali(stack, currtr->nxtl, newafter); break; case uINSR_ST: case uMATR_ST: (void) Insafter_align(currtr->emitr, seq[currtr->emitr], '.', currtr->nodeidx, currtr->type, oldafter); Push_t2ali(stack, currtr->nxtl, oldafter); break; } } /* Remove the temporary dummies used to BIFURC */ for (curr = ali->nxt; curr->nxt != NULL; curr = curr->nxt) if (curr->nxt->sym == '*') Delafter_align(curr); Free_t2ali(stack); *ret_ali = ali; #ifdef DEBUG /* print_align(ali); */ #endif return 1; } /* Function: Traces2Alignment() * * Purpose: Given a set of tracebacks for alignments of multiple sequences to * a model, construct a multiple sequence alignment. 
*
 *           The tricky bit, which involves some precalculations, is allowing
 *           the proper amount of space for insertions.
 *
 *           (Added notes)
 *           Return: 1 on success; 0 if create_master() or Trace2ali()
 *           reports failure. The allocation-failure paths call Die()
 *           before their "return 0".
 *           *ret_aseqs receives nseq NUL-terminated strings of equal
 *           length; they are not freed here. If ainfo is non-NULL it
 *           is filled with author, alignment length, a copy of sqinfo,
 *           a consensus structure line and per-sequence structure
 *           strings.
 */
int
Traces2Alignment(char           **rseqs,
                 SQINFO          *sqinfo,
                 struct trace_s **tr,
                 int              nseq,
                 struct cm_s     *cm,
                 int              watsoncrick, /* TRUE to annotate only canonical pairs */
                 char          ***ret_aseqs,
                 AINFO           *ainfo)
{
  struct align_s  *master;      /* representation of how MAT columns map onto the model */
  struct align_s **ali;         /* array of align_s alignments of the sequences to model */
  int   *matuse;                /* 0 if MAT column is never used, 1 otherwise, 1..len */
  int   *insuse;                /* per sequence, count use of inserts between MAT columns; 0..len */
  int   *max_insuse;            /* overall maxima, keep track of ins use between columns */
  int   *matpos;                /* array of MAT column positions in the alignment, 1..len */
  int    len;                   /* length of master - maximum number of MAT columns */
  int    idx;                   /* counter for sequences */
  struct align_s *currmaster;
  struct align_s *currali;
  int    aseqlen;               /* length of multiple sequence alignment */
  char **aseqs;                 /* RETURN: multiple sequence alignment */
  char **ss;                    /* secondary structures */
  int    apos;                  /* position in absolute alignment columns */
  int    matcol;                /* position in MAT column coord arrays (matuse, matpos) */

  /* First we use the model to calculate "master", which will define
   * the columns of the multiple sequence alignment (MAT-produced) and
   * represents how they map onto the model states. Only the stateidx field of
   * the align_s structures is meaningful.
   */
  if (! create_master(cm, &master, &len)) return 0;

  /* Next we "invert" each traceback (from seq->model tree alignments
   * to model->seq linear alignments) and create an array "ali" of individual
   * alignments of the model to the sequences.
   */
  if ((ali = (struct align_s **) malloc (nseq * sizeof(struct align_s *))) == NULL)
    { Die("Memory failure, line %d of %s", __LINE__, __FILE__); return 0; }
  for (idx = 0; idx < nseq; idx++)
    if (! Trace2ali(rseqs[idx], tr[idx], watsoncrick, &ali[idx])) return 0;

  /* Now we're ready to start counting MAT and INS use.
   *
   * For MAT use, all we're doing is determining whether a given
   * column in the master alignment is used or not (matuse[1..len] is
   * 1 if yes, 0 if no)
   *
   * For INS use, we are counting the maximum number of occurrences
   * of insert emissions between each MAT column of the master. max_insuse[0..len]
   * keeps these numbers. max_insuse[5] is the maximum number of inserted
   * symbols between columns 5 and 6, for example. Because there is
   * some redundancy in the model -- different INS states may emit in
   * the same place -- we have to first increment a counter array, insuse,
   * for each individual sequence.
   */
  if (((matuse     = (int *) calloc (len+1, sizeof(int))) == NULL) ||
      ((insuse     = (int *) calloc (len+1, sizeof(int))) == NULL) ||
      ((max_insuse = (int *) calloc (len+1, sizeof(int))) == NULL)   )
    { Die("Memory failure, line %d of %s", __LINE__, __FILE__); return 0; }

  for (idx = 0; idx < nseq; idx++)
    {
      /* insuse is per-sequence scratch: reset it for each sequence */
      for (matcol = 0; matcol <= len; matcol++)
        insuse[matcol] = 0;

      /* matcol is bumped before each lookup, so the first master
       * entry (master->nxt) corresponds to column 1.
       */
      matcol = 0;
      currmaster = master->nxt;
      for (currali = ali[idx]->nxt; currali != NULL; currali = currali->nxt)
        {
          switch (currali->type) {
          case uMATP_ST:
          case uMATR_ST:
          case uMATL_ST:
          case uDEL_ST:
            /* Advance through the master until the entry belonging
             * to this node is found; skipped entries are columns this
             * sequence does not use. A deletion marks its column as
             * used as well.
             */
            matcol++;
            while (currmaster->nodeidx != currali->nodeidx)
              {
                currmaster = currmaster->nxt;
                matcol++;
              }
            matuse[matcol] = 1;
            currmaster = currmaster->nxt;
            break;

          case uINSR_ST:
          case uINSL_ST:
            /* Inserts are credited to the most recent MAT column. */
            insuse[matcol]++;
            break;
          }
        }

      /* update max_insuse with new maxima, if any*/
      for (matcol = 0; matcol <= len; matcol++)
        if (insuse[matcol] > max_insuse[matcol])
          max_insuse[matcol] = insuse[matcol];
    }

  /* calculate length of mult seq alignment, and alloc */
  aseqlen = 0;
  for (matcol = 0; matcol <= len; matcol++)
    {
      if (matuse[matcol] == 1) aseqlen++;
      aseqlen += max_insuse[matcol];
    }
  if ((aseqs = (char **) malloc (nseq * sizeof(char *))) == NULL ||
      (ss    = (char **) malloc (nseq * sizeof(char *))) == NULL)
    { Die("Memory failure, line %d of %s", __LINE__, __FILE__); return 0; }
  for (idx = 0; idx < nseq; idx++)
    if ((aseqs[idx] = (char *) malloc ((aseqlen+1) * sizeof(char))) == NULL ||
        (ss[idx]    = (char *) malloc ((aseqlen+1) * sizeof(char))) == NULL)
      { Die("Memory failure, line %d of %s", __LINE__, __FILE__); return 0; }

  /* Now we use matuse and max_insuse to calculate an array for the
   * coordinates of the MAT columns in the multiple alignment.
   */
  if ((matpos = (int *) calloc (len+1 , sizeof(int))) == NULL)
    { Die("Memory failure, line %d of %s", __LINE__, __FILE__); return 0; }
  /* matpos[0] stays 0 (from calloc); each later column starts after
   * the previous column (if used) and the inserts that follow it.
   */
  for (matcol = 1; matcol <= len; matcol++)
    matpos[matcol] = matpos[matcol-1] + max_insuse[matcol-1] + matuse[matcol-1];

  /* And finally, we're ready to actually construct the multiple sequence
   * alignment. The resulting alignment is flushed-right with gaps.
   */
  for (idx = 0; idx < nseq; idx++)
    {
      /* Unlike the counting pass, this pass starts at the list head
       * with matcol 0 and increments matcol after each lookup.
       */
      matcol = 0;
      apos   = 0;
      currmaster = master;
      for (currali = ali[idx]->nxt; currali != NULL; currali = currali->nxt)
        {
          switch (currali->type) {
          case uMATP_ST:
          case uMATR_ST:
          case uMATL_ST:
            /* goes in a MAT column */
            while (currmaster->nodeidx != currali->nodeidx)
              {
                currmaster = currmaster->nxt;
                matcol++;
              }

            /* pad with gaps up to the column's position */
            for (; apos < matpos[matcol]; apos++)
              {
                aseqs[idx][apos] = '.';
                ss[idx][apos]    = ' ';
              }

            /* match symbols are written in upper case */
            aseqs[idx][apos] = toupper((int)currali->sym);
            ss[idx][apos]    = currali->ss;
            apos++;
            currmaster = currmaster->nxt;
            matcol++;
            break;

          case uINSR_ST:
          case uINSL_ST:
            /* insert symbols are written in lower case, at the next free position */
            aseqs[idx][apos] = tolower((int)currali->sym);
            ss[idx][apos]    = currali->ss;
            apos++;
            break;

          case uDEL_ST:
            /* NOTE(review): unlike the counting pass above, this case
             * does not first advance currmaster to the entry of the
             * deleted node; it writes a gap at apos and steps one
             * master entry. Confirm that is intended.
             */
            aseqs[idx][apos] = '.';
            ss[idx][apos]    = ' ';
            apos++;
            currmaster = currmaster->nxt;
            matcol++;
            break;
          }
        }

      /* flush right */
      for (; apos < aseqlen; apos++)
        {
          aseqs[idx][apos] = '.';
          ss[idx][apos]    = ' ';
        }
      aseqs[idx][apos] = '\0';
      ss[idx][apos]    = '\0';
    }

  /* Release the per-call working structures; aseqs and ss are kept. */
  for (idx = 0; idx < nseq; idx++)
    Free_align(ali[idx]);
  Free_align(master);
  free(matpos);
  free(max_insuse);
  free(insuse);
  free(matuse);
  free(ali);

  if (ainfo != NULL)
    {
      int leftcount;
      int rightcount;

      ainfo->flags = 0;

      strcpy(ainfo->au, "CM RNA automatic alignment");
      ainfo->flags |= AINFO_AUTH;

      ainfo->alen = aseqlen;
      ainfo->flags |= AINFO_ALEN;

      /* copy sqinfo structure array */
      if ((ainfo->sqinfo = (SQINFO *) malloc (sizeof(SQINFO) * nseq)) == NULL)
        Die("malloc failed");
      for (idx = 0; idx < nseq; idx++)
        SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx]));

      /* Construct a consensus structure string.
       * Secondary structure strings, ss, are currently aligned to
       * the aseqs. Calculate an aligned consensus structure from
       * them.
       */
      if ((ainfo->cs = (char *) malloc (sizeof(char) * (aseqlen+1))) == NULL)
        Die("malloc failed");
      for (apos = 0; apos < aseqlen; apos ++)
        {
          /* '<' is tallied in rightcount and '>' in leftcount; a
           * symbol wins the column only with a strict majority
           * (more than nseq / 2 sequences), otherwise '.' is used.
           */
          leftcount = rightcount = 0;
          for (idx = 0; idx < nseq; idx++)
            if      (ss[idx][apos] == '<') rightcount++;
            else if (ss[idx][apos] == '>') leftcount++;

          if      (rightcount > nseq / 2) ainfo->cs[apos] = '<';
          else if (leftcount  > nseq / 2) ainfo->cs[apos] = '>';
          else    ainfo->cs[apos] = '.';
        }
      ainfo->cs[aseqlen] = '\0';
      ainfo->flags |= AINFO_CS;

      /* Construct individual secondary structure strings by de-aligning
       * the individuals.
       */
      for (idx = 0; idx < nseq; idx++)
        {
          MakeDealignedString(aseqs[idx], aseqlen, ss[idx], &(ainfo->sqinfo[idx].ss));
          ainfo->sqinfo[idx].flags |= SQINFO_SS;
        }
    }

  /* The aligned structure strings are released whether or not ainfo was given. */
  Free2DArray(ss, nseq);
  *ret_aseqs = aseqs;
  return 1;
}
tRNAscan-SE-2.0/src/model.c0000644000543100007160000005126511021467304014674 0ustar pchanlowelab/* model.c
 * Allocation, initialization, free'ing of models.
 * 
 * Includes code supporting both original node-based CM structure, as well
 * as the modified, state-based CM structure used by the newer alignment
 * implementations.
* * SRE, Tue Sep 7 09:22:03 1993 * */ #include #include #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif static void fill_state(struct istate_s *st, int nodeidx, int statetype, int offset); static void copy_singlet_emissions(struct istate_s *st, double *emvec, double *rfreq); static void copy_pairwise_emissions(struct istate_s *st, double *em, double *rfreq); static void copy_state_transitions(struct istate_s *st, double *tvec, int tflags); static void copy_pstate_transitions(struct pstate_s *st, double *tvec, int tflags); static void fill_pstate(struct pstate_s *st, int nodeidx, int statetype, int offset); /* Function: AllocCM() * * Purpose: Allocate for a model containing some number of nodes, * inclusive of root but exclusive of ENDs. Blank the model. * * Args: nodes - number of nodes to allocate for * * Return: pointer to the new model. Caller must free, with FreeCM() */ struct cm_s * AllocCM(int nodes) { struct cm_s *cm; int k; if ((cm = (struct cm_s *) malloc (sizeof(struct cm_s))) == NULL) Die("malloc failed"); cm->nodes = nodes; if ((cm->nd = (struct node_s *) malloc (nodes * sizeof(struct node_s))) == NULL) Die("malloc failed"); /* fast way to wipe everything to zero */ memset(cm->nd, 0, (nodes * sizeof(struct node_s))); /* set all the topology connections to -1 */ for (k = 0; k < nodes; k++) cm->nd[k].nxt = cm->nd[k].nxt2 = -1; return cm; } /* Function: FreeCM() * * Purpose: Free memory allocated to a covariance model. * * Return: (void) */ void FreeCM(struct cm_s *cm) { free(cm->nd); free(cm); } /* Function: NormalizeCM() * * Purpose: Normalize all the probability distributions in a model. * Only normalizes the meaningful ones: i.e., matr_emit * emission statistics are ignored for MATL_NODEs, etc. 
* * Return: (void) */ void NormalizeCM(struct cm_s *cm) { int k; /* counter over nodes */ int fy; /* from statetype, to statetype */ for (k = 0; k < cm->nodes; k++) { for (fy = 0; fy < STATETYPES; fy++) DNorm(cm->nd[k].tmx[fy], STATETYPES); /* state transitions */ DNorm((double *) cm->nd[k].mp_emit, ALPHASIZE * ALPHASIZE); /* MATP emissions */ DNorm(cm->nd[k].ml_emit, ALPHASIZE); /* MATL emissions */ DNorm(cm->nd[k].mr_emit, ALPHASIZE); /* MATR emissions */ DNorm(cm->nd[k].il_emit, ALPHASIZE); /* INSL emissions */ DNorm(cm->nd[k].ir_emit, ALPHASIZE); /* INSR emissions */ } } /* Function: VerifyCM() * * Purpose: Have a look at a CM and make sure nothing stupid * is wrong with it. Returns 0 if something's wrong * and writes diagnostics to stderr. Returns 1 if * everything looks OK. */ int VerifyCM(struct cm_s *cm) { int status = 1; int k; /* counter over nodes */ for (k = 0; k < cm->nodes; k++) { if (cm->nd[k].type < 0 || cm->nd[k].type >= NODETYPES) { status = 0; fprintf(stderr, "Node %d has invalid type %d\n", k, cm->nd[k].type); } if ((cm->nd[k].nxt <= k && cm->nd[k].nxt != -1) || (cm->nd[k].nxt >= cm->nodes)) { status = 0; fprintf(stderr, "Node %d points to invalid left child %d\n", k, cm->nd[k].nxt); } if ((cm->nd[k].nxt2 <= k && cm->nd[k].nxt2 != -1) || (cm->nd[k].nxt2 >= cm->nodes)) { status = 0; fprintf(stderr, "Node %d points to invalid right child %d\n", k, cm->nd[k].nxt2); } } return status; } /* Function: RearrangeCM() * * Purpose: Convert a cm into an "integer cm", a specialized structure * used only in the alignment algorithms. * * The integer CM is an array of istate_s structures; i.e., rather * than a node-oriented form, a state-oriented form. Rearrange * transition tables to optimize the recursion in recurse_mx(). * * The node is expanded into states in proper order (uDEL_ST, * uMATP_ST, uMATL_ST, uMATR_ST, uINSL_ST, uINSR_ST). However, the * state transition vectors are rearranged such that INSL, INSR * are the first elements. 
*
 *           Insert states are explicitly assumed to have a zero emission
 *           score!
 *           NOTE(review): the code below still fills insert-state emissions
 *           from il_emit / ir_emit through copy_singlet_emissions(); confirm
 *           whether the assumption above is relied on anywhere.
 *
 * Args:     cm           - a covariance model, probability form
 *           rfreq        - frequencies to use as a random model (expected background)
 *           ret_icm      - RETURN: array of istate_s structures for states
 *                          in model. Contains a state 0 for the root.
 *                          An explicit uEND_ST state (nodeidx -1) is appended
 *                          after the states of every node whose nxt is -1.
 *           ret_statenum - number of states in ret_icm, 0..statenum-1
 *
 * Return:   1 on success, 0 on failure (the initial calloc failed).
 *           ret_icm is malloc'ed here and must be free'ed by caller;
 *           use free(*ret_icm).
 */
int
RearrangeCM(struct cm_s      *cm,
            double           *rfreq,
            struct istate_s **ret_icm,
            int              *ret_statenum)
{
  struct istate_s *icm;         /* new int-lod states-based model */
  struct istate_s *smallicm;    /* streamlined (realloc'ed) icm */
  struct m2ali_s  *bifstack;    /* pda for deferring bifurc connection assignment */
  int              bifidx;      /* state index of a BEGINR's parent bifurc */
  int              k;           /* counter for nodes */
  int              y;           /* counter for states */
  int              tflags;      /* flags for which to state transitions are used */
  int              fflags;      /* flags for which from state transitions are used */
  int              offset;      /* offset to next connected state */

  /* We know we can fit the new model into cm->nodes * STATETYPES
   * states. We'll give back the excess memory later.
   */
  if ((icm = (struct istate_s *) calloc ((cm->nodes * STATETYPES), sizeof(struct istate_s))) == NULL)
    return 0;

  bifstack = Init_m2ali();

  y = 0;
  for (k = 0; k < cm->nodes; k++)
    {
      /* figure out what we're connected to:
       * tflags collects the state types reachable in the NEXT node
       */
      if (cm->nd[k].nxt == -1)
        tflags = uEND_ST;
      else
        {
          switch (cm->nd[cm->nd[k].nxt].type) {
          case BIFURC_NODE: tflags = uBIFURC_ST; break;
          case MATP_NODE:   tflags = uDEL_ST | uMATP_ST | uMATR_ST | uMATL_ST; break;
          case MATL_NODE:   tflags = uDEL_ST | uMATL_ST; break;
          case MATR_NODE:   tflags = uDEL_ST | uMATR_ST; break;
          case BEGINL_NODE:
          case BEGINR_NODE: tflags = uBEGIN_ST; break;
          default: Die("no such node type %d", cm->nd[cm->nd[k].nxt].type);
          }
        }

      /* figure out what we're coming from:
       * fflags = state types THIS node expands into;
       * tflags additionally gains this node's own insert states;
       * offset = distance from the node's first state to the first
       *          state it connects to (decremented per state below)
       */
      switch (cm->nd[k].type) {
      case BIFURC_NODE:
        fflags = uBIFURC_ST;
        offset = 1;
        break;

      case MATP_NODE:
        fflags = uDEL_ST | uMATP_ST | uMATL_ST | uMATR_ST | uINSL_ST | uINSR_ST;
        tflags |= uINSL_ST | uINSR_ST;
        offset = 4;
        break;

      case MATL_NODE:
        fflags = uDEL_ST | uMATL_ST | uINSL_ST;
        tflags |= uINSL_ST;
        offset = 2;
        break;

      case MATR_NODE:
        fflags = uDEL_ST | uMATR_ST | uINSR_ST;
        tflags |= uINSR_ST;
        offset = 2;
        break;

      case BEGINL_NODE:
        fflags = uBEGIN_ST;
        offset = 1;
        break;

      case BEGINR_NODE:
        fflags = uBEGIN_ST | uINSL_ST;
        offset = 1;
        tflags |= uINSL_ST;
        break;

      case ROOT_NODE:
        fflags = uBEGIN_ST | uINSL_ST | uINSR_ST;
        offset = 1;
        tflags |= uINSL_ST | uINSR_ST;
        break;

      default: Die("No such node type %d\n", cm->nd[k].type);
      }

      /* The first state of a node is exactly one of DEL, BIFURC or BEGIN */
      if (fflags & uDEL_ST)
        {
          fill_state(&icm[y], k, uDEL_ST, offset);
          copy_state_transitions(&icm[y], cm->nd[k].tmx[DEL_ST], tflags);
          offset--;
          y++;
        }
      else if (fflags & uBIFURC_ST)
        {
          fill_state(&icm[y], k, uBIFURC_ST, offset);
          /* A hack. tmx[0] gets the state index of the left connected BEGIN
           * child; tmx[1] gets the right connected BEGIN child. The left
           * child is guaranteed to be the next state, but the assignment
           * of the right state must be deferred: we push the bifurc state index
           * into a pda
           */
          icm[y].tmx[0] = y+1;
          Push_m2ali(bifstack, y, 0, NULL);
          y++;
        }
      else if (fflags & uBEGIN_ST)
        {
          fill_state(&icm[y], k, uBEGIN_ST, offset);
          /* BEGIN states take their transitions from the DEL_ST row */
          copy_state_transitions(&icm[y], cm->nd[k].tmx[DEL_ST], tflags);

          /* continuation of the above commentary. If we're a right BEGIN_ST,
           * then we pop the state index of our parent bifurc off the pda
           */
          if (cm->nd[k].type == BEGINR_NODE)
            {
              Pop_m2ali(bifstack, &bifidx, (int *) NULL, (struct align_s **) NULL);
              icm[bifidx].tmx[1] = y;
            }
          offset--;
          y++;
        }

      if (fflags & uMATP_ST)
        {
          fill_state(&icm[y], k, uMATP_ST, offset);
          copy_pairwise_emissions(&icm[y], (double *) cm->nd[k].mp_emit, rfreq);
          copy_state_transitions(&icm[y], cm->nd[k].tmx[MATP_ST], tflags);
          offset--;
          y++;
        }

      if (fflags & uMATL_ST)
        {
          fill_state(&icm[y], k, uMATL_ST, offset);
          copy_singlet_emissions(&icm[y], cm->nd[k].ml_emit, rfreq);
          copy_state_transitions(&icm[y], cm->nd[k].tmx[MATL_ST], tflags);
          offset--;
          y++;
        }

      if (fflags & uMATR_ST)
        {
          fill_state(&icm[y], k, uMATR_ST, offset);
          copy_singlet_emissions(&icm[y], cm->nd[k].mr_emit, rfreq);
          copy_state_transitions(&icm[y], cm->nd[k].tmx[MATR_ST], tflags);
          offset--;
          y++;
        }

      /* Insert states get offset 0: their first connection is themselves */
      if (fflags & uINSL_ST)
        {
          fill_state(&icm[y], k, uINSL_ST, 0);
          copy_singlet_emissions(&icm[y], cm->nd[k].il_emit, rfreq);
          copy_state_transitions(&icm[y], cm->nd[k].tmx[INSL_ST], tflags);
          y++;
        }

      if (fflags & uINSR_ST)
        { /* beware an asymmetry: INSR->INSL transits are disallowed */
          fill_state(&icm[y], k, uINSR_ST, 0);
          copy_singlet_emissions(&icm[y], cm->nd[k].ir_emit, rfreq);
          copy_state_transitions(&icm[y], cm->nd[k].tmx[INSR_ST], tflags & ~uINSL_ST);
          y++;
        }

      /* End states must be added */
      if (cm->nd[k].nxt == -1)
        {
          fill_state(&icm[y], -1, uEND_ST, 0);
          y++;
        }

    } /* end loop over nodes */

  Free_m2ali(bifstack);

  /* Return some of the alloc'ed memory.
   * If the shrinking realloc fails, the original (larger) array is
   * still valid and is handed back instead.
   */
  smallicm = (struct istate_s *) realloc (icm, y * sizeof(struct istate_s));
  *ret_icm = (smallicm != NULL) ? smallicm : icm;
  *ret_statenum = y;
  return 1;
}

/* Function: fill_state()
 *
 * Purpose:  fill in values in a state: node index, state type unique
 *           identifier, and offset to first of the next states.
 *
 *           transition and emission probabilities are dealt with
 *           elsewhere; the number of connections (connectnum) is set
 *           by copy_state_transitions(), not here.
 *
 */
static void
fill_state(struct istate_s *st,
           int              nodeidx,
           int              statetype,
           int              offset)
{
  st->nodeidx   = nodeidx;
  st->statetype = statetype;
  st->offset    = offset;
}

/* Function: copy_singlet_emissions()
 *
 * Purpose:  Copy a singlet emission vector into a state structure,
 *           converting the probabilities into integer log odds:
 *           each entry is ILOG2(emvec[x] / rfreq[x]).
 */
static void
copy_singlet_emissions(struct istate_s *st, double *emvec, double *rfreq)
{
  int x;

  for (x = 0; x < ALPHASIZE; x++)
    st->emit[x] = ILOG2(emvec[x] / rfreq[x]);
}

/* Function: copy_pairwise_emissions()
 *
 * Purpose:  Copy a pairwise emission table into a state structure,
 *           converting the probabilities into integer log odds.
 *           Beware the funny business with the pairwise emission
 *           array; it was mp_emit[4][4], now cast to a pointer,
 *           and accessed like a vector.
 *           The background for flat index x is the product of the
 *           two single-symbol frequencies rfreq[x % ALPHASIZE] and
 *           rfreq[x / ALPHASIZE].
 */
static void
copy_pairwise_emissions(struct istate_s *st, double *em, double *rfreq)
{
  int x;

  for (x = 0; x < ALPHASIZE * ALPHASIZE; x++)
    st->emit[x] = ILOG2(em[x] / (rfreq[x % ALPHASIZE] * rfreq[x / ALPHASIZE]));
}

/* Function: copy_state_transitions()
 *
 * Purpose:  Copy a state transition vector from a CM into a state
 *           structure, copying only the used state transitions
 *           as given by tflags. The state transition vector is
 *           rearranged for an optimization: transits to INSL, INSR
 *           are placed first.
*/
static void
copy_state_transitions(struct istate_s *st,
                       double          *tvec,
                       int              tflags)
{
  int stx;                      /* counter for state vector */

  /* Packed order of st->tmx: INSL, INSR, then one slot shared by
   * DEL/BIFURC/BEGIN/END (all read from tvec[DEL_ST]), then MATP,
   * MATL, MATR. Probabilities are converted with ILOG2().
   */
  stx = 0;
  if (tflags & uINSL_ST)
    {
      st->tmx[stx] = ILOG2(tvec[INSL_ST]);
      stx++;
    }

  if (tflags & uINSR_ST)
    {
      st->tmx[stx] = ILOG2(tvec[INSR_ST]);
      stx++;
    }

  if (tflags & uDEL_ST ||
      tflags & uBIFURC_ST ||
      tflags & uBEGIN_ST ||
      tflags & uEND_ST)
    {
      st->tmx[stx] = ILOG2(tvec[DEL_ST]);
      stx++;
    }

  if (tflags & uMATP_ST)
    {
      st->tmx[stx] = ILOG2(tvec[MATP_ST]);
      stx++;
    }

  if (tflags & uMATL_ST)
    {
      st->tmx[stx] = ILOG2(tvec[MATL_ST]);
      stx++;
    }

  if (tflags & uMATR_ST)
    {
      st->tmx[stx] = ILOG2(tvec[MATR_ST]);
      stx++;
    }

  /* number of transitions actually stored */
  st->connectnum = stx;
}

/* Function: MakePCM()
 *
 * Purpose:  Like RearrangeCM(), but leaving the model
 *           in floating-point probabilities in struct pstate_s
 *           structures.
 *
 * Args:     cm           - a covariance model, probability form
 *           ret_pcm      - RETURN: array of pstate_s structures for states
 *                          in model. Contains a state 0 for the root.
 *                          end states are explicit.
 *           ret_statenum - number of states in ret_pcm, 0..statenum-1
 *
 * Return:   1 on success, 0 on failure (the initial calloc failed).
 *           ret_pcm is malloc'ed here and must be free'ed by caller;
 *           use free(*ret_pcm).
 */
int
MakePCM(struct cm_s      *cm,
        struct pstate_s **ret_pcm,
        int              *ret_statenum)
{
  struct pstate_s   *pcm;       /* new states-based model */
  struct pstate_s   *smallpcm;  /* streamlined (realloc'ed) pcm */
  struct intstack_s *bifstack;  /* pda for deferring bifurc connection assignment */
  int                bifidx;    /* state index of a BEGINR's parent bifurc */
  int                k;         /* counter for nodes */
  int                y;         /* counter for states */
  int                tflags;    /* flags for which to state transitions are used */
  int                fflags;    /* flags for which from state transitions are used */
  int                offset;    /* offset to next connected state */

  /* We know we can fit the new model into cm->nodes * STATETYPES
   * states. We'll give back the excess memory later.
   */
  if ((pcm = (struct pstate_s *) calloc ((cm->nodes * STATETYPES), sizeof(struct pstate_s))) == NULL)
    return 0;

  bifstack = InitIntStack();

  y = 0;
  for (k = 0; k < cm->nodes; k++)
    {
      /* figure out what we're connected to */
      if (cm->nd[k].nxt == -1)
        tflags = uEND_ST;
      else
        {
          switch (cm->nd[cm->nd[k].nxt].type) {
          case BIFURC_NODE: tflags = uBIFURC_ST; break;
          case MATP_NODE:   tflags = uDEL_ST | uMATP_ST | uMATR_ST | uMATL_ST; break;
          case MATL_NODE:   tflags = uDEL_ST | uMATL_ST; break;
          case MATR_NODE:   tflags = uDEL_ST | uMATR_ST; break;
          case BEGINL_NODE: tflags = uBEGIN_ST; break;
          case BEGINR_NODE: tflags = uBEGIN_ST; break;
          default: Die("no such node type %d", cm->nd[cm->nd[k].nxt].type);
          }
        }

      /* figure out what we're coming from */
      switch (cm->nd[k].type) {
      case BIFURC_NODE:
        fflags = uBIFURC_ST;
        offset = 1;
        break;

      case MATP_NODE:
        fflags = uDEL_ST | uMATP_ST | uMATL_ST | uMATR_ST | uINSL_ST | uINSR_ST;
        tflags |= uINSL_ST | uINSR_ST;
        offset = 4;
        break;

      case MATL_NODE:
        fflags = uDEL_ST | uMATL_ST | uINSL_ST;
        tflags |= uINSL_ST;
        offset = 2;
        break;

      case MATR_NODE:
        fflags = uDEL_ST | uMATR_ST | uINSR_ST;
        tflags |= uINSR_ST;
        offset = 2;
        break;

      case BEGINL_NODE:
        fflags = uBEGIN_ST;
        offset = 1;
        break;

      case BEGINR_NODE:
        fflags = uBEGIN_ST | uINSL_ST;
        offset = 1;
        tflags |= uINSL_ST;
        break;

      case ROOT_NODE:
        fflags = uBEGIN_ST | uINSL_ST | uINSR_ST;
        offset = 1;
        tflags |= uINSL_ST | uINSR_ST;
        break;

      default: Die("No such node type %d\n", cm->nd[k].type);
      }

      /* The first state of a node is exactly one of DEL, BIFURC or BEGIN */
      if (fflags & uDEL_ST)
        {
          fill_pstate(&pcm[y], k, uDEL_ST, offset);
          copy_pstate_transitions(&pcm[y], cm->nd[k].tmx[DEL_ST], tflags);
          offset--;
          y++;
        }
      else if (fflags & uBIFURC_ST)
        {
          fill_pstate(&pcm[y], k, uBIFURC_ST, offset);
          /* We defer the assignment of bifr: the bifurc's own state
           * index is stacked until its right BEGIN child is reached
           */
          PushIntStack(bifstack, y);
          y++;
        }
      else if (fflags & uBEGIN_ST)
        {
          fill_pstate(&pcm[y], k, uBEGIN_ST, offset);
          copy_pstate_transitions(&pcm[y], cm->nd[k].tmx[DEL_ST], tflags);

          /* If we're a right BEGIN_ST,
           * then we pop the state index of our parent bifurc off the pda
           * and record this state's index as that bifurc's bifr
           */
          if (cm->nd[k].type == BEGINR_NODE)
            {
              PopIntStack(bifstack, &bifidx);
              pcm[bifidx].bifr = y;
            }
          offset--;
          y++;
        }

      /* Emissions are copied as raw probabilities (no log-odds here) */
      if (fflags & uMATP_ST)
        {
          fill_pstate(&pcm[y], k, uMATP_ST, offset);
          memcpy(pcm[y].emit, cm->nd[k].mp_emit, sizeof(double) * ALPHASIZE * ALPHASIZE);
          copy_pstate_transitions(&pcm[y], cm->nd[k].tmx[MATP_ST], tflags);
          offset--;
          y++;
        }

      if (fflags & uMATL_ST)
        {
          fill_pstate(&pcm[y], k, uMATL_ST, offset);
          memcpy(pcm[y].emit, cm->nd[k].ml_emit, sizeof(double) * ALPHASIZE);
          copy_pstate_transitions(&pcm[y], cm->nd[k].tmx[MATL_ST], tflags);
          offset--;
          y++;
        }

      if (fflags & uMATR_ST)
        {
          fill_pstate(&pcm[y], k, uMATR_ST, offset);
          memcpy(pcm[y].emit, cm->nd[k].mr_emit, sizeof(double) * ALPHASIZE);
          copy_pstate_transitions(&pcm[y], cm->nd[k].tmx[MATR_ST], tflags);
          offset--;
          y++;
        }

      if (fflags & uINSL_ST)
        {
          fill_pstate(&pcm[y], k, uINSL_ST, 0);
          memcpy(pcm[y].emit, cm->nd[k].il_emit, sizeof(double) * ALPHASIZE);
          copy_pstate_transitions(&pcm[y], cm->nd[k].tmx[INSL_ST], tflags);
          y++;
        }

      if (fflags & uINSR_ST)
        { /* beware an asymmetry: INSR->INSL transits are disallowed */
          fill_pstate(&pcm[y], k, uINSR_ST, 0);
          memcpy(pcm[y].emit, cm->nd[k].ir_emit, sizeof(double) * ALPHASIZE);
          copy_pstate_transitions(&pcm[y], cm->nd[k].tmx[INSR_ST], tflags & ~uINSL_ST);
          y++;
        }

      /* End states must be added */
      if (cm->nd[k].nxt == -1)
        {
          fill_pstate(&pcm[y], -1, uEND_ST, 0);
          y++;
        }

    } /* end loop over nodes */

  FreeIntStack(bifstack);

  /* Return some of the alloc'ed memory; on realloc failure the
   * original array is still valid and is returned instead
   */
  smallpcm = (struct pstate_s *) realloc (pcm, y * sizeof(struct pstate_s));
  *ret_pcm = (smallpcm != NULL) ? smallpcm : pcm;
  *ret_statenum = y;
  return 1;
}

/* Function: copy_pstate_transitions()
 *
 * Purpose:  Copy a state transition vector from a CM into a state
 *           structure, copying only the used state transitions
 *           as given by tflags. The state transition vector is
 *           rearranged for an optimization: transits to INSL, INSR
 *           are placed first.
*/ static void copy_pstate_transitions(struct pstate_s *st, double *tvec, int tflags) { int stx; /* counter for state vector */ stx = 0; if (tflags & uINSL_ST) { st->tmx[stx] = tvec[INSL_ST]; stx++; } if (tflags & uINSR_ST) { st->tmx[stx] = tvec[INSR_ST]; stx++; } if (tflags & uDEL_ST || tflags & uBIFURC_ST || tflags & uBEGIN_ST || tflags & uEND_ST) { st->tmx[stx] = tvec[DEL_ST]; stx++; } if (tflags & uMATP_ST) { st->tmx[stx] = tvec[MATP_ST]; stx++; } if (tflags & uMATL_ST) { st->tmx[stx] = tvec[MATL_ST]; stx++; } if (tflags & uMATR_ST) { st->tmx[stx] = tvec[MATR_ST]; stx++; } st->connectnum = stx; } /* Function: fill_pstate() * * Purpose: fill in values in a state: node index, state type unique * identifier, offset to first of the next states. * * transition and emission probabilities are dealt with * elsewhere. * */ static void fill_pstate(struct pstate_s *st, int nodeidx, int statetype, int offset) { st->nodeidx = nodeidx; st->statetype = statetype; st->offset = offset; } /* Function: NormalizePCM() * * Purpose: Make damn sure a probability-form, states-based CM is * properly normalized. Workaround for a bug! */ void NormalizePCM(struct pstate_s *pcm, int M) { int y; for (y = 0; y < M; y++) { /* emission distributions */ switch (pcm[y].statetype) { case uMATP_ST: DNorm(pcm[y].emit, ALPHASIZE * ALPHASIZE); break; case uMATL_ST: case uMATR_ST: case uINSL_ST: case uINSR_ST: DNorm(pcm[y].emit, ALPHASIZE); break; } /* transition distributions */ if (pcm[y].statetype != uBIFURC_ST) DNorm(pcm[y].tmx, pcm[y].connectnum); } } tRNAscan-SE-2.0/src/smallviterbi.c0000644000543100007160000006030711021467305016267 0ustar pchanlowelab/* smallviterbi.c * Mon Jan 24 11:38:21 1994 * * Small-memory version of viterbi.c * * In the two-matrix version of the alignment algorithm, we keep * a matrix for the BEGIN states (B matrix) and a full matrix for all * the scores (A matrix). 
The database scanning version of the algorithm takes * advantage of the fact that, for scoring, we don't need to keep * a full cube around for matrix A; we only need the current row and the * last row. We only need a full cube of information for the BEGIN * state scores, which we get from the B matrix. * * This trick saves a large amount of memory, depending on the structure * of the model. (The fewer bifurcations and begins in the model, the more * memory this trick can save.) Unfortunately, it gives up the ability * to trace back and recover an alignment. * * In this module, we add back just enough information to the scanning * algorithm to enable a traceback, at the expense of re-doing some * calculation. The goal is to be able to fit 200-400 nt RNA sequences * into memory. * * I will refer to a model "segment". A segment is a linear (unbranched) * chunk of the model, starting at a BEGIN/ROOT state, ending at a * BIFURC/END state. * * Matrix cells now carry traceback information. This number, tback, indicates * the i,j coords that this segment's BIFURC/END aligns to. * tback is determined recursively. A BIFURC/END's tback points to itself. * All other tbacks are copied from the state that is connected to by * the maximally likely path. * * A traceback is done by using bmx as a framework, and recalculating bits * of the alignment in between BEGINs and BIFURC/ENDs. Thus, if * we know that a BEGIN aligns to i1, j1, we can get two numbers i2,j2 from * the BEGIN's tback, and know that the segment aligns to the sequence * (i1..i2-1)(j2+1..j1). Then we can recalculate the alignment of this * model segment to that subsequence, and reconstruct a full traceback. * * tback is a single unsigned integer. The two numbers i,j are restricted * to 16 bits and are packed into tback by bit-shifting, i<<16. 
* * */ /* * amx and atr are [j = 0..N] [diff = 0..j] [y = 0..statenum] * diff == 0 is for off-diagonal boundary conditions (this is why diff is shifted +1) * diff == 1 is for the diagonal, i==j * * bmx and btr are [y = 0..statenum] [j = 0..N] [ diff = 0..j] * a j,diff matrix exists only where y is a BEGIN state * * An optimization is made which requires END states to be explicitly * added, so statenum (the number of states in the integer model) * is *inclusive* of ENDs. */ #include #include #include #include #include "squid.h" #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif /* This is how we pack tracebacks into a single machine word. * We assume a 32 bit int. * If you're porting this code, all you have to do is make sure * pack_tb() puts two 16-bit ints in one data type TBACK, and unpack_tb() * gets them back. */ typedef unsigned int TBACK; #define pack_tb(i,j) ((i)<<16 | (j)) #define PACKED_I 0xFFFF0000U #define PACKED_J 0x0000FFFFU static void unpack_tb(TBACK tback, int *ret_i, int *ret_j) { *ret_j = tback & PACKED_J; *ret_i = (tback & PACKED_I) >> 16; } static int allocate_mx(struct istate_s *icm,int statenum, int seqlen, int ****ret_amx, TBACK ****ret_atr, int ****ret_bmx, TBACK ****ret_btr); static int init_mx (struct istate_s *icm, int statenum, int N, int ***amx, TBACK ***atr, int ***bmx, TBACK ***btr); static int recurse_mx (struct istate_s *icm, int statenum, char *seq, int N, int ***amx, TBACK ***atr, int ***bmx, TBACK ***btr); static int trace_mx (struct istate_s *icm, char *seq, int N, int ***bmx, TBACK ***btr, struct trace_s **ret_tr); static void free_mx (int ***amx, TBACK ***atr, int ***bmx, TBACK ***btr, int statenum, int seqlen); #ifdef DEBUG static void print_tb(int tb); static void print_small_mx(FILE *fp, struct istate_s *icm, int statenum, char *seq, int N, int ***bmx, TBACK ***btr); #endif /* DEBUG */ /* Function: SmallViterbiAlign() * * Purpose: Align a sequence to a model, using the small-memory * 
variant of the alignment algorithm. Return the score * of the alignment and the traceback. * * Args: icm - the model to align sequence to * statenum = number of states in icm * seq - sequence to align model to * ret_score - RETURN: global alignment score * ret_trace - RETURN: traceback tree * * Return: 1 on success, 0 on failure. */ int SmallViterbiAlign(struct istate_s *icm, int statenum, char *seq, double *ret_score, struct trace_s **ret_trace) { int ***amx; /* the main score matrix */ TBACK ***atr; /* amx's traceback pointers */ int ***bmx; /* the BEGIN score matrix */ TBACK ***btr; /* bmx's traceback pointers */ int N; /* length of sequence */ N = strlen(seq); seq--; /* convert to 1..N. Ugh! */ if (! allocate_mx(icm, statenum, N, &amx, &atr, &bmx, &btr)) return 0; #ifdef DEBUG printf("allocated matrices\n"); #endif if (! init_mx(icm, statenum, N, amx, atr, bmx, btr)) return 0; #ifdef DEBUG printf("matrices initialized\n"); print_small_mx(stdout, icm, statenum, seq, N, bmx, btr); #endif if (! recurse_mx(icm, statenum, seq, N, amx, atr, bmx, btr)) return 0; #ifdef DEBUG printf("recursion finished\n"); print_small_mx(stdout, icm, statenum, seq, N, bmx, btr); #endif *ret_score = ((double) bmx[0][N][N] / INTPRECISION); #ifdef DEBUG printf("have a score of %.2f, starting traceback\n", *ret_score); #endif if (! trace_mx(icm, seq, N, bmx, btr, ret_trace)) return 0; #ifdef DEBUG printf("trace complete\n"); #endif free_mx(amx, atr, bmx, btr, statenum, N); return 1; } /* Function: allocate_mx() * * Purpose: Malloc space for the score matrices. * amx and atr are indexed as j, i, y. * bmx and btr are indexed as k, j, i. * In the two sequence dimensions j, i they are * diagonal (+1 off diagonal) matrices with * rows j = 0..N, i = 1..j+1. * In the node dimension k bmx and btr are k = 0..M. * In the state dimension y amx and atr are y = 0..numstates. 
* * Args: icm - the int, log-odds, state-based model * statenum - number of states in model * seqlen - length of sequence * ret_amx - RETURN: main score matrix * ret_atr - RETURN: amx's traceback pointers * ret_bmx - RETURN: BEGIN/BIFURC/END score matrix * ret_btr - RETURN: bmx's traceback pointers * * Return: Ptr to allocated scoring matrix, or * dies and exits. */ static int allocate_mx(struct istate_s *icm, int statenum, int seqlen, int ****ret_amx, TBACK ****ret_atr, int ****ret_bmx, TBACK ****ret_btr) { int ***amx; TBACK ***atr; int ***bmx; TBACK ***btr; int diag, j, y; /* Main matrix, amx: fastest varying index is y (j,i,y) * we only keep two rows for j, 0 and 1. */ /* malloc for j = 0..1 rows */ if ((amx = (int ***) malloc (2 * sizeof(int **))) == NULL || (atr = (TBACK ***) malloc (2 * sizeof(TBACK **))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); for (j = 0; j <= 1; j++) /* loop over rows j = 0..1 */ { /* malloc for diag = 0..j (0..seqlen) cols */ if ((amx[j] = (int **) malloc ((seqlen + 1) * sizeof(int *))) == NULL || (atr[j] = (TBACK **) malloc ((seqlen + 1) * sizeof(TBACK *))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); /* loop over cols diag = 0..seqlen */ for (diag = 0; diag <= seqlen; diag++) /* malloc for y = 0..statenum-1 decks */ if ((amx[j][diag] = (int *) malloc ((statenum) * sizeof (int ))) == NULL || (atr[j][diag] = (TBACK *) malloc ((statenum) * sizeof (TBACK))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); } /* B auxiliary matrices: fastest varying index is diag (y,j,diag) * bmx, btr keeps score, traceback decks for BEGIN states */ /* 0..statenum-1 decks */ if ((bmx = (int ***) malloc (statenum * sizeof(int **))) == NULL || (btr = (TBACK ***) malloc (statenum * sizeof(TBACK **))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); for (y = 0; y < statenum; y++) { bmx[y] = NULL; btr[y] = NULL; /* we keep score info for BEGIN and 
BIFURC states */ if (icm[y].statetype == uBEGIN_ST || icm[y].statetype == uBIFURC_ST) { /* j= 0..seqlen rows */ if ((bmx[y] = (int **) malloc ((seqlen+1) * sizeof(int *))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); /* i = 0..j columns */ for (j = 0; j <= seqlen; j++) if ((bmx[y][j] = (int *) malloc ((j+1) * sizeof(int ))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); } /* We keep traceback info only for BEGIN states */ if (icm[y].statetype == uBEGIN_ST) { /* j= 0..seqlen rows */ if ((btr[y] = (TBACK **) malloc ((seqlen+1) * sizeof(TBACK *))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); /* i = 0..j columns */ for (j = 0; j <= seqlen; j++) if ((btr[y][j] = (TBACK *) malloc ((j+1) * sizeof(TBACK))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); } } *ret_amx = amx; *ret_atr = atr; *ret_bmx = bmx; *ret_btr = btr; return 1; } /* Function: free_mx() * * Purpose: Free the space allocated to the scoring and traceback matrices. * Precisely mirrors the allocations above in allocate_cvmx(). * * Return: (void) */ static void free_mx(int ***amx, TBACK ***atr, int ***bmx, TBACK ***btr, int statenum, int seqlen) { int diag, j, y; /* Free the main matrix, amx: * amx[j][i][y] = [0..1] [0..seqlen] [0..statenum-1] */ for (j = 0; j <= 1; j++) { for (diag = 0; diag <= seqlen; diag++) { free(amx[j][diag]); free(atr[j][diag]); } free(amx[j]); free(atr[j]); } free(amx); free(atr); /* Free the auxiliary matrices, bmx and btr * bmx[y][j][i] = [0..statenum-1] [0..seqlen] [0..seqlen] */ for (y = 0; y < statenum; y++) { if (bmx[y] != NULL) { for (j = 0; j <= seqlen; j++) free(bmx[y][j]); free(bmx[y]); } if (btr[y] != NULL) { for (j = 0; j <= seqlen; j++) free(btr[y][j]); free(btr[y]); } } free(bmx); free(btr); } /* Function: init_mx() * * Purpose: Initialization of the scoring matrices. We initialize the off-diagonal, * the diagonal, and the "floor" (end states) of the cube. 
*
 * Return:   1 on success, 0 on failure.
 *           (The body below has no failure path; it always returns 1.)
 */
static int
init_mx(struct istate_s *icm,        /* integer model             */
        int              statenum,   /* number of states in icm   */
        int              N,          /* length of seq             */
        int           ***amx,
        TBACK         ***atr,
        int           ***bmx,
        TBACK         ***btr)
{
  int diag, j, y;               /* counters for indices over the cvmx */
  int ynext;                    /* index of next state k+1            */
  int *beam;                    /* z-axis vector of numbers in amx    */

  /* Init the whole amx to -Infinity. We do this with memcpy, trying
   * to be fast. We fill in j=0,diag=0 by hand, then memcpy() the other
   * columns.
   */
  for (y = 0; y < statenum; y++)
    amx[0][0][y] = amx[1][0][y] = NEGINFINITY;
  for (diag = 1; diag <= N; diag++)
    {
      memcpy(amx[0][diag], amx[0][0], statenum * sizeof(int));
      memcpy(amx[1][diag], amx[0][0], statenum * sizeof(int));
    }

  /* atr END and BIFURC traceback pointers point to themselves.
   * just set everything to point at itself.
   * Here j is only the rolling row index (0 or 1), not a sequence
   * position; recurse_mx() rewrites the END/BIFURC entries per row.
   */
  for (j = 0; j <= 1; j++)
    for (diag = 0; diag <= N; diag++)
      for (y = 0; y < statenum; y++)
        atr[j][diag][y] = pack_tb(diag, j);

  /* Init the whole bmx to -Inf. We know state 0 is a begin (it's ROOT), so we
   * start there, and memcpy rows as needed.
   * Row N of state 0 is the longest row (N+1 cells), so it serves as
   * the template for every shorter row.
   */
  for (diag = 0; diag <= N; diag++)
    bmx[0][N][diag] = NEGINFINITY;
  for (j = 0; j < N; j++)
    memcpy(bmx[0][j], bmx[0][N], (j+1) * sizeof(int));

  for (y = 1; y < statenum; y++)
    if (bmx[y] != NULL)
      for (j = 0; j <= N; j++)
        memcpy(bmx[y][j], bmx[0][N], (j+1) * sizeof(int));

  /* Set all btr traceback ptrs to point at themselves */
  for (y = 0; y < statenum; y++)
    if (btr[y] != NULL)
      for (j = 0; j <= N; j++)
        for (diag = 0; diag <= j; diag++)
          btr[y][j][diag] = pack_tb(diag,j);

  /* Init the off-diagonal (j = 0..N; diag == 0) with -log P scores.
   * End state = 0;
   * del, bifurc states are calc'ed
   * begin states same as del's
   * THIS IS WASTEFUL AND SHOULD BE CHANGED.
   */
  for (j = 0; j <= N; j++)
    for (y = statenum-1; y >= 0; y--)   /* descending: states y connects to are already set */
      {
        if (icm[y].statetype == uEND_ST)
          amx[j%2][0][y] = 0;

        else if (icm[y].statetype == uBIFURC_ST)
          /* tmx[0] / tmx[1] hold the state indices of the two BEGIN children */
          amx[j%2][0][y] = bmx[icm[y].tmx[0]][j][0] + bmx[icm[y].tmx[1]][j][0];

        else if (icm[y].statetype == uDEL_ST || icm[y].statetype == uBEGIN_ST)
          {
            /* only calc DEL-DEL and BEGIN-DEL transitions. Since
             * we optimized the state transition tables, removing
             * the unused ones, we don't know where the number
             * for "to DEL" is! But we can find it, because it'll
             * be the connection to a non-infinite score
             */
            beam = amx[j%2][0] + y + icm[y].offset;
            for (ynext = 0; ynext < icm[y].connectnum; ynext++)
              {
                if (*beam != NEGINFINITY)
                  amx[j%2][0][y] = *beam + icm[y].tmx[ynext];
                beam++;
              }
          }

        /* make a copy into bmx if y is a BEGIN or BIFURC */
        if (icm[y].statetype == uBEGIN_ST ||
            icm[y].statetype == uBIFURC_ST )
          bmx[y][j][0] = amx[j%2][0][y];
      }

  return 1;
}

/* Function: recurse_mx()
 *
 * Purpose:  Carry out the fill stage of the dynamic programming
 *           algorithm.
 *
 * Returns:  1 on success, 0 on failure.
*/
/* Implementation notes for recurse_mx():
 *   - amx/atr are indexed [j % 2][diff][state]; only two j rows are used
 *     here, the current one (aj) and the previous one (!aj).
 *   - bmx/btr are indexed [state][j][diff] and are written for BEGIN
 *     states (score and traceback) and BIFURC states (score only).
 *   - States are visited from statenum-1 down to 0, so every state that
 *     y connects to (y + offset ... y + offset + connectnum - 1) is read
 *     after it has been computed for the cell it is taken from.
 */
static int
recurse_mx(struct istate_s *icm,      /* integer, state-form model */
           int              statenum, /* number of states in icm   */
           char            *seq,      /* sequence, 1..N            */
           int              N,        /* length of seq             */
           int           ***amx,      /* main scoring matrix       */
           TBACK         ***atr,      /* tracebacks for amx        */
           int           ***bmx,      /* bifurc scoring matrix     */
           TBACK         ***btr)      /* tracebacks for bmx        */
{
  int i, j, y;                  /* indices for 3 dimensions */
  int aj;                       /* 0 or 1, index for j in A matrices */
  int diff;                     /* loop counter for difference: diff = j-i + 1 */
  int symi, symj;               /* symbol indices for seq[i], seq[j] */
  int sc;                       /* tmp for a score */
  int ynext;                    /* index of next state y */
  int *beam;                    /* ptr to a beam (z-axis vector) */
  TBACK *beamt;                 /* ptr into connected traceback beam */
  int leftdiff;                 /* diff coord of BEGIN_L of a bifurc */
  int leftj;                    /* j coord of BEGIN_L of a bifurc */
  int **left_p;                 /* pointer into whole 2D deck of BEGINL's of a bifurc */
  int *right_p;                 /* ptr into row of BEGIN_R's of a bifurc */
  int *scp;                     /* score pointer: ptr into beam of scores being calc'ed */
  TBACK *sct;                   /* tback beam being calc'ed */
  struct istate_s *st;          /* state pointer: ptr at current state in icm */
  int *tmx;                     /* walks the transition scores of the current state */
  int emitsc;                   /* emission score added after the best transition is chosen */

  for (j = 1; j <= N; j++)
    {
      aj = j % 2;
      symj = SymbolIndex(seq[j]);

      /* we have to init END and BIF states to point at themselves in this
       * row of atr: row aj was last written for j-2, so its self-pointers
       * carry the wrong j.
       */
      for (diff = 0; diff <= j; diff++)
        for (y = 0; y < statenum; y++)
          if (icm[y].statetype == uBIFURC_ST || icm[y].statetype == uEND_ST)
            atr[aj][diff][y] = pack_tb(diff, j);

      for (diff = 1; diff <= j; diff++)
        {
          i = j - diff + 1;
          symi = SymbolIndex(seq[i]);

          /* scp, sct and st step backwards in lockstep with y */
          scp = &amx[aj][diff][statenum-1];
          sct = &atr[aj][diff][statenum-1];
          st = &icm[statenum-1];
          for (y = statenum-1; y >= 0; y--, scp--, sct--, st--)
            { /* loop over states */
              if (st->statetype != uBIFURC_ST) /* a normal (non-BIFURC) state */
                {
                  /* Connect the "beam" pointer to the appropriate
                   * starting place in the ynext scores we're connecting
                   * y to. The cell chosen depends on what the state
                   * emits: nothing (same cell), i and j (row j-1,
                   * diff-2), j only (row j-1, diff-1), or i only
                   * (row j, diff-1).
                   */
                  switch (st->statetype)
                    {
                    case uBEGIN_ST:
                    case uDEL_ST:
                      beam = amx[aj][diff];
                      beamt = atr[aj][diff];
                      emitsc = 0;
                      break;

                    case uMATP_ST: /* !aj toggles from 0 to 1 and vice versa */
                      /* diff == 1 has no diff-2 cell; the score is left as it is */
                      if (diff == 1) continue;
                      beam = amx[!aj][diff-2];
                      beamt = atr[!aj][diff-2];
                      emitsc = st->emit[symi * ALPHASIZE + symj];
                      break;

                    case uMATR_ST:
                    case uINSR_ST:
                      beam = amx[!aj][diff-1];
                      beamt = atr[!aj][diff-1];
                      emitsc = st->emit[symj];
                      break;

                    case uMATL_ST:
                    case uINSL_ST:
                      beam = amx[aj][diff-1];
                      beamt = atr[aj][diff-1];
                      emitsc = st->emit[symi];
                      break;

                    case uEND_ST:
                      /* END cells are never written here */
                      continue;

                    default:
                      Die("no such state type %d", st->statetype);
                    }
                  beam += y + st->offset;
                  beamt += y + st->offset;
                  tmx = st->tmx;

                  /* Init for ynext == 0 case */
                  *scp = *beam + *tmx;
                  *sct = *beamt;

                  /* Calculate remaining cases. The outer test skips the
                   * addition when the connected score alone does not
                   * beat the current best.
                   * NOTE(review): that shortcut only matches the inner
                   * test if transition scores are never positive --
                   * confirm against the model builder.
                   */
                  for (ynext = 1; ynext < st->connectnum; ynext++)
                    {
                      beam++;
                      beamt++;
                      tmx++;
                      if (*beam > *scp)
                        {
                          sc = *beam + *tmx;
                          if (sc > *scp)
                            {
                              *scp = sc;
                              *sct = *beamt;
                            }
                        }
                    }

                  /* Add emission scores now */
                  *scp += emitsc;

                  /* Make a copy into bmx, btr if necessary */
                  if (st->statetype == uBEGIN_ST)
                    {
                      bmx[y][j][diff] = *scp;
                      btr[y][j][diff] = *sct;
                    }
                } /* end block of normal state stuff */

              else /* a BIFURC state */
                {
                  /* tmx[0] and tmx[1] are used as state indices of the
                   * two child decks in bmx. The loop tries every split
                   * of the span: the left child at (leftj, leftdiff),
                   * the right child on row j at increasing diff.
                   */
                  leftdiff = diff;
                  leftj = j;
                  right_p = bmx[st->tmx[1]][j];
                  left_p = bmx[st->tmx[0]];

                  /* init w/ case that left branch emits it all */
                  *scp = left_p[leftj][leftdiff] + *right_p;
                  while (leftdiff > 0)
                    {
                      leftdiff--;
                      leftj--;
                      right_p++;

                      sc = left_p[leftj][leftdiff] + *right_p;
                      if (sc > *scp)
                        *scp = sc;
                    }
                  /* keep copy of score in bmx, for tracing */
                  bmx[y][j][diff] = *scp;
                }
            } /* end loop over states */
        } /* end loop over diff */
    } /* end loop over j */
  return 1;
}

/* Function: trace_mx()
 *
 * Purpose:  Trace stage of the dynamic programming: starting
 *           at j=N, i=1, k=0/BEGIN, trace back the optimal
 *           path. Returns a binary tree, ret_trace.
 *           Caller is responsible for free'ing ret_trace.
*/ static int trace_mx(struct istate_s *icm, /* the model to align */ char *seq, /* sequence to align it to 1..N */ int N, int ***bmx, /* matrix of BEGIN scores */ TBACK ***btr, /* matrix of BIFURC/END tbacks */ struct trace_s **ret_trace) /* RETURN: the traceback tree */ { struct trace_s *tr; /* the traceback tree under construction */ struct trace_s *curr_tr; /* ptr to node of tr we're working on */ struct tracestack_s *dolist; /* pushdown stack of active tr nodes */ int diff,i, j; /* coords in mx (0..N) */ int y; /* counter for states (0..statenum-1) */ int leftdiff; int leftj; int *right_p; int i2, j2; /* what's left unaccounted for at segment end */ int diff2; int end_y; /* index of state that ends segment */ /* Initialize. * Start at i = 1, j = N and work towards diagonal */ InitTrace(&tr, NULL); /* start a trace tree */ dolist = InitTracestack(); /* start a stack for traversing the trace tree */ curr_tr = AttachTrace(tr, NULL, 0, N-1, 0, BEGIN_ST); PushTracestack(dolist, curr_tr); /* Recursion. While there's active nodes in the stack, trace from them. * * This is cribbed from recurse_cvmx(); it's almost the exact reverse. * We know the best score, we just have to figure out where it came from. */ while ((curr_tr = PopTracestack(dolist)) != NULL) { /* get some useful numbers, mostly for clarity */ /* which is important, since we're sort of misusing * fields in the trace structures! */ i = curr_tr->emitl+1; j = curr_tr->emitr+1; y = curr_tr->nodeidx; diff = j - i + 1; /* find out which bifurc/end state terminates this segment */ end_y = y+1; while (icm[end_y].statetype != uBIFURC_ST && icm[end_y].statetype != uEND_ST) end_y++; /* find out i2,j2 that the terminal bifurc/end aligns to */ unpack_tb(btr[y][j][diff], &diff2, &j2); i2 = j2 - diff2 + 1; /* For now, just write out what the traceback looks like. 
*/ printf("Segment from state %d to %d: aligns to %d..%d/%d..%d\n", y, end_y, i, i2-1, j2+1, j); /* push next BEGINs onto stack; they are connected to BIFURC end_y */ if (icm[end_y].statetype == uBIFURC_ST) { if (i2 > j2) { PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2-1, j2-1, icm[end_y].tmx[1], BEGIN_ST)); PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2-1, j2-1, icm[end_y].tmx[0], BEGIN_ST)); } else { leftdiff = diff2; leftj = j2; right_p = bmx[icm[end_y].tmx[1]][j2]; while (leftdiff >= 0) { if (bmx[end_y][j2][diff] == bmx[icm[end_y].tmx[0]][leftj][leftdiff] + *right_p) { printf("found the bifurc: it is %d-%d and %d-%d\n", i2 -1, i2+leftdiff-2, i2 + leftdiff-1, j2-1); PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2 + leftdiff-1, j2-1, icm[end_y].tmx[1], BEGIN_ST)); PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2 -1, i2+leftdiff-2, icm[end_y].tmx[0], BEGIN_ST)); break; } leftdiff--; leftj--; right_p++; } if (leftdiff < 0) Die("bifurc reconstruction failed at ijy %d,%d,%d", i,j,y); } } } /* (while something is in the tracestack) */ FreeTracestack(dolist); *ret_trace = tr; return 1; } #ifdef DEBUG /* Function: print_tb() * * Purpose: Print the two numbers of a packed traceback. */ static void print_tb(int tb) { int i, j; unpack_tb(tb, &i, &j); printf("%d %d\n", i, j); } /* Function: PrintSmallMX() * * Purpose: Debugging output; print out the three-dimensional * auxiliary alignment matrix produced by the * small-memory version. 
*/ static void print_small_mx(FILE *fp, /* open file or just stdout/stderr */ struct istate_s *icm, /* the model to align */ int statenum, /* number of states in icm */ char *seq, /* sequence, 1..N */ int N, /* length of seq */ int ***bmx, /* auxiliary matrix */ TBACK ***btr) /* traceback ptrs for bmx */ { int j, diff, y; /* indices in 3D matrix */ int tbdiff, tbj; /* traceback pointers to a j, diff position */ for (y = 0; y < statenum; y++) if (bmx[y] != NULL) { fprintf(fp, "### B Matrix for state %d, type %d (%s), from node %d\n", y, icm[y].statetype, UstatetypeName(icm[y].statetype), icm[y].nodeidx); fprintf(fp, " "); for (diff = 0; diff <= N; diff++) fprintf(fp, "%6d ", diff); fprintf(fp, "\n"); for (j = 0; j <= N; j++) { fprintf(fp, "%c %3d ", (j > 0) ? seq[j] : (char) '*', j); for (diff = 0; diff <= j; diff++) fprintf(fp, "%6d ", bmx[y][j][diff]); fprintf(fp, "\n "); if (icm[y].statetype == uBEGIN_ST) for (diff = 0; diff <= j; diff++) { unpack_tb(btr[y][j][diff], &tbdiff, &tbj); fprintf(fp, "%3d/%3d ", tbdiff, tbj); } fprintf(fp, "\n"); } fprintf(fp, "\n\n"); } } #endif /* DEBUG */ tRNAscan-SE-2.0/src/gnuregex.h0000644000543100007160000004426511021467306015431 0ustar pchanlowelab/* Definitions for data structures and routines for the regular expression library, version 0.12. Copyright (C) 1985,1989,1990,1991,1992,1993 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef __REGEXP_LIBRARY_H__ #define __REGEXP_LIBRARY_H__ /* POSIX says that must be included (by the caller) before . */ #ifdef VMS /* VMS doesn't have `size_t' in , even though POSIX says it should be there. */ #include #endif /* The following bits are used to determine the regexp syntax we recognize. The set/not-set meanings are chosen so that Emacs syntax remains the value 0. The bits are given in alphabetical order, and the definitions shifted by one from the previous bit; thus, when we add or remove a bit, only one other definition need change. */ typedef unsigned reg_syntax_t; /* If this bit is not set, then \ inside a bracket expression is literal. If set, then such a \ quotes the following character. */ #define RE_BACKSLASH_ESCAPE_IN_LISTS (1) /* If this bit is not set, then + and ? are operators, and \+ and \? are literals. If set, then \+ and \? are operators and + and ? are literals. */ #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) /* If this bit is set, then character classes are supported. They are: [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. If not set, then character classes are not supported. */ #define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) /* If this bit is set, then ^ and $ are always anchors (outside bracket expressions, of course). If this bit is not set, then it depends: ^ is an anchor if it is at the beginning of a regular expression or after an open-group or an alternation operator; $ is an anchor if it is at the end of a regular expression, or before a close-group or an alternation operator. This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because POSIX draft 11.2 says that * etc. in leading positions is undefined. 
We already implemented a previous draft which made those constructs invalid, though, so we haven't changed the code back. */ #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) /* If this bit is set, then special characters are always special regardless of where they are in the pattern. If this bit is not set, then special characters are special only in some contexts; otherwise they are ordinary. Specifically, * + ? and intervals are only special when not after the beginning, open-group, or alternation operator. */ #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) /* If this bit is set, then *, +, ?, and { cannot be first in an re or immediately after an alternation or begin-group operator. */ #define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) /* If this bit is set, then . matches newline. If not set, then it doesn't. */ #define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) /* If this bit is set, then . doesn't match NUL. If not set, then it does. */ #define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) /* If this bit is set, nonmatching lists [^...] do not match newline. If not set, they do. */ #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) /* If this bit is set, either \{...\} or {...} defines an interval, depending on RE_NO_BK_BRACES. If not set, \{, \}, {, and } are literals. */ #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) /* If this bit is set, +, ? and | aren't recognized as operators. If not set, they are. */ #define RE_LIMITED_OPS (RE_INTERVALS << 1) /* If this bit is set, newline is an alternation operator. If not set, newline is literal. */ #define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) /* If this bit is set, then `{...}' defines an interval, and \{ and \} are literals. If not set, then `\{...\}' defines an interval. */ #define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) /* If this bit is set, (...) defines a group, and \( and \) are literals. If not set, \(...\) defines a group, and ( and ) are literals. 
*/ #define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) /* If this bit is set, then \ matches . If not set, then \ is a back-reference. */ #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) /* If this bit is set, then | is an alternation operator, and \| is literal. If not set, then \| is an alternation operator, and | is literal. */ #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) /* If this bit is set, then an ending range point collating higher than the starting range point, as in [z-a], is invalid. If not set, then when ending range point collates higher than the starting range point, the range is ignored. */ #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) /* If this bit is set, then an unmatched ) is ordinary. If not set, then an unmatched ) is invalid. */ #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) /* This global variable defines the particular regexp syntax to use (for some interfaces). When a regexp is compiled, the syntax used is stored in the pattern buffer, so changing this does not affect already-compiled regexps. */ extern reg_syntax_t re_syntax_options; /* Define combinations of the above bits for the standard possibilities. (The [[[ comments delimit what gets put into the Texinfo file, so don't delete them!) 
*/ /* [[[begin syntaxes]]] */ #define RE_SYNTAX_EMACS 0 #define RE_SYNTAX_AWK \ (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ | RE_NO_BK_PARENS | RE_NO_BK_REFS \ | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ | RE_UNMATCHED_RIGHT_PAREN_ORD) #define RE_SYNTAX_POSIX_AWK \ (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) #define RE_SYNTAX_GREP \ (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ | RE_NEWLINE_ALT) #define RE_SYNTAX_EGREP \ (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ | RE_NO_BK_VBAR) #define RE_SYNTAX_POSIX_EGREP \ (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ #define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC /* Syntax bits common to both basic and extended POSIX regex syntax. */ #define _RE_SYNTAX_POSIX_COMMON \ (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ | RE_INTERVALS | RE_NO_EMPTY_RANGES) #define RE_SYNTAX_POSIX_BASIC \ (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this isn't minimal, since other operators, such as \`, aren't disabled. */ #define RE_SYNTAX_POSIX_MINIMAL_BASIC \ (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) #define RE_SYNTAX_POSIX_EXTENDED \ (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ | RE_UNMATCHED_RIGHT_PAREN_ORD) /* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. 
*/ #define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ | RE_NO_BK_PARENS | RE_NO_BK_REFS \ | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) /* [[[end syntaxes]]] */ /* Maximum number of duplicates an interval can allow. Some systems (erroneously) define this in other header files, but we want our value, so remove any previous define. */ #ifdef RE_DUP_MAX #undef RE_DUP_MAX #endif #define RE_DUP_MAX ((1 << 15) - 1) /* POSIX `cflags' bits (i.e., information for `regcomp'). */ /* If this bit is set, then use extended regular expression syntax. If not set, then use basic regular expression syntax. */ #define REG_EXTENDED 1 /* If this bit is set, then ignore case when matching. If not set, then case is significant. */ #define REG_ICASE (REG_EXTENDED << 1) /* If this bit is set, then anchors do not match at newline characters in the string. If not set, then anchors do match at newlines. */ #define REG_NEWLINE (REG_ICASE << 1) /* If this bit is set, then report only success or fail in regexec. If not set, then returns differ between not matching and errors. */ #define REG_NOSUB (REG_NEWLINE << 1) /* POSIX `eflags' bits (i.e., information for regexec). */ /* If this bit is set, then the beginning-of-line operator doesn't match the beginning of the string (presumably because it's not the beginning of a line). If not set, then the beginning-of-line operator does match the beginning of the string. */ #define REG_NOTBOL 1 /* Like REG_NOTBOL, except for the end-of-line. */ #define REG_NOTEOL (1 << 1) /* If any error codes are removed, changed, or added, update the `re_error_msg' table in regex.c. */ typedef enum { REG_NOERROR = 0, /* Success. */ REG_NOMATCH, /* Didn't find a match (for regexec). */ /* POSIX regcomp return error codes. (In the order listed in the standard.) */ REG_BADPAT, /* Invalid pattern. */ REG_ECOLLATE, /* Not implemented. */ REG_ECTYPE, /* Invalid character class name. 
*/ REG_EESCAPE, /* Trailing backslash. */ REG_ESUBREG, /* Invalid back reference. */ REG_EBRACK, /* Unmatched left bracket. */ REG_EPAREN, /* Parenthesis imbalance. */ REG_EBRACE, /* Unmatched \{. */ REG_BADBR, /* Invalid contents of \{\}. */ REG_ERANGE, /* Invalid range end. */ REG_ESPACE, /* Ran out of memory. */ REG_BADRPT, /* No preceding re for repetition op. */ /* Error codes we've added. */ REG_EEND, /* Premature end. */ REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ } reg_errcode_t; /* This data structure represents a compiled pattern. Before calling the pattern compiler, the fields `buffer', `allocated', `fastmap', `translate', and `no_sub' can be set. After the pattern has been compiled, the `re_nsub' field is available. All other fields are private to the regex routines. */ struct re_pattern_buffer { /* [[[begin pattern_buffer]]] */ /* Space that holds the compiled pattern. It is declared as `unsigned char *' because its elements are sometimes used as array indexes. */ unsigned char *buffer; /* Number of bytes to which `buffer' points. */ unsigned long allocated; /* Number of bytes actually used in `buffer'. */ unsigned long used; /* Syntax setting with which the pattern was compiled. */ reg_syntax_t syntax; /* Pointer to a fastmap, if any, otherwise zero. re_search uses the fastmap, if there is one, to skip over impossible starting points for matches. */ char *fastmap; /* Either a translate table to apply to all characters before comparing them, or zero for no translation. The translation is applied to a pattern when it is compiled and to a string when it is matched. */ char *translate; /* Number of subexpressions found by the compiler. */ size_t re_nsub; /* Zero if this pattern cannot match the empty string, one else. 
Well, in truth it's used only in `re_search_2', to see whether or not we should use the fastmap, so we don't set this absolutely perfectly; see `re_compile_fastmap' (the `duplicate' case). */ unsigned can_be_null : 1; /* If REGS_UNALLOCATED, allocate space in the `regs' structure for `max (RE_NREGS, re_nsub + 1)' groups. If REGS_REALLOCATE, reallocate space if necessary. If REGS_FIXED, use what's there. */ #define REGS_UNALLOCATED 0 #define REGS_REALLOCATE 1 #define REGS_FIXED 2 unsigned regs_allocated : 2; /* Set to zero when `regex_compile' compiles a pattern; set to one by `re_compile_fastmap' if it updates the fastmap. */ unsigned fastmap_accurate : 1; /* If set, `re_match_2' does not return information about subexpressions. */ unsigned no_sub : 1; /* If set, a beginning-of-line anchor doesn't match at the beginning of the string. */ unsigned not_bol : 1; /* Similarly for an end-of-line anchor. */ unsigned not_eol : 1; /* If true, an anchor at a newline matches. */ unsigned newline_anchor : 1; /* [[[end pattern_buffer]]] */ }; typedef struct re_pattern_buffer regex_t; /* search.c (search_buffer) in Emacs needs this one opcode value. It is defined both in `regex.c' and here. */ #define RE_EXACTN_VALUE 1 /* Type for byte offsets within the string. POSIX mandates this. */ typedef int regoff_t; /* This is the structure we store register match data in. See regex.texinfo for a full description of what registers match. */ struct re_registers { unsigned num_regs; regoff_t *start; regoff_t *end; }; /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, `re_match_2' returns information about at least this many registers the first time a `regs' structure is passed. */ #ifndef RE_NREGS #define RE_NREGS 30 #endif /* POSIX specification for registers. Aside from the different names than `re_registers', POSIX uses an array of structures, instead of a structure of arrays. */ typedef struct { regoff_t rm_so; /* Byte offset from string's start to substring's start. 
*/ regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ } regmatch_t; /* Declarations for routines. */ /* To avoid duplicating every routine declaration -- once with a prototype (if we are ANSI), and once without (if we aren't) -- we use the following macro to declare argument types. This unfortunately clutters up the declarations a bit, but I think it's worth it. */ #if __STDC__ #define _RE_ARGS(args) args #else /* not __STDC__ */ #define _RE_ARGS(args) () #endif /* not __STDC__ */ /* Sets the current default syntax to SYNTAX, and return the old syntax. You can also simply assign to the `re_syntax_options' variable. */ extern reg_syntax_t re_set_syntax _RE_ARGS ((reg_syntax_t syntax)); /* Compile the regular expression PATTERN, with length LENGTH and syntax given by the global `re_syntax_options', into the buffer BUFFER. Return NULL if successful, and an error string if not. */ extern char *re_compile_pattern _RE_ARGS ((char *pattern, int length, struct re_pattern_buffer *buffer)); /* Compile a fastmap for the compiled pattern in BUFFER; used to accelerate searches. Return 0 if successful and -2 if was an internal error. */ extern int re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer)); /* Search in the string STRING (with length LENGTH) for the pattern compiled into BUFFER. Start searching at position START, for RANGE characters. Return the starting position of the match, -1 for no match, or -2 for an internal error. Also return register information in REGS (if REGS and BUFFER->no_sub are nonzero). */ extern int re_search _RE_ARGS ((struct re_pattern_buffer *buffer, char *string, int length, int start, int range, struct re_registers *regs)); /* Like `re_search', but search in the concatenation of STRING1 and STRING2. Also, stop searching at index START + STOP. 
*/ extern int re_search_2 _RE_ARGS ((struct re_pattern_buffer *buffer, char *string1, int length1, char *string2, int length2, int start, int range, struct re_registers *regs, int stop)); /* Like `re_search', but return how many characters in STRING the regexp in BUFFER matched, starting at position START. */ extern int re_match _RE_ARGS ((struct re_pattern_buffer *buffer, char *string, int length, int start, struct re_registers *regs)); /* Relates to `re_match' as `re_search_2' relates to `re_search'. */ extern int re_match_2 _RE_ARGS ((struct re_pattern_buffer *buffer, char *string1, int length1, char *string2, int length2, int start, struct re_registers *regs, int stop)); /* Set REGS to hold NUM_REGS registers, storing them in STARTS and ENDS. Subsequent matches using BUFFER and REGS will use this memory for recording register information. STARTS and ENDS must be allocated with malloc, and must each be at least `NUM_REGS * sizeof (regoff_t)' bytes long. If NUM_REGS == 0, then subsequent matches should allocate their own register data. Unless this function is called, the first search or match using PATTERN_BUFFER will allocate its own register data, without freeing the old data. */ extern void re_set_registers _RE_ARGS ((struct re_pattern_buffer *buffer, struct re_registers *regs, unsigned num_regs, regoff_t *starts, regoff_t *ends)); /* 4.2 bsd compatibility. */ extern char *re_comp _RE_ARGS ((char *)); extern int re_exec _RE_ARGS ((char *)); /* POSIX compatibility. 
*/ extern int regcomp _RE_ARGS ((regex_t *preg, char *pattern, int cflags)); extern int regexec _RE_ARGS ((regex_t *preg, char *string, size_t nmatch, regmatch_t pmatch[], int eflags)); extern size_t regerror _RE_ARGS ((int errcode, regex_t *preg, char *errbuf, size_t errbuf_size)); extern void regfree _RE_ARGS ((regex_t *preg)); #endif /* not __REGEXP_LIBRARY_H__ */ /* Local variables: make-backup-files: t version-control: t trim-versions-without-asking: nil End: */ tRNAscan-SE-2.0/src/align_main.c0000644000543100007160000001622411021467303015665 0ustar pchanlowelab/* align_main.c * SRE, Wed Jun 30 09:56:15 1993 * 2.0 Thu Sep 30 14:23:57 1993 * * main() for covea * Multiple sequence alignment to a covariance HMM model. */ #include #include #include #include #include #ifdef NEED_GETOPTH #include #endif #include "structs.h" #include "funcs.h" #include "squid.h" #include "version.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif #define OPTIONS "aho:s:S" static char usage[] = "\ Usage: covea [-options] \n\ where supported options are:\n\ -a : annotate all base pairs, not just canonical ones\n\ -h : print short help and version info\n\ -o : write alignment to in SELEX format\n\ -s : save individual alignment scores to \n\ \n\ Experimental options:\n\ -S : use small-memory variant of alignment algorithm\n"; static char banner[] = "covea: multiple sequence alignment to a covariance model";

/* Function: main()
 *
 * Purpose:  Entry point of covea. Reads a covariance model and a
 *           sequence file (the last two command line arguments),
 *           aligns every sequence to the model, and writes the
 *           resulting multiple alignment in SELEX format to stdout
 *           or to the file given with -o. With -s, the score of each
 *           individual alignment is written to a second file.
 *
 *           With -S the small-memory aligner is run for its scores
 *           only, and the program exits before any alignment is
 *           built or printed.
 *
 * Return:   0 on success. Every failure goes through Die().
 */
int
main(int argc, char **argv)
{
  char **rseqs;                 /* raw sequences to align */
  char **aseqs;                 /* multiple sequence alignment */
  SQINFO *sqinfo;               /* array of info structures */
  int nseq;                     /* number of seqs */
  char *seqfile;                /* sequence file */
  int format;                   /* format of sequence file */
  char *cmfile;                 /* cvhmm save file to read */
  struct cm_s *cm;              /* model */
  struct trace_s **tr;          /* array of tracebacks for seqs */
  int idx;                      /* counter for sequences */
  double score;                 /* score of indiv. alignment */
  double tot_score;             /* sum of scores */
  AINFO ainfo;                  /* optional alignment info (sec structure) */
  struct istate_s *icm;         /* model, integer log odds form */
  int statenum;                 /* # of states in icm */
  double rfreq[ALPHASIZE];      /* expected background symbol frequencies */

  char *outfile;                /* file to write alignment to */
  char *scorefile;              /* file to save scores to */
  FILE *ofp;                    /* opened outfile; only set when outfile != NULL */
  FILE *sfp;                    /* opened scorefile; only set when scorefile != NULL */
  int do_smallmemory;           /* use small-memory viterbi variant */
  int watsoncrick;              /* annotate only canonical pairs */

  int optc;                     /* option character returned by getopt() */
  extern char *optarg;          /* for getopt() */
  extern int optind;            /* for getopt() */

#ifdef MEMDEBUG
  unsigned long histid1, histid2, orig_size, current_size;
#endif

  /***********************************************
   * Parse command line
   ***********************************************/

  outfile = NULL;
  scorefile = NULL;
  do_smallmemory = FALSE;
  watsoncrick = TRUE;

  while ((optc = getopt(argc, argv, OPTIONS)) != -1)
    switch (optc) {

    case 'a': watsoncrick = FALSE; break;
    case 'o': outfile = optarg; break;
    case 's': scorefile = optarg; break;
    case 'S': do_smallmemory = TRUE; break;

    case 'h':
      printf("%s\n version %s (%s)\n%s\n", banner, RELEASE, RELEASEDATE, usage);
      exit(0);

    default:
      Die("unrecognized option %c\n", optc);
    }

  /* exactly two non-option arguments are required: model file, then sequence file */
  if (argc - optind != 2)
    Die("%s\n", usage);

  cmfile = argv[argc-2];
  seqfile = argv[argc-1];

#ifdef MEMDEBUG
  orig_size = malloc_size(&histid1);
#endif

  /***********************************************
   * Get sequence data and model; open output ptrs
   ***********************************************/

  if (! SeqfileFormat(seqfile, &format, NULL))
    Die("Failed to determine format of file %s\n", seqfile);

  if (! ReadMultipleRseqs(seqfile, format, &rseqs, &sqinfo, &nseq))
    Die("Failed to read sequences from file %s", seqfile);

  if (! ReadCM(cmfile, &cm))
    Die("Failed to read model from file %s", cmfile);

  /* uniform background frequencies for the four symbols */
  rfreq[0] = rfreq[1] = rfreq[2] = rfreq[3] = 0.25;
  if (! RearrangeCM(cm, rfreq, &icm, &statenum))
    Die("Failed to convert CM to integer log odds");

  /* output files are opened (and truncated) before any alignment is done */
  if (outfile != NULL)
    if ((ofp = fopen(outfile, "w")) == NULL)
      Die("Open failed for alignment output file %s", outfile);

  if (scorefile != NULL)
    if ((sfp = fopen(scorefile, "w")) == NULL)
      Die("Open failed for score output file %s", scorefile);

  /***********************************************
   * Print banner
   ***********************************************/

  puts(banner);
  printf(" release %s, %s\n\n", RELEASE, RELEASEDATE);
  printf("---------------------------------------------------\n");
  printf("Sequence data: %s (%d sequences)\n", seqfile, nseq);
  printf("Covariance model: %s (%d nodes)\n", cmfile, cm->nodes);
  if (outfile != NULL)
    printf("Alignment saved to: %s\n", outfile);
  if (scorefile != NULL)
    printf("Indiv. scores saved to: %s\n", scorefile);
  printf("---------------------------------------------------\n");
  puts("");

  /***********************************************
   * Do the alignment
   ***********************************************/

  /* one traceback slot per sequence */
  if ((tr = (struct trace_s **) malloc (nseq * sizeof(struct trace_s *))) == NULL)
    Die("Memory failure, line %d of %s", __LINE__, __FILE__);

  tot_score = 0.0;
  for (idx = 0; idx < nseq; idx++)
    {
      char *prepseq;            /* private copy of the sequence, prepared for alignment */

      /* the aligners get a prepared copy; rseqs[idx] itself is left as read */
      prepseq = Strdup(rseqs[idx]);
      PrepareSequence(prepseq);

      if (do_smallmemory)
        {
          if (! SmallViterbiAlign(icm, statenum, prepseq, &score, &tr[idx]))
            Die("SmallViterbiAlign() failed on sequence %d", idx);
        }
      else if (! ViterbiAlign(icm, statenum, prepseq, &score, &tr[idx]))
        Die("ViterbiAlign() failed on sequence %d", idx);

      tot_score += score;

      if (scorefile != NULL)
        fprintf(sfp, "%-8.3f : %s\n", score, sqinfo[idx].name);

      free(prepseq);
    }

  /* The small-memory path stops here: scores have been computed (and
   * written if -s was given), but no alignment is produced.
   */
  if (do_smallmemory)
    {
      printf("aborting... no traceback/alignment code yet for small memory variant\n");
      Free2DArray(rseqs, nseq);
      FreeCM(cm);
      exit(0);
    }

  if (! Traces2Alignment(rseqs, sqinfo, tr, nseq, cm, watsoncrick, &aseqs, &ainfo))
    Die("Traces2Alignment() failed");

  /***********************************************
   * Print the alignment
   ***********************************************/

  if (outfile != NULL)
    {
      if (! WriteSELEX(ofp, aseqs, nseq, &ainfo, 60))
        Die("Write failed: can't save alignment to %s", outfile);

      fclose(ofp);
      printf("Alignment written to %s\n", outfile);
    }
  else
    {
      if (! WriteSELEX(stdout, aseqs, nseq, &ainfo, 60))
        Die("Write failed: can't print alignment");
    }

  if (scorefile != NULL) fclose(sfp);

  /* the value reported is the mean score per sequence */
  printf("Overall alignment score: %.2f\n", tot_score / (double) nseq);

  /***********************************************
   * Garbage collect and exit
   ***********************************************/

  for (idx = 0; idx < nseq; idx++)
    {
      FreeTrace(tr[idx], NULL);
      FreeSequence(rseqs[idx],&(sqinfo[idx]));
    }
  free(tr);
  free(sqinfo);
  FreeAlignment(aseqs, nseq, &ainfo);
  FreeCM(cm);
  free(icm);

#ifdef MEMDEBUG
  current_size = malloc_size(&histid2);

  if (current_size != orig_size)
    malloc_list(2, histid1, histid2);
  else
    fprintf(stderr, "No memory leaks, sir.\n");
#endif

  return 0;
}
tRNAscan-SE-2.0/src/dbviterbi.c0000644000543100007160000003521111021467304015539 0ustar pchanlowelab/* dbviterbi.c * Mon Jan 31 10:06:14 1994 * * Search variant of the alignment algorithm. Derived from viterbi.c * * To optimize memory access patterns, the score storage is implemented * as a two-matrix version. amx is the * main storage. bmx is a smaller auxiliary matrix with a different * access pattern, holding scores of BEGIN state alignments; it * is used when calculating BIFURC scores. * * amx is [j = 0..1] [diff = 0..j] [y = 0..statenum] * diff == 0 is for off-diagonal boundary conditions (this is why diff is shifted +1) * diff == 1 is for the diagonal, i==j * We only need to keep two j rows in memory (current and previous). 
* * bmx is [y = 0..statenum] [j = 0..N] [ diff = 0..j] * a j,diff matrix exists only where y is a BEGIN state * * The 2.0 implementation allows variable storage per node rather * than storing and calculating a fixed max number of states per node, * which should save up to 2x in both time and space. * * An optimization is made which requires END states to be explicitly * added, so statenum (the number of states in the integer model) * is *inclusive* of ENDs. */ #include #include #include #include #include "squid.h" #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif static int allocate_mx(struct istate_s *icm, int statenum, int window, int ****ret_amx, int ****ret_bmx); static int init_mx (struct istate_s *icm, int statenum, int N, int ***amx, int ***bmx); static int recurse_mx (struct istate_s *icm, int statenum, char *seq, int seqlen, int window, int ***amx, int ***bmx, int ithresh, int (*gotone_f)(int, int, double)); static void free_mx (int ***amx, int ***bmx, int statenum, int window); /* Function: ViterbiScan() * * Purpose: Scanning version of the Viterbi alignment algorithm, * for finding matches in a long sequence. * * Args: cm - the model to align sequence to * seq - sequence to align model to * window - scanning window size (nucleotides) * thresh - scores above this are reported through gotone_f() * gotone_f - function which gets told about a match * * Return: 1 on success, 0 on failure. */ int ViterbiScan(struct istate_s *icm, int statenum, char *seq, int window, double thresh, int (*gotone_f)(int, int, double)) { int ***amx; /* the main score matrix */ int ***bmx; /* the BEGIN score matrix */ int N; /* length of sequence */ int ithresh; /* thresh, converted and scaled to int */ N = strlen(seq); seq--; /* convert to 1..N. Ugh! */ ithresh = (int) (thresh * INTPRECISION); if (! allocate_mx(icm, statenum, window, &amx, &bmx)) return 0; #ifdef DEBUG printf("allocated matrices\n"); #endif if (! 
init_mx(icm, statenum, window, amx, bmx)) return 0; #ifdef DEBUG printf("matrices initialized\n"); #endif if (! recurse_mx(icm, statenum, seq, N, window, amx, bmx, ithresh, gotone_f)) return 0; #ifdef DEBUG printf("recursion finished\n"); #endif /* terminate scanning hit reporting */ ReportScanHit(-1,-1, 0.0, gotone_f); free_mx(amx, bmx, statenum, window); return 1; } /* Function: allocate_mx() * * Purpose: Malloc space for the score matrices. * amx and atr are indexed as j, i, y. * bmx and btr are indexed as k, j, i. * In the two sequence dimensions j, i they are * diagonal (+1 off diagonal) matrices with * rows j = 0..N, i = 1..j+1. * In the node dimension k bmx and btr are k = 0..M. * In the state dimension y amx and atr are y = 0..numstates. * * Args: icm - the int, log-odds, state-based model * statenum - number of states in model * window - length of scanning window * ret_amx - RETURN: main score matrix * ret_bmx - RETURN: BEGIN score matrix * * Return: Ptr to allocated scoring matrix, or * dies and exits. */ static int allocate_mx(struct istate_s *icm, int statenum, int window, int ****ret_amx, int ****ret_bmx) { int ***amx; int ***bmx; int diag, j, y; /* Main matrix, amx: fastest varying index is y (j,i,y) * we only keep two rows for j, 0 and 1. 
*/ /* malloc for j = 0..1 rows */ if ((amx = (int ***) malloc (2 * sizeof(int **))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); for (j = 0; j <= 1; j++) /* loop over rows j = 0..1 */ { /* malloc for diag = 0..window cols */ if ((amx[j] = (int **) malloc ((window + 1) * sizeof(int *))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); /* loop over cols diag = 0..window */ for (diag = 0; diag <= window; diag++) /* malloc for y = 0..statenum-1 decks */ if ((amx[j][diag] = (int *) malloc ((statenum) * sizeof (int ))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); } /* B auxiliary matrix: fastest varying index is diag (y,j,diag) * bmx keeps score decks for BEGIN states */ /* 0..statenum-1 decks */ if ((bmx = (int ***) malloc (statenum * sizeof(int **))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); for (y = 0; y < statenum; y++) { bmx[y] = NULL; /* we keep score info for BEGIN states */ if (icm[y].statetype == uBEGIN_ST) { /* j= 0..window-1 rows */ if ((bmx[y] = (int **) malloc ((window) * sizeof(int *))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); /* diff = 0..window columns */ for (j = 0; j < window; j++) if ((bmx[y][j] = (int *) malloc ((window+1) * sizeof(int ))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); } } *ret_amx = amx; *ret_bmx = bmx; return 1; } /* Function: free_mx() * * Purpose: Free the space allocated to the scoring and traceback matrices. * Precisely mirrors the allocations above in allocate_cvmx(). 
* * Return: (void) */ static void free_mx(int ***amx, int ***bmx, int statenum, int window) { int diag, j, y; /* Free the main matrix, amx: * amx[j][i][y] = [0..1] [0..window] [0..statenum-1] */ for (j = 0; j <= 1; j++) { for (diag = 0; diag <= window; diag++) free(amx[j][diag]); free(amx[j]); } free(amx); /* Free the auxiliary matrix, bmx * bmx[y][j][i] = [0..statenum-1] [0..window] [0..window] */ for (y = 0; y < statenum; y++) { if (bmx[y] != NULL) { for (j = 0; j < window; j++) free(bmx[y][j]); free(bmx[y]); } } free(bmx); } /* Function: init_mx() * * Purpose: Initialization of the scoring matrices. We initialize the off-diagonal, * the diagonal, and the "floor" (end states) of the cube. * * Return: 1 on success, 0 on failure. */ static int init_mx(struct istate_s *icm, /* integer model */ int statenum, /* number of states in icm */ int window, /* size of scanning window on sequence */ int ***amx, int ***bmx) { int diag, j, y; /* counters for indices over the cvmx */ int ynext; /* index of next state k+1 */ int *beam; /* z-axis vector of numbers in amx */ /* Init the whole amx to -Infinity. We do this with memcpy, trying * to be fast. We fill in j=0,diag=0 by hand, then memcpy() the other * columns. */ for (y = 0; y < statenum; y++) amx[0][0][y] = amx[1][0][y] = NEGINFINITY; for (diag = 1; diag <= window; diag++) { memcpy(amx[0][diag], amx[0][0], statenum * sizeof(int)); memcpy(amx[1][diag], amx[0][0], statenum * sizeof(int)); } /* Init the whole bmx to -Inf. We know state 0 is a begin (it's ROOT), so we * start there, and memcpy rows as needed. */ for (diag = 0; diag <= window; diag++) bmx[0][0][diag] = NEGINFINITY; for (j = 1; j < window; j++) memcpy(bmx[0][j], bmx[0][0], (window+1) * sizeof(int)); for (y = 1; y < statenum; y++) if (bmx[y] != NULL) for (j = 0; j < window; j++) memcpy(bmx[y][j], bmx[0][0], (window+1) * sizeof(int)); /* Init the off-diagonal (j = 0..window-1; diag == 0) with -log P scores. 
* End state = 0; * del, bifurc states are calc'ed * begin states same as del's * THIS IS WASTEFUL AND SHOULD BE CHANGED. */ for (j = 0; j < window; j++) for (y = statenum-1; y >= 0; y--) { /* Set the alignment of END states to the off-diagonal (diag = 0) * to be zero, and never touch them again. */ if (icm[y].statetype == uEND_ST) amx[j%2][0][y] = 0; else if (icm[y].statetype == uBIFURC_ST) amx[j%2][0][y] = bmx[icm[y].tmx[0]][j][0] + bmx[icm[y].tmx[1]][j][0]; else if (icm[y].statetype == uDEL_ST || icm[y].statetype == uBEGIN_ST) { /* only calc DEL-DEL and BEGIN-DEL transitions. Since * we optimized the state transition tables, removing * the unused ones, we don't know where the number * for "to DEL" is! But we can find it, because it'll * be the connection to a non-infinite score */ beam = amx[j%2][0] + y + icm[y].offset; for (ynext = 0; ynext < icm[y].connectnum; ynext++) { if (*beam != NEGINFINITY) amx[j%2][0][y] = *beam + icm[y].tmx[ynext]; beam++; } } /* make a copy into bmx if y is a BEGIN */ if (icm[y].statetype == uBEGIN_ST) bmx[y][j][0] = amx[j%2][0][y]; } return 1; } /* Function: recurse_mx() * * Purpose: Carry out the fill stage of the dynamic programming * algorithm. After each j row is filled in, check the score * of best full alignment ending at this row; if greater * than threshold (ithresh), report it. * * Returns: 1 on success, 0 on failure. 
*/
static int
recurse_mx(struct istate_s *icm,      /* integer, state-form model         */
           int              statenum, /* number of states in icm           */
           char            *seq,      /* sequence, 1..seqlen               */
           int              seqlen,   /* length of seq                     */
           int              window,   /* length of scanning window on seq  */
           int           ***amx,      /* main scoring matrix               */
           int           ***bmx,      /* bifurc scoring matrix             */
           int              ithresh,  /* reporting threshold               */
           int            (*gotone_f)(int, int, double))
{
  int i, j, y;                  /* indices for 3 dimensions                    */
  int aj;                       /* 0 or 1, index for j in A matrix             */
  int bj;                       /* 0..window-1, index for j in B matrix        */
  int diff;                     /* loop counter for difference: diff = j-i + 1 */
  int symi, symj;               /* symbol indices for seq[i], seq[j]           */
  int sc;                       /* tmp for a score                             */
  int ynext;                    /* index of next state y                       */
  int bestdiff, bestscore;
  int *beam;                    /* ptr to a beam (z-axis vector)               */
  int leftdiff;                 /* diff coord of BEGIN_L of a bifurc           */
  int leftj;                    /* j coord of BEGIN_L of a bifurc              */
  int **left_p;                 /* pointer into whole 2D deck of BEGINL's of a bifurc */
  int *right_p;                 /* ptr into row of BEGIN_R's of a bifurc       */
  int *scp;                     /* score pointer: ptr into beam of scores being calc'ed */
  struct istate_s *st;          /* state pointer: ptr at current state in icm  */
  int *tmx;
  int emitsc;

  for (j = 1; j <= seqlen; j++)
    {
      aj = j % 2;               /* 0 or 1 index in amx        */
      bj = j % window;          /* 0..window-1 index in bmx   */
      symj = SymbolIndex(seq[j]);

      for (diff = 1; diff <= window && diff <= j; diff++)
        {
          i = j - diff + 1;
          symi = SymbolIndex(seq[i]);

          /* scp and st walk backwards in step with y */
          scp = &amx[aj][diff][statenum-1];
          st  = &icm[statenum-1];
          for (y = statenum-1; y >= 0; y--, scp--, st--)
            { /* loop over states */

              if (st->statetype != uBIFURC_ST) /* a normal (non-BIFURC) state */
                {
                  /* Connect the "beam" pointer to the appropriate
                   * starting place in the ynext scores we're connecting
                   * y to
                   */
                  switch (st->statetype) {
                  case uBEGIN_ST:
                  case uDEL_ST:
                    beam   = amx[aj][diff];
                    emitsc = 0;
                    break;

                  case uMATP_ST: /* !aj toggles from 0 to 1 and vice versa */
                    if (diff == 1) continue;    /* a pair needs two symbols */
                    beam   = amx[!aj][diff-2];
                    emitsc = st->emit[symi * ALPHASIZE + symj];
                    break;

                  case uMATR_ST:
                  case uINSR_ST:
                    beam   = amx[!aj][diff-1];
                    emitsc = st->emit[symj];
                    break;

                  case uMATL_ST:
                  case uINSL_ST:
                    beam   = amx[aj][diff-1];
                    emitsc = st->emit[symi];
                    break;

                  case uEND_ST:
                    continue;           /* END cells are set once by init_mx() */

                  default: Die("no such state type %d", st->statetype);
                  }
                  beam += y + st->offset;
                  tmx   = st->tmx;

                  /* Init for ynext == 0 case */
                  *scp = *beam + *tmx;

                  /* Calculate remaining cases.
                   * NOTE(review): the sum is only formed when *beam alone
                   * already beats the current best; that shortcut assumes
                   * transition scores are never positive -- confirm.
                   */
                  for (ynext = 1; ynext < st->connectnum; ynext++)
                    {
                      beam++;
                      tmx++;
                      if (*beam > *scp)
                        {
                          sc = *beam + *tmx;
                          if (sc > *scp)
                            *scp = sc;
                        }
                    }

                  /* Add emission scores now */
                  *scp += emitsc;

                  /* Make a copy into bmx if necessary */
                  if (st->statetype == uBEGIN_ST)
                    bmx[y][bj][diff] = *scp;
                } /* end block of normal state stuff */

              else              /* a BIFURC state */
                {
                  leftdiff = diff;
                  leftj    = bj;
                  right_p  = bmx[st->tmx[1]][leftj];
                  left_p   = bmx[st->tmx[0]];

                  /* init w/ case that left branch emits it all */
                  *scp = left_p[leftj][leftdiff] + *right_p;
                  while (leftdiff > 0)
                    {
                      leftdiff--;
                      leftj = leftj ? leftj-1 : window-1; /* scan window wraparound */
                      right_p++;

                      sc = left_p[leftj][leftdiff] + *right_p;
                      if (sc > *scp)
                        *scp = sc;
                    }
                }

            } /* end loop over states */
        } /* end loop over diff */

      /* We've completed a row. Now we can examine the scores in diff,
       * aj, ROOT_ST to decide whether to report this row. If we do,
       * we report the 1..seqlen i, j coords of the matching subsequence
       * in seq, as well as the score converted to double-precision bits.
       */
      bestdiff  = 1;
      bestscore = bmx[0][bj][1];
      for (diff = 2; diff <= window; diff++)
        if (bmx[0][bj][diff] > bestscore)
          {
            bestscore = bmx[0][bj][diff];
            bestdiff  = diff;
          }

      /* Cast before dividing: with the cast outside the parentheses the
       * division was done in whatever type INTPRECISION has, which would
       * truncate the score if that is ever an integer constant.
       */
      if (bestscore > ithresh)
        if (! ReportScanHit(j - bestdiff + 1, j,
                            (double) bestscore / INTPRECISION, gotone_f))
          Warn("caller ignored report of a match!");
    } /* end loop over j */

  return 1;
}
tRNAscan-SE-2.0/src/scorestack.c0000644000543100007160000001673111672025272015742 0ustar pchanlowelab/* scorestack.c
 * SRE, Tue Aug 17 09:50:39 1993
 *
 * For unidirectional scanning search procedures, implement
 * a score reporting system that filters out hits that overlap
 * with higher-scoring printed scores.
 *
 * The simplest rule to use would be to keep a record of the last hit;
 * on receiving a new hit: 1) if new hit overlaps with last hit and
 * new hit is higher, replace last with new; 2) if new hit overlaps
 * with last hit and new hit is lower, ignore new; 3) if new hit
 * does not overlap with last, report last and assign new to last.
 * At end, report last. This is essentially the rule used by the
 * original hmm and cove scanning procedures.
 *
 * There is a small weakness in this strategy, in that for three
 * hits A > B > C which all overlap, only A will be reported; but
 * although A overlaps B and B overlaps C, A may not overlap C.
 * (Will this ever happen in reality? I dunno. I don't want to
 * be surprised.)
 *
 * Thus, this more complicated strategy.
 *    Keep a stack of last hits.
 *    On receiving a new hit:
 *      1) if new overlaps last and new > last, push new onto stack
 *      2) if new overlaps last and new <= last, ignore new
 *      3) if new doesn't overlap, resolve stack; start new stack and
 *         push new onto it.
 *    At end: resolve stack.
 *
 *    Stack resolution:
 *      set the left edge of the "previously reported hit" to INT_MAX
 *      so it won't overlap w/ anything
 *      while something is in the stack:
 *         pop top hit off stack
 *         if it overlaps with previously reported hit, continue;
 *         if it doesn't overlap, report it
 *
 *    Testing overlap:
 *      Given two subsequences with endpoints al,ar and bl, br,
 *      with no other knowledge, we would need to test whether any of
 *      the four endpoints are within the opposing subsequence.
* However, because we're scanning unidirectionally, we know * that the new right end is greater than the old right end, * so we only need to test whether the old right end >= new left * end. * * External function: * * ReportScanHit() - report a hit * or, if reported coords are -1,-1, resolve old * stack, cleanup and exit. * */ #include #include #include #ifdef MEMDEBUG #include "dbmalloc.h" #endif #include "squid.h" /* Data structure for the stack of previous hits; * declarations of the functions to manipulate it. */ struct hitstack_s { int left; /* left coord of matched segment */ int right; /* right coord of matched segment */ double score; /* score of match */ struct hitstack_s *nxt; /* pointer to next elem in stack */ }; static struct hitstack_s *init_hitstack(void); static void push_hitstack(struct hitstack_s *hstack,int left,int right, double score); static int pop_hitstack(struct hitstack_s *hstack, int *ret_left, int *ret_right, double *ret_score); static void free_hitstack(struct hitstack_s *hstack); /* Function: ReportScanHit() * * Purpose: Caller reports a hit during a search scan, and * provides a pointer to a function we can call to * print non-overlapping hits. Caller reports * -1,-1 for coords to request cleanup and end. * * Two special cases must be dealt with: * INIT: If the hit stack hasn't been started yet, * we need to initialize it before doing * anything else * END: If coords are -1,-1, we resolve the stack * and cleanup; caller is finished with us * for now. * * Args: left - left coord of hit segment * right - right coord of hit segment * score - score of the hit * print_hit - pointer to a function to print * nonoverlapping hits * * Return: 1 on success, 0 on failure. 
*/
int
ReportScanHit(int left, int right, double score,
              int (*print_hit)(int,int,double))
{
  /* State carried between calls: the stack of overlapping hits seen so
   * far and the right edge of the hit on top of it. Both are reset
   * every time the stack is resolved.
   */
  static struct hitstack_s *hstack = NULL;  /* NULL until the first entry        */
  static int oldright = -1;                 /* -1 cannot overlap a first report  */
  int    oldleft;                           /* left edge of the last printed hit */
  int    newleft, newright;                 /* coords of a hit popped off stack  */
  double newscore;                          /* score of a hit popped off stack   */

  /* Check whether this is first entry;
   * init hit stack if so.
   */
  if (hstack == NULL) hstack = init_hitstack();

  /* Check whether we have to resolve the old stack:
   * if caller is reporting it's done (-1,-1 coords),
   * or if new hit doesn't overlap last stacked hit.
   */
  if (left > oldright || (left == -1 && right == -1))
    {
      /* Stack resolution. Hits come off best-first; one is printed
       * only if it ends before the previously printed one begins.
       */
      oldleft = INT_MAX;
      while (pop_hitstack(hstack, &newleft, &newright, &newscore))
        {
          /* does this hit not overlap w/ previous printed one? */
          if (newright < oldleft)
            {
              (*print_hit)(newleft, newright, newscore);
              oldleft = newleft;
            }
        }
      free_hitstack(hstack);
      hstack   = NULL;
      oldright = -1;

      /* Start new stack, if not done. */
      if (left != -1 || right != -1)
        {
          hstack = init_hitstack();
          push_hitstack(hstack, left, right, score);
          oldright = right;
        }
    }

  /* else, they overlap; if new reported score is better than last one,
   * push new one. Normally something is already on the stack here, but
   * a caller passing a non-sentinel left <= -1 reaches this branch with
   * an empty stack, so check before reading hstack->nxt->score; in that
   * case the hit simply starts the stack.
   * Reset oldright to be the new right edge of the stack, if we add something.
   */
  else if (hstack->nxt == NULL || score > hstack->nxt->score)
    {
      push_hitstack(hstack, left, right, score);
      oldright = right;
    }

  /* else, they overlap and the newly reported score
   * isn't better, so we ignore it.
   */
  return 1;
}


/* Functions: init_hitstack()
 *            push_hitstack()
 *            pop_hitstack()
 *            free_hitstack()
 *
 * Purpose:   Implementation of the pushdown stack for
 *            keeping old hit positions and scores.
 *
 *            The hitstack has a dummy begin element,
 *            so the first legitimate element is
 *            hstack->nxt. The last legitimate element
 *            has a NULL nxt pointer.
*/ static struct hitstack_s * init_hitstack(void) { struct hitstack_s *hstack; if ((hstack = (struct hitstack_s *) malloc (sizeof(struct hitstack_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); hstack->nxt = NULL; return hstack; } static void push_hitstack(struct hitstack_s *hstack, int left, int right, double score) { struct hitstack_s *new; if ((new = (struct hitstack_s *) malloc (sizeof(struct hitstack_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); new->left = left; new->right = right; new->score = score; new->nxt = hstack->nxt; hstack->nxt = new; } static int pop_hitstack(struct hitstack_s *hstack, int *ret_left, int *ret_right, double *ret_score) { struct hitstack_s *old; if (hstack->nxt == NULL) return 0; old = hstack->nxt; hstack->nxt = old->nxt; *ret_left = old->left; *ret_right = old->right; *ret_score = old->score; free(old); return 1; } static void free_hitstack(struct hitstack_s *hstack) { int left, right; double score; while (pop_hitstack(hstack, &left, &right, &score) != 0) ; /* do nothing */ free(hstack); } tRNAscan-SE-2.0/src/pavesi.h0000644000543100007160000000272214044142171015062 0ustar pchanlowelab/* eufindtRNA - Eukaryotic tRNA finder * * pavesi.h - functions for finding transcriptional control regions * * C implementation of algorithm described by Pavesi, Conterio, * Bolchi, Dieci, & Ottonello in NAR 22:1247-56 (94) * "Identification of new eukaryotic tRNA genes in genomic DNA * databases by a multistep weight matix analysis of transcriptional * control regions" * * To be used in tRNAscan-SE package to increase sensitivity by * complementing tRNAscan 1.3 first-pass scan * * by Todd MJ Lowe 4/8/96 * * Uses Sean Eddy's function library for biological sequence analysis * (Squid v1.5g) * */ #include "squid.h" #include "eufind_const.h" void Init_tRNA(TRNA_TYPE *tRNA); int IntEncodeSeq (char *intseq, char *seq, int seqlen); int GetBbox (float *score, int *seqidx, char *iseq, int 
seqlen, int strand, int verbose); float Get_ABdist_weight(int ABdist); int GetSecABox(TRNA_TYPE *tRNA, char *seq); void GetBestABox (TRNA_TYPE *tRNA, char *seq, char *iseq, int seqlen, int strand, int verbose, int Max_AB_dist, int prev_Abox_st); int GetBestTrxTerm (TRNA_TYPE *tRNA, char *seq, int seqlen, float TermPenalty); void Get_IsoType (TRNA_TYPE *tRNA); void Get_anticodon (TRNA_TYPE *tRNA, char *seq); void Get_tRNA_stats (TRNA_TYPE *tRNA, char *seq, int seqlen, int strand); void Save_tRNA (TRNA_TYPE *tRNA, SQINFO *sqinfo, char *seq, int strand, int ShowScores, long int sqoffset); int tRNAOverlap (TRNA_TYPE *tRNA1, TRNA_TYPE *tRNA2, int strand); tRNAscan-SE-2.0/src/reformat_main.c0000644000543100007160000001166411021467311016414 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* reformat_main.c * Mon Sep 13 13:06:51 1993 * * reformat - reformat sequence files. 
*/ #include #include #include "squid.h" #define OPTIONS "dhlru" char usage[] = "Usage: reformat [-options] \n\ Convert between sequence file formats.\n\ Available formats are:\n\ embl\n\ fasta\n\ xfasta\n\ genbank\n\ gcg\n\ gcgdata\n\ msf\n\ strider\n\ zuker\n\ ig\n\ nbrf\n\ pir\n\ selex\n\ squid\n\ raw\n\n\ Available options are:\n\ -d : force DNA alphabet for nucleic acid sequence\n\ -r : force RNA alphabet for nucleic acid sequence\n\ -l : force lower case\n\ -u : force upper case\n\ -h : print short help and usage info\n"; struct seqfmt_s { char *formatname; int fmt; } seqfmt[] = { { "embl", kEMBL }, { "fasta", kPearson }, { "xfasta", kXPearson}, { "genbank", kGenBank }, { "gcg", kGCG }, { "gcgdata", kGCGdata }, { "msf", kMSF }, { "strider", kStrider }, { "zuker", kZuker }, { "ig", kIG }, { "nbrf", kNBRF }, { "pir", kPIR }, { "selex", kSelex }, { "squid", kSquid }, { "raw", kRaw }, }; #define NUMFORMATS (sizeof(seqfmt) / sizeof(struct seqfmt_s)) int main(int argc, char **argv) { char *seqfile; /* name of sequence file */ char *format; SQFILE *dbfp; /* open sequence file */ int fmt; /* format of seqfile */ int outfmt; /* output format */ char *seq; /* sequence */ SQINFO sqinfo; int i; int force_rna; /* TRUE to force RNA alphabet */ int force_dna; /* TRUE to force DNA alphabet */ int force_lower; /* TRUE to force lower case */ int force_upper; /* TRUE to force upper case */ int optchar; /* option character, command line */ extern int optind; /*********************************************** * Parse command line ***********************************************/ force_rna = FALSE; force_dna = FALSE; force_upper = FALSE; force_lower = FALSE; while ((optchar = getopt(argc, argv, OPTIONS)) != -1) switch (optchar) { case 'd': force_dna = TRUE; break; case 'l': force_lower = TRUE; break; case 'r': force_rna = TRUE; break; case 'u': force_upper = TRUE; break; case 'h': printf("reformat %s, %s\n%s\n", squid_version, squid_date, usage); exit(EXIT_SUCCESS); default: 
Die("%s\n", usage); } if (argc - optind != 2) Die("%s\n", usage); if (force_lower && force_upper) Die("Can't force both upper case and lower case. Stop trying to confuse me.\n%s", usage); if (force_rna && force_dna) Die("Can't force both RNA and DNA. Stop trying to find bugs, you'll be sorry.\n%s", usage); format = argv[optind]; optind++; seqfile = argv[optind]; optind++; /*********************************************** * Figure out what format we're supposed to write ***********************************************/ outfmt = kUnknown; for (i = 0; i < NUMFORMATS; i++) if (strcasecmp(format, seqfmt[i].formatname) == 0) outfmt = seqfmt[i].fmt; if (outfmt == kUnknown) Die("Unknown output format %s\n%s", format, usage); /*********************************************** * Reformat the file, printing to stdout. ***********************************************/ if (! SeqfileFormat(seqfile, &fmt, NULL)) Die("Can't determine format of file %s\n", seqfile); if ((fmt == kMSF || fmt == kSelex || fmt == kClustal) && (outfmt == kMSF || outfmt == kSelex)) { char **aseqs; int num; AINFO ainfo; ReadAlignment(seqfile, fmt, &aseqs, &num, &ainfo); for (i = 0; i < num; i++) { if (force_dna) ToDNA(aseqs[i]); if (force_rna) ToRNA(aseqs[i]); if (force_lower) s2lower(aseqs[i]); if (force_upper) s2upper(aseqs[i]); } switch (outfmt) { case kMSF: WriteMSF(stdout, aseqs, num, &ainfo); break; case kSelex: WriteSELEX(stdout, aseqs, num, &ainfo, 50); break; } FreeAlignment(aseqs, num, &ainfo); } else if (outfmt == kMSF || outfmt == kSelex) { Die("Sorry, you can't make alignment files from unaligned files"); } else { if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) Die("Failed to open sequence file %s for reading", seqfile); while (ReadSeq(dbfp, fmt, &seq, &sqinfo)) { if (force_dna) ToDNA(seq); if (force_rna) ToRNA(seq); if (force_lower) s2lower(seq); if (force_upper) s2upper(seq); WriteSeq(stdout, outfmt, seq, &sqinfo); FreeSequence(seq, &sqinfo); } SeqfileClose(dbfp); } return 0; } 
tRNAscan-SE-2.0/src/modelmaking.c0000644000543100007160000004104211021467304016053 0ustar pchanlowelab/* modelmaking.c * Tue Oct 4 15:33:21 1994 * * Bring together common elements of the model construction process. * Also, provides EasyModelmaker() for making a model given a structure. * * All model makers have in common that they construct a "master" traceback * for the alignment, specifying which columns are match vs. insert and * how the model tree branches. This traceback is assigned a numbering * system by NumberMasterTrace(), which returns the number of nodes; * the caller then allocates a new CM. This new model is numbered (assigned * a branching structure) by TopofyNewCM(). Then individual tracebacks * are constructed from individual aligned sequences by Transmogrify(). * The individual tracebacks are counted into a new model with TraceCount() * and the counts converted to probabilities with ProbifyCM(). * * The master tree is a (slightly misused) trace_s structure with the following * properties: * insert columns are not represented at all. Transmogrify() must deal. * * emitl, emitr == 0..alen-1 coords of assigned columns. Set and valid for all * nodes, even non-emitters. END values will be * on the off-diagonal, emitl = emitr+1. (If this is * not true, Transmogrify() breaks.) The trace * construction function is responsible for this. * * nodeidx == this is numbered by preorder traversal by NumberMasterTrace(), * which also numbers a model. ENDs are not explicitly * represented in a CM, so they get nodeidx = -1. * * type == a number 0..6 for *node* type. (Usually this is a unique * state type identifier.) ENDs are -1. The trace * construction function is responsible for this.
*/ #include #include #include "version.h" #include "squid.h" #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif /* Function: NumberMasterTrace() * * Purpose: Given a master trace for an alignment, number the trace tree * (in the nodeidx field) in preorder traversal. END nodes * will not be represented explicitly in the final CM. They * get numbered -1. */ void NumberMasterTrace(struct trace_s *mtr, int *ret_nodes) { struct trace_s *curr; struct tracestack_s *dolist; int nodes = 0; dolist = InitTracestack(); PushTracestack(dolist, mtr->nxtl); /* push root onto stack */ while ((curr = PopTracestack(dolist)) != NULL) { if (curr->nxtl == NULL) /* END node */ curr->nodeidx = -1; else /* other nodes */ curr->nodeidx = nodes++; if (curr->nxtr != NULL) PushTracestack(dolist, curr->nxtr); if (curr->nxtl != NULL) PushTracestack(dolist, curr->nxtl); } FreeTracestack(dolist); *ret_nodes = nodes; } /* Function: TopofyNewCM() * * Purpose: Given the mtr master traceback tree, which defines the * topology of the model, write the nxt and nxt2 connections * into the model. For the most part, these are already * contained in mtr thanks to NumberMasterTrace(); the * only tricky bit is converting END states from multiple * real states (in mtr) to -1 nxt flags in the cm. * * Return: 1 on success, 0 on failure. */ void TopofyNewCM(struct cm_s *cm, struct trace_s *mtr) { struct tracestack_s *dolist; struct trace_s *curr; dolist = InitTracestack(); PushTracestack(dolist, mtr->nxtl); /* push ROOT onto stack */ while ((curr = PopTracestack(dolist)) != NULL) { if (curr->nxtl == NULL) continue; /* ignore ENDs */ if (curr->nxtr != NULL) /* deal with BIFURC states */ { cm->nd[curr->nodeidx].nxt2 = curr->nxtr->nodeidx; PushTracestack(dolist, curr->nxtr); } else cm->nd[curr->nodeidx].nxt2 = -1; /* watch out for curr pointing to END states. */ cm->nd[curr->nodeidx].type = curr->type; cm->nd[curr->nodeidx].nxt = (curr->nxtl->nxtl == NULL) ? 
-1 : curr->nxtl->nodeidx; PushTracestack(dolist, curr->nxtl); } FreeTracestack(dolist); } /* Function: Transmogrify() * * Purpose: Given a master consensus traceback, create an individual * "fake" traceback. The fake traceback contains inserts * and converts the type field of mtr (which contains NODE * type indices) into _ST type indices, including proper * classification of nodes into DEL_ST or the various match * states depending on what aseq[idx] looks like. * * Args: mtr - master consensus traceback tree * aseq - 0..alen-1 aligned sequence. * ret_tr - RETURN: individual traceback * ret_pool - RETURN: memory pool for traceback * * Return: (void). *ret_tr must be free'd by the caller */ void Transmogrify(struct trace_s *mtr, char *aseq, struct trace_s **ret_tr, struct trmem_s **ret_pool) { struct trace_s *tr; struct trmem_s *pool; struct tracestack_s *mtr_stack; struct tracestack_s *tr_stack; struct trace_s *curr_mtr; struct trace_s *curr_tr; int i2,j2; mtr_stack = InitTracestack(); tr_stack = InitTracestack(); InitTrace(&tr, &pool); /* Push ROOT onto both stacks */ PushTracestack(mtr_stack, mtr->nxtl); PushTracestack(tr_stack, AttachTrace(tr, pool, -1, -1, 0, uBEGIN_ST)); while ((curr_mtr = PopTracestack(mtr_stack)) != NULL) { curr_tr = PopTracestack(tr_stack); switch (curr_mtr->type) { case uEND_ST: DeleteTracenode(curr_tr, pool); break; case MATP_NODE: if (isgap(aseq[curr_mtr->emitl])) { if (isgap(aseq[curr_mtr->emitr])) curr_tr->type = uDEL_ST; else curr_tr->type = uMATR_ST; } else { if (isgap(aseq[curr_mtr->emitr])) curr_tr->type = uMATL_ST; else curr_tr->type = uMATP_ST; } /* May have to deal with INSL and INSR; INSL precedes INSR */ for (i2 = curr_mtr->emitl+1; i2 < curr_mtr->nxtl->emitl; i2++) if (!isgap(aseq[i2])) curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST); /* May have to deal with INSR */ for (j2 = curr_mtr->emitr-1; j2 > curr_mtr->nxtl->emitr; j2--) if (! 
isgap(aseq[j2])) curr_tr = AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, j2, curr_mtr->nodeidx, uINSR_ST); break; case MATL_NODE: if (isgap(aseq[curr_mtr->emitl])) curr_tr->type = uDEL_ST; else curr_tr->type = uMATL_ST; /* May have to deal with INSL */ for (i2 = curr_mtr->emitl+1; i2 < curr_mtr->nxtl->emitl; i2++) if (!isgap(aseq[i2])) curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST); break; case MATR_NODE: if (isgap(aseq[curr_mtr->emitr])) curr_tr->type = uDEL_ST; else curr_tr->type = uMATR_ST; /* May have to deal with INSR */ for (j2 = curr_mtr->emitr-1; j2 > curr_mtr->nxtl->emitr; j2--) if (! isgap(aseq[j2])) curr_tr = AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, j2, curr_mtr->nodeidx, uINSR_ST); break; case BIFURC_NODE: curr_tr->type = uBIFURC_ST; break; case BEGINL_NODE: curr_tr->type = uBEGIN_ST; break; case BEGINR_NODE: curr_tr->type = uBEGIN_ST; /* May have to deal with INSL. * Inserts from BEGINR are *inclusive* of i */ for (i2 = curr_mtr->emitl; i2 < curr_mtr->nxtl->emitl; i2++) if (!isgap(aseq[i2])) curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST); break; case ROOT_NODE: curr_tr->type = uBEGIN_ST; /* May have to deal with INSL and INSR; note INSL precedes INSR * inserts from root are inclusive of i */ for (i2 = curr_mtr->emitl; i2 < curr_mtr->nxtl->emitl; i2++) if (!isgap(aseq[i2])) curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST); /* May have to deal with INSR */ for (j2 = curr_mtr->emitr; j2 > curr_mtr->nxtl->emitr; j2--) if (! 
isgap(aseq[j2])) curr_tr = AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, j2, curr_mtr->nodeidx, uINSR_ST); break; default: Die("Invalid node type %d", curr_mtr->type); } /* Push the children onto stacks, if they're not END nodes */ if (curr_mtr->nxtr != NULL) { PushTracestack(mtr_stack, curr_mtr->nxtr); PushTracestack(tr_stack, AttachTrace(curr_tr, pool, curr_mtr->nxtr->emitl, curr_mtr->nxtr->emitr, curr_mtr->nxtr->nodeidx, curr_mtr->nxtr->type)); } if (curr_mtr->nxtl != NULL) { PushTracestack(mtr_stack, curr_mtr->nxtl); PushTracestack(tr_stack, AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, curr_mtr->nxtl->emitr, curr_mtr->nxtl->nodeidx, curr_mtr->nxtl->type)); } } FreeTracestack(mtr_stack); FreeTracestack(tr_stack); *ret_pool = pool; *ret_tr = tr; } /* Function: EasyModelmaker() * * Purpose: The customer always knows best. * * Construct a model given a stated structure. The structure * is provided via a "cs" (consensus sequence) line, as would * occur in an annotated SELEX file. Only > and < characters * in this line are interpreted (as base pairs). * * Match vs. insert can be determined one of two ways. By default, * the assignment is made by "gapthresh"; for columns with * fractional occurence of gaps greater than this, the column * is assigned to insert. If "use_rf" is TRUE, the rf (reference) * line is interpreted as the assignment -- columns with non-space * characters in the rf line are assigned to MATCH. * * Both rf and cs are provided in the ainfo structure. * * Args: aseq - aligned sequences. [0..nseq-1] by [0..alen-1] * ainfo - info about the alignment, including alen, cs, * and rf * nseq - number of seqs in aseq * prior - prior distributions for CM construction * gapthresh - over this fraction of gaps, assign column as INS * use_rf - if TRUE, use rf field of ainfo for MAT/INS assignment * ret_cm - RETURN: new model (maybe NULL) * ret_mtr - RETURN: master traceback for alignment (maybe NULL) * * Return: void * cm is allocated here. 
FreeCM(*ret_cm). * tr is allocated here. FreeTrace() on each one, then free(*ret_tr). */ void EasyModelmaker(char **aseq, AINFO *ainfo, int nseq, struct prior_s *prior, double gapthresh, int use_rf, struct cm_s **ret_cm, struct trace_s **ret_mtr) { struct cm_s *cm; /* new covariance model */ struct trace_s *mtr; /* master traceback tree for alignment */ struct trace_s *tr; /* individual sequence traceback tree */ struct trmem_s *pool; /* memory pool for traceback tree */ struct tracestack_s *dolist; struct trace_s *cur; int *matassign; int nodes; int idx, apos; int *ct; int i,j, nxti, nxtj; if (! (ainfo->flags & AINFO_CS)) Die("No cs (consensus structure) line available for that alignment."); /* Determine match/insert assignments * matassign is 0..alen-1. Values are 1 if MAT, 0 if INS. */ matassign = (int *) MallocOrDie(sizeof(int) * ainfo->alen); if (use_rf) { if (! (ainfo->flags & AINFO_RF)) Die("No rf (reference coord) line available for that alignment."); for (apos = 0; apos < ainfo->alen; apos++) matassign[apos] = (ainfo->rf[apos] == ' ') ? 0 : 1; } else { int gaps; for (apos = 0; apos < ainfo->alen; apos++) { for (gaps = 0, idx = 0; idx < nseq; idx++) if (isgap(aseq[idx][apos])) gaps++; matassign[apos] = ((double) gaps / (double) nseq > gapthresh) ? 0 : 1; } } /* Determine a "ct" array, base-pairing partners for each position */ if (! KHS2ct(ainfo->cs, ainfo->alen, FALSE, &ct)) Die("Consensus structure string is inconsistent"); /* Make sure the consensus structure "ct" is consistent with the match assignments. * Wipe out all structure under INS; including the base-paired * partner of INS-assigned positions */ for (apos = 0; apos < ainfo->alen; apos++) if (! matassign[apos]) { if (ct[apos] != -1) ct[ct[apos]] = -1; ct[apos] = -1; } /* Construct a master traceback tree. * This code is borrowed from yarn's KHS2Trace(). * mtr's emitl, emitr, and type are properly set by this section. 
*/ InitTrace(&mtr, NULL); dolist = InitTracestack(); cur = AttachTrace(mtr, NULL, 0, ainfo->alen-1, -1, ROOT_NODE); /* attach the root */ PushTracestack(dolist, cur); while ((cur = PopTracestack(dolist)) != NULL) { i = cur->emitl; j = cur->emitr; /* This node accounts for i..j, but we don't know how yet. * Six possibilities: * i > j; this is an END state; do nothing. * this is already assigned as a BEGIN; push i,j * i is unpaired; this is a MATL state; push i+1, j * j is unpaired; this is a MATR state; push i,j-1 * i,j pair to each other; this is a MATP state; push i+1,j-1 * i,j pair but not to each other; this is a BIFURC state; * pick mid ip <= mid < jp; push BEGIN i,mid and working i,mid, * and push BEGIN mid+1,j and working mid+1,j */ if (i > j) cur->type = uEND_ST; else if (cur->type == ROOT_NODE) { /* try to push i,j; but deal with INSL and INSR */ for (nxti = i; nxti <= j; nxti++) if (matassign[nxti]) break; for (nxtj = j; nxtj >= nxti; nxtj--) if (matassign[nxtj]) break; if (nxti <= nxtj) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, nxtj, -1, uEND_ST)); else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = nxtj; } /* deal with END_ST */ } else if (cur->type == BEGINL_NODE) /* no inserts */ { if (i <= j) PushTracestack(dolist, AttachTrace(cur, NULL, i, j, -1, uEND_ST)); else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = j; } } else if (cur->type == BEGINR_NODE) /* INSL */ { for (nxti = i; nxti <= j; nxti++) if (matassign[nxti]) break; if (nxti <= j) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, j, -1, uEND_ST)); else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = j; } /* deal with END_ST */ } else if (ct[i] == -1) /* i unpaired. This is a MATL node; allow INSL */ { cur->type = MATL_NODE; for (nxti = i+1; nxti <= j; nxti++) if (matassign[nxti]) break; if (nxti <= j) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, j, -1, uEND_ST)); else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = j; } /* deal with END_ST */ } else if (ct[j] == -1) /* j unpaired. 
MATR node. Deal with INSR */ { cur->type = MATR_NODE; for (nxtj = j-1; nxtj >= i; nxtj--) if (matassign[nxtj]) break; if (i <= nxtj) PushTracestack(dolist, AttachTrace(cur, NULL, i, nxtj, -1, uEND_ST)); else { cur->nxtl->emitl = i; cur->nxtl->emitr = nxtj; } /* deal with END_ST */ } else if (ct[i] == j) /* i,j paired to each other. MATP. deal with INSL, INSR */ { cur->type = MATP_NODE; for (nxti = i+1; nxti <= j; nxti++) if (matassign[nxti]) break; for (nxtj = j-1; nxtj >= nxti; nxtj--) if (matassign[nxtj]) break; if (nxti <= nxtj) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, nxtj, -1, uEND_ST)); else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = nxtj; } /* deal with END_ST */ } else /* i,j paired but not to each other. BIFURC. no INS. */ { /* by convention, right side of bifurc deals with insert in middle */ cur->type = BIFURC_NODE; PushTracestack(dolist, AttachTrace(cur, NULL, ct[i]+1, j, -1, BEGINR_NODE)); PushTracestack(dolist, AttachTrace(cur, NULL, i, ct[i], -1, BEGINL_NODE)); } } /* while something's on dolist stack */ FreeTracestack(dolist); free(ct); /* Now, do the drill for constructing a model using this master trace. */ NumberMasterTrace(mtr, &nodes); if ((cm = AllocCM(nodes)) == NULL) Die("failed to allocate for new model of %d nodes\n", nodes); TopofyNewCM(cm, mtr); for (idx = 0; idx < nseq; idx++) { Transmogrify(mtr, aseq[idx], &tr, &pool); if (! TraceCount(cm, aseq[idx], (ainfo->sqinfo[idx].flags & SQINFO_WGT) ? (double) ainfo->sqinfo[idx].weight : 1.0, tr)) Die("TraceCount() failed"); FreeTrace(tr, pool); } ProbifyCM(cm, prior); free(matassign); if (ret_cm != NULL) *ret_cm = cm; else FreeCM(cm); if (ret_mtr != NULL) *ret_mtr = mtr; else FreeTrace(mtr, NULL); } tRNAscan-SE-2.0/src/sre_math.c0000644000543100007160000002622111021467305015371 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. 
Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* sre_math.c * * Portability for and extensions to C math library. * */ #include #include #include "squid.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif static int sre_reseed = 0; /* TRUE to reinit sre_random() */ static int sre_randseed = 666; /* default seed for sre_random() */ /* Function: Gaussrandom() * * Pick a Gaussian-distributed random variable * with some mean and standard deviation, and * return it. * * Based on RANLIB.c public domain implementation. * Thanks to the authors, Barry W. Brown and James Lovato, * University of Texas, M.D. Anderson Cancer Center, Houston TX. * Their implementation is from Ahrens and Dieter, "Extensions * of Forsythe's method for random sampling from the normal * distribution", Math. Comput. 27:927-937 (1973). * * Impenetrability of the code is to be blamed on its FORTRAN/f2c lineage. * * Args: mean - added to the scaled deviate before returning * stddev - multiplies the unit deviate before mean is added * * Return: stddev*snorm + mean, where snorm is the deviate built below. * * NOTE(review): all working variables are static, so they are shared * between calls. * NOTE(review): if sre_random() ever returned exactly 0.0 on the first * draw, the S110/S120 loop below would never see u >= 1.0 and i would * run past the end of d[]; confirm sre_random() cannot produce 0.0. * */ float Gaussrandom(float mean, float stddev) { /* Lookup tables; presumably the constants of the Ahrens-Dieter method cited above -- TODO confirm the role of each table against the paper. */ static float a[32] = { 0.0,3.917609E-2,7.841241E-2,0.11777,0.1573107,0.1970991,0.2372021,0.2776904, 0.3186394,0.36013,0.4022501,0.4450965,0.4887764,0.5334097,0.5791322, 0.626099,0.6744898,0.7245144,0.7764218,0.8305109,0.8871466,0.9467818, 1.00999,1.077516,1.150349,1.229859,1.318011,1.417797,1.534121,1.67594, 1.862732,2.153875 }; static float d[31] = { 0.0,0.0,0.0,0.0,0.0,0.2636843,0.2425085,0.2255674,0.2116342,0.1999243, 0.1899108,0.1812252,0.1736014,0.1668419,0.1607967,0.1553497,0.1504094, 0.1459026,0.14177,0.1379632,0.1344418,0.1311722,0.128126,0.1252791, 0.1226109,0.1201036,0.1177417,0.1155119,0.1134023,0.1114027,0.1095039 }; static float t[31] = { 7.673828E-4,2.30687E-3,3.860618E-3,5.438454E-3,7.0507E-3,8.708396E-3, 1.042357E-2,1.220953E-2,1.408125E-2,1.605579E-2,1.81529E-2,2.039573E-2, 2.281177E-2,2.543407E-2,2.830296E-2,3.146822E-2,3.499233E-2,3.895483E-2, 4.345878E-2,4.864035E-2,5.468334E-2,6.184222E-2,7.047983E-2,8.113195E-2,
9.462444E-2,0.1123001,0.136498,0.1716886,0.2276241,0.330498,0.5847031 }; static float h[31] = { 3.920617E-2,3.932705E-2,3.951E-2,3.975703E-2,4.007093E-2,4.045533E-2, 4.091481E-2,4.145507E-2,4.208311E-2,4.280748E-2,4.363863E-2,4.458932E-2, 4.567523E-2,4.691571E-2,4.833487E-2,4.996298E-2,5.183859E-2,5.401138E-2, 5.654656E-2,5.95313E-2,6.308489E-2,6.737503E-2,7.264544E-2,7.926471E-2, 8.781922E-2,9.930398E-2,0.11556,0.1404344,0.1836142,0.2790016,0.7010474 }; static long i; static float snorm,u,s,ustar,aa,w,y,tt; /* s remembers which half of the unit interval u fell in; it decides the sign of the result at S50 */ u = sre_random(); s = 0.0; if(u > 0.5) s = 1.0; /* fold u, scale it by 32 and use the integer part as the table index; an index of 32 is clamped to 31, index 0 selects the tail path */ u += (u-s); u = 32.0*u; i = (long) (u); if(i == 32) i = 31; if(i == 0) goto S100; /* * START CENTER */ ustar = u-(float)i; aa = *(a+i-1); S40: if(ustar <= *(t+i-1)) goto S60; w = (ustar-*(t+i-1))**(h+i-1); S50: /* * EXIT (BOTH CASES) */ y = aa+w; snorm = y; if(s == 1.0) snorm = -y; /* scale the unit deviate by stddev and shift it by mean */ return (stddev*snorm + mean); S60: /* * CENTER CONTINUED */ u = sre_random(); w = u*(*(a+i)-aa); tt = (0.5*w+aa)*w; goto S80; S70: tt = u; ustar = sre_random(); S80: if(ustar > tt) goto S50; u = sre_random(); if(ustar >= u) goto S70; ustar = sre_random(); goto S40; S100: /* * START TAIL */ i = 6; aa = *(a+31); goto S120; S110: aa += *(d+i-1); i += 1; S120: /* keep doubling u, advancing i and accumulating d[] into aa, until u reaches 1.0 */ u += u; if(u < 1.0) goto S110; u -= 1.0; S140: w = u**(d+i-1); tt = (0.5*w+aa)*w; goto S160; S150: tt = u; S160: ustar = sre_random(); if(ustar > tt) goto S50; u = sre_random(); if(ustar >= u) goto S150; u = sre_random(); goto S140; } /* Function: Linefit() * * Purpose: Given points x[0..N-1] and y[0..N-1], fit to * a straight line y = a + bx. * a, b, and the linear correlation coefficient r * are filled in for return. * * Args: x - x values of data * y - y values of data * N - number of data points * ret_a - RETURN: intercept * ret_b - RETURN: slope * ret_r - RETURN: correlation coefficient * * Return: 1 on success, 0 on failure.
*/ int Linefit(float *x, float *y, int N, float *ret_a, float *ret_b, float *ret_r) { float xbar, ybar; float sxx, syy, sxy; int i; /* Calculate averages, xbar and ybar */ xbar = ybar = 0.0; for (i = 0; i < N; i++) { xbar += x[i]; ybar += y[i]; } xbar /= N; ybar /= N; sxx = syy = sxy = 0.0; for (i = 0; i < N; i++) { sxx += (x[i] - xbar) * (x[i] - xbar); syy += (y[i] - ybar) * (y[i] - xbar); sxy += (x[i] - xbar) * (y[i] - ybar); } *ret_b = sxy / sxx; *ret_a = ybar - xbar*(*ret_b); *ret_r = sxy / (sqrt(sxx) * sqrt(syy)); return 1; } /* Function: Gammln() * * Returns the natural log of the gamma function of x. * x is > 0.0. * * Adapted from a public domain implementation in the * NCBI core math library. Thanks to John Spouge and * the NCBI. (According to the NCBI, that's Dr. John * "Gammas Galore" Spouge to you, pal.) */ float Gammln(float x) { int i; double xx, tx; double tmp, value; static double cof[11] = { 4.694580336184385e+04, -1.560605207784446e+05, 2.065049568014106e+05, -1.388934775095388e+05, 5.031796415085709e+04, -9.601592329182778e+03, 8.785855930895250e+02, -3.155153906098611e+01, 2.908143421162229e-01, -2.319827630494973e-04, 1.251639670050933e-10 }; /* Protect against x=0. We see this in Dirichlet code, * for terms alpha = 0. This is a severe hack but it is effective * and safe. (due to GJM) */ if (x <= 0.0) return 999999.; xx = x - 1.0; tx = tmp = xx + 11.0; value = 1.0; for (i = 10; i >= 0; i--) /* sum least significant terms first */ { value += cof[i] / tmp; tmp -= 1.0; } value = log(value); tx += 0.5; value += 0.918938533 + (xx+0.5)*log(tx) - tx; return (float) value; } /* Vector operations for doubles and floats. * DNorm(), FNorm() -- normalize a probability vector of length n. * return 0 if all values were zero. * DScale(), FScale() -- multiply all items in vector by scale * DSet(), FSet() -- set all items in vector to value. 
*/ int DNorm(double *vec, int n) { int x; double sum; sum = 0.0; for (x = 0; x < n; x++) sum += vec[x]; if (sum != 0.0) for (x = 0; x < n; x++) vec[x] /= sum; else { squid_errno = SQERR_DIVZERO; return 0; } return 1; } int FNorm(float *vec, int n) { int x; float sum; sum = 0.0; for (x = 0; x < n; x++) sum += vec[x]; if (sum != 0.0) for (x = 0; x < n; x++) vec[x] /= sum; else { squid_errno = SQERR_DIVZERO; return 0; } return 1; } void DScale(double *vec, int n, double scale) { int x; for (x = 0; x < n; x++) vec[x] *= scale; } void FScale(float *vec, int n, float scale) { int x; for (x = 0; x < n; x++) vec[x] *= scale; } void DSet(double *vec, int n, double value) { int x; for (x = 0; x < n; x++) vec[x] *= value; } void FSet(float *vec, int n, float value) { int x; for (x = 0; x < n; x++) vec[x] *= value; } double DSum(double *vec, int n) { double sum = 0.; int x; for (x = 0; x < n; x++) sum += vec[x]; return sum; } float FSum(float *vec, int n) { float sum = 0.; int x; for (x = 0; x < n; x++) sum += vec[x]; return sum; } /* Function: sre_random() * * Purpose: Return a uniform deviate from 0.0 to 1.0. * sre_randseed is a static variable, set * by sre_srandom(). sre_reseed is a static flag * raised by sre_srandom(), saying that we need * to re-initialize. * [0.0 <= x < 1.0] * * Uses a simple linear congruential generator with * period 2^28. Based on discussion in Robert Sedgewick's * _Algorithms in C_, Addison-Wesley, 1990. * * Requires that long int's have at least 32 bits. * * Reliable and portable, but slow. 
Benchmarks on wol, * using IRIX cc and IRIX C library rand() and random(): * sre_random(): 0.8 usec/call * random(): 0.3 usec/call * rand(): 0.3 usec/call */ #define RANGE 268435456 /* 2^28 */ #define DIV 16384 /* sqrt(RANGE) */ #define MULT 72530821 /* my/Cathy's birthdays, x21, x even (Knuth)*/ float sre_random(void) { static long rnd; static int firsttime = 1; long high1, low1; long high2, low2; if (sre_reseed || firsttime) { sre_reseed = firsttime = 0; if (sre_randseed <= 0) sre_randseed = 666; /* seeds of zero break me */ high1 = sre_randseed / DIV; low1 = sre_randseed % DIV; high2 = MULT / DIV; low2 = MULT % DIV; rnd = (((high2*low1 + high1*low2) % DIV)*DIV + low1*low2) % RANGE; } high1 = rnd / DIV; low1 = rnd % DIV; high2 = MULT / DIV; low2 = MULT % DIV; rnd = (((high2*low1 + high1*low2) % DIV)*DIV + low1*low2) % RANGE; return ((float) rnd / (float) RANGE); } #undef RANGE #undef DIV #undef MULT /* Function: sre_srandom() * * Purpose: Initialize with a random seed. Seed can be * any integer. */ void sre_srandom(int seed) { if (seed < 0) seed = -1 * seed; sre_reseed = 1; sre_randseed = seed; } /* Functions: DChoose(), FChoose() * * Purpose: Make a random choice from a normalized distribution. * DChoose() is for double-precision vectors; * FChoose() is for single-precision float vectors. * Returns the number of the choice. 
*/ int DChoose(double *p, int N) { double roll; /* random fraction */ double sum; /* integrated prob */ int i; /* counter over the probs */ roll = sre_random(); sum = 0.0; for (i = 0; i < N; i++) { sum += p[i]; if (roll < sum) return i; } return (int) (sre_random() * N); /* bulletproof */ } int FChoose(float *p, int N) { float roll; /* random fraction */ float sum; /* integrated prob */ int i; /* counter over the probs */ roll = sre_random(); sum = 0.0; for (i = 0; i < N; i++) { sum += p[i]; if (roll < sum) return i; } return (int) (sre_random() * N); /* bulletproof */ } /* Functions: DLogSum(), FLogSum() * * Calculate the sum of a log vector * *in normal space*, and return the log of the sum. */ double DLogSum(double *logp, int n) { int x; double max = -1.0e30; double sum; for (x = 0; x < n; x++) if (logp[x] > max) max = logp[x]; sum = 0.0; for (x = 0; x < n; x++) if (logp[x] > max - 50.) sum += exp(logp[x] - max); sum = log(sum) + max; return sum; } float FLogSum(float *logp, int n) { int x; float max = -1.0e30; float sum; for (x = 0; x < n; x++) if (logp[x] > max) max = logp[x]; sum = 0.0; for (x = 0; x < n; x++) if (logp[x] > max - 50.) sum += exp(logp[x] - max); sum = log(sum) + max; return sum; } tRNAscan-SE-2.0/src/iupac.c0000644000543100007160000001044011021467304014663 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* iupac.c * * Globally defines the IUPAC symbols for nucleic acid sequence * Slowly evolving into a repository of globals. Tue Apr 20 1993 */ #include "squid.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif char squid_version[] = "1.5g"; char squid_date[] = "March 1996"; /* Dayhoff f(i) amino acid occurrence frequencies. From BLAST 1.0.5, NCBI. * In alphabetic order by single-letter code. 
*/ float aafq[20] = { .08713, .03347, .04687, .04953, .03977, .08861, .03362, .03689, .08048, .08536, .01475, .04043, .05068, .03826, .04090, .06958, .05854, .06472, .01049, .02992 }; char aa_alphabet[] = AMINO_ALPHABET; /* aa_index converts to pam's 27x27 scheme */ int aa_index[20] = { 0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 24 }; /* IUPAC code translations */ /* note: sequence chars are UPPER CASE */ struct iupactype iupac[] = { { 'A', 'T', NTA, NTT, }, { 'C', 'G', NTC, NTG, }, { 'G', 'C', NTG, NTC, }, { 'T', 'A', NTT, NTA, }, { 'U', 'A', NTU, NTA, }, { 'N', 'N', NTN, NTN, }, { ' ', ' ', NTGAP, NTGAP, }, { 'R', 'Y', NTR, NTY, }, { 'Y', 'R', NTY, NTR, }, { 'M', 'K', NTM, NTK, }, { 'K', 'M', NTK, NTM, }, { 'S', 'S', NTS, NTS, }, { 'W', 'W', NTW, NTW, }, { 'H', 'D', NTH, NTD, }, { 'B', 'V', NTB, NTV, }, { 'V', 'B', NTV, NTB, }, { 'D', 'H', NTD, NTH, }, }; char *stdcode1[65] = { "K", /* AAA */ "N", /* AAC */ "K", /* AAG */ "N", /* AAU */ "T", /* ACA */ "T", /* ACC */ "T", /* ACG */ "T", /* ACU */ "R", /* AGA */ "S", /* AGC */ "R", /* AGG */ "S", /* AGU */ "I", /* AUA */ "I", /* AUC */ "M", /* AUG */ "I", /* AUU */ "Q", /* CAA */ "H", /* CAC */ "Q", /* CAG */ "H", /* CAU */ "P", /* CCA */ "P", /* CCC */ "P", /* CCG */ "P", /* CCU */ "R", /* CGA */ "R", /* CGC */ "R", /* CGG */ "R", /* CGU */ "L", /* CUA */ "L", /* CUC */ "L", /* CUG */ "L", /* CUU */ "E", /* GAA */ "D", /* GAC */ "E", /* GAG */ "D", /* GAU */ "A", /* GCA */ "A", /* GCC */ "A", /* GCG */ "A", /* GCU */ "G", /* GGA */ "G", /* GGC */ "G", /* GGG */ "G", /* GGU */ "V", /* GUA */ "V", /* GUC */ "V", /* GUG */ "V", /* GUU */ "*", /* UAA */ "Y", /* UAC */ "*", /* UAG */ "Y", /* UAU */ "S", /* UCA */ "S", /* UCC */ "S", /* UCG */ "S", /* UCU */ "*", /* UGA */ "C", /* UGC */ "W", /* UGG */ "C", /* UGU */ "L", /* UUA */ "F", /* UUC */ "L", /* UUG */ "F", /* UUU */ "X", /* unknown */ }; char *stdcode3[65] = { "Lys", /* AAA */ "Asn", /* AAC */ "Lys", /* AAG */ "Asn", /* AAU */ "Thr", 
/* ACA */ "Thr", /* ACC */ "Thr", /* ACG */ "Thr", /* ACU */ "Arg", /* AGA */ "Ser", /* AGC */ "Arg", /* AGG */ "Ser", /* AGU */ "Ile", /* AUA */ "Ile", /* AUC */ "Met", /* AUG */ "Ile", /* AUU */ "Gln", /* CAA */ "His", /* CAC */ "Gln", /* CAG */ "His", /* CAU */ "Pro", /* CCA */ "Pro", /* CCC */ "Pro", /* CCG */ "Pro", /* CCU */ "Arg", /* CGA */ "Arg", /* CGC */ "Arg", /* CGG */ "Arg", /* CGU */ "Leu", /* CUA */ "Leu", /* CUC */ "Leu", /* CUG */ "Leu", /* CUU */ "Glu", /* GAA */ "Asp", /* GAC */ "Glu", /* GAG */ "Asp", /* GAU */ "Ala", /* GCA */ "Ala", /* GCC */ "Ala", /* GCG */ "Ala", /* GCU */ "Gly", /* GGA */ "Gly", /* GGC */ "Gly", /* GGG */ "Gly", /* GGU */ "Val", /* GUA */ "Val", /* GUC */ "Val", /* GUG */ "Val", /* GUU */ "***", /* UAA */ "Tyr", /* UAC */ "***", /* UAG */ "Tyr", /* UAU */ "Ser", /* UCA */ "Ser", /* UCC */ "Ser", /* UCG */ "Ser", /* UCU */ "***", /* UGA */ "Cys", /* UGC */ "Trp", /* UGG */ "Cys", /* UGU */ "Leu", /* UUA */ "Phe", /* UUC */ "Leu", /* UUG */ "Trp", /* UUU */ "XXX", /* unknown */ }; tRNAscan-SE-2.0/src/alignio.c0000644000543100007160000003213011021467303015203 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* alignio.c * SRE, Mon Jul 12 11:57:37 1993 * * Input/output of sequence alignments. */ #include #include #include #include "squid.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif /* Function: FreeAlignment() * * Purpose: Free the space allocated to alignment, names, and optional * information. * * Args: aseqs - sequence alignment * nseq - number of sequences * ainfo - optional extra data. May be NULL. 
*/ void FreeAlignment(char **aseqs, int nseq, struct aliinfo_s *ainfo) { int i; for (i = 0; i < nseq; i++) { if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss); if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa); } if (ainfo->flags & AINFO_CS) free(ainfo->cs); if (ainfo->flags & AINFO_RF) free(ainfo->rf); free(ainfo->sqinfo); Free2DArray(aseqs, nseq); } /* Function: MakeAlignedString() * * Purpose: Given a raw string of some type (secondary structure, say), * align it to a given aseq by putting gaps wherever the * aseq has gaps. * * Args: aseq: template for alignment * alen: length of aseq * ss: raw string to align to aseq * ret_s: RETURN: aligned ss * * Return: 1 on success, 0 on failure (and squid_errno is set.) * ret_ss is malloc'ed here and must be free'd by caller. */ int MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s) { char *new; int apos, rpos; if ((new = (char *) malloc ((alen+1) * sizeof(char))) == NULL) { squid_errno = SQERR_MEM; return 0; } for (apos = rpos = 0; apos < alen; apos++) if (! isgap(aseq[apos])) { new[apos] = ss[rpos]; rpos++; } else new[apos] = '.'; new[apos] = '\0'; if (rpos != strlen(ss)) { squid_errno = SQERR_PARAMETER; free(new); return 0; } *ret_s = new; return 1; } /* Function: MakeDealignedString() * * Purpose: Given an aligned string of some type (either sequence or * secondary structure, for instance), dealign it relative * to a given aseq. Return a ptr to the new string. 
* * Args: aseq : template alignment * alen : length of aseq * ss: : string to make dealigned copy of; same length as aseq * ret_s : RETURN: dealigned copy of ss * * Return: 1 on success, 0 on failure (and squid_errno is set) * ret_s is alloc'ed here and must be freed by caller */ int MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s) { char *new; int apos, rpos; if ((new = (char *) malloc ((alen+1) * sizeof(char))) == NULL) { squid_errno = SQERR_MEM; return 0; } for (apos = rpos = 0; apos < alen; apos++) if (! isgap(aseq[apos])) { new[rpos] = ss[apos]; rpos++; } new[rpos] = '\0'; if (alen != strlen(ss)) { squid_errno = SQERR_PARAMETER; free(new); return 0; } *ret_s = new; return 1; } /* Function: WritePairwiseAlignment() * * Purpose: Write a nice formatted pairwise alignment out, * with a BLAST-style middle line showing identities * as themselves (single letter) and conservative * changes as '+'. * * Args: ofp - open fp to write to (stdout, perhaps) * aseq1, aseq2 - alignments to write (not necessarily * flushed right with gaps) * name1, name2 - names of sequences * spos1, spos2 - starting position in each (raw) sequence * pam - PAM matrix; positive values define * conservative changes * indent - how many extra spaces to print on left * * Return: 1 on success, 0 on failure */ int WritePairwiseAlignment(FILE *ofp, char *aseq1, char *name1, int spos1, char *aseq2, char *name2, int spos2, int **pam, int indent) { char sname1[11]; /* shortened name */ char sname2[11]; int still_going; /* True if writing another block */ char buf1[61]; /* buffer for writing seq1; CPL+1*/ char bufmid[61]; /* buffer for writing consensus */ char buf2[61]; char *s1, *s2; /* ptrs into each sequence */ int count1, count2; /* number of symbols we're writing */ int rpos1, rpos2; /* position in raw seqs */ int rawcount1, rawcount2; /* number of nongap symbols written */ int apos; strncpy(sname1, name1, 10); sname1[10] = '\0'; strtok(sname1, WHITESPACE); strncpy(sname2, name2, 
10); sname2[10] = '\0'; strtok(sname2, WHITESPACE); s1 = aseq1; s2 = aseq2; rpos1 = spos1; rpos2 = spos2; still_going = True; while (still_going) { still_going = False; /* get next line's worth from both */ strncpy(buf1, s1, 60); buf1[60] = '\0'; strncpy(buf2, s2, 60); buf2[60] = '\0'; count1 = strlen(buf1); count2 = strlen(buf2); /* is there still more to go? */ if ((count1 == 60 && s1[60] != '\0') || (count2 == 60 && s2[60] != '\0')) still_going = True; /* shift seq ptrs by a line */ s1 += count1; s2 += count2; /* assemble the consensus line */ for (apos = 0; apos < count1 && apos < count2; apos++) { if (!isgap(buf1[apos]) && !isgap(buf2[apos])) { if (buf1[apos] == buf2[apos]) bufmid[apos] = buf1[apos]; else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0) bufmid[apos] = '+'; else bufmid[apos] = ' '; } else bufmid[apos] = ' '; } bufmid[apos] = '\0'; rawcount1 = 0; for (apos = 0; apos < count1; apos++) if (!isgap(buf1[apos])) rawcount1++; rawcount2 = 0; for (apos = 0; apos < count2; apos++) if (!isgap(buf2[apos])) rawcount2++; (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", sname1, rpos1, buf1, rpos1 + rawcount1 -1); (void) fprintf(ofp, "%*s %s\n", indent, "", bufmid); (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", sname2, rpos2, buf2, rpos2 + rawcount2 -1); (void) fprintf(ofp, "\n"); rpos1 += rawcount1; rpos2 += rawcount2; } return 1; } /* Function: MingapAlignment() * * Purpose: Remove all-gap columns from a multiple sequence alignment * and its associated data. The alignment is assumed to be * flushed (all aseqs the same length). */ int MingapAlignment(char **aseqs, int num, struct aliinfo_s *ainfo) { int apos; /* position in original alignment */ int mpos; /* position in new alignment */ int idx; /* We overwrite aseqs, using its allocated memory. */ for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++) { /* check for all-gap in column */ for (idx = 0; idx < num; idx++) if (! 
isgap(aseqs[idx][apos])) break; if (idx == num) continue; /* shift alignment and ainfo */ if (mpos != apos) { for (idx = 0; idx < num; idx++) aseqs[idx][mpos] = aseqs[idx][apos]; if (ainfo->flags & AINFO_CS) ainfo->cs[mpos] = ainfo->cs[apos]; if (ainfo->flags & AINFO_RF) ainfo->rf[mpos] = ainfo->rf[apos]; } mpos++; } /* null terminate everything */ for (idx = 0; idx < num; idx++) aseqs[idx][mpos] = '\0'; ainfo->alen = mpos; /* set new length */ if (ainfo->flags & AINFO_CS) ainfo->cs[mpos] = '\0'; if (ainfo->flags & AINFO_RF) ainfo->rf[mpos] = '\0'; return 1; } /* Function: RandomAlignment() * * Purpose: Create a random alignment from raw sequences. * * Ideally, we would like to sample an alignment from the * space of possible alignments according to its probability, * given a prior probability distribution for alignments. * I don't see how to describe such a distribution, let alone * sample it. * * This is a rough approximation that tries to capture some * desired properties. We assume the alignment is generated * by a simple HMM composed of match and insert states. * Given parameters (pop, pex) for the probability of opening * and extending an insertion, we can find the expected number * of match states, M, in the underlying model for each sequence. * We use an average M taken over all the sequences (this is * an approximation. The expectation of M given all the sequence * lengths is a nasty-looking summation.) * * M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) ) * * Then, we assign positions in each raw sequence onto the M match * states and M+1 insert states of this "HMM", by rolling random * numbers and inserting the (rlen-M) inserted positions randomly * into the insert slots, taking into account the relative probability * of open vs. extend. * * The resulting alignment has two desired properties: insertions * tend to follow the HMM-like exponential distribution, and * the "sparseness" of the alignment is controllable through * pop and pex. 
* * Args: rseqs - raw sequences to "align", 0..nseq-1 * sqinfo - array of 0..nseq-1 info structures for the sequences * nseq - number of sequences * pop - probability to open insertion (0 minlen) M = minlen; /* make arrays that count insertions in M+1 possible insert states */ if ((ins = (int **) malloc (sizeof(int *) * nseq)) == NULL || (master_ins = (int *) malloc (sizeof(int) * (M+1))) == NULL) { squid_errno = SQERR_MEM; return 0; } for (idx = 0; idx < nseq; idx++) { if ((ins[idx] = (int *) malloc (sizeof(int) * (M+1))) == NULL) { squid_errno = SQERR_MEM; return 0; } for (rpos = 0; rpos <= M; rpos++) ins[idx][rpos] = 0; } /* normalize */ pop = pop / (pop+pex); pex = 1.0 - pop; /* make insertions for individual sequences */ for (idx = 0; idx < nseq; idx++) { apos = -1; for (rpos = 0; rpos < rlen[idx]-M; rpos++) { if (sre_random() < pop || apos == -1) /* open insertion */ apos = CHOOSE(M+1); /* choose 0..M */ ins[idx][apos]++; } } /* calculate master_ins, max inserts */ alen = M; for (apos = 0; apos <= M; apos++) { master_ins[apos] = 0; for (idx = 0; idx < nseq; idx++) if (ins[idx][apos] > master_ins[apos]) master_ins[apos] = ins[idx][apos]; alen += master_ins[apos]; } /* Now, construct alignment */ if ((aseqs = (char **) malloc (sizeof (char *) * nseq)) == NULL) { squid_errno = SQERR_MEM; return 0; } for (idx = 0; idx < nseq; idx++) if ((aseqs[idx] = (char *) malloc (sizeof(char) * (alen+1))) == NULL) { squid_errno = SQERR_MEM; return 0; } for (idx = 0; idx < nseq; idx++) { apos = rpos = 0; for (statepos = 0; statepos <= M; statepos++) { for (count = 0; count < ins[idx][statepos]; count++) aseqs[idx][apos++] = rseqs[idx][rpos++]; for (; count < master_ins[statepos]; count++) aseqs[idx][apos++] = ' '; if (statepos != M) aseqs[idx][apos++] = rseqs[idx][rpos++]; } aseqs[idx][alen] = '\0'; } ainfo->flags = 0; ainfo->alen = alen; ainfo->flags |= AINFO_ALEN; if ((ainfo->sqinfo = (SQINFO *) malloc (sizeof(SQINFO) * nseq)) == NULL) Die("malloc failed"); for (idx = 0; idx 
< nseq; idx++) SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx])); free(rlen); free(master_ins); Free2DArray(ins, nseq); *ret_aseqs = aseqs; return 1; } tRNAscan-SE-2.0/src/dbmalloc.h0000644000543100007160000004316014044625413015356 0ustar pchanlowelab/* * (c) Copyright 1990, 1991, 1992 Conor P. Cahill (uunet!virtech!cpcahil). * * This code is distributed under GNU GENERAL PUBLIC LICENSE v3. */ /* * $Id: malloc.h.org,v 1.31 1992/06/22 23:40:10 cpcahil Exp $ */ #ifndef _DEBUG_MALLOC_INC #define _DEBUG_MALLOC_INC 1 #ifdef force_cproto_to_use_defines /* * these are just here because cproto used the c-preprocessor to generate * the prototypes and if they were left as #defines the prototypes.h file * would have the contents of the define, not the define itself */ typedef char DATATYPE; typedef int SIZETYPE; typedef void VOIDTYPE; typedef char MEMDATA; typedef int MEMSIZE; typedef int STRSIZE; #ifdef WRTSIZE #undef WRTSIZE #endif typedef unsigned int WRTSIZE; /* * for now, define CONST as const. A sed script in the makefile will change * this back to CONST in the prototypes.h file. */ #define CONST const #else /* force_cproto_to_use_defines */ /* * The following entries are automatically added by the Configure script. * If they are not correct for your system, then Configure is not handling * your system correctly. 
Please report this to the author along with * a description of your system and the correct values */ #if __GNUC__ && __STDC__ && __cplusplus #define VOIDTYPE void #define CONST const #define DATATYPE void #define SIZETYPE size_t #define MEMDATA void #define MEMSIZE size_t #define MEMCMPTYPE unsigned char #define STRSIZE size_t #define STRCMPTYPE unsigned char #else /* __GNUC__ && __STDC__ && __cplusplus */ #define VOIDTYPE void #define CONST const #define DATATYPE void #define SIZETYPE size_t #define MEMDATA void #define MEMSIZE int #define MEMCMPTYPE unsigned char #define STRSIZE size_t #define STRCMPTYPE unsigned char #endif /* __GNUC__ && __STDC__ && __cplusplus */ /* * END of automatic configuration stuff. */ /* * if DATATYPE is not defined, then the configure script must have had a * problem, or was used with a different compiler. So we have to stop * here and get the user to fix the problem. */ #ifndef DATATYPE /* * the following string should cause a comilation error and get the * user to look at this stuff to find out what is wrong. */ "This file is not configured correctly for this system. Run configure and check its results" char * malloc(); /* DON'T REMOVE THIS LINE if you get a compiler error here it is because the malloc.h file is not configured correctly See the readme/problems files for more info */ #endif /* DATATYPE */ #endif /* force_cproto_to_use_defines */ #define VOIDCAST (VOIDTYPE) /* * since we redefine much of the stuff that is #defined in string.h and * memory.h, we should do what we can to make sure that they don't get * included after us. This is typically accomplished by a special symbol * (similar to _DEBUG_MALLOC_INC defined above) that is #defined when the * file is included. Since we don't want the file to be included we will * #define the symbol ourselves. These will typically have to change from * one system to another. 
I have put in several standard mechanisms used to * support this mechanism, so hopefully you won't have to modify this file. */ #ifndef _H_STRING #define _H_STRING 1 #endif #ifndef __STRING_H #define __STRING_H 1 #endif #ifndef _STRING_H_ #define _STRING_H_ 1 #endif #ifndef _STRING_H #define _STRING_H 1 #endif #ifndef _STRING_INCLUDED #define _STRING_INCLUDED 1 #endif #ifndef __string_h #define __string_h 1 #endif #ifndef _string_h #define _string_h 1 #endif #ifndef _strings_h #define _strings_h 1 #endif #ifndef __strings_h #define __strings_h 1 #endif #ifndef _H_MEMORY #define _H_MEMORY 1 #endif #ifndef __MEMORY_H #define __MEMORY_H 1 #endif #ifndef _MEMORY_H_ #define _MEMORY_H_ 1 #endif #ifndef _MEMORY_H #define _MEMORY_H 1 #endif #ifndef _MEMORY_INCLUDED #define _MEMORY_INCLUDED 1 #endif #ifndef __memory_h #define __memory_h 1 #endif #ifndef _memory_h #define _memory_h 1 #endif /* * Malloc warning/fatal error handler defines... */ #define M_HANDLE_DUMP 0x80 /* 128 */ #define M_HANDLE_IGNORE 0 #define M_HANDLE_ABORT 1 #define M_HANDLE_EXIT 2 #define M_HANDLE_CORE 3 /* * Mallopt commands and defaults * * the first four settings are ignored by the debugging mallopt, but are * here to maintain compatibility with the system malloc.h. 
*/ #define M_MXFAST 1 /* ignored by mallopt */ #define M_NLBLKS 2 /* ignored by mallopt */ #define M_GRAIN 3 /* ignored by mallopt */ #define M_KEEP 4 /* ignored by mallopt */ #define MALLOC_WARN 100 /* set malloc warning handling */ #define MALLOC_FATAL 101 /* set malloc fatal handling */ #define MALLOC_ERRFILE 102 /* specify malloc error file */ #define MALLOC_CKCHAIN 103 /* turn on chain checking */ #define MALLOC_FILLAREA 104 /* turn off area filling */ #define MALLOC_LOWFRAG 105 /* use best fit allocation mech */ #define MALLOC_CKDATA 106 /* turn off/on data checking */ #define MALLOC_REUSE 107 /* turn off/on freed seg reuse */ #define MALLOC_SHOWLINKS 108 /* turn off/on adjacent link disp */ union malloptarg { int i; char * str; }; /* * Malloc warning/fatal error codes */ #define M_CODE_CHAIN_BROKE 1 /* malloc chain is broken */ #define M_CODE_NO_END 2 /* chain end != endptr */ #define M_CODE_BAD_PTR 3 /* pointer not in malloc area */ #define M_CODE_BAD_MAGIC 4 /* bad magic number in header */ #define M_CODE_BAD_CONNECT 5 /* chain pointers corrupt */ #define M_CODE_OVERRUN 6 /* data overrun in malloc seg */ #define M_CODE_REUSE 7 /* reuse of freed area */ #define M_CODE_NOT_INUSE 8 /* pointer is not in use */ #define M_CODE_NOMORE_MEM 9 /* no more memory available */ #define M_CODE_OUTOF_BOUNDS 10 /* gone beyond bounds */ #define M_CODE_FREELIST_BAD 11 /* inuse segment on freelist */ #define M_CODE_NOBOUND 12 /* can't calculate boundary */ #define M_CODE_STK_NOCUR 13 /* no current element on stack */ #define M_CODE_STK_BADFUNC 14 /* current func doesn't match */ #define M_CODE_UNDERRUN 15 /* data underrun in malloc seg */ #ifndef __STDCARGS #if __STDC__ || __cplusplus #define __STDCARGS(a) a #else #define __STDCARGS(a) () #endif #endif #if __cplusplus extern "C" { #endif VOIDTYPE malloc_dump __STDCARGS((int)); VOIDTYPE malloc_list __STDCARGS((int,unsigned long, unsigned long)); int mallopt __STDCARGS((int, union malloptarg)); DATATYPE * debug_calloc 
__STDCARGS((CONST char *,int,SIZETYPE,SIZETYPE)); VOIDTYPE debug_cfree __STDCARGS((CONST char *, int, DATATYPE *)); VOIDTYPE debug_free __STDCARGS((CONST char *, int, DATATYPE *)); DATATYPE * debug_malloc __STDCARGS((CONST char *,int, SIZETYPE)); DATATYPE * debug_realloc __STDCARGS((CONST char *,int, DATATYPE *,SIZETYPE)); unsigned long DBmalloc_size __STDCARGS((CONST char *,int,unsigned long *)); int DBmalloc_chain_check __STDCARGS((CONST char *,int,int)); void StackEnter __STDCARGS((CONST char *, CONST char *, int)); void StackLeave __STDCARGS((CONST char *, CONST char *, int)); /* * X allocation related prototypes */ char * debug_XtMalloc __STDCARGS((CONST char *, int, unsigned int)); char * debug_XtRealloc __STDCARGS((CONST char *, int, char *, unsigned int)); char * debug_XtCalloc __STDCARGS((CONST char *, int, unsigned int, unsigned int)); void debug_XtFree __STDCARGS((CONST char *, int, char *)); void * debug_XtBCopy __STDCARGS((CONST char *, int, char *, char *, int)); extern void (*XtAllocErrorHandler) __STDCARGS((CONST char *)); /* * memory(3) related prototypes */ MEMDATA * DBmemccpy __STDCARGS((CONST char *file, int line, MEMDATA *ptr1, CONST MEMDATA *ptr2, int ch, MEMSIZE len)); MEMDATA * DBmemchr __STDCARGS((CONST char *file, int line, CONST MEMDATA *ptr1, int ch, MEMSIZE len)); MEMDATA * DBmemmove __STDCARGS((CONST char *file, int line, MEMDATA *ptr1, CONST MEMDATA *ptr2, MEMSIZE len)); MEMDATA * DBmemcpy __STDCARGS((CONST char *file, int line, MEMDATA *ptr1, CONST MEMDATA *ptr2, MEMSIZE len)); int DBmemcmp __STDCARGS((CONST char *file, int line, CONST MEMDATA *ptr1, CONST MEMDATA *ptr2, MEMSIZE len)); MEMDATA * DBmemset __STDCARGS((CONST char *file, int line, MEMDATA *ptr1, int ch, MEMSIZE len)); MEMDATA * DBbcopy __STDCARGS((CONST char *file, int line, CONST MEMDATA *ptr2, MEMDATA *ptr1, MEMSIZE len)); MEMDATA * DBbzero __STDCARGS((CONST char *file, int line, MEMDATA *ptr1, MEMSIZE len)); int DBbcmp __STDCARGS((CONST char *file, int line, CONST 
MEMDATA *ptr2, CONST MEMDATA *ptr1, MEMSIZE len)); /* * string(3) related prototypes */ char * DBstrcat __STDCARGS((CONST char *file,int line, char *str1, CONST char *str2)); char * DBstrdup __STDCARGS((CONST char *file, int line, CONST char *str1)); char * DBstrncat __STDCARGS((CONST char *file, int line, char *str1, CONST char *str2, STRSIZE len)); int DBstrcmp __STDCARGS((CONST char *file, int line, CONST char *str1, CONST char *str2)); int DBstrncmp __STDCARGS((CONST char *file, int line, CONST char *str1, CONST char *str2, STRSIZE len)); int DBstricmp __STDCARGS((CONST char *file, int line, CONST char *str1, CONST char *str2)); int DBstrincmp __STDCARGS((CONST char *file, int line, CONST char *str1, CONST char *str2, STRSIZE len)); char * DBstrcpy __STDCARGS((CONST char *file, int line, char *str1, CONST char *str2)); char * DBstrncpy __STDCARGS((CONST char *file, int line, char *str1, CONST char *str2, STRSIZE len)); STRSIZE DBstrlen __STDCARGS((CONST char *file, int line, CONST char *str1)); char * DBstrchr __STDCARGS((CONST char *file, int line, CONST char *str1, int c)); char * DBstrrchr __STDCARGS((CONST char *file, int line, CONST char *str1, int c)); char * DBindex __STDCARGS((CONST char *file, int line, CONST char *str1, int c)); char * DBrindex __STDCARGS((CONST char *file, int line, CONST char *str1, int c)); char * DBstrpbrk __STDCARGS((CONST char *file, int line, CONST char *str1, CONST char *str2)); STRSIZE DBstrspn __STDCARGS((CONST char *file, int line, CONST char *str1, CONST char *str2)); STRSIZE DBstrcspn __STDCARGS((CONST char *file, int line, CONST char *str1, CONST char *str2)); char * DBstrstr __STDCARGS((CONST char *file, int line, CONST char *str1, CONST char *str2)); char * DBstrtok __STDCARGS((CONST char *file, int line, char *str1, CONST char *str2)); #if __cplusplus }; #endif /* * Macro which enables logging of the file and line number for each allocation * so that it is easier to determine where the offending malloc comes from. 
* * NOTE that only code re-compiled with this include file will have this * additional info. Calls from libraries that have not been recompiled will * just have a null string for this info. */ #ifndef IN_MALLOC_CODE /* * allocation functions */ #define malloc(len) debug_malloc( __FILE__,__LINE__, (len)) #define realloc(ptr,len) debug_realloc(__FILE__,__LINE__, (ptr), (len)) #define calloc(numelem,size) debug_calloc(__FILE__,__LINE__,(numelem),(size)) #define cfree(ptr) debug_cfree(__FILE__,__LINE__,(ptr)) #define free(ptr) debug_free(__FILE__,__LINE__,(ptr)) #define malloc_size(histptr) DBmalloc_size(__FILE__,__LINE__,(histptr)) #define malloc_chain_check(todo) DBmalloc_chain_check(__FILE__,__LINE__,(todo)) /* * X allocation routines */ #define XtCalloc(_num,_size) debug_XtCalloc(__FILE__,__LINE__,_num,_size) #define XtMalloc(_size) debug_XtMalloc(__FILE__,__LINE__,_size) #define XtRealloc(_ptr,_size) debug_XtRealloc(__FILE__,__LINE__,_ptr,_size) #define XtFree(_ptr) debug_XtFree(__FILE__,__LINE__,_ptr) #define _XtBCopy(ptr1,ptr2,len) debug_XtBcopy(__FILE__,__LINE__,ptr1,ptr2,len) /* * Other allocation functions */ #define _malloc(_size) debug_malloc(__FILE__,__LINE__,_size) #define _realloc(_ptr,_size) debug_realloc(__FILE__,__LINE__,_ptr,_size) #define _calloc(_num,_size) debug_calloc(__FILE__,__LINE__,_num,_size) #define _free(_ptr) debug_free(__FILE__,__LINE__,_ptr) /* * memory(3) related functions */ #ifdef bcopy #undef bcopy #endif #ifdef bzero #undef bzero #endif #ifdef bcmp #undef bcmp #endif #define memccpy(ptr1,ptr2,ch,len) DBmemccpy(__FILE__,__LINE__,ptr1,ptr2,ch,len) #define memchr(ptr1,ch,len) DBmemchr(__FILE__,__LINE__,ptr1,ch,len) #define memmove(ptr1,ptr2,len) DBmemmove(__FILE__,__LINE__,ptr1, ptr2, len) #define memcpy(ptr1,ptr2,len) DBmemcpy(__FILE__, __LINE__, ptr1, ptr2, len) #define memcmp(ptr1,ptr2,len) DBmemcmp(__FILE__,__LINE__,ptr1, ptr2, len) #define memset(ptr1,ch,len) DBmemset(__FILE__,__LINE__,ptr1, ch, len) #define bcopy(ptr2,ptr1,len) 
DBbcopy(__FILE__,__LINE__,ptr2,ptr1,len) #define bzero(ptr1,len) DBbzero(__FILE__,__LINE__,ptr1,len) #define bcmp(ptr2,ptr1,len) DBbcmp(__FILE__, __LINE__, ptr2, ptr1, len) #define _bcopy(ptr2,ptr1,len) DBbcopy(__FILE__,__LINE__,ptr2,ptr1,len) #define _bzero(ptr1,len) DBbzero(__FILE__,__LINE__,ptr1,len) #define _bcmp(ptr2,ptr1,len) DBbcmp(__FILE__,__LINE__,ptr2,ptr1,len) #define __dg_bcopy(ptr2,ptr1,len) DBbcopy(__FILE__,__LINE__,ptr2,ptr1,len) #define __dg_bzero(ptr1,len) DBbzero(__FILE__,__LINE__,ptr1,len) #define __dg_bcmp(ptr2,ptr1,len) DBbcmp(__FILE__,__LINE__,ptr2,ptr1,len) /* * string(3) related functions */ #ifdef index #undef index #endif #ifdef rindex #undef rindex #endif #ifdef strcpy #undef strcpy #endif #ifdef strcpy #undef strcmp #endif #define index(str1,c) DBindex(__FILE__, __LINE__, str1, c) #define rindex(str1,c) DBrindex(__FILE__, __LINE__, str1, c) #define strcat(str1,str2) DBstrcat(__FILE__,__LINE__,str1,str2) #define strchr(str1,c) DBstrchr(__FILE__, __LINE__, str1,c) #define strcmp(str1,str2) DBstrcmp(__FILE__, __LINE__, str1, str2) #define strcpy(str1,str2) DBstrcpy(__FILE__, __LINE__, str1, str2) #define strcspn(str1,str2) DBstrcspn(__FILE__, __LINE__, str1,str2) #define strdup(str1) DBstrdup(__FILE__, __LINE__, str1) #define stricmp(str1,str2) DBstricmp(__FILE__, __LINE__, str1, str2) #define strincmp(str1,str2,len) DBstrincmp(__FILE__, __LINE__, str1,str2,len) #define strlen(str1) DBstrlen(__FILE__, __LINE__, str1) #define strncat(str1,str2,len) DBstrncat(__FILE__, __LINE__, str1,str2,len) #define strncpy(str1,str2,len) DBstrncpy(__FILE__,__LINE__,str1,str2,len) #define strncmp(str1,str2,len) DBstrncmp(__FILE__, __LINE__, str1,str2,len) #define strpbrk(str1,str2) DBstrpbrk(__FILE__, __LINE__, str1,str2) #define strrchr(str1,c) DBstrrchr(__FILE__,__LINE__,str1,c) #define strspn(str1,str2) DBstrspn(__FILE__, __LINE__, str1,str2) #define strstr(str1,str2) DBstrstr(__FILE__, __LINE__, str1, str2) #define strtok(str1,str2) DBstrtok(__FILE__, 
__LINE__, str1, str2) /* * malloc stack related functions */ #define malloc_enter(func) StackEnter(func,__FILE__,__LINE__) #define malloc_leave(func) StackLeave(func,__FILE__,__LINE__) #endif /* IN_MALLOC_CODE */ #endif /* _DEBUG_MALLOC_INC */ /* * $Log: malloc.h.org,v $ * Revision 1.31 1992/06/22 23:40:10 cpcahil * many fixes for working on small int systems * * Revision 1.30 1992/05/06 04:53:29 cpcahil * performance enhancments * * Revision 1.29 1992/04/22 18:17:32 cpcahil * added support for Xt Alloc functions, linted code * * Revision 1.28 1992/04/13 19:08:18 cpcahil * fixed case insensitive stuff * * Revision 1.27 1992/04/13 18:41:18 cpcahil * added case insensitive string comparison routines * * Revision 1.26 1992/04/13 17:26:25 cpcahil * minor portability changes * * Revision 1.25 1992/04/13 14:13:18 cpcahil * cleanup of log message. * * Revision 1.24 1992/04/13 03:09:14 cpcahil * lots of changes. * * Revision 1.23 1992/03/01 12:42:38 cpcahil * added support for managing freed areas and fixed doublword bndr problems * * Revision 1.22 1992/02/07 15:51:07 cpcahil * mods for sun4 * * Revision 1.21 1992/01/29 01:35:32 cpcahil * added sgi definition. * * Revision 1.20 1992/01/28 21:42:25 cpcahil * changes for the ibmRS6000 * * Revision 1.19 1992/01/28 18:05:37 cpcahil * misc fixes for patch 7 * * Revision 1.18 1992/01/22 16:21:35 cpcahil * added code to prevent inclusions of string.h and memory.h after malloc.h * was included. * * Revision 1.17 1992/01/10 17:26:46 cpcahil * fixed prototypes use of void. * * Revision 1.16 1992/01/10 16:53:39 cpcahil * added more info on sizetype and datatype. added support for overriding * use of void type. * * Revision 1.15 1992/01/09 17:19:11 cpcahil * put the close brace in the correct position. * * Revision 1.14 1992/01/09 17:12:36 cpcahil * added code to support inclusion in C++ modules * * Revision 1.13 1991/12/31 21:31:26 cpcahil * changes for patch 6. 
See CHANGES file for more info * * Revision 1.12 1991/12/26 22:31:29 cpcahil * added check to make sure file is not included twice. * * Revision 1.11 1991/12/06 17:58:46 cpcahil * added cfree() for compatibility with some wierd systems * * Revision 1.10 91/12/06 08:54:18 cpcahil * cleanup of __STDC__ usage and addition of CHANGES file * * Revision 1.9 91/12/04 09:23:40 cpcahil * several performance enhancements including addition of free list * * Revision 1.8 91/12/02 19:10:11 cpcahil * changes for patch release 5 * * Revision 1.7 91/11/25 14:42:00 cpcahil * Final changes in preparation for patch 4 release * * Revision 1.6 91/11/24 00:49:28 cpcahil * first cut at patch 4 * * Revision 1.5 91/11/20 11:54:10 cpcahil * interim checkin * * Revision 1.4 90/08/29 22:23:38 cpcahil * fixed mallopt to use a union as an argument. * * Revision 1.3 90/05/11 11:04:10 cpcahil * took out some extraneous lines * * Revision 1.2 90/05/11 00:13:09 cpcahil * added copyright statment * * Revision 1.1 90/02/23 07:09:03 cpcahil * Initial revision * */ tRNAscan-SE-2.0/src/squid.h0000644000543100007160000003244611021467306014730 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ #ifndef SQUIDH_INCLUDED #define SQUIDH_INCLUDED /* squid.h * last modified Sun Aug 15 12:05:58 1993 * * Header file for my library of sequence functions. * */ #include #include #include /* Library version info is made available as a global to * any interested program. These are defined in iupac.c * with the other globals. 
*/ extern char squid_version[]; extern char squid_date[]; /**************************************************** * Error codes returned by squid library functions ****************************************************/ #define SQERR_OK 0 /* no error */ #define SQERR_UNKNOWN 1 /* generic error, unidentified */ #define SQERR_NODATA 2 /* unexpectedly NULL stream */ #define SQERR_MEM 3 /* malloc or realloc failed */ #define SQERR_NOFILE 4 /* file not found */ #define SQERR_FORMAT 5 /* file format not recognized */ #define SQERR_PARAMETER 6 /* bad parameter passed to func */ #define SQERR_DIVZERO 7 /* error in sre_math.c */ extern int squid_errno; /**************************************************** * Single sequence information ****************************************************/ #define SQINFO_NAMELEN 64 #define SQINFO_DESCLEN 128 struct seqinfo_s { int flags; /* what extra data are available */ char name[SQINFO_NAMELEN];/* up to 63 characters of name */ char id[SQINFO_NAMELEN]; /* up to 63 char of database identifier */ char acc[SQINFO_NAMELEN]; /* up to 63 char of database accession # */ char desc[SQINFO_DESCLEN];/* up to 127 char of description */ int len; /* length of this seq */ int start; /* (1..len) start position on source seq */ int stop; /* (1..len) end position on source seq */ int olen; /* original length of source seq */ int type; /* kRNA, kDNA, kAmino, or kOther */ float weight; /* weight on sequence */ char *ss; /* 0..len-1 secondary structure string */ char *sa; /* 0..len-1 % side chain surface access. 
*/ }; typedef struct seqinfo_s SQINFO; #define SQINFO_NAME (1 << 0) #define SQINFO_ID (1 << 1) #define SQINFO_ACC (1 << 2) #define SQINFO_DESC (1 << 3) #define SQINFO_START (1 << 4) #define SQINFO_STOP (1 << 5) #define SQINFO_LEN (1 << 6) #define SQINFO_TYPE (1 << 7) #define SQINFO_WGT (1 << 8) #define SQINFO_OLEN (1 << 9) #define SQINFO_SS (1 << 10) #define SQINFO_SA (1 << 12) /**************************************************** * Sequence i/o: originally from Don Gilbert's readseq ****************************************************/ /* buffer size for reading in lines from sequence files*/ #define LINEBUFLEN 4096 /* sequence types parsed by Seqtype() */ #define kOtherSeq 0 #define kDNA 1 #define kRNA 2 #define kAmino 3 /* Sequence file formats recognized */ #define kUnknown 0 /* format not determinable */ #define kIG 1 #define kGenBank 2 #define kNBRF 3 #define kEMBL 4 #define kGCG 5 #define kStrider 6 #define kPearson 7 #define kZuker 8 #define kIdraw 9 /* idraw-style PostScript (write only) */ #define kSelex 10 /* my flat text alignment format */ #define kMSF 11 /* GCG MSF multiple alignment format */ #define kPIR 12 /* PIR-CODATA format */ #define kRaw 13 /* unformatted, raw sequence (output only) */ #define kSquid 14 /* my sequence database format */ #define kXPearson 15 /* my extended FASTA format */ #define kGCGdata 16 /* GCG data library format */ #define kClustal 17 /* Clustal V or W multiple alignment format */ #define kMinFormat 1 /* SRE: kUnknown doesn't count */ #define kMaxFormat 17 #define kNumFormats (kMaxFormat + 1) #define kNoformat -1 /* format not tested */ struct ReadSeqVars { FILE *f; char sbuffer[LINEBUFLEN]; /* current line we're working on */ int seqlen; int maxseq; int dash_equals_n; /* a hack - affects EMBL reading, to deal with EMBL */ char *seq; SQINFO *sqinfo; /* name, id, etc. 
*/ char *sp; }; typedef struct ReadSeqVars SQFILE; /**************************************************** * Database indexing (GSI index file format) ****************************************************/ /* A GSI (generic sequence index) file is composed of * recnum + nfiles + 1 records. Each record contains * three fields; key, file number, and disk offset. * Record 0 contains: * [ "GSI" ] [ nfiles ] [ recnum ] * Records 1..nfiles map file names to file numbers, and contain: * [ filename ] [ file number, 1..nfiles ] [ 0 (unused) ] * Records nfiles+1 to recnum+nfiles+1 provide disk offset * and file number indices for every key: * [ key ] [ file number ] [ offset] */ struct gsi_s { FILE *gsifp; /* open GSI index file */ long recnum; /* number of records */ short nfiles; /* number of files */ }; typedef struct gsi_s GSIFILE; /* Used for GSI (general sequence index) files, for rapid fetching * from databases. A GSI record contains: * [ key name] [file number] [disk offset] */ #define GSI_RECSIZE (32 * sizeof(char) + sizeof(short) + sizeof(long)) #define GSI_KEYSIZE (32 * sizeof(char)) /**************************************************** * Sequence alphabet: see also iupac.c ****************************************************/ /* IUPAC symbols defined globally in iupac.c */ struct iupactype { char sym; /* character representation */ char symcomp; /* complement (regular char */ char code; /* my binary rep */ char comp; /* binary encoded complement */ }; extern struct iupactype iupac[]; #define IUPACSYMNUM 17 extern char *stdcode1[]; /* 1-letter amino acid translation code */ extern char *stdcode3[]; /* 3-letter amino acid translation code */ extern float aafq[]; /* amino acid occurrence frequencies */ extern char aa_alphabet[]; /* amino acid alphabet */ extern int aa_index[]; /* convert 0..19 indices to 0..26 */ /* valid symbols in IUPAC code */ #define NUCLEOTIDES "ACGTUNRYMKSWHBVDacgtunrymkswhbvd" #define AMINO_ALPHABET "ACDEFGHIKLMNPQRSTVWY" #define DNA_ALPHABET 
"ACGT" #define RNA_ALPHABET "ACGU" #define WHITESPACE " \t\n" #define isgap(c) ((c) == ' ' || (c) == '.' || (c) == '_' || (c) == '-') /**************************************************** * Alignment information ****************************************************/ /* Structure: aliinfo_s * * Purpose: Optional information returned from an alignment file. * * flags: always used. Flags for which info is valid/alloced. * * alen: always returned. Alignments are always flushed right * with gaps so that all aseqs are the same length, alen. * Available for all alignment formats. * * cs: 0..alen-1, just like the alignment. Contains single-letter * secondary structure codes for consensus structure; "<>^+" * for RNA, "EHL." for protein. May be NULL if unavailable * from seqfile. Only available for SELEX format files. * * rf: 0..alen-1, just like the alignment. rf is an arbitrary string * of characters, used for annotating columns. Blanks are * interpreted as non-canonical columns and anything else is * considered canonical. Only available from SELEX format files. * * sqinfo: always returned. Array of 0..nseq-1 * per-sequence information structures, carrying * name, id, accession, coords, and weight. * */ struct aliinfo_s { int flags; /* flags for what info is valid */ int alen; /* length of alignment (columns) */ char au[64]; /* "author" information */ char *cs; /* consensus secondary structure string */ char *rf; /* reference coordinate system */ struct seqinfo_s *sqinfo; /* name, id, coord info for each sequence */ }; typedef struct aliinfo_s AINFO; #define AINFO_ALEN (1 << 0) #define AINFO_AUTH (1 << 1) #define AINFO_CS (1 << 2) #define AINFO_RF (1 << 3) /**************************************************** * Cluster analysis and phylogenetic tree support ****************************************************/ /* struct phylo_s - a phylogenetic tree * * For N sequences, there will generally be an array of 0..N-2 * phylo_s structures. [0] is the root. 
The indexes of left and * right children are somewhat confusing so be careful. The * indexes can have values of 0..2N-2. If they are 0..N-1, they * represent pointers to individual sequences. If they are * >= N, they represent pointers to a clustree_s structure * at (index - N). */ struct phylo_s { int parent; /* index of parent, N..2N-2, or -1 for root */ int left; /* index of one of the branches, 0..2N-2 */ int right; /* index of other branch, 0..2N-2 */ float diff; /* difference score between seqs */ float lblen; /* left branch length */ float rblen; /* right branch length */ char *is_in; /* 0..N flag array, 1 if seq included */ int incnum; /* number of seqs included at this node */ }; /* Strategies for cluster analysis; cluster by mean distance, * minimum distance, or maximum distance. */ enum clust_strategy { CLUSTER_MEAN, CLUSTER_MAX, CLUSTER_MIN }; /**************************************************** * Generic data structure support ****************************************************/ /* a struct intstack_s implements a pushdown stack for storing * single integers. 
*/ struct intstack_s { int data; struct intstack_s *nxt; }; /**************************************************** * Binary nucleotide alphabet support ****************************************************/ /* Binary encoding of the IUPAC code for nucleotides * * four-bit "word", permitting rapid degenerate matching * A C G T/U * 0 0 1 0 */ #define NTA 8 #define NTC 4 #define NTG 2 #define NTT 1 #define NTU 1 #define NTN 15 /* A|C|G|T */ #define NTR 10 /* A|G */ #define NTY 5 /* C|T */ #define NTM 12 /* A|C */ #define NTK 3 /* G|T */ #define NTS 6 /* C|G */ #define NTW 9 /* A|T */ #define NTH 13 /* A|C|T */ #define NTB 7 /* C|G|T */ #define NTV 14 /* A|C|G */ #define NTD 11 /* A|G|T */ #define NTGAP 16 /* GAP */ #define NTEND 0 /* null string terminator */ /* ntmatch(): bitwise comparison of two nuc's * note that it's sensitive to the order; * probe may be degenerate but target should not be */ #define ntmatch(probe, target) ((probe & target) == target) /**************************************************** * Support for a portable, flexible Getopt() ****************************************************/ /* Structure: opt_s * * Structure for declaring options to a main(). */ struct opt_s { char *name; /* name of option, e.g. "--option1" or "-o" */ int single; /* TRUE if a single letter option */ int argtype; /* for typechecking, e.g. ARG_INT */ }; /* acceptable argtype's... 
*/ #define ARG_NONE 0 /* no argument */ #define ARG_INT 1 /* something that atoi() can grok */ #define ARG_FLOAT 2 /* something that atof() can grok */ #define ARG_CHAR 3 /* require single character or digit */ #define ARG_STRING 4 /* anything goes */ /**************************************************** * Miscellaneous macros and defines ****************************************************/ #define CHOOSE(a) ((int) (sre_random() * (a))) /* must declare swapfoo to use SWAP() */ #define SWAP(a,b) {swapfoo = b; b = a; a = swapfoo;} #define Free2DArray(ptr, n) \ { int fooidx;\ if (ptr != NULL) { \ for (fooidx = 0; fooidx < (n); fooidx++) if (ptr[fooidx] != NULL) free(ptr[fooidx]);\ free(ptr);\ } } #define ScalarsEqual(a,b) (fabs((a)-(b)) < 1e-7) #ifndef MIN #define MIN(a,b) ((ab)?a:b) #endif #ifndef TRUE #define TRUE 1 #endif #ifndef True #define True 1 #endif #ifndef FALSE #define FALSE 0 #endif #ifndef False #define False 0 #endif /* someday, Sun Microsystems will conform to ANSI... */ #ifndef EXIT_SUCCESS #define EXIT_SUCCESS 0 #define EXIT_FAILURE 1 #endif #include "sqfuncs.h" /* squid function declarations */ #endif /* SQUIDH_INCLUDED */ tRNAscan-SE-2.0/src/fastmodelmaker.c0000644000543100007160000004460111021467304016566 0ustar pchanlowelab/* fastmodelmaker.c * * Construct a covariance model from an alignment, using an approximate * but fast algorithm. Complexity is N^2 in memory, N^3 in time. Get * back a covariance model, and information contents in bits for * both primary and secondary structure. * * - use Stormo/Gutell MIXY algorithm to calculate pairwise covariances * for all aligned column pairs i,j. This produces the matrix **mxy; * it is indexed 0..alen-1 by 0..alen-1, as a flipped diagonal matrix * mxy[j][i], j > i. The diagonal (i == j) is unused (and unallocated * for). * * - use a quick and dirty Zuker-like algorithm to reduce this N^2 * matrix to the optimal model of non-overlapping chords. This * produces the matrix **zmat. 
zmat is a flipped diagonal matrix, * zmat[j][i], j >= i. The diagonal (i==j) is initialized to zero. * All other cells take on positive values, summing covariances. * * - traceback thru the zmat matrix and construct a dynamic binary * tree *ztr, excluding inserted columns. *ztr * is constructed much the same as a normal traceback tree is constructed. * emitl and emitr store the 0..alen-1 indices of the column this node is * responsible for, even for BEGIN and BIFURC nodes. type is * a node type (MATP_NODE), not a state type. * Note that these are exceptions to the indexing scheme used * by traceback structures everywhere else! * * - Then, for each *individual* sequence, construct a fake traceback based * on *ztr. Use these tracebacks to collect statistics from the alignment * and initialize probabilities in a new model. */ #include #include #include #include #include "version.h" #include "squid.h" #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif static void mixy(char **aseq, int nseq, int alen, int ***ret_mixy); static void free_mixy(int **mxy, int acol); static void zfill(int **mxy, int acol, int ***ret_zmat, double *ret_sum); static void free_zmat(int **zmat, int acol); static void ztrace(int **zmat, double *gapfq, double gapthresh, int **mxy, int acol, struct trace_s **ret_ztr); #ifdef DEBUG static void dump_mixy(int **mxy, int alen); static void dump_zmat(int **zmat, int alen); static void dump_ztr(struct trace_s *ztr); #endif /* DEBUG */ /* Function: Fastmodelmaker() * * Purpose: Given a sequence alignment, construct a reasonable * starting model. Returns the model, as well as numbers * for the primary and secondary structure information * content of the alignment. * * Args: aseqs - sequence alignment. 
* ainfo - info about the alignment * nseq - number of sequences * prior - prior distributions for CM construction * gapthresh - over this fraction of gaps, assign to INS * ret_secinfo - RETURN: sec structure info content (bits) (maybe NULL) * ret_cm - RETURN: new model (maybe NULL) * ret_mtr - RETURN: master traceback for aseqs (maybe NULL) * * Return: 1 on success, 0 on failure. */ int Fastmodelmaker(char **aseqs, AINFO *ainfo, int nseq, struct prior_s *prior, double gapthresh, double *ret_secinfo, struct cm_s **ret_cm, struct trace_s **ret_mtr) { struct cm_s *cm; int **mxy; int **zmat; int M; double secinfo; double *gapfq; /* frequencies of gap occurrence per column */ struct trace_s *ztr; struct trace_s *tr; /* individual traceback */ struct trmem_s *pool; /* memory pool for traceback */ int idx; int gaps; int apos; /* Precalculate gap frequencies seen in each column of alignment */ if ((gapfq = (double *) malloc (sizeof(double) * ainfo->alen)) == NULL) Die("malloc failed, line %d file %s", __LINE__, __FILE__); for (apos = 0; apos < ainfo->alen; apos++) { gaps = 0; for (idx = 0; idx < nseq; idx++) if (isgap(aseqs[idx][apos])) gaps++; gapfq[apos] = (double) gaps / (double) nseq; } /* build Mxy matrix */ mixy(aseqs, nseq, ainfo->alen, &mxy); /* dynamic programming fill of zmat, using mxy */ zfill(mxy, ainfo->alen, &zmat, &secinfo); /* traceback of zmat to make ztr tree */ ztrace(zmat, gapfq, gapthresh, mxy, ainfo->alen, &ztr); /* PrintTrace(stdout, ztr); */ NumberMasterTrace(ztr, &M); /* allocate for a model */ if ((cm = AllocCM(M)) == NULL) Die("AllocCM() failed"); TopofyNewCM(cm, ztr); /* For each sequence: convert consensus tree ztr to individual fake traceback. * Count traceback into new model. */ for (idx = 0; idx < nseq; idx++) { Transmogrify(ztr, aseqs[idx], &tr, &pool); if (! TraceCount(cm, aseqs[idx], (ainfo->sqinfo[idx].flags & SQINFO_WGT) ? ainfo->sqinfo[idx].weight : 1.0, tr)) Die("TraceCount() failed"); FreeTrace(tr, pool); } if (! 
VerifyCM(cm)) Die("Bad cm after trace counts."); /* convert CM to probabilities */ ProbifyCM(cm, prior); /* garbage collect & return */ free_mixy(mxy, ainfo->alen); free_zmat(zmat, ainfo->alen); free(gapfq); if (ret_mtr != NULL) *ret_mtr = ztr; else FreeTrace(ztr, NULL); if (ret_cm != NULL) *ret_cm = cm; else FreeCM(cm); if (ret_secinfo != NULL) *ret_secinfo = secinfo / (double) nseq; return 1; } /* Function: mixy() * * Purpose: given a set of N aligned sequences aseq, calculate * pairwise covariances (mutual information). ret_mixy * is allocated, filled, and returned, as a diagonal 2D * (NxN) matrix of values. It must be freed by * the caller. It is a lower diagonal matrix mxy[j][i], * j > i, 0..alen-1 by 0..j-1. * * The values in mxy are integers. They are the average * secondary structure information content (i.e. weighted for * the number of pairs actually occurring in columns i,j) * in bits, to two decimal places (i.e. info*100). * * Returns: mxy, which must be free'd by caller with free_mixy(). 
*/ static void mixy(char **aseq, /* array of aligned sequences, flushed right */ int nseq, /* number of aligned sequences */ int alen, /* length of each sequence (all the same) */ int ***ret_mxy) /* RETURN: mxy array */ { int **mxy; /* RETURN: diagonal covariance matrix */ float fx[ALPHASIZE]; /* singlet frequency vector */ float fy[ALPHASIZE]; /* another singlet frequency vector */ float fxy[ALPHASIZE][ALPHASIZE]; /* pairwise frequency 2D array */ int idx; /* counter for sequences */ int i, j; /* counters for columns x,y */ int symi, symj; /* counters for symbols */ int pairs; /* counter for pairs in which there are no gaps */ long test; /* Allocate for mxy */ if ((mxy = (int **) malloc (alen * sizeof(int *))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); for (j = 1; j < alen; j++) if ((mxy[j] = (int *) malloc (j * sizeof(int))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); test = (long) mxy[20]; /* calculate mxy */ for (j = 1; j < alen; j++) for (i = 0; i < j; i++) { /* zero counter array */ for (symj = 0; symj < ALPHASIZE; symj++) { fx[symj] = fy[symj] = 0.0; for (symi = 0; symi < ALPHASIZE; symi++) fxy[symj][symi] = 0.0; } /* count symbols in a column */ pairs = 0; for (idx = 0; idx < nseq; idx++) { if (isgap(aseq[idx][i]) || isgap(aseq[idx][j])) continue; symi = SymbolIndex(aseq[idx][i]); symj = SymbolIndex(aseq[idx][j]); fx[symi] += 1.0; fy[symj] += 1.0; fxy[symi][symj] += 1.0; pairs++; } /* convert to frequencies */ if (pairs > 0) for (symi = 0; symi < ALPHASIZE; symi++) { fx[symi] /= (float) pairs; fy[symi] /= (float) pairs; for (symj = 0; symj < ALPHASIZE; symj++) fxy[symi][symj] /= (float) pairs; } /* calculate mxy. 144.269504 is a conversion of ln's into * bits * 100: i.e. 
100 * (1/log(2)) */ mxy[j][i] = 0; for (symi = 0; symi < ALPHASIZE; symi++) for (symj = 0; symj < ALPHASIZE; symj++) { if (fxy[symi][symj] > 0.0) mxy[j][i] += (int) (144.269504 * fxy[symi][symj] * log((fxy[symi][symj] / (fx[symi] * fy[symj])))); } /* Sat Jul 17 22:17:17 1993: We weight by pairs to get an expected score * over all the sequences. Fixes a problem that columns with few symbols * could dominate the calculation just because of noise. */ mxy[j][i] = (mxy[j][i] * pairs) / nseq; } /* dump debugging info */ #ifdef DEBUG dump_mixy(mxy, alen); #endif *ret_mxy = mxy; } /* Function: free_mixy() * * Purpose: free the space allocated for a flipped diagonal * covariance matrix. To do this we also need to * know alen, the number of columns in the starting * sequence alignment. * * Returns: (void) */ static void free_mixy(int **mxy, int alen) { int j; for (j = 1; j < alen; j++) free(mxy[j]); free(mxy); } /* Function: zfill() * * Purpose: Calculate the optimal structure for a covariance matrix * produced by mixy(). Uses a way-simplified form of the * Zuker/Nussinov dynamic programming RNA folding algorithm * to find the structure which a) the emitted pairs sum * to a maximum number of bits of covariance and b) * has no overlapping chords (no pseudoknots). The dynamic * programming matrix is allocated, filled, and returned. * * Returns: ret_zmat is returned thru a passed pointer; it must be * free'd by the caller using free_zmat(). */ static void zfill(int **mxy, /* diagonal covariance matrix from mixy() */ int acol, /* size of mxy; number of aligned columns */ int ***ret_zmat, /* RETURN: filled dynamic programming matrix */ double *ret_sum) /* RETURN: total sum of Mxy for tree (bits) */ { int **zmat; int i, j; int diff; int mid; /* Allocations. * zmat is a flipped diagonal matrix, inclusive of the diagonal; * positions on both axes are numbered 0..acol-1. 
*/ if ((zmat = (int **) malloc (acol * sizeof (int *))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); for (j = 0; j < acol; j++) if ((zmat[j] = (int *) malloc ((j+1) * sizeof(int))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); /* Initialization. * We initialize the diagonal to 0. */ for (j = 0; j < acol; j++) zmat[j][j] = 0; /* Dynamic programming stage. * Our recursion is: * Sij = max { Si+1,j (emit left, no covariance) * Si,j-1 (emit right, no covariance) * Si+1,j-1 + mxy[j][i]. * max over mid: Si,mid + Smid+1,j (bifurcation) * } */ for (diff = 1; diff < acol; diff++) for (i = 0; (j = i+diff) < acol; i++) { if (j-1 >= i+1) zmat[j][i] = zmat[j-1][i+1] + mxy[j][i]; else zmat[j][i] = mxy[j][i]; if (zmat[j][i+1] > zmat[j][i]) zmat[j][i] = zmat[j][i+1]; if (zmat[j-1][i] > zmat[j][i]) zmat[j][i] = zmat[j-1][i]; for (mid = i+1; mid < j-1; mid++) if (zmat[mid][i] + zmat[j][mid+1] > zmat[j][i]) zmat[j][i] = zmat[mid][i] + zmat[j][mid+1]; } /* Return */ #ifdef DEBUG dump_zmat(zmat, acol); #endif *ret_sum = (double) zmat[acol-1][0] / 100.0; *ret_zmat = zmat; } static void free_zmat(int **zmat, int acol) { int j; for (j = 0; j < acol; j++) free(zmat[j]); free(zmat); } /* Function: ztrace() * * Purpose: Traceback through the dynamic programming matrix constructed * by zfill(). Constructs a dynamic binary tree (ztr) of ztrace_s * structures, which keep track of both topology and the * order in which various aligned columns are emitted. * * ztr ends up being the "shell" or template upon which the * final model is built. ztr captures the branching structure * of the model tree. * * Inserts are dealt with at this point. Columns with a gap frequency * exceeding gapthresh are excluded from ztr. The final tree ztr contains * MATR, MATL, MATP nodes only (with BEGIN and BIFURC of course). * * Data: ztr: a traceback tree. 
* emitl = index of column emitted left (0..acol-1) * or -1 * emitr = index of column emitted right (0..acol-1) * or -1 * nodeidx = index of node in new model * type = type of node (MATP_NODE, etc.) * * Return: ret_ztr is allocated here and must be free'd by the caller. */ static void ztrace(int **zmat, /* dynamic programming matrix from zfill() */ double *gapfq, /* frequencies of gaps in columns 0..acol-1 */ double gapthresh, /* above this, column is INS-generated */ int **mxy, /* the covariance matrix from mixy() */ int acol, /* number of aligned columns (size of zmat) */ struct trace_s **ret_ztr)/* RETURN: binary tree of best structure */ { struct trace_s *ztr; /* binary tree of best structure */ struct trace_s *curr_ztr; /* ptr to node of ztr we're working on */ struct tracestack_s *dolist; /* pushdown stack of active ztr nodes */ int i,j; /* coords in zmat (0..acol-1) */ int mid; /* midpoint of a bifurcation */ /* Initialize. * Start at i = 0, j = acol-1 and work towards diagonal. */ InitTrace(&ztr, NULL); /* start a trace tree */ dolist = InitTracestack(); /* start a stack for traversing the trace tree */ /* start with root aligned to 0..acol-1 */ curr_ztr = AttachTrace(ztr, NULL, 0, acol-1, -1, ROOT_NODE); curr_ztr = AttachTrace(curr_ztr, NULL, 0, acol-1, -1, -1); PushTracestack(dolist, curr_ztr); while ((curr_ztr = PopTracestack(dolist)) != NULL) { /* where we are now in the traceback. */ i = curr_ztr->emitl; j = curr_ztr->emitr; /* dummy END state on trace tree leaves. */ if (i > j) curr_ztr->type = uEND_ST; /* never executes. 
*/ /* watch out for diagonal, where j-1,i+1 is nonsense */ else if (i == j) /* default to push-left; i is explained */ { curr_ztr->type = MATL_NODE; curr_ztr->nxtl->emitl = i+1; curr_ztr->nxtl->emitr = j; } else if (j-1 == i && zmat[j][i] == mxy[j][i]) { curr_ztr->type = MATP_NODE; curr_ztr->nxtl->emitl = i+1; curr_ztr->nxtl->emitr = j-1; } else if (zmat[j][i] == zmat[j-1][i+1] + mxy[j][i]) { curr_ztr->type = MATP_NODE; if (j-1 >= i+1) PushTracestack(dolist, AttachTrace(curr_ztr, NULL, i+1, j-1, -1,-1)); else { curr_ztr->nxtl->emitl = i+1; curr_ztr->nxtl->emitr = j-1; } } else if (zmat[j][i] == zmat[j][i+1]) { curr_ztr->type = MATL_NODE; if ( j >= i+1) PushTracestack(dolist, AttachTrace(curr_ztr, NULL, i+1, j, -1,-1)); else { curr_ztr->nxtl->emitl = i+1; curr_ztr->nxtl->emitr = j; } } else if (zmat[j][i] == zmat[j-1][i]) { curr_ztr->type = MATR_NODE; if ( j-1 >= i) PushTracestack(dolist, AttachTrace(curr_ztr, NULL, i, j-1, -1,-1)); else { curr_ztr->nxtl->emitl = i; curr_ztr->nxtl->emitr = j-1; } } else { for (mid = i+1; mid < j-1; mid++) if (zmat[j][i] == zmat[mid][i] + zmat[j][mid+1]) { struct trace_s *branch; /* the current node is a bifurc node */ curr_ztr->type = BIFURC_NODE; /* it will connect to BEGIN-SEGMENT * nodes on either side, which are followed * by normal segments again */ /* right branch */ branch = AttachTrace(curr_ztr, NULL, mid+1, j, -1, BEGINR_NODE); if (mid+1 <= j) PushTracestack(dolist, AttachTrace(branch, NULL, mid+1, j, -1,-1)); else { branch->nxtl->emitl = mid+1; branch->nxtl->emitr = j; } /* left branch */ branch = AttachTrace(curr_ztr, NULL, i, mid, -1, BEGINL_NODE); if (i <= mid) PushTracestack(dolist, AttachTrace(branch, NULL, i, mid, -1,-1)); else { branch->nxtl->emitl = i; branch->nxtl->emitr = mid; } break; } } /* clean up current node: deal with insertions */ if (curr_ztr->type == MATP_NODE) { if (gapfq[i] > gapthresh && gapfq[j] > gapthresh) DeleteTracenode(curr_ztr, NULL); else if (gapfq[i] > gapthresh) curr_ztr->type = MATR_NODE; 
else if (gapfq[j] > gapthresh) curr_ztr->type = MATL_NODE; } else if ( (curr_ztr->type == MATR_NODE && gapfq[j] > gapthresh) || (curr_ztr->type == MATL_NODE && gapfq[i] > gapthresh)) DeleteTracenode(curr_ztr, NULL); } FreeTracestack(dolist); *ret_ztr = ztr; } #ifdef DEBUG static void dump_mixy(int **mxy, int alen) { int i, j; for (j = 1; j < alen; j++) { for (i = 0; i < j; i++) printf("%6d", mxy[j][i]); puts("\n"); } } static void dump_zmat(int **zmat, int alen) { int i, j; for (j = 0; j < alen; j++) { for (i = 0; i <= j; i++) printf("%6d", zmat[j][i]); puts("\n"); } } static void dump_ztr(struct trace_s *ztr) { struct tracestack_s *dolist; struct trace_s *curr; dolist = InitTracestack(); PushTracestack(dolist, ztr->nxtl); while ((curr = PopTracestack(dolist)) != NULL) { printf("## ZTR STATE %#x\n", curr); printf("emitl : %d\n", curr->emitl); printf("emitr : %d\n", curr->emitr); printf("nodeidx : %d\n", curr->nodeidx); printf("type : %d\n", curr->type); if (curr->nxtl != NULL) printf("nxtl : %#x\n", curr->nxtl); else printf("nxtl : NULL\n"); if (curr->nxtr != NULL) printf("nxtr : %#x\n", curr->nxtr); else printf("nxtr : NULL\n"); if (curr->nxtr != NULL) PushTracestack(dolist, curr->nxtr); if (curr->nxtl != NULL) PushTracestack(dolist, curr->nxtl); } FreeTracestack(dolist); } #endif tRNAscan-SE-2.0/src/version.h0000644000543100007160000000070611021467306015262 0ustar pchanlowelab/* version.h * SRE, Mon Sep 6 09:32:36 1993 * * During a real build, these won't be used at all -- they'll * be overridden from the Makefile. This is only used during * manual compilations and to keep lint quiet. It also might * make a good record of the revisions the package goes through. 
* */ /* #define DEBUG*/ /* turn all debugging output on */ #ifndef RELEASE #define RELEASE "2.4.4" #define RELEASEDATE "January 1996" #endif tRNAscan-SE-2.0/src/viterbi.c0000644000543100007160000004500411021467306015234 0ustar pchanlowelab/* viterbi.c * for 2.0: SRE, Tue Sep 28 09:15:07 1993 * from 1.0: SRE, Wed Jun 30 17:42:44 1993 * as revised Tue Aug 24 12:06:12 1993: integer two-matrix version * * Implementation of the three-dimensional dynamic programming * algorithm for aligning a covariance model to a sequence. * * To optimize memory access patterns, the score storage is implemented * as a two-matrix version. amx is the * main storage. bmx is a smaller auxiliary matrix with a different * access pattern, holding scores of BEGIN state alignments; it * is used when calculating BIFURC scores. * * amx is [j = 0..N] [diff = 0..j] [y = 0..statenum] * diff == 0 is for off-diagonal boundary conditions (this is why diff is shifted +1) * diff == 1 is for the diagonal, i==j * * bmx is [y = 0..statenum] [j = 0..N] [ diff = 0..j] * a j,diff matrix exists only where y is a BEGIN state * * The 2.0 implementation allows variable storage per node rather * than storing and calculating a fixed max number of states per node, * which should save up to 2x in both time and space. * * An optimization is made which requires END states to be explicitly * added, so statenum (the number of states in the integer model) * is *inclusive* of ENDs. 
*/ #include #include #include #include #include "squid.h" #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif #ifdef DEBUG #include #endif static int allocate_mx(struct istate_s *icm,int statenum, int seqlen, int ****ret_amx, int ****ret_bmx); static int init_mx(struct istate_s *icm, int statenum, int N, int ***amx, int ***bmx); static int recurse_mx(struct istate_s *icm, int statenum, char *seq, int N, int ***amx, int ***bmx); static int trace_mx(struct istate_s *icm, char *seq, int N, int ***amx, int ***bmx, struct trace_s **ret_tr); static void free_mx(int ***amx, int ***bmx, int statenum, int seqlen); /* Function: ViterbiAlign() * * Purpose: Align a sequence to a model, using the alignment * algorithm. Return the score of the alignment and * the traceback. * * Args: icm - the model to align sequence to * statenum - # of states in the model * seq - sequence to align model to * ret_score - RETURN: global alignment score * ret_trace - RETURN: traceback tree * * Return: 1 on success, 0 on failure. */ int ViterbiAlign(struct istate_s *icm, int statenum, char *seq, double *ret_score, struct trace_s **ret_trace) { int ***amx; /* the main score matrix */ int ***bmx; /* the BEGIN score matrix */ int N; /* length of sequence */ N = strlen(seq); seq--; /* convert to 1..N. Ugh! */ if (! allocate_mx(icm, statenum, N, &amx, &bmx)) return 0; #ifdef DEBUG printf("allocated matrices\n"); #endif if (! init_mx(icm, statenum, N, amx, bmx)) return 0; #ifdef DEBUG printf("matrices initialized\n"); #endif if (! recurse_mx(icm, statenum, seq, N, amx, bmx)) return 0; #ifdef DEBUG printf("recursion finished\n"); PrintViterbiAMX(stdout, icm, statenum, seq, N, amx); #endif *ret_score = ((double) bmx[0][N][N] / INTPRECISION); #ifdef DEBUG printf("have a score of %.2f, starting traceback\n", *ret_score); #endif if (! 
trace_mx(icm, seq, N, amx, bmx, ret_trace)) return 0; #ifdef DEBUG printf("trace complete\n"); PrintTrace(stdout, *ret_trace); #endif free_mx(amx, bmx, statenum, N); #ifdef SRE_REMOVED /* OK, boys, crank up the MasPar DPU. * 24 bytes = 6 addresses passed * 4 bytes/address (always right?) * MPViterbiAlign() copies ret_score out, but doesn't do anything * with the traceback yet */ callRequest(MPViterbiAlign, 24, icm, &statenum, seq, &N, ret_score, ret_trace); #endif /* MASPAR */ return 1; } /* Function: allocate_cvmx() * * Purpose: Malloc space for the score matrices. * amx is indexed as j, i, y. * bmx is indexed as k, j, i. * In the two sequence dimensions j, i they are * diagonal (+1 off diagonal) matrices with * rows j = 0..N, i = 1..j+1. * In the node dimension k bmx is k = 0..M. * In the state dimension y amx is y = 0..numstates. * * Args: icm - the int, log-odds, state-based model * statenum - number of states in model * seqlen - length of sequence * ret_amx - RETURN: main score matrix * ret_bmx - RETURN: BEGIN score matrix * * Return: Ptr to allocated scoring matrix, or * dies and exits. 
*/ static int allocate_mx(struct istate_s *icm, int statenum, int seqlen, int ****ret_amx, int ****ret_bmx) { int ***amx; int ***bmx; int diag, j, y; /* Main matrix, amx: fastest varying index is y (j,i,y) */ /* malloc for j = 0..seqlen rows */ if ((amx = (int ***) malloc ((seqlen + 1)* sizeof(int **))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); for (j = 0; j <= seqlen; j++) /* loop over rows j = 0..N */ { /* malloc for diag = 0..j cols */ if ((amx[j] = (int **) malloc ((j + 1) * sizeof(int *))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); /* loop over cols diag = 0..j */ for (diag = 0; diag <= j; diag++) /* malloc for y = 0..statenum-1 decks */ if ((amx[j][diag] = (int *) malloc ((statenum) * sizeof (int))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); } /* BEGIN auxiliary matrix: fastest varying index is diag (y,j,diag) */ /* 0..statenum-1 decks */ if ((bmx = (int ***) malloc (statenum * sizeof(int **))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); for (y = 0; y < statenum; y++) { bmx[y] = NULL; if (icm[y].statetype == uBEGIN_ST) { if ((bmx[y] = (int **) malloc ((seqlen+1) * sizeof(int *))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); for (j = 0; j <= seqlen; j++) if ((bmx[y][j] = (int *) malloc ((j+1) * sizeof(int))) == NULL) Die("Memory allocation error in %s line %d", __FILE__, __LINE__); } } *ret_amx = amx; *ret_bmx = bmx; return 1; } /* Function: free_mx() * * Purpose: Free the space allocated to the two scoring matrices. * Precisely mirrors the allocations above in allocate_cvmx(). 
* * Return: (void) */ static void free_mx(int ***amx, int ***bmx, int statenum, int seqlen) { int diag, j, y; /* Free the main matrix, amx */ for (j = 0; j <= seqlen; j++) { for (diag = 0; diag <= j; diag++) free(amx[j][diag]); free(amx[j]); } free(amx); /* Free the auxiliary BEGIN matrix, bmx */ for (y = 0; y < statenum; y++) if (bmx[y] != NULL) { for (j = 0; j <= seqlen; j++) free(bmx[y][j]); free(bmx[y]); } free(bmx); } /* Function: init_mx() * * Purpose: Initialization of the scoring matrices. We initialize the off-diagonal, * the diagonal, and the "floor" (end states) of the cube. * * Return: 1 on success, 0 on failure. */ static int init_mx(struct istate_s *icm, /* integer model */ int statenum, /* number of states in icm */ int N, /* length of seq */ int ***amx, int ***bmx) { int diag, j, y; /* counters for indices over the cvmx */ int ynext; /* index of next state k+1 */ int *beam; /* z-axis vector of numbers in amx */ /* Init the whole amx to -Infinity. We do this with memcpy, trying * to be fast. We fill in j=0,diag=0 by hand, then memcpy() the other * columns. */ for (y = 0; y < statenum; y++) amx[0][0][y] = NEGINFINITY; for (j = 1; j <= N; j++) for (diag = 0; diag <= j; diag++) memcpy(amx[j][diag], amx[0][0], statenum * sizeof(int)); /* Init the whole bmx to -Inf. We know state 0 is a begin (it's ROOT), so we * start there, and memcpy rows as needed. */ for (diag = 0; diag <= N; diag++) bmx[0][N][diag] = NEGINFINITY; for (j = 0; j < N; j++) memcpy(bmx[0][j], bmx[0][N], (j+1) * sizeof(int)); for (y = 1; y < statenum; y++) if (bmx[y] != NULL) for (j = 0; j <= N; j++) memcpy(bmx[y][j], bmx[0][N], (j+1) * sizeof(int)); /* Init the off-diagonal (j = 0..N; diag == 0) with -log P scores. 
* End state = 0; * del, bifurc states are calc'ed * begin states same as del's */ for (j = 0; j <= N; j++) for (y = statenum-1; y >= 0; y--) { if (icm[y].statetype == uEND_ST) amx[j][0][y] = 0; else if (icm[y].statetype == uBIFURC_ST) amx[j][0][y] = bmx[icm[y].tmx[0]][j][0] + bmx[icm[y].tmx[1]][j][0]; else if (icm[y].statetype == uDEL_ST || icm[y].statetype == uBEGIN_ST) { /* only calc DEL-DEL and BEGIN-DEL transitions. Since * we optimized the state transition tables, removing * the unused ones, we don't know where the number * for "to DEL" is! But we can find it, because it'll * be the connection to a non-infinite score */ beam = amx[j][0] + y + icm[y].offset; for (ynext = 0; ynext < icm[y].connectnum; ynext++) { if (*beam != NEGINFINITY) amx[j][0][y] = *beam + icm[y].tmx[ynext]; beam++; } /* make a copy into bmx if y is a BEGIN */ if (icm[y].statetype == uBEGIN_ST) bmx[y][j][0] = amx[j][0][y]; } } return 1; } /* Function: recurse_mx() * * Purpose: Carry out the fill stage of the dynamic programming * algorithm. * * Returns: 1 on success, 0 on failure. 
*/ static int recurse_mx(struct istate_s *icm, /* integer, state-form model */ int statenum, /* number of states in icm */ char *seq, /* sequence, 1..N */ int N, /* length of seq */ int ***amx, /* main scoring matrix */ int ***bmx) /* bifurc scoring matrix */ { int i, j, y; /* indices for 4 dimensions */ int diff; /* loop counter for difference: diff = j-i + 1 */ int symi, symj; /* symbol indices for seq[i], seq[j] */ int sc; /* tmp for a score */ int ynext; /* index of next state y */ int *beam; /* ptr to a beam (z-axis vector) */ int leftdiff; /* diff coord of BEGIN_L of a bifurc */ int leftj; /* j coord of BEGIN_L of a bifurc */ int **left_p; /* pointer into whole 2D deck of BEGINL's of a bifurc */ int *right_p; /* ptr into row of BEGIN_R's of a bifurc */ int *scp; /* score pointer: ptr into beam of scores being calc'ed */ struct istate_s *st; /* state pointer: ptr at current state in icm */ int *tmx; int emitsc; for (j = 1; j <= N; j++) { symj = SymbolIndex(seq[j]); for (diff = 1; diff <= j; diff++) { i = j - diff + 1; if (i < 1) break; symi = SymbolIndex(seq[i]); #ifdef DEBUG assert(symi >= 0 && symi < ALPHASIZE); assert(symj >= 0 && symj < ALPHASIZE); #endif scp = &amx[j][diff][statenum-1]; st = &icm[statenum-1]; for (y = statenum-1; y >= 0; y--, scp--, st--) { /* loop over states */ if (st->statetype != uBIFURC_ST) /* a normal (non-BIFURC) state */ { /* Connect the "beam" pointer to the appropriate * starting place in the ynext scores we're connecting * y to */ switch (st->statetype) { case uBEGIN_ST: case uDEL_ST: beam = amx[j][diff]; emitsc = 0; break; case uMATP_ST: if (diff == 1) continue; beam = amx[j-1][diff-2]; emitsc = st->emit[symi * ALPHASIZE + symj]; break; case uMATR_ST: case uINSR_ST: beam = amx[j-1][diff-1]; emitsc = st->emit[symj]; break; case uMATL_ST: case uINSL_ST: beam = amx[j][diff-1]; emitsc = st->emit[symi]; break; case uEND_ST: continue; default: Die("no such state type %d", st->statetype); } beam += y + st->offset; tmx = st->tmx; /* 
Init for ynext == 0 case */ *scp = *beam + *tmx; /* Calculate remaining cases */ for (ynext = 1; ynext < st->connectnum; ynext++) { beam++; tmx++; if (*beam > *scp) { sc = *beam + *tmx; if (sc > *scp) *scp = sc; } } /* Add emission scores now */ *scp += emitsc; /* Make a copy into bmx if necessary */ if (st->statetype == uBEGIN_ST) bmx[y][j][diff] = *scp; } /* end block of normal state stuff */ else /* a BIFURC state */ { leftdiff = diff; leftj = j; right_p = bmx[st->tmx[1]][j]; left_p = bmx[st->tmx[0]]; /* init w/ case that left branch emits it all */ *scp = left_p[leftj][leftdiff] + *right_p; while (leftdiff > 0) { leftdiff--; leftj--; right_p++; sc = left_p[leftj][leftdiff] + *right_p; if (sc > *scp) *scp = sc; } } } /* end loop over states */ } /* end loop over diff */ } /* end loop over j */ return 1; } /* Function: trace_cvmx() * * Purpose: Trace stage of the dynamic programming: starting * at j=N, i=1, k=0/BEGIN, trace back the optimal * path. Returns a binary tree, ret_trace. * Caller is reponsible for free'ing ret_trace. */ static int trace_mx(struct istate_s *icm, /* the model to align */ char *seq, /* sequence to align it to 1..N */ int N, int ***amx, int ***bmx, struct trace_s **ret_trace) /* RETURN: the traceback tree */ { struct trace_s *tr; /* the traceback tree under construction */ struct trace_s *curr_tr; /* ptr to node of tr we're working on */ struct tracestack_s *dolist; /* pushdown stack of active tr nodes */ int diff,i, j; /* coords in mx (0..N) */ int y; /* counter for states (0..statenum-1) */ int ynext; /* holds "k+1" value */ int symi, symj; /* array indices for left, right symbols */ int leftdiff; int leftj; int *right_p; int *beam; int conni, connj; int sc; /* Initialize. 
* Start at i = 1, j = N and work towards diagonal */ InitTrace(&tr, NULL); /* start a trace tree */ dolist = InitTracestack(); /* start a stack for traversing the trace tree */ curr_tr = AttachTrace(tr, NULL, 0, N-1, 0, uBEGIN_ST); PushTracestack(dolist, curr_tr); /* Recursion. While there's active nodes in the stack, trace from them. * * This is cribbed from recurse_cvmx(); it's almost the exact reverse. * We know the best score, we just have to figure out where it came from. */ while ((curr_tr = PopTracestack(dolist)) != NULL) { /* get some useful numbers, mostly for clarity */ /* which is important, since we're sort of misusing * fields in the trace structures! */ i = curr_tr->emitl+1; j = curr_tr->emitr+1; y = curr_tr->nodeidx; diff = j - i + 1; /* During use here, nodeidx field is used to hold a *state* index, * when we leave, everything must look like the rest of the package * expects, so we clean up here. */ curr_tr->nodeidx = icm[y].nodeidx; /* We used an END state here. * (We'd better be near the diagonal!) * We're done here. */ if (icm[y].statetype == uEND_ST) { if (i <= j) Warn("trace: didn't reach off-diag, stop at i=%d j=%d y=%d", i,j,y); curr_tr->nodeidx = -1; continue; } else if (icm[y].statetype == uBIFURC_ST) /* bifurc state */ { /* We used a BIFURC state here. * It came from two branches. Redo the recurse_cvmx() * calculation to find them. 
*/ if (i > j) { PushTracestack(dolist, AttachTrace(curr_tr, NULL, i-1, j-1, icm[y].tmx[1], uBEGIN_ST)); PushTracestack(dolist, AttachTrace(curr_tr, NULL, i-1, j-1, icm[y].tmx[0], uBEGIN_ST)); } else { leftdiff = diff; leftj = j; right_p = bmx[icm[y].tmx[1]] [j]; while (leftdiff >= 0) { if (amx[j][diff][y] == bmx[icm[y].tmx[0]][leftj][leftdiff] + *right_p) { PushTracestack(dolist, AttachTrace(curr_tr, NULL, i + leftdiff-1, j-1, icm[y].tmx[1], uBEGIN_ST)); PushTracestack(dolist, AttachTrace(curr_tr, NULL, i -1, i+leftdiff-2, icm[y].tmx[0], uBEGIN_ST)); break; } leftdiff--; leftj--; right_p++; } if (leftdiff < 0) Die("bifurc reconstruction failed at ijy %d,%d,%d", i,j,y); } } else /* a normal (non-BIFURC) state */ { if (i > 0 && i <= N) symi = SymbolIndex(seq[i]); if (j > 0 && j <= N) symj = SymbolIndex(seq[j]); switch (icm[y].statetype) { case uBEGIN_ST: case uDEL_ST: beam = amx[j][diff]; conni = i; connj = j; break; case uMATP_ST: beam = amx[j-1][diff-2]; conni = i+1; connj = j-1; break; case uMATR_ST: case uINSR_ST: beam = amx[j-1][diff-1]; conni = i; connj = j-1; break; case uMATL_ST: case uINSL_ST: beam = amx[j][diff-1]; conni = i+1; connj = j; break; default: Die("no such state type %d", icm[y].statetype); } beam += y + icm[y].offset; /* Calculate the score we'll try to match, by subtracting emission score as needed */ sc = amx[j][diff][y]; switch (icm[y].statetype) { case uBEGIN_ST: case uDEL_ST: break; case uMATP_ST: sc -= icm[y].emit[symi * ALPHASIZE + symj]; break; case uMATR_ST: case uINSR_ST: sc -= icm[y].emit[symj]; break; case uMATL_ST: case uINSL_ST: sc -= icm[y].emit[symi]; break; default: Die("no such state type %d", icm[y].statetype); } /* find the right connection */ for (ynext = 0; ynext < icm[y].connectnum; ynext++, beam++) if (sc == *beam + icm[y].tmx[ynext]) { PushTracestack(dolist, AttachTrace(curr_tr, NULL, conni-1, connj-1, ynext + y + icm[y].offset, icm[ynext + y + icm[y].offset].statetype)); break; } if (ynext == icm[y].connectnum) { 
Warn("can't continue traceback"); return 0; } } /* (a normal statetype) */ } /* (while something is in the tracestack) */ FreeTracestack(dolist); *ret_trace = tr; return 1; } tRNAscan-SE-2.0/src/msf.c0000644000543100007160000001036111021467305014352 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* msf.c * SRE, Sun Jul 11 16:17:32 1993 * * Export of GCG MSF multiple sequence alignment * formatted files. * */ #include #include #include #include #include "squid.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif /* Function: WriteMSF() * * Purpose: Write aseqs, names, weights to an open fp, * in GCG MSF format. The alignment must * be flushed (all aseqs the same length, padded * with gaps) * * Returns 1 on success. Returns 0 on failure, and sets * squid_errno to indicate the cause. */ int WriteMSF(FILE *fp, /* open fp for writing */ char **aseqs, /* aligned sequences */ int num, struct aliinfo_s *ainfo) { int still_going; /* True if writing another block */ int idx; /* counter for sequences */ int pos; /* position counter */ int namelen; /* maximum name length used */ int len; /* tmp variable for name lengths */ char buffer[51]; /* buffer for writing seq */ char **sqptr; /* ptrs into each sequence */ int charcount; /* num. 
symbols we're writing */ float weight; /* allocate seq pointers that we'll move across each sequence */ if ((sqptr = (char **) malloc (num * sizeof(char *))) == NULL) { squid_errno = SQERR_MEM; return 0; } /* set sqptrs to start of each seq */ for (idx = 0; idx < num; idx++) sqptr[idx] = aseqs[idx]; /* calculate max namelen used */ namelen = 0; for (idx = 0; idx < num; idx++) if ((len = strlen(ainfo->sqinfo[idx].name)) > namelen) namelen = len; /***************************************************** * Write the title line *****************************************************/ fprintf(fp, "\n"); /* ack! we're writing bullshit here */ fprintf(fp, " MSF: 000 Type: X Check: 0000 ..\n"); fprintf(fp, "\n"); /***************************************************** * Write the names *****************************************************/ for (idx = 0; idx < num; idx++) { weight = 1.0; if (ainfo->sqinfo[idx].flags & SQINFO_WGT) weight = ainfo->sqinfo[idx].weight; fprintf(fp, " Name: %-*.*s Len: %5d Check: %5d Weight: %.4f\n", namelen, namelen, ainfo->sqinfo[idx].name, ainfo->alen, GCGchecksum(aseqs[idx], ainfo->alen), weight); } fprintf(fp, "\n"); fprintf(fp, "//\n"); fprintf(fp, "\n"); /***************************************************** * Write the sequences *****************************************************/ still_going = 1; while (still_going) { still_going = 0; for (idx = 0; idx < num; idx++) { fprintf(fp, "%-*.*s ", namelen, namelen, ainfo->sqinfo[idx].name); /* get next line's worth of 50 from seq */ strncpy(buffer, sqptr[idx], 50); buffer[50] = '\0'; charcount = strlen(buffer); /* is there still more to go? 
*/ if (charcount == 50 && sqptr[idx][50] != '\0') still_going = 1; /* shift the seq ptr by a line */ sqptr[idx] += charcount; /* draw the sequence line */ pos = 0; while (pos < charcount) { if (isgap(buffer[pos])) fputc('.', fp); else fputc(buffer[pos], fp); pos++; if (!(pos % 10)) fputc(' ', fp); } fputc('\n', fp); } /* put blank line between blocks */ fputc('\n', fp); } free(sqptr); return 1; } void FlushAlignment(char **aseqs, int num, int *ret_alen) { int len, alen; int idx; int apos; alen = strlen(aseqs[0]); for (idx = 1; idx < num; idx++) if ((len = strlen(aseqs[idx])) > alen) alen = len; for (idx = 0; idx < num; idx++) { if ((aseqs[idx] = (char *) realloc (aseqs[idx], sizeof(char) * (alen+1))) == NULL) Die("realloc failed"); for (apos = strlen(aseqs[idx]); apos < alen; apos++) aseqs[idx][apos] = '.'; aseqs[idx][apos] = '\0'; } *ret_alen = alen; } tRNAscan-SE-2.0/src/revcomp.c0000644000543100007160000000221311021467305015235 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* revcomp.c * * Reverse complement of a IUPAC character string * */ #include #include #include #include "squid.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif char * revcomp(char *comp, char *seq) { long bases; char *bckp, *fwdp; int idx; long pos; int c; if (comp == NULL) return NULL; if (seq == NULL) return NULL; bases = strlen(seq); fwdp = comp; bckp = seq + bases -1; for (pos = 0; pos < bases; pos++) { c = *bckp; c = sre_toupper(c); for (idx = 0; c != iupac[idx].sym && idx < IUPACSYMNUM; idx++); if (idx == IUPACSYMNUM) { Warn("Can't reverse complement an %c, pal. 
Using N.", c);
          *fwdp = 'N';
        }
      else
        *fwdp = iupac[idx].symcomp;
      /* keep the case of the source character */
      if (islower((int) *bckp))
        *fwdp = sre_tolower((int) *fwdp);
      fwdp++;
      bckp--;
    }
  *fwdp = '\0';
  return comp;
}
tRNAscan-SE-2.0/src/instman.pl0000644000543100007160000000154411021467312015430 0ustar pchanlowelab#! /usr/bin/perl
# Sean Eddy, Wed Jul 29 15:24:43 1992
# instman - "cp" replacement for formatting and installing man pages
# Requires that GNU groff is installed: geqn, gtbl, groff.
#
# Usage: instman
#
# Takes exactly two arguments: the man-formatted input file and the
# destination file name. The input is piped through geqn | gtbl | groff
# and the ASCII result is written to the destination.

use strict;
use warnings;
use File::Basename qw(dirname);

my $usage = "Usage: instman \n where is a man-formatted input file,\n and is the name to store the formatted file under\n (such as /usr/catman/local/cat1/foo)\n";

if ($#ARGV != 1) { die "Incorrect argument number.\n$usage"; }

my $man  = shift(@ARGV);
my $dest = shift(@ARGV);
print "Reading $man, installing as $dest\n";

die "Error: can't read $man.\n$usage" unless -r $man;
die "Error: $dest is a directory, not a file name\n$usage" if -d $dest;

# Writability has to be tested on the destination, not on the input file.
# The destination normally does not exist yet, and -w is false for a
# missing file, so in that case test the directory it will be created in.
my $writable;
if (-e $dest) {
    $writable = -w $dest;
}
else {
    my $dest_dir = dirname($dest);
    $writable = -w $dest_dir;
}
die "Error: can't write $dest.\n$usage" unless $writable;

# NOTE(review): $man and $dest are interpolated into a shell command line,
# so file names containing whitespace or shell metacharacters will break.
# The pipeline and redirection need the shell, so this is left as is.
system("geqn $man | gtbl | groff -man -Tascii > $dest") == 0
    or die "Error: formatting $man into $dest failed (wait status $?)\n";
tRNAscan-SE-2.0/src/eufind_main.c0000644000543100007160000002155214044141766016057 0ustar pchanlowelab /* eufindtRNA - Eukaryotic tRNA finder * * C implementation of algorithm described by Pavesi, Conterio, * Bolchi, Dieci, & Ottonello in NAR 22:1247-56 (94) * "Identification of new eukaryotic tRNA genes in genomic DNA * databases by a multistep weight matix analysis of transcriptional * control regions" * * To be used in tRNAscan-SE package to increase sensitivity by * complementing tRNAscan 1.3 first-pass scan * * by Todd MJ Lowe 4/8/96 * * Uses Sean Eddy's function library for biological sequence analysis * (Squid v1.5g) * * v1.1: Small bug fixed 8/2000 that caused second of two consecutive tRNAs * (within 40bp) to be missed if the second tRNA scored lower than the first */ #include #include #include #include #include "squid.h" #include "sqfuncs.h" #include "eufind_const.h" #include "pavesi.h" char eufind_version[] = "1.1"; char eufind_date[] = "Aug 2000";
#define OPTIONS "ho:l:X:I:rsDFi:" char usage[] = "\n\ Usage: eufindtRNA [-options] \n\ Find tRNAs in eukaryotic sequences\n\n\ Available options:\n\ -h : help - print version and usage info\n\ -o : save tRNAs in \n\ -r : relaxed mode (no terminators searched for)\n\ -s : strict mode (discard tRNAs with missing terminators)\n\ -l : set max intron+variable loop length (default=140)\n\ -X : manually set final score cutoff to (def=-31.8)\n\ -I : set cutoff for intermediate score (def=-31.25)\n\ -D : save tRNA score components (for Debugging)\n\ -F : don't check for uppercase or DNA alphabet\n\ -i : start nucleotide numbering at (def=1)\n\n"; int main (int argc, char **argv) { char *seqfile; /* file containing aligned seqs */ char *outfile; /* destination file for tRNAs */ int fmt; /* format of seqfile */ FILE *outfp; /* open outfile */ SQFILE *seqfp; SQINFO sqinfo; int i, errno, ShowScores, /* flag for type of info output when saving tRNAs */ RelaxedMode, /* flag for relaxed scanning mode, do not look for poly T terminator signal */ StrictMode, /* require poly T terminator */ NoReformat; /* flag to prevent extra work of */ /* changing seqs to DNA & upper case */ /* alphabet */ float NoTermPenalty; /* penalty val for tRNAs with no */ /* poly T terminator */ long int sqoffset; /* nucleotide numbering offset (set with -i param) */ char *seq, *revseq, /* sequence */ *iseq, *reviseq; /* encoded seq & encoded reverse comp */ int strand, /* 1 for orig seq, -1 for rev comp */ seqidx; /* current position in seq */ float FirstScore, /* initial (Bbox) logodds score */ IntScore, TotScore, /* cum tRNA logodds scores */ IntScoreCutoff, TotScoreCutoff; /* cutoff for reporting tRNAs */ TRNA_TYPE *tRNA, *prev_tRNA, *swap_tRNA; /* current & previous tRNA info */ int Max_AB_dist; /* max nuc. 
distance searched upstream */ /* of candidate B boxes for A boxes */ int tRNA_ct; int optchar; extern char *optarg; extern int optind; /*********************************************** * Parse command line ***********************************************/ outfile = NULL; TotScoreCutoff = TOT_SCORE_THRESH; IntScoreCutoff = INT_SCORE_THRESH; Max_AB_dist = MIN_AB_BOX_DIST + AB_BOX_DIST_RANGE; sqoffset = 0; ShowScores = 0; NoTermPenalty = MAX_PENALTY; RelaxedMode = 0; StrictMode = 0; NoReformat = 0; while ((optchar = getopt(argc, argv, OPTIONS)) != -1) switch (optchar) { case 'o': outfile = optarg; break; case 'h': printf("eufindtRNA %s, %s\n%s\n", eufind_version, eufind_date, usage); exit(EXIT_SUCCESS); case 'r': RelaxedMode = 1; break; case 's': NoTermPenalty = 10*MAX_PENALTY; StrictMode = 1; break; case 'X': TotScoreCutoff = atof(optarg); break; case 'D': ShowScores = 1; break; case 'F': NoReformat = 1; break; case 'l': Max_AB_dist = MIN_AB_BOX_DIST + atof(optarg); break; case 'I': IntScoreCutoff = atof(optarg); break; case 'i': sqoffset = atof(optarg)-1; break; default: Die("%s\n", usage); } if (argc -optind != 1) Die("Wrong number of arguments specified on command line\n%s\n", usage); seqfile = argv[optind]; if (outfile == NULL) outfp = stdout; else if ((outfp = fopen(outfile, "w")) == NULL) Die("Failed to open tRNA output file %s", outfile); if ((tRNA = (TRNA_TYPE *) malloc (sizeof(TRNA_TYPE))) == NULL) Die("Memory failure, couldn't allocate tRNA memory\n"); if ((prev_tRNA = (TRNA_TYPE *) malloc (sizeof(TRNA_TYPE))) == NULL) Die("Memory failure, couldn't allocate tRNA memory\n"); /*********************************************** * Determine seq format & open for reading * ***********************************************/ if (! 
SeqfileFormat(seqfile, &fmt, NULL)) Die("Can't determine format of file %s\n", seqfile); if ((seqfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) Die("Failed to open sequence file %s for reading", seqfile); while (ReadSeq(seqfp, fmt, &seq, &sqinfo)) { if (ShowScores) printf ("Seq: %s\n",sqinfo.name); tRNA_ct = 0; if (!NoReformat) { ToDNA(seq); s2upper(seq); } /* allocate mem for integer-encoded seq (A=0,C=1,G=2,T=3) */ if ((iseq = calloc (sqinfo.len+2, sizeof(char))) == NULL) Die("Memory failure, couldn't allocate sequence\n"); /* integer-encode sequence */ if ((errno = IntEncodeSeq(iseq,seq,sqinfo.len))) Die("Unable to encode sequence %s at base %d\n", sqinfo.name,errno); /* Search both strands (0=top strand, -1=bottom strand) */ for (strand=0; strand >= -1; strand--) { Init_tRNA(prev_tRNA); /* clear previous tRNA */ /* take reverse complement of encoded seq if searching bottom strand */ if (strand == -1) { if ((revseq = calloc (sqinfo.len+2, sizeof(char))) == NULL) Die("Memory failure, couldn't allocate reverse sequence\n"); revcomp(revseq, seq); free(seq); seq = revseq; if (IntEncodeSeq(iseq,seq,sqinfo.len)) Die("Unable to encode sequence\n"); } /*********************************************** * Find transcriptional promotor elements * ***********************************************/ seqidx=BBOX_START_IDX-2; while (GetBbox(&FirstScore,&seqidx,iseq,sqinfo.len,strand, ShowScores)) { Init_tRNA(tRNA); tRNA->Bbox_st = seqidx; tRNA->Bbox_end = seqidx + BBOX_LEN-1; tRNA->BboxSc = FirstScore; if (((FirstScore >= SEC_LOBOUND) && (FirstScore <= SEC_HIBOUND)) && (GetSecABox(tRNA,seq))){ if (!GetBestTrxTerm(tRNA,seq,sqinfo.len,NoTermPenalty) && StrictMode) continue; /* look for next B box */ strcpy(tRNA->acodon,"TCA"); /* tRNA->Bbox_end++; */ tRNA->totSc = FirstScore; } else { /* Searching for non-SelCys tRNA */ GetBestABox(tRNA,seq,iseq,sqinfo.len,strand,ShowScores, Max_AB_dist,prev_tRNA->Abox_st); IntScore = tRNA->AboxSc + tRNA->BboxSc + tRNA->ABdistSc; if (IntScore < 
IntScoreCutoff) continue; /* look for next B box */ if (!RelaxedMode) { if (!GetBestTrxTerm(tRNA,seq,sqinfo.len,NoTermPenalty) && StrictMode) continue; TotScore = IntScore + tRNA->TermSc; if (TotScore < TotScoreCutoff) continue; /* look for next B box */ tRNA->totSc = TotScore; } else tRNA->totSc = IntScore; } Get_tRNA_stats(tRNA,seq,sqinfo.len,strand); if (tRNAOverlap(tRNA,prev_tRNA,strand)) { if (tRNA->totSc < prev_tRNA->totSc) { Init_tRNA(tRNA); /* skip repeat tRNA */ } else { /* swap, but don't save yet */ swap_tRNA = prev_tRNA; prev_tRNA = tRNA; Init_tRNA(swap_tRNA); tRNA = swap_tRNA; } } else { /* no overlap, save & then swap */ if (prev_tRNA->start > 0) { prev_tRNA->idno = ++tRNA_ct; Save_tRNA(prev_tRNA,&sqinfo,seq,strand,ShowScores,sqoffset); } swap_tRNA = prev_tRNA; prev_tRNA = tRNA; Init_tRNA(swap_tRNA); tRNA = swap_tRNA; } } /* find B box */ /* save last buffered tRNA before going to other strand */ if (prev_tRNA->start > 0) { prev_tRNA->idno = ++tRNA_ct; Save_tRNA(prev_tRNA,&sqinfo,seq,strand,ShowScores,sqoffset); } } /* search both strands */ FreeSequence(seq, &sqinfo); free(iseq); } SeqfileClose(seqfp); fclose(outfp); return 0; } tRNAscan-SE-2.0/src/interleaved.c0000644000543100007160000004016614044137146016102 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* interleaved.c * I/O of interleaved format multiple alignments. * Modified from selex.c * * Fri Dec 4 17:43:24 1992, SRE: * Reading and writing aligned sequences to/from disk files. * Implements a new, broader specification of SELEX format * and supercedes alignio.c. * * SRE, Tue Nov 9 17:40:50 1993: * major revision. #= special comments and aliinfo_s optional * alignment info support added. 
Support for #=CS (consensus * secondary structure), #=SS (individual secondary structure), * #=RF (reference coordinate system), #=SQ (per-sequence header info), * and #=AU ("author") added. * * SRE, Mon Jan 30 14:41:49 1995: * #=SA side chain % surface accessibility annotation supported * * SRE, Mon Sep 11 09:20:08 1995 * selex.c generalized and simplified to make interleaved.c * * SELEX format is documented in Docs/formats.tex. **************************************************************************** */ #include #include #include #include #include #include /* SunOS 4.x isn't fully ANSI-compliant. */ #include "squid.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif static void homogenize_gapsym(char *s, char gapsym); static int copy_alignment_line(char *aseq, int apos, int name_rcol, char *buffer, int lcol, int rcol); static char commentsyms[] = "#%"; /* Function: homogenize_gapsym() * * Purpose: Make gap symbols homogeneous. */ static void homogenize_gapsym(char *s, char gapsym) { for (; *s != '\0'; s++) if (isgap(*s)) *s = gapsym; } /* Function: copy_alignment_line() * * Purpose: Given a line from an alignment file, and bounds lcol,rcol * on what part of it may be sequence, save the alignment into * aseq starting at position apos. * * name_rcol is set to the rightmost column this aseqs's name * occupies; if name_rcol >= lcol, we have a special case in * which the name intrudes into the sequence zone. */ static int copy_alignment_line(char *aseq, int apos, int name_rcol, char *buffer, int lcol, int rcol) { char *s1, *s2; int i; s1 = aseq + apos; s2 = buffer; /* be careful that buffer doesn't end before lcol! */ for (i = 0; i < lcol; i++) if (*s2) s2++; for (i = lcol; i <= rcol; i++) { if (*s2 == '\t') { Warn("TAB characters will corrupt a SELEX alignment! 
Please remove them first."); return 0; } if (name_rcol >= i) *s1 = '.'; /* name intrusion: pad left w/ gaps */ else if (*s2 == '\0' || *s2 == '\n') *s1 = '.'; /* short buffer: pad right w/ gaps */ else *s1 = *s2; /* normal: copy buffer into aseq */ s1++; if (*s2) s2++; } return 1; } /* Function: is_blankline() * * Return TRUE if line is all whitespace. */ static int is_blankline(char *buffer) { for (; *buffer != '\0'; buffer++) if (! isspace(*buffer)) return 0; return 1; } /* CLUSTALV and CLUSTALW support * * parse_header() and is_dataline() functions for ClustalV and ClustalW * interleaved multiple alignment format */ /*ARGSUSED*/ static int parse_clustal(FILE *fp, AINFO *ainfo, int *got_sqinfo) { char buffer[LINEBUFLEN]; do { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { squid_errno = SQERR_FORMAT; return 0; } } while (strncmp(buffer, "CLUSTAL ", 8) != 0 || strstr(buffer, "multiple sequence alignment") == NULL); *got_sqinfo = FALSE; return 1; } static int dataline_clustal(char *buf, char *expected_name) { while (*buf && isspace(*buf)) buf++; if (*buf == '\0' || strchr(commentsyms, *buf) != NULL) return 0; /* blank or comment */ if (expected_name != NULL && strncmp(buf, expected_name, strlen(expected_name)) == 0) return 1; /* matches expected seq name */ for (; *buf != '\0'; buf++) { /* Clustal has no coord lines to worry about */ if (*buf == '*' || *buf == '.') continue; /* possible consensus line */ if (isalnum(*buf)) return 1; /* name or seq character */ if (*buf != ' ' && isgap(*buf)) return 1; /* possible all-gap line */ } return 0; } /* GCG MSF support * * parse_header() and is_dataline() routines for GCG MSF alignments */ static int parse_MSF(FILE *fp, AINFO *ainfo, int *got_sqinfo) { char buffer[LINEBUFLEN]; char *sptr; int nseq; /* Get first dividing line. 
MSF format specifies ints after MSF: and Check: * but we don't make sure of this */ do { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { squid_errno = SQERR_FORMAT; return 0; } } while (strstr(buffer, " MSF: ") == NULL || strstr(buffer, " Check: ") == NULL || strstr(buffer, " ..") == NULL); /* Get names, weight from header */ nseq = 0; /*CONSTCOND*/ while (1) { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { squid_errno = SQERR_FORMAT; return 0; } if (is_blankline(buffer)) continue; if (strncmp(buffer, "//", 2) == 0) break; sptr = strtok(buffer, WHITESPACE); if (sptr == NULL || strcmp(sptr, "Name:") != 0 || strstr(sptr+5, "Weight:") != 0) { squid_errno = SQERR_FORMAT; return 0; } if (nseq == 0) ainfo->sqinfo = (SQINFO *) malloc (sizeof(SQINFO)); else ainfo->sqinfo = (SQINFO *) realloc (ainfo->sqinfo, (nseq + 1) * sizeof(SQINFO)); if (ainfo->sqinfo == NULL) { squid_errno = SQERR_MEM; return 0; } ainfo->sqinfo[nseq].flags = 0; if ((sptr = strtok(NULL, WHITESPACE)) == NULL) {squid_errno=SQERR_FORMAT; return 0; } SetSeqinfoString(&(ainfo->sqinfo[nseq]), sptr, SQINFO_NAME); while (sptr != NULL && strcmp(sptr, "Weight:") != 0) sptr = strtok(NULL, WHITESPACE); if ((sptr = strtok(NULL, WHITESPACE)) == NULL) {squid_errno=SQERR_FORMAT; return 0; } SetSeqinfoString(&(ainfo->sqinfo[nseq]), sptr, SQINFO_WGT); nseq++; } *got_sqinfo = TRUE; return 1; } static int dataline_MSF(char *buf, char *expected_name) { while (*buf && isspace(*buf)) buf++; if (*buf == '\0' || strchr(commentsyms, *buf) != NULL) return 0; /* blank or comment */ if (expected_name != NULL && strncmp(buf, expected_name, strlen(expected_name)) == 0) return 1; /* matches expected seq name */ for (; *buf != '\0'; buf++) { /* MSF has coordinate lines to worry about */ if (isspace(*buf)) continue; /* no info from spaces */ if (isalpha(*buf)||isgap(*buf)) return 1; /* has data on it */ } return 0; } /* Function: ReadInterleaved() * * Purpose: Read multiple aligned sequences from the file seqfile. 
* Store aligned sequences in aseqs, names in names, and * the number of sequences in num. * * Memory is allocated for aseqs and names, and they must be * free'd by the caller. * * If optional information is desired, a non-NULL ainfo * pointer is passed. * * Args: seqfile: name of alignment file to read. * parse_header(): routine to parse the header of the file * is_dataline(): routine to determine if a line contains data * ret_aseqs: RETURN: 2D array of aligned sequences * ret_anum: RETURN: number of aligned sequences * ainfo: RETURN: optional alignment information * * Returns 1 on success. Returns 0 on failure and sets * squid_errno to indicate the cause of the failure. */ int ReadInterleaved(char *seqfile, int (*parse_header)(FILE *, AINFO *, int *), int (*is_dataline)(char *, char *), char ***ret_aseqs, int *ret_num, AINFO *ainfo) { FILE *fp; /* ptr to opened seqfile */ char **aseqs; /* aligned seqs */ int nseq; /* number of seqs read */ char buffer[LINEBUFLEN]; /* input buffer for lines */ char bufcpy[LINEBUFLEN]; /* copy of buffer for strtok */ struct block_struc { /** alignment data for a block: */ int lcol; /* furthest left aligned sym */ int rcol; /* furthest right aligned sym */ } *blocks; int blocknum; /* number of blocks in file */ char *sptr; /* ptr into line during parsing */ int currblock; /* index for blocks */ int seqidx; /* counter for seqs */ int alen; /* length of alignment */ int warn_names; /* becomes TRUE if names don't match between blocks */ int currlen; int count; int inblock; /* TRUE if in a block of data */ long offset; /* used for skipping header */ int got_sqinfo; /* TRUE if header gave us sqinfo */ /*************************************************** * Parse the header of the file, according to caller-supplied function. * The parser is responsible for making sure there are * no non-comment lines before the first block. Comment * or blank lines are OK. The parser may also fill fields * into ainfo. 
***************************************************/ /* open the file for reading */ fp = fopen(seqfile, "r"); if (fp == NULL) { squid_errno = SQERR_NOFILE; return 0; } if (! (*parse_header) (fp, ainfo, &got_sqinfo)) return 0; offset = ftell(fp); /* where are we in the file? */ /*************************************************** * First pass across file. * Count seqs, get names, determine column info ***************************************************/ ainfo->flags = 0; blocknum = 0; nseq = 0; inblock = FALSE; blocks = NULL; warn_names = FALSE; while (!feof(fp)) { /* allocate for info about this block. */ if (blocknum == 0) blocks = (struct block_struc *) malloc (sizeof(struct block_struc)); else blocks = (struct block_struc *) realloc (blocks, (blocknum+1) * sizeof(struct block_struc)); if (blocks == NULL) { squid_errno = SQERR_MEM; return 0; } blocks[blocknum].lcol = LINEBUFLEN+1; blocks[blocknum].rcol = -1; seqidx = 0; /*CONSTCOND*/ while (1) /* breaks out when blank line or EOF is hit, see below */ { /* get a data line */ do { if (fgets(buffer, LINEBUFLEN, fp) == NULL) goto BREAKOUT; if (inblock && is_blankline(buffer)) goto BREAKOUT; } while (! (*is_dataline)(buffer, (got_sqinfo || (seqidx < nseq && blocknum > 0)) ? ainfo->sqinfo[seqidx].name : NULL)); /* copy line for strtok'ing. 
set sptr to first word */ inblock = TRUE; strcpy(bufcpy, buffer); sptr = strtok(bufcpy, WHITESPACE); /* First block only: save names */ if (blocknum == 0 && !got_sqinfo) { if (seqidx == 0) ainfo->sqinfo = (SQINFO *) malloc (sizeof(SQINFO)); else ainfo->sqinfo = (SQINFO *) realloc (ainfo->sqinfo, (seqidx + 1) * sizeof(SQINFO)); if (ainfo->sqinfo == NULL) { squid_errno = SQERR_MEM; return 0; } ainfo->sqinfo[seqidx].flags = 0; SetSeqinfoString(&(ainfo->sqinfo[seqidx]), sptr, SQINFO_NAME); } else /* in each additional block: check names */ { if (strcmp(ainfo->sqinfo[seqidx].name, sptr) != 0) warn_names = TRUE; } seqidx++; /* bump sequence counter */ /* check rcol, lcol */ if ((sptr = strtok(NULL, WHITESPACE)) != NULL) { /* is this the furthest left we've seen word 2 in this block? */ if (sptr - bufcpy < blocks[blocknum].lcol) blocks[blocknum].lcol = sptr - bufcpy; /* look for right side in buffer */ for (sptr = buffer + strlen(buffer) - 1; isspace(*sptr); sptr --) /* do nothing */ ; if (sptr - buffer > blocks[blocknum].rcol) blocks[blocknum].rcol = sptr - buffer; } } /* check that number of sequences matches expected */ BREAKOUT: if (inblock) { if (blocknum != 0 && seqidx != nseq) { squid_errno = SQERR_FORMAT; return 0; } nseq = seqidx; blocknum++; inblock = FALSE; } } /*************************************************** * Get ready for second pass: * figure out the length of the alignment * malloc space * rewind the file ***************************************************/ alen = 0; for (currblock = 0; currblock < blocknum; currblock++) alen += blocks[currblock].rcol - blocks[currblock].lcol + 1; fseek(fp, offset, SEEK_SET); /* rewind to first data block */ /* allocations */ if ((aseqs = (char **) malloc (nseq * sizeof(char *))) == NULL) { squid_errno = SQERR_MEM; return 0; } for (seqidx = 0; seqidx < nseq; seqidx++) if ((aseqs[seqidx] = (char *) malloc ((alen+1) * sizeof(char))) == NULL) { squid_errno = SQERR_MEM; return 0; } ainfo->alen = alen; ainfo->flags |= 
AINFO_ALEN; /*************************************************** * Second pass across file. Parse alignment ***************************************************/ /* We've now made a complete first pass over the file. We know how * many blocks it contains, we know the number of seqs in the first * block, and we know every block has the same number of blocks; * so we can be a bit more cavalier about error-checking as we * make the second pass. */ currlen = 0; for (currblock = 0 ; currblock < blocknum; currblock++) { for (seqidx = 0; seqidx < nseq; seqidx++) { /* get next line */ do { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { squid_errno = SQERR_FORMAT; return 0; } } while (! (*is_dataline)(buffer, ainfo->sqinfo[seqidx].name)); /* find right boundary of name */ sptr = buffer; while (*sptr && isspace(*sptr)) sptr++; while (*sptr && !isspace(*sptr)) sptr++; /* parse line */ if (! copy_alignment_line(aseqs[seqidx], currlen, sptr - buffer, buffer, blocks[currblock].lcol, blocks[currblock].rcol)) { squid_errno = SQERR_FORMAT; return 0; } } currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1; } /* NULL-terminate all the strings */ for (seqidx = 0; seqidx < nseq; seqidx++) { aseqs[seqidx][alen] = '\0'; homogenize_gapsym(aseqs[seqidx], (char) '.'); } /* find raw sequence lengths for sqinfo */ for (seqidx = 0; seqidx < nseq; seqidx++) { count = 0; for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++) if (!isgap(*sptr)) count++; ainfo->sqinfo[seqidx].len = count; ainfo->sqinfo[seqidx].flags |= SQINFO_LEN; } /* tidy up the alignment */ MingapAlignment(aseqs, nseq, ainfo); /*************************************************** * Garbage collection and return ***************************************************/ fclose(fp); free(blocks); if (warn_names) Warn("sequences may be in different orders in blocks of %s?", seqfile); *ret_num = nseq; *ret_aseqs = aseqs; return 1; } /* Function: ReadAlignment() * * Purpose: Given a seqfile name and format, hand it off to appropriate * 
parser. * * Currently, squid can parse alignments from the following * multiple sequence alignment formats: * MSF (U. of Wisconsin GCG package MSF format) * SELEX (NeXagen/CU Boulder SELEX format) * CLUSTAL (Des Higgins' CLUSTALV and CLUSTALW programs) * * Return: 1 on success; 0 on failure. * Returned data should be freed by caller with FreeAlignment() */ int ReadAlignment(char *seqfile, int format, char ***ret_aseqs, int *ret_num, struct aliinfo_s *ret_ainfo) { switch (format) { case kMSF: if (! ReadInterleaved(seqfile, parse_MSF, dataline_MSF, ret_aseqs, ret_num, ret_ainfo)) return 0; break; case kSelex: if (! ReadSELEX(seqfile, ret_aseqs, ret_num, ret_ainfo)) return 0; break; case kClustal: if (! ReadInterleaved(seqfile, parse_clustal, dataline_clustal, ret_aseqs, ret_num, ret_ainfo)) return 0; break; default: squid_errno = SQERR_FORMAT; return 0; } return 1; } tRNAscan-SE-2.0/src/sqio.c0000644000543100007160000013242611672026406014554 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* File: sqio.c * From: ureadseq.c in Don Gilbert's sequence i/o package * * Reads and writes nucleic/protein sequence in various * formats. Data files may have multiple sequences. * * Heavily modified from READSEQ package * Copyright (C) 1990 by D.G. Gilbert * Biology Dept., Indiana University, Bloomington, IN 47405 * email: gilbertd@bio.indiana.edu * Thanks Don! * * SRE: Modifications as noted. 
Fri Jul 3 09:44:54 1992 * Packaged for squid, Thu Oct 1 10:07:11 1992 * ANSI conversion in full swing, Mon Jul 12 12:22:21 1993 */ #include #include #include #include #ifndef SEEK_SET #include /* may Sun Microsystems rot in hell */ #endif #include "squid.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif /* Our hack for "sequential" access of sequences in * interleaved-format alignment files (MSF, SELEX): * static pointers at data for the entire alignment. */ static char **ali_aseqs; static char **ali_rseqs; static struct aliinfo_s ali_ainfo; static int ali_num; static int curridx; #define kStartLength 500 static char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*"; static char *primenuc = "ACGTUN"; static char *protonly = "EFIPQZ"; /* static char stdsymbols[6] = "_.-*?"; */ static char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'|`~\"\\"; static char *seqsymbols = allsymbols; /* use general form of isseqchar -- all chars + symbols. no formats except nbrf (?) use symbols in data area as anything other than sequence chars. (wrong. PIR-CODATA does. Remove /) */ void FreeSequence(char *seq, SQINFO *sqinfo) { if (seq != NULL) free(seq); if (sqinfo->flags & SQINFO_SS) free(sqinfo->ss); if (sqinfo->flags & SQINFO_SA) free(sqinfo->sa); } int SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag) { int len; int pos; while (*sptr == ' ') sptr++; /* ignore leading whitespace */ for (pos = strlen(sptr)-1; pos >= 0; pos--) if (! 
isspace(sptr[pos])) break; sptr[pos+1] = '\0'; /* ignore trailing whitespace */ switch (flag) { case SQINFO_NAME: if (*sptr != '-') { strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1); sqinfo->name[SQINFO_NAMELEN-1] = '\0'; sqinfo->flags |= SQINFO_NAME; } break; case SQINFO_ID: if (*sptr != '-') { strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1); sqinfo->id[SQINFO_NAMELEN-1] = '\0'; sqinfo->flags |= SQINFO_ID; } break; case SQINFO_ACC: if (*sptr != '-') { strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1); sqinfo->acc[SQINFO_NAMELEN-1] = '\0'; sqinfo->flags |= SQINFO_ACC; } break; case SQINFO_DESC: if (*sptr != '-') { if (sqinfo->flags & SQINFO_DESC) /* append? */ { len = strlen(sqinfo->desc); if (len < SQINFO_DESCLEN-2) /* is there room? */ { strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++; strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len); } } else /* else copy */ strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1); sqinfo->desc[SQINFO_DESCLEN-1] = '\0'; sqinfo->flags |= SQINFO_DESC; } break; case SQINFO_START: if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } sqinfo->start = atoi(sptr); if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START; break; case SQINFO_STOP: if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } sqinfo->stop = atoi(sptr); if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP; break; case SQINFO_OLEN: if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } sqinfo->olen = atoi(sptr); if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN; break; case SQINFO_WGT: if (*sptr != '-') { if (!IsReal(sptr)) { squid_errno = SQERR_FORMAT; return 0; } sqinfo->weight = (float) atof(sptr); sqinfo->flags |= SQINFO_WGT; } break; default: Die("Invalid flag %d to SetSeqinfoString()"); } return 1; } void SeqinfoCopy(SQINFO *sq1, SQINFO *sq2) { sq1->flags = sq2->flags; if (sq2->flags & SQINFO_NAME) strcpy(sq1->name, sq2->name); if (sq2->flags & SQINFO_ID) strcpy(sq1->id, sq2->id); if (sq2->flags & SQINFO_ACC) strcpy(sq1->acc, sq2->acc); if (sq2->flags & 
SQINFO_DESC) strcpy(sq1->desc, sq2->desc); if (sq2->flags & SQINFO_LEN) sq1->len = sq2->len; if (sq2->flags & SQINFO_START) sq1->start = sq2->start; if (sq2->flags & SQINFO_STOP) sq1->stop = sq2->stop; if (sq2->flags & SQINFO_OLEN) sq1->olen = sq2->olen; if (sq2->flags & SQINFO_TYPE) sq1->type = sq2->type; if (sq2->flags & SQINFO_WGT) sq1->weight = sq2->weight; if (sq2->flags & SQINFO_SS) sq1->ss = Strdup(sq2->ss); if (sq2->flags & SQINFO_SA) sq1->sa = Strdup(sq2->sa); } /* Function: ToDNA() * * Purpose: Convert a sequence to DNA. * U --> T */ void ToDNA(char *seq) { for (; *seq != '\0'; seq++) { if (*seq == 'U') *seq = 'T'; else if (*seq == 'u') *seq = 't'; } } /* Function: ToRNA() * * Purpose: Convert a sequence to RNA. * T --> U */ void ToRNA(char *seq) { for (; *seq != '\0'; seq++) { if (*seq == 'T') *seq = 'U'; else if (*seq == 't') *seq = 'u'; } } static int isSeqChar(int c) { if (c > 127) return 0; /* IRIX 4.0 bug! isascii(255) returns TRUE */ return (isalpha(c) || strchr(seqsymbols,c)); } static void readline(FILE *f, char *s) { char *cp; if (NULL == fgets(s, LINEBUFLEN, f)) *s = 0; else { cp = strchr(s, '\n'); if (cp != NULL) *cp = 0; } } static void GetLine(struct ReadSeqVars *V) { readline(V->f, V->sbuffer); } /* Function: addseq() * * Purpose: Add a line of sequence to the growing string in V. */ static void addseq(char *s, struct ReadSeqVars *V) { char *ptr; while (*s != 0) { if (isSeqChar((int) *s)) { if (*s == '-' && V->dash_equals_n) *s = 'N'; if (V->seqlen >= V->maxseq) { V->maxseq += kStartLength; ptr = (char*) realloc(V->seq, V->maxseq+1); if (ptr==NULL) { squid_errno = SQERR_MEM; return; } else V->seq = ptr; } V->seq[(V->seqlen)++] = *s; } s++; } } static void addstruc(char *s, struct ReadSeqVars *V) { char *sptr; if (! 
(V->sqinfo->flags & SQINFO_SS)) { if ((V->sqinfo->ss = (char *) malloc ((V->maxseq+1) * sizeof(char))) == NULL) { squid_errno = SQERR_MEM; return; } V->sqinfo->flags |= SQINFO_SS; sptr = V->sqinfo->ss; } else { if ((V->sqinfo->ss = (char *) realloc(V->sqinfo->ss, V->maxseq+1)) == NULL) { squid_errno = SQERR_MEM; return; } sptr = V->sqinfo->ss; while (*sptr != '\0') sptr++; } while (*s != 0) { if (isSeqChar((int)*s)) { *sptr = *s; sptr++; } s++; } *sptr = '\0'; } static void readLoop(int addfirst, int (*endTest)(char *,int *), struct ReadSeqVars *V) { int addend = 0; int done = 0; V->seqlen = 0; if (addfirst) addseq(V->sbuffer, V); do { GetLine(V); done = feof(V->f); done |= (*endTest)(V->sbuffer, &addend); if (addend || !done) addseq(V->sbuffer, V); } while (!done); } static int endPIR(char *s, int *addend) { *addend = 0; if ((strncmp(s, "///", 3) == 0) || (strncmp(s, "ENTRY", 5) == 0)) return 1; else return 0; } static void readPIR(struct ReadSeqVars *V) { char *sptr; /* load first line of entry */ while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0) GetLine(V); if (feof(V->f)) return; if ((sptr = strtok(V->sbuffer + 15, "\n\t ")) != NULL) { SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); } do { GetLine(V); if (!feof(V->f) && strncmp(V->sbuffer, "TITLE", 5) == 0) SetSeqinfoString(V->sqinfo, V->sbuffer+15, SQINFO_DESC); else if (!feof(V->f) && strncmp(V->sbuffer, "ACCESSION", 9) == 0) { if ((sptr = strtok(V->sbuffer+15, " \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); } } while (! 
feof(V->f) && (strncmp(V->sbuffer,"SEQUENCE", 8) != 0)); GetLine(V); /* skip next line, coords */ readLoop(0, endPIR, V); /* reading a real PIR-CODATA database file, we keep the source coords */ V->sqinfo->start = 1; V->sqinfo->stop = V->seqlen; V->sqinfo->olen = V->seqlen; V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; /* get next line */ while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0) GetLine(V); } static int endIG(char *s, int *addend) { *addend = 1; /* 1 or 2 occur in line w/ bases */ return((strchr(s,'1')!=NULL) || (strchr(s,'2')!=NULL)); } static void readIG(struct ReadSeqVars *V) { char *nm; /* position past ';' comments */ do { GetLine(V); } while (! (feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer != ';')) )); if (!feof(V->f)) { if ((nm = strtok(V->sbuffer, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); readLoop(0, endIG, V); } while (!(feof(V->f) || ((*V->sbuffer != '\0') && (*V->sbuffer == ';')))) GetLine(V); } static int endStrider(char *s, int *addend) { *addend = 0; return (strstr( s, "//") != NULL); } static void readStrider(struct ReadSeqVars *V) { char *nm; while ((!feof(V->f)) && (*V->sbuffer == ';')) { if (strncmp(V->sbuffer,"; DNA sequence", 14) == 0) { if ((nm = strtok(V->sbuffer+16, ",\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); } GetLine(V); } if (! feof(V->f)) readLoop(1, endStrider, V); /* load next line */ while ((!feof(V->f)) && (*V->sbuffer != ';')) GetLine(V); } static int endGB(char *s, int *addend) { *addend = 0; return ((strstr(s,"//") != NULL) || (strstr(s,"LOCUS") == s)); } static void readGenBank(struct ReadSeqVars *V) { char *sptr; int in_definition; while (strncmp(V->sbuffer, "LOCUS", 5) != 0) GetLine(V); if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL) { SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); } in_definition = FALSE; while (! feof(V->f)) { GetLine(V); if (! 
feof(V->f) && strstr(V->sbuffer, "DEFINITION") == V->sbuffer) { if ((sptr = strtok(V->sbuffer+12, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); in_definition = TRUE; } else if (! feof(V->f) && strstr(V->sbuffer, "ACCESSION") == V->sbuffer) { if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); in_definition = FALSE; } else if (strncmp(V->sbuffer,"ORIGIN", 6) != 0) { if (in_definition) SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC); } else break; } readLoop(0, endGB, V); /* reading a real GenBank database file, we keep the source coords */ V->sqinfo->start = 1; V->sqinfo->stop = V->seqlen; V->sqinfo->olen = V->seqlen; V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; while (!(feof(V->f) || ((*V->sbuffer!=0) && (strstr(V->sbuffer,"LOCUS") == V->sbuffer)))) GetLine(V); /* SRE: V->s now holds "//", so sequential reads are wedged: fixed Tue Jul 13 1993 */ while (!feof(V->f) && strstr(V->sbuffer, "LOCUS ") != V->sbuffer) GetLine(V); } static int endNBRF(char *s, int *addend) { char *a; if ((a = strchr(s, '*')) != NULL) { /* end of 1st seq */ /* "*" can be valid base symbol, drop it here */ *a = 0; *addend = 1; return(1); } else if (*s == '>') { /* start of next seq */ *addend = 0; return(1); } else return(0); } static void readNBRF(struct ReadSeqVars *V) { char *sptr; if ((sptr = strtok(V->sbuffer+4, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); GetLine(V); /*skip title-junk line*/ readLoop(0, endNBRF, V); while (!(feof(V->f) || (*V->sbuffer != 0 && *V->sbuffer == '>'))) GetLine(V); } static int endGCGdata(char *s, int *addend) { *addend = 0; return (*s == '>'); } static void readGCGdata(struct ReadSeqVars *V) { char *sptr, *lptr; int binary = FALSE; /* whether data are binary or not */ int blen; /* length of binary sequence */ /* first line contains ">>>>" followed by name */ if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->sbuffer, NULL, 2, &sptr, &lptr) == 0) { 
binary = TRUE; SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); blen = atoi(lptr); free(sptr); free(lptr); } else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->sbuffer, NULL, 1, &sptr) == 0) { SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); free(sptr); } else Die("bogus GCGdata format? %s", V->sbuffer); /* second line contains free text description */ GetLine(V); SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC); if (binary) { /* allocate for blen characters +3... (allow for 3 bytes of slop) */ if (blen >= V->maxseq) { V->maxseq = blen; if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL) Die("malloc failed"); } /* read (blen+3)/4 bytes from file */ if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < ((blen+3)/4)) Die("fread failed"); V->seqlen = blen; /* convert binary code to seq */ GCGBinaryToSequence(V->seq, blen); } else readLoop(0, endGCGdata, V); while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>')))) GetLine(V); } static int endPearson(char *s, int *addend) { *addend = 0; return(*s == '>'); } static void readPearson(struct ReadSeqVars *V) { char *sptr; /* check for my special FASTA format */ if (strstr(V->sbuffer, "..") != NULL && strstr(V->sbuffer, "::") != NULL) { if ((sptr = strtok(V->sbuffer+1, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); if ((sptr = strtok(NULL, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_WGT); if ((sptr = strtok(NULL, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); if ((sptr = strtok(NULL, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); if ((sptr = strtok(NULL, ".")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_START); if ((sptr = strtok(NULL, ".:")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_STOP); if ((sptr = strtok(NULL, ":\t\n ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_OLEN); if ((sptr = strtok(NULL, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); } else /* else we're normal 
FASTA format */ { if ((sptr = strtok(V->sbuffer+1, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); if ((sptr = strtok(NULL, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); } readLoop(0, endPearson, V); while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>')))) GetLine(V); } static int endEMBL(char *s, int *addend) { *addend = 0; /* Some people (Berlin 5S rRNA database, f'r instance) use * an extended EMBL format that attaches extra data after * the sequence -- watch out for that. We use the fact that * real EMBL sequence lines begin with five spaces. * * We can use this as the sole end test because readEMBL() will * advance to the next ID line before starting to read again. */ return (strncmp(s," ",5) != 0); /* return ((strstr(s,"//") != NULL) || (strstr(s,"ID ") == s)); */ } static void readEMBL(struct ReadSeqVars *V) { char *sptr; /* make sure we have first line */ while (!feof(V->f) && strncmp(V->sbuffer, "ID ", 4) != 0) GetLine(V); if ((sptr = strtok(V->sbuffer+5, "\n\t ")) != NULL) { SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); } do { GetLine(V); if (!feof(V->f) && strstr(V->sbuffer, "AC ") == V->sbuffer) { if ((sptr = strtok(V->sbuffer+5, "; \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); } else if (!feof(V->f) && strstr(V->sbuffer, "DE ") == V->sbuffer) { if ((sptr = strtok(V->sbuffer+5, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); } } while (! 
feof(V->f) && strncmp(V->sbuffer,"SQ",2) != 0); readLoop(0, endEMBL, V); /* reading a real EMBL database file, we keep the source coords */ V->sqinfo->start = 1; V->sqinfo->stop = V->seqlen; V->sqinfo->olen = V->seqlen; V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; /* load next record's ID line */ while (!feof(V->f) && strncmp(V->sbuffer, "ID ", 4) != 0) GetLine(V); } static int endZuker(char *s, int *addend) { *addend = 0; return( *s == '(' ); } static void readZuker(struct ReadSeqVars *V) { char *sptr; GetLine(V); /*s == "seqLen seqid string..."*/ if ((sptr = strtok(V->sbuffer+6, " \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); if ((sptr = strtok(NULL, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); readLoop(0, endZuker, V); while (!(feof(V->f) | ((*V->sbuffer != '\0') & (*V->sbuffer == '(')))) GetLine(V); } static void readUWGCG(struct ReadSeqVars *V) { char *si; char *sptr; int done; V->seqlen = 0; /*writeseq: " %s Length: %d (today) Check: %d ..\n" */ /*drop above or ".." from id*/ if ((si = strstr(V->sbuffer," Length: ")) != NULL) *si = 0; else if ((si = strstr(V->sbuffer,"..")) != NULL) *si = 0; if ((sptr = strtok(V->sbuffer, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); do { done = feof(V->f); GetLine(V); if (! 
done) addseq(V->sbuffer, V); } while (!done); } static void readSquid(struct ReadSeqVars *V) { char *sptr; int dostruc = FALSE; while (strncmp(V->sbuffer, "NAM ", 4) != 0) GetLine(V); if ((sptr = strtok(V->sbuffer+4, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); /*CONSTCOND*/ while (1) { GetLine(V); if (feof(V->f)) {squid_errno = SQERR_FORMAT; return; } if (strncmp(V->sbuffer, "SRC ", 4) == 0) { if ((sptr = strtok(V->sbuffer+4, " \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); if ((sptr = strtok(NULL, " \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); if ((sptr = strtok(NULL, ".")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_START); if ((sptr = strtok(NULL, ".:")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_STOP); if ((sptr = strtok(NULL, ": \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_OLEN); } else if (strncmp(V->sbuffer, "DES ", 4) == 0) { if ((sptr = strtok(V->sbuffer+4, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); } else if (strncmp(V->sbuffer,"SEQ", 3) == 0) break; } if (strstr(V->sbuffer, "+SS") != NULL) dostruc = TRUE; V->seqlen = 0; /*CONSTCOND*/ while (1) { /* sequence line */ GetLine(V); if (feof(V->f) || strncmp(V->sbuffer, "++", 2) == 0) break; addseq(V->sbuffer, V); /* structure line */ if (dostruc) { GetLine(V); if (feof(V->f)) { squid_errno = SQERR_FORMAT; return; } addstruc(V->sbuffer, V); } } while (!feof(V->f) && strncmp(V->sbuffer, "NAM ", 4) != 0) GetLine(V); } /* Function: SeqfileOpen() * * Purpose : Open a sequence database file and prepare for reading * sequentially. * * Args: filename - name of file to open * format - format of file * env - environment variable for path (e.g. BLASTDB) * * Returns opened SQFILE ptr, or NULL on failure. 
*/ SQFILE * SeqfileOpen(char *filename, int format, char *env) { SQFILE *dbfp; if ((dbfp = (SQFILE *) malloc (sizeof(SQFILE))) == NULL) { squid_errno = SQERR_MEM; return NULL; } if ((dbfp->f = fopen(filename, "r")) == NULL && (dbfp->f = EnvFileOpen(filename, env)) == NULL) { squid_errno = SQERR_NOFILE; return NULL; } /* If it's a SELEX- or MSF-formatted alignment file, we * hack a way to fake sequential access: read the whole * alignment into a static at once. Relies on the prediction * that these files will always be relatively small, unlike * GenBank or EMBL databases. */ /* Note (Tue Aug 30 11:57:19 1994): This cause a bug: * ReadSeq() is not reentrant for alignment files. */ if (format == kSelex || format == kMSF || format == kClustal) { if (! ReadAlignment(filename, format, &ali_aseqs, &ali_num, &ali_ainfo)) return NULL; if (! DealignAseqs(ali_aseqs, ali_num, &ali_rseqs)) return NULL; curridx = 0; } /* Load the first line. */ GetLine(dbfp); return dbfp; } /* Function: SeqfilePosition() * * Purpose: Move to a particular offset in a seqfile. * Will not work on interleaved files (SELEX, MSF). */ void SeqfilePosition(SQFILE *sqfp, long offset) { fseek(sqfp->f, offset, SEEK_SET); GetLine(sqfp); } void SeqfileClose(SQFILE *sqfp) { fclose(sqfp->f); free(sqfp); } /* Function: ReadSeq() * * Purpose: Read next sequence from an open database file. * Return the sequence and associated info. * * Args: fp - open sequence database file pointer * format - format of the file (previously determined * by call to SeqfileFormat()) * ret_seq - RETURN: sequence * sqinfo - RETURN: filled in w/ other information * * Return: 1 on success, 0 on failure. * ret_seq and some field of sqinfo are allocated here, * The preferred call mechanism to properly free the memory is: * * SQINFO sqinfo; * char *seq; * * ReadSeq(fp, format, &seq, &sqinfo); * ... do something... 
* FreeSequence(seq, &sqinfo); */ int ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo) { int gotuw; squid_errno = SQERR_OK; if (format < kMinFormat || format > kMaxFormat) { squid_errno = SQERR_FORMAT; *ret_seq = NULL; return 0; } /* Here's the hack for accessing sequences from * the multiple sequence alignment formats */ if (format == kMSF || format == kSelex || format == kClustal) { if (curridx >= ali_num) { /* none left; free static ptrs */ FreeAlignment(ali_aseqs, ali_num, &ali_ainfo); free(ali_rseqs); /* only free rseqs pointer array */ return 0; } SeqinfoCopy(sqinfo, &(ali_ainfo.sqinfo[curridx])); *ret_seq = ali_rseqs[curridx]; curridx++; return 1; } else { if (feof(V->f)) return 0; V->seq = (char*) calloc (kStartLength+1, sizeof(char)); V->maxseq = kStartLength; V->seqlen = 0; V->sqinfo = sqinfo; V->sqinfo->flags = 0; V->dash_equals_n = (format == kEMBL) ? TRUE : FALSE; switch (format) { case kIG : readIG(V); break; case kStrider : readStrider(V); break; case kGenBank : readGenBank(V); break; case kNBRF : readNBRF(V); break; case kPearson : readPearson(V); break; case kXPearson: readPearson(V); break; case kEMBL : readEMBL(V); break; case kZuker : readZuker(V); break; case kPIR : readPIR(V); break; case kSquid : readSquid(V); break; case kGCGdata : readGCGdata(V); break; case kGCG: do { /* skip leading comments on GCG file */ gotuw = (strstr(V->sbuffer,"..") != NULL); if (gotuw) readUWGCG(V); GetLine(V); } while (! feof(V->f)); break; case kIdraw: /* SRE: no attempt to read idraw postscript */ default: squid_errno = SQERR_FORMAT; free(V->seq); return 0; } V->seq[V->seqlen] = 0; /* stick a string terminator on it */ } /* Cleanup */ sqinfo->len = V->seqlen; sqinfo->flags |= SQINFO_LEN; *ret_seq = V->seq; if (squid_errno == SQERR_OK) return 1; else return 0; } /* Function: SeqfileFormat() * * Purpose: Determine format of seqfile, and return it * through ret_format. From Gilbert's seqFileFormat(). 
* * Args: filename - name of sequence file * ret_format - RETURN: format code for file, see squid.h * for codes. * env - name of environment variable containing * a directory path that filename might also be * found in. "BLASTDB", for example. Can be NULL. * * Return: 1 on success, 0 on failure. */ int SeqfileFormat(char *filename, int *ret_format, char *env) { int foundIG = 0; int foundStrider = 0; int foundGB = 0; int foundEMBL = 0; int foundNBRF = 0; int foundPearson = 0; int foundXPearson= 0; int foundZuker = 0; int gotGCGdata = 0; int gotPIR = 0; int gotSquid = 0; int gotuw = 0; int gotMSF = 0; int gotClustal = 0; int done = 0; int format = kUnknown; int nlines= 0, dnalines= 0; int splen = 0; char sp[LINEBUFLEN]; FILE *fseq; #define ReadOneLine(sp) \ { done |= (feof(fseq)); \ readline( fseq, sp); \ if (!done) { splen = (int) strlen(sp); ++nlines; } } if ((fseq = fopen(filename, "r")) == NULL && (fseq = EnvFileOpen(filename, env)) == NULL) { squid_errno = SQERR_NOFILE; return 0; } /* Look at a line at a time */ while ( !done ) { ReadOneLine(sp); if (sp==NULL || *sp=='\0') /*EMPTY*/ ; /* high probability identities: */ else if (strstr(sp, " MSF:") != NULL && strstr(sp, " Type:") != NULL && strstr(sp, " Check:") != NULL) gotMSF = 1; else if (strncmp(sp, "CLUSTAL ", 8) == 0 && strstr( sp, "multiple sequence alignment")) gotClustal = 1; else if (strstr(sp," Check: ") != NULL) gotuw= 1; else if (strncmp(sp, "///", 3) == 0 || strncmp(sp, "ENTRY ", 6) == 0) gotPIR = 1; else if (strncmp(sp, "++", 2) == 0 || strncmp(sp, "NAM ", 4) == 0) gotSquid = 1; else if (strncmp(sp, ">>>>", 4) == 0 && strstr(sp, "Len: ")) gotGCGdata = 1; /* uncertain identities: */ else if (*sp ==';') { if (strstr(sp,"Strider") !=NULL) foundStrider= 1; else foundIG= 1; } else if (strncmp(sp,"LOCUS",5) == 0 || strncmp(sp,"ORIGIN",5) == 0) foundGB= 1; else if (*sp == '>') { if (sp[3] == ';') foundNBRF = 1; else if (strstr(sp, "::") && strstr(sp, "..")) foundXPearson = 1; else foundPearson = 1; } else 
if (strstr(sp,"ID ") == sp || strstr(sp,"SQ ") == sp) foundEMBL= 1; else if (*sp == '(') foundZuker= 1; else { switch (Seqtype( sp )) { case kDNA: case kRNA: if (splen>20) dnalines++; break; default: break; } } if (gotMSF) {format = kMSF; done = 1; } else if (gotClustal) {format = kClustal; done = 1; } else if (gotSquid) {format = kSquid; done = 1; } else if (gotPIR) {format = kPIR; done = 1; } else if (gotGCGdata) {format = kGCGdata; done = 1; } else if (gotuw) { if (foundIG) format= kIG; /* a TOIG file from GCG for certain */ else format= kGCG; done= 1; } else if ((dnalines > 1) || done || (nlines > 500)) { /* decide on most likely format */ /* multichar idents: */ if (foundStrider) format= kStrider; else if (foundGB) format= kGenBank; else if (foundEMBL) format= kEMBL; else if (foundNBRF) format= kNBRF; /* single char idents: */ else if (foundIG) format= kIG; else if (foundPearson) format= kPearson; else if (foundXPearson)format= kXPearson; else if (foundZuker) format= kZuker; /* spacing ident: */ else if (IsSELEXFormat(filename)) format= kSelex; /* no format chars: */ else { squid_errno = SQERR_FORMAT; return 0; } done= 1; } } if (fseq!=NULL) fclose(fseq); *ret_format = format; return 1; #undef ReadOneLine } /* Function: GCGBinaryToSequence() * * Purpose: Convert a GCG 2BIT binary string to DNA sequence. * 0 = C 1 = T 2 = A 3 = G * 4 nts/byte * * Args: seq - binary sequence. Converted in place to DNA. * len - length of DNA. 
binary is (len+3)/4 bytes */ int GCGBinaryToSequence(char *seq, int len) { int bpos; /* position in binary */ int spos; /* position in sequence */ char twobit; int i; for (bpos = (len-1)/4; bpos >= 0; bpos--) { twobit = seq[bpos]; spos = bpos*4; for (i = 3; i >= 0; i--) { switch (twobit & 0x3) { case 0: seq[spos+i] = 'C'; break; case 1: seq[spos+i] = 'T'; break; case 2: seq[spos+i] = 'A'; break; case 3: seq[spos+i] = 'G'; break; } twobit = twobit >> 2; } } seq[len] = '\0'; return 1; } int GCGchecksum(char *seq, int seqlen) { int check = 0, count = 0, i; for (i = 0; i < seqlen; i++) { count++; check += count * sre_toupper((int) seq[i]); if (count == 57) count = 0; } return (check % 10000); } /* Function: GCGMultchecksum() * * Purpose: Simple modification of GCGchecksum(), * to create a checksum for multiple sequences. * Gaps count. * * Args: seqs - sequences to be checksummed * nseq - number of sequences * * Return: the checksum, a number between 0 and 9999 */ int GCGMultchecksum(char **seqs, int nseq) { int check = 0; int count = 0; int idx; char *sptr; for (idx = 0; idx < nseq; idx++) for (sptr = seqs[idx]; *sptr; sptr++) { count++; check += count * sre_toupper((int) *sptr); if (count == 57) count = 0; } return (check % 10000); } /* Function: Seqtype() * * Purpose: Returns a (very good) guess about type of sequence: * kDNA, kRNA, kAmino, or kOtherSeq. * * Modified from, and replaces, Gilbert getseqtype(). */ int Seqtype(char *seq) { int saw; /* how many non-gap characters I saw */ char c; int po = 0; /* count of protein-only */ int nt = 0; /* count of t's */ int nu = 0; /* count of u's */ int na = 0; /* count of nucleotides */ int aa = 0; /* count of amino acids */ int no = 0; /* count of others */ /* Look at the first 300 non-gap characters */ for (saw = 0; *seq != '\0' && saw < 300; seq++) { c = sre_toupper((int) *seq); if (! 
isgap(c)) { if (strchr(protonly, c)) po++; else if (strchr(primenuc,c)) { na++; if (c == 'T') nt++; else if (c == 'U') nu++; } else if (strchr(aminos,c)) aa++; else if (isalpha(c)) no++; saw++; } } if (no > 0) return kOtherSeq; else if (po > 0) return kAmino; else if (na > aa) { if (nu > nt) return kRNA; else return kDNA; } else return kAmino; } int WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo) { int numline = 0; int lines = 0, spacer = 0, width = 50, tab = 0; int i, j, l, l1, ibase; char endstr[10]; char s[100]; /* buffer for sequence */ char ss[100]; /* buffer for structure */ int checksum = 0; int seqlen; int which_case; /* 0 = do nothing. 1 = upper case. 2 = lower case */ int dostruc; /* TRUE to print structure lines*/ which_case = 0; dostruc = FALSE; seqlen = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq); /* intercept Selex-format requests - SRE */ if (outform == kSelex) { fprintf(outf, "%10s %s\n", sqinfo->name, seq); return 1; } if (outform == kClustal || outform == kMSF) { Warn("Tried to write Clustal or MSF with WriteSeq() -- bad, bad."); return 1; } strcpy( endstr,""); l1 = 0; /* 10Nov91: write this out in all possible formats: */ checksum = GCGchecksum(seq, seqlen); switch (outform) { case kUnknown: /* no header, just sequence */ strcpy(endstr,"\n"); /* end w/ extra blank line */ break; case kGenBank: fprintf(outf,"LOCUS %s %d bp\n", (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name, seqlen); fprintf(outf,"DEFINITION %s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); fprintf(outf,"ACCESSION %s\n", (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); fprintf(outf,"ORIGIN \n"); spacer = 11; numline = 1; strcpy(endstr, "\n//"); break; case kGCGdata: fprintf(outf, ">>>>%s 9/95 ASCII Len: %d\n", sqinfo->name, seqlen); fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? 
sqinfo->desc : "-"); break; case kNBRF: if (Seqtype(seq) == kAmino) fprintf(outf,">P1;%s\n", sqinfo->name); else fprintf(outf,">DL;%s\n", sqinfo->name); fprintf(outf,"%s %s\n", sqinfo->name, (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ""); spacer = 11; strcpy(endstr,"*"); break; case kPIR: fprintf(outf, "ENTRY %s\n", (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name); fprintf(outf, "TITLE %s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); fprintf(outf, "ACCESSION %s\n", (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); fprintf(outf, "SUMMARY #Length %d #Checksum %d\n", sqinfo->len, checksum); fprintf(outf, "SEQUENCE\n"); fprintf(outf, " 5 10 15 20 25 30\n"); spacer = 2; /* spaces after every residue */ numline = 1; /* number lines w/ coords */ width = 30; /* 30 aa per line */ strcpy(endstr, "\n///"); break; case kSquid: fprintf(outf, "NAM %s\n", sqinfo->name); if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN)) fprintf(outf, "SRC %s %s %d..%d::%d\n", (sqinfo->flags & SQINFO_ID) ? sqinfo->id : "-", (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-", (sqinfo->flags & SQINFO_START) ? sqinfo->start : 0, (sqinfo->flags & SQINFO_STOP) ? sqinfo->stop : 0, (sqinfo->flags & SQINFO_OLEN) ? sqinfo->olen : 0); if (sqinfo->flags & SQINFO_DESC) fprintf(outf, "DES %s\n", sqinfo->desc); if (sqinfo->flags & SQINFO_SS) { fprintf(outf, "SEQ +SS\n"); dostruc = TRUE; /* print structure lines too */ } else fprintf(outf, "SEQ\n"); numline = 1; /* number seq lines w/ coords */ strcpy(endstr, "\n++"); break; case kEMBL: fprintf(outf,"ID %s\n", (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name); fprintf(outf,"AC %s\n", (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); fprintf(outf,"DE %s\n", (sqinfo->flags & SQINFO_DESC) ? 
sqinfo->desc : "-"); fprintf(outf,"SQ %d BP\n", seqlen); strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/ tab = 5; /** added 31jan91 */ spacer = 11; /** added 31jan91 */ break; case kGCG: fprintf(outf,"%s\n", sqinfo->name); if (sqinfo->flags & SQINFO_ACC) fprintf(outf,"ACCESSION %s\n", sqinfo->acc); if (sqinfo->flags & SQINFO_DESC) fprintf(outf,"DEFINITION %s\n", sqinfo->desc); fprintf(outf," %s Length: %d (today) Check: %d ..\n", sqinfo->name, seqlen, checksum); spacer = 11; numline = 1; strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */ break; case kStrider: /* ?? map ?*/ fprintf(outf,"; ### from DNA Strider ;-)\n"); fprintf(outf,"; DNA sequence %s, %d bases, %d checksum.\n;\n", sqinfo->name, seqlen, checksum); strcpy(endstr, "\n//"); break; /* SRE: Don had Zuker default to Pearson, which is not intuitive or helpful, since Zuker's MFOLD can't read Pearson format. More useful to use kIG */ case kZuker: which_case = 1; /* MFOLD requires upper case. */ /*FALLTHRU*/ case kIG: fprintf(outf,";%s %s\n", sqinfo->name, (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ""); fprintf(outf,"%s\n", sqinfo->name); strcpy(endstr,"1"); /* == linear dna */ break; case kRaw: /* Raw: just print the whole sequence. */ fprintf(outf, "%s\n", seq); return 1; case kXPearson: if (sqinfo->flags & SQINFO_WGT) fprintf(outf, "> %s %f ", sqinfo->name, sqinfo->weight); else fprintf(outf, "> %s - ", sqinfo->name); fprintf(outf, "%s %s %d..%d::%d %s\n", (sqinfo->flags & SQINFO_ID) ? sqinfo->id : "-", (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-", (sqinfo->flags & SQINFO_START) ? sqinfo->start : 0, (sqinfo->flags & SQINFO_STOP) ? sqinfo->stop : 0, (sqinfo->flags & SQINFO_OLEN) ? sqinfo->olen : 0, (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); break; default : case kPearson: fprintf(outf,">%s %s\n", sqinfo->name, (sqinfo->flags & SQINFO_DESC) ? 
sqinfo->desc : ""); break; } if (which_case == 1) s2upper(seq); if (which_case == 2) s2lower(seq); width = MIN(width,100); for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) { if (l1 < 0) l1 = 0; else if (l1 == 0) { if (numline) fprintf(outf,"%8d ",ibase); for (j=0; jflags & SQINFO_SS) ? sqinfo->ss[i] : '.'; l++; i++; l1++; /* don't count spaces for width*/ if (l1 == width || i == seqlen) { s[l] = ss[l] = '\0'; l = 0; l1 = 0; if (dostruc) { fprintf(outf, "%s\n", s); if (numline) fprintf(outf," "); for (j=0; jgsifp = fopen(gsifile, "r")) == NULL) { squid_errno = SQERR_NOFILE; return NULL; } if (! fread(magic, GSI_KEYSIZE, 1, gsi->gsifp)) { squid_errno = SQERR_NODATA; return NULL; } if (strcmp(magic, "GSI") != 0) { squid_errno = SQERR_FORMAT; return NULL; } if (! fread(&(gsi->nfiles), sizeof(short), 1, gsi->gsifp)) { squid_errno = SQERR_NODATA; return NULL; } if (! fread(&(gsi->recnum), sizeof(long), 1, gsi->gsifp)) { squid_errno = SQERR_NODATA; return NULL; } return gsi; } /* Function: GSIGetOffset() * * Purpose: From a key (sequence name), find a disk offset * in an open general sequence index file by binary * search. Presumably GSI indexing could be even faster * if we used hashing. * * Args: gsi - GSI index file, opened by GSIOpen() * key - name of key to retrieve indices for * ret_seqfile - pre-alloced char[32] array for seqfile name * ret_offset - return: disk offset in seqfile. */ int GSIGetOffset(GSIFILE *gsi, char *key, char *ret_seqfile, long *ret_offset) { long left, right, mid; int cmp; char name[GSI_KEYSIZE + 1]; short filenum; name[GSI_KEYSIZE] = '\0'; left = gsi->nfiles + 1; right = gsi->nfiles + gsi->recnum; mid = (left + right) / 2; fseek(gsi->gsifp, mid * GSI_RECSIZE, SEEK_SET); while (fread(name, GSI_KEYSIZE, 1, gsi->gsifp)) { cmp = strcmp(name, key); if (cmp == 0) break; /* name same as key: done! */ else if (left >= right) return 0; /* failed to find key: fail! 
*/ else if (cmp < 0) left = mid + 1; /* name greater than key */ else if (cmp > 0) right = mid - 1; /* name less than key */ mid = (left + right) / 2; fseek(gsi->gsifp, mid * GSI_RECSIZE, SEEK_SET); } /* Read the remainder of the record */ if (! fread(&filenum, sizeof(short), 1, gsi->gsifp)) { squid_errno = SQERR_NODATA; return 0; } if (! fread(ret_offset, sizeof(long), 1, gsi->gsifp)) { squid_errno = SQERR_NODATA; return 0; } /* Look up the sequence name */ fseek(gsi->gsifp, filenum * GSI_RECSIZE, SEEK_SET); if (! fread(ret_seqfile, GSI_KEYSIZE, 1, gsi->gsifp)) { squid_errno = SQERR_NODATA; return 0; } return 1; } /* Function: GSIClose() * * Purpose: Close an open GSI sequence index file. */ void GSIClose(GSIFILE *gsi) { fclose(gsi->gsifp); free(gsi); } /* Function: EnvFileOpen() * Date: Sun Feb 12 10:55:29 1995 * * Purpose: Open a file, given a file name and an environment * variable that contains a directory path. Files * are opened read-only. Uses UNIX getenv() and * thus is UNIX-specific. * * For instance: * fp = EnvFileOpen("BLOSUM45", "BLASTMAT"); * or: * fp = EnvFileOpen("swiss", "BLASTDB"); * * Args: fname - name of file to open * env - name of environment variable containing path * * Return: FILE * to open file, or NULL on failure -- same as fopen() */ FILE * EnvFileOpen(char *fname, char *env) { FILE *fp; char *path; char full[512]; if (env == NULL) return NULL; if ((path = getenv(env)) == NULL) return NULL; if (((int) strlen(fname) + (int) strlen(env) + 2) > 512) return NULL; sprintf(full, "%s/%s", path, fname); fp = fopen(full, "r"); return fp; } tRNAscan-SE-2.0/src/types.c0000644000543100007160000000443511021467306014737 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* file: types.c * * Finicky type checkers for strings. 
Return 1 (TRUE) if ok, 0 elsewise. * */ #include #include #include "squid.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif /* Function: IsInt() * * Returns TRUE if s points to something that atoi() will parse * completely and convert to an integer. */ int IsInt(char *s) { int hex = 0; if (s == NULL) {squid_errno = SQERR_PARAMETER; return 0; } /* skip whitespace */ while (isspace(*s)) s++; /* skip leading sign */ if (*s == '-' || *s == '+') s++; /* skip leading conversion signals */ if ((strncmp(s, "0x", 2) == 0 && (int) strlen(s) > 2) || (strncmp(s, "0X", 2) == 0 && (int) strlen(s) > 2)) { s += 2; hex = 1; } else if (*s == '0' && (int) strlen(s) > 1) s++; /* examine remainder for garbage chars */ if (!hex) while (*s != '\0') { if (!isdigit(*s)) return 0; s++; } else while (*s != '\0') { if (!isxdigit(*s)) return 0; s++; } return 1; } /* Function: IsReal() * * Purpose: Returns TRUE if s is a string representation * of a valid floating point number. */ int IsReal(char *s) { int gotdecimal = 0; int gotexp = 0; int gotreal = 0; if (s == NULL) return 0; while (isspace(*s)) s++; /* skip leading whitespace */ if (*s == '-' || *s == '+') s++; /* skip leading sign */ /* Examine remainder for garbage. Allowed one '.' and * one 'e' or 'E'; if both '.' and e/E occur, '.' * must be first. */ while (*s != '\0') { if (isdigit(*s)) gotreal++; else if (*s == '.') { if (gotdecimal) return 0; /* can't have two */ if (gotexp) return 0; /* e/E preceded . */ else gotdecimal++; } else if (*s == 'e' || *s == 'E') { if (gotexp) return 0; /* can't have two */ else gotexp++; } else if (isspace(*s)) break; s++; } while (isspace(*s)) s++; /* skip trailing whitespace */ if (*s == '\0' && gotreal) return 1; else return 0; } tRNAscan-SE-2.0/src/sre_string.c0000644000543100007160000001241311021467305015744 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. 
Eddy
 *
 *   This source code is distributed under terms of the
 *   GNU General Public License. See the files COPYING 
 *   and GNULICENSE for further details.
 *
 */

/* sre_string.c
 * 
 * my library of extra string functions. Some for portability
 * across UNIXes
 */

/* NOTE(review): the next three #include lines have no header name in this
 * copy of the file; the names appear to have been lost. Restore them from
 * a pristine source before compiling. */
#include
#include
#include
#include "gnuregex.h"
#include "squid.h"

#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif

/* Obsolete. Anyone who doesn't have strstr() is
 * not ANSI-compliant and must die.
 */
#ifdef NOSTR
/* Fallback strstr(): return a pointer to the first place in s where subs
 * occurs, or NULL if it does not occur. Only compiled when NOSTR is set. */
char  *strstr(char *s, char *subs)
{
  int  i;

  for ( ; *s != 0; s++) {
    if (*s == *subs) {
      /* first characters agree: compare the rest of subs */
      for (i = 1; subs[i] != 0 && subs[i] == s[i]; i++) ;
      if (subs[i] == 0) return(s);
      }
    }
  return (NULL);
}
#endif /* NOSTR */


/* Strdup(): return a malloc'ed copy of s, or NULL if malloc fails.
 * The caller frees the copy. s must not be NULL (it is passed to strlen).
 */
char *
Strdup(char *s)
{
  char *new;
  if ((new = (char *) malloc (strlen(s) +1)) == NULL) return NULL;
  strcpy(new, s);
  return new;
}

/* Strinsert(): insert character c into s1 at index pos, moving the rest of
 * the string (terminator included) one place to the right. The buffer must
 * have room for one more character. Always returns 1.
 */
int
Strinsert(char  *s1,            /* string to insert a char into  */
          char   c,             /* char to insert                */
          int    pos)           /* position in s1 to insert c at */
{
  char    oldc;
  char   *s;

  /* Carry one character forward at a time; the loop stops once the
   * carried character is the old terminator. */
  for (s = s1 + pos; c; s++)
    {
                                /* swap current char for inserted one */
      oldc = *s;                /* pick up current */
      *s   = c;                 /* put down inserted one    */
      c    = oldc;              /* old becomes next to insert */
    }
  *s = '\0';                    /* re-terminate, one place further right */

  return 1;
}


/* Strdelete(): remove the character at index pos by moving everything
 * after it (terminator included) one place to the left. Always returns 1.
 */
int
Strdelete(char *s1,             /* string to delete a char from       */
          int   pos)            /* position of char to delete 0..n-1  */
{
  char *s;                      

  for (s = s1 + pos; *s; s++)
    *s = *(s + 1);

  return 1;
}

/* s2lower(): convert s in place, character by character, with sre_tolower(). */
void
s2lower(char *s)
{
  for (; *s != '\0'; s++)
    *s = sre_tolower((int) *s);
}

/* s2upper(): convert s in place, character by character, with sre_toupper(). */
void
s2upper(char *s)
{
  for (; *s != '\0'; s++)
    *s = sre_toupper((int) *s);
}


/* MallocOrDie(): malloc() size bytes; call Die() if malloc returns NULL. */
void *
MallocOrDie(size_t size)
{
  void *ptr;
  if ((ptr = malloc (size)) == NULL)
    Die("malloc failed");
  return ptr;
}

/* ReallocOrDie(): realloc() p to size bytes; call Die() if realloc returns NULL. */
void *
ReallocOrDie(void *p, size_t size)
{
  void *ptr;
  if ((ptr = realloc(p, size)) == NULL)
    Die("realloc failed");
  return ptr;
}



/* Function: Strparse()
 * 
 * Purpose:  Match a regexp to a string.
 *           Return 0 if it matches, REG_NOMATCH if it doesn't.
 *           The caller may request a copy of the text that matched by
 *           passing a non-NULL pointer to a string pointer.
* The called may also request copies of the text that matched * sub-regexps (parenthesized expressions) by passing * ntok > 0, and pointers to ntok different string pointers. * * Uses the GNU regexp library in extended POSIX compatibility * mode. * * I built this for ease of use, not speed nor efficiency. * * Example: Strparse("foo-...-baz", "foo-bar-baz", NULL, 0) returns 0 * Strparse("foo-\(...\)-baz", "foo-bar-baz", &buf1, 1, &buf2) * returns 0, copies "foo-bar-baz" into buf1, and "bar" * into buf2. * * Args: rexp - regular expression, extended POSIX form * s - string to match against * buf - if non-NULL, where to put copy of matching text * ntok - number of sub-regexps returned. * ... - variable number of pointers to strings to keep * copies of matched subtexts * * Return: 0 on match, REG_NOMATCH on failure to match * buf and the ptrs in varargs list are malloc'ed here, * must be free'd by caller. */ int Strparse(char *rexp, char *s, char **buf, int ntok, ...) { va_list ap; regex_t pat; int code; regmatch_t *pmatch; char **sp; int len; int i; code = regcomp(&pat, rexp, REG_EXTENDED); if (code > 0) { fprintf(stderr, "regular expression compilation failed\n"); exit(1); } if ((pmatch = (regmatch_t *) malloc (sizeof(regmatch_t) * (ntok+1))) == NULL) { fprintf(stderr, "malloc failed\n"); exit(1); } code = regexec(&pat, s, ntok+1, pmatch, 0); if (code == 0) { /* make copy of full matched text */ if (buf != NULL) { len = pmatch[0].rm_eo - pmatch[0].rm_so; if ((*buf = (char *) malloc(sizeof(char) * (len+1))) == NULL) { fprintf(stderr, "malloc failed\n"); exit(1); } strncpy(*buf, s+pmatch[0].rm_so, len); (*buf)[len] = '\0'; } /* make copies of subtexts */ if (ntok > 0) { va_start(ap, ntok); for (i = 1; i <= ntok; i++) { sp = va_arg(ap, char **); len = pmatch[i].rm_eo - pmatch[i].rm_so; if ((*sp = (char *) malloc(sizeof(char) * (len+1))) == NULL) { fprintf(stderr, "malloc failed\n"); exit(1); } strncpy(*sp, s+pmatch[i].rm_so, len); (*sp)[len] = '\0'; } } } va_end(ap); 
free(pmatch); regfree(&pat); return code; } /* Function: StrShuffle() * * Purpose: Returns a shuffled version of s2, in s1. * * Args: s1 - allocated space for shuffled string. * s2 - string to shuffle. * * Return: void */ void StrShuffle(char *s1, char *s2) { int len; int pos; char c; strcpy(s1, s2); for (len = strlen(s1); len > 1; len--) { pos = CHOOSE(len); c = s1[pos]; s1[pos] = s1[len-1]; s1[len-1] = c; } } tRNAscan-SE-2.0/src/eufind_const.h0000644000543100007160000000244114044141557016260 0ustar pchanlowelab#ifndef EUFIND_CONST_ #define EUFIND_CONST_ /* tRNA scanning cutoffs */ #define BBOX_CUTOFF -14.14 #define BBOX_START_IDX 45 #define SEC_LOBOUND -4.9 /* orig: -3.6 euk: -3.8 prok (hflu) -4.9 */ #define SEC_HIBOUND -2.1 /* -2.2 */ #define MAX_PENALTY -5.442 /* log(1/231) */ #define INT_SCORE_THRESH -31.25 #define TOT_SCORE_THRESH -31.8 /* -31.8 */ #define MAX_AB_BOX_DIST 140 /* not used anymore, instead */ /* AB_BOX_DIST_RANGE used */ #define MIN_AB_BOX_DIST 24 #define AB_BOX_DIST_RANGE 116 /* check this far over MIN_AB_BOX */ /* distance for A-B box pairs */ #define SEC_AB_BOX_DIST 26 #define SEC_BBOX_DIST_CORR 12 #define MIN_BTERM_DIST 11 #define MAX_TERM_SEARCH 133 /* Max distance to search for termination signal (was 59, changed to 133 (as in Pavesi paper) on 11/96 since was missing 4 yeast tRNAs */ #define ABOX_LEN 21 #define BBOX_LEN 11 #define MAX_OVLAP 10 /* max #bp tRNA hits are allowed to overlap */ struct trna_info_s { char iso_type[5]; char acodon[4]; int start, end, Abox_st, Abox_end, Abox_gap, Bbox_st, Bbox_end, Term_st, acodon_idx, intron, idno; float totSc, AboxSc, BboxSc, ABdistSc, TermSc; }; typedef struct trna_info_s TRNA_TYPE; #endif /*EUFIND_CONST_*/ tRNAscan-SE-2.0/src/revcomp_main.c0000644000543100007160000000447011021467312016246 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. 
See the files COPYING * and GNULICENSE for further details. * */ /* main for revcomp * * revcomp - generate reverse complement of sequences * SRE, Thu Aug 5 17:36:57 1993 */ #include #include #include "squid.h" #define OPTIONS "h" char usage[] = "Usage: revcomp [-options] \n\ Reverse complement a nucleic acid sequence.\n\ Available options:\n\ -h : help; print version and usage info\n"; int main(int argc, char **argv) { char *seqfile; /* name of sequence file */ SQFILE *dbfp; /* open sequence file */ int fmt; /* format of seqfile */ char *seq; /* sequence */ SQINFO sqinfo; /* additional sequence info */ char *rev; /* reverse complement */ int swap; int optchar; /* option character, command line */ extern int optind; /*********************************************** * Parse command line ***********************************************/ while ((optchar = getopt(argc, argv, OPTIONS)) != -1) switch (optchar) { case 'h': printf("revcomp %s, %s\n%s\n", squid_version, squid_date, usage); exit(EXIT_SUCCESS); default: Die("%s\n", usage); } if (argc - optind != 1) Die("%s\n", usage); seqfile = argv[optind]; if (! 
SeqfileFormat(seqfile, &fmt, NULL)) Die("Failed to determine format of file %s", seqfile); if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) Die("Failed to open sequence file %s for reading", seqfile); while (ReadSeq(dbfp, fmt, &seq, &sqinfo)) { if ((rev = (char *) malloc ((sqinfo.len + 1) * sizeof(char))) == NULL) Die("malloc failed"); revcomp(rev, seq); if (sqinfo.flags & (SQINFO_START | SQINFO_STOP)) { swap = sqinfo.start; sqinfo.start = sqinfo.stop; sqinfo.stop = swap; } /* secondary structure of reverse strand is nonsense */ if (sqinfo.flags & SQINFO_SS) { sqinfo.flags = sqinfo.flags & ~SQINFO_SS; free(sqinfo.ss); } WriteSeq(stdout, kPearson, rev, &sqinfo); free(rev); FreeSequence(seq, &sqinfo); } SeqfileClose(dbfp); return 0; } tRNAscan-SE-2.0/src/pavesi.c0000644000543100007160000003262714044142160015062 0ustar pchanlowelab/* eufindtRNA - Eukaryotic tRNA finder * * pavesi.c - functions for finding transcriptional control regions * * C implementation of algorithm described by Pavesi, Conterio, * Bolchi, Dieci, & Ottonello in NAR 22:1247-56 (94) * "Identification of new eukaryotic tRNA genes in genomic DNA * databases by a multistep weight matix analysis of transcriptional * control regions" * * To be used in tRNAscan-SE package to increase sensitivity by * complementing tRNAscan 1.3 first-pass scan * * by Todd MJ Lowe 4/8/96 * * Uses Sean Eddy's function library for biological sequence analysis * (Squid v1.5g) * */ #include #include #include #include "squid.h" #include "eufind_const.h" #include "pavesi.h" /* #define NO_AMBIG -use this option to eliminate conservative * calling of 'N's as best possible matches * in tRNAs -- useful for unfinished seqs with many N's */ /* log scores for each position in A Box */ /* six rows are for 1) A, 2) C, 3) G, 4) T, 5) (gap), 6) ambiguous base the ambiguous base value is the MIN (best score) of the ACGT rows */ /* position 17a eliminated since always an empty pos (gap) */ float Abox_Mat[6][ABOX_LEN] = { 
{-1.268,-3.651,-0.899,-4.749,-5.442,-2.351,-3.363,-0.009,-1.977,-3.497,-5.442, -5.442,-5.442,-2.498,-4.749,-5.442,-0.031,-1.417,-1.180,-1.048,-4.344}, {-3.651,-5.442,-4.056,-2.958,-0.480,-1.073,-0.857,-5.442,-5.442,-1.887,-2.498, -5.442,-5.442,-2.958,-2.224,-5.442,-5.442,-3.363,-1.417,-3.651,-0.393}, {-0.779,-5.442,-0.598,-0.076,-3.651,-1.435,-1.614,-4.749,-0.154,-2.803,-5.442, 0.000,0.000,-3.363,-3.651,-5.442,-3.497,-0.672,-1.012,-0.473,-3.651}, {-1.453,-0.026,-3.651,-4.344,-1.036,-1.125,-1.073,-5.442,-5.442,-0.278,-1.399, -5.442,-5.442,-0.185,-0.827,-2.041,-5.442,-1.551,-2.447,-5.442,-1.253}, {-5.442,-5.442,-5.442,-5.442,-5.442,-5.442,-5.442,-5.442,-5.442,-5.442,-0.412, -5.442,-5.442,-5.442,-0.868,-0.144,-5.442,-5.442,-5.442,-5.442,-5.442}, #ifdef NO_AMBIG {-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245, -3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245} #else {-0.779,-0.026,-0.598,-0.076,-0.480,-1.073,-0.857,-0.009,-0.154,-0.278,-1.399, 0.000,0.000,-0.185,-0.827,-2.041,-0.031,-0.672,-1.012,-0.473,-0.393} #endif }; #define GAP_ROW 4 /* row in ABox mat with Gap weight */ float Bbox_Mat[6][BBOX_LEN] = { {-2.351,-5.442,-2.670,-5.442,-5.442,-1.472,0.000,-0.798,-2.498,-5.442,-3.497}, {-3.245,-5.442,-5.442,-5.442,-0.004,-5.442,-5.442,-2.498,-1.435,-0.009,-0.190}, {-0.175,-0.004,-5.442,-5.442,-5.442,-0.272,-5.442,-2.147,-5.442,-5.442,-3.651}, {-3.651,-5.442,-0.072,0.000,-5.442,-4.749,-5.442,-1.048,-0.393,-5.442,-2.147}, {-5.442,-5.442,-5.442,-5.442,-5.442,-5.442,-5.442,-5.442,-5.442,-5.442,-5.442}, #ifdef NO_AMBIG {-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245,-3.245} #else {-0.175,-0.004,-0.072,0.000,-0.004,-0.272,0.000,-0.798,-0.393,-0.009,-0.190} #endif }; #define ABDIST_MAT_SIZE 7 int ABDistIdx_Mat[ABDIST_MAT_SIZE] = {30,36,42,48,54,60,66}; float ABDistSc_Mat[ABDIST_MAT_SIZE] = {-0.46,-1.83,-2.35,-3.24, -4.06,-3.83,-4.75}; #define BTERM_MAT_SIZE 9 int BTermDistIdx_Mat[BTERM_MAT_SIZE] = 
{17,23,29,35,41,47,53,59,100}; float BTermDistSc_Mat[BTERM_MAT_SIZE] = {-0.54,-1.40,-2.80,-3.36, -3.24,-5.44,-5.44,-4.06,-5.44}; void Init_tRNA(TRNA_TYPE *tRNA) { strcpy(tRNA->iso_type,"???"); strcpy(tRNA->acodon,"???"); tRNA->start = tRNA->end = 0; tRNA->Abox_st = tRNA->Abox_end = tRNA->Abox_gap = 0; tRNA->Bbox_st = tRNA->Bbox_end = tRNA->Term_st = tRNA->acodon_idx= 0; tRNA->intron = tRNA->idno = 0; tRNA->totSc = tRNA->AboxSc = tRNA->BboxSc = -1000; tRNA->ABdistSc = tRNA->TermSc = -100; } int IntEncodeSeq (char *intseq, char *seq, int seqlen) { int i; for (i=0; i BBOX_CUTOFF) { if (verbose) { if (strand == 0) { printf("Bbox at %i (End=%d), Sc= %.2f\n",i,i+BBOX_LEN+11,*score); } else { printf("Bbox at %i (End=%d), Sc= %.2f\n",seqlen-i+1,seqlen-(i+BBOX_LEN+11)+1,*score); } } *seqidx = i; return 1; } } return 0; } float Get_ABdist_weight(int ABdist) { int ct; if (ABdist < MIN_AB_BOX_DIST) return MAX_PENALTY; for (ct=0; ct < ABDIST_MAT_SIZE; ct++) { if (ABdist <= ABDistIdx_Mat[ct]) return ABDistSc_Mat[ct]; } return MAX_PENALTY; } int GetSecABox(TRNA_TYPE *tRNA, char *seq) { char *seqp; int i, startidx; /* Search for eukaryotic SelCys motif */ startidx = tRNA->Bbox_st - ABOX_LEN - SEC_AB_BOX_DIST - 1; seqp = seq + MAX(0,startidx); for (i=0; i<5; i++, seqp++) { if ((!strncmp(seqp,"GGTC",4) && (seqp[4] == 'T' || seqp[4] == 'C') && (seqp[5] == 'G') && (seqp[6] == 'T' || seqp[6] == 'G') && (seqp[7] == 'G') && (seqp[8] == 'G') && (seqp[9] == 'T'))) { tRNA->Abox_st = MAX(0,startidx+i-SEC_BBOX_DIST_CORR); strcpy(tRNA->iso_type,"SeCe"); return 1; } } /* Search for Prokaryotic SelCys */ startidx = tRNA->Bbox_st - 46 - 1; seqp = seq + MAX(0,startidx); for (i=0; i<16; i++, seqp++) { if ((!strncmp(seqp,"GG",2) && (seqp[2] == 'A' || seqp[2] == 'T') && (seqp[3] == 'C' || seqp[3] == 'T') && (seqp[4] == 'T') && (seqp[5] == 'T') && (seqp[6] == 'C') && (seqp[7] == 'A') && (seqp[8] == 'A') && (seqp[9] == 'A') && (seqp[10] == 'A' || seqp[10] == 'T') && (seqp[11] == 'C') && (seqp[12] == 
'C'))) { tRNA->Abox_st = MAX(0,startidx+i-23); strcpy(tRNA->iso_type,"SeCp"); return 1; } } return 0; } void GetBestABox (TRNA_TYPE *tRNA, char *seq, char *iseq, int seqlen, int strand, int verbose, int Max_AB_dist, int prev_Abox_st) { int i, /* sequence position index */ startidx, endidx, abox_end; int j, /* matrix position index */ offset1, offset2, /* offset counters to keep track of gaps */ gapct, /* keeps track of 4 types of 2bp gaps in positions 20a & 20b */ best_gap, /* gapct & offset1 vals for best score so far */ best_offset1; float sc1, sc2, sc3, /* components of total score */ bestsc, /* best score so far */ abdistSc; startidx = MAX(MAX(0,(tRNA->Bbox_st - Max_AB_dist - ABOX_LEN)), prev_Abox_st+2); endidx = MAX(0,(tRNA->Bbox_st - MIN_AB_BOX_DIST - ABOX_LEN +4)); for (i=startidx; i < endidx; i++) { sc1=sc2=sc3=0; /* scoring Abox with weight matrix at tRNA pos 7-16 */ for (j=0; j<=9; j++) { sc1 += Abox_Mat[(int)(iseq[i+j])][j]; } j=10; /* score gap at pos 17 by looking for conserved 'GG' at pos 18 & 19 */ if (seq[i+j] == 'G') { /* && (seq[i+j+1] == 'G')) { */ sc2 = Abox_Mat[GAP_ROW][j]; offset1 = 1; } else { sc2 = 0; offset1 = 0; } /* scoring Abox with weight matrix at tRNA pos 18-20 */ for (j=10; (j+offset1) < 14; j++) { sc2 += Abox_Mat[(int)(iseq[i+j])][j+offset1]; } /* score potential gap at 20a & 20b, plus rest of matrix up to position; gapct, enumerates all possible 2bp gaps */ for (gapct=0; gapct<4; gapct++) { j=14-offset1; offset2=0; switch (gapct) { case 0: sc3 = Abox_Mat[(int)(iseq[i+j])][j+offset1+offset2]; j++; sc3 += Abox_Mat[(int)(iseq[i+j])][j+offset1+offset2]; j++; break; case 1: sc3 = Abox_Mat[GAP_ROW][j+offset1+offset2]; offset2++; sc3 += Abox_Mat[(int)(iseq[i+j])][j+offset1+offset2]; j++; break; case 2: sc3 = Abox_Mat[(int)(iseq[i+j])][j+offset1+offset2]; j++; sc3 += Abox_Mat[GAP_ROW][j+offset1+offset2]; offset2++; break; case 3: sc3 = Abox_Mat[GAP_ROW][j+offset1+offset2]; offset2++; sc3 += Abox_Mat[GAP_ROW][j+offset1+offset2]; offset2++; 
break; } for (; (j+offset1+offset2) < ABOX_LEN; j++) { sc3+= Abox_Mat[(int)(iseq[i+j])][j+offset1+offset2]; } abox_end = i+ABOX_LEN-offset1-offset2-1; abdistSc = Get_ABdist_weight(tRNA->Bbox_st-abox_end-1); if ((sc1 + sc2 + sc3 + abdistSc) > (tRNA->AboxSc + tRNA->ABdistSc)) { tRNA->Abox_st = i; tRNA->Abox_end = abox_end; tRNA->ABdistSc = abdistSc; tRNA->AboxSc = sc1+sc2+sc3; best_offset1= offset1; best_gap = gapct; if (verbose) { if (strand == 0) { printf("Abox at %d (St=%d) A:%.2f AB(%d):%.2f I:%.2f\n", i,i-5,tRNA->AboxSc, tRNA->Bbox_st-abox_end-1,tRNA->ABdistSc, tRNA->AboxSc+tRNA->BboxSc+tRNA->ABdistSc); } else { printf("Abox at %d (St=%d) A:%.2f AB(%d):%.2f I:%.2f\n", seqlen-i+1,seqlen-(i-5)+1,tRNA->AboxSc, tRNA->Bbox_st-abox_end-1,tRNA->ABdistSc, tRNA->AboxSc+tRNA->BboxSc+tRNA->ABdistSc); } } } } /* for gapct, enumerating all possible gaps */ } /* for i, starting pos for A box */ } int GetBestTrxTerm (TRNA_TYPE *tRNA, char *seq, int seqlen, float TermPenalty) { int i, /* current seq position */ startidx, endidx; /* start & end points for term search */ int ct, BTermdist; float score; /* current score */ startidx = tRNA->Bbox_end+MIN_BTERM_DIST-1; endidx = MIN(startidx+MAX_TERM_SEARCH,(seqlen-4)); for (i=startidx; iBbox_end-1; for (ct=0; BTermDistIdx_Mat[ct] < BTermdist; ct++) { } tRNA->TermSc = BTermDistSc_Mat[ct]; tRNA->Term_st = i; return 1; } } tRNA->Term_st = -1; if (endidx == (seqlen-4)) { tRNA->TermSc = TOT_SCORE_THRESH - (INT_SCORE_THRESH); return 1; } else { tRNA->TermSc = MAX_PENALTY; return 0; } return 0; } /* Uses tranlation scheme & AA lookup table from Squid library */ void Get_IsoType (TRNA_TYPE *tRNA) { int i, codon; char codon_seq[4]; revcomp(codon_seq,tRNA->acodon); codon = 0; for (i = 0; i < 3; i++) { codon *= 4; switch (codon_seq[i]) { case 'A': case 'a': break; case 'C': case 'c': codon += 1; break; case 'G': case 'g': codon += 2; break; case 'T': case 't': codon += 3; break; case 'U': case 'u': codon += 3; break; default: codon = 64; break; 
} if (codon == 64) break; } strcpy(tRNA->iso_type,stdcode3[codon]); } void Get_anticodon (TRNA_TYPE *tRNA, char *seq) { char *acodonp; int startidx, i, besti, score, bestsc; startidx = tRNA->Abox_end + 7; acodonp = seq + startidx; bestsc = besti = 0; for (i=0; i<7; i++) { score = 0; if ((acodonp[i-2] == 'C') || (acodonp[i-2] == 'T')) score++; if (acodonp[i-1] == 'T') score++; if ((acodonp[i+3] == 'A') || (acodonp[i+3] == 'G')) score++; if (score > bestsc) { bestsc = score; besti = i; } } strncpy(tRNA->acodon,acodonp+besti,3); /* tRNA->acodon[3]='\0'; */ tRNA->acodon_idx = startidx + besti; } void Get_tRNA_stats (TRNA_TYPE *tRNA, char *seq, int seqlen, int strand) { tRNA->start = MAX(1,(tRNA->Abox_st - 5)); tRNA->end = MIN((tRNA->Bbox_end + 12),seqlen); if (strand == -1) { tRNA->start = seqlen - tRNA->start + 1; tRNA->end = seqlen - tRNA->end + 1; } if (!strncmp("SeC",tRNA->iso_type,3)) { strcpy(tRNA->acodon,"TCA"); tRNA->Bbox_end++; } else { Get_anticodon(tRNA,seq); Get_IsoType(tRNA); } } void Save_tRNA (TRNA_TYPE *tRNA, SQINFO *sqinfo, char *seq, int strand, int ShowScores, long int sqoffset) { if (ShowScores) printf("%s.%d\t%ld\t%ld\t%s\t%d\tA:%.2f B:%.2f AB:%.2f T:%.2f Tot:%.2f\n", sqinfo->name,tRNA->idno,tRNA->start+sqoffset,tRNA->end+sqoffset, tRNA->acodon,strand, tRNA->AboxSc,tRNA->BboxSc,tRNA->ABdistSc, tRNA->TermSc,tRNA->totSc); else printf("%-10s\t%d\t%ld\t%ld\t%s\t%s\t0\t0\t%.2f\n", sqinfo->name,tRNA->idno,tRNA->start+sqoffset,tRNA->end+sqoffset, tRNA->iso_type,tRNA->acodon,tRNA->totSc); } int tRNAOverlap (TRNA_TYPE *tRNA1, TRNA_TYPE *tRNA2, int strand) { if (strand == 0) { if ((((tRNA1->start >= tRNA2->start) && (tRNA1->start < tRNA2->end-MAX_OVLAP)) || ((tRNA1->end > tRNA2->start+MAX_OVLAP) && (tRNA1->end <= tRNA2->end))) || (((tRNA2->start >= tRNA1->start) && (tRNA2->start < tRNA1->end-MAX_OVLAP)) || ((tRNA2->end > tRNA1->start+MAX_OVLAP) && (tRNA2->end <= tRNA1->end)))) return 1; else return 0; } else { if ((((tRNA1->start <= tRNA2->start) && 
(tRNA1->start > tRNA2->end+MAX_OVLAP)) || ((tRNA1->end < tRNA2->start-MAX_OVLAP) && (tRNA1->end >= tRNA2->end))) || (((tRNA2->start <= tRNA1->start) && (tRNA2->start > tRNA1->end+MAX_OVLAP)) || ((tRNA2->end < tRNA1->start-MAX_OVLAP) && (tRNA2->end >= tRNA1->end)))) return 1; else return 0; } } tRNAscan-SE-2.0/src/emit.c0000644000543100007160000003736311021467304014535 0ustar pchanlowelab/* emit.c * 1.0: Fri Jun 11 12:59:33 1993 * 2.0: SRE, Thu Sep 9 13:44:18 1993 * * generate sequences randomly from a model. * * The growing sequence is kept as a linked list (align_s). * The model tree is traversed by pushing nodes onto a stack * (m2ali_s). The information kept for each active node * is nodeidx, state type, and a pointer into the growing linked * list where the next emissions should go. * * The recursion is to pop an active node off; * then, switch (statetype) * MATP: pick symbol pair. insert right symbol. insert * left symbol. new insertion pointer on left symbol. * INSL, MATL: pick symbol. insert symbol. new insertion pointer * on new symbol. * INSR, MATR: pick symbol. insert symbol. new insertion pointer * stays where it was. * BIFURC: insert dummy symbol. one new insertion pointer * stays where it was (BIFL), other points to the * new dummy (BIFR). * DELETE: no symbol. new insertion pointer stays where it * was. 
*/ #include #include #include "squid.h" #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif static void pick_transit(struct cm_s *cm, int oldidx, int oldtype, int *ret_newidx, int *ret_newtype); static void pick_double_emit(struct cm_s *cm, int nodeidx, char *ret_syml, char *ret_symr); static void pick_single_emit(struct cm_s *cm, int nodeidx, int type, char *ret_sym); static void pick_best_transit(struct cm_s *cm, int oldidx, int oldtype, int *ret_newidx, int *ret_newtype); static void pick_best_double(struct cm_s *cm, int nodeidx, char *ret_syml, char *ret_symr); static void pick_best_single(struct cm_s *cm, int nodeidx, int type, char *ret_sym); /* Function: EmitSequence() * * Purpose: Generate a sequence probabilistically from a model. * The returned sequence contains upper case letters * for MATCH-generated positions, lower case letters * for INSERT-generated positions. * * Caller is reponsible for free'ing memory allocated * to the sequence. * */ int EmitSequence(struct cm_s *cm, /* model */ int watsoncrick,/* if TRUE, annotate only canonical pairs */ struct align_s **ret_ali, /* RETURN: generated "alignment" */ char **ret_khseq, /* RETURN: generated structure */ char **ret_seq) /* RETURN: generated sequence */ { struct m2ali_s *emstack; struct align_s *emlist; int oldidx; int oldtype; struct align_s *oldafter; int newidx; int newtype; struct align_s *newafter; char syml; char symr; int pos; char *seq; char *khseq; char ssl, ssr; /* secondary structure annotation */ /* Initialize the linked list of emitted sequence, emlist. */ emlist = Init_align(); /* Initialize the pushdown stack for traversing the model, * emstack. */ emstack = Init_m2ali(); pick_transit(cm, 0, uBEGIN_ST, &newidx, &newtype); Push_m2ali(emstack, newidx, newtype, emlist); /* While there's still active model nodes in the stack, * pop one off and deal with it. 
*/ while (Pop_m2ali(emstack, &oldidx, &oldtype, &oldafter)) { /* look out for end */ if (oldidx == -1) continue; /* check for BIFURC, which makes automatic transits to BEGIN states * of next two segments. */ if (cm->nd[oldidx].type == BIFURC_NODE) { /* deal with right branch */ newafter = Insafter_align(0, '-', ' ', oldidx, uBIFURC_ST, oldafter); /* insert a dummy */ Push_m2ali(emstack, cm->nd[oldidx].nxt2, uBEGIN_ST, newafter); /* deal with left branch */ Push_m2ali(emstack, cm->nd[oldidx].nxt, uBEGIN_ST, oldafter); } else { switch (oldtype) { case uDEL_ST: pick_transit(cm, oldidx, oldtype, &newidx, &newtype); Push_m2ali(emstack, newidx, newtype, oldafter); break; case uMATP_ST: pick_double_emit(cm, oldidx, &syml, &symr); if (! watsoncrick || IsRNAComplement(syml, symr, TRUE)) { ssl = '>'; ssr = '<'; } else { ssl = ssr = '.'; } (void) Insafter_align(0, symr, ssr, oldidx, oldtype, oldafter); newafter = Insafter_align(0, syml, ssl, oldidx, oldtype, oldafter); pick_transit(cm, oldidx, oldtype, &newidx, &newtype); Push_m2ali(emstack, newidx, newtype, newafter); break; case uINSL_ST: case uMATL_ST: pick_single_emit(cm, oldidx, oldtype, &syml); newafter = Insafter_align(0, syml, '.', oldidx, oldtype, oldafter); pick_transit(cm, oldidx, oldtype, &newidx, &newtype); Push_m2ali(emstack, newidx, newtype, newafter); break; case uINSR_ST: case uMATR_ST: pick_single_emit(cm, oldidx, oldtype, &symr); (void) Insafter_align(0, symr, '.', oldidx, oldtype, oldafter); pick_transit(cm, oldidx, oldtype, &newidx, &newtype); Push_m2ali(emstack, newidx, newtype, oldafter); break; default: Die("Unrecognized state type %d in model.", oldtype); } } } Free_m2ali(emstack); /* Now go through and write the correct 'pos' fields in *emlist, * because the caller might expect them for some reason. 
*/ pos = 0; for (newafter = emlist->nxt; newafter->nxt != NULL; newafter = newafter->nxt) { if (newafter->type == uDEL_ST) newafter->pos = -1; else { newafter->pos = pos; pos++; } } /* Now we extract the sequence from the linked list. * For now, we leave the dummy characters in. */ if (! Align2kh(emlist, &seq, &khseq)) Warn("Align2kh() failed"); *ret_ali = emlist; *ret_khseq = khseq; *ret_seq = seq; return 1; } /* Function: EmitBestSequence() * * Purpose: Generate the most probable sequence from a model by picking * the most probable transitions and emissions. * Very similar to EmitSequence(), above. * */ int EmitBestSequence(struct cm_s *cm, /* model */ int watsoncrick, /* if TRUE, annotate only canonical pairs */ struct align_s **ret_ali, /* RETURN: generated "alignment" */ char **ret_khseq, /* RETURN: generated structure */ char **ret_seq) /* RETURN: generated sequence */ { struct align_s *ali; /* generated "alignment" linked list */ struct align_s *curr; /* ptr to current insertion pt in ali */ struct align_s *new; /* ptr to newly inserted pt in ali */ struct m2ali_s *stack; /* pushdown stack for traversing model */ int oldidx; /* stateidx of current insertion pt in ali */ int oldtype; /* subtype of current insertion pt in ali */ int newidx; /* stateidx of newly inserted pt in ali */ int newtype; /* subtype of newly inserted pt in ali */ char syml; /* emitted symbol to the left */ char symr; /* emitted symbol to the right */ int pos; /* position in seq */ char *seq; /* RETURN: generated most probable sequence*/ char *khseq; /* RETURN: structure rep. of seq */ char ssl, ssr; /* secondary structure annotation */ /* Initialize the linked list of emitted sequence, ali */ ali = Init_align(); /* Initialize the pushdown stack for traversing the model */ stack = Init_m2ali(); pick_best_transit(cm, 0, uBEGIN_ST, &newidx, &newtype); Push_m2ali(stack, newidx, newtype, ali); /* While there's still active model nodes in the stack, * pop one off and deal with it. 
*/ while (Pop_m2ali(stack, &oldidx, &oldtype, &curr)) { /* look out for end */ if (oldidx == -1) continue; /* check for BIFURC, which makes automatic transits to BEGIN states * of next two segments. */ if (cm->nd[oldidx].type == BIFURC_NODE) { /* deal with right branch */ new = Insafter_align(0, '-', ' ', oldidx, uBIFURC_ST, curr); /* insert a dummy */ Push_m2ali(stack, cm->nd[oldidx].nxt2, uBEGIN_ST, new); /* deal with left branch */ Push_m2ali(stack, cm->nd[oldidx].nxt, uBEGIN_ST, curr); } else { switch (oldtype) { case uDEL_ST: pick_best_transit(cm, oldidx, oldtype, &newidx, &newtype); Push_m2ali(stack, newidx, newtype, curr); break; case uMATP_ST: pick_best_double(cm, oldidx, &syml, &symr); if (! watsoncrick || IsRNAComplement(syml, symr, TRUE)) { ssl = '>'; ssr = '<'; } else { ssl = '.'; ssr = '.'; } (void) Insafter_align(0, symr, ssr, oldidx, oldtype, curr); new = Insafter_align(0, syml, ssl, oldidx, oldtype, curr); pick_best_transit(cm, oldidx, oldtype, &newidx, &newtype); Push_m2ali(stack, newidx, newtype, new); break; case uINSL_ST: case uMATL_ST: pick_best_single(cm, oldidx, oldtype, &syml); new = Insafter_align(0, syml, '.', oldidx, oldtype, curr); pick_best_transit(cm, oldidx, oldtype, &newidx, &newtype); Push_m2ali(stack, newidx, newtype, new); break; case uINSR_ST: case uMATR_ST: pick_best_single(cm, oldidx, oldtype, &symr); (void) Insafter_align(0, symr, '.', oldidx, oldtype, curr); pick_best_transit(cm, oldidx, oldtype, &newidx, &newtype); Push_m2ali(stack, newidx, newtype, curr); break; default: Die("Unrecognized state type %d in model.", oldtype); } } } Free_m2ali(stack); /* Now go through and write the correct 'pos' fields in *ali, * because the caller might expect them for some reason. */ pos = 0; for (curr = ali->nxt; curr->nxt != NULL; curr = curr->nxt) { if (curr->type == uDEL_ST) curr->pos = -1; else { curr->pos = pos; pos++; } } /* Now we extract the sequence from the linked list. * For now, we leave the dummy characters in. */ if (! 
Align2kh(ali, &seq, &khseq)) Warn("Align2kh() failed"); *ret_ali = ali; *ret_khseq = khseq; *ret_seq = seq; return 1; } /* Function: pick_transit() * * Purpose: Pick a random state transition, given a current state * (specified by a stateidx and a subtype). Pass back * the new state (newidx, newtype). */ static void pick_transit(struct cm_s *cm, int oldidx, int oldtype, int *ret_newidx, int *ret_newtype) { int newidx; int newtype; double sum; double roll; /* Picking a new subtype involves rolling a random * fraction and examining the appropriate row of the * 7x7 state transition matrix. */ sum = 0.0; roll = sre_random(); for (newtype = 0; newtype < STATETYPES; newtype++) { sum += cm->nd[oldidx].tmx[oldtype][newtype]; if (roll <= sum) break; } if (newtype == STATETYPES) Die("Failed to transit from stateidx %d subtype %d, roll %.2f", oldidx, oldtype, roll); /* Picking a new nodeidx is a function of the current * state type. This function should never be called for BIFURCs. */ if (newtype == INSL_ST || newtype == INSR_ST) newidx = oldidx; else newidx = cm->nd[oldidx].nxt; *ret_newidx = newidx; *ret_newtype = UniqueStatetype(cm->nd[newidx].type, newtype); } /* Function: pick_double_emit() * * Purpose: Given a model and a current state (stateidx, subtype), * which must be an INSC or MATC, pick a pairwise emission * (syml, symr) according to the probabilities in the * appropriate emission matrix. 
*/ static void pick_double_emit(struct cm_s *cm, int nodeidx, char *ret_syml, char *ret_symr) { double sum; double roll; int i, j; sum = 0.0; roll = sre_random(); for (i = 0; i < ALPHASIZE; i++) for (j = 0; j < ALPHASIZE; j++) { sum += cm->nd[nodeidx].mp_emit[i][j]; if (roll <= sum) goto breakout; } breakout: *ret_syml = ALPHABET[i]; *ret_symr = ALPHABET[j]; } /* Function: pick_single_emit() * * Purpose: Given a model and a current state (nodeidx, type), * which must be an INS(L/R) or MAT(L/R), pick an emission * (sym) according to the probabilities in the * appropriate emission vector. */ static void pick_single_emit(struct cm_s *cm, int nodeidx, int type, char *ret_sym) { double sum; double roll; int i; double *emit; /* find correct emission vector */ switch (type) { case uINSL_ST: emit = cm->nd[nodeidx].il_emit; break; case uINSR_ST: emit = cm->nd[nodeidx].ir_emit; break; case uMATL_ST: emit = cm->nd[nodeidx].ml_emit; break; case uMATR_ST: emit = cm->nd[nodeidx].mr_emit; break; default: Die("can't single emit from state type %d", type); } sum = 0.0; roll = sre_random(); for (i = 0; i < ALPHASIZE; i++) { sum += emit[i]; if (roll <= sum) break; } *ret_sym = ALPHABET[i]; } /* Function: pick_best_transit() * * Purpose: Pick most probable state transition, given a current state * (specified by a nodeidx and a type). Pass back * the new state (newidx, newtype). */ static void pick_best_transit(struct cm_s *cm, int oldidx, int oldtype, int *ret_newidx, int *ret_newtype) { int y; int newidx; int newtype; double best; /* find maximum probability */ best = 0.0; for (y = 0; y < STATETYPES; y++) if (cm->nd[oldidx].tmx[oldtype][y] > best) { best = cm->nd[oldidx].tmx[oldtype][y]; newtype = y; } /* Picking a new nodeidx is a function of the current * type. This function should never be called for BIFURCs. 
*/ if (newtype == INSL_ST || newtype == INSR_ST) newidx = oldidx; else newidx = cm->nd[oldidx].nxt; *ret_newidx = newidx; *ret_newtype = UniqueStatetype(cm->nd[newidx].type, newtype); } /* Function: pick_best_double() * * Purpose: Given a model and a current state (nodeidx, type), * (which must a MATP), pick most probable pairwise emission * (syml, symr) according to the probabilities in the * appropriate emission matrix. * */ static void pick_best_double(struct cm_s *cm, int nodeidx, char *ret_syml, char *ret_symr) { double best; int i, j; int besti, bestj; best = 0.0; for (i = 0; i < ALPHASIZE; i++) for (j = 0; j < ALPHASIZE; j++) if (cm->nd[nodeidx].mp_emit[i][j] > best) { best = cm->nd[nodeidx].mp_emit[i][j]; besti = i; bestj = j; } *ret_syml = ALPHABET[besti]; *ret_symr = ALPHABET[bestj]; } /* Function: pick_best_single() * * Purpose: Given a model and a current state (nodeidx, type), * which must be an INS(L/R) or MAT(L/R), pick most probable emission * (sym) according to the probabilities in the * appropriate emission vector. */ static void pick_best_single(struct cm_s *cm, int nodeidx, int type, char *ret_sym) { double best; int besti; int i; double *emit; /* find correct emission vector */ switch (type) { case uINSL_ST: emit = cm->nd[nodeidx].il_emit; break; case uINSR_ST: emit = cm->nd[nodeidx].ir_emit; break; case uMATL_ST: emit = cm->nd[nodeidx].ml_emit; break; case uMATR_ST: emit = cm->nd[nodeidx].mr_emit; break; default: Die("can't single emit from type %d", type); } best = 0.0; for (i = 0; i < ALPHASIZE; i++) if (emit[i] > best) { best = emit[i]; besti = i; } *ret_sym = ALPHABET[besti]; } tRNAscan-SE-2.0/src/score_main.c0000644000543100007160000001415714044116125015712 0ustar pchanlowelab/* score_main.c * Fri Feb 18 10:31:48 1994 * * main() for scoring test sequences with a model. * Also, can print out alignments of model to sequence so * that the pairwise assignments can be seen. 
*/ #include #include #include #include #include #include /*#ifdef NEED_GETOPTH #include #endif*/ #include "structs.h" #include "funcs.h" #include "squid.h" #include "version.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif #define OPTIONS "ag:ms" static char usage[] = "\ Usage: coves [-options] \n\ where options are:\n\ -a : show all pairs, not just Watson-Crick\n\ -g : set expected background GC composition (default 0.5)\n\ -m : mountain representation of structural alignment\n\ -s : secondary structure string representation of \n\ structural alignment\n"; static char banner[] = "\ coves - scoring and structure prediction of RNA sequences\n\ using a covariance model"; int main(int argc, char **argv) { char *seq; /* a sequence to score */ SQINFO sqinfo; /* info about seq */ char *seqfile; /* sequence file */ int fmt; /* format of sequence file */ SQFILE *dbfp; /* open sequence file for reading */ char *cmfile; /* file containing covariance model */ struct cm_s *cm; /* model */ double score; /* score of alignment */ struct trace_s *tr; /* traceback of alignment */ struct align_s *ali; /* alignment of seq to model */ char *aseq; /* "aligned" sequence string */ char *khstruct; /* secondary structure string */ char buffer[61]; /* output buffer for structures */ int len; /* length of aseq, khstruct */ int apos; /* position in aseq, khstruct */ double rfreq[ALPHASIZE]; /* expected background symbol frequencies */ struct istate_s *icm; /* integer log odds model */ int statenum; /* # of states in icm */ int do_khstructure; /* TRUE if we print a structure string */ int do_mountain; /* TRUE if we show a mountain structure */ int watsoncrick; /* TRUE if only canonical pairs are indicated */ double gcfrac; /* expected background GC fraction */ #ifdef MEMDEBUG unsigned long histid1, histid2, orig_size, curr_size; #endif int optc; extern char *optarg; /* for getopt() */ extern int optind; /* for getopt() */ /*********************************************** * Parse command line 
***********************************************/ do_khstructure = FALSE; do_mountain = FALSE; watsoncrick = TRUE; gcfrac = 0.5; while ((optc = getopt(argc, argv, OPTIONS)) != -1) switch (optc) { case 'a': watsoncrick = FALSE; break; case 'g': gcfrac = (double) atof(optarg); break; case 'm': do_mountain = TRUE; break; case 's': do_khstructure = TRUE; break; case 'h': printf("%s\n version %s (%s)\n%s\n", banner, RELEASE, RELEASEDATE, usage); exit(0); default: Die("unrecognized option %c\n", optc); } if (argc - optind != 2) Die("%s\n", usage); cmfile = argv[argc-2]; seqfile = argv[argc-1]; if (! SeqfileFormat(seqfile, &fmt, NULL)) Die("Failed to determine format of sequence database %s", seqfile); if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) Die("Failed to open sequence file %s for reading", seqfile); if (! ReadCM(cmfile, &cm)) Die("Failed to read model from file %s", cmfile); rfreq[1] = rfreq[2] = gcfrac / 2.; rfreq[0] = rfreq[3] = (1. - gcfrac) / 2.; if (! RearrangeCM(cm, rfreq, &icm, &statenum)) Die("failed to make integer log odds model"); /*********************************************** * Print banner ***********************************************/ puts(banner); printf(" version %s, %s\n\n", RELEASE, RELEASEDATE); printf("---------------------------------------------------\n"); printf("Database to search/score: %s\n", seqfile); printf("Model: %s\n", cmfile); printf("GC%% of background model: %.0f%%\n", (gcfrac*100.)); printf("---------------------------------------------------\n"); puts(""); /*********************************************** * Score each sequence ***********************************************/ #ifdef MEMDEBUG orig_size = malloc_size(&histid1); #endif while (ReadSeq(dbfp, fmt, &seq, &sqinfo)) { char *prepseq; prepseq = Strdup(seq); PrepareSequence(prepseq); if (! 
ViterbiAlign(icm, statenum, prepseq, &score, &tr)) Die("ViterbiAlign() failed on sequence %s", sqinfo.name); free(prepseq); printf("%6.2f bits : %s\n", score, sqinfo.name); if (do_khstructure || do_mountain) { if (! Trace2ali(seq, tr, watsoncrick, &ali)) Die("Trace2ali failed"); if (do_khstructure) { if (! Align2kh(ali, &aseq, &khstruct)) Die("Align2kh failed\n"); /* Print out the sequence and structure */ len = strlen(aseq); buffer[60] = '\0'; for (apos = 1; apos <= len; apos += 60) { strncpy(buffer, aseq + apos - 1, 60); printf(" %10s %s\n", sqinfo.name, buffer); strncpy(buffer, khstruct + apos - 1, 60); printf(" %10s %s\n", sqinfo.name, buffer); puts(""); } free(aseq); free(khstruct); } if (do_mountain) { PrintAliLandscape(stdout, cm, ali); puts(""); } Free_align(ali); } FreeSequence(seq, &sqinfo); #ifdef MEMDEBUG curr_size = malloc_size(&histid2); if (curr_size != orig_size) { Warn("malloc-debug: current size %ul, starting size %ul\n", curr_size, orig_size); malloc_list(2,histid1, histid2); } #endif } SeqfileClose(dbfp); FreeCM(cm); free(icm); return 0; } tRNAscan-SE-2.0/src/trace.c0000644000543100007160000004163411021467306014673 0ustar pchanlowelab/* trace.c * cove 1.0: Mon May 17 09:38:14 1993 * moved to cove 2.0, Mon Sep 6 13:34:55 1993 * * Unlike a traceback of a normal HMM alignment, which is linear, * the traceback of a covariance HMM is a tree structure. Here * we provide support for the traceback data structures: the * tree itself, and a pushdown stack used for traversing the * tree. * * The trace tree structure has a dummy node at its beginning, * and dummy end nodes at the termination of each branch. Non-BIFURC * states have a NULL right branch. * * The pushdown stack structure has a dummy begin node, and the * end is signified by a final NULL ptr. 
*/ #include #include #include #include "structs.h" /* struct trace_s and struct tracestack_s */ #include "funcs.h" #include "version.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif #ifdef DEBUG #include #endif /* Function: InitTrace() * * Purpose: Initialize a traceback tree structure. * ret_tmem may be passed as NULL for default behavior; * if ret_tmem is passed, enables optimized memory * behavior for traces. * * Return: ptr to the new tree. */ void InitTrace(struct trace_s **ret_new, struct trmem_s **ret_tmem) { struct trace_s *new; struct trace_s *end; struct trmem_s *pool; if (ret_tmem != NULL) { InitTracepool(&pool); new = PopTracepool(pool); } else if ((new = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); new->emitl = new->emitr = -1; new->nodeidx = 0; new->type = uBEGIN_ST; new->nxtr = NULL; new->prv = NULL; if (ret_tmem != NULL) end = PopTracepool(pool); else if ((end = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); end->type = uEND_ST; end->emitl = end->emitr = end->nodeidx = -1; end->nxtr = end->nxtl = NULL; end->prv = new; new->nxtl = end; *ret_new = new; if (ret_tmem != NULL) *ret_tmem = pool; } /* Function: AttachTrace() * * Purpose: attach a new node to a tracetree node. * There are dummy END nodes. * * Because of the mechanics of tracebacks through a Viterbi matrix, * we have to make sure that BIFURC children are attached * right first, left second. * * trmem_s may be NULL (default behavior) or an active * trace pool (optimized memory behavior) * * Returns: ptr to the new node, or NULL on failure. 
*/ struct trace_s * AttachTrace(struct trace_s *parent, struct trmem_s *pool, int emitl, int emitr, int nodeidx, int type) { struct trace_s *new; struct trace_s *end; if (parent->nxtr != NULL) Die("That trace node is already full, fool."); /* If left branch is already connected to something, swap it over to the * right (thus enforcing the necessary rule that BIFURCS attach to the right * branch first), and attach a new dummy end to the left branch. */ if (parent->nxtl->nxtl != NULL) { parent->nxtr = parent->nxtl; if (pool != NULL) end = PopTracepool(pool); else if ((end = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); end->type = uEND_ST; end->emitl = end->emitr = end->nodeidx = -1; end->nxtl = end->nxtr = NULL; end->prv = parent; parent->nxtl = end; } if (pool != NULL) new = PopTracepool(pool); else if ((new = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); new->nxtr = NULL; new->nxtl = parent->nxtl; new->prv = parent; parent->nxtl->prv = new; /* end state also points back, to new */ parent->nxtl = new; new->emitl = emitl; new->emitr = emitr; new->nodeidx = nodeidx; new->type = type; return new; } void FreeTrace(struct trace_s *tr, struct trmem_s *pool) { if (pool == NULL) { struct tracestack_s *stack; struct trace_s *currtr; stack = InitTracestack(); PushTracestack(stack, tr); while ((currtr = PopTracestack(stack)) != NULL) { if (currtr->nxtr != NULL) PushTracestack(stack, currtr->nxtr); if (currtr->nxtl != NULL) PushTracestack(stack, currtr->nxtl); free(currtr); } FreeTracestack(stack); } else FreeTracepool(pool); } void DeleteTracenode(struct trace_s *oldtr, struct trmem_s *pool) { struct trace_s *parent; parent = oldtr->prv; parent->nxtl = oldtr->nxtl; parent->nxtr = oldtr->nxtr; oldtr->nxtl->prv = parent; if (oldtr->nxtr) oldtr->nxtr->prv = parent; if (pool == NULL) free(oldtr); } /* Functions: 
InitTracepool(), PopTracepool(), FreeTracepool() * * Purpose: Malloc() optimizations for building lots of * trace trees. A "trace pool" just lets me malloc * a lot of trace_s structures at once (InitTracepool). * They are retrieved one at a time using PopTracepool(). * When done (free'ing the trace), one would call * FreeTracepool(). * * Make one trace pool per trace, if using this optimization. */ void InitTracepool(struct trmem_s **ret_tmem) { struct trmem_s *tmem; tmem = (struct trmem_s *) MallocOrDie (sizeof(struct trmem_s)); tmem->next = 0; tmem->num = TMEM_BLOCK; tmem->pool = (struct trace_s *) MallocOrDie (TMEM_BLOCK * sizeof(struct trace_s)); tmem->used = InitTracestack(); *ret_tmem = tmem; } struct trace_s * PopTracepool(struct trmem_s *tmem) { struct trace_s *tr; if (tmem->next == tmem->num) { /* need a new pool */ PushTracestack(tmem->used, tmem->pool); tmem->next = 0; tmem->num = TMEM_BLOCK; tmem->pool = (struct trace_s *) MallocOrDie (TMEM_BLOCK * sizeof(struct trace_s)); } tr = tmem->pool + tmem->next; tmem->next++; return tr; } void FreeTracepool(struct trmem_s *tmem) { struct trace_s *pop; while ((pop = PopTracestack(tmem->used)) != NULL) free(pop); FreeTracestack(tmem->used); free(tmem->pool); free(tmem); } /* Functions: InitTracestack() * PushTracestack() * PopTracestack() * FreeTracestack() * * Purpose: Implementation of the pushdown stack for * traversing traceback trees. 
*/ struct tracestack_s * InitTracestack(void) { struct tracestack_s *stack; stack = (struct tracestack_s *) MallocOrDie (sizeof(struct tracestack_s)); stack->next = 0; stack->num = TSTACK_BLOCK; stack->list = (struct trace_s **) MallocOrDie (sizeof(struct trace_s *) * TSTACK_BLOCK); return stack; } void PushTracestack(struct tracestack_s *stack, struct trace_s *tracenode) { if (stack->next == stack->num) { stack->num += TSTACK_BLOCK; stack->list = (struct trace_s **) ReallocOrDie (stack->list, sizeof(struct trace_s *) * stack->num); } stack->list[stack->next] = tracenode; stack->next++; } struct trace_s * PopTracestack(struct tracestack_s *stack) { struct trace_s *pop; if (stack->next == 0) return NULL; stack->next--; pop = stack->list[stack->next]; return pop; } void FreeTracestack(struct tracestack_s *stack) { free(stack->list); free(stack); } /* Function: TraceCount() * * Purpose: Given a trace structure, and the sequence it traces across, * and a nascent model (counts form), bump the appropriate * emission and transition counters in the model. * * Return: 1 on success, 0 on failure. 
*/ int TraceCount(struct cm_s *cm, /* model */ char *seq, /* sequence, 0..len-1 */ double weight, /* weight on sequence */ struct trace_s *tr) /* traceback */ { struct tracestack_s *dolist; /* stack for traversal of traceback tree */ struct trace_s *curr; /* current node in the tree */ int symr, syml; #ifdef DEBUG int len; len = strlen(seq); #endif dolist = InitTracestack(); PushTracestack(dolist, tr->nxtl); while ((curr = PopTracestack(dolist)) != NULL) { /* ignore END states */ if (curr->nodeidx == -1 || curr->nxtl == NULL) continue; /* BIFURC states: no transits, no emission */ if (curr->nxtr != NULL) { #ifdef DEBUG assert(curr->nxtr != NULL && curr->nxtl != NULL); #endif PushTracestack(dolist, curr->nxtr); PushTracestack(dolist, curr->nxtl); } else if (curr->type == uINSL_ST) { #ifdef DEBUG assert(curr->emitl >= 0 && curr->emitl < len); #endif syml = SymbolIndex(seq[curr->emitl]); #ifdef DEBUG assert(syml >= 0 && syml < 4); assert(curr->nodeidx >= 0 && curr->nodeidx < cm->nodes); assert(curr->nxtl != NULL); #endif cm->nd[curr->nodeidx].tmx[INSL_ST][StatetypeIndex(curr->nxtl->type)] += weight; cm->nd[curr->nodeidx].il_emit[syml] += weight; PushTracestack(dolist, curr->nxtl); } else if (curr->type == uINSR_ST) { #ifdef DEBUG assert(curr->emitr >= 0 && curr->emitr < len); #endif symr = SymbolIndex(seq[curr->emitr]); #ifdef DEBUG assert(symr >= 0 && symr < 4); assert(curr->nodeidx >= 0 && curr->nodeidx < cm->nodes); assert(curr->nxtl != NULL); #endif cm->nd[curr->nodeidx].tmx[INSR_ST][StatetypeIndex(curr->nxtl->type)] += weight; cm->nd[curr->nodeidx].ir_emit[symr] += weight; PushTracestack(dolist, curr->nxtl); } else if (curr->type == uMATP_ST) { #ifdef DEBUG assert(curr->emitr >= 0 && curr->emitr < len); assert(curr->emitl >= 0 && curr->emitl < len); #endif syml = SymbolIndex(seq[curr->emitl]); symr = SymbolIndex(seq[curr->emitr]); #ifdef DEBUG assert(syml >= 0 && syml < 4); assert(symr >= 0 && symr < 4); assert(curr->nodeidx > 0 && curr->nodeidx < cm->nodes); 
assert(curr->nxtl != NULL); #endif cm->nd[curr->nodeidx].tmx[MATP_ST][StatetypeIndex(curr->nxtl->type)] += weight; cm->nd[curr->nodeidx].mp_emit[syml][symr] += weight; PushTracestack(dolist, curr->nxtl); } else if (curr->type == uMATL_ST) { #ifdef DEBUG assert(curr->emitl >= 0 && curr->emitl < len); #endif syml = SymbolIndex(seq[curr->emitl]); #ifdef DEBUG assert(syml >= 0 && syml < 4); assert(curr->nodeidx > 0 && curr->nodeidx < cm->nodes); assert(curr->nxtl != NULL); #endif cm->nd[curr->nodeidx].tmx[MATL_ST][StatetypeIndex(curr->nxtl->type)] += weight; cm->nd[curr->nodeidx].ml_emit[syml] += weight; PushTracestack(dolist, curr->nxtl); } else if (curr->type == uMATR_ST) { #ifdef DEBUG assert(curr->emitr >= 0 && curr->emitr < len); #endif symr = SymbolIndex(seq[curr->emitr]); #ifdef DEBUG assert(symr >= 0 && symr < 4); assert(curr->nodeidx > 0 && curr->nodeidx < cm->nodes); assert(curr->nxtl != NULL); #endif cm->nd[curr->nodeidx].tmx[MATR_ST][StatetypeIndex(curr->nxtl->type)] += weight; cm->nd[curr->nodeidx].mr_emit[symr] += weight; PushTracestack(dolist, curr->nxtl); } else /* DEL or BEGIN state */ { #ifdef DEBUG assert(curr->nodeidx >= 0 && curr->nodeidx < cm->nodes); assert(curr->nxtl->type >= 0 && curr->nxtl->type < STATETYPES); assert(curr->nxtl != NULL); #endif cm->nd[curr->nodeidx].tmx[DEL_ST][StatetypeIndex(curr->nxtl->type)] += weight; PushTracestack(dolist, curr->nxtl); } } FreeTracestack(dolist); return 1; } /* Function: TraceCountPrior() * * Purpose: Same as above, except that we register the counts * in a prior instead of a model. Used for "training" * new priors. * * Return: 1 on success, 0 on failure. 
*/
int
TraceCountPrior(struct cm_s    *cm,	/* covariance model      */
		struct prior_s *prior,	/* prior to count into   */
		char           *seq,	/* sequence, 0..len-1    */
		double          weight,	/* weight on sequence    */
		struct trace_s *tr)	/* traceback             */
{
  struct tracestack_s *pending;	/* nodes still to be visited      */
  struct trace_s      *node;	/* node being counted             */
  int                  nidx;	/* model node index of this node  */
  int                  succ;	/* index of the following model node, -1 if none */
  int                  lsym, rsym;
  int                  fnode, tnode;	/* from/to node types  */
  int                  fs, ts;		/* from/to state types */

  pending = InitTracestack();
  PushTracestack(pending, tr->nxtl);

  while ((node = PopTracestack(pending)) != NULL)
    {
      /* END states contribute nothing */
      if (node->nodeidx == -1 || node->nxtl == NULL)
	continue;

      /* BIFURC states (two children): no transits, no emission */
      if (node->nxtr != NULL)
	{
	  PushTracestack(pending, node->nxtr);
	  PushTracestack(pending, node->nxtl);
	  continue;
	}

      /* Emitted symbols; index 0 stands in when nothing (or a gap) is emitted. */
      lsym = 0;
      rsym = 0;
      if (node->emitl != -1 && !isgap(seq[node->emitl]))
	lsym = SymbolIndex(seq[node->emitl]);
      if (node->emitr != -1 && !isgap(seq[node->emitr]))
	rsym = SymbolIndex(seq[node->emitr]);

      /* Work out the four indices into the transition prior. */
      nidx  = node->nodeidx;
      succ  = cm->nd[nidx].nxt;
      fnode = cm->nd[nidx].type;
      fs    = StatetypeIndex(node->type);
      if (succ != -1)
	{
	  tnode = cm->nd[succ].type;
	  ts    = StatetypeIndex(node->nxtl->type);
	}
      else
	{
	  tnode = END_NODE;
	  ts    = END_ST;
	}

      /* Verify where we're writing in memory. Had some problems here!
       */
      if (fnode < 0 || fnode > 6)          Die("fnode is %d", fnode);
      if (tnode < 0 || tnode > 3)          Die("tnode is %d", tnode);
      if (fs < 0    || fs >= STATETYPES)   Die("fs is %d", fs);
      if (ts < 0    || ts >= STATETYPES)   Die("ts is %d", ts);
      if (lsym < 0  || lsym >= ALPHASIZE)  Die("syml is %d", lsym);
      if (rsym < 0  || rsym >= ALPHASIZE)  Die("symr is %d", rsym);

      prior->tprior[fnode][tnode][fs][ts] += weight;

      switch (node->type)
	{
	case uDEL_ST:
	  /* no emission to record */
	  break;
	case uINSL_ST:
	  prior->insl_prior[lsym] += weight;
	  break;
	case uINSR_ST:
	  prior->insr_prior[rsym] += weight;
	  break;
	case uMATL_ST:
	  prior->matl_prior[lsym] += weight;
	  break;
	case uMATR_ST:
	  prior->matr_prior[rsym] += weight;
	  break;
	case uMATP_ST:
	  prior->matp_prior[lsym][rsym] += weight;
	  break;
	default:
	  Die("no such state type %d", node->type);
	}

      PushTracestack(pending, node->nxtl);
    }

  FreeTracestack(pending);
  return 1;
}

/* Function: TraceScore()
 *
 * Purpose:  Given a trace structure, and the sequence it traces across,
 *           and a model (probability form), calculate the log-odds
 *           probability score.
 *
 *
 * Return:   the log-odds score of the trace, in bits.
*/ double TraceScore(struct cm_s *cm, /* model */ char *seq, /* sequence, 0..len-1 */ struct trace_s *tr) /* traceback */ { struct tracestack_s *dolist; /* stack for traversal of traceback tree */ struct trace_s *curr; /* current node in the tree */ int symr, syml; double score; score = 0; dolist = InitTracestack(); PushTracestack(dolist, tr->nxtl); while ((curr = PopTracestack(dolist)) != NULL) { /* ignore END states */ if (curr->nodeidx == -1 || curr->nxtl == NULL) continue; /* BIFURC states: no transits, no emission */ if (curr->nxtr != NULL) { PushTracestack(dolist, curr->nxtr); PushTracestack(dolist, curr->nxtl); } else if (curr->type == uINSL_ST) { syml = SymbolIndex(seq[curr->emitl]); score += log(cm->nd[curr->nodeidx].tmx[INSL_ST][StatetypeIndex(curr->nxtl->type)]); score += log(cm->nd[curr->nodeidx].il_emit[syml]); score += log(4.0); /* for log-odds */ PushTracestack(dolist, curr->nxtl); } else if (curr->type == uINSR_ST) { symr = SymbolIndex(seq[curr->emitr]); score += log(cm->nd[curr->nodeidx].tmx[INSR_ST][StatetypeIndex(curr->nxtl->type)]); score += log(cm->nd[curr->nodeidx].ir_emit[symr]); score += log(4.0); /* for log-odds */ PushTracestack(dolist, curr->nxtl); } else if (curr->type == uMATP_ST) { syml = SymbolIndex(seq[curr->emitl]); symr = SymbolIndex(seq[curr->emitr]); score += log(cm->nd[curr->nodeidx].tmx[MATP_ST][StatetypeIndex(curr->nxtl->type)]); score += log(cm->nd[curr->nodeidx].mp_emit[syml][symr]); score += log(16.0); /* for log-odds */ PushTracestack(dolist, curr->nxtl); } else if (curr->type == uMATL_ST) { syml = SymbolIndex(seq[curr->emitl]); score += log(cm->nd[curr->nodeidx].tmx[MATL_ST][StatetypeIndex(curr->nxtl->type)]); score += log(cm->nd[curr->nodeidx].ml_emit[syml]); score += log(4.0); /* for log-odds */ PushTracestack(dolist, curr->nxtl); } else if (curr->type == uMATR_ST) { symr = SymbolIndex(seq[curr->emitr]); score += log(cm->nd[curr->nodeidx].tmx[MATR_ST][StatetypeIndex(curr->nxtl->type)]); score += 
log(cm->nd[curr->nodeidx].mr_emit[symr]); score += log(4.0); /* for log-odds */ PushTracestack(dolist, curr->nxtl); } else /* DEL or BEGIN state */ { score += log(cm->nd[curr->nodeidx].tmx[DEL_ST][StatetypeIndex(curr->nxtl->type)]); PushTracestack(dolist, curr->nxtl); } } FreeTracestack(dolist); score = score / log(2.0); /* convert to bits */ return score; } tRNAscan-SE-2.0/src/gnuregex.c0000644000543100007160000047315114044352300015417 0ustar pchanlowelab#ifndef lint /* SRE: lint *hates* this code! */ /* This is the GNU regular expression library. * I've modified it for portability and clarity (!). * "SRE" comments flag the changes I've made. * Thanks to the GNU folks for providing this code! * SRE, Sun Jan 7 09:48:26 1996 */ /* Extended regular expression matching and search library, version 0.12. (Implements POSIX draft P10003.2/D11.2, except for internationalization features.) Copyright (C) 1993 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* AIX requires this to be the first thing in the file. */ #if defined (_AIX) && !defined (REGEX_MALLOC) #pragma alloca #endif #define _GNU_SOURCE /* We need this for `regex.h', and perhaps for the Emacs include files. 
*/ #include #ifdef HAVE_CONFIG_H #include "config.h" #endif /* SRE: removed ifdef emacs; removed if HAVE_STRING_H; removed STDC_HEADERS */ #include #include #ifndef bcmp #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) #endif #ifndef bcopy #define bcopy(s, d, n) memcpy ((d), (s), (n)) #endif #ifndef bzero #define bzero(s, n) memset ((s), 0, (n)) #endif /* Define the syntax stuff for \<, \>, etc. */ /* This must be nonzero for the wordchar and notwordchar pattern commands in re_match_2. */ #ifndef Sword #define Sword 1 #endif #ifdef SYNTAX_TABLE extern char *re_syntax_table; #else /* not SYNTAX_TABLE */ /* How many characters in the character set. */ #define CHAR_SET_SIZE 256 static char re_syntax_table[CHAR_SET_SIZE]; static void init_syntax_once () { register int c; static int done = 0; if (done) return; bzero (re_syntax_table, sizeof re_syntax_table); for (c = 'a'; c <= 'z'; c++) re_syntax_table[c] = Sword; for (c = 'A'; c <= 'Z'; c++) re_syntax_table[c] = Sword; for (c = '0'; c <= '9'; c++) re_syntax_table[c] = Sword; re_syntax_table['_'] = Sword; done = 1; } #endif /* not SYNTAX_TABLE */ #define SYNTAX(c) re_syntax_table[c] /* Get the interface, including the syntax bits. */ #include "gnuregex.h" /* isalpha etc. are used for the character classes. 
*/ #include #ifndef isascii #define isascii(c) 1 #endif #ifdef isblank #define ISBLANK(c) (isascii (c) && isblank (c)) #else #define ISBLANK(c) ((c) == ' ' || (c) == '\t') #endif #ifdef isgraph #define ISGRAPH(c) (isascii (c) && isgraph (c)) #else #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c)) #endif #define ISPRINT(c) (isascii (c) && isprint (c)) #define ISDIGIT(c) (isascii (c) && isdigit (c)) #define ISALNUM(c) (isascii (c) && isalnum (c)) #define ISALPHA(c) (isascii (c) && isalpha (c)) #define ISCNTRL(c) (isascii (c) && iscntrl (c)) #define ISLOWER(c) (isascii (c) && islower (c)) #define ISPUNCT(c) (isascii (c) && ispunct (c)) #define ISSPACE(c) (isascii (c) && isspace (c)) #define ISUPPER(c) (isascii (c) && isupper (c)) #define ISXDIGIT(c) (isascii (c) && isxdigit (c)) #ifndef NULL #define NULL 0 #endif /* We remove any previous definition of `SIGN_EXTEND_CHAR', since ours (we hope) works properly with all combinations of machines, compilers, `char' and `unsigned char' argument types. (Per Bothner suggested the basic approach.) */ #undef SIGN_EXTEND_CHAR #if __STDC__ #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) #else /* not __STDC__ */ /* As in Harbison and Steele. */ #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) #endif /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we use `alloca' instead of `malloc'. This is because using malloc in re_search* or re_match* could cause memory leaks when C-g is used in Emacs; also, malloc is slower and causes storage fragmentation. On the other hand, malloc is more portable, and easier to debug. Because we sometimes use alloca, some routines have to be macros, not functions -- `alloca'-allocated space disappears at the end of the function it is called in. */ #define REGEX_MALLOC /* SRE -- alloca not portable? 
*/ #ifdef REGEX_MALLOC #define REGEX_ALLOCATE malloc #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) #else /* not REGEX_MALLOC */ /* Emacs already defines alloca, sometimes. */ #ifndef alloca /* Make alloca work the best possible way. */ #ifdef __GNUC__ #define alloca __builtin_alloca #else /* not __GNUC__ */ #if HAVE_ALLOCA_H #include #else /* not __GNUC__ or HAVE_ALLOCA_H */ #ifndef _AIX /* Already did AIX, up at the top. */ char *alloca (); #endif /* not _AIX */ #endif /* not HAVE_ALLOCA_H */ #endif /* not __GNUC__ */ #endif /* not alloca */ #define REGEX_ALLOCATE alloca /* Assumes a `char *destination' variable. */ #define REGEX_REALLOCATE(source, osize, nsize) \ (destination = (char *) alloca (nsize), \ bcopy (source, destination, osize), \ destination) #endif /* not REGEX_MALLOC */ /* True if `size1' is non-NULL and PTR is pointing anywhere inside `string1' or just past its end. This works if PTR is NULL, which is a good thing. */ #define FIRST_STRING_P(ptr) \ (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) /* (Re)Allocate N items of type T using malloc, or fail. */ #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) #define BYTEWIDTH 8 /* In bits. */ #define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b)) typedef char boolean; #define false 0 #define true 1 /* These are the command codes that appear in compiled regular expressions. Some opcodes are followed by argument bytes. A command code can specify any interpretation whatsoever for its arguments. Zero bytes may appear in the compiled regular expression. The value of `exactn' is needed in search.c (search_buffer) in Emacs. So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of `exactn' we use here must also be 1. 
*/ typedef enum { no_op = 0, /* Followed by one byte giving n, then by n literal bytes. */ exactn = 1, /* Matches any (more or less) character. */ anychar, /* Matches any one char belonging to specified set. First following byte is number of bitmap bytes. Then come bytes for a bitmap saying which chars are in. Bits in each byte are ordered low-bit-first. A character is in the set if its bit is 1. A character too large to have a bit in the map is automatically not in the set. */ charset, /* Same parameters as charset, but match any character that is not one of those specified. */ charset_not, /* Start remembering the text that is matched, for storing in a register. Followed by one byte with the register number, in the range 0 to one less than the pattern buffer's re_nsub field. Then followed by one byte with the number of groups inner to this one. (This last has to be part of the start_memory only because we need it in the on_failure_jump of re_match_2.) */ start_memory, /* Stop remembering the text that is matched and store it in a memory register. Followed by one byte with the register number, in the range 0 to one less than `re_nsub' in the pattern buffer, and one byte with the number of inner groups, just like `start_memory'. (We need the number of inner groups here because we don't have any easy way of finding the corresponding start_memory when we're at a stop_memory.) */ stop_memory, /* Match a duplicate of something remembered. Followed by one byte containing the register number. */ duplicate, /* Fail unless at beginning of line. */ begline, /* Fail unless at end of line. */ endline, /* Succeeds if at beginning of buffer (if emacs) or at beginning of string to be matched (if not). */ begbuf, /* Analogously, for end of buffer/string. */ endbuf, /* Followed by two byte relative address to which to jump. */ jump, /* Same as jump, but marks the end of an alternative. 
*/ jump_past_alt, /* Followed by two-byte relative address of place to resume at in case of failure. */ on_failure_jump, /* Like on_failure_jump, but pushes a placeholder instead of the current string position when executed. */ on_failure_keep_string_jump, /* Throw away latest failure point and then jump to following two-byte relative address. */ pop_failure_jump, /* Change to pop_failure_jump if know won't have to backtrack to match; otherwise change to jump. This is used to jump back to the beginning of a repeat. If what follows this jump clearly won't match what the repeat does, such that we can be sure that there is no use backtracking out of repetitions already matched, then we change it to a pop_failure_jump. Followed by two-byte address. */ maybe_pop_jump, /* Jump to following two-byte address, and push a dummy failure point. This failure point will be thrown away if an attempt is made to use it for a failure. A `+' construct makes this before the first repeat. Also used as an intermediary kind of jump when compiling an alternative. */ dummy_failure_jump, /* Push a dummy failure point and continue. Used at the end of alternatives. */ push_dummy_failure, /* Followed by two-byte relative address and two-byte number n. After matching N times, jump to the address upon failure. */ succeed_n, /* Followed by two-byte relative address, and two-byte number n. Jump to the address N times, then fail. */ jump_n, /* Set the following two-byte relative address to the subsequent two-byte number. The address *includes* the two bytes of number. */ set_number_at, wordchar, /* Matches any word-constituent character. */ notwordchar, /* Matches any char that is not a word-constituent. */ wordbeg, /* Succeeds if at word beginning. */ wordend, /* Succeeds if at word end. */ wordbound, /* Succeeds if at a word boundary. */ notwordbound /* Succeeds if not at a word boundary. */ #ifdef emacs ,before_dot, /* Succeeds if before point. */ at_dot, /* Succeeds if at point. 
*/ after_dot, /* Succeeds if after point. */ /* Matches any character whose syntax is specified. Followed by a byte which contains a syntax code, e.g., Sword. */ syntaxspec, /* Matches any character whose syntax is not that specified. */ notsyntaxspec #endif /* emacs */ } re_opcode_t; /* Common operations on the compiled pattern. */ /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ #define STORE_NUMBER(destination, number) \ do { \ (destination)[0] = (number) & 0377; \ (destination)[1] = (number) >> 8; \ } while (0) /* Same as STORE_NUMBER, except increment DESTINATION to the byte after where the number is stored. Therefore, DESTINATION must be an lvalue. */ #define STORE_NUMBER_AND_INCR(destination, number) \ do { \ STORE_NUMBER (destination, number); \ (destination) += 2; \ } while (0) /* Put into DESTINATION a number stored in two contiguous bytes starting at SOURCE. */ #define EXTRACT_NUMBER(destination, source) \ do { \ (destination) = *(source) & 0377; \ (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ } while (0) #ifdef DEBUG static void extract_number (dest, source) int *dest; unsigned char *source; { int temp = SIGN_EXTEND_CHAR (*(source + 1)); *dest = *source & 0377; *dest += temp << 8; } #ifndef EXTRACT_MACROS /* To debug the macros. */ #undef EXTRACT_NUMBER #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) #endif /* not EXTRACT_MACROS */ #endif /* DEBUG */ /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. SOURCE must be an lvalue. 
*/ #define EXTRACT_NUMBER_AND_INCR(destination, source) \ do { \ EXTRACT_NUMBER (destination, source); \ (source) += 2; \ } while (0) #ifdef DEBUG static void extract_number_and_incr (destination, source) int *destination; unsigned char **source; { extract_number (destination, *source); *source += 2; } #ifndef EXTRACT_MACROS #undef EXTRACT_NUMBER_AND_INCR #define EXTRACT_NUMBER_AND_INCR(dest, src) \ extract_number_and_incr (&dest, &src) #endif /* not EXTRACT_MACROS */ #endif /* DEBUG */ /* If DEBUG is defined, Regex prints many voluminous messages about what it is doing (if the variable `debug' is nonzero). If linked with the main program in `iregex.c', you can enter patterns and strings interactively. And if linked with the main program in `main.c' and the other test files, you can run the already-written tests. */ #ifdef DEBUG /* We use standard I/O for debugging. */ #include /* It is useful to test things that ``must'' be true when debugging. */ #include static int debug = 0; #define DEBUG_STATEMENT(e) e #define DEBUG_PRINT1(x) if (debug) printf (x) #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ if (debug) print_partial_compiled_pattern (s, e) #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ if (debug) print_double_string (w, s1, sz1, s2, sz2) extern void printchar (); /* Print the fastmap in human-readable form. */ void print_fastmap (fastmap) char *fastmap; { unsigned was_a_range = 0; unsigned i = 0; while (i < (1 << BYTEWIDTH)) { if (fastmap[i++]) { was_a_range = 0; printchar (i - 1); while (i < (1 << BYTEWIDTH) && fastmap[i]) { was_a_range = 1; i++; } if (was_a_range) { printf ("-"); printchar (i - 1); } } } putchar ('\n'); } /* Print a compiled pattern string in human-readable form, starting at the START pointer into it and ending just before the pointer END. 
*/ void print_partial_compiled_pattern (start, end) unsigned char *start; unsigned char *end; { int mcnt, mcnt2; unsigned char *p = start; unsigned char *pend = end; if (start == NULL) { printf ("(null)\n"); return; } /* Loop over pattern commands. */ while (p < pend) { switch ((re_opcode_t) *p++) { case no_op: printf ("/no_op"); break; case exactn: mcnt = *p++; printf ("/exactn/%d", mcnt); do { putchar ('/'); printchar (*p++); } while (--mcnt); break; case start_memory: mcnt = *p++; printf ("/start_memory/%d/%d", mcnt, *p++); break; case stop_memory: mcnt = *p++; printf ("/stop_memory/%d/%d", mcnt, *p++); break; case duplicate: printf ("/duplicate/%d", *p++); break; case anychar: printf ("/anychar"); break; case charset: case charset_not: { register int c; printf ("/charset%s", (re_opcode_t) *(p - 1) == charset_not ? "_not" : ""); assert (p + *p < pend); for (c = 0; c < *p; c++) { unsigned bit; unsigned char map_byte = p[1 + c]; putchar ('/'); for (bit = 0; bit < BYTEWIDTH; bit++) if (map_byte & (1 << bit)) printchar (c * BYTEWIDTH + bit); } p += 1 + *p; break; } case begline: printf ("/begline"); break; case endline: printf ("/endline"); break; case on_failure_jump: extract_number_and_incr (&mcnt, &p); printf ("/on_failure_jump/0/%d", mcnt); break; case on_failure_keep_string_jump: extract_number_and_incr (&mcnt, &p); printf ("/on_failure_keep_string_jump/0/%d", mcnt); break; case dummy_failure_jump: extract_number_and_incr (&mcnt, &p); printf ("/dummy_failure_jump/0/%d", mcnt); break; case push_dummy_failure: printf ("/push_dummy_failure"); break; case maybe_pop_jump: extract_number_and_incr (&mcnt, &p); printf ("/maybe_pop_jump/0/%d", mcnt); break; case pop_failure_jump: extract_number_and_incr (&mcnt, &p); printf ("/pop_failure_jump/0/%d", mcnt); break; case jump_past_alt: extract_number_and_incr (&mcnt, &p); printf ("/jump_past_alt/0/%d", mcnt); break; case jump: extract_number_and_incr (&mcnt, &p); printf ("/jump/0/%d", mcnt); break; case succeed_n: 
extract_number_and_incr (&mcnt, &p); extract_number_and_incr (&mcnt2, &p); printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2); break; case jump_n: extract_number_and_incr (&mcnt, &p); extract_number_and_incr (&mcnt2, &p); printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2); break; case set_number_at: extract_number_and_incr (&mcnt, &p); extract_number_and_incr (&mcnt2, &p); printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2); break; case wordbound: printf ("/wordbound"); break; case notwordbound: printf ("/notwordbound"); break; case wordbeg: printf ("/wordbeg"); break; case wordend: printf ("/wordend"); #ifdef emacs case before_dot: printf ("/before_dot"); break; case at_dot: printf ("/at_dot"); break; case after_dot: printf ("/after_dot"); break; case syntaxspec: printf ("/syntaxspec"); mcnt = *p++; printf ("/%d", mcnt); break; case notsyntaxspec: printf ("/notsyntaxspec"); mcnt = *p++; printf ("/%d", mcnt); break; #endif /* emacs */ case wordchar: printf ("/wordchar"); break; case notwordchar: printf ("/notwordchar"); break; case begbuf: printf ("/begbuf"); break; case endbuf: printf ("/endbuf"); break; default: printf ("?%d", *(p-1)); } } printf ("/\n"); } void print_compiled_pattern (bufp) struct re_pattern_buffer *bufp; { unsigned char *buffer = bufp->buffer; print_partial_compiled_pattern (buffer, buffer + bufp->used); printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); if (bufp->fastmap_accurate && bufp->fastmap) { printf ("fastmap: "); print_fastmap (bufp->fastmap); } printf ("re_nsub: %d\t", bufp->re_nsub); printf ("regs_alloc: %d\t", bufp->regs_allocated); printf ("can_be_null: %d\t", bufp->can_be_null); printf ("newline_anchor: %d\n", bufp->newline_anchor); printf ("no_sub: %d\t", bufp->no_sub); printf ("not_bol: %d\t", bufp->not_bol); printf ("not_eol: %d\t", bufp->not_eol); printf ("syntax: %d\n", bufp->syntax); /* Perhaps we should print the translate table? 
*/ } void print_double_string (where, string1, size1, string2, size2) char *where; char *string1; char *string2; int size1; int size2; { unsigned this_char; if (where == NULL) printf ("(null)"); else { if (FIRST_STRING_P (where)) { for (this_char = where - string1; this_char < size1; this_char++) printchar (string1[this_char]); where = string2; } for (this_char = where - string2; this_char < size2; this_char++) printchar (string2[this_char]); } } #else /* not DEBUG */ #undef assert #define assert(e) #define DEBUG_STATEMENT(e) #define DEBUG_PRINT1(x) #define DEBUG_PRINT2(x1, x2) #define DEBUG_PRINT3(x1, x2, x3) #define DEBUG_PRINT4(x1, x2, x3, x4) #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) #endif /* not DEBUG */ /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can also be assigned to arbitrarily: each pattern buffer stores its own syntax, so it can be changed between regex compilations. */ reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; /* Specify the precise syntax of regexps for compilation. This provides for compatibility for various utilities which historically have different, incompatible syntaxes. The argument SYNTAX is a bit mask comprised of the various bits defined in regex.h. We return the old syntax. */ reg_syntax_t re_set_syntax (syntax) reg_syntax_t syntax; { reg_syntax_t ret = re_syntax_options; re_syntax_options = syntax; return ret; } /* This table gives an error message for each of the error codes listed in regex.h. Obviously the order here has to be same as there. 
*/

/* Entries are positional: the trailing comment on each line names the
   error code that entry is meant to correspond to.  The slot for
   REG_NOERROR is NULL.  */
static char *re_error_msg[] =
  { NULL,					/* REG_NOERROR */
    "No match",					/* REG_NOMATCH */
    "Invalid regular expression",		/* REG_BADPAT */
    "Invalid collation character",		/* REG_ECOLLATE */
    "Invalid character class name",		/* REG_ECTYPE */
    "Trailing backslash",			/* REG_EESCAPE */
    "Invalid back reference",			/* REG_ESUBREG */
    "Unmatched [ or [^",			/* REG_EBRACK */
    "Unmatched ( or \\(",			/* REG_EPAREN */
    "Unmatched \\{",				/* REG_EBRACE */
    "Invalid content of \\{\\}",		/* REG_BADBR */
    "Invalid range end",			/* REG_ERANGE */
    "Memory exhausted",				/* REG_ESPACE */
    "Invalid preceding regular expression",	/* REG_BADRPT */
    "Premature end of regular expression",	/* REG_EEND */
    "Regular expression too big",		/* REG_ESIZE */
    "Unmatched ) or \\)",			/* REG_ERPAREN */
  };

/* Subroutine declarations and macros for regex_compile.  */

/* Old-style (unprototyped) forward declarations; argument types are
   not checked at the call sites.  */
static void store_op1 (), store_op2 ();
static void insert_op1 (), insert_op2 ();
static boolean at_begline_loc_p (), at_endline_loc_p ();
static boolean group_in_compile_stack ();
static reg_errcode_t compile_range ();

/* Fetch the next character in the uncompiled pattern---translating it
   if necessary.  Also cast from a signed character in the constant
   string passed to us by the user to an unsigned char that we can use
   as an array index (in, e.g., `translate').

   Uses the enclosing function's `p', `pend' and `translate'.  When the
   pattern is exhausted this executes `return REG_EEND' in the
   enclosing function.  */
#define PATFETCH(c) \
  do {if (p == pend) return REG_EEND; \
    c = (unsigned char) *p++; \
    if (translate) c = translate[c]; \
  } while (0)

/* Fetch the next character in the uncompiled pattern, with no
   translation.  Like PATFETCH, it returns REG_EEND from the enclosing
   function when `p' has reached `pend'.  */
#define PATFETCH_RAW(c) \
  do {if (p == pend) return REG_EEND; \
    c = (unsigned char) *p++; \
  } while (0)

/* Go backwards one character in the pattern.  */
#define PATUNFETCH p--


/* If `translate' is non-null, return translate[D], else just D.  We
   cast the subscript to translate because some data is declared as
   `char *', to avoid warnings when a string constant is passed.  But
   when we use a character as a subscript we must make it unsigned.  */
#define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))


/* Macros for outputting the compiled pattern into `buffer'.  They all
   operate on the enclosing function's `b' (append position) and
   `bufp' (the pattern buffer being filled).  */

/* If the buffer isn't allocated when it comes in, use this.  */
#define INIT_BUF_SIZE  32

/* Make sure we have at least N more bytes of space in buffer.  Loops
   because a single EXTEND_BUFFER may not add enough room; since
   EXTEND_BUFFER can `return' an error code, so can this macro.  */
#define GET_BUFFER_SPACE(n) \
    while (b - bufp->buffer + (n) > bufp->allocated) \
      EXTEND_BUFFER ()

/* Make sure we have one more byte of buffer space and then add C to it.  */
#define BUF_PUSH(c) \
  do { \
    GET_BUFFER_SPACE (1); \
    *b++ = (unsigned char) (c); \
  } while (0)


/* Ensure we have two more bytes of buffer space and then append C1 and C2.  */
#define BUF_PUSH_2(c1, c2) \
  do { \
    GET_BUFFER_SPACE (2); \
    *b++ = (unsigned char) (c1); \
    *b++ = (unsigned char) (c2); \
  } while (0)


/* As with BUF_PUSH_2, except for three bytes.  */
#define BUF_PUSH_3(c1, c2, c3) \
  do { \
    GET_BUFFER_SPACE (3); \
    *b++ = (unsigned char) (c1); \
    *b++ = (unsigned char) (c2); \
    *b++ = (unsigned char) (c3); \
  } while (0)


/* Store a jump with opcode OP at LOC to location TO.  We store a
   relative address offset by the three bytes the jump itself occupies.
   This overwrites three bytes at LOC; it does not make room for them
   or advance `b'.  */
#define STORE_JUMP(op, loc, to) \
  store_op1 (op, loc, (to) - (loc) - 3)

/* Likewise, for a two-argument jump.  */
#define STORE_JUMP2(op, loc, to, arg) \
  store_op2 (op, loc, (to) - (loc) - 3, arg)

/* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer end.
   The caller must already have reserved the extra bytes and must
   advance `b' itself afterwards.  */
#define INSERT_JUMP(op, loc, to) \
  insert_op1 (op, loc, (to) - (loc) - 3, b)

/* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer end.  */
#define INSERT_JUMP2(op, loc, to, arg) \
  insert_op2 (op, loc, (to) - (loc) - 3, arg, b)


/* This is not an arbitrary limit: the arguments which represent offsets
   into the pattern are two bytes long.  So if 2^16 bytes turns out to
   be too small, many things would have to change.  */
#define MAX_BUF_SIZE (1L << 16)


/* Extend the buffer by twice its current size via realloc and reset the
   pointers that pointed into the old block to point to the correct
   places in the new one.
If extending the buffer results in it being larger than MAX_BUF_SIZE, then flag memory exhausted. */ #define EXTEND_BUFFER() \ do { \ unsigned char *old_buffer = bufp->buffer; \ if (bufp->allocated == MAX_BUF_SIZE) \ return REG_ESIZE; \ bufp->allocated <<= 1; \ if (bufp->allocated > MAX_BUF_SIZE) \ bufp->allocated = MAX_BUF_SIZE; \ bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ if (bufp->buffer == NULL) \ return REG_ESPACE; \ /* If the buffer moved, move all the pointers into it. */ \ if (old_buffer != bufp->buffer) \ { \ b = (b - old_buffer) + bufp->buffer; \ begalt = (begalt - old_buffer) + bufp->buffer; \ if (fixup_alt_jump) \ fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ if (laststart) \ laststart = (laststart - old_buffer) + bufp->buffer; \ if (pending_exact) \ pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ } \ } while (0) /* Since we have one byte reserved for the register number argument to {start,stop}_memory, the maximum number of groups we can report things about is what fits in that byte. */ #define MAX_REGNUM 255 /* But patterns can have more than `MAX_REGNUM' registers. We just ignore the excess. */ typedef unsigned regnum_t; /* Macros for the compile stack. */ /* Since offsets can go either forwards or backwards, this type needs to be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ typedef int pattern_offset_t; typedef struct { pattern_offset_t begalt_offset; pattern_offset_t fixup_alt_jump; pattern_offset_t inner_group_offset; pattern_offset_t laststart_offset; regnum_t regnum; } compile_stack_elt_t; typedef struct { compile_stack_elt_t *stack; unsigned size; unsigned avail; /* Offset of next open position. */ } compile_stack_type; #define INIT_COMPILE_STACK_SIZE 32 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) /* The next available element. 
*/
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])


/* Set the bit for character C in a list.  Writes through the
   enclosing scope's `b', which must point at the first byte of the
   charset bitmap.  */
#define SET_LIST_BIT(c) \
  (b[((unsigned char) (c)) / BYTEWIDTH] \
   |= 1 << (((unsigned char) c) % BYTEWIDTH))


/* Get the next unsigned number in the uncompiled pattern.  NUM is left
   untouched (callers initialise it to -1) when no digit is found.  On
   exit `c' holds the last character fetched, i.e. the first non-digit
   unless the pattern ended first.  Uses the enclosing function's `p',
   `pend' and `c'.  */
#define GET_UNSIGNED_NUMBER(num) \
  { if (p != pend) \
     { \
       PATFETCH (c); \
       while (ISDIGIT (c)) \
         { \
           if (num < 0) \
              num = 0; \
           num = num * 10 + c - '0'; \
           if (p == pend) \
              break; \
           PATFETCH (c); \
         } \
       } \
    }

#define CHAR_CLASS_MAX_LENGTH  6 /* Namely, `xdigit'.  */

/* True if STRING is one of the character class names recognised
   inside `[: :]'.  */
#define IS_CHAR_CLASS(string) \
   (STREQ (string, "alpha") || STREQ (string, "upper") \
    || STREQ (string, "lower") || STREQ (string, "digit") \
    || STREQ (string, "alnum") || STREQ (string, "xdigit") \
    || STREQ (string, "space") || STREQ (string, "print") \
    || STREQ (string, "punct") || STREQ (string, "graph") \
    || STREQ (string, "cntrl") || STREQ (string, "blank"))

/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
   Returns one of error codes defined in `regex.h', or zero for success.

   Assumes the `allocated' (and perhaps `buffer') and `translate'
   fields are set in BUFP on entry.

   If it succeeds, results are put in BUFP (if it returns an error, the
   contents of BUFP are undefined):
     `buffer' is the compiled pattern;
     `syntax' is set to SYNTAX;
     `used' is set to the length of the compiled pattern;
     `fastmap_accurate' is zero;
     `re_nsub' is the number of subexpressions in PATTERN;
     `not_bol' and `not_eol' are zero;

   The `fastmap' and `newline_anchor' fields are neither examined nor
   set.

   NOTE(review): `compile_stack.stack' is freed only on the success
   path at the bottom of this function.  Every early `return' of an
   error code, including the ones hidden inside PATFETCH, BUF_PUSH and
   GET_BUFFER_SPACE, leaves it allocated.  */

static reg_errcode_t
regex_compile (pattern, size, syntax, bufp)
     char *pattern;
     int size;
     reg_syntax_t syntax;
     struct re_pattern_buffer *bufp;
{
  /* We fetch characters from PATTERN here.  Even though PATTERN is
     `char *' (i.e., signed), we declare these variables as unsigned, so
     they can be reliably used as array indices.  */
  register unsigned char c, c1;

  /* A random temporary spot in PATTERN.  */
  char *p1;

  /* Points to the end of the buffer, where we should append.  */
  register unsigned char *b;

  /* Keeps track of unclosed groups.  */
  compile_stack_type compile_stack;

  /* Points to the current (ending) position in the pattern.  */
  char *p = pattern;
  char *pend = pattern + size;

  /* How to translate the characters in the pattern.  */
  char *translate = bufp->translate;

  /* Address of the count-byte of the most recently inserted `exactn'
     command.  This makes it possible to tell if a new exact-match
     character can be added to that command or if the character requires
     a new `exactn' command.  */
  unsigned char *pending_exact = 0;

  /* Address of start of the most recently finished expression.
     This tells, e.g., postfix * where to find the start of its
     operand.  Reset at the beginning of groups and alternatives.  */
  unsigned char *laststart = 0;

  /* Address of beginning of regexp, or inside of last group.  */
  unsigned char *begalt;

  /* Place in the uncompiled pattern (i.e., the {) to
     which to go back if the interval is invalid.  */
  char *beg_interval;

  /* Address of the place where a forward jump should go to the end of
     the containing expression.  Each alternative of an `or' -- except the
     last -- ends with a forward jump of this sort.  */
  unsigned char *fixup_alt_jump = 0;

  /* Counts open-groups as they are encountered.  Remembered for the
     matching close-group on the compile stack, so the same register
     number is put in the stop_memory as the start_memory.  */
  regnum_t regnum = 0;

#ifdef DEBUG
  DEBUG_PRINT1 ("\nCompiling pattern: ");
  if (debug)
    {
      unsigned debug_count;

      for (debug_count = 0; debug_count < size; debug_count++)
        printchar (pattern[debug_count]);
      putchar ('\n');
    }
#endif /* DEBUG */

  /* Initialize the compile stack.  */
  compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
  if (compile_stack.stack == NULL)
    return REG_ESPACE;

  compile_stack.size = INIT_COMPILE_STACK_SIZE;
  compile_stack.avail = 0;

  /* Initialize the pattern buffer.  */
  bufp->syntax = syntax;
  bufp->fastmap_accurate = 0;
  bufp->not_bol = bufp->not_eol = 0;

  /* Set `used' to zero, so that if we return an error, the pattern
     printer (for debugging) will think there's no pattern.  We reset it
     at the end.  */
  bufp->used = 0;

  /* Always count groups, whether or not bufp->no_sub is set.  */
  bufp->re_nsub = 0;

#if !defined (emacs) && !defined (SYNTAX_TABLE)
  /* Initialize the syntax table.  */
   init_syntax_once ();
#endif

  if (bufp->allocated == 0)
    {
      if (bufp->buffer)
        { /* If zero allocated, but buffer is non-null, try to realloc
             enough space.  This loses if buffer's address is bogus, but
             that is the user's responsibility.  */
          RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
        }
      else
        { /* Caller did not allocate a buffer.  Do it for them.  */
          bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
        }
      if (!bufp->buffer) return REG_ESPACE;

      bufp->allocated = INIT_BUF_SIZE;
    }

  begalt = b = bufp->buffer;

  /* Loop through the uncompiled pattern until we're at the end.  */
  while (p != pend)
    {
      PATFETCH (c);

      switch (c)
        {
        case '^':
          {
            if (   /* If at start of pattern, it's an operator.  */
                   p == pattern + 1
                   /* If context independent, it's an operator.  */
                || syntax & RE_CONTEXT_INDEP_ANCHORS
                   /* Otherwise, depends on what's come before.  */
                || at_begline_loc_p (pattern, p, syntax))
              BUF_PUSH (begline);
            else
              goto normal_char;
          }
          break;


        case '$':
          {
            if (   /* If at end of pattern, it's an operator.  */
                   p == pend
                   /* If context independent, it's an operator.  */
                || syntax & RE_CONTEXT_INDEP_ANCHORS
                   /* Otherwise, depends on what's next.  */
                || at_endline_loc_p (p, pend, syntax))
               BUF_PUSH (endline);
             else
               goto normal_char;
           }
           break;


        case '+':
        case '?':
          if ((syntax & RE_BK_PLUS_QM)
              || (syntax & RE_LIMITED_OPS))
            goto normal_char;
        handle_plus:
        case '*':
          /* If there is no previous pattern... */
          if (!laststart)
            {
              if (syntax & RE_CONTEXT_INVALID_OPS)
                return REG_BADRPT;
              else if (!(syntax & RE_CONTEXT_INDEP_OPS))
                goto normal_char;
            }

          {
            /* Are we optimizing this jump?  */
            boolean keep_string_p = false;

            /* 1 means zero (many) matches is allowed.  */
            char zero_times_ok = 0, many_times_ok = 0;

            /* If there is a sequence of repetition chars, collapse it
               down to just one (the right one).  We can't combine
               interval operators with these because of, e.g., `a{2}*',
               which should only match an even number of `a's.  */

            for (;;)
              {
                zero_times_ok |= c != '+';
                many_times_ok |= c != '?';

                if (p == pend)
                  break;

                PATFETCH (c);

                if (c == '*'
                    || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
                  ;

                else if (syntax & RE_BK_PLUS_QM  &&  c == '\\')
                  {
                    if (p == pend) return REG_EESCAPE;

                    PATFETCH (c1);
                    if (!(c1 == '+' || c1 == '?'))
                      {
                        PATUNFETCH;
                        PATUNFETCH;
                        break;
                      }

                    c = c1;
                  }
                else
                  {
                    PATUNFETCH;
                    break;
                  }

                /* If we get here, we found another repeat character.  */
               }

            /* Star, etc. applied to an empty pattern is equivalent
               to an empty pattern.  */
            if (!laststart)
              break;

            /* Now we know whether or not zero matches is allowed
               and also whether or not two or more matches is allowed.  */
            if (many_times_ok)
              { /* More than one repetition is allowed, so put in at the
                   end a backward relative jump from `b' to before the next
                   jump we're going to put in below (which jumps from
                   laststart to after this jump).

                   But if we are at the `*' in the exact sequence `.*\n',
                   insert an unconditional jump backwards to the .,
                   instead of the beginning of the loop.  This way we only
                   push a failure point once, instead of every time
                   through the loop.  */
                assert (p - 1 > pattern);

                /* Allocate the space for the jump.  */
                GET_BUFFER_SPACE (3);

                /* We know we are not at the first character of the pattern,
                   because laststart was nonzero.  And we've already
                   incremented `p', by the way, to be the character after
                   the `*'.  Do we have to do something analogous here
                   for null bytes, because of RE_DOT_NOT_NULL?  */
                if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
                    && zero_times_ok
                    && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
                    && !(syntax & RE_DOT_NEWLINE))
                  { /* We have .*\n.  */
                    STORE_JUMP (jump, b, laststart);
                    keep_string_p = true;
                  }
                else
                  /* Anything else.  */
                  STORE_JUMP (maybe_pop_jump, b, laststart - 3);

                /* We've added more stuff to the buffer.  */
                b += 3;
              }

            /* On failure, jump from laststart to b + 3, which will be the
               end of the buffer after this jump is inserted.  */
            GET_BUFFER_SPACE (3);
            INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
                                       : on_failure_jump,
                         laststart, b + 3);
            pending_exact = 0;
            b += 3;

            if (!zero_times_ok)
              {
                /* At least one repetition is required, so insert a
                   `dummy_failure_jump' before the initial
                   `on_failure_jump' instruction of the loop.  This
                   effects a skip over that instruction the first time
                   we hit that loop.  */
                GET_BUFFER_SPACE (3);
                INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
                b += 3;
              }
            }
          break;


        case '.':
          laststart = b;
          BUF_PUSH (anychar);
          break;


        case '[':
          {
            boolean had_char_class = false;

            if (p == pend) return REG_EBRACK;

            /* Ensure that we have enough space to push a charset: the
               opcode, the length count, and the bitset; 34 bytes in all.  */
            GET_BUFFER_SPACE (34);

            laststart = b;

            /* We test `*p == '^' twice, instead of using an if
               statement, so we only need one BUF_PUSH.  */
            BUF_PUSH (*p == '^' ? charset_not : charset);
            if (*p == '^')
              p++;

            /* Remember the first position in the bracket expression.  */
            p1 = p;

            /* Push the number of bytes in the bitmap.  */
            BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);

            /* Clear the whole map.  */
            bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);

            /* charset_not matches newline according to a syntax bit.  */
            if ((re_opcode_t) b[-2] == charset_not
                && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
              SET_LIST_BIT ('\n');

            /* Read in characters and ranges, setting map bits.  */
            for (;;)
              {
                if (p == pend) return REG_EBRACK;

                PATFETCH (c);

                /* \ might escape characters inside [...] and [^...].  */
                if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
                  {
                    if (p == pend) return REG_EESCAPE;

                    PATFETCH (c1);
                    SET_LIST_BIT (c1);
                    continue;
                  }

                /* Could be the end of the bracket expression.  If it's
                   not (i.e., when the bracket expression is `[]' so
                   far), the ']' character bit gets set way below.  */
                if (c == ']' && p != p1 + 1)
                  break;

                /* Look ahead to see if it's a range when the last thing
                   was a character class.
                   NOTE(review): this and the look-aheads below read
                   `*p' / `p[1]' without first checking `p' against
                   `pend'; confirm callers always pass a pattern with
                   readable bytes past SIZE (e.g. NUL-terminated).  */
                if (had_char_class && c == '-' && *p != ']')
                  return REG_ERANGE;

                /* Look ahead to see if it's a range when the last thing
                   was a character: if this is a hyphen not at the
                   beginning or the end of a list, then it's the range
                   operator.  */
                if (c == '-'
                    && !(p - 2 >= pattern && p[-2] == '[')
                    && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
                    && *p != ']')
                  {
                    reg_errcode_t ret
                      = compile_range (&p, pend, translate, syntax, b);
                    if (ret != REG_NOERROR) return ret;
                  }

                else if (p[0] == '-' && p[1] != ']')
                  { /* This handles ranges made up of characters only.  */
                    reg_errcode_t ret;

                    /* Move past the `-'.  */
                    PATFETCH (c1);

                    ret = compile_range (&p, pend, translate, syntax, b);
                    if (ret != REG_NOERROR) return ret;
                  }

                /* See if we're at the beginning of a possible character
                   class.  */

                else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
                  { /* Leave room for the null.  */
                    char str[CHAR_CLASS_MAX_LENGTH + 1];

                    PATFETCH (c);
                    c1 = 0;

                    /* If pattern is `[[:'.  */
                    if (p == pend) return REG_EBRACK;

                    for (;;)
                      {
                        PATFETCH (c);
                        if (c == ':' || c == ']' || p == pend
                            || c1 == CHAR_CLASS_MAX_LENGTH)
                          break;
                        str[c1++] = c;
                      }
                    str[c1] = '\0';

                    /* If it isn't a word bracketed by `[:' and `:]':
                       undo the ending character, the letters, and leave
                       the leading `:' and `[' (but set bits for them).  */
                    if (c == ':' && *p == ']')
                      {
                        int ch;
                        boolean is_alnum = STREQ (str, "alnum");
                        boolean is_alpha = STREQ (str, "alpha");
                        boolean is_blank = STREQ (str, "blank");
                        boolean is_cntrl = STREQ (str, "cntrl");
                        boolean is_digit = STREQ (str, "digit");
                        boolean is_graph = STREQ (str, "graph");
                        boolean is_lower = STREQ (str, "lower");
                        boolean is_print = STREQ (str, "print");
                        boolean is_punct = STREQ (str, "punct");
                        boolean is_space = STREQ (str, "space");
                        boolean is_upper = STREQ (str, "upper");
                        boolean is_xdigit = STREQ (str, "xdigit");

                        if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;

                        /* Throw away the ] at the end of the character
                           class.  */
                        PATFETCH (c);

                        if (p == pend) return REG_EBRACK;

                        for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
                          {
                            if (   (is_alnum  && ISALNUM (ch))
                                || (is_alpha  && ISALPHA (ch))
                                || (is_blank  && ISBLANK (ch))
                                || (is_cntrl  && ISCNTRL (ch))
                                || (is_digit  && ISDIGIT (ch))
                                || (is_graph  && ISGRAPH (ch))
                                || (is_lower  && ISLOWER (ch))
                                || (is_print  && ISPRINT (ch))
                                || (is_punct  && ISPUNCT (ch))
                                || (is_space  && ISSPACE (ch))
                                || (is_upper  && ISUPPER (ch))
                                || (is_xdigit && ISXDIGIT (ch)))
                            SET_LIST_BIT (ch);
                          }
                        had_char_class = true;
                      }
                    else
                      {
                        /* Back up over everything fetched after the
                           `[:' so it is re-read as ordinary list
                           characters.  */
                        c1++;
                        while (c1--)
                          PATUNFETCH;
                        SET_LIST_BIT ('[');
                        SET_LIST_BIT (':');
                        had_char_class = false;
                      }
                  }
                else
                  {
                    had_char_class = false;
                    SET_LIST_BIT (c);
                  }
              }

            /* Discard any (non)matching list bytes that are all 0 at the
               end of the map.  Decrease the map-length byte too.  */
            while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
              b[-1]--;
            b += b[-1];
          }
          break;


        case '(':
          if (syntax & RE_NO_BK_PARENS)
            goto handle_open;
          else
            goto normal_char;


        case ')':
          if (syntax & RE_NO_BK_PARENS)
            goto handle_close;
          else
            goto normal_char;


        case '\n':
          if (syntax & RE_NEWLINE_ALT)
            goto handle_alt;
          else
            goto normal_char;


        case '|':
          if (syntax & RE_NO_BK_VBAR)
            goto handle_alt;
          else
            goto normal_char;


        case '{':
           if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
             goto handle_interval;
           else
             goto normal_char;


        case '\\':
          if (p == pend) return REG_EESCAPE;

          /* Do not translate the character after the \, so that we can
             distinguish, e.g., \B from \b, even if we normally would
             translate, e.g., B to b.  */
          PATFETCH_RAW (c);

          switch (c)
            {
            case '(':
              if (syntax & RE_NO_BK_PARENS)
                goto normal_backslash;

            handle_open:
              bufp->re_nsub++;
              regnum++;

              if (COMPILE_STACK_FULL)
                {
                  RETALLOC (compile_stack.stack, compile_stack.size << 1,
                            compile_stack_elt_t);
                  if (compile_stack.stack == NULL) return REG_ESPACE;

                  compile_stack.size <<= 1;
                }

              /* These are the values to restore when we hit end of this
                 group.  They are all relative offsets, so that if the
                 whole pattern moves because of realloc, they will still
                 be valid.  (The fixup_alt_jump offset is stored plus
                 one so that zero can mean "none".)  */
              COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
              COMPILE_STACK_TOP.fixup_alt_jump
                = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
              COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
              COMPILE_STACK_TOP.regnum = regnum;

              /* We will eventually replace the 0 with the number of
                 groups inner to this one.  But do not push a
                 start_memory for groups beyond the last one we can
                 represent in the compiled pattern.  */
              if (regnum <= MAX_REGNUM)
                {
                  COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
                  BUF_PUSH_3 (start_memory, regnum, 0);
                }

              compile_stack.avail++;

              fixup_alt_jump = 0;
              laststart = 0;
              begalt = b;
              /* If we've reached MAX_REGNUM groups, then this open
                 won't actually generate any code, so we'll have to
                 clear pending_exact explicitly.  */
              pending_exact = 0;
              break;


            case ')':
              if (syntax & RE_NO_BK_PARENS) goto normal_backslash;

              if (COMPILE_STACK_EMPTY)
                {
                  if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
                    goto normal_backslash;
                  else
                    return REG_ERPAREN;
                }

            handle_close:
              if (fixup_alt_jump)
                { /* Push a dummy failure point at the end of the
                     alternative for a possible future
                     `pop_failure_jump' to pop.  See comments at
                     `push_dummy_failure' in `re_match_2'.  */
                  BUF_PUSH (push_dummy_failure);

                  /* We allocated space for this jump when we assigned
                     to `fixup_alt_jump', in the `handle_alt' case below.  */
                  STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
                }

              /* See similar code for backslashed left paren above.  */
              if (COMPILE_STACK_EMPTY)
                {
                  if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
                    goto normal_char;
                  else
                    return REG_ERPAREN;
                }

              /* Since we just checked for an empty stack above, this
                 ``can't happen''.  */
              assert (compile_stack.avail != 0);
              {
                /* We don't just want to restore into `regnum', because
                   later groups should continue to be numbered higher,
                   as in `(ab)c(de)' -- the second group is #2.  */
                regnum_t this_group_regnum;

                compile_stack.avail--;
                begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
                fixup_alt_jump
                  = COMPILE_STACK_TOP.fixup_alt_jump
                    ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
                    : 0;
                laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
                this_group_regnum = COMPILE_STACK_TOP.regnum;
                /* If we've reached MAX_REGNUM groups, then this close
                   won't actually generate any code, so we'll have to
                   clear pending_exact explicitly.  */
                pending_exact = 0;

                /* We're at the end of the group, so now we know how many
                   groups were inside this one.  */
                if (this_group_regnum <= MAX_REGNUM)
                  {
                    unsigned char *inner_group_loc
                      = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;

                    *inner_group_loc = regnum - this_group_regnum;
                    BUF_PUSH_3 (stop_memory, this_group_regnum,
                                regnum - this_group_regnum);
                  }
              }
              break;


            case '|':					/* `\|'.  */
              if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
                goto normal_backslash;
            handle_alt:
              if (syntax & RE_LIMITED_OPS)
                goto normal_char;

              /* Insert before the previous alternative a jump which
                 jumps to this alternative if the former fails.  */
              GET_BUFFER_SPACE (3);
              INSERT_JUMP (on_failure_jump, begalt, b + 6);
              pending_exact = 0;
              b += 3;

              /* The alternative before this one has a jump after it
                 which gets executed if it gets matched.  Adjust that
                 jump so it will jump to this alternative's analogous
                 jump (put in below, which in turn will jump to the next
                 (if any) alternative's such jump, etc.).  The last such
                 jump jumps to the correct final destination.  A picture:
                          _____ _____
                          |   | |   |
                          |   v |   v
                         a | b   | c

                 If we are at `b', then fixup_alt_jump right now points to a
                 three-byte space after `a'.  We'll put in the jump, set
                 fixup_alt_jump to right after `b', and leave behind three
                 bytes which we'll fill in when we get to after `c'.  */

              if (fixup_alt_jump)
                STORE_JUMP (jump_past_alt, fixup_alt_jump, b);

              /* Mark and leave space for a jump after this alternative,
                 to be filled in later either by next alternative or
                 when know we're at the end of a series of alternatives.  */
              fixup_alt_jump = b;
              GET_BUFFER_SPACE (3);
              b += 3;

              laststart = 0;
              begalt = b;
              break;


            case '{':
              /* If \{ is a literal.  */
              if (!(syntax & RE_INTERVALS)
                     /* If we're at `\{' and it's not the open-interval
                        operator.  */
                  || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
                  || (p - 2 == pattern  &&  p == pend))
                goto normal_backslash;

            handle_interval:
              {
                /* If got here, then the syntax allows intervals.  */

                /* At least (most) this many matches must be made.  */
                int lower_bound = -1, upper_bound = -1;

                beg_interval = p - 1;

                if (p == pend)
                  {
                    if (syntax & RE_NO_BK_BRACES)
                      goto unfetch_interval;
                    else
                      return REG_EBRACE;
                  }

                GET_UNSIGNED_NUMBER (lower_bound);

                if (c == ',')
                  {
                    GET_UNSIGNED_NUMBER (upper_bound);
                    if (upper_bound < 0) upper_bound = RE_DUP_MAX;
                  }
                else
                  /* Interval such as `{1}' => match exactly once. */
                  upper_bound = lower_bound;

                if (lower_bound < 0 || upper_bound > RE_DUP_MAX
                    || lower_bound > upper_bound)
                  {
                    if (syntax & RE_NO_BK_BRACES)
                      goto unfetch_interval;
                    else
                      return REG_BADBR;
                  }

                if (!(syntax & RE_NO_BK_BRACES))
                  {
                    if (c != '\\') return REG_EBRACE;

                    PATFETCH (c);
                  }

                if (c != '}')
                  {
                    if (syntax & RE_NO_BK_BRACES)
                      goto unfetch_interval;
                    else
                      return REG_BADBR;
                  }

                /* We just parsed a valid interval.  */

                /* If it's invalid to have no preceding re.  */
                if (!laststart)
                  {
                    if (syntax & RE_CONTEXT_INVALID_OPS)
                      return REG_BADRPT;
                    else if (syntax & RE_CONTEXT_INDEP_OPS)
                      laststart = b;
                    else
                      goto unfetch_interval;
                  }

                /* If the upper bound is zero, don't want to succeed at
                   all; jump from `laststart' to `b + 3', which will be
                   the end of the buffer after we insert the jump.  */
                 if (upper_bound == 0)
                   {
                     GET_BUFFER_SPACE (3);
                     INSERT_JUMP (jump, laststart, b + 3);
                     b += 3;
                   }

                 /* Otherwise, we have a nontrivial interval.  When
                    we're all done, the pattern will look like:
                      set_number_at (jump_n count)    (upper_bound - 1)
                      set_number_at (succeed_n count) (lower_bound)
                      succeed_n     (addr after jump_n) (lower_bound)
                      ...body of loop...
                      jump_n        (succeed_n addr)  (upper_bound - 1)
                    (The upper bound and `jump_n' are omitted if
                    `upper_bound' is 1, though.)  */
                 else
                   { /* If the upper bound is > 1, we need to insert
                        more at the end of the loop.  */
                     unsigned nbytes = 10 + (upper_bound > 1) * 10;

                     GET_BUFFER_SPACE (nbytes);

                     /* Initialize lower bound of the `succeed_n', even
                        though it will be set during matching by its
                        attendant `set_number_at' (inserted next),
                        because `re_compile_fastmap' needs to know.
                        Jump to the `jump_n' we might insert below.  */
                     INSERT_JUMP2 (succeed_n, laststart,
                                   b + 5 + (upper_bound > 1) * 5,
                                   lower_bound);
                     b += 5;

                     /* Code to initialize the lower bound.  Insert
                        before the `succeed_n'.  The `5' is the last two
                        bytes of this `set_number_at', plus 3 bytes of
                        the following `succeed_n'.  */
                     insert_op2 (set_number_at, laststart, 5, lower_bound, b);
                     b += 5;

                     if (upper_bound > 1)
                       { /* More than one repetition is allowed, so
                            append a backward jump to the `succeed_n'
                            that starts this interval.

                            When we've reached this during matching,
                            we'll have matched the interval once, so
                            jump back only `upper_bound - 1' times.  */
                         STORE_JUMP2 (jump_n, b, laststart + 5,
                                      upper_bound - 1);
                         b += 5;

                         /* The location we want to set is the second
                            parameter of the `jump_n'; that is `b-2' as
                            an absolute address.  `laststart' will be
                            the `set_number_at' we're about to insert;
                            `laststart+3' the number to set, the source
                            for the relative address.  But we are
                            inserting into the middle of the pattern --
                            so everything is getting moved up by 5.
                            Conclusion: (b - 2) - (laststart + 3) + 5,
                            i.e., b - laststart.

                            We insert this at the beginning of the loop
                            so that if we fail during matching, we'll
                            reinitialize the bounds.  */
                         insert_op2 (set_number_at, laststart, b - laststart,
                                     upper_bound - 1, b);
                         b += 5;
                       }
                   }
                pending_exact = 0;
                beg_interval = NULL;
              }
              break;

            unfetch_interval:
              /* If an invalid interval, match the characters as literals.  */
               assert (beg_interval);
               p = beg_interval;
               beg_interval = NULL;

               /* normal_char and normal_backslash need `c'.  */
               PATFETCH (c);

               if (!(syntax & RE_NO_BK_BRACES))
                 {
                   if (p > pattern  &&  p[-1] == '\\')
                     goto normal_backslash;
                 }
               goto normal_char;

#ifdef emacs
            /* There is no way to specify the before_dot and after_dot
               operators.  rms says this is ok.  --karl  */
            case '=':
              BUF_PUSH (at_dot);
              break;

            case 's':
              laststart = b;
              PATFETCH (c);
              BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
              break;

            case 'S':
              laststart = b;
              PATFETCH (c);
              BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
              break;
#endif /* emacs */


            case 'w':
              laststart = b;
              BUF_PUSH (wordchar);
              break;


            case 'W':
              laststart = b;
              BUF_PUSH (notwordchar);
              break;


            case '<':
              BUF_PUSH (wordbeg);
              break;

            case '>':
              BUF_PUSH (wordend);
              break;

            case 'b':
              BUF_PUSH (wordbound);
              break;

            case 'B':
              BUF_PUSH (notwordbound);
              break;

            case '`':
              BUF_PUSH (begbuf);
              break;

            case '\'':
              BUF_PUSH (endbuf);
              break;

            case '1': case '2': case '3': case '4': case '5':
            case '6': case '7': case '8': case '9':
              if (syntax & RE_NO_BK_REFS)
                goto normal_char;

              c1 = c - '0';

              if (c1 > regnum)
                return REG_ESUBREG;

              /* Can't back reference to a subexpression if inside of it.  */
              if (group_in_compile_stack (compile_stack, c1))
                goto normal_char;

              laststart = b;
              BUF_PUSH_2 (duplicate, c1);
              break;


            case '+':
            case '?':
              if (syntax & RE_BK_PLUS_QM)
                goto handle_plus;
              else
                goto normal_backslash;

            default:
            normal_backslash:
              /* You might think it would be useful for \ to mean
                 not to translate; but if we don't translate it
                 it will never match anything.  */
              c = TRANSLATE (c);
              goto normal_char;
            }
          break;


        default:
        /* Expects the character in `c'.  */
        normal_char:
              /* If no exactn currently being built.  */
          if (!pending_exact

              /* If last exactn not at current position.  */
              || pending_exact + *pending_exact + 1 != b

              /* We have only one byte following the exactn for the count.  */
              || *pending_exact == (1 << BYTEWIDTH) - 1

              /* If followed by a repetition operator.  */
              || *p == '*' || *p == '^'
              || ((syntax & RE_BK_PLUS_QM)
                  ? *p == '\\' && (p[1] == '+' || p[1] == '?')
                  : (*p == '+' || *p == '?'))
              || ((syntax & RE_INTERVALS)
                  && ((syntax & RE_NO_BK_BRACES)
                      ? *p == '{'
                      : (p[0] == '\\' && p[1] == '{'))))
            {
              /* Start building a new exactn.  */

              laststart = b;

              BUF_PUSH_2 (exactn, 0);
              pending_exact = b - 1;
            }

          BUF_PUSH (c);
          (*pending_exact)++;
          break;
        } /* switch (c) */
    } /* while p != pend */


  /* Through the pattern now.  */

  if (fixup_alt_jump)
    STORE_JUMP (jump_past_alt, fixup_alt_jump, b);

  if (!COMPILE_STACK_EMPTY)
    return REG_EPAREN;

  free (compile_stack.stack);

  /* We have succeeded; set the length of the buffer.  */
  bufp->used = b - bufp->buffer;

#ifdef DEBUG
  if (debug)
    {
      DEBUG_PRINT1 ("\nCompiled pattern: ");
      print_compiled_pattern (bufp);
    }
#endif /* DEBUG */

  return REG_NOERROR;
} /* regex_compile */

/* Subroutines for `regex_compile'.  */

/* Store OP at LOC followed by two-byte integer parameter ARG.
   Overwrites three bytes starting at LOC.  */

static void
store_op1 (op, loc, arg)
    re_opcode_t op;
    unsigned char *loc;
    int arg;
{
  *loc = (unsigned char) op;
  STORE_NUMBER (loc + 1, arg);
}


/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2.
   Overwrites five bytes starting at LOC.  */

static void
store_op2 (op, loc, arg1, arg2)
    re_opcode_t op;
    unsigned char *loc;
    int arg1, arg2;
{
  *loc = (unsigned char) op;
  STORE_NUMBER (loc + 1, arg1);
  STORE_NUMBER (loc + 3, arg2);
}


/* Copy the bytes from LOC to END to open up three bytes of space at LOC
   for OP followed by two-byte integer parameter ARG.  The copy runs
   from the high end downwards so the overlapping regions are moved
   safely; the three bytes past END must already be available.  */

static void
insert_op1 (op, loc, arg, end)
    re_opcode_t op;
    unsigned char *loc;
    int arg;
    unsigned char *end;
{
  register unsigned char *pfrom = end;
  register unsigned char *pto = end + 3;

  while (pfrom != loc)
    *--pto = *--pfrom;

  store_op1 (op, loc, arg);
}


/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2;
   opens up five bytes at LOC instead of three.  */

static void
insert_op2 (op, loc, arg1, arg2, end)
    re_opcode_t op;
    unsigned char *loc;
    int arg1, arg2;
    unsigned char *end;
{
  register unsigned char *pfrom = end;
  register unsigned char *pto = end + 5;

  while (pfrom != loc)
    *--pto = *--pfrom;

  store_op2 (op, loc, arg1, arg2);
}


/* P points to just after a ^ in PATTERN.  Return true if that ^ comes
   after an alternative or a begin-subexpression.  We assume there is at
   least one character before the ^.
*/ static boolean at_begline_loc_p (pattern, p, syntax) char *pattern, *p; reg_syntax_t syntax; { char *prev = p - 2; boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; return /* After a subexpression? */ (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) /* After an alternative? */ || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); } /* The dual of at_begline_loc_p. This one is for $. We assume there is at least one character after the $, i.e., `P < PEND'. */ static boolean at_endline_loc_p (p, pend, syntax) char *p, *pend; int syntax; { char *next = p; boolean next_backslash = *next == '\\'; char *next_next = p + 1 < pend ? p + 1 : NULL; return /* Before a subexpression? */ (syntax & RE_NO_BK_PARENS ? *next == ')' : next_backslash && next_next && *next_next == ')') /* Before an alternative? */ || (syntax & RE_NO_BK_VBAR ? *next == '|' : next_backslash && next_next && *next_next == '|'); } /* Returns true if REGNUM is in one of COMPILE_STACK's elements and false if it's not. */ static boolean group_in_compile_stack (compile_stack, regnum) compile_stack_type compile_stack; regnum_t regnum; { int this_element; for (this_element = compile_stack.avail - 1; this_element >= 0; this_element--) if (compile_stack.stack[this_element].regnum == regnum) return true; return false; } /* Read the ending character of a range (in a bracket expression) from the uncompiled pattern *P_PTR (which ends at PEND). We assume the starting character is in `P[-2]'. (`P[-1]' is the character `-'.) Then we set the translation of all bits between the starting and ending characters (inclusive) in the compiled pattern B. Return an error code. We use these short variable names so we can use the same macros as `regex_compile' itself. 
*/ static reg_errcode_t compile_range (p_ptr, pend, translate, syntax, b) char **p_ptr, *pend; char *translate; reg_syntax_t syntax; unsigned char *b; { unsigned this_char; char *p = *p_ptr; int range_start, range_end; if (p == pend) return REG_ERANGE; /* Even though the pattern is a signed `char *', we need to fetch with unsigned char *'s; if the high bit of the pattern character is set, the range endpoints will be negative if we fetch using a signed char *. We also want to fetch the endpoints without translating them; the appropriate translation is done in the bit-setting loop below. */ range_start = ((unsigned char *) p)[-2]; range_end = ((unsigned char *) p)[0]; /* Have to increment the pointer into the pattern string, so the caller isn't still at the ending character. */ (*p_ptr)++; /* If the start is after the end, the range is empty. */ if (range_start > range_end) return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; /* Here we see why `this_char' has to be larger than an `unsigned char' -- the range is inclusive, so if `range_end' == 0xff (assuming 8-bit characters), we would otherwise go into an infinite loop, since all characters <= 0xff. */ for (this_char = range_start; this_char <= range_end; this_char++) { SET_LIST_BIT (TRANSLATE (this_char)); } return REG_NOERROR; } /* Failure stack declarations and macros; both re_compile_fastmap and re_match_2 use a failure stack. These have to be macros because of REGEX_ALLOCATE. */ /* Number of failure points for which to initially allocate space when matching. If this number is exceeded, we allocate more space, so it is not a hard limit. */ #ifndef INIT_FAILURE_ALLOC #define INIT_FAILURE_ALLOC 5 #endif /* Roughly the maximum number of failure points on the stack. Would be exactly that if always used MAX_FAILURE_SPACE each time we failed. This is a variable only so users of regex can assign to it; we never change it ourselves. 
*/
int re_max_failures = 2000;

/* Type of one failure-stack slot.  */
typedef unsigned char *fail_stack_elt_t;

/* A growable stack of failure-stack slots.  */
typedef struct
{
  fail_stack_elt_t *stack;      /* Storage for the slots.  */
  unsigned size;                /* Number of slots allocated in `stack'.  */
  unsigned avail;               /* Offset of next open position.  */
} fail_stack_type;

/* Convenience tests; all of them assume a variable named `fail_stack'
   (or, for the PTR variant, a pointer named `fail_stack_ptr') is in
   scope at the point of use.  FAIL_STACK_TOP indexes the slot at
   `avail', i.e. the next open position.  */
#define FAIL_STACK_EMPTY()     (fail_stack.avail == 0)
#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
#define FAIL_STACK_FULL()      (fail_stack.avail == fail_stack.size)
#define FAIL_STACK_TOP()       (fail_stack.stack[fail_stack.avail])


/* Initialize `fail_stack'.  Do `return -2' if the alloc fails.
   Allocates INIT_FAILURE_ALLOC slots with REGEX_ALLOCATE and marks the
   stack empty.  Because of the embedded `return', this may only be
   used inside a function whose return type can carry -2.  */

#define INIT_FAIL_STACK()                                               \
  do {                                                                  \
    fail_stack.stack = (fail_stack_elt_t *)                             \
      REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
                                                                        \
    if (fail_stack.stack == NULL)                                       \
      return -2;                                                        \
                                                                        \
    fail_stack.size = INIT_FAILURE_ALLOC;                               \
    fail_stack.avail = 0;                                               \
  } while (0)


/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE requires `destination' be declared.

   Note that the result of REGEX_REALLOCATE is assigned straight back
   to `stack', so after a failed reallocation `stack' is NULL.
   MAX_FAILURE_ITEMS refers to `num_regs', which must therefore also be
   in scope where this macro is expanded.  */

#define DOUBLE_FAIL_STACK(fail_stack)                                   \
  ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS              \
   ? 0                                                                  \
   : ((fail_stack).stack = (fail_stack_elt_t *)                         \
        REGEX_REALLOCATE ((fail_stack).stack,                           \
          (fail_stack).size * sizeof (fail_stack_elt_t),                \
          ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)),        \
                                                                        \
      (fail_stack).stack == NULL                                        \
      ? 0                                                               \
      : ((fail_stack).size <<= 1,                                       \
         1)))


/* Push PATTERN_OP on FAIL_STACK.

   Return 1 if was able to do so and 0 if ran out of memory allocating
   space to do so.  The fullness test uses FAIL_STACK_FULL, which looks
   at the variable named `fail_stack' regardless of the FAIL_STACK
   argument.  */
#define PUSH_PATTERN_OP(pattern_op, fail_stack)                         \
  ((FAIL_STACK_FULL ()                                                  \
    && !DOUBLE_FAIL_STACK (fail_stack))                                 \
    ? 0                                                                 \
    : ((fail_stack).stack[(fail_stack).avail++] = pattern_op,           \
       1))

/* This pushes an item onto the failure stack.  Must be a four-byte
   value.  Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.
*/
/* The item is converted to `unsigned long' and then to the slot type.
   No capacity check is made here; the caller must already have made
   room.  */
#define PUSH_FAILURE_ITEM(item)                                         \
  fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) ((unsigned long) item)

/* The complement operation.  Assumes `fail_stack' is nonempty.  */
#define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail]

/* Used to omit pushing failure point id's when we're not debugging.  */
#ifdef DEBUG
#define DEBUG_PUSH PUSH_FAILURE_ITEM
#define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
#else
#define DEBUG_PUSH(item)
#define DEBUG_POP(item_addr)
#endif


/* Push the information about the state we will need
   if we ever fail back to it.

   Requires variables fail_stack, regstart, regend, reg_info, and
   num_regs be declared.  DOUBLE_FAIL_STACK requires `destination' be
   declared.

   Does `return FAILURE_CODE' if runs out of memory.

   Items are pushed in this order: for every register from
   lowest_active_reg up to highest_active_reg its regstart, regend and
   reg_info word; then lowest_active_reg, highest_active_reg,
   PATTERN_PLACE and STRING_PLACE; and, when DEBUG is defined, the
   failure id.  The stack is doubled as often as needed beforehand so
   the pushes themselves never check for space.  */

#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code)   \
  do {                                                                  \
    char *destination;                                                  \
    /* Must be int, so when we don't save any registers, the arithmetic \
       of 0 + -1 isn't done as unsigned.  */                            \
    int this_reg;                                                       \
                                                                        \
    DEBUG_STATEMENT (failure_id++);                                     \
    DEBUG_STATEMENT (nfailure_points_pushed++);                         \
    DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id);           \
    DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
    DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
                                                                        \
    DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS);            \
    DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS);           \
                                                                        \
    /* Ensure we have enough space allocated for what we will push.  */ \
    while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS)                   \
      {                                                                 \
        if (!DOUBLE_FAIL_STACK (fail_stack))                            \
          return failure_code;                                          \
                                                                        \
        DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n",               \
                       (fail_stack).size);                              \
        DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
      }                                                                 \
                                                                        \
    /* Push the info, starting with the registers.  */                  \
    DEBUG_PRINT1 ("\n");                                                \
                                                                        \
    for (this_reg = lowest_active_reg; this_reg <= highest_active_reg;  \
         this_reg++)                                                    \
      {                                                                 \
        DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg);                  \
        DEBUG_STATEMENT (num_regs_pushed++);                            \
                                                                        \
        DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]);            \
        PUSH_FAILURE_ITEM (regstart[this_reg]);                         \
                                                                        \
        DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]);                \
        PUSH_FAILURE_ITEM (regend[this_reg]);                           \
                                                                        \
        DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]);            \
        DEBUG_PRINT2 (" match_null=%d",                                 \
                      REG_MATCH_NULL_STRING_P (reg_info[this_reg]));    \
        DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg]));    \
        DEBUG_PRINT2 (" matched_something=%d",                          \
                      MATCHED_SOMETHING (reg_info[this_reg]));          \
        DEBUG_PRINT2 (" ever_matched=%d",                               \
                      EVER_MATCHED_SOMETHING (reg_info[this_reg]));     \
        DEBUG_PRINT1 ("\n");                                            \
        PUSH_FAILURE_ITEM (reg_info[this_reg].word);                    \
      }                                                                 \
                                                                        \
    DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
    PUSH_FAILURE_ITEM (lowest_active_reg);                              \
                                                                        \
    DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
    PUSH_FAILURE_ITEM (highest_active_reg);                             \
                                                                        \
    DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place);            \
    DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend);           \
    PUSH_FAILURE_ITEM (pattern_place);                                  \
                                                                        \
    DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place);             \
    DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2,   \
                                 size2);                                \
    DEBUG_PRINT1 ("'\n");                                               \
    PUSH_FAILURE_ITEM (string_place);                                   \
                                                                        \
    DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id);             \
    DEBUG_PUSH (failure_id);                                            \
  } while (0)

/* This is the number of items that are pushed and popped on the stack
   for each register.  */
#define NUM_REG_ITEMS  3

/* Individual items aside from the registers.  */
#ifdef DEBUG
#define NUM_NONREG_ITEMS 5 /* Includes failure point id.  */
#else
#define NUM_NONREG_ITEMS 4
#endif

/* We push at most this many items on the stack.  */
#define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)

/* We actually push this many items.
*/
#define NUM_FAILURE_ITEMS                                               \
  ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS         \
    + NUM_NONREG_ITEMS)

/* How many items can still be added to the stack without overflowing it.  */
#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)


/* Pops what PUSH_FAILURE_POINT pushes.

   We restore into the parameters, all of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
     LOW_REG, HIGH_REG -- the lowest and highest active registers.
     REGSTART, REGEND -- arrays of string positions.
     REG_INFO -- array of information about each subexpression.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.

   Items come off in the reverse of the push order: failure id (DEBUG
   only), string position, pattern position, HIGH_REG, LOW_REG, and
   then for each register from HIGH_REG down to LOW_REG its info word,
   end and start.  */

#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
{                                                                       \
  DEBUG_STATEMENT (fail_stack_elt_t failure_id;)                        \
  int this_reg;                                                         \
  unsigned char *string_temp;                                           \
                                                                        \
  assert (!FAIL_STACK_EMPTY ());                                        \
                                                                        \
  /* Remove failure points and point to how many regs pushed.  */       \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");                                \
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail);     \
  DEBUG_PRINT2 (" size: %d\n", fail_stack.size);                        \
                                                                        \
  assert (fail_stack.avail >= NUM_NONREG_ITEMS);                        \
                                                                        \
  DEBUG_POP (&failure_id);                                              \
  DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id);               \
                                                                        \
  /* If the saved string location is NULL, it came from an              \
     on_failure_keep_string_jump opcode, and we want to throw away the  \
     saved NULL, thus retaining our current position in the string.  */ \
  string_temp = POP_FAILURE_ITEM ();                                    \
  if (string_temp != NULL)                                              \
    str = (char *) string_temp;                                         \
                                                                        \
  DEBUG_PRINT2 (" Popping string 0x%x: `", str);                        \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);      \
  DEBUG_PRINT1 ("'\n");                                                 \
                                                                        \
  pat = (unsigned char *) POP_FAILURE_ITEM ();                          \
  DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat);                        \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);                       \
                                                                        \
  /* Restore register info.  */                                         \
  high_reg = (unsigned long) POP_FAILURE_ITEM ();                       \
  DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg);            \
                                                                        \
  low_reg = (unsigned long) POP_FAILURE_ITEM ();                        \
  DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg);              \
                                                                        \
  for (this_reg = high_reg; this_reg >= low_reg; this_reg--)            \
    {                                                                   \
      DEBUG_PRINT2 (" Popping reg: %d\n", this_reg);                    \
                                                                        \
      reg_info[this_reg].word = POP_FAILURE_ITEM ();                    \
      DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]);               \
                                                                        \
      regend[this_reg] = (char *) POP_FAILURE_ITEM ();                  \
      DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]);                  \
                                                                        \
      regstart[this_reg] = (char *) POP_FAILURE_ITEM ();                \
      DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]);              \
    }                                                                   \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_popped++);                           \
} /* POP_FAILURE_POINT */

/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
   BUFP.  A fastmap records which of the (1 << BYTEWIDTH) possible
   characters can start a string that matches the pattern.  This fastmap
   is used by re_search to skip quickly over impossible starting points.

   The caller must supply the address of a (1 << BYTEWIDTH)-byte data
   area as BUFP->fastmap.

   We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
   the pattern buffer.

   Returns 0 if we succeed, -2 if an internal error.

   The compiled pattern is walked one path at a time.  Every
   on_failure_jump style opcode pushes its alternative target on a
   local failure stack; a path is abandoned as soon as it reaches an
   opcode that consumes a character, and the next pending alternative
   is then popped and explored.

   NOTE(review): no return path releases `fail_stack.stack'.  If
   REGEX_ALLOCATE is heap-backed (as FREE_VARIABLES suggests for
   REGEX_MALLOC builds) this presumably leaks per call -- confirm.  */

int
re_compile_fastmap (bufp)
     struct re_pattern_buffer *bufp;
{
  int j, k;
  fail_stack_type fail_stack;
#ifndef REGEX_MALLOC
  char *destination;
#endif
  /* We don't push any register information onto the failure stack.  */
  unsigned num_regs = 0;

  register char *fastmap = bufp->fastmap;
  unsigned char *pattern = bufp->buffer;
  unsigned long size = bufp->used;
  unsigned char *p = pattern;
  register unsigned char *pend = pattern + size;

  /* Assume that each path through the pattern can be null until
     proven otherwise.  We set this false at the bottom of switch
     statement, to which we get only if a particular path doesn't
     match the empty string.  */
  boolean path_can_be_null = true;

  /* We aren't doing a `succeed_n' to begin with.  */
  boolean succeed_n_p = false;

  assert (fastmap != NULL && p != NULL);

  INIT_FAIL_STACK ();
  bzero (fastmap, 1 << BYTEWIDTH);  /* Assume nothing's valid.  */
  bufp->fastmap_accurate = 1;       /* It will be when we're done.  */
  bufp->can_be_null = 0;

  while (p != pend || !FAIL_STACK_EMPTY ())
    {
      if (p == pend)
        {
          /* The current path ran to the end of the pattern (or was cut
             short below); record its nullability and resume at the
             most recently saved alternative.  */
          bufp->can_be_null |= path_can_be_null;

          /* Reset for next path.  */
          path_can_be_null = true;

          p = fail_stack.stack[--fail_stack.avail];
        }

      /* We should never be about to go beyond the end of the pattern.  */
      assert (p < pend);

#ifdef SWITCH_ENUM_BUG
      switch ((int) ((re_opcode_t) *p++))
#else
      switch ((re_opcode_t) *p++)
#endif
        {

        /* I guess the idea here is to simply not bother with a fastmap
           if a backreference is used, since it's too hard to figure out
           the fastmap for the corresponding group.  Setting
           `can_be_null' stops `re_search_2' from using the fastmap, so
           that is all we do.  */
        case duplicate:
          bufp->can_be_null = 1;
          return 0;


      /* Following are the cases which match a character.  These end
         with `break'.  */

        case exactn:
          /* p[0] is the count byte written after the opcode; p[1] is
             the first literal character.  */
          fastmap[p[1]] = 1;
          break;


        case charset:
          /* *p is the bitmap length in bytes; walk every bit in it.  */
          for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
            if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
              fastmap[j] = 1;
          break;


        case charset_not:
          /* Chars beyond end of map must be allowed.  */
          for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
            fastmap[j] = 1;

          for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
            if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
              fastmap[j] = 1;
          break;


        case wordchar:
          for (j = 0; j < (1 << BYTEWIDTH); j++)
            if (SYNTAX (j) == Sword)
              fastmap[j] = 1;
          break;


        case notwordchar:
          for (j = 0; j < (1 << BYTEWIDTH); j++)
            if (SYNTAX (j) != Sword)
              fastmap[j] = 1;
          break;


        case anychar:
          /* `.' matches anything ...  */
          for (j = 0; j < (1 << BYTEWIDTH); j++)
            fastmap[j] = 1;

          /* ... except perhaps newline.  */
          if (!(bufp->syntax & RE_DOT_NEWLINE))
            fastmap['\n'] = 0;

          /* Return if we have already set `can_be_null'; if we have,
             then the fastmap is irrelevant.  Something's wrong here.  */
          else if (bufp->can_be_null)
            return 0;

          /* Otherwise, have to check alternative paths.  */
          break;


#ifdef emacs
        case syntaxspec:
          k = *p++;
          for (j = 0; j < (1 << BYTEWIDTH); j++)
            if (SYNTAX (j) == (enum syntaxcode) k)
              fastmap[j] = 1;
          break;


        case notsyntaxspec:
          k = *p++;
          for (j = 0; j < (1 << BYTEWIDTH); j++)
            if (SYNTAX (j) != (enum syntaxcode) k)
              fastmap[j] = 1;
          break;


      /* All cases after this match the empty string.  These end with
         `continue'.  */


        case before_dot:
        case at_dot:
        case after_dot:
          continue;
#endif /* emacs */


        case no_op:
        case begline:
        case endline:
        case begbuf:
        case endbuf:
        case wordbound:
        case notwordbound:
        case wordbeg:
        case wordend:
        case push_dummy_failure:
          continue;


        case jump_n:
        case pop_failure_jump:
        case maybe_pop_jump:
        case jump:
        case jump_past_alt:
        case dummy_failure_jump:
          EXTRACT_NUMBER_AND_INCR (j, p);
          p += j;
          if (j > 0)
            continue;

          /* Jump backward implies we just went through the body of a
             loop and matched nothing.  Opcode jumped to should be
             `on_failure_jump' or `succeed_n'.  Just treat it like an
             ordinary jump.  For a * loop, it has pushed its failure
             point already; if so, discard that as redundant.  */
          if ((re_opcode_t) *p != on_failure_jump
              && (re_opcode_t) *p != succeed_n)
            continue;

          p++;
          EXTRACT_NUMBER_AND_INCR (j, p);
          p += j;

          /* If what's on the stack is where we are now, pop it.  */
          if (!FAIL_STACK_EMPTY ()
              && fail_stack.stack[fail_stack.avail - 1] == p)
            fail_stack.avail--;

          continue;


        case on_failure_jump:
        case on_failure_keep_string_jump:
        handle_on_failure_jump:
          EXTRACT_NUMBER_AND_INCR (j, p);

          /* For some patterns, e.g., `(a?)?', `p+j' here points to the
             end of the pattern.  We don't want to push such a point,
             since when we restore it above, entering the switch will
             increment `p' past the end of the pattern.  We don't need
             to push such a point since we obviously won't find any more
             fastmap entries beyond `pend'.  Such a pattern can match
             the null string, though.  */
          if (p + j < pend)
            {
              if (!PUSH_PATTERN_OP (p + j, fail_stack))
                return -2;
            }
          else
            bufp->can_be_null = 1;

          if (succeed_n_p)
            {
              EXTRACT_NUMBER_AND_INCR (k, p);   /* Skip the n.  */
              succeed_n_p = false;
            }

          continue;


        case succeed_n:
          /* Get to the number of times to succeed.  */
          p += 2;

          /* Increment p past the n for when k != 0.  */
          EXTRACT_NUMBER_AND_INCR (k, p);
          if (k == 0)
            {
              /* Rewind to just after the opcode so the shared code
                 reads the jump offset, then skips the count again.  */
              p -= 4;
              succeed_n_p = true;  /* Spaghetti code alert.  */
              goto handle_on_failure_jump;
            }
          continue;


        case set_number_at:
          p += 4;
          continue;


        case start_memory:
        case stop_memory:
          p += 2;
          continue;


        default:
          abort (); /* We have listed all the cases.  */
        } /* switch *p++ */

      /* Getting here means we have found the possible starting
         characters for one path of the pattern -- and that the empty
         string does not match.  We need not follow this path further.
         Instead, look at the next alternative (remembered on the
         stack), or quit if no more.  The test at the top of the loop
         does these things.  */
      path_can_be_null = false;
      p = pend;
    } /* while p */

  /* Set `can_be_null' for the last path (also the first path, if the
     pattern is empty).  */
  bufp->can_be_null |= path_can_be_null;

  return 0;
} /* re_compile_fastmap */

/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
   ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
   this memory for recording register information.  STARTS and ENDS
   must be allocated using the malloc library routine, and must each
   be at least NUM_REGS * sizeof (regoff_t) bytes long.

   If NUM_REGS == 0, then subsequent matches should allocate their own
   register data.

   Unless this function is called, the first search or match using
   PATTERN_BUFFER will allocate its own register data, without
   freeing the old data.
*/ void re_set_registers (bufp, regs, num_regs, starts, ends) struct re_pattern_buffer *bufp; struct re_registers *regs; unsigned num_regs; regoff_t *starts, *ends; { if (num_regs) { bufp->regs_allocated = REGS_REALLOCATE; regs->num_regs = num_regs; regs->start = starts; regs->end = ends; } else { bufp->regs_allocated = REGS_UNALLOCATED; regs->num_regs = 0; regs->start = regs->end = (regoff_t*) 0; } } /* Searching routines. */ /* Like re_search_2, below, but only one string is specified, and doesn't let you say where to stop matching. */ int re_search (bufp, string, size, startpos, range, regs) struct re_pattern_buffer *bufp; char *string; int size, startpos, range; struct re_registers *regs; { return re_search_2 (bufp, NULL, 0, string, size, startpos, range, regs, size); } /* Using the compiled pattern in BUFP->buffer, first tries to match the virtual concatenation of STRING1 and STRING2, starting first at index STARTPOS, then at STARTPOS + 1, and so on. STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. RANGE is how far to scan while trying to match. RANGE = 0 means try only at STARTPOS; in general, the last start tried is STARTPOS + RANGE. In REGS, return the indices of the virtual concatenation of STRING1 and STRING2 that matched the entire BUFP->buffer and its contained subexpressions. Do not consider matching one past the index STOP in the virtual concatenation of STRING1 and STRING2. We return either the position in the strings at which the match was found, -1 if no match, or -2 if error (such as failure stack overflow). */ int re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) struct re_pattern_buffer *bufp; char *string1, *string2; int size1, size2; int startpos; int range; struct re_registers *regs; int stop; { int val; register char *fastmap = bufp->fastmap; register char *translate = bufp->translate; int total_size = size1 + size2; int endpos = startpos + range; /* Check for out-of-range STARTPOS. 
*/ if (startpos < 0 || startpos > total_size) return -1; /* Fix up RANGE if it might eventually take us outside the virtual concatenation of STRING1 and STRING2. */ if (endpos < -1) range = -1 - startpos; else if (endpos > total_size) range = total_size - startpos; /* If the search isn't to be a backwards one, don't waste time in a search for a pattern that must be anchored. */ if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) { if (startpos > 0) return -1; else range = 1; } /* Update the fastmap now if not correct already. */ if (fastmap && !bufp->fastmap_accurate) if (re_compile_fastmap (bufp) == -2) return -2; /* Loop through the string, looking for a place to start matching. */ for (;;) { /* If a fastmap is supplied, skip quickly over characters that cannot be the start of a match. If the pattern can match the null string, however, we don't need to skip characters; we want the first null string. */ if (fastmap && startpos < total_size && !bufp->can_be_null) { if (range > 0) /* Searching forwards. */ { register char *d; register int lim = 0; int irange = range; if (startpos < size1 && startpos + range >= size1) lim = range - (size1 - startpos); d = (startpos >= size1 ? string2 - size1 : string1) + startpos; /* Written out as an if-else to avoid testing `translate' inside the loop. */ if (translate) while (range > lim && !fastmap[(unsigned char) translate[(unsigned char) *d++]]) range--; else while (range > lim && !fastmap[(unsigned char) *d++]) range--; startpos += irange - range; } else /* Searching backwards. */ { register char c = (size1 == 0 || startpos >= size1 ? string2[startpos - size1] : string1[startpos]); if (!fastmap[(unsigned char) TRANSLATE (c)]) goto advance; } } /* If can't match the null string, and that's all we have left, fail. 
*/ if (range >= 0 && startpos == total_size && fastmap && !bufp->can_be_null) return -1; val = re_match_2 (bufp, string1, size1, string2, size2, startpos, regs, stop); if (val >= 0) return startpos; if (val == -2) return -2; advance: if (!range) break; else if (range > 0) { range--; startpos++; } else { range++; startpos--; } } return -1; } /* re_search_2 */ /* Declarations and macros for re_match_2. */ static int bcmp_translate (); static boolean alt_match_null_string_p (), common_op_match_null_string_p (), group_match_null_string_p (); /* Structure for per-register (a.k.a. per-group) information. This must not be longer than one word, because we push this value onto the failure stack. Other register information, such as the starting and ending positions (which are addresses), and the list of inner groups (which is a bits list) are maintained in separate variables. We are making a (strictly speaking) nonportable assumption here: that the compiler will pack our bit fields into something that fits into the type of `word', i.e., is something that fits into one item on the failure stack. */ typedef union { fail_stack_elt_t word; struct { /* This field is one if this group can match the empty string, zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ #define MATCH_NULL_UNSET_VALUE 3 unsigned match_null_string_p : 2; unsigned is_active : 1; unsigned matched_something : 1; unsigned ever_matched_something : 1; } bits; } register_info_type; #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) #define IS_ACTIVE(R) ((R).bits.is_active) #define MATCHED_SOMETHING(R) ((R).bits.matched_something) #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) /* Call this when have matched a real character; it sets `matched' flags for the subexpressions which we are currently inside. Also records that those subexprs have matched. 
*/
/* Assumes `lowest_active_reg', `highest_active_reg' and `reg_info'
   are in scope.  */
#define SET_REGS_MATCHED()                                              \
  do                                                                    \
    {                                                                   \
      unsigned r;                                                       \
      for (r = lowest_active_reg; r <= highest_active_reg; r++)         \
        {                                                               \
          MATCHED_SOMETHING (reg_info[r])                               \
            = EVER_MATCHED_SOMETHING (reg_info[r])                      \
            = 1;                                                        \
        }                                                               \
    }                                                                   \
  while (0)


/* This converts PTR, a pointer into one of the search strings `string1'
   and `string2' into an offset from the beginning of that string.
   Offsets into `string2' are shifted by `size1', so the result indexes
   the virtual concatenation of the two strings.  */
#define POINTER_TO_OFFSET(ptr)                                          \
  (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1)

/* Registers are set to a sentinel when they haven't yet matched.  */
#define REG_UNSET_VALUE ((char *) -1)
#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)


/* Macros for dealing with the split strings in re_match_2.  */

/* True while `dend' is still the end of the first string's match
   region.  */
#define MATCHING_IN_FIRST_STRING  (dend == end_match_1)

/* Call before fetching a character with *d.  This switches over to
   string2 if necessary.  Jumps to the label `fail' when the data is
   exhausted, so it may only be used where that label is in scope.  */
#define PREFETCH()                                                      \
  while (d == dend)                                                     \
    {                                                                   \
      /* End of string2 => fail.  */                                    \
      if (dend == end_match_2)                                          \
        goto fail;                                                      \
      /* End of string1 => advance to string2.  */                      \
      d = string2;                                                      \
      dend = end_match_2;                                               \
    }


/* Test if at very beginning or at very end of the virtual concatenation
   of `string1' and `string2'.  If only one string, it's `string2'.
   Note that AT_STRINGS_BEG is also true whenever `size2' is zero,
   whatever D is.  */
#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
#define AT_STRINGS_END(d) ((d) == end2)


/* Test if D points to a character which is word-constituent.  We have
   two special cases to check for: if past the end of string1, look at
   the first character in string2; and if before the beginning of
   string2, look at the last character in string1.  */
#define WORDCHAR_P(d)                                                   \
  (SYNTAX ((d) == end1 ? *string2                                       \
           : (d) == string2 - 1 ? *(end1 - 1) : *(d))                   \
   == Sword)

/* Test if the character before D and the one at D differ with respect
   to being word-constituent.  The ends of the virtual concatenation
   always count as a boundary.  */
#define AT_WORD_BOUNDARY(d)                                             \
  (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)                             \
   || WORDCHAR_P (d - 1) != WORDCHAR_P (d))


/* Free everything we malloc.
*/
#ifdef REGEX_MALLOC
/* Frees VAR if it is non-null and then resets it to NULL.  This
   expands to two statements, so it is only safe where both end up in
   the same block, as inside FREE_VARIABLES below.  */
#define FREE_VAR(var) if (var) free (var); var = NULL
/* Releases the failure stack and every register array used by
   re_match_2; all of the named variables must be in scope.  */
#define FREE_VARIABLES()                                                \
  do {                                                                  \
    FREE_VAR (fail_stack.stack);                                        \
    FREE_VAR (regstart);                                                \
    FREE_VAR (regend);                                                  \
    FREE_VAR (old_regstart);                                            \
    FREE_VAR (old_regend);                                              \
    FREE_VAR (best_regstart);                                           \
    FREE_VAR (best_regend);                                             \
    FREE_VAR (reg_info);                                                \
    FREE_VAR (reg_dummy);                                               \
    FREE_VAR (reg_info_dummy);                                          \
  } while (0)
#else /* not REGEX_MALLOC */
/* Some MIPS systems (at least) want this to free alloca'd storage.  */
#define FREE_VARIABLES() alloca (0)
#endif /* not REGEX_MALLOC */


/* These values must meet several constraints.  They must not be valid
   register values; since we have a limit of 255 registers (because
   we use only one byte in the pattern for the register number), we can
   use numbers larger than 255.  They must differ by 1, because of
   NUM_FAILURE_ITEMS above.  And the value for the lowest register must
   be larger than the value for the highest register, so we do not try
   to actually save any registers when none are active.  */
#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)


/* Matching routines.  */

#ifndef emacs   /* Emacs never uses this.  */
/* re_match is like re_match_2 except it takes only a single string.
   The string is passed as re_match_2's second string, with STOP equal
   to SIZE.  */

int
re_match (bufp, string, size, pos, regs)
     struct re_pattern_buffer *bufp;
     char *string;
     int size, pos;
     struct re_registers *regs;
 {
  return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size);
}
#endif /* not emacs */


/* re_match_2 matches the compiled pattern in BUFP against the
   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
   and SIZE2, respectively).  We start matching at POS, and stop
   matching at STOP.

   If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
   store offsets for the substring each group matched in REGS.  See the
   documentation for exactly how many groups we fill.

   We return -1 if no match, -2 if an internal error (such as the
   failure stack overflowing).
Otherwise, we return the length of the matched substring. */ int re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) struct re_pattern_buffer *bufp; char *string1, *string2; int size1, size2; int pos; struct re_registers *regs; int stop; { /* General temporaries. */ int mcnt; unsigned char *p1; /* Just past the end of the corresponding string. */ char *end1, *end2; /* Pointers into string1 and string2, just past the last characters in each to consider matching. */ char *end_match_1, *end_match_2; /* Where we are in the data, and the end of the current string. */ char *d, *dend; /* Where we are in the pattern, and the end of the pattern. */ unsigned char *p = bufp->buffer; register unsigned char *pend = p + bufp->used; /* We use this to map every character in the string. */ char *translate = bufp->translate; /* Failure point stack. Each place that can handle a failure further down the line pushes a failure point on this stack. It consists of restart, regend, and reg_info for all registers corresponding to the subexpressions we're currently inside, plus the number of such registers, and, finally, two char *'s. The first char * is where to resume scanning the pattern; the second one is where to resume scanning the strings. If the latter is zero, the failure point is a ``dummy''; if a failure happens and the failure point is a dummy, it gets discarded and the next next one is tried. */ fail_stack_type fail_stack; #ifdef DEBUG static unsigned failure_id = 0; unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; #endif /* We fill all the registers internally, independent of what we return, for use in backreferences. The number here includes an element for register zero. */ unsigned num_regs = bufp->re_nsub + 1; /* The currently active registers. */ unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG; unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG; /* Information on the contents of registers. 
These are pointers into the input strings; they record just what was matched (on this attempt) by a subexpression part of the pattern, that is, the regnum-th regstart pointer points to where in the pattern we began matching and the regnum-th regend points to right after where we stopped matching the regnum-th subexpression. (The zeroth register keeps track of what the whole pattern matches.) */ char **regstart, **regend; /* If a group that's operated upon by a repetition operator fails to match anything, then the register for its start will need to be restored because it will have been set to wherever in the string we are when we last see its open-group operator. Similarly for a register's end. */ char **old_regstart, **old_regend; /* The is_active field of reg_info helps us keep track of which (possibly nested) subexpressions we are currently in. The matched_something field of reg_info[reg_num] helps us tell whether or not we have matched any of the pattern so far this time through the reg_num-th subexpression. These two fields get reset each time through any loop their register is in. */ register_info_type *reg_info; /* The following record the register info as found in the above variables when we find a match better than any we've seen before. This happens as we backtrack through the failure points, which in turn happens only if we have not yet matched the entire string. */ unsigned best_regs_set = false; char **best_regstart, **best_regend; /* Logically, this is `best_regend[0]'. But we don't want to have to allocate space for that if we're not allocating space for anything else (see below). Also, we never need info about register 0 for any of the other register vectors, and it seems rather a kludge to treat `best_regend' differently than the rest. So we keep track of the end of the best match so far in a separate variable. We initialize this to NULL so that when we backtrack the first time and need to test it, it's not garbage. 
*/ char *match_end = NULL; /* Used when we pop values we don't care about. */ char **reg_dummy; register_info_type *reg_info_dummy; #ifdef DEBUG /* Counts the total number of registers pushed. */ unsigned num_regs_pushed = 0; #endif DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); INIT_FAIL_STACK (); /* Do not bother to initialize all the register variables if there are no groups in the pattern, as it takes a fair amount of time. If there are groups, we include space for register 0 (the whole pattern), even though we never use it, since it simplifies the array indexing. We should fix this. */ if (bufp->re_nsub) { regstart = REGEX_TALLOC (num_regs, char *); regend = REGEX_TALLOC (num_regs, char *); old_regstart = REGEX_TALLOC (num_regs, char *); old_regend = REGEX_TALLOC (num_regs, char *); best_regstart = REGEX_TALLOC (num_regs, char *); best_regend = REGEX_TALLOC (num_regs, char *); reg_info = REGEX_TALLOC (num_regs, register_info_type); reg_dummy = REGEX_TALLOC (num_regs, char *); reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); if (!(regstart && regend && old_regstart && old_regend && reg_info && best_regstart && best_regend && reg_dummy && reg_info_dummy)) { FREE_VARIABLES (); return -2; } } #ifdef REGEX_MALLOC else { /* We must initialize all our variables to NULL, so that `FREE_VARIABLES' doesn't try to free them. */ regstart = regend = old_regstart = old_regend = best_regstart = best_regend = reg_dummy = NULL; reg_info = reg_info_dummy = (register_info_type *) NULL; } #endif /* REGEX_MALLOC */ /* The starting position is bogus. */ if (pos < 0 || pos > size1 + size2) { FREE_VARIABLES (); return -1; } /* Initialize subexpression text positions to -1 to mark ones that no start_memory/stop_memory has been seen for. Also initialize the register information struct. 
*/ for (mcnt = 1; mcnt < num_regs; mcnt++) { regstart[mcnt] = regend[mcnt] = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; IS_ACTIVE (reg_info[mcnt]) = 0; MATCHED_SOMETHING (reg_info[mcnt]) = 0; EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; } /* We move `string1' into `string2' if the latter's empty -- but not if `string1' is null. */ if (size2 == 0 && string1 != NULL) { string2 = string1; size2 = size1; string1 = 0; size1 = 0; } end1 = string1 + size1; end2 = string2 + size2; /* Compute where to stop matching, within the two strings. */ if (stop <= size1) { end_match_1 = string1 + stop; end_match_2 = string2; } else { end_match_1 = end1; end_match_2 = string2 + stop - size1; } /* `p' scans through the pattern as `d' scans through the data. `dend' is the end of the input string that `d' points within. `d' is advanced into the following input string whenever necessary, but this happens before fetching; therefore, at the beginning of the loop, `d' can be pointing at the end of a string, but it cannot equal `string2'. */ if (size1 > 0 && pos <= size1) { d = string1 + pos; dend = end_match_1; } else { d = string2 + pos - size1; dend = end_match_2; } DEBUG_PRINT1 ("The compiled pattern is: "); DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); DEBUG_PRINT1 ("The string to match is: `"); DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); DEBUG_PRINT1 ("'\n"); /* This loops over pattern commands. It exits by returning from the function if the match is complete, or it drops through if the match fails at this starting point in the input data. */ for (;;) { DEBUG_PRINT2 ("\n0x%x: ", p); if (p == pend) { /* End of pattern means we might have succeeded. */ DEBUG_PRINT1 ("end of pattern ... "); /* If we haven't matched the entire string, and we want the longest match, try backtracking. 
*/ if (d != end_match_2) { DEBUG_PRINT1 ("backtracking.\n"); if (!FAIL_STACK_EMPTY ()) { /* More failure points to try. */ boolean same_str_p = (FIRST_STRING_P (match_end) == MATCHING_IN_FIRST_STRING); /* If exceeds best match so far, save it. */ if (!best_regs_set || (same_str_p && d > match_end) || (!same_str_p && !MATCHING_IN_FIRST_STRING)) { best_regs_set = true; match_end = d; DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); for (mcnt = 1; mcnt < num_regs; mcnt++) { best_regstart[mcnt] = regstart[mcnt]; best_regend[mcnt] = regend[mcnt]; } } goto fail; } /* If no failure points, don't restore garbage. */ else if (best_regs_set) { restore_best_regs: /* Restore best match. It may happen that `dend == end_match_1' while the restored d is in string2. For example, the pattern `x.*y.*z' against the strings `x-' and `y-z-', if the two strings are not consecutive in memory. */ DEBUG_PRINT1 ("Restoring best registers.\n"); d = match_end; dend = ((d >= string1 && d <= end1) ? end_match_1 : end_match_2); for (mcnt = 1; mcnt < num_regs; mcnt++) { regstart[mcnt] = best_regstart[mcnt]; regend[mcnt] = best_regend[mcnt]; } } } /* d != end_match_2 */ DEBUG_PRINT1 ("Accepting match.\n"); /* If caller wants register contents data back, do it. */ if (regs && !bufp->no_sub) { /* Have the register data arrays been allocated? */ if (bufp->regs_allocated == REGS_UNALLOCATED) { /* No. So allocate them with malloc. We need one extra element beyond `num_regs' for the `-1' marker GNU code uses. */ regs->num_regs = MAX (RE_NREGS, num_regs + 1); regs->start = TALLOC (regs->num_regs, regoff_t); regs->end = TALLOC (regs->num_regs, regoff_t); if (regs->start == NULL || regs->end == NULL) return -2; bufp->regs_allocated = REGS_REALLOCATE; } else if (bufp->regs_allocated == REGS_REALLOCATE) { /* Yes. If we need more elements than were already allocated, reallocate them. If we need fewer, just leave it alone. 
*/ if (regs->num_regs < num_regs + 1) { regs->num_regs = num_regs + 1; RETALLOC (regs->start, regs->num_regs, regoff_t); RETALLOC (regs->end, regs->num_regs, regoff_t); if (regs->start == NULL || regs->end == NULL) return -2; } } else assert (bufp->regs_allocated == REGS_FIXED); /* Convert the pointer data in `regstart' and `regend' to indices. Register zero has to be set differently, since we haven't kept track of any info for it. */ if (regs->num_regs > 0) { regs->start[0] = pos; regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1 : d - string2 + size1); } /* Go through the first `min (num_regs, regs->num_regs)' registers, since that is all we initialized. */ for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) { if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) regs->start[mcnt] = regs->end[mcnt] = -1; else { regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]); regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]); } } /* If the regs structure we return has more elements than were in the pattern, set the extra elements to -1. If we (re)allocated the registers, this is the case, because we always allocate enough to have at least one -1 at the end. */ for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) regs->start[mcnt] = regs->end[mcnt] = -1; } /* regs && !bufp->no_sub */ FREE_VARIABLES (); DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", nfailure_points_pushed, nfailure_points_popped, nfailure_points_pushed - nfailure_points_popped); DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); mcnt = d - pos - (MATCHING_IN_FIRST_STRING ? string1 : string2 - size1); DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); return mcnt; } /* Otherwise match next pattern command. */ #ifdef SWITCH_ENUM_BUG switch ((int) ((re_opcode_t) *p++)) #else switch ((re_opcode_t) *p++) #endif { /* Ignore these. Used to ignore the n of succeed_n's which currently have n == 0. 
*/ case no_op: DEBUG_PRINT1 ("EXECUTING no_op.\n"); break; /* Match the next n pattern characters exactly. The following byte in the pattern defines n, and the n bytes after that are the characters to match. */ case exactn: mcnt = *p++; DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); /* This is written out as an if-else so we don't waste time testing `translate' inside the loop. */ if (translate) { do { PREFETCH (); if (translate[(unsigned char) *d++] != (char) *p++) goto fail; } while (--mcnt); } else { do { PREFETCH (); if (*d++ != (char) *p++) goto fail; } while (--mcnt); } SET_REGS_MATCHED (); break; /* Match any character except possibly a newline or a null. */ case anychar: DEBUG_PRINT1 ("EXECUTING anychar.\n"); PREFETCH (); if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) goto fail; SET_REGS_MATCHED (); DEBUG_PRINT2 (" Matched `%d'.\n", *d); d++; break; case charset: case charset_not: { register unsigned char c; boolean not = (re_opcode_t) *(p - 1) == charset_not; DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); PREFETCH (); c = TRANSLATE (*d); /* The character to match. */ /* Cast to `unsigned' instead of `unsigned char' in case the bit list is a full 32 bytes long. */ if (c < (unsigned) (*p * BYTEWIDTH) && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) not = !not; p += 1 + *p; if (!not) goto fail; SET_REGS_MATCHED (); d++; break; } /* The beginning of a group is represented by start_memory. The arguments are the register number in the next byte, and the number of groups inner to this one in the next. The text matched within the group is recorded (in the internal registers data structure) under the register number. */ case start_memory: DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); /* Find out if this group can match the empty string. */ p1 = p; /* To send to group_match_null_string_p. 
*/ if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) REG_MATCH_NULL_STRING_P (reg_info[*p]) = group_match_null_string_p (&p1, pend, reg_info); /* Save the position in the string where we were the last time we were at this open-group operator in case the group is operated upon by a repetition operator, e.g., with `(a*)*b' against `ab'; then we want to ignore where we are now in the string in case this attempt to match fails. */ old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) ? REG_UNSET (regstart[*p]) ? d : regstart[*p] : regstart[*p]; DEBUG_PRINT2 (" old_regstart: %d\n", POINTER_TO_OFFSET (old_regstart[*p])); regstart[*p] = d; DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); IS_ACTIVE (reg_info[*p]) = 1; MATCHED_SOMETHING (reg_info[*p]) = 0; /* This is the new highest active register. */ highest_active_reg = *p; /* If nothing was active before, this is the new lowest active register. */ if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) lowest_active_reg = *p; /* Move past the register number and inner group count. */ p += 2; break; /* The stop_memory opcode represents the end of a group. Its arguments are the same as start_memory's: the register number, and the number of inner groups. */ case stop_memory: DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); /* We need to save the string position the last time we were at this close-group operator in case the group is operated upon by a repetition operator, e.g., with `((a*)*(b*)*)*' against `aba'; then we want to ignore where we are now in the string in case this attempt to match fails. */ old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) ? REG_UNSET (regend[*p]) ? d : regend[*p] : regend[*p]; DEBUG_PRINT2 (" old_regend: %d\n", POINTER_TO_OFFSET (old_regend[*p])); regend[*p] = d; DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); /* This register isn't active anymore. 
*/ IS_ACTIVE (reg_info[*p]) = 0; /* If this was the only register active, nothing is active anymore. */ if (lowest_active_reg == highest_active_reg) { lowest_active_reg = NO_LOWEST_ACTIVE_REG; highest_active_reg = NO_HIGHEST_ACTIVE_REG; } else { /* We must scan for the new highest active register, since it isn't necessarily one less than now: consider (a(b)c(d(e)f)g). When group 3 ends, after the f), the new highest active register is 1. */ unsigned char r = *p - 1; while (r > 0 && !IS_ACTIVE (reg_info[r])) r--; /* If we end up at register zero, that means that we saved the registers as the result of an `on_failure_jump', not a `start_memory', and we jumped to past the innermost `stop_memory'. For example, in ((.)*) we save registers 1 and 2 as a result of the *, but when we pop back to the second ), we are at the stop_memory 1. Thus, nothing is active. */ if (r == 0) { lowest_active_reg = NO_LOWEST_ACTIVE_REG; highest_active_reg = NO_HIGHEST_ACTIVE_REG; } else highest_active_reg = r; } /* If just failed to match something this time around with a group that's operated on by a repetition operator, try to force exit from the ``loop'', and restore the register information for this group that we had before trying this last match. */ if ((!MATCHED_SOMETHING (reg_info[*p]) || (re_opcode_t) p[-3] == start_memory) && (p + 2) < pend) { boolean is_a_jump_n = false; p1 = p + 2; mcnt = 0; switch ((re_opcode_t) *p1++) { case jump_n: is_a_jump_n = true; case pop_failure_jump: case maybe_pop_jump: case jump: case dummy_failure_jump: EXTRACT_NUMBER_AND_INCR (mcnt, p1); if (is_a_jump_n) p1 += 2; break; default: /* do nothing */ ; } p1 += mcnt; /* If the next operation is a jump backwards in the pattern to an on_failure_jump right before the start_memory corresponding to this stop_memory, exit from the loop by forcing a failure after pushing on the stack the on_failure_jump's jump in the pattern, and d. 
*/ if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) { /* If this group ever matched anything, then restore what its registers were before trying this last failed match, e.g., with `(a*)*b' against `ab' for regstart[1], and, e.g., with `((a*)*(b*)*)*' against `aba' for regend[3]. Also restore the registers for inner groups for, e.g., `((a*)(b*))*' against `aba' (register 3 would otherwise get trashed). */ if (EVER_MATCHED_SOMETHING (reg_info[*p])) { unsigned r; EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; /* Restore this and inner groups' (if any) registers. */ for (r = *p; r < *p + *(p + 1); r++) { regstart[r] = old_regstart[r]; /* xx why this test? */ if ((unsigned long) old_regend[r] >= (unsigned long) regstart[r]) regend[r] = old_regend[r]; } } p1++; EXTRACT_NUMBER_AND_INCR (mcnt, p1); PUSH_FAILURE_POINT (p1 + mcnt, d, -2); goto fail; } } /* Move past the register number and the inner group count. */ p += 2; break; /* \ has been turned into a `duplicate' command which is followed by the numeric value of as the register number. */ case duplicate: { register char *d2, *dend2; int regno = *p++; /* Get which register to match against. */ DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); /* Can't back reference a group which we've never matched. */ if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) goto fail; /* Where in input to try to start matching. */ d2 = regstart[regno]; /* Where to stop matching; if both the place to start and the place to stop matching are in the same string, then set to the place to stop, otherwise, for now have to use the end of the first string. */ dend2 = ((FIRST_STRING_P (regstart[regno]) == FIRST_STRING_P (regend[regno])) ? regend[regno] : end_match_1); for (;;) { /* If necessary, advance to next segment in register contents. */ while (d2 == dend2) { if (dend2 == end_match_2) break; if (dend2 == regend[regno]) break; /* End of string1 => advance to string2. 
*/ d2 = string2; dend2 = regend[regno]; } /* At end of register contents => success */ if (d2 == dend2) break; /* If necessary, advance to next segment in data. */ PREFETCH (); /* How many characters left in this segment to match. */ mcnt = dend - d; /* Want how many consecutive characters we can match in one shot, so, if necessary, adjust the count. */ if (mcnt > dend2 - d2) mcnt = dend2 - d2; /* Compare that many; failure if mismatch, else move past them. */ if (translate ? bcmp_translate (d, d2, mcnt, translate) : bcmp (d, d2, mcnt)) goto fail; d += mcnt, d2 += mcnt; } } break; /* begline matches the empty string at the beginning of the string (unless `not_bol' is set in `bufp'), and, if `newline_anchor' is set, after newlines. */ case begline: DEBUG_PRINT1 ("EXECUTING begline.\n"); if (AT_STRINGS_BEG (d)) { if (!bufp->not_bol) break; } else if (d[-1] == '\n' && bufp->newline_anchor) { break; } /* In all other cases, we fail. */ goto fail; /* endline is the dual of begline. */ case endline: DEBUG_PRINT1 ("EXECUTING endline.\n"); if (AT_STRINGS_END (d)) { if (!bufp->not_eol) break; } /* We have to ``prefetch'' the next character. */ else if ((d == end1 ? *string2 : *d) == '\n' && bufp->newline_anchor) { break; } goto fail; /* Match at the very beginning of the data. */ case begbuf: DEBUG_PRINT1 ("EXECUTING begbuf.\n"); if (AT_STRINGS_BEG (d)) break; goto fail; /* Match at the very end of the data. */ case endbuf: DEBUG_PRINT1 ("EXECUTING endbuf.\n"); if (AT_STRINGS_END (d)) break; goto fail; /* on_failure_keep_string_jump is used to optimize `.*\n'. It pushes NULL as the value for the string on the stack. Then `pop_failure_point' will keep the current value for the string, instead of restoring it. To see why, consider matching `foo\nbar' against `.*\n'. The .* matches the foo; then the . fails against the \n. But the next thing we want to do is match the \n against the \n; if we restored the string value, we would be back at the foo. 
Because this is used only in specific cases, we don't need to check all the things that `on_failure_jump' does, to make sure the right things get saved on the stack. Hence we don't share its code. The only reason to push anything on the stack at all is that otherwise we would have to change `anychar's code to do something besides goto fail in this case; that seems worse than this. */ case on_failure_keep_string_jump: DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); PUSH_FAILURE_POINT (p + mcnt, NULL, -2); break; /* Uses of on_failure_jump: Each alternative starts with an on_failure_jump that points to the beginning of the next alternative. Each alternative except the last ends with a jump that in effect jumps past the rest of the alternatives. (They really jump to the ending jump of the following alternative, because tensioning these jumps is a hassle.) Repeats start with an on_failure_jump that points past both the repetition text and either the following jump or pop_failure_jump back to this on_failure_jump. */ case on_failure_jump: on_failure: DEBUG_PRINT1 ("EXECUTING on_failure_jump"); EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); /* If this on_failure_jump comes right before a group (i.e., the original * applied to a group), save the information for that group and all inner ones, so that if we fail back to this point, the group's information will be correct. For example, in \(a*\)*\1, we need the preceding group, and in \(\(a*\)b*\)\2, we need the inner group. */ /* We can't use `p' to check ahead because we push a failure point to `p + mcnt' after we do this. */ p1 = p; /* We need to skip no_op's before we look for the start_memory in case this on_failure_jump is happening as the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 against aba. 
*/ while (p1 < pend && (re_opcode_t) *p1 == no_op) p1++; if (p1 < pend && (re_opcode_t) *p1 == start_memory) { /* We have a new highest active register now. This will get reset at the start_memory we are about to get to, but we will have saved all the registers relevant to this repetition op, as described above. */ highest_active_reg = *(p1 + 1) + *(p1 + 2); if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) lowest_active_reg = *(p1 + 1); } DEBUG_PRINT1 (":\n"); PUSH_FAILURE_POINT (p + mcnt, d, -2); break; /* A smart repeat ends with `maybe_pop_jump'. We change it to either `pop_failure_jump' or `jump'. */ case maybe_pop_jump: EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); { register unsigned char *p2 = p; /* Compare the beginning of the repeat with what in the pattern follows its end. If we can establish that there is nothing that they would both match, i.e., that we would have to backtrack because of (as in, e.g., `a*a') then we can change to pop_failure_jump, because we'll never have to backtrack. This is not true in the case of alternatives: in `(a|ab)*' we do need to backtrack to the `ab' alternative (e.g., if the string was `ab'). But instead of trying to detect that here, the alternative has put on a dummy failure point which is what we will end up popping. */ /* Skip over open/close-group commands. */ while (p2 + 2 < pend && ((re_opcode_t) *p2 == stop_memory || (re_opcode_t) *p2 == start_memory)) p2 += 3; /* Skip over args, too. */ /* If we're at the end of the pattern, we can change. */ if (p2 == pend) { /* Consider what happens when matching ":\(.*\)" against ":/". I don't really understand this code yet. */ p[-3] = (unsigned char) pop_failure_jump; DEBUG_PRINT1 (" End of pattern: change to `pop_failure_jump'.\n"); } else if ((re_opcode_t) *p2 == exactn || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) { register unsigned char c = *p2 == (unsigned char) endline ? '\n' : p2[2]; p1 = p + mcnt; /* p1[0] ... 
p1[2] are the `on_failure_jump' corresponding to the `maybe_finalize_jump' of this case. Examine what follows. */ if ((re_opcode_t) p1[3] == exactn && p1[5] != c) { p[-3] = (unsigned char) pop_failure_jump; DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", c, p1[5]); } else if ((re_opcode_t) p1[3] == charset || (re_opcode_t) p1[3] == charset_not) { int not = (re_opcode_t) p1[3] == charset_not; if (c < (unsigned char) (p1[4] * BYTEWIDTH) && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) not = !not; /* `not' is equal to 1 if c would match, which means that we can't change to pop_failure_jump. */ if (!not) { p[-3] = (unsigned char) pop_failure_jump; DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); } } } } p -= 2; /* Point at relative address again. */ if ((re_opcode_t) p[-1] != pop_failure_jump) { p[-1] = (unsigned char) jump; DEBUG_PRINT1 (" Match => jump.\n"); goto unconditional_jump; } /* Note fall through. */ /* The end of a simple repeat has a pop_failure_jump back to its matching on_failure_jump, where the latter will push a failure point. The pop_failure_jump takes off failure points put on by this pop_failure_jump's matching on_failure_jump; we got through the pattern to here from the matching on_failure_jump, so didn't fail. */ case pop_failure_jump: { /* We need to pass separate storage for the lowest and highest registers, even though we don't care about the actual values. Otherwise, we will restore only one register from the stack, since lowest will == highest in `pop_failure_point'. */ unsigned dummy_low_reg, dummy_high_reg; unsigned char *pdummy; char *sdummy; DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); POP_FAILURE_POINT (sdummy, pdummy, dummy_low_reg, dummy_high_reg, reg_dummy, reg_dummy, reg_info_dummy); } /* Note fall through. */ /* Unconditionally jump (without popping any failure points). */ case jump: unconditional_jump: EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. 
*/ DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); p += mcnt; /* Do the jump. */ DEBUG_PRINT2 ("(to 0x%x).\n", p); break; /* We need this opcode so we can detect where alternatives end in `group_match_null_string_p' et al. */ case jump_past_alt: DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); goto unconditional_jump; /* Normally, the on_failure_jump pushes a failure point, which then gets popped at pop_failure_jump. We will end up at pop_failure_jump, also, and with a pattern of, say, `a+', we are skipping over the on_failure_jump, so we have to push something meaningless for pop_failure_jump to pop. */ case dummy_failure_jump: DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); /* It doesn't matter what we push for the string here. What the code at `fail' tests is the value for the pattern. */ PUSH_FAILURE_POINT (0, 0, -2); goto unconditional_jump; /* At the end of an alternative, we need to push a dummy failure point in case we are followed by a `pop_failure_jump', because we don't want the failure point for the alternative to be popped. For example, matching `(a|ab)*' against `aab' requires that we match the `ab' alternative. */ case push_dummy_failure: DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); /* See comments just above at `dummy_failure_jump' about the two zeroes. */ PUSH_FAILURE_POINT (0, 0, -2); break; /* Have to succeed matching what follows at least n times. After that, handle like `on_failure_jump'. */ case succeed_n: EXTRACT_NUMBER (mcnt, p + 2); DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); assert (mcnt >= 0); /* Originally, this is how many times we HAVE to succeed. 
*/ if (mcnt > 0) { mcnt--; p += 2; STORE_NUMBER_AND_INCR (p, mcnt); DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt); } else if (mcnt == 0) { DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); p[2] = (unsigned char) no_op; p[3] = (unsigned char) no_op; goto on_failure; } break; case jump_n: EXTRACT_NUMBER (mcnt, p + 2); DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); /* Originally, this is how many times we CAN jump. */ if (mcnt) { mcnt--; STORE_NUMBER (p + 2, mcnt); goto unconditional_jump; } /* If don't have to jump any more, skip over the rest of command. */ else p += 4; break; case set_number_at: { DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); EXTRACT_NUMBER_AND_INCR (mcnt, p); p1 = p + mcnt; EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); STORE_NUMBER (p1, mcnt); break; } case wordbound: DEBUG_PRINT1 ("EXECUTING wordbound.\n"); if (AT_WORD_BOUNDARY (d)) break; goto fail; case notwordbound: DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); if (AT_WORD_BOUNDARY (d)) goto fail; break; case wordbeg: DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) break; goto fail; case wordend: DEBUG_PRINT1 ("EXECUTING wordend.\n"); if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) break; goto fail; #ifdef emacs #ifdef emacs19 case before_dot: DEBUG_PRINT1 ("EXECUTING before_dot.\n"); if (PTR_CHAR_POS ((unsigned char *) d) >= point) goto fail; break; case at_dot: DEBUG_PRINT1 ("EXECUTING at_dot.\n"); if (PTR_CHAR_POS ((unsigned char *) d) != point) goto fail; break; case after_dot: DEBUG_PRINT1 ("EXECUTING after_dot.\n"); if (PTR_CHAR_POS ((unsigned char *) d) <= point) goto fail; break; #else /* not emacs19 */ case at_dot: DEBUG_PRINT1 ("EXECUTING at_dot.\n"); if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point) goto fail; break; #endif /* not emacs19 */ case syntaxspec: DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); mcnt = *p++; goto 
matchsyntax; case wordchar: DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); mcnt = (int) Sword; matchsyntax: PREFETCH (); if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail; SET_REGS_MATCHED (); break; case notsyntaxspec: DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); mcnt = *p++; goto matchnotsyntax; case notwordchar: DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); mcnt = (int) Sword; matchnotsyntax: PREFETCH (); if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail; SET_REGS_MATCHED (); break; #else /* not emacs */ case wordchar: DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); PREFETCH (); if (!WORDCHAR_P (d)) goto fail; SET_REGS_MATCHED (); d++; break; case notwordchar: DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); PREFETCH (); if (WORDCHAR_P (d)) goto fail; SET_REGS_MATCHED (); d++; break; #endif /* not emacs */ default: abort (); } continue; /* Successfully executed one pattern command; keep going. */ /* We goto here if a matching operation fails. */ fail: if (!FAIL_STACK_EMPTY ()) { /* A restart point is known. Restore to that state. */ DEBUG_PRINT1 ("\nFAIL:\n"); POP_FAILURE_POINT (d, p, lowest_active_reg, highest_active_reg, regstart, regend, reg_info); /* If this failure point is a dummy, try the next one. */ if (!p) goto fail; /* If we failed to the end of the pattern, don't examine *p. */ assert (p <= pend); if (p < pend) { boolean is_a_jump_n = false; /* If failed to a backwards jump that's part of a repetition loop, need to pop this failure point and use the next one. */ switch ((re_opcode_t) *p) { case jump_n: is_a_jump_n = true; case maybe_pop_jump: case pop_failure_jump: case jump: p1 = p + 1; EXTRACT_NUMBER_AND_INCR (mcnt, p1); p1 += mcnt; if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) || (!is_a_jump_n && (re_opcode_t) *p1 == on_failure_jump)) goto fail; break; default: /* do nothing */ ; } } if (d >= string1 && d <= end1) dend = end_match_1; } else break; /* Matching at this starting point really fails. 
*/ } /* for (;;) */ if (best_regs_set) goto restore_best_regs; FREE_VARIABLES (); return -1; /* Failure to match. */ } /* re_match_2 */ /* Subroutine definitions for re_match_2. */ /* We are passed P pointing to a register number after a start_memory. Return true if the pattern up to the corresponding stop_memory can match the empty string, and false otherwise. If we find the matching stop_memory, sets P to point to one past its number. Otherwise, sets P to an undefined byte less than or equal to END. We don't handle duplicates properly (yet). */ static boolean group_match_null_string_p (p, end, reg_info) unsigned char **p, *end; register_info_type *reg_info; { int mcnt; /* Point to after the args to the start_memory. */ unsigned char *p1 = *p + 2; while (p1 < end) { /* Skip over opcodes that can match nothing, and return true or false, as appropriate, when we get to one that can't, or to the matching stop_memory. */ switch ((re_opcode_t) *p1) { /* Could be either a loop or a series of alternatives. */ case on_failure_jump: p1++; EXTRACT_NUMBER_AND_INCR (mcnt, p1); /* If the next operation is not a jump backwards in the pattern. */ if (mcnt >= 0) { /* Go through the on_failure_jumps of the alternatives, seeing if any of the alternatives cannot match nothing. The last alternative starts with only a jump, whereas the rest start with on_failure_jump and end with a jump, e.g., here is the pattern for `a|b|c': /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 /exactn/1/c So, we have to first go through the first (n-1) alternatives and then deal with the last one separately. */ /* Deal with the first (n-1) alternatives, which start with an on_failure_jump (see above) that jumps to right past a jump_past_alt. */ while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) { /* `mcnt' holds how many bytes long the alternative is, including the ending `jump_past_alt' and its number. 
*/ if (!alt_match_null_string_p (p1, p1 + mcnt - 3, reg_info)) return false; /* Move to right after this alternative, including the jump_past_alt. */ p1 += mcnt; /* Break if it's the beginning of an n-th alternative that doesn't begin with an on_failure_jump. */ if ((re_opcode_t) *p1 != on_failure_jump) break; /* Still have to check that it's not an n-th alternative that starts with an on_failure_jump. */ p1++; EXTRACT_NUMBER_AND_INCR (mcnt, p1); if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) { /* Get to the beginning of the n-th alternative. */ p1 -= 3; break; } } /* Deal with the last alternative: go back and get number of the `jump_past_alt' just before it. `mcnt' contains the length of the alternative. */ EXTRACT_NUMBER (mcnt, p1 - 2); if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) return false; p1 += mcnt; /* Get past the n-th alternative. */ } /* if mcnt > 0 */ break; case stop_memory: assert (p1[1] == **p); *p = p1 + 2; return true; default: if (!common_op_match_null_string_p (&p1, end, reg_info)) return false; } } /* while p1 < end */ return false; } /* group_match_null_string_p */ /* Similar to group_match_null_string_p, but doesn't deal with alternatives: It expects P to be the first byte of a single alternative and END one byte past the last. The alternative can contain groups. */ static boolean alt_match_null_string_p (p, end, reg_info) unsigned char *p, *end; register_info_type *reg_info; { int mcnt; unsigned char *p1 = p; while (p1 < end) { /* Skip over opcodes that can match nothing, and break when we get to one that can't. */ switch ((re_opcode_t) *p1) { /* It's a loop. */ case on_failure_jump: p1++; EXTRACT_NUMBER_AND_INCR (mcnt, p1); p1 += mcnt; break; default: if (!common_op_match_null_string_p (&p1, end, reg_info)) return false; } } /* while p1 < end */ return true; } /* alt_match_null_string_p */ /* Deals with the ops common to group_match_null_string_p and alt_match_null_string_p. Sets P to one after the op and its arguments, if any. 
*/

static boolean
common_op_match_null_string_p (p, end, reg_info)
    unsigned char **p, *end;
    register_info_type *reg_info;
{
  int mcnt;
  boolean ret;
  int reg_no;
  unsigned char *p1 = *p;       /* local cursor; *P is only updated on success */

  switch ((re_opcode_t) *p1++)
    {
    /* Zero-width ops: nothing to skip beyond the opcode byte.  */
    case no_op:
    case begline:
    case endline:
    case begbuf:
    case endbuf:
    case wordbeg:
    case wordend:
    case wordbound:
    case notwordbound:
#ifdef emacs
    case before_dot:
    case at_dot:
    case after_dot:
#endif
      break;

    case start_memory:
      reg_no = *p1;
      assert (reg_no > 0 && reg_no <= MAX_REGNUM);
      ret = group_match_null_string_p (&p1, end, reg_info);

      /* Have to set this here in case we're checking a group which
         contains a group and a back reference to it.  */

      if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
        REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;

      if (!ret)
        return false;
      break;

    /* If this is an optimized succeed_n for zero times, make the jump.
       A backward (negative) jump is treated as "cannot match empty".  */
    case jump:
      EXTRACT_NUMBER_AND_INCR (mcnt, p1);
      if (mcnt >= 0)
        p1 += mcnt;
      else
        return false;
      break;

    case succeed_n:
      /* Get to the number of times to succeed.  */
      p1 += 2;
      EXTRACT_NUMBER_AND_INCR (mcnt, p1);

      if (mcnt == 0)
        {
          /* Zero required repetitions: back up to the first operand
             (the jump offset) and follow it.  */
          p1 -= 4;
          EXTRACT_NUMBER_AND_INCR (mcnt, p1);
          p1 += mcnt;
        }
      else
        return false;
      break;

    case duplicate:
      /* A back reference matches empty only if its group did.
         NOTE(review): p1 is not advanced past the register-number byte
         here, so the caller's next opcode read starts on that byte --
         confirm this is intended.  */
      if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
        return false;
      break;

    case set_number_at:
      p1 += 4;
      /* No break: control falls through to the default case, so this op
         also makes the function return false.  */

    default:
      /* All other opcodes mean we cannot match the empty string.  */
      return false;
  }

  *p = p1;
  return true;
} /* common_op_match_null_string_p */


/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
   bytes; nonzero otherwise.  TRANSLATE is indexed by each input byte, so
   it must be non-null here.  */

static int
bcmp_translate (s1, s2, len, translate)
     unsigned char *s1, *s2;
     register int len;
     char *translate;
{
  register unsigned char *p1 = s1, *p2 = s2;
  while (len)
    {
      if (translate[*p1++] != translate[*p2++]) return 1;
      len--;
    }
  return 0;
}

/* Entry points for GNU code.  */

/* re_compile_pattern is the GNU regular expression compiler: it
   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
Returns 0 if the pattern was valid, otherwise an error string.

   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
   are set in BUFP on entry.

   We call regex_compile to do the actual compilation.  */

char *
re_compile_pattern (char *pattern, int length, struct re_pattern_buffer *bufp)
{
  reg_errcode_t ret;

  /* GNU code is written to assume at least RE_NREGS registers will be set
     (and at least one extra will be -1).  */
  bufp->regs_allocated = REGS_UNALLOCATED;

  /* And GNU code determines whether or not to get register information
     by passing null for the REGS argument to re_match, etc., not by
     setting no_sub.  */
  bufp->no_sub = 0;

  /* Match anchors at newline.  */
  bufp->newline_anchor = 1;

  ret = regex_compile (pattern, length, re_syntax_options, bufp);

  /* The error code indexes the message table directly.  */
  return re_error_msg[(int) ret];
}

/* Entry points compatible with 4.2 BSD regex library.  We don't define
   them if this is an Emacs or POSIX compilation.  */

#if !defined (emacs) && !defined (_POSIX_SOURCE)

/* BSD has one and only one pattern buffer.  */
static struct re_pattern_buffer re_comp_buf;

/* Compile S into the single static pattern buffer.  A null S asks
   whether a previously compiled pattern exists.  Returns 0 on success,
   otherwise an error string.  */
char *
re_comp (s)
    char *s;
{
  reg_errcode_t ret;

  if (!s)
    {
      if (!re_comp_buf.buffer)
        return "No previous regular expression";
      return 0;
    }

  /* First use: allocate the compiled-pattern buffer and the fastmap.
     NOTE(review): if the second malloc fails, `buffer' stays allocated,
     so later calls skip this block and run with `fastmap' still null.  */
  if (!re_comp_buf.buffer)
    {
      re_comp_buf.buffer = (unsigned char *) malloc (200);
      if (re_comp_buf.buffer == NULL)
        return "Memory exhausted";
      re_comp_buf.allocated = 200;

      re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
      if (re_comp_buf.fastmap == NULL)
        return "Memory exhausted";
    }

  /* Since `re_exec' always passes NULL for the `regs' argument, we
     don't need to initialize the pattern buffer fields which affect it.  */

  /* Match anchors at newlines.  */
  re_comp_buf.newline_anchor = 1;

  ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);

  /* Yes, we're discarding `const' here.
*/
  return (char *) re_error_msg[(int) ret];
}


/* Search S for the pattern last compiled by re_comp.  Returns nonzero
   when re_search reports a match position (>= 0), zero otherwise.  */
int
re_exec (s)
    char *s;
{
  int len = strlen (s);
  return
    0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
}
#endif /* not emacs and not _POSIX_SOURCE */

/* POSIX.2 functions.  Don't define these for Emacs.  */

#ifndef emacs

/* regcomp takes a regular expression as a string and compiles it.

   PREG is a regex_t *.  We do not expect any fields to be initialized,
   since POSIX says we shouldn't.  Thus, we set

     `buffer' to the compiled pattern;
     `used' to the length of the compiled pattern;
     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
       REG_EXTENDED bit in CFLAGS is set; otherwise, to
       RE_SYNTAX_POSIX_BASIC;
     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
     `fastmap' and `fastmap_accurate' to zero;
     `re_nsub' to the number of subexpressions in PATTERN.

   PATTERN is the address of the pattern string.

   CFLAGS is a series of bits which affect compilation.

     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
     use POSIX basic syntax.

     If REG_NEWLINE is set, then . and [^...] don't match newline.
     Also, regexec will try a match beginning after every newline.

     If REG_ICASE is set, then we consider upper- and lowercase
     versions of letters to be equivalent when matching.

     If REG_NOSUB is set, then when PREG is passed to regexec, that
     routine will report only success or failure, and nothing about the
     registers.

   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
   the return codes and their meanings.)  */

int
regcomp (preg, pattern, cflags)
    regex_t *preg;
    char *pattern;
    int cflags;
{
  reg_errcode_t ret;
  unsigned syntax
    = (cflags & REG_EXTENDED) ?
      RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;

  /* regex_compile will allocate the space for the compiled pattern.  */
  preg->buffer = 0;
  preg->allocated = 0;

  /* Don't bother to use a fastmap when searching.  This simplifies the
     REG_NEWLINE case: if we used a fastmap, we'd have to put all the
     characters after newlines into the fastmap.
This way, we just try every character.  */
  preg->fastmap = 0;

  if (cflags & REG_ICASE)
    {
      unsigned i;

      /* The table allocated here is released by regfree.  */
      preg->translate = (char *) malloc (CHAR_SET_SIZE);
      if (preg->translate == NULL)
        return (int) REG_ESPACE;

      /* Map uppercase characters to corresponding lowercase ones.  */
      for (i = 0; i < CHAR_SET_SIZE; i++)
        preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
    }
  else
    preg->translate = NULL;

  /* If REG_NEWLINE is set, newlines are treated differently.  */
  if (cflags & REG_NEWLINE)
    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
      syntax &= ~RE_DOT_NEWLINE;
      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
      /* It also changes the matching behavior.  */
      preg->newline_anchor = 1;
    }
  else
    preg->newline_anchor = 0;

  /* Normalize the flag bit to exactly 0 or 1.  */
  preg->no_sub = !!(cflags & REG_NOSUB);

  /* POSIX says a null character in the pattern terminates it, so we
     can use strlen here in compiling the pattern.  */
  ret = regex_compile (pattern, strlen (pattern), syntax, preg);

  /* POSIX doesn't distinguish between an unmatched open-group and an
     unmatched close-group: both are REG_EPAREN.  */
  if (ret == REG_ERPAREN) ret = REG_EPAREN;

  return (int) ret;
}


/* regexec searches for a given pattern, specified by PREG, in the
   string STRING.

   If NMATCH is zero or REG_NOSUB was set in the cflags argument to
   `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
   least NMATCH elements, and we set them to the offsets of the
   corresponding matched substrings.

   EFLAGS specifies `execution flags' which affect matching: if
   REG_NOTBOL is set, then ^ does not match at the beginning of the
   string; if REG_NOTEOL is set, then $ does not match at the end.

   We return 0 if we find a match and REG_NOMATCH if not.
*/ int regexec (preg, string, nmatch, pmatch, eflags) regex_t *preg; char *string; size_t nmatch; regmatch_t pmatch[]; int eflags; { int ret; struct re_registers regs; regex_t private_preg; int len = strlen (string); boolean want_reg_info = !preg->no_sub && nmatch > 0; private_preg = *preg; private_preg.not_bol = !!(eflags & REG_NOTBOL); private_preg.not_eol = !!(eflags & REG_NOTEOL); /* The user has told us exactly how many registers to return information about, via `nmatch'. We have to pass that on to the matching routines. */ private_preg.regs_allocated = REGS_FIXED; if (want_reg_info) { regs.num_regs = nmatch; regs.start = TALLOC (nmatch, regoff_t); regs.end = TALLOC (nmatch, regoff_t); if (regs.start == NULL || regs.end == NULL) return (int) REG_NOMATCH; } /* Perform the searching operation. */ ret = re_search (&private_preg, string, len, /* start: */ 0, /* range: */ len, want_reg_info ? ®s : (struct re_registers *) 0); /* Copy the register information to the POSIX structure. */ if (want_reg_info) { if (ret >= 0) { unsigned r; for (r = 0; r < nmatch; r++) { pmatch[r].rm_so = regs.start[r]; pmatch[r].rm_eo = regs.end[r]; } } /* If we needed the temporary register info, free the space now. */ free (regs.start); free (regs.end); } /* We want zero return to mean success, unlike `re_search'. */ return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; } /* Returns a message corresponding to an error code, ERRCODE, returned from either regcomp or regexec. We don't use PREG here. */ size_t regerror (errcode, preg, errbuf, errbuf_size) int errcode; regex_t *preg; char *errbuf; size_t errbuf_size; { char *msg; size_t msg_size; if (errcode < 0 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0]))) /* Only error codes returned by the rest of the code should be passed to this routine. If we are given anything else, or if other regex code generates an invalid error code, then the program has a bug. Dump core so we can fix it. 
*/ abort (); msg = re_error_msg[errcode]; /* POSIX doesn't require that we do anything in this case, but why not be nice. */ if (! msg) msg = "Success"; msg_size = strlen (msg) + 1; /* Includes the null. */ if (errbuf_size != 0) { if (msg_size > errbuf_size) { strncpy (errbuf, msg, errbuf_size - 1); errbuf[errbuf_size - 1] = 0; } else strcpy (errbuf, msg); } return msg_size; } /* Free dynamically allocated space used by PREG. */ void regfree (preg) regex_t *preg; { if (preg->buffer != NULL) free (preg->buffer); preg->buffer = NULL; preg->allocated = 0; preg->used = 0; if (preg->fastmap != NULL) free (preg->fastmap); preg->fastmap = NULL; preg->fastmap_accurate = 0; if (preg->translate != NULL) free (preg->translate); preg->translate = NULL; } #endif /* not emacs */ /* Local variables: make-backup-files: t version-control: t trim-versions-without-asking: nil End: */ #endif /* not lint */ tRNAscan-SE-2.0/src/scan_main.c0000644000543100007160000002201114044106617015514 0ustar pchanlowelab/* scan_main.c * Mon Jan 31 11:04:57 1994 * * main() for database scanning with a covariance model. 
*/ #include #include #include #include #include #include /*#ifdef NEED_GETOPTH #include #endif*/ #include "structs.h" #include "funcs.h" #include "squid.h" #include "version.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif #define OPTIONS "cg:ho:t:w:D:E:F" static char usage[] = "\ Usage: covels [-options] \n\ where options are:\n\ -c : do complementary strand too\n\ -g : set background expected GC content (0.5 default)\n\ -h : print short help and version info\n\ -o : save hits in \n\ -t : set score reporting threshold\n\ -w : set scanning window size\n\ CRASH PROTECTION OPTIONS:\n\ -D : save name of last sequence processed\n\ EXPERIMENTAL OPTIONS:\n\ -E : set epsilon for fast search\n\ -F : fast heuristic search\n"; static char banner[] = "covels - scan sequences for matches to an RNA covariance model"; static char *ext_seqname; static int ext_seqlen; static int in_complement; static FILE *ext_ofp; static int print_hit(int i, int j, double sc); int main(int argc, char **argv) { char *seq; /* a sequence to score */ SQINFO sqinfo; /* info about sequence */ char *rev; /* rev complement of seq */ char *seqfile; /* sequence file */ int fmt; /* format of sequence file */ SQFILE *dbfp; /* open sequence file for reading */ char *cmfile; /* file containing covariance model */ struct cm_s *cm; /* model */ struct istate_s *icm; /* integer log-odds search model */ struct pstate_s *pcm; /* rearranged probability model */ int statenum; /* # of states in icm, pcm */ int *minb, *maxb; /* bounds for fast heuristic search */ double rfreq[ALPHASIZE]; /* random model */ int fast; /* TRUE if trying fast heuristic search */ double epsilon; /* how much probability we're willing to lose */ int do_complement; /* TRUE if searching complementary strand too */ double thresh; /* threshold score for reporting a match */ char *outfile; /* save file for scores */ int window; /* size of search window, symbols */ char *donefile; /* crash protection: save name of last seq */ double gcfrac; /* 
background gc fraction */ #ifdef MEMDEBUG unsigned long histid1, histid2, orig_size, curr_size; #endif int optc; extern char *optarg; /* for getopt() */ extern int optind; /* for getopt() */ /*********************************************** * Parse command line ***********************************************/ do_complement = FALSE; thresh = 0.0; window = 100; outfile = NULL; fast = FALSE; epsilon = 1e-9; donefile = NULL; gcfrac = 0.5; while ((optc = getopt(argc, argv, OPTIONS)) != -1) switch (optc) { case 'c': do_complement = TRUE; break; case 'g': gcfrac = (double) atof(optarg); break; case 'o': outfile = optarg; break; case 't': thresh = (double) atof(optarg); break; case 'w': window = atoi(optarg); break; case 'D': donefile = optarg; break; case 'E': epsilon = atof(optarg); break; case 'F': fast = TRUE; break; case 'h': printf("%s\n version %s (%s)\n%s\n", banner, RELEASE, RELEASEDATE, usage); exit(0); default: Die("unrecognized option %c\n", optc); } if (argc - optind != 2) Die("%s\n", usage); cmfile = argv[argc-2]; seqfile = argv[argc-1]; /* The random model probabilities */ rfreq[1] = rfreq[2] = gcfrac / 2.0; rfreq[0] = rfreq[3] = (1.0 - gcfrac) / 2.0; if (! SeqfileFormat(seqfile, &fmt, NULL)) Die("Failed to determine format of sequence database %s", seqfile); if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) Die("Failed to open sequence file %s for reading", seqfile); if (! ReadCM(cmfile, &cm)) Die("Failed to read model from file %s", cmfile); if (! RearrangeCM(cm, rfreq, &icm, &statenum)) Die("Failed to create search CM"); /* Fast version. * Use lower/upper bounds on possible subsequence lengths * for each state. These are obtained probabilistically. */ if (fast) { double **lmx; if (! 
MakePCM(cm, &pcm, &statenum)) Die("Failed to rearrange CM for bounds calculations"); NormalizePCM(pcm, statenum); LengthDistribution(pcm, statenum, window, &lmx); LengthBounds(lmx, statenum, window, epsilon, &minb, &maxb); Free2DArray(lmx, statenum); free(pcm); } ext_ofp = NULL; if (outfile != NULL) if ((ext_ofp = fopen(outfile, "w")) == NULL) Die("Failed to open score output file %s for writing", outfile); /*********************************************** * Print banner ***********************************************/ puts(banner); printf(" version %s, %s\n\n", RELEASE, RELEASEDATE); printf("---------------------------------------------------\n"); printf("Database to search/score: %s\n", seqfile); printf("Model: %s\n", cmfile); printf("Reporting threshold: %.2f\n", thresh); printf("Maximum match size: %d\n", window); printf("Complementary strand searched: %s\n", do_complement? "yes":"no"); if (outfile != NULL) printf("Scores saved to file: %s\n", outfile); printf("GC%% of background model: %.0f%%\n", (gcfrac*100.)); printf("---------------------------------------------------\n"); puts(""); /*********************************************** * Score each sequence ***********************************************/ #ifdef MEMDEBUG orig_size = malloc_size(&histid1); #endif while (ReadSeq(dbfp, fmt, &seq, &sqinfo)) { s2upper(seq); /* some communication to report_local through statics */ ext_seqname = sqinfo.name; ext_seqlen = sqinfo.len; in_complement = FALSE; if (fast) { if (! FastViterbiScan(icm, statenum, minb, maxb, seq, window, thresh, print_hit)) Die("Search across sequence %s failed", sqinfo.name); } else if (! ViterbiScan(icm, statenum, seq, window, thresh, print_hit)) Die("Search across sequence %s failed", sqinfo.name); if (do_complement) { in_complement = TRUE; if ((rev = (char *) malloc ((sqinfo.len+1) * sizeof(char))) == NULL) Die("malloc failed"); revcomp(rev, seq); if (fast) { if (! 
FastViterbiScan(icm, statenum, minb, maxb, rev, window, thresh, print_hit)) Die("Search across sequence %s failed", sqinfo.name); } else if (! ViterbiScan(icm, statenum, rev, window, thresh, print_hit)) Die("Search across complement of sequence %s failed", sqinfo.name); free(rev); } if (donefile != NULL) { FILE *dfp; if ((dfp = fopen(donefile, "w")) == NULL) Die("Failed to open file for saving name of last finished sequence"); fprintf(dfp, "%s\n", sqinfo.name); fclose(dfp); } FreeSequence(seq, &sqinfo); #ifdef MEMDEBUG curr_size = malloc_size(&histid2); if (curr_size != orig_size) { Warn("memory leak: current size %ul, starting size %ul\n", curr_size, orig_size); malloc_list(2,histid1, histid2); } #endif } if (donefile != NULL) { FILE *dfp; if ((dfp = fopen(donefile, "w")) == NULL) Die("Failed to open file for saving name of last finished sequence"); fprintf(dfp, "Search complete."); fclose(dfp); } if (fast) { free(minb); free(maxb); } free(icm); FreeCM(cm); SeqfileClose(dbfp); return 0; } /* Function: print_hit() * * Purpose: Simple function that determines the format of printing * out scanning hits. It gets the start point, end point, * and score of the match; any other info it wants to print * must come through static external variables in this file. 
*
* Args:     i   - start point of match
*           j   - end point of match
*           sc  - score of match
*
* Return:   1, always.
*/
static int
print_hit(int i, int j, double sc)
{
  int from = i;
  int to   = j;

  /* Hits on the reverse strand are reported in forward-strand
   * coordinates, mirrored about the sequence length.
   */
  if (in_complement)
    {
      from = ext_seqlen-i+1;
      to   = ext_seqlen-j+1;
    }

  printf("%6.2f %5d %5d : %s\n", sc, from, to, ext_seqname);

  /* Echo the same line to the score file when one is open, flushing so
   * that the hit is on disk straight away.
   */
  if (ext_ofp != NULL)
    {
      fprintf(ext_ofp, "%6.2f %5d %5d : %s\n", sc, from, to, ext_seqname);
      fflush(ext_ofp);
    }
  return 1;
}
tRNAscan-SE-2.0/src/seqstat_main.c0000644000543100007160000000676411021467311016266 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis
 * Copyright (C) 1992-1996 Sean R. Eddy
 *
 * This source code is distributed under terms of the
 * GNU General Public License. See the files COPYING
 * and GNULICENSE for further details.
 *
 */

/* seqstat_main.c
 * Wed Aug 10 15:47:14 1994
 *
 * Look at a sequence file, determine some simple statistics.
*/ #include #include #include "squid.h" #define OPTIONS "ah" #ifdef MEMDEBUG #include "dbmalloc.h" #endif char usage[] = "Usage: seqstat [-options] \n\ Verify a sequence file; print some simple statistics and info.\n\ Available options:\n\ -a : report per-sequence info, not just a summary\n\ -h : help; display usage and version\n"; int main(int argc, char **argv) { char *seqfile; /* name of sequence file */ SQFILE *dbfp; /* open sequence file */ int fmt; /* format of seqfile */ char *seq; /* sequence */ SQINFO sqinfo; /* extra info about sequence */ int nseqs; int small; /* smallest length */ int large; /* largest length */ int total; /* total length */ int optchar; /* option character, command line */ extern int optind; int allreport; #ifdef MEMDEBUG unsigned long histid1, histid2, orig_size, current_size; #endif /*********************************************** * Parse command line ***********************************************/ allreport = FALSE; /* default: summary only */ while ((optchar = getopt(argc, argv, OPTIONS)) != -1) switch (optchar) { case 'a': allreport = TRUE; break; case 'h': printf("seqstat %s, %s\n%s\n", squid_version, squid_date, usage); exit(EXIT_SUCCESS); default: Die("%s\n", usage); } if (argc - optind != 1) Die("%s\n", usage); seqfile = argv[argc-1]; #ifdef MEMDEBUG orig_size = malloc_size(&histid1); #endif /*********************************************** * Read the file. ***********************************************/ printf("seqstat %s, %s\n\n", squid_version, squid_date); if (! 
SeqfileFormat(seqfile, &fmt, NULL)) Die("Can't determine format of file %s\n", seqfile); if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) Die("Failed to open sequence file %s for reading", seqfile); if (allreport) { printf(" %-15s %-5s %s\n", " NAME", "LEN", "DESCRIPTION"); printf(" --------------- ----- -----------\n"); } small = 9999999; large = 0; nseqs = 0; total = 0; while (ReadSeq(dbfp, fmt, &seq, &sqinfo)) { if (allreport) printf("* %-15s %5d %-50.50s\n", sqinfo.name, sqinfo.len, sqinfo.flags & SQINFO_DESC ? sqinfo.desc : ""); if (sqinfo.len < small) small = sqinfo.len; if (sqinfo.len > large) large = sqinfo.len; total += sqinfo.len; nseqs++; FreeSequence(seq, &sqinfo); } if (allreport) puts(""); printf("Format: %s\n", SeqFormatString(fmt)); printf("Number of sequences: %d\n", nseqs); printf("Total # residues: %d\n", total); printf("Smallest: %d\n", small); printf("Largest: %d\n", large); printf("Average length: %.1f\n", (float) total / (float) nseqs); SeqfileClose(dbfp); #ifdef MEMDEBUG current_size = malloc_size(&histid2); if (current_size != orig_size) malloc_list(2, histid1, histid2); else fprintf(stderr, "[No memory leaks]\n"); #endif return 0; } tRNAscan-SE-2.0/src/prior.c0000644000543100007160000002673411021467305014733 0ustar pchanlowelab/* prior.c * Configure prior probability distributions * Mon Sep 6 09:42:16 1993 * * Designed to make it fairly easy to replace the prior with * different numbers, or ever completely different structures, * but for now we just copy a default prior out of prior.h */ #include #include #include "structs.h" #include "prior.h" static int get_real(FILE *fp, double *ret_real); /* Function: DefaultPrior() * * Purpose: Copy the default prior from prior.h into a structure. * * Args: ret_prior: RETURN: new struct containing prior prob. distributions * * Return: 1 on success, 0 on failure. ret_prior is malloc'ed here and * must be free'd by caller by a free(*ret_prior). 
*/ int DefaultPrior(struct prior_s **ret_prior) { struct prior_s *prior; int i,j,fy,ty; /* counters */ if ((prior = (struct prior_s *) malloc (sizeof(struct prior_s))) == NULL) Die("malloc failed"); for (i = 0; i < 7; i++) for (j = 0; j < 4; j++) for (fy = 0; fy < STATETYPES; fy++) for (ty = 0; ty < STATETYPES; ty++) prior->tprior[i][j][fy][ty] = def_tprior[i][j][fy][ty]; for (i = 0; i < ALPHASIZE; i++) { prior->rfreq[i] = def_rfreq[i]; prior->matl_prior[i] = def_matl_prior[i]; prior->matr_prior[i] = def_matr_prior[i]; prior->insl_prior[i] = def_insl_prior[i]; prior->insr_prior[i] = def_insr_prior[i]; for (j = 0; j < ALPHASIZE; j++) prior->matp_prior[i][j] = def_matp_prior[i][j]; } for (ty = 0; ty < STATETYPES; ty++) { prior->talpha[ty] = def_talpha[ty]; prior->emalpha[ty] = def_emalpha[ty]; } *ret_prior = prior; return 1; } /* Function: ReadPrior() * * Purpose: Get a prior from a file. * * Return: 1 on success, 0 on failure. * ret_prior is alloced here, must be free'd by caller */ int ReadPrior(FILE *fp, struct prior_s **ret_prior) { struct prior_s *prior; double param; int fnode,tnode,fs,ts; /* counters */ int i,j; if ((prior = (struct prior_s *) malloc (sizeof(struct prior_s))) == NULL) Die("malloc failed"); /* Read rfreq's: expected symbol emission probability distribution for * unrelated background sequence (the "random model") */ if (!get_real(fp, ¶m)) return 0; prior->rfreq[0] = param; for (i = 1; i < ALPHASIZE; i++) { if (!get_real(NULL, ¶m)) return 0; prior->rfreq[i] = param; } /* Read talpha's: weights attached to state transition priors. * Often all 1.0 */ for (i = 0; i < STATETYPES; i++) { if (!get_real(NULL, ¶m)) return 0; prior->talpha[i] = param; } /* Read emalpha's: weights attached to symbol emission priors. * Often all 1.0. 
INSR_ST, INSL_ST sometimes set very high to * fix insert states as unlearnable */ for (i = 0; i < STATETYPES; i++) { if (!get_real(NULL, ¶m)) return 0; prior->emalpha[i] = param; } /* Read matp_prior: symbol emission priors for matp */ for (i = 0; i < ALPHASIZE; i++) for (j = 0; j < ALPHASIZE; j++) { if (!get_real(NULL, ¶m)) return 0; prior->matp_prior[i][j] = param; } /* Read singlet emission priors, in order MATL, MATR, INSL, INSR */ for (i = 0; i < ALPHASIZE; i++) { if (!get_real(NULL, ¶m)) return 0; prior->matl_prior[i] = param; } for (i = 0; i < ALPHASIZE; i++) { if (!get_real(NULL, ¶m)) return 0; prior->matr_prior[i] = param; } for (i = 0; i < ALPHASIZE; i++) { if (!get_real(NULL, ¶m)) return 0; prior->insl_prior[i] = param; } for (i = 0; i < ALPHASIZE; i++) { if (!get_real(NULL, ¶m)) return 0; prior->insr_prior[i] = param; } /* Read tprior: state transition priors. * In order [7 from nodes][4 to nodes][from state][to state] */ for (fnode = 0; fnode < 7; fnode++) for (tnode = 0; tnode < 4; tnode++) for (fs = 0; fs < STATETYPES; fs++) for (ts = 0; ts < STATETYPES; ts++) { if (!get_real(NULL, ¶m)) return 0; prior->tprior[fnode][tnode][fs][ts] = param; } *ret_prior = prior; return 1; } /* Function: WritePrior() * * Purpose: Write a prior to an open file pointer. * The file is usable as a prior.h header file. * * Return: 1 on success, 0 on failure. */ int WritePrior(FILE *fp, struct prior_s *prior) { int fnode,tnode,fs,ts; /* counters */ int i,j; if (fp == NULL) return 0; fprintf(fp, "#ifdef PRIORH_INCLUDED\n\n"); fprintf(fp, "#include \"structs.h\"\n\n"); /* Write rfreq */ fprintf(fp, "static double def_rfreq[ALPHASIZE] = { "); for (i = 0; i < ALPHASIZE; i++) fprintf(fp, "%f%s", prior->rfreq[i], (i == ALPHASIZE-1) ? " };\n\n" : ", "); /* Write talpha */ fprintf(fp, "static double def_talpha[STATETYPES] =\n{ "); for (i = 0; i < STATETYPES; i++) fprintf(fp, "%f%s", prior->talpha[i], (i == STATETYPES-1) ? 
" };\n\n" : ", "); /* Write emalpha */ fprintf(fp, "static double def_emalpha[STATETYPES] =\n{ "); for (i = 0; i < STATETYPES; i++) fprintf(fp, "%f%s", prior->emalpha[i], (i == STATETYPES-1) ? " };\n\n" : ", "); /* Write matp_prior */ fprintf(fp, "static double def_matp_prior[ALPHASIZE][ALPHASIZE] =\n{\n"); for (i = 0; i < ALPHASIZE; i++) { fprintf(fp, " { "); for (j = 0; j < ALPHASIZE; j++) fprintf(fp, "%f%s", prior->matp_prior[i][j], (j == ALPHASIZE-1) ? " },\n" : ", "); } fprintf(fp, "};\n\n"); /* Write the singlet emission priors, in order MATL, MATR, INSL, INSR */ fprintf(fp, "static double def_matl_prior[ALPHASIZE] = { "); for (i = 0; i < ALPHASIZE; i++) fprintf(fp, "%f%s", prior->matl_prior[i], (i == ALPHASIZE-1) ? " };\n\n" : ", "); fprintf(fp, "static double def_matr_prior[ALPHASIZE] = { "); for (i = 0; i < ALPHASIZE; i++) fprintf(fp, "%f%s", prior->matr_prior[i], (i == ALPHASIZE-1) ? " };\n\n" : ", "); fprintf(fp, "static double def_insl_prior[ALPHASIZE] = { "); for (i = 0; i < ALPHASIZE; i++) fprintf(fp, "%f%s", prior->insl_prior[i], (i == ALPHASIZE-1) ? " };\n\n" : ", "); fprintf(fp, "static double def_insr_prior[ALPHASIZE] = { "); for (i = 0; i < ALPHASIZE; i++) fprintf(fp, "%f%s", prior->insr_prior[i], (i == ALPHASIZE-1) ? " };\n\n" : ", "); /* Write the state transition priors */ fprintf(fp, "static double def_tprior[7][4][STATETYPES][STATETYPES] =\n{\n"); for (fnode = 0; fnode < 7; fnode++) { fprintf(fp, " {\n"); /* open block of 4 tonodes */ for (tnode = 0; tnode < 4; tnode++) { fprintf(fp, " { "); /* open block of 6 from states */ for (fs = 0; fs < STATETYPES; fs++) { fprintf(fp, "%s", fs == 0? " { " : " { "); for (ts = 0; ts < STATETYPES; ts++) fprintf(fp, "%f%s", prior->tprior[fnode][tnode][fs][ts], (ts == STATETYPES-1) ? 
" },\n" : ", "); } fprintf(fp, " },\n\n"); /* end block of 6 from states */ } fprintf(fp, " },\n\n"); /* end block of 4 tonodes */ } fprintf(fp, "};\n"); fprintf(fp, "#endif /* PRIORH_INCLUDED */\n"); return 1; } /* Function: NormalizePrior() * * Purpose: convert a prior containing counts to one suitable for saving; * the sum of each vector is the number of possibilities. * i.e., an emission vector sums to ALPHASIZE, a state transition * vector sums to the number of downstream states, and the MATP * emission table sums to ALPHASIZE*ALPHASIZE. */ void NormalizePrior(struct prior_s *prior) { int fnode, tnode, fs, ts; /* counters */ double sum; double conn; /* count of downstream connected states */ /* Normalize the vectors */ DNorm(prior->rfreq, ALPHASIZE); /* rfreq, random model */ DNorm((double *) prior->matp_prior, ALPHASIZE * ALPHASIZE); /* MATP emission prior */ DNorm(prior->matl_prior, ALPHASIZE); /* MATL emission prior */ DNorm(prior->matr_prior, ALPHASIZE); /* MATR emission prior */ DNorm(prior->insl_prior, ALPHASIZE); /* INSL emission prior */ DNorm(prior->insr_prior, ALPHASIZE); /* INSR emission prior */ /* Scale them to ALPHASIZE or ALPHASIZE*ALPHASIZE */ DScale(prior->rfreq, ALPHASIZE, (double) ALPHASIZE); /* rfreq, random model */ DScale((double *) prior->matp_prior, ALPHASIZE*ALPHASIZE, (double)(ALPHASIZE * ALPHASIZE)); /* MATP */ DScale(prior->matl_prior, ALPHASIZE, (double) ALPHASIZE); /* MATL emission prior */ DScale(prior->matr_prior, ALPHASIZE, (double) ALPHASIZE); /* MATR emission prior */ DScale(prior->insl_prior, ALPHASIZE, (double) ALPHASIZE); /* INSL emission prior */ DScale(prior->insr_prior, ALPHASIZE, (double) ALPHASIZE); /* INSR emission prior */ /* state transition priors * deal with specially */ for (fnode = 0; fnode < 7; fnode++) for (tnode = 0; tnode < 4; tnode++) for (fs = 0; fs < STATETYPES; fs++) { sum = conn = 0.0; for (ts = 0; ts < STATETYPES; ts++) if (prior->tprior[fnode][tnode][fs][ts] > 0.0) { conn += 1.0; sum += 
prior->tprior[fnode][tnode][fs][ts]; } if (sum > 0.0) for (ts = 0; ts < STATETYPES; ts++) prior->tprior[fnode][tnode][fs][ts] = prior->tprior[fnode][tnode][fs][ts] * conn / sum; } } /* Function: get_real() * * Purpose: Read next parameter from fp. * This is a very general reading function for reading * parameters from files with C-style comments. * As long as the caller * knows the order of what he's reading, he can format * the file any way he wants. * * Works somewhat like strtok. Call it with fp * on first invocation; call with NULL on subsequent * invocations. Can't work on multiple files * simultaneously, and can't do anything else * to them (lest we lose track of begin/end comments) * * Also note that we use strtok internally, so * the caller can't call strtok() between get_real calls * on the same file. * * Return: 1 on success; 0 on failure (such as end of file) */ static int get_real(FILE *fp, double *ret_real) { static int in_comment = 0; static FILE *internalfp = NULL; static char *lineptr = NULL; static char stripbuffer[512]; /* buffer with comments removed */ char buffer[512]; char *sptr; char *stripptr; if (fp != NULL) { internalfp = fp; lineptr = NULL; } while (1) { if (lineptr == NULL) { /* Get next line. */ if (fgets(buffer, 512, internalfp) == NULL) return 0; /* Preprocess the line, stripping out comments. * The stripped copy goes into stripbuffer. */ stripptr = stripbuffer; for (sptr = buffer; *sptr; sptr++) { /* If we're in a comment, we're ignoring stuff * until we see end-comment. Else, we're saving * stuff until we see start-comment. 
*/ if (in_comment) { if (*sptr == '*' && *(sptr+1) == '/') { in_comment = 0; sptr++; } } else { if (*sptr == '/' && *(sptr+1) == '*') { in_comment = 1; sptr++; } else { *stripptr = *sptr; stripptr++; } } } *stripptr = '\0'; lineptr = strtok(stripbuffer, WHITESPACE); } /* Now, get the first real and return it */ while (lineptr != NULL) { if (IsReal(lineptr)) { *ret_real = atof(lineptr); lineptr = strtok(NULL, WHITESPACE); return 1; } else lineptr = strtok(NULL, WHITESPACE); } } /*NOTREACHED*/ return 0; } tRNAscan-SE-2.0/src/save.c0000644000543100007160000002063411672025122014526 0ustar pchanlowelab/* save.c * Saving models to disk, and reading them back in * SRE, Wed Sep 8 17:14:43 1993 * * Both binary and flat text save formats are supported. */ #include #include #include #include "funcs.h" #include "structs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif /* The magic number for our binary file is "cm20" + 0x80808080 */ static int v20magic = 0xe3edb2b0; static int read_cm20 (FILE *fp, struct cm_s **ret_cm); static int read_bincm20(FILE *fp, struct cm_s **ret_cm); /* Function: WriteCM() * * Purpose: Print a flat text copy of the data from a model * to a file handle. * * Returns: 1 on success, 0 on failure. 
*/ int WriteCM(FILE *fp, struct cm_s *cm) { int i, j, k; /* header info */ fprintf(fp, "### cove V2\n"); fprintf(fp, "%d \tnodes\n", cm->nodes); /* over all nodes, 0..cm->nodes-1 */ for (k = 0; k < cm->nodes; k++) { fprintf(fp, "### node %d type %d\n", k, cm->nd[k].type); fprintf(fp, "%d %d\n", cm->nd[k].nxt, cm->nd[k].nxt2); /* transitions */ for (i = 0; i < STATETYPES; i++) { for (j = 0; j < STATETYPES; j++) fprintf(fp, "%.5f ", cm->nd[k].tmx[i][j]); putc('\n', fp); } /* INSL emissions */ for (i = 0; i < ALPHASIZE; i++) fprintf(fp, "%.5f ", cm->nd[k].il_emit[i]); fputs("# INSL\n", fp); /* INSR emissions */ for (i = 0; i < ALPHASIZE; i++) fprintf(fp, "%.5f ", cm->nd[k].ir_emit[i]); fputs("# INSR\n", fp); /* MATP emissions */ for (i = 0; i < ALPHASIZE; i++) { for (j = 0; j < ALPHASIZE; j++) fprintf(fp, "%.5f ", cm->nd[k].mp_emit[i][j]); fputs("# MATP\n", fp); } /* MATL emissions */ for (i = 0; i < ALPHASIZE; i++) fprintf(fp, "%.5f ", cm->nd[k].ml_emit[i]); fputs("# MATL\n", fp); /* MATR emissions */ for (i = 0; i < ALPHASIZE; i++) fprintf(fp, "%.5f ", cm->nd[k].mr_emit[i]); fputs("# MATR\n", fp); } return 1; } /* Function: WriteBinaryCM() * * Purpose: Save a model in binary format; somewhat nonportable * but much more compressed. * * Args: fp - open file pointer to write to * cm - model to save * * Return: 1 on success, 0 on failure. */ int WriteBinaryCM(FILE *fp, struct cm_s *cm) { int k; /* counter for nodes */ /* Write the four-byte magic number. It identifies the file * as binary (because the high bits are set), and identifies * the major version of the program. It can be used to * identify and possibly humor byte-swapped architectures, * although we don't bother for now. 
*/ fwrite((void *) &v20magic, 4, 1, fp); /* header info */ fwrite((void *) &(cm->nodes), sizeof(int), 1, fp); /* loop over all nodes in model */ for (k = 0; k < cm->nodes; k++) { /* type of node */ fwrite((void *) &cm->nd[k].type, sizeof(int), 1, fp); /* indices of child nodes */ fwrite((void *) &cm->nd[k].nxt, sizeof(int), 1, fp); fwrite((void *) &cm->nd[k].nxt2, sizeof(int), 1, fp); /* transitions */ fwrite((void *) cm->nd[k].tmx, sizeof(double), STATETYPES*STATETYPES, fp); /* INS emissions */ fwrite((void *) cm->nd[k].il_emit, sizeof(double), ALPHASIZE, fp); fwrite((void *) cm->nd[k].ir_emit, sizeof(double), ALPHASIZE, fp); /* MAT emissions */ fwrite((void *) cm->nd[k].mp_emit, sizeof(double), ALPHASIZE * ALPHASIZE, fp); fwrite((void *) cm->nd[k].ml_emit, sizeof(double), ALPHASIZE, fp); fwrite((void *) cm->nd[k].mr_emit, sizeof(double), ALPHASIZE, fp); } return 1; } /* Function: ReadCM() * * Purpose: Read a flat text copy of the data from a model * from a file. * * Returns: 1 on success, 0 on failure. */ int ReadCM(char *filename, struct cm_s **ret_cm) { FILE *fp; char buffer[512]; int magic_number; /* Open file for reading */ if ((fp = fopen(filename, "r")) == NULL) { Warn("Cannot open model file %s for reading", filename); return 0; } /* Look for "magic" header and dispatch reading * to the appropriate routine. First we check the leading * 4 bytes to see if it's a binary save file. */ if (! fread((void *) &magic_number, 4, 1, fp)) Die("Failed to read magic number from model file %s", filename); if (magic_number == v20magic) { if (! read_bincm20(fp, ret_cm)) Die("Failed to read binary model file %s", filename); } else { rewind(fp); if (fgets(buffer, 512, fp) == NULL) { Warn("ain't no data in the model file %s, pal", filename); return 0; } if (strncmp(buffer, "### cove V2", 11) == 0) { if (! 
read_cm20(fp, ret_cm)) return 0; } else { Warn("File %s is not a recognized covariance model format", filename); return 0; } } /* We're vulnerable to some roundoff error when we've read * files in; make sure all probabilities sum to 1. */ NormalizeCM(*ret_cm); fclose(fp); return 1; } /* Function: read_cm20() * * Purpose: Read flat text model files from version 2.0 of the * package. The file pointer fp is positioned on the line * just after the "magic" header. Allocates, reads in, * and returns the model. */ static int read_cm20(FILE *fp, struct cm_s **ret_cm) { struct cm_s *cm; int i, j, k; int nodes; int ret = 0; /* header info */ ret = fscanf(fp, "%d \tnodes\n", &nodes); /* Given that header info, alloc for a model. */ cm = AllocCM(nodes); if (cm == NULL) { Warn("Failed to allocate model"); return 0; } /* over all nodes, 0..nodes-1 */ for (k = 0; k < nodes; k++) { ret = fscanf(fp, "### node %*d"); ret = fscanf(fp, " type %d\n", &cm->nd[k].type); ret = fscanf(fp, "%d %d\n", &cm->nd[k].nxt, &cm->nd[k].nxt2); /* transitions */ for (i = 0; i < STATETYPES; i++) { for (j = 0; j < STATETYPES; j++) ret = fscanf(fp, "%lf ", &cm->nd[k].tmx[i][j]); ret = fscanf(fp, "\n"); } /* INSL emissions */ for (i = 0; i < ALPHASIZE; i++) ret = fscanf(fp, "%lf ", &cm->nd[k].il_emit[i]); ret = fscanf(fp, "# INSL\n"); /* INSR emissions */ for (i = 0; i < ALPHASIZE; i++) ret = fscanf(fp, "%lf ", &cm->nd[k].ir_emit[i]); ret = fscanf(fp, "# INSR\n"); /* MATP emissions */ for (i = 0; i < ALPHASIZE; i++) { for (j = 0; j < ALPHASIZE; j++) ret = fscanf(fp, "%lf ", &cm->nd[k].mp_emit[i][j]); ret = fscanf(fp, "# MATP\n"); } /* MATL emissions */ for (i = 0; i < ALPHASIZE; i++) ret = fscanf(fp, "%lf ", &cm->nd[k].ml_emit[i]); ret = fscanf(fp, "# MATL\n"); /* MATR emissions */ for (i = 0; i < ALPHASIZE; i++) ret = fscanf(fp, "%lf ", &cm->nd[k].mr_emit[i]); ret = fscanf(fp, "# MATR\n"); } *ret_cm = cm; return 1; } /* Function: read_bincm20() * * Purpose: Read binary save files. 
* * Args: fp - open file pointer for reading, positioned after magic number * ret_cm - RETURN: model * * Return: 1 on success, 0 on failure */ static int read_bincm20(FILE *fp, struct cm_s **ret_cm) { struct cm_s *cm; int nodes; int k; /* counter for nodes */ if (! fread((void *) &(nodes), sizeof(int), 1, fp)) return 0; /* now create space for CM. */ cm = AllocCM(nodes); if (cm == NULL) return 0; /* everything else is nodes */ for (k = 0; k < nodes; k++) { /* type of node */ if (! fread((void *) &cm->nd[k].type, sizeof(int), 1, fp)) return 0; /* indices of child nodes */ if (! fread((void *) &cm->nd[k].nxt, sizeof(int), 1, fp)) return 0; if (! fread((void *) &cm->nd[k].nxt2, sizeof(int), 1, fp)) return 0; /* transitions */ if (! fread((void *) cm->nd[k].tmx, sizeof(double), STATETYPES*STATETYPES, fp)) return 0; /* INS emissions */ if (! fread((void *) cm->nd[k].il_emit, sizeof(double), ALPHASIZE, fp)) return 0; if (! fread((void *) cm->nd[k].ir_emit, sizeof(double), ALPHASIZE, fp)) return 0; /* MAT emissions */ if (! fread((void *) cm->nd[k].mp_emit, sizeof(double), ALPHASIZE*ALPHASIZE, fp)) return 0; if (! fread((void *) cm->nd[k].ml_emit, sizeof(double), ALPHASIZE, fp)) return 0; if (! fread((void *) cm->nd[k].mr_emit, sizeof(double), ALPHASIZE, fp)) return 0; } *ret_cm = cm; return 1; } tRNAscan-SE-2.0/src/sqerror.c0000644000543100007160000000271111021467305015262 0ustar pchanlowelab/* SQUID - A C function library for biological sequence analysis * Copyright (C) 1992-1996 Sean R. Eddy * * This source code is distributed under terms of the * GNU General Public License. See the files COPYING * and GNULICENSE for further details. * */ /* sqerror.c * * error handling for the squid library */ /* a global errno equivalent */ int squid_errno; #include #include #include #ifdef MEMDEBUG #include "dbmalloc.h" #endif /* Function: Die() * * Purpose: Print an error message and die. The arguments * are formatted exactly like arguments to printf(). * * Return: None. 
Exits the program. */ /* VARARGS0 */ int Die(char *format, ...) { va_list argp; /* format the error mesg */ fprintf(stderr, "FATAL: "); va_start(argp, format); vfprintf(stderr, format, argp); va_end(argp); fprintf(stderr, "\n"); fflush(stderr); /* exit */ exit(1); /*NOTREACHED*/ return 1; /* fool lint */ } /* Function: Warn() * * Purpose: Print an error message and return. The arguments * are formatted exactly like arguments to printf(). * * Return: (void) */ /* VARARGS0 */ int Warn(char *format, ...) { va_list argp; /* format the error mesg */ fprintf(stderr, "WARNING: "); va_start(argp, format); vfprintf(stderr, format, argp); va_end(argp); fprintf(stderr, "\n"); fflush(stderr); return 1; } tRNAscan-SE-2.0/src/structcheck_main.c0000644000543100007160000003023511021467306017116 0ustar pchanlowelab/* structcheck_main.c * SRE, Mon Dec 20 07:48:07 1993 * * Check a set of individual RNA structures for non-Watson-Crick * or GU base pairs. Keep statistics on the number of such "errors" found * overall and per sequence. Convert the offending base pairs to "*" * characters in the structure string and print out the structures * and sequences. */ #include #include #include #ifdef NEED_GETOPTH #include #endif #include "structs.h" #include "funcs.h" #include "squid.h" #include "version.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif #define OPTIONS "hmo:psu" static char usage[] = "\ Usage: structcheck [-options] \n\ where options are:\n\ -h : print short help and usage info\n\ -m : only check for more possible base pairs\n\ -o : save flagged structure annotation alignment to \n\ -p : only check for non-Watson-Crick pair assignments\n\ -s : only check for agreement with consensus structure\n\ -u : only check for upper-case structured positions\n"; static char banner[] = "structcheck: check RNA secondary structures, flag questionables"; extern int VerifyKHS(char *ss); int main(int argc, char **argv) { char **aseqs; /* RNA sequences */ AINFO ainfo; /* misc. 
associated alignment info */ int nseq; /* number of seqs */ char *seqfile; /* sequence file */ int idx; /* index for sequences */ int pos; /* position index in a seq */ int *ct; /* CT0 representation of a structure */ int badseq; /* number of bad structures */ int badpairs; /* total bad base pairs */ int pairs; /* total base pairs */ int is_bad_seq; int structure_agrees; /* TRUE if consensus and secondary structure are same */ int nonconsensus; /* count seqs w/ differing cons/indiv ss assignments */ int noncons_positions; /* count of pos w/ differing cons/indiv ss assignments */ int npos; /* number of non-gap positions */ int has_morepairs; /* TRUE if more base pairs are possible than indicated in ss */ int morepairs; /* how many more pairs should've been made in alignment */ int morepair_seqs; /* how many seqs should've had more pairs made in them */ int badupper_seqs; int badupper_bases; int has_badupper; char *ss; /* aligned secondary structure string */ char *outfile; /* save flagged annotated alignment to */ FILE *ofp; /* open outfile for writing */ int check_pairs; /* if TRUE, check that pairs are complementary */ int check_consensus; /* if TRUE, compare indiv structs against consensus */ int check_morepairs; /* if TRUE, check for more obvious base-pairing in ss */ int check_isupper; /* if TRUE, check structured pos's are upper, single-s is lower */ int optc; /* for getopt() */ extern char *optarg; /* for getopt() */ extern int optind; /* for getopt() */ #ifdef MEMDEBUG /* for Cahill's dbmalloc */ unsigned long histid1, histid2, orig_size, current_size; #endif /*********************************************** * Parse command line ***********************************************/ outfile = NULL; check_pairs = FALSE; check_consensus = FALSE; check_morepairs = FALSE; check_isupper = FALSE; while ((optc = getopt(argc, argv, OPTIONS)) != -1) switch (optc) { case 'o': outfile = optarg; break; case 'm': check_morepairs = TRUE; break; case 'p': check_pairs = TRUE; 
break; case 's': check_consensus = TRUE; break; case 'u': check_isupper = TRUE; break; case 'h': printf("%s\n version %s (%s)\n%s", banner, RELEASE, RELEASEDATE, usage); exit(0); default: Die("%s", usage); } /* Default behaviour if we're not doing a single thing */ if (! check_morepairs && ! check_pairs && ! check_consensus && ! check_isupper) { check_morepairs = TRUE; check_pairs = TRUE; check_consensus = TRUE; check_isupper = FALSE; /* did this only for SRP-RNA alignment */ } if (argc - optind != 1) Die("Wrong number of command line arguments.\n%s\n", usage); seqfile = argv[argc-1]; #ifdef MEMDEBUG orig_size = malloc_size(&histid1); #endif /*********************************************** * Get sequence data ***********************************************/ /* read the training seqs from file */ if (! ReadSELEX(seqfile, &aseqs, &nseq, &ainfo)) Die("Failed to read aligned sequence file %s", seqfile); for (idx = 0; idx < nseq; idx++) if (ainfo.sqinfo[idx].flags & SQINFO_SS) break; if (idx == nseq) Die("No secondary structure info in sequence file %s", seqfile); if ( check_consensus == TRUE && !(ainfo.flags & AINFO_CS)) { check_consensus = FALSE; Warn("No consensus structure in %s; can't check against it", seqfile); } /*********************************************** * Print banner ***********************************************/ puts(banner); printf(" release %s, %s\n", RELEASE, RELEASEDATE); printf("---------------------------------------------------\n"); puts(""); badpairs = pairs = badseq = 0; nonconsensus = npos = noncons_positions = 0; morepairs = morepair_seqs = 0; badupper_seqs = badupper_bases = 0; for (idx = 0; idx < nseq; idx++) { if (! (ainfo.sqinfo[idx].flags & SQINFO_SS)) continue; /* Make an aligned secondary structure string for our tests */ MakeAlignedString(aseqs[idx], ainfo.alen, ainfo.sqinfo[idx].ss, &ss); /* Get a ct structure to do the other tests */ if (! 
KHS2ct(ss, ainfo.sqinfo[idx].len, FALSE, &ct)) { printf("sequence %-10.10s (#%d) has an improper secondary structure\n", ainfo.sqinfo[idx].name, idx); free(ct); ct = NULL; VerifyKHS(ss); } /* Check if structured positions are upper-case and unstructured * positions are not. I used this for checking the SRP-RNA alignment * of Larsen and Zwieb, and it may come in handy later too. */ if (check_isupper) { has_badupper = FALSE; for (pos = 0; pos < ainfo.alen; pos++) { if (isgap(aseqs[idx][pos])) continue; if ( (ss[pos] == '.' && ! islower(aseqs[idx][pos])) || (ss[pos] != '.' && ! isupper(aseqs[idx][pos])) ) { has_badupper = TRUE; badupper_bases++; ss[pos] = '*'; } } if (has_badupper) badupper_seqs++; } /* Test for non-Watson-Crick base pairs */ if (check_pairs && ct != NULL) { is_bad_seq = False; for (pos = 0; pos < ainfo.alen; pos++) /* second test makes sure we only look at * each bp once */ if (ct[pos] != -1 && pos < ct[pos]) { pairs++; if (! IsRNAComplement(aseqs[idx][pos], aseqs[idx][ct[pos]], TRUE)) { ss[pos] = ss[ct[pos]] = '*'; is_bad_seq = True; badpairs++; } } if (is_bad_seq) badseq++; } /* Test for disagreement with consensus structure */ if (check_consensus) { structure_agrees = True; for (pos = 0; pos < ainfo.alen; pos++) { if (isgap(aseqs[idx][pos])) continue; npos++; if (ss[pos] != ainfo.cs[pos] ) { structure_agrees = False; ss[pos] = '*'; noncons_positions++; } } if (! structure_agrees) nonconsensus++; } /* Test for whether more pairs are obviously possible in the structure. * *Very* crude. For each base pair i,j, if (i-1,j+1) or * (i+1,j-1) are unpaired but complementary, flag them. This is *not* * a full-blown structure optimization algorithm (such a thing * is possible, but would require dynamic programming), but it should * flag most suspicious spots. 
*/ if (check_morepairs && ct != NULL) { has_morepairs = FALSE; for (pos = 0; pos < ainfo.alen; pos++) /* second test makes sure we only look at * each bp once */ if (ct[pos] != -1 && pos < ct[pos]) { /* check i-1,j+1 pair; careful of ends */ if (pos > 0 && ct[pos] < ainfo.alen && ct[pos-1] == -1 && ct[ct[pos]+1] == -1 && IsRNAComplement(aseqs[idx][pos-1], aseqs[idx][ct[pos]+1], TRUE)) { has_morepairs = TRUE; morepairs++; ss[pos-1] = ss[ct[pos]+1] = '*'; } /* check i+1,j-1 pair; don't need to worry about ends */ if (ct[pos+1] == -1 && ct[ct[pos]-1] == -1 && IsRNAComplement(aseqs[idx][pos+1], aseqs[idx][ct[pos]-1], TRUE)) { has_morepairs = TRUE; morepairs++; ss[pos+1] = ss[ct[pos]-1] = '*'; } } if (has_morepairs) morepair_seqs++; } free(ct); /* Convert aligned ss back to dealigned ss */ free(ainfo.sqinfo[idx].ss); MakeDealignedString(aseqs[idx], ainfo.alen, ss, &(ainfo.sqinfo[idx].ss)); free(ss); } if (outfile == NULL) { if (! WriteSELEX(stdout, aseqs, nseq, &ainfo, 60)) Die("Failed to write alignment to stdout"); } else { if ((ofp = fopen(outfile, "w")) == NULL) Die("Failed to open flagged annotated alignment file %s", outfile); if (! 
WriteSELEX(ofp, aseqs, nseq, &ainfo, 60)) Die("Failed to write alignment to %s", outfile); fclose(ofp); printf("Wrote flagged annotated alignment file to %s\n", outfile); } if (check_pairs) { printf("\nComplementarity check:\n"); printf("%d/%d structures contain non-Watson-Crick, non-GU pairs\n", badseq, nseq); printf("%d/%d base pairs are questionable\n", badpairs, pairs); } if (check_consensus) { printf("\nConsensus structure check:\n"); printf("%d/%d structures disagree with consensus\n", nonconsensus, nseq); printf("%d/%d non-gap sequence positions disagree\n", noncons_positions, npos); } if (check_morepairs) { printf("\nAdditional structure check:\n"); printf("%d/%d structures have obvious additional pairings\n", morepair_seqs, nseq); printf("%d additional base pairs are predicted\n", morepairs); } if (check_isupper) { printf("\nCheck that structured positions are upper case:\n"); printf("%d/%d structures have conflicts\n", badupper_seqs, nseq); printf("%d conflicts are detected\n", badupper_bases); } FreeAlignment(aseqs, nseq, &ainfo); #ifdef MEMDEBUG current_size = malloc_size(&histid2); if (current_size != orig_size) malloc_list(2, histid1, histid2); else fprintf(stderr, "No memory leaks, sir.\n"); #endif return 0; } /* Function: VerifyKHS() * * Purpose: Examine a bad structure string and print out diagnostics * about it. * * Return: 1 if string is OK, 0 if string is bad. */ int VerifyKHS(char *ss) { int symcount[27]; /* 0 is normal pairs. 
1-26 for pseudoknots */ int i; int pos; int status = 1; for (i = 0; i < 27; i++) symcount[i] = 0; for (pos = 0; ss[pos] != '\0'; pos++) { if (ss[pos] > 127) /* evade SGI ctype.h islower(), isupper() bug */ { status = 0; fprintf(stderr, " structure has garbage symbol (val %d) at position %d\n", (int) ss[pos], pos); } else if (ss[pos] == '>') symcount[0] ++; else if (ss[pos] == '<') symcount[0] --; else if (isupper((int) ss[pos])) symcount[ss[pos] - 'A' + 1] ++; else if (islower((int) ss[pos])) symcount[ss[pos] - 'a' + 1] --; else if (ss[pos] != '.') { status = 0; fprintf(stderr, " structure has invalid symbol %c at position %d\n", ss[pos], pos); } } if (symcount[0] != 0) { status = 0; fprintf(stderr, " structure has extra paired bases: %d on %s\n", abs(symcount[0]), (symcount[0] > 0) ? "left" : "right"); } for (i = 1; i < 27; i++) if (symcount[i] != 0) { status = 0; fprintf(stderr, " structure has extra paired bases for pseudoknot %c: %d on %s\n", (char) (i + 'A' - 1), abs(symcount[i]), (symcount[i] > 0) ? "left" : "right"); } return status; } tRNAscan-SE-2.0/src/probify.c0000644000543100007160000001561711021467305015250 0ustar pchanlowelab/* probify.c * Convert counts to probabilities, using regularization. * SRE, Fri Sep 3 08:02:49 1993 * * Because our training sequence set is finite (and often small) * and our number of free parameters is large, we have a small * sample statistics problem in estimating each parameter. * * The simplest way to deal with this problem is to use the * so-called Laplace law of succession. If I have N sequences * with a base symbol assigned to some state, and see n A's, * I calculate the emission probability of A as (n+1)/(N+4), * i.e., adding 1 to the numerator and the number of possible * outcomes (4) to the denominator. A summary of the proof of * this appears in Berg & von Hippel (J Mol Biol 193:723-750, 1987). * It is referred to as the "plus-one prior" by David Haussler * and by us. 
* * The plus-one prior implies that we have no knowledge at all about * the prior probabilities; in absence of any data, the probabilities * default to 1/4. What if we do have prior information? (For instance, * for state transitions, we know that deletes and inserts are * relatively rare.) We use a generalization of the Laplace law * of succession: * n(x) + alpha * R(x) * P(x) = ------------------- * --- * \ n(i) + alpha * R(i) * /__ * i * * Here, R(x) is a "regularizer" and alpha is a weight applied * to the regularizer. Both were 1.0 in the plus-one prior. * Now, we can bias R(x) to reflect our prior expectations * about the probability distribution P(x). (In practice, * we calculate R(x) by specifying a prior probability distribution * and multiplying each term by the number of possible outcomes.) * alpha is a "confidence" term; the higher alpha is, the more * data it takes to outweigh the prior. We usually set alpha to * 1.0, but sometimes -- such as for insert state emissions, * where we may assert that the emission probabilities are * the same as random regardless of the data -- we might use * arbitrarily high alpha's to freeze certain probability distributions * at their priors. * * All this follows the description in Krogh et. al's HMM paper * (in press, JMB, 1993). * */ #include "structs.h" #include "funcs.h" /* Function: ProbifyCM() * * Purpose: Convert all the state transitions and symbol emissions in * a covariance model from counts to probabilities. * * Args: cm - the model to convert * * Return: (void). Counts in cm become probabilities. 
*/
/* For each node: the transition matrix is converted first (bifurcation
 * nodes are skipped), then whichever emission tables that node type
 * uses. The destination node type given to ProbifyTransitionMatrix()
 * is END_NODE when the node's nxt is -1, and the type of node idx+1
 * otherwise.
 */
void
ProbifyCM(struct cm_s *cm, struct prior_s *prior)
{
  int idx;			/* node being converted        */
  int ntype;			/* its node type               */
  int to_type;			/* type of the node it goes to */

  for (idx = 0; idx < cm->nodes; idx++)
    {
      ntype = cm->nd[idx].type;

      if (ntype != BIFURC_NODE)
	{
	  to_type = (cm->nd[idx].nxt == -1) ? END_NODE : cm->nd[idx+1].type;
	  ProbifyTransitionMatrix(cm->nd[idx].tmx, ntype, to_type, prior);
	}

      switch (ntype)
	{
	case BIFURC_NODE:	/* no emission tables to convert */
	case BEGINL_NODE:
	  break;

	case BEGINR_NODE:
	  ProbifySingletEmission(cm->nd[idx].il_emit, uINSL_ST, prior);
	  break;

	case ROOT_NODE:
	  ProbifySingletEmission(cm->nd[idx].il_emit, uINSL_ST, prior);
	  ProbifySingletEmission(cm->nd[idx].ir_emit, uINSR_ST, prior);
	  break;

	case MATL_NODE:
	  ProbifySingletEmission(cm->nd[idx].il_emit, uINSL_ST, prior);
	  ProbifySingletEmission(cm->nd[idx].ml_emit, uMATL_ST, prior);
	  break;

	case MATR_NODE:
	  ProbifySingletEmission(cm->nd[idx].ir_emit, uINSR_ST, prior);
	  ProbifySingletEmission(cm->nd[idx].mr_emit, uMATR_ST, prior);
	  break;

	case MATP_NODE:
	  ProbifySingletEmission(cm->nd[idx].il_emit, uINSL_ST, prior);
	  ProbifySingletEmission(cm->nd[idx].ir_emit, uINSR_ST, prior);
	  ProbifySingletEmission(cm->nd[idx].ml_emit, uMATL_ST, prior);
	  ProbifySingletEmission(cm->nd[idx].mr_emit, uMATR_ST, prior);
	  ProbifyPairEmission(cm->nd[idx].mp_emit, prior);
	  break;

	default:
	  Die("Unrecognized node type %d at node %d", ntype, idx);
	}
    }
}

/* Function: ProbifyTransitionMatrix() * * Purpose: Convert the state transition matrix between two nodes * from counts to probabilities. * * Args: tmx: 6x6 state transition matrix of counts * from_node: e.g. MATP_NODE, type of node we transit from * to_node: type of node we transit to * prior: prior probability distributions * * Return: (void). Values in tmx become probabilities. 
*/ void ProbifyTransitionMatrix(double tmx[STATETYPES][STATETYPES], int from_node, int to_node, struct prior_s *prior) { int i,j; double denom; for (i = 0; i < STATETYPES; i++) { /* if no transitions to DEL in prior, this must be an unused vector */ if (prior->tprior[from_node][to_node][i][0] > 0.0) { denom = 0.0; for (j = 0; j < STATETYPES; j++) { tmx[i][j] = tmx[i][j] + prior->talpha[i] * prior->tprior[from_node][to_node][i][j]; denom += tmx[i][j]; } for (j = 0; j < STATETYPES; j++) tmx[i][j] /= denom; } } } /* Function: ProbifySingletEmission() * * Purpose: Convert a singlet emission vector from counts to probabilities. * * Args: emvec: the emission vector * statetype: type of state: uMATL_ST, uMATR_ST, uINSL_ST, uINSR_ST * prior: prior probability distributions * * Return: (void). Values in emvec become probabilities. */ void ProbifySingletEmission(double emvec[ALPHASIZE], int statetype, struct prior_s *prior) { int x; double denom; double *em_prior; /* Choose the correct prior probability distribution to use. */ switch (statetype) { case uMATL_ST: em_prior = prior->matl_prior; break; case uMATR_ST: em_prior = prior->matr_prior; break; case uINSL_ST: em_prior = prior->insl_prior; break; case uINSR_ST: em_prior = prior->insr_prior; break; default: Die("statetype %d is not a singlet emitting state\n", statetype); } denom = 0.0; for (x = 0; x < ALPHASIZE; x++) { emvec[x] = emvec[x] + prior->emalpha[StatetypeIndex(statetype)] * em_prior[x]; denom += emvec[x]; } if (denom > 0.0) for (x = 0; x < ALPHASIZE; x++) emvec[x] /= denom; } /* Function: ProbifyPairEmission() * * Purpose: Convert a MATP pairwise emission matrix from counts to probabilities. * * Args: emx: the emission matrix * prior: prior probability distributions * * Return: (void). Values in emx become probabilities. 
*/
/* Two passes over the matrix: the first adds the weighted prior to
 * every cell and accumulates the grand total, the second divides every
 * cell by that total. The second pass only runs when the total is
 * greater than zero.
 */
void
ProbifyPairEmission(double          emx[ALPHASIZE][ALPHASIZE],
		    struct prior_s *prior)
{
  int    left, right;		/* row and column of the pair */
  double total = 0.0;

  for (left = 0; left < ALPHASIZE; left++)
    {
      for (right = 0; right < ALPHASIZE; right++)
	{
	  emx[left][right] = emx[left][right]
	    + prior->emalpha[MATP_ST] * prior->matp_prior[left][right];
	  total += emx[left][right];
	}
    }

  if (!(total > 0.0))
    return;			/* nothing to normalize by */

  for (left = 0; left < ALPHASIZE; left++)
    {
      for (right = 0; right < ALPHASIZE; right++)
	emx[left][right] /= total;
    }
}

tRNAscan-SE-2.0/src/misc.c0000644000543100007160000001524011021467304014520 0ustar pchanlowelab/* misc.c * Stuff with no obvious other place to go; * mostly alphabet-related functions. * SRE, Mon Sep 6 10:50:46 1993 */ #include #include #include #include #include "squid.h" #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif char *ALPHABET = RNA_ALPHABET; /* Function: SymbolIndex() * * Purpose: Given a sequence symbol, return an array index. * Used for retrieving emission statistics from the * model. This would be trivial, except that we must * expect to see degenerate codes for both RNA and * protein sequence. The rule we follow for degenerate * codes is like BLAST -- we choose one possibility * at random. Note that this is unlike the HMM package, * which goes to the trouble of calculating a weighted * average of the possibilities. * * If the symbol is not recognized, print a warning but * treat it as a fully ambiguous position (N or X). * * Return: the index, usually 0..3 or 0..19. 
*/ int SymbolIndex(char sym) { char *sptr; /* trivial case: sym is in alphabet */ if ((sptr = strchr(ALPHABET, sym)) != NULL) return (sptr - ALPHABET); /* non-trivial case: possible degenerate symbol */ if (ALPHATYPE == kAmino) { switch (sym) { case 'B': return (strchr(ALPHABET, "ND"[CHOOSE(2)]) - ALPHABET); case 'Z': return (strchr(ALPHABET, "QE"[CHOOSE(2)]) - ALPHABET); default: Warn("Warning: unrecognized character %c in sequence\n", sym); /* break thru to case 'X' */ case 'X': return(CHOOSE(20)); } } else if (ALPHATYPE == kDNA) { switch (sym) { case 'B': return (strchr(ALPHABET, "CGT"[CHOOSE(3)]) - ALPHABET); case 'D': return (strchr(ALPHABET, "AGT"[CHOOSE(3)]) - ALPHABET); case 'H': return (strchr(ALPHABET, "ACT"[CHOOSE(3)]) - ALPHABET); case 'K': return (strchr(ALPHABET, "GT" [CHOOSE(2)]) - ALPHABET); case 'M': return (strchr(ALPHABET, "AC" [CHOOSE(2)]) - ALPHABET); case 'R': return (strchr(ALPHABET, "AG" [CHOOSE(2)]) - ALPHABET); case 'S': return (strchr(ALPHABET, "CG" [CHOOSE(2)]) - ALPHABET); case 'U': return (strchr(ALPHABET, 'T') - ALPHABET); case 'V': return (strchr(ALPHABET, "ACG"[CHOOSE(3)]) - ALPHABET); case 'W': return (strchr(ALPHABET, "AT" [CHOOSE(2)]) - ALPHABET); case 'Y': return (strchr(ALPHABET, "CT" [CHOOSE(2)]) - ALPHABET); case 'N': return (CHOOSE(4)); case 'X': return (CHOOSE(4)); /* X is not IUPAC, but that doesn't stop biologists from using it. 
*/ default: Warn("Warning: unrecognized character %c in sequence\n", sym); return (CHOOSE(4)); } } else if (ALPHATYPE == kRNA) { switch (sym) { case 'B': return (strchr(ALPHABET, "CGU"[CHOOSE(3)]) - ALPHABET); case 'D': return (strchr(ALPHABET, "AGU"[CHOOSE(3)]) - ALPHABET); case 'H': return (strchr(ALPHABET, "ACU"[CHOOSE(3)]) - ALPHABET); case 'K': return (strchr(ALPHABET, "GU" [CHOOSE(2)]) - ALPHABET); case 'M': return (strchr(ALPHABET, "AC" [CHOOSE(2)]) - ALPHABET); case 'R': return (strchr(ALPHABET, "AG" [CHOOSE(2)]) - ALPHABET); case 'S': return (strchr(ALPHABET, "CG" [CHOOSE(2)]) - ALPHABET); case 'T': return (strchr(ALPHABET, 'U') - ALPHABET); case 'V': return (strchr(ALPHABET, "ACG"[CHOOSE(3)]) - ALPHABET); case 'W': return (strchr(ALPHABET, "AU" [CHOOSE(2)]) - ALPHABET); case 'Y': return (strchr(ALPHABET, "CU" [CHOOSE(2)]) - ALPHABET); case 'N': return (CHOOSE(4)); case 'X': return (CHOOSE(4)); /* X is not IUPAC, but that doesn't stop biologists from using it. */ default: Warn("Warning: unrecognized character %c in sequence\n", sym); return (CHOOSE(4)); } } return 0; /* not reached */ } /* Function: PrepareSequence() * * Purpose: Ran into a severe bug caused by degenerate symbols. Original * strategy was to randomly assign a single symbol as we do * Viterbi calculations, but since we don't keep traceback pointers * when the trace tries to recalculate, it doesn't know the * random choices made by VitFill(). * * This is a fix, and it's a bit more extreme. Go through a * sequence and *replace* degenerate symbols once * and for all with single randomly chosen ones. Also, * we convert to upper case ALPHATYPE alphabet. * * Args: seq - sequence to prepare. * * Return: 1 on success, 0 on failure. */ int PrepareSequence(char *seq) { char *sym; for (sym = seq; *sym != '\0'; sym++) { *sym = toupper((int)*sym); /* sym is in alphabet, or a gap? ok, go to next one */ if (strchr(ALPHABET, *sym) != NULL || isgap(*sym)) continue; /* then it's a degenerate symbol. 
* According to alphabet, choose a single symbol to represent it. * watch out for too-clever scheme for random choice: "ABC"[random() % 3] */ if (ALPHATYPE == kRNA) { switch (*sym) { case 'B': *sym = "CGU"[CHOOSE(3)]; break; case 'D': *sym = "AGU"[CHOOSE(3)]; break; case 'H': *sym = "ACU"[CHOOSE(3)]; break; case 'K': *sym = "GU" [CHOOSE(2)]; break; case 'M': *sym = "AC" [CHOOSE(2)]; break; case 'R': *sym = "AG" [CHOOSE(2)]; break; case 'S': *sym = "CG" [CHOOSE(2)]; break; case 'T': *sym = 'U'; break; case 'V': *sym = "ACG"[CHOOSE(3)]; break; case 'W': *sym = "AU" [CHOOSE(2)]; break; case 'Y': *sym = "CU" [CHOOSE(2)]; break; default: Warn("Warning: unrecognized character %c in sequence\n", *sym); /* break through to case 'N' */ case 'N': *sym = ALPHABET[CHOOSE(4)]; break; } } else if (ALPHATYPE == kDNA) { switch (*sym) { case 'B': *sym = "CGT"[CHOOSE(3)]; break; case 'D': *sym = "AGT"[CHOOSE(3)]; break; case 'H': *sym = "ACT"[CHOOSE(3)]; break; case 'K': *sym = "GT" [CHOOSE(2)]; break; case 'M': *sym = "AC" [CHOOSE(2)]; break; case 'R': *sym = "AG" [CHOOSE(2)]; break; case 'S': *sym = "CG" [CHOOSE(2)]; break; case 'U': *sym = 'T'; break; case 'V': *sym = "ACG"[CHOOSE(3)]; break; case 'W': *sym = "AT" [CHOOSE(2)]; break; case 'Y': *sym = "CT" [CHOOSE(3)]; break; default: Warn("Warning: unrecognized character %c in sequence\n", *sym); /* break through to case 'N' */ case 'N': *sym = ALPHABET[CHOOSE(4)]; break; } } else { Warn("Warning: non-nucleic acid alphabet, unrecognized character %c in sequence\n", *sym); *sym = ALPHABET[CHOOSE(ALPHASIZE)]; } } return 1; } tRNAscan-SE-2.0/src/structs.c0000644000543100007160000001733511021467306015305 0ustar pchanlowelab/* structs.c * 1.0: SRE, Tue Jul 6 18:52:34 1993 * 2.0: SRE, Thu Sep 9 14:19:19 1993 * * Boring stuff which had better be flawless. * * Implementation of data structures. 
Includes * various pushdown stacks used for traversing model trees and * traceback trees, and linked lists used for collapsing trees * into linear alignments/strings. * * Stacks have dummy start states. The end is just a NULL * pointer off the last state in the stack. * * Linked lists have dummy start and end states, to facilitate * insertion and deletion. * * Pop functions only return values when the passed pointers * are non-NULL, so you can ask for whatever fields you want. * * For implementation of traceback tree structures, see trace.c. * For implementation of model structures, see model.c. */ #include #include "structs.h" #include "funcs.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif

/* Function: StatetypeIndex()
 *
 * Purpose:  Map a unique statetype identifier (one of the u*_ST
 *           values) onto the array index (*_ST) used for that
 *           state. An unknown identifier is fatal.
 */
int
StatetypeIndex(int type)
{
  if (type == uBEGIN_ST)  return BEGIN_ST;
  if (type == uBIFURC_ST) return BIFURC_ST;
  if (type == uDEL_ST)    return DEL_ST;
  if (type == uEND_ST)    return END_ST;
  if (type == uMATP_ST)   return MATP_ST;
  if (type == uMATL_ST)   return MATL_ST;
  if (type == uMATR_ST)   return MATR_ST;
  if (type == uINSR_ST)   return INSR_ST;
  if (type == uINSL_ST)   return INSL_ST;

  Die("no such state, %d", type);
  /*NOTREACHED*/
  return 0;
}

/* Function: UniqueStatetype()
 *
 * Purpose:  Map an array index statetype back onto a unique
 *           statetype. Only the DEL_ST index is ambiguous; for it
 *           the node type decides: -1 gives the end state, a
 *           bifurcation node the bifurcation state, a BEGINL or
 *           BEGINR node the begin state, and anything else a
 *           delete state. An unknown index is fatal.
 */
int
UniqueStatetype(int nodetype, int stidx)
{
  if (stidx == DEL_ST)
    {
      if (nodetype == -1)          return uEND_ST;
      if (nodetype == BIFURC_NODE) return uBIFURC_ST;
      if (nodetype == BEGINL_NODE ||
	  nodetype == BEGINR_NODE) return uBEGIN_ST;
      return uDEL_ST;
    }

  if (stidx == MATP_ST) return uMATP_ST;
  if (stidx == MATL_ST) return uMATL_ST;
  if (stidx == MATR_ST) return uMATR_ST;
  if (stidx == INSR_ST) return uINSR_ST;
  if (stidx == INSL_ST) return uINSL_ST;

  Die("no such state index %d", stidx);
  /*NOTREACHED*/
  return 0;
}

/************************************************************ * m2ali_s implementation. 
* * Functions: Init_m2ali() * Push_m2ali() * Pop_m2ali() * Free_m2ali() * * Implementation of the pushdown stack for traversing a model * and producing an alignment as a linked list of align_s * structures. Must keep track of a current node in the model * tree (stateidx, subtype) and a current insertion point in * the alignment (insafter) *************************************************************/ struct m2ali_s * Init_m2ali(void) { struct m2ali_s *stack; if ((stack = (struct m2ali_s *) malloc (sizeof(struct m2ali_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); stack->nxt = NULL; return stack; } void Push_m2ali(struct m2ali_s *stack, int nodeidx, int type, struct align_s *after) { struct m2ali_s *new; if ((new = (struct m2ali_s *) malloc (sizeof(struct m2ali_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); new->nodeidx= nodeidx; new->type = type; new->after = after; new->nxt = stack->nxt; stack->nxt = new; } int Pop_m2ali(struct m2ali_s *stack, int *ret_nodeidx, int *ret_type, struct align_s **ret_after) { struct m2ali_s *old; if (stack->nxt == NULL) return 0; old = stack->nxt; stack->nxt = old->nxt; if (ret_nodeidx != NULL) *ret_nodeidx = old->nodeidx; if (ret_type != NULL) *ret_type = old->type; if (ret_after != NULL) *ret_after = old->after; free(old); return 1; } void Free_m2ali( struct m2ali_s *stack ) { while (Pop_m2ali(stack, (int *) NULL, (int *) NULL, (struct align_s **) NULL)) ; free(stack); } /*************************************************************** * t2ali_s implementation. * * Functions: Init_t2ali() * Push_t2ali() * Pop_t2ali() * Free_t2ali() * * Implementation of the pushdown stack for traversing a traceback * and producing a linked list of align_s structures. 
****************************************************************/ struct t2ali_s * Init_t2ali(void) { struct t2ali_s *stack; if ((stack = (struct t2ali_s *) malloc (sizeof(struct t2ali_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); stack->nxt = NULL; return stack; } void Push_t2ali(struct t2ali_s *stack, struct trace_s *tracenode, struct align_s *after) { struct t2ali_s *new; if ((new = (struct t2ali_s *) malloc (sizeof(struct t2ali_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); new->tracenode = tracenode; new->after = after; new->nxt = stack->nxt; stack->nxt = new; } int Pop_t2ali(struct t2ali_s *stack, struct trace_s **ret_tracenode, struct align_s **ret_after) { struct t2ali_s *old; if (stack->nxt == NULL) return 0; old = stack->nxt; stack->nxt = old->nxt; if (ret_tracenode != NULL) *ret_tracenode = old->tracenode; if (ret_after != NULL) *ret_after = old->after; free(old); return 1; } void Free_t2ali( struct t2ali_s *stack ) { while (Pop_t2ali(stack, (struct trace_s **) NULL, (struct align_s **) NULL)) ; free(stack); } /************************************************************ * align_s implementation * * Functions: Init_align() * Insafter_align() * Free_align() * Print_align() * * Implementation of a forward-linked list for alignment of * a model to a sequence. 
************************************************************/ struct align_s * Init_align(void) { struct align_s *head; struct align_s *tail; if ((head = (struct align_s *) malloc (sizeof(struct align_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); if ((tail = (struct align_s *) malloc (sizeof(struct align_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); head->sym = tail->sym = ' '; head->ss = tail->ss = ' '; head->pos = tail->pos = -1; head->nodeidx = tail->nodeidx = -1; head->type = tail->type = -1; head->nxt = tail; tail->nxt = NULL; return head; } struct align_s * Insafter_align(int pos, char sym, /* ACGU base character */ char ss, /* <.> secondary structure character */ int nodeidx, int type, struct align_s *after) { struct align_s *new; if ((new = (struct align_s *) malloc (sizeof(struct align_s))) == NULL) Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); new->pos = pos; new->sym = sym; new->ss = ss; new->nodeidx = nodeidx; new->type = type; new->nxt = after->nxt; after->nxt = new; return new; } void Delafter_align(struct align_s *after) { struct align_s *old; old = after->nxt; after->nxt = old->nxt; free(old); } void Free_align(struct align_s *head) { struct align_s *old; while (head != NULL) { old = head; head = head->nxt; free(old); } } #ifdef DEBUG void Print_align(struct align_s *head) { struct align_s *curr; for (curr = head->nxt; curr->nxt != NULL; curr = curr->nxt) fprintf(stderr, "%2d %c %c %2d %2d\n", curr->pos, curr->sym, curr->ss, curr->nodeidx, curr->type); } #endif /* DEBUG */ tRNAscan-SE-2.0/src/train_main.c0000644000543100007160000003407011021467306015712 0ustar pchanlowelab/* train_main.c * 2.0: Fri Oct 1 16:02:11 1993 * SRE, Mon Jun 28 17:52:54 1993 * * main() for covet: training of a covariance hmm from aligned seqs */ #include #include #include #include #include #ifdef NEED_GETOPTH #include #endif #include "structs.h" #include "funcs.h" #include "squid.h" 
#include "version.h"

#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif

#define OPTIONS "a:A:b:fhG:i:mp:s:X:"

/* NOTE(review): the <...> argument placeholders were missing from this
 * copy of the usage text. They are restored here to agree with OPTIONS
 * and with how main() consumes its arguments -- confirm the exact
 * wording against the upstream COVE source.
 */
static char usage[]  = "\
Usage: covet [-options] <cmfile output> <seqfile in>\n\
where options are:\n\
 -a <seqfile>  : make starting model from alignment\n\
 -A <filename> : save alignments to filename.1, etc., for animation\n\
 -b <file>     : each iteration, back up curr model to <file>\n\
 -f            : use flat text save formats, portable but clumsy\n\
 -G <gop>      : gap-open prob 0 < gop < 1 for random alignment generation\n\
 -h            : print short help and version info\n\
 -i <cmfile>   : take start model from <cmfile>\n\
 -m            : do maximum likelihood model construction (slow!)\n\
 -p <file>     : use prior in <file>; default is Laplace plus-one\n\
 -s <seed>     : set random() seed\n\
 -X <gex>      : gap-extend prob 0 < gex < 1 for random alignment generation\n";

static char banner[] = "covet: training of a covariance model";

/* Function: main()
 *
 * Purpose:  covet driver. Reads the training sequences and a prior,
 *           builds a starting model (from an alignment, an existing
 *           model, or a random alignment), then iterates
 *           align -> rebuild until the fractional score change drops
 *           to the convergence threshold or max_iterations is
 *           reached. The final model is written to <cmfile output>.
 *
 * Args:     argc, argv - command line; see usage[].
 *
 * Return:   0 on success. Fatal errors are reported through Die().
 */
int
main(int argc, char **argv)
{
  char    **rseqs;		/* training sequences                         */
  SQINFO   *sqinfo;		/* array of sqinfo structures for rseqs       */
  AINFO     ainfo;		/* alignment info                             */
  int       nseq;		/* number of seqs                             */
  char     *seqfile;		/* training sequence file                     */
  int       format;		/* seqfile format                             */
  char     *cmfile;		/* OUTPUT: saved cvhmm                        */
  FILE     *cmfp;		/* OUTPUT: fp to cvfile                       */
  struct cm_s *cm;		/* model                                      */
  struct cm_s *newcm;		/* new model                                  */
  struct istate_s *icm;		/* model, integer log odds form               */
  int       statenum;		/* # of states in the model                   */
  struct prior_s *prior;	/* prior prob. distributions                  */
  int       idx;		/* counter for sequences                      */
  double    score;		/* score of individual alignment              */
  double    totscore;		/* summed scores over training seqs           */
  double    oldscore;		/* previous totscore for old model            */
  double    delta;		/* fractional change in scores for iteration  */
  int       iteration;		/* iteration number we're on                  */
  struct trace_s **tr;		/* tracebacks for each sequence               */
  double    rfreq[ALPHASIZE];	/* expected background symbol frequencies     */
  int       max_iterations;
  double    threshold;		/* fractional tolerance, test for convergence */
  char     *aseqfile;		/* sequence alignment for model building      */
  char    **aseqs;		/* aligned sequences in aseqfile              */
  int       num_aseqs;		/* number of aseqs in aseqfile                */
  char     *in_cmfile;		/* file containing input model                */
  char     *bckfile;		/* backup file for saving models each iteration */
  FILE     *bckfp;		/* open pointer to backup file                */
  int       do_flatformat;	/* TRUE if we save in flat text format        */
  int       seed;		/* seed for random()                          */
  char     *animateroot;	/* root name for alignment animation save files */
  char      animationfile[256];	/* full animation file name (root.1, etc.)    */
  FILE     *animfp;
  double    random_open;	/* insert-open probability for random alignment */
  double    random_extend;	/* ins-extend probability for random alignment  */
  int       fast_version;	/* do Fastmodelmaker(), not Maxmodelmaker()   */
  char     *prifile;		/* file to obtain prior from                  */
  FILE     *prifp;		/* open prifile for reading                   */
  int       watsoncrick;	/* if TRUE, annotate only canonical pairs     */
  int       namelen;		/* length snprintf() wanted for animationfile */

  int          optc;
  extern char *optarg;		/* for getopt() */
  extern int   optind;		/* for getopt() */

#ifdef MEMDEBUG
  unsigned long histid1, histid2, orig_size, current_size;
#endif

  /***********************************************
   * Parse command line
   ***********************************************/

  threshold      = 0.01;		/* default: 1% */
  max_iterations = 100;
  aseqfile       = NULL;
  in_cmfile      = NULL;
  bckfile        = NULL;
  do_flatformat  = 0;
  seed           = (int) time (0);	/* default: "random" seed */
  animateroot    = NULL;
  random_open    = 0.02;
  random_extend  = 0.39;
  fast_version   = TRUE;
  prifile        = NULL;
  watsoncrick    = TRUE;

  while ((optc = getopt(argc, argv, OPTIONS)) != -1)
    switch (optc) {
    case 'a': aseqfile      = optarg;       break;
    case 'A': animateroot   = optarg;       break;
    case 'b': bckfile       = optarg;       break;
    case 'f': do_flatformat = 1;            break;
    case 'G': random_open   = atof(optarg); break;
    case 'i': in_cmfile     = optarg;       break;
    case 'm': fast_version  = FALSE;        break;
    case 'p': prifile       = optarg;       break;
    case 's': seed          = atoi(optarg); break;
    case 'X': random_extend = atof(optarg); break;

    case 'h':
      printf("%s\n version %s (%s)\n%s\n", banner, RELEASE, RELEASEDATE, usage);
      exit(0);

    default:
      Die("Error: unrecognized option %c\n", optc);
    }

  if (argc - optind != 2)
    Die("%s\n", usage);
  if (aseqfile != NULL && in_cmfile != NULL)
    Die("options -i and -a are exclusive\n%s", usage);

  cmfile  = argv[argc-2];
  seqfile = argv[argc-1];

  sre_srandom(seed);

#ifdef MEMDEBUG
  orig_size = malloc_size(&histid1);
#endif

  /***********************************************
   * Get sequence data and a prior
   ***********************************************/

				/* random background model */
  rfreq[0] = rfreq[1] = rfreq[2] = rfreq[3] = 0.25;

  if (! SeqfileFormat(seqfile, &format, NULL))
    Die("Failed to determine format of sequence file %s", seqfile);

				/* read the training seqs from file */
  if (! ReadMultipleRseqs(seqfile, format, &rseqs, &sqinfo, &nseq))
    Die("Failed to read sequences from file %s", seqfile);

  for (idx = 0; idx < nseq; idx++)
    PrepareSequence(rseqs[idx]);

  if (prifile == NULL)
    {
      if (! DefaultPrior(&prior))
	Die("Failed to copy prior probability distribution information");
    }
  else
    {
      if ((prifp = fopen(prifile, "r")) == NULL)
	Die("Failed to open prior probability file %s", prifile);
      if (! ReadPrior(prifp, &prior))
	Die("Failed to read prior probabilities from %s", prifile);
      fclose(prifp);
    }

  /***********************************************
   * Create the starting model
   ***********************************************/

  if (aseqfile != NULL)		/* A start from an alignment */
    {
      if (! SeqfileFormat(aseqfile, &format, NULL))
	Die("Failed to determine format of seed alignment file %s", aseqfile);

      if (! ReadAlignment(aseqfile, format, &aseqs, &num_aseqs, &ainfo))
	Die("Failed to read alignment from %s", aseqfile);

      for (idx = 0; idx < num_aseqs; idx++)
	s2upper(aseqs[idx]);

      if (fast_version)
	{
	  /* The seed alignment holds num_aseqs sequences, which need not
	   * equal the nseq training sequences; passing nseq here would
	   * index aseqs with the wrong count.
	   */
	  if (! Fastmodelmaker(aseqs, &ainfo, num_aseqs, prior, 0.70, NULL, &cm, NULL))
	    Die("Fastmodelmaker failed to create starting model from alignment");
	}
      else
	{
	  if (! Maxmodelmaker(aseqs, &ainfo, num_aseqs, -1.0, prior, NULL, &cm, NULL))
	    Die("Failed to create starting model from alignment");
	}

      FreeAlignment(aseqs, num_aseqs, &ainfo);
    }

  else if (in_cmfile)		/* A start from an existing model */
    {
      if (! ReadCM(in_cmfile, &cm))
	Die("Failed to read starting model from file %s", in_cmfile);
    }

  else				/* A start from a flat model */
    {
      RandomAlignment(rseqs, sqinfo, nseq, random_open, random_extend, &aseqs, &ainfo);

      if (fast_version)
	{
	  if (! Fastmodelmaker(aseqs, &ainfo, nseq, prior, 0.70, NULL, &cm, NULL))
	    Die("Fastmodelmaker failed to create starting model from alignment");
	}
      else
	{
	  if (! Maxmodelmaker(aseqs, &ainfo, nseq, -1.0, prior, NULL, &cm, NULL))
	    Die("Failed to create starting model from alignment");
	}

      FreeAlignment(aseqs, nseq, &ainfo);
    }

  /***********************************************
   * Print banner
   ***********************************************/

  puts(banner);
  printf(" release %s, %s\n\n", RELEASE, RELEASEDATE);
  printf("---------------------------------------------------\n");
  printf("Training data: %s (%d sequences)\n", seqfile, nseq);
  if (aseqfile != NULL)
    printf("Starting model: from alignment in %s (%d seqs)\n", aseqfile, num_aseqs);
  else if (in_cmfile != NULL )
    printf("Starting model: from existing model in %s\n", in_cmfile);
  else
    printf("Starting model: random alignment\n");
  printf("Prior distributions: %s\n", prifile == NULL ? "plus-one" : prifile);
  printf("Modelmaking strategy: %s\n", fast_version ? "fast heuristic" : "max likelihood");
  printf("Convergence threshold: %.4f\n", threshold);
  printf("Maximum iterations: %d\n", max_iterations);
  if (bckfile != NULL)
    printf("Backup model file: %s\n", bckfile);
  printf("seed for random(): %d\n", seed);
  printf("---------------------------------------------------\n");
  puts("");

  /***********************************************
   * Train model by expectation maximization
   ***********************************************/

  if ((tr = (struct trace_s **) malloc (nseq * sizeof(struct trace_s *))) == NULL)
    Die("Memory failure, line %d of %s", __LINE__, __FILE__);

  oldscore  = -1.0 * HUGE_VAL;
  iteration = 0;
  while (iteration < max_iterations)
    {
      iteration++;
      printf("Iteration %4d : model of %d nodes, ", iteration, cm->nodes);

				/* Make a search model */
      if (! RearrangeCM(cm, rfreq, &icm, &statenum))
	Die("Failed to make an integer log-odds model");

      /* First we align all the sequences to the model,
       * and construct a multiple sequence alignment
       */
      totscore = 0.0;
      for (idx = 0; idx < nseq; idx++)
	{
	  if (! ViterbiAlign(icm, statenum, rseqs[idx], &score, &tr[idx]))
	    Die("viterbi alignment failed on sequence %d", idx);
	  totscore += score;
	}

      /* An option for producing cool figures and animations:
       * save the alignment at each iteration, so we can animate
       * the learning process. I produced covariance matrices
       * with MIXY for each iteration, and used GNUPLOT to
       * animate the data as a series of 3D surface plots.
       */
      if (animateroot != NULL)
	{
	  /* Bounded formatting: animateroot comes from the command line
	   * and may not fit the fixed-size buffer.
	   */
	  namelen = snprintf(animationfile, sizeof(animationfile), "%s%d",
			     animateroot, iteration);
	  if (namelen < 0 || (size_t) namelen >= sizeof(animationfile))
	    Warn("Animation output file name too long for root %s", animateroot);
	  else if ((animfp = fopen(animationfile, "w")) == NULL)
	    Warn("Failed to open animation output file %s", animationfile);
	  else
	    {
	      /* Only write and free the alignment if it was actually
	       * built; on failure aseqs/ainfo are not valid.
	       */
	      if (! Traces2Alignment(rseqs, sqinfo, tr, nseq, cm, watsoncrick, &aseqs, &ainfo))
		Warn("Traces2Alignment() failed for animation");
	      else
		{
		  WriteSELEX(animfp, aseqs, nseq, &ainfo, 60);
		  FreeAlignment(aseqs, nseq, &ainfo);
		}
	      fclose(animfp);
	    }
	}

      /* If we've converged, stop.
       * Else, make a new model from the alignment.
       */
      delta = (totscore - oldscore) / fabs(totscore);
      printf("score %.3f, delta %.3f\n", totscore / (double) nseq, delta);

      if (delta > threshold || delta < 0)
	{
	  if (! Traces2Alignment(rseqs, sqinfo, tr, nseq, cm, watsoncrick, &aseqs, &ainfo))
	    Die("Traces2Alignment() failed");

	  for (idx = 0; idx < nseq; idx++)
	    s2upper(aseqs[idx]);

	  if (fast_version)
	    {
	      if (! Fastmodelmaker(aseqs, &ainfo, nseq, prior, 0.70, NULL, &newcm, NULL))
		Die("Failed to create new model from alignment");
	    }
	  else
	    {
	      if (! Maxmodelmaker(aseqs, &ainfo, nseq, -1.0, prior, NULL, &newcm, NULL))
		Die("Failed to create new model from alignment");
	    }

	  FreeAlignment(aseqs, nseq, &ainfo);
	  for (idx = 0; idx < nseq; idx++)
	    FreeTrace(tr[idx], NULL);

	  oldscore = totscore;
	}
      else
	{
	  /* we've converged. Free traces and the search model, then
	   * break out of iteration loop.
	   */
	  for (idx = 0; idx < nseq; idx++)
	    FreeTrace(tr[idx], NULL);
	  free(icm);
	  break;
	}

				/* switch new model for old */
      FreeCM(cm);
      free(icm);
      cm = newcm;

      /* Training takes a long time, and sysadmins are nasty evil
       * people who like to shut machines down without warning, particularly
       * during a long training run right before one gives a talk.
       * Therefore, we have an option for backing up the model every
       * iteration so we can resume after a crash or shutdown...
       */
      if (bckfile != NULL)
	{
	  if ((bckfp = fopen(bckfile, "w")) == NULL)
	    Warn("Failed to open backup file %s\n", bckfile);
	  else
	    {
	      if (! WriteBinaryCM(bckfp, newcm))
		Warn("Failed to save to backup file %s\n", bckfile);
	      fclose(bckfp);
	    }
	}
    }

  /***********************************************
   * Save the new model and exit.
   ***********************************************/

  if ((cmfp = fopen(cmfile, "w")) == NULL)
    Die("Failed to open %s for writing", cmfile);

  /* Write exactly one format. Testing "do_flatformat && ! WriteCM()"
   * in a single condition would fall through to the binary writer
   * after a successful flat write.
   */
  if (do_flatformat)
    {
      if (! WriteCM(cmfp, cm))
	Die("Failed to save the model to %s", cmfile);
    }
  else
    {
      if (! WriteBinaryCM(cmfp, cm))
	Die("Failed to save the model to %s", cmfile);
    }
  fclose(cmfp);

  free(tr);
  FreeCM(cm);
  for (idx = 0; idx < nseq; idx++)
    FreeSequence(rseqs[idx], &(sqinfo[idx]));
  free(sqinfo);

  printf("New covariance model written to file %s\n", cmfile);

#ifdef MEMDEBUG
  current_size = malloc_size(&histid2);

  if (current_size != orig_size)
    malloc_list(2, histid1, histid2);
  else
    fprintf(stderr, "No memory leaks, sir.\n");
#endif

  return 0;
}
tRNAscan-SE-2.0/tRNAscan-SE.conf.src0000644000543100007160000001755314020214740016252 0ustar pchanlowelab
# tRNAscan-SE 2.0
# Configuration File

# default paths
bin_dir: @bindir@
lib_dir: @libdir@/tRNAscan-SE
infernal_dir: {bin_dir}

# temporary files
temp_dir: /tmp
tmp_raw: {temp_dir}/tscan$$.raw
tmp_fa: {temp_dir}/tscan$$.fa
tmp_trnaseq_file: {temp_dir}/tscan$$.trna
tmp_masked_fa: {temp_dir}/tscan$$.masked.fa

# sequence process
really_big_number: 1000000000
upstream_len: 70
downstream_len: 70

# default search mode
cm_mode: infernal

# default cutoff
bit score for infernal running as first pass scanner infernal_fp_cutoff: 10 # default cutoff score for reporting of tRNA cm_cutoff: 20 # default cutoff score for reporting of tRNA in organelles organelle_cm_cutoff: 15 isotype_cm_cutoff.euk: 20 isotype_cm_cutoff.arch: 20 isotype_cm_cutoff.bact: 20 isotype_cm_cutoff.mito_mammal: 20 isotype_cm_cutoff.mito_vert: 20 # max size of -w parameter passed to covels when using a pre-scanner (eufind or tRNAscan) max_tRNA_length: 500 # max size of -w param if only Cove is being used (too slow otherwise) max_cove_tRNA_length: 250 # max size of -w param if only cmsearch is being used max_cmsearch_tRNA_length: 250 # min length for average tRNA with no intron min_tRNA_no_intron: 70 # min size of introns detected by parsing of coves output min_intron_length: 3 # Below this score, tRNAs are checked for min primary and secondary structure # scores to catch pseudogene repeats like rat ID & rodent B2 elements min_cove_pseudo_filter_score: 55 # Below this score, tRNAs are checked for min primary and secondary structure # scores to catch pseudogene repeats like rat ID & rodent B2 elements min_cmsearch_pseudo_filter_score: 55 # Below this secondary structure score, tRNA is considered a pseudogene min_ss_score: 5 # Below this primary structure score, tRNA is considered a pseudogene min_hmm_score: 10 # legacy mode default searching parameters # Intermediate score cutoff for use with eufindtRNA eufind.intscore: -32.10 eufind.orig_intscore: -31.25 eufind.bact_intscore: -36.0 eufind.arch_intscore: -36.0 eufind.strict_param: -s # relaxed params to be used with eufindtRNA program by default eufind.relaxed_param: -r tscan.strict_param: -s tscan.relaxed_param: -r tscan.alt_param: -a # default cutoff score for rescanning non-canonical introns nci_scan_cutoff: 70 # default score for considering non-canonical intron BHB_cm_cutoff: 6.5 # default cutoff score for rescanning split tRNA split_tRNA_scan_cutoff: 38 # default cutoff score for half tRNA 
half_tRNA_cutoff: 15 left_splicing_len: 27 right_splicing_len: 28 # alternate genetic code gc_yeast_mito: {lib_dir}/gcode/gcode.ystmito gc_vert_mito: {lib_dir}/gcode/gcode.vertmito gc_invert_mito: {lib_dir}/gcode/gcode.invmito gc_ciliate_cyto: {lib_dir}/gcode/gcode.cilnuc gc_echinoderm_mito: {lib_dir}/gcode/gcode.echdmito gc_other_mito: {lib_dir}/gcode/gcode.othmito gc_marsu_mito: {lib_dir}/gcode/gcode.marsumito # Infernal 1.1 covariance models cm_dir: {lib_dir}/models cm.general: {cm_dir}/TRNAinf.cm cm.general-ns: {cm_dir}/TRNAinf-ns.cm cm.general1415: {cm_dir}/TRNAinf-1415.cm cm.general1415-ns: {cm_dir}/TRNAinf-1415-ns.cm cm.eukaryota: {cm_dir}/TRNAinf-euk.cm cm.eukaryota-ns: {cm_dir}/TRNAinf-euk-ns.cm cm.archaea: {cm_dir}/TRNAinf-arch.cm cm.archaea-ns: {cm_dir}/TRNAinf-arch-ns.cm cm.bacteria: {cm_dir}/TRNAinf-bact.cm cm.bacteria-ns: {cm_dir}/TRNAinf-bact-ns.cm cm.arch_5h: {cm_dir}/TRNAinf-arch-5h.cm cm.arch_3h: {cm_dir}/TRNAinf-arch-3h.cm # COVE covariance models cove_cm_dir: {lib_dir}/models cove_cm.general: {cove_cm_dir}/TRNA2.cm cove_cm.general-ns: {cove_cm_dir}/TRNA2ns.cm cove_cm.eukaryota: {cove_cm_dir}/TRNA2-euk.cm cove_cm.eukaryota-ns: {cove_cm_dir}/TRNA2-eukns.cm cove_cm.archaea: {cove_cm_dir}/TRNA2-arch.cm cove_cm.archaea-ns: {cove_cm_dir}/TRNA2-archns.cm cove_cm.bacteria: {cove_cm_dir}/TRNA2-bact.cm cove_cm.bacteria-ns: {cove_cm_dir}/TRNA2-bactns.cm cove_cm.ESELC: {cove_cm_dir}/ESELC.cm cove_cm.PSELC: {cove_cm_dir}/PSELC.cm # Isotype-specific model db isotype_cm_dir: {lib_dir}/models isotype_cm.eukaryota: {isotype_cm_dir}/TRNAinf-euk-iso isotype_cm.bacteria: {isotype_cm_dir}/TRNAinf-bact-iso isotype_cm.archaea: {isotype_cm_dir}/TRNAinf-arch-iso # tRNA-SeC models euk_cm.SeC: {isotype_cm_dir}/TRNAinf-euk-SeC.cm bact_cm.SeC: {isotype_cm_dir}/TRNAinf-bact-SeC.cm arch_cm.SeC: {isotype_cm_dir}/TRNAinf-arch-SeC.cm # Non-canonical intron models nci_cm.cren: {cm_dir}/Cren-eury-BHB-noncan.cm nci_cm.thaum: {cm_dir}/Thaum-BHB-noncan.cm # Mito-tRNA model db 
mito_cm_db_dir: {lib_dir}/models mito_cm.mammal: {mito_cm_db_dir}/TRNAinf-mito-mammal mito_cm.vert: {mito_cm_db_dir}/TRNAinf-mito-vert # Mammalian mito-tRNA models mito_cm_mammal_dir: {lib_dir}/models mito_cm_mammal.Ala: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Ala.cm mito_cm_mammal.Arg: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Arg.cm mito_cm_mammal.Asn: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Asn.cm mito_cm_mammal.Asp: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Asp.cm mito_cm_mammal.Cys: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Cys.cm mito_cm_mammal.Gln: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Gln.cm mito_cm_mammal.Glu: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Glu.cm mito_cm_mammal.Gly: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Gly.cm mito_cm_mammal.His: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-His.cm mito_cm_mammal.Ile: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Ile.cm mito_cm_mammal.LeuTAA: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-LeuTAA.cm mito_cm_mammal.LeuTAG: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-LeuTAG.cm mito_cm_mammal.Lys: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Lys.cm mito_cm_mammal.Met: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Met.cm mito_cm_mammal.Phe: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Phe.cm mito_cm_mammal.Pro: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Pro.cm mito_cm_mammal.SerGCT: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-SerGCT.cm mito_cm_mammal.SerTGA: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-SerTGA.cm mito_cm_mammal.Thr: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Thr.cm mito_cm_mammal.Trp: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Trp.cm mito_cm_mammal.Tyr: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Tyr.cm mito_cm_mammal.Val: {mito_cm_mammal_dir}/TRNAinf-mito-mammal-Val.cm # Vertebrate mito-tRNA models mito_cm_vert_dir: {lib_dir}/models mito_cm_vert.Ala: {mito_cm_vert_dir}/TRNAinf-mito-vert-Ala.cm mito_cm_vert.Arg: {mito_cm_vert_dir}/TRNAinf-mito-vert-Arg.cm mito_cm_vert.Asn: {mito_cm_vert_dir}/TRNAinf-mito-vert-Asn.cm mito_cm_vert.Asp: 
{mito_cm_vert_dir}/TRNAinf-mito-vert-Asp.cm mito_cm_vert.Cys: {mito_cm_vert_dir}/TRNAinf-mito-vert-Cys.cm mito_cm_vert.Gln: {mito_cm_vert_dir}/TRNAinf-mito-vert-Gln.cm mito_cm_vert.Glu: {mito_cm_vert_dir}/TRNAinf-mito-vert-Glu.cm mito_cm_vert.Gly: {mito_cm_vert_dir}/TRNAinf-mito-vert-Gly.cm mito_cm_vert.His: {mito_cm_vert_dir}/TRNAinf-mito-vert-His.cm mito_cm_vert.Ile: {mito_cm_vert_dir}/TRNAinf-mito-vert-Ile.cm mito_cm_vert.LeuTAA: {mito_cm_vert_dir}/TRNAinf-mito-vert-LeuTAA.cm mito_cm_vert.LeuTAG: {mito_cm_vert_dir}/TRNAinf-mito-vert-LeuTAG.cm mito_cm_vert.Lys: {mito_cm_vert_dir}/TRNAinf-mito-vert-Lys.cm mito_cm_vert.Met: {mito_cm_vert_dir}/TRNAinf-mito-vert-Met.cm mito_cm_vert.Phe: {mito_cm_vert_dir}/TRNAinf-mito-vert-Phe.cm mito_cm_vert.Pro: {mito_cm_vert_dir}/TRNAinf-mito-vert-Pro.cm mito_cm_vert.SerGCT: {mito_cm_vert_dir}/TRNAinf-mito-vert-SerGCT.cm mito_cm_vert.SerTGA: {mito_cm_vert_dir}/TRNAinf-mito-vert-SerTGA.cm mito_cm_vert.Thr: {mito_cm_vert_dir}/TRNAinf-mito-vert-Thr.cm mito_cm_vert.Trp: {mito_cm_vert_dir}/TRNAinf-mito-vert-Trp.cm mito_cm_vert.Tyr: {mito_cm_vert_dir}/TRNAinf-mito-vert-Tyr.cm mito_cm_vert.Val: {mito_cm_vert_dir}/TRNAinf-mito-vert-Val.cm # to be used for sprinzl position alignment # sprinzl_cm_dir: {lib_dir}/models # sprinzl_cm.euk: {sprinzl_cm_dir}/euk_sprinzl.cm # sprinzl_cm.arch: {sprinzl_cm_dir}/arch_sprinzl.cm # sprinzl_cm.bact: {sprinzl_cm_dir}/bact_sprinzl.cm # alternate covariance models # To be used in place of default models # Example: # alt_cm_dir: /tmp # alt_cm.Ala: {alt_cm_dir}/Ala.cm # alt_cm.Arg: {alt_cm_dir}/Arg.cm tRNAscan-SE-2.0/COPYING0000644000543100007160000000442013072631703013667 0ustar pchanlowelabtRNAscan-SE -- a program for finding transfer RNAs Copyright (C) 2017 Patricia P. Chan, Brian Lin, and Todd M. 
Lowe This set of programs is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program, in the file GNULICENSE; if not, check out the web site at https://www.gnu.org/licenses/gpl-3.0.en.html. This distribution includes source code originally derived from other sources. In particular, modified source code from the program trnascan 1.3 (Fichant & Burks, 1991) is included under the conditions of the following notice: Copyright, 1991, The Regents of the University of California. This software was produced by the Los Alamos National Laboratory, which is operated by the University of California for the United States Department of Energy under contract W-7405-ENG-36. The U. S. Government is licensed to use, reproduce, and distribute this software. Permission is granted to the public to copy and use this software without charge, provided that this Notice and any statement of authorship are reproduced on all copies. Neither the Government nor the University makes any warranty, express or implied, or assumes any liability or responsibility for the use of this software. Also, the complete COVE package (Covariance models of RNA sequence and structure, Eddy & Durbin, 1994) is included under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. Please see accompanying files from the COVE package for more information. References 1. Fichant, G.A. and Burks, C. 
(1991) "Identifying potential tRNA genes in genomic DNA sequences",
   J. Mol. Biol., 220, 659-671.

2. Eddy, S.R. and Durbin, R. (1994) "RNA sequence analysis using
   covariance models", Nucl. Acids Res., 22, 2079-2088.
tRNAscan-SE-2.0/tRNAscan-SE.src0000755000543100007160000021200014073131631015316 0ustar pchanlowelab
#! @PERL@
#
# --------------------------------------------------------------------
# tRNAscan-SE: a program for improved detection of transfer RNA
#              genes in genomic sequence
#
# Version 2.0.9
#
# Copyright (C) 2021 Patricia Chan and Todd Lowe
#
# School of Engineering, University of California, Santa Cruz
# trna@soe.ucsc.edu
# http://trna.ucsc.edu/
# --------------------------------------------------------------------
#
# Usage:
# tRNAscan-SE [options] <FASTA file(s)>
#

use strict;
use lib "@libdir@/tRNAscan-SE";
use Getopt::Long;
use tRNAscanSE::Configuration;
use tRNAscanSE::tRNA;
use tRNAscanSE::SprinzlPos;
use tRNAscanSE::ArraytRNA;
use tRNAscanSE::Utils;
use tRNAscanSE::GeneticCode;
use tRNAscanSE::Options;
use tRNAscanSE::Eufind;
use tRNAscanSE::Tscan;
use tRNAscanSE::CM;
use tRNAscanSE::LogFile;
use tRNAscanSE::Stats;
use tRNAscanSE::Sequence;
use tRNAscanSE::FpScanResultFile;
use tRNAscanSE::ScanResult;
use tRNAscanSE::IntResultFile;
use tRNAscanSE::MultiResultFile;
use tRNAscanSE::SS;

our $version = "2.0.9";
our $release_date = "July 2021";
our $program_id = "tRNAscan-SE-".$version;

# modified by 'make'
our $default_conf = "@bindir@/tRNAscan-SE.conf";

# Signal handling
$SIG{'TERM'} = 'error_handler';
$SIG{'QUIT'} = 'error_handler';
$SIG{'INT'} = 'error_handler';

# Global variables
our @fp_start_time;
our $opts = tRNAscanSE::Options->new;
our $global_constants = tRNAscanSE::Configuration->new();
our $log = tRNAscanSE::LogFile->new("default");
our $sprinzl = tRNAscanSE::SprinzlPos->new;
our $fp_tRNAs = tRNAscanSE::ArraytRNA->new;
our $sp_tRNAs = tRNAscanSE::ArraytRNA->new;
our $fp_result_file = tRNAscanSE::FpScanResultFile->new("");
our $sp_int_results = tRNAscanSE::IntResultFile->new;
our $iso_int_results = tRNAscanSE::MultiResultFile->new;
our $gc = tRNAscanSE::GeneticCode->new;
# The statistics class is loaded above as tRNAscanSE::Stats; the package
# name used for the constructor call has to match it.
our $stats = tRNAscanSE::Stats->new;
our $seq_file = tRNAscanSE::Sequence->new;
our $eufind = tRNAscanSE::Eufind->new;
our $tscan = tRNAscanSE::Tscan->new;
our $cm = tRNAscanSE::CM->new;

$global_constants->config_file($default_conf);

# Shared objects handed by reference to the scanner modules
our %global_vars = (global_constants => $global_constants,
                    log_file => $log,
                    options => $opts,
                    sprinzl => $sprinzl,
                    fp_tRNAs => $fp_tRNAs,
                    sp_tRNAs => $sp_tRNAs,
                    fp_result_file => $fp_result_file,
                    sp_int_results => $sp_int_results,
                    iso_int_results => $iso_int_results,
                    sequence => $seq_file,
                    gc => $gc,
                    stats => $stats
                   );

# set user-selectable options
&set_options();

# set location of binaries & data files,
# plus, check to make sure they are there
$cm->set_file_paths(\%global_vars);
$cm->check_lib_files($opts);
$cm->set_bin($global_constants->get("bin_dir"));
$cm->set_infernal_bin($global_constants->get("infernal_dir"));
$eufind->set_bin($global_constants->get("bin_dir"));
$tscan->set_bin($global_constants->get("bin_dir"));

# initialize variables
$gc->read_transl_table($opts);

if ($opts->save_stats())
{
    $stats->file_name($opts->stats_file());
}

# Start processing
&initialize_process();

# prescan with either tRNAscan/eufind or both
if ($opts->tscan_mode() || $opts->eufind_mode() || $opts->infernal_fp())
{
    &first_pass_prescan();
}

# Check to see if no sequences were read from input file(s)
if (($stats->numscanned() == 0) && ($opts->eufind_mode() || $opts->tscan_mode() || $opts->infernal_fp()))
{
    if ($opts->seq_key() ne '\S*')
    {
        die "\nNo FASTA sequences matching \'".$opts->raw_seq_key()."\' key found\n\n";
    }
    elsif ($opts->multiple_files())
    {
        die "\nFATAL: No sequences in FASTA format found in ", join(', ',@ARGV),"\n\n";
    }
    else
    {
        die "\nFATAL: No sequences in FASTA format found in file ".$opts->fasta_file()."\n\n";
    }
}

# Run Cove or Infernal on candidate tRNAs picked in first pass,
# or by itself on seqs if no
#  first pass searches
elsif ($opts->cove_mode() || $opts->infernal_mode()) {
    $sp_int_results->file_name($opts->secondpass_int_result_file());
    $iso_int_results->file_name($opts->isotype_int_result_file());
    &run_cm_scan();
}

$stats->end_sp_timer();

if ($opts->save_stats()) {
    $stats->open_file();
    $stats->save_final_stats($opts, $gc, $fp_result_file->get_hit_count(), $cm->tab_results());
    $stats->close_file();
}

$log->finish_process();

&cleanup();                             # clean up temp files
exit(0);

# END main


# Print the program banner, credits and selected run options to STDERR
# (skipped in quiet mode), start the first-pass timer, and, when run
# statistics are being saved, write a run header plus the run options to
# the stats file.
sub initialize_process {
    # print program info header, credits, & selected run options
    if (!$opts->quiet_mode()) {
        print STDERR "\ntRNAscan-SE v.$version ($release_date) -",
            " scan sequences for transfer RNAs\n";
        &display_credits();
        $opts->display_run_options($cm, $tscan, $eufind, $global_constants, *STDERR);
    }

    $stats->start_fp_timer();           # save starting time

    # if statistics are being saved, write run options in stats file
    if ($opts->save_stats()) {
        my $host = `hostname`;
        chomp($host);
        $stats->open_file();
        # `date` output already ends in a newline, which terminates the header line
        $stats->write_line("\ntRNAscan-SE v.$version ($release_date) scan results (on host $host)\nStarted: ".`date`);
        $opts->display_run_options($cm, $tscan, $eufind, $global_constants, $stats->FILE_H());
        $stats->close_file();
    }
}

# Running tRNAscan and/or EufindtRNA
#
# First-pass scan.  Reads each sequence (one buffer-sized chunk at a time)
# from the input FASTA file, writes the chunk to a temporary FASTA file and
# runs the selected first-pass detector on it: the Infernal first pass when
# infernal_fp() is set, otherwise tRNAscan and/or EufindtRNA.  Hits
# collected in $fp_tRNAs are saved per sequence; sequences without hits are
# appended to the "missed" file when save_missed() is set.
sub first_pass_prescan {
    if ($opts->infernal_fp()) {
        $log->status("Phase I: Searching for tRNAs with HMM-enabled Infernal");
    }
    else {
        $log->status("Phase I: Searching for tRNAs with tRNAscan and/or EufindtRNA");
    }

    # open seq file to search
    $seq_file->open_file($opts->fasta_file(), "read");

    # Main loop for reading seqs & scanning with tRNAscan and/or EufindtRNA
    my $targ_seq_id = 0;                # Don't look for a specific Seq number
    my $start_index = 1;
    my $sequence_scanned = 0;
    my $printed_header = 0;
    my $eufind_output;

    my $tmp_raw = $global_constants->get("tmp_raw");
    my $tmp_fa = $global_constants->get("tmp_fa");
    my $tmp_fa_file = tRNAscanSE::Sequence->new;
    my $missing_fa_file = tRNAscanSE::Sequence->new;

    while ($seq_file->read_fasta($opts, $targ_seq_id)) {
        if ($opts->cove_mode() || $opts->infernal_mode()) {
            $log->broadcast("Scanned seqs: ".$stats->numscanned()." (at ".$seq_file->seq_name().")");
        }
        $stats->increment_numscanned();
        $stats->increment_first_pass_base_ct($seq_file->seq_length());

        do {
            # Write one input sequence / seq buffer to tmp_fa file
            $tmp_fa_file->open_file($tmp_fa, "write");
            $tmp_fa_file->set_seq_info($seq_file->seq_name(), $seq_file->seq_description(),
                                       $seq_file->seq_length(), $seq_file->sequence());
            $tmp_fa_file->write_fasta();
            $tmp_fa_file->close_file();

            if ($opts->infernal_fp()) {
                $cm->first_pass_scan(\%global_vars, $start_index, $seq_file->seq_name());
            }
            else {
                # Run tRNAscan on $tmp_fa file & write results to $tmp_raw output file
                if ($opts->tscan_mode()) {
                    $tscan->run_tRNAscan($tmp_fa, $tmp_raw, $start_index,
                                         $global_constants->get("lib_dir"), $seq_file->seq_name());
                    if ($opts->save_verbose()) {
                        $tscan->append_verbfile($opts->verb_file(), $tmp_fa, $seq_file->seq_name());
                    }
                    $tscan->process_tRNAscan_hits(\%global_vars, $seq_file->seq_name());
                }

                # Run eufindtRNA program & keep its output in memory in $eufind_output
                if ($opts->eufind_mode()) {
                    $eufind_output = $eufind->run_eufind($tmp_fa, $start_index, $opts->max_int_len(),
                                                         $seq_file->seq_name());
                    if ($eufind_output ne "") {
                        $eufind->process_Eufind_hits(\%global_vars, $eufind_output);
                        $eufind_output = "";
                    }
                }
            }

            $sequence_scanned = 1;      # Flag indicating current sequence has been scanned

            # Check to see if all of sequence was read in last buffer-sized chunk
            if ($seq_file->seq_buf_overrun()) {
                $start_index = $seq_file->buffer_end_index() + 1;
                if ($seq_file->read_more_fasta($opts)) {
                    $sequence_scanned = 0;
                }
            }
        } until ($sequence_scanned);

        if ($fp_tRNAs->get_count() > 0) {
            $stats->increment_seqs_hit();

            # save results in ACeDB format now if not using Cove analysis
            if ($opts->ace_output() && (!$opts->CM_mode())) {
                &save_Acedb_from_firstpass($opts->output_codon(), $gc->one_let_trans_map(), $fp_tRNAs,
                                           $opts->out_file());
            }
            else {
                # save all hits for this seq
                my $fpass_trna_base_ct = $stats->fpass_trna_base_ct();

                if (!$opts->CM_mode()) {
                    # column header is written once, and never in brief output mode
                    if (!($opts->brief_output() || $printed_header)) {
                        &open_for_append(\*TABOUT, $opts->out_file());
                        &print_results_header(\*TABOUT, $opts, 0, 8, 8, 1);
                        close (TABOUT);
                        $printed_header = 1;
                    }
                }

                $fp_result_file->save_firstpass_output($opts, $fp_tRNAs, \$fpass_trna_base_ct,
                                                       $seq_file->seq_length(), $seq_file->seq_id());
                $stats->fpass_trna_base_ct($fpass_trna_base_ct);
            }

            # clear hit array
            $fp_tRNAs->clear();
        }
        elsif ($opts->save_missed()) {
            # save sequence that had no tRNA hits if -M param set
            # NOTE: only writes last frame of seq buffer if seq length > max_seq_buffer
            $missing_fa_file->open_file($opts->missed_seq_file(), "append");
            $missing_fa_file->set_seq_info($seq_file->seq_name(), $seq_file->seq_description(),
                                           $seq_file->seq_length(), $seq_file->sequence());
            $missing_fa_file->write_fasta();
            $missing_fa_file->close_file();
        }

        $seq_file->reset_buffer_ct();
        $start_index = 1;
    }   # while (read_fasta()) - still more seqs to scan

    $seq_file->close_file();

    # remove temporary files; unlink() avoids passing the paths through a shell
    # and, like "rm -f", silently ignores files that do not exist
    unlink($tmp_raw, $tmp_fa);

    $seq_file->release_memory();        # release memory

    $log->broadcast("\n".$stats->numscanned()." seqs scanned, ".$stats->seqs_hit()." seqs had at ".
        "least one hit.\n".$stats->trnatotal()." total tRNAs predicted in first pass scans");

    if ((!$opts->CM_mode()) && ($stats->trnatotal() == 0) && (!$opts->quiet_mode())) {
        $log->status("No tRNAs found.");
    }

    $stats->end_fp_timer();             # save time first-pass scans are done

    if ($opts->save_stats()) {
        $stats->open_file();
        $stats->save_firstpass_stats();
        $stats->close_file();
    }
}

# Run Cove or Infernal
#
# Second-pass scan.  For every sequence indexed in the first-pass result
# file, the candidate tRNAs are extracted and analysed with the covariance
# model search selected by the run options; results are then post-processed
# (non-canonical introns, truncated tRNAs, isotype models, split halves)
# and written out (tabular output, plus BED/GFF when requested).
sub run_cm_scan {
    $stats->start_sp_timer();

    if ($opts->tscan_mode() || $opts->eufind_mode() || $opts->infernal_fp()) {
        $log->status("Phase II: ".$opts->second_pass_label()." verification of candidate ".
            "tRNAs detected with first-pass scan");
    }
    else {
        $log->status("Running ".$opts->second_pass_label()." analysis");
        if (!$opts->use_prev_ts_run()) {
            $fp_result_file->prep_for_secpass_only($opts, $stats, $seq_file);
        }
    }

    # flag indicates if seqid and seqlen are saved in firstpass result file
    my $seqinfo_flag = 0;
    my $curseq_trnact = 0;
    my $prescan_trna = tRNAscanSE::tRNA->new;
    my $tRNAs_found = 0;

    $seq_file->open_file($opts->fasta_file(), "read");

    $fp_result_file->index_results(\$seqinfo_flag);
    my @fp_result_file_indexes = $fp_result_file->get_indexes();
    $fp_result_file->open_file();

    for (my $seq_ct = 0; $seq_ct < scalar(@fp_result_file_indexes); $seq_ct++) {
        my $seq_name = $fp_result_file_indexes[$seq_ct]->[1];

        $sp_int_results->file_name($opts->secondpass_int_result_file());
        $global_vars{sp_int_results} = $sp_int_results;
        $sp_int_results->open_file("write");
        $sp_tRNAs->clear();

        $log->broadcast("Scanning ".$seq_name);

        if ($opts->cove_mode()) {
            $fp_result_file->reset_current_seq();
            $fp_result_file->get_next_tRNA_candidate($opts, $seqinfo_flag, $seq_ct, $prescan_trna);

            while ($prescan_trna->seqname() ne "") {
                # Retrieve tRNA sequence and write to tmp_trnaseq_file.
                # A candidate that cannot be prepared is skipped, but the next
                # candidate is always fetched so the loop cannot spin on it.
                if (&prepare_tRNA_to_scan($seq_file, $prescan_trna)) {
                    $tRNAs_found = $cm->analyze_with_cove(\%global_vars, $prescan_trna, \$curseq_trnact);
                    if (!$cm->CM_check_for_introns()) {
                        $stats->increment_total_secpass_ct($tRNAs_found);
                    }
                }

                $fp_result_file->get_next_tRNA_candidate($opts, $seqinfo_flag, $seq_ct, $prescan_trna);
            }
        }
        else {
            # Retrieve tRNA sequences and write to tmp_trnaseq_file
            if (!&prepare_multi_tRNAs_to_scan($seqinfo_flag, $seq_file, $seq_ct)) {
                # release the intermediate result file opened above before
                # moving on to the next sequence
                $sp_int_results->close_file();
                $sp_int_results->clear_index();
                next;
            }

            # start from zero so that modes without an analysis step below do
            # not re-count the tRNAs found for the previous sequence
            $tRNAs_found = 0;

            if ($opts->mito_mode()) {
                $tRNAs_found = $cm->analyze_mito(\%global_vars, $seqinfo_flag, $seq_ct,
                                                 $seq_name, \$curseq_trnact);
            }
            elsif ($opts->alternate_mode()) {
                $tRNAs_found = $cm->analyze_alternate(\%global_vars, $seqinfo_flag, $seq_ct,
                                                      $seq_name, \$curseq_trnact);
            }
            elsif ($opts->metagenome_mode()) {
            }
            elsif ($opts->numt_mode()) {
            }
            elsif ($opts->infernal_mode()) {
                $tRNAs_found = $cm->analyze_with_cmsearch(\%global_vars, $seqinfo_flag, $seq_ct,
                                                          $seq_name, \$curseq_trnact);
            }
            $stats->increment_total_secpass_ct($tRNAs_found);
        }
        $sp_int_results->close_file();

        if (($curseq_trnact > 0) and $cm->CM_check_for_introns()) {
            if (&prepare_intron_scan($seq_file)) {
                $cm->scan_noncanonical_introns(\%global_vars, $seq_name);
            }
        }

        if ($curseq_trnact > 0) {
            if ($opts->euk_mode() or $opts->bact_mode() or $opts->arch_mode()) {
                $cm->truncated_tRNA_search(\%global_vars, $seq_name);
                if (!$opts->no_isotype()) {
                    $cm->isotype_cmsearch(\%global_vars);
                }
            }
            &output_tRNA(\%global_vars, $cm, $cm->tab_results(), $cm->get_hmm_score(), $program_id);
        }

        if (($sp_int_results->get_count() > 0) and $cm->CM_check_for_split_halves()) {
            my @sp_indexes = $sp_int_results->get_indexes();
            if ($sp_int_results->open_file("read")) {
                for (my $i = 0; $i < scalar(@sp_indexes); $i++) {
                    my $cm_tRNA = tRNAscanSE::tRNA->new;
                    $sp_int_results->get_tRNA($sp_indexes[$i]->[0], $cm_tRNA);
                    $sp_tRNAs->put($cm_tRNA);
                }
                $sp_int_results->close_file();

                $cm->scan_split_tRNAs(\%global_vars);
            }
        }

        if ($opts->bed_file() ne "") {
            if ($curseq_trnact > 0) {
                &write_bed(\%global_vars);
            }
        }
        if ($opts->gff_file() ne "") {
            if ($curseq_trnact > 0) {
                &write_gff(\%global_vars);
            }
        }

        $sp_int_results->clear_index();
        $curseq_trnact = 0;
    }

    $fp_result_file->close_file();
    $seq_file->close_file();

    if (($stats->total_secpass_ct() == 0) && (!$opts->quiet_mode())) {
        print STDERR "No tRNAs found.\n\n";
    }
}

# Extracts tRNA sequences with given coordinates, and writes to $tmp_
#
# Writes every first-pass tRNA candidate of sequence number $seq_ct to the
# temporary tRNA sequence file (tmp_trnaseq_file) as FASTA, and records the
# flanking regions when the flanking file can be opened.
#   $seqinfo_flag - passed through to get_next_tRNA_candidate()
#   $seq_file     - open sequence object the candidates are cut from
#   $seq_ct       - index of the sequence in the first-pass result file
# Always returns 1.
sub prepare_multi_tRNAs_to_scan {
    my ($seqinfo_flag, $seq_file, $seq_ct) = @_;

    my $tmp_trnaseq_file = $global_constants->get("tmp_trnaseq_file");

    # start from a fresh file; unlink() needs no shell and ignores a missing file
    unlink($tmp_trnaseq_file);

    my $trna_file = tRNAscanSE::Sequence->new;
    $trna_file->open_file($tmp_trnaseq_file, "write");

    my $flanking = 0;
    my $trna = tRNAscanSE::tRNA->new;
    $fp_result_file->reset_current_seq();
    $fp_result_file->get_next_tRNA_candidate($opts, $seqinfo_flag, $seq_ct, $trna);
    if ($fp_result_file->open_flanking("write")) {
        $flanking = 1;
    }

    # an empty sequence name marks the end of the candidates for this sequence
    while ($trna->seqname() ne "") {
        $seq_file->get_tRNA_sequence(\%global_vars, $trna);

        $stats->increment_secpass_base_ct($trna->len());

        $trna_file->set_seq_info($trna->seqname().".t".&pad_num($trna->id(), 6), $seq_file->seq_description(),
                                 length($trna->seq()), $trna->seq());
        $trna_file->write_fasta();

        if ($flanking) {
            $fp_result_file->write_tRNA_flanking($trna);
        }

        $seq_file->release_memory();

        $fp_result_file->get_next_tRNA_candidate($opts, $seqinfo_flag, $seq_ct, $trna);
    }
    $trna_file->close_file();
    $fp_result_file->close_flanking();

    return 1;
}

# Extracts tRNA sequence with given coordinates, and writes to $tmp_
#
# Single-candidate variant used for COVE analysis: fetches the sequence of
# $trna from $seq_file and writes it to the temporary tRNA sequence file
# via write_tRNA().  Always returns 1.
sub prepare_tRNA_to_scan {
    my ($seq_file, $trna) = @_;

    $seq_file->get_tRNA_sequence(\%global_vars, $trna);

    $stats->increment_secpass_base_ct($trna->len());

    &write_tRNA($global_constants->get("tmp_trnaseq_file"), $seq_file->seq_name(), $seq_file->seq_description(),
                $trna->seq(), 1);

    $seq_file->release_memory();

    return 1;
}

# Extracts tRNA sequences with given coordinates, and writes to $tmp_
#
# Reloads every tRNA stored in the second-pass intermediate result file,
# re-fetches its sequence from $seq_file, and writes the tRNA together with
# its upstream and downstream flanks to the temporary tRNA sequence file.
# Each tRNA is also put back into $sp_tRNAs.
# Returns 1, or 0 if any re-fetched sequence differs (case-insensitively)
# from the stored one; mismatches are logged but all tRNAs are still written.
sub prepare_intron_scan {
    my ($seq_file) = @_;

    my $ret_value = 1;
    my $tmp_trnaseq_file = $global_constants->get("tmp_trnaseq_file");

    # start from a fresh file; unlink() needs no shell and ignores a missing file
    unlink($tmp_trnaseq_file);

    my $trna_file = tRNAscanSE::Sequence->new;
    $trna_file->open_file($tmp_trnaseq_file, "write");

    $sp_tRNAs->clear();
    my @sp_indexes = $sp_int_results->get_indexes();
    if ($sp_int_results->open_file("read")) {
        foreach my $sp_index (@sp_indexes) {
            my $cm_tRNA = tRNAscanSE::tRNA->new;
            $sp_int_results->get_tRNA($sp_index->[0], $cm_tRNA);

            my $orig_seq = $cm_tRNA->seq();
            $seq_file->get_tRNA_sequence(\%global_vars, $cm_tRNA);
            if (uc($orig_seq) ne uc($cm_tRNA->seq())) {
                $ret_value = 0;
                $log->error("tRNA sequence does not match for intron scan: ".$cm_tRNA->tRNAscan_id()." ".$cm_tRNA->seqname().":".$cm_tRNA->start()."-".$cm_tRNA->end());
            }

            my $padded_seq = $cm_tRNA->upstream().$cm_tRNA->seq().$cm_tRNA->downstream();
            $trna_file->set_seq_info($cm_tRNA->seqname().".trna".&pad_num($cm_tRNA->id(), 6), $cm_tRNA->tRNAscan_id(),
                                     length($padded_seq), $padded_seq);
            $trna_file->write_fasta();

            $sp_tRNAs->put($cm_tRNA);

            $seq_file->release_memory();
        }
        $sp_int_results->close_file();
    }
    $trna_file->close_file();

    return $ret_value;
}

# clean up temp files
#
# Removes this process's temporary files (<temp_dir>/tscan<pid>_* and
# <temp_dir>/tscan<pid>.*) and the "<fafile>.pid" file.  Files are matched
# and deleted inside Perl, so no path is ever interpreted by a shell.
sub cleanup {
    require File::Glob;

    my $tmp_prefix = $global_constants->get("temp_dir")."/tscan$$";

    # bsd_glob() treats the whole string as one pattern, so a temp directory
    # containing whitespace is not split into several patterns
    unlink(File::Glob::bsd_glob($tmp_prefix."_*"));
    unlink(File::Glob::bsd_glob($tmp_prefix.".*"));
    unlink($opts->fafile().".pid");
}

# Abort handler: sends KILL to every process that "ps -ef" lists as a
# child of this process, removes the temporary files and exits with
# status 1.
sub error_handler {
    print "\nAborting tRNAscan-SE\n\n";

    my $parent_pid = $$;
    my $psout = `ps -ef`;
    my @ps_lines = split(/\n/, $psout);
    foreach my $ps_line (@ps_lines) {
        # columns: UID PID PPID ...; the UID column may or may not be preceded
        # by whitespace depending on the platform's ps, hence \s* (not \s+)
        if ($ps_line =~ /^\s*\S+\s+(\d+)\s+($parent_pid)\s/) {
            my $child_pid = $1;
            print STDERR "Killing process $child_pid:\n", $ps_line, "\n";
            my $killct = kill 'KILL', $child_pid;
            print STDERR "$killct jobs received the kill signal\n";
        }
    }

    &cleanup();
    exit(1);
}

# Print the copyright / license notice to STDERR.
sub display_credits {
    print STDERR "Copyright (C) 2020 Patricia Chan and Todd Lowe\n",
                 " University of California Santa Cruz\n",
                 "Freely distributed under the GNU General Public License (GPLv3)\n\n";
}

# Print the short usage summary (basic options only) to STDERR.
sub print_usage {
    print STDERR "\nUsage: tRNAscan-SE [-options] \n\n";
    print STDERR " Scan a sequence file for tRNAs \n",
    " -- default: use Infernal & tRNA covariance models\n",
    " with eukaryotic sequences \n",
    " (use -B, -A, -M, -O or -G to scan other types of sequences)\n\n",
    "Basic Options\n",
    " -E : search for eukaryotic tRNAs (default)\n",
    " -B : search for bacterial tRNAs\n",
    " -A : search for archaeal tRNAs\n",
    " -M : search for mitochondrial tRNAs\n",
    " options: mammal, vert\n",
    " -O : search for other organellar tRNAs\n",
    " -G : use general tRNA model (cytoslic tRNAs from all 3 domains included)\n",
    " -L : search using the legacy method (tRNAscan, EufindtRNA, and COVE)\n",
    " use with -E, -B, -A, -O, or -G\n",
    " -I : search using Infernal (default)\n",
    " use with -E, -B, -A, -O, or -G\n",
#   " -T : search for tRNAs in metagenome\n",
#   " -N : search for tRNAs in nuclear mitochondrial DNA regions (NUMTs)\n",
    " -o : save final results in \n",
    " -f : save tRNA secondary structures to \n",
    " -m : save statistics summary for run in \n",
    " (speed, # tRNAs found in each part of search, etc)\n",
    " -H : show both primary and secondary structure components to\n",
    " covariance model bit scores\n",
    " -q : quiet mode (credits & run option selections suppressed)\n\n",
    " -h : print full list (long) of available options\n\n";
}

sub print_all_options {
    print "\nUsage: tRNAscan-SE [-options] \n\n";
    print " Scan a sequence file for tRNAs \n",
    " -- default: use Infernal & tRNA covariance models\n",
    " with eukaryotic sequences \n",
    " (use 'Search Mode Options' below to scan other types of sequences)\n\n",
    "Search Mode Options:\n\n",
    " -E : search for eukaryotic tRNAs (default)\n",
    " -B : search for bacterial tRNAs\n",
    " -A : search for archaeal tRNAs\n",
    " -M : search for mitochondrial tRNAs\n",
    " options: mammal, vert\n",
    " -O : search for other organellar tRNAs\n",
    " -G : use general tRNA model (cytoslic tRNAs from all 3 domains included)\n",
    " --mt : use mito tRNA models for cytosolic/mito detemination\n",
    " (if not specified, only cytosolic isotype-specific model scan will be performed)\n",
#   " -T : search for tRNAs in metagenome\n",
#   " -N : search for tRNAs in nuclear mitochondrial DNA regions (NUMTs)\n",
    " -I : search using Infernal\n",
    " default use with -E, -B, -A, or -G; optional for -O\n",
    " --max : maximum sensitivity mode - search using Infernal without hmm filter (very slow)\n",
    " -L : search using the legacy method (tRNAscan, EufindtRNA, and COVE)\n",
    " use with -E, -B, -A or -G\n",
    " -C --cove : search using COVE analysis only (legacy, extremely slow)\n",
    " default use with -O\n",
    " -H --breakdown : show breakdown of primary and secondary structure components
to\n", " covariance model bit scores\n", " -D --nopseudo : disable pseudogene checking\n\n", "Output options:\n\n", " -o --output : save final results in \n", " -f --struct : save tRNA secondary structures to \n", " -s --isospecific : save results using isotype-specific models in \n", " -m --stats : save statistics summary for run in \n", " (speed, # tRNAs found in each part of search, etc)\n", " -b --bed : save results in BED file format of \n", " -j --gff : save results in GFF3 file format of \n", " -a --fasta : save predicted tRNA sequences in FASTA file format of \n", " -l --log : save log of program progress in \n", " --detail : display prediction outputs in detailed view\n", " --brief : brief output format (no column headers)\n\n", " -? \# : '#' in place of chooses default name for output files\n", " -p --prefix