debian/0000755000000000000000000000000012034577553007201 5ustar debian/README.Debian0000644000000000000000000000025011717544051011231 0ustar sift for Debian --------------- This package presents `sift-3.0'. Tested only for FASTA input. -- Laszlo Kajan Mon, 07 Dec 2009 15:22:27 +0100 debian/watch0000644000000000000000000000017111721743455010227 0ustar version=3 # Homepage Pattern [Version [Action]] http://sift.jcvi.org \ www/sift([0-9][0-9a-z.]*)\.tar\.[gzbl2]{2,3} debian/rules0000755000000000000000000000510412034577522010255 0ustar #!/usr/bin/make -f # Uncomment this to turn on verbose mode. #export DH_VERBOSE=1 PACKAGE:=sift DESTDIR:=$(CURDIR)/debian/$(PACKAGE) VERSION:=4.0.3b prefix:=/usr pkgdatadir:=$(prefix)/share/$(PACKAGE) SIFT_BINDIR := $(prefix)/lib/sift/bin SIFT_SCRIPTDIR := $(prefix)/lib/sift/bin parallel := 1 $(eval $(DEB_BUILD_OPTIONS)) %: dh $@ MAN1:=info_on_seqs.1 SIFT_for_submitting_fasta_seq.csh.1 .PHONY: man man: $(MAN1) %.1: %.pod pod2man -c 'User Commands' --release="$(VERSION)" $< > $@ override_dh_auto_clean: if [ -e src/Makefile ]; then make -j$(parallel) -C src prefix=$(prefix) distclean; fi rm -f $(MAN1) # remove pre-compiled binaries from bin - thanks to Michael Bienia rm -f bin/choose_seqs_via_psiblastseedmedian rm -f bin/clump_output_alignedseq rm -f bin/consensus_to_seq rm -f bin/fastaseqs rm -f bin/info_on_seqs rm -f bin/process_alignment rm -f bin/psiblast_res_to_fasta_dbpairwise rm -f bin/seqs_from_psiblast_res override_dh_auto_build: man make -j$(parallel) -C src prefix=$(prefix) VERSION=$(VERSION) deps.mk && \ make -j$(parallel) -C src prefix=$(prefix) VERSION=$(VERSION) override_dh_auto_install: make -C src DESTDIR=$(DESTDIR) prefix=$(prefix) install override_dh_install: # lkajan: fastaseqs comes from the blimps package, this source does not have rules for creating it dh_install -X.svn -X.swp -XIntersectFeatures.jar -XManuals -Xfastaseqs -Xlinux -Xsolaris bin $(prefix)/lib/sift/ dh_install bin/IntersectFeatures.jar $(pkgdatadir)/ dh_install -X.svn -X.swp blimps/docs $(prefix)/share/sift/blimps/ #Links are created by debian/links for f in SIFT_for_submitting_fasta_seq.csh SIFT_for_submitting_NCBI_gi_id.csh; do \ sed --in-place -e 's|\b__MAKE_PREFIX__\b|$(prefix)|g;s|__SIFT_SCRIPTDIR__|$(SIFT_SCRIPTDIR)|g;s|__SIFT_BINDIR__|$(SIFT_BINDIR)|g;' $(DESTDIR)$(SIFT_SCRIPTDIR)/$$f; \ done; for f in seqs_chosen_via_median_info.csh SIFT_for_submitting_fasta_seq.csh SIFT_for_submitting_NCBI_gi_id.csh; do \ sed --in-place -e 's|\b__MAKE_PREFIX__\b|$(prefix)|g;s|__SIFT_SCRIPTDIR__|$(SIFT_SCRIPTDIR)|g;s|__SIFT_BINDIR__|$(SIFT_BINDIR)|g;' $(DESTDIR)$(SIFT_SCRIPTDIR)/$$f; \ done; chmod +x $(DESTDIR)$(prefix)/lib/sift/bin/perlscripts/separate_query_from_rest_of_seqs.pl # look out: DNA_PROT_SUBROUTINES.pl is really a module (should be pm)! for f in SIFT_subroutines.pm DNA_PROT_SUBROUTINES.pl; do \ chmod -x $(DESTDIR)$(prefix)/lib/sift/bin/$$f; \ done; for f in IntersectFeatures.jar; do \ chmod -x $(DESTDIR)$(pkgdatadir)/$$f; \ done; get-orig-source: mkdir -p ../tarballs uscan --verbose --force-download --destdir=../tarballs debian/manpages0000644000000000000000000000006311721476525010715 0ustar info_on_seqs.1 SIFT_for_submitting_fasta_seq.csh.1 debian/changelog0000644000000000000000000000200412034577522011043 0ustar sift (4.0.3b-4) unstable; urgency=low [ Andreas Tille ] * debian/upstream: Add citations [ Laszlo Kajan ] * Merge updates from Ubuntu 4.0.3b-3ubuntu1. -- Laszlo Kajan Mon, 08 Oct 2012 16:41:15 +0200 sift (4.0.3b-3ubuntu1) quantal; urgency=low * Merge with Debian unstable. Remaining changes: - Update patches/debian/ld-as-needed: Move libs from LDFLAGS to LIBS. -- Michael Bienia Sat, 26 May 2012 11:38:24 +0200 sift (4.0.3b-3) unstable; urgency=low * Added patches from Michael Bienia . -- Laszlo Kajan Wed, 23 May 2012 18:46:18 +0200 sift (4.0.3b-2) unstable; urgency=low * Missing dependency blimps-utils added. blimps-utils provides fastaseqs, called by some scripts in sift. -- Laszlo Kajan Mon, 02 Apr 2012 11:50:09 +0200 sift (4.0.3b-1) unstable; urgency=low * Initial release (Closes: #660272) -- Laszlo Kajan Thu, 23 Feb 2012 18:55:23 +0100 debian/upstream0000644000000000000000000000375711757643324011001 0ustar Reference: - Author: Prateek Kumar and Steven Henikoff and Pauline C Ng Title: Predicting the effects of coding non-synonymous variants on protein function using the SIFT algorithm Journal: Nature Protocols Year: 2009 Volume: 4 Number: 7 Pages: 1073-81 DOI: 10.1038/nprot.2009.86 PMID: 19561590 URL: http://www.nature.com/nprot/journal/v4/n7/full/nprot.2009.86.html eprint: http://www.nature.com/nprot/journal/v4/n7/pdf/nprot.2009.86.pdf - Author: Pauline C Ng and Steven Henikoff Title: Predicting the effects of amino acid substitutions on protein function Journal: Annu Rev Genomics Hum Genet. Year: 2006 Volume: 7 Pages: 61-80 DOI: 10.1146/annurev.genom.7.080505.115630 PMID: 16824020 URL: http://www.annualreviews.org/doi/abs/10.1146/annurev.genom.7.080505.115630 eprint: http://www.annualreviews.org/doi/pdf/10.1146/annurev.genom.7.080505.115630 - Author: Pauline C Ng and Steven Henikoff Title: "SIFT: Predicting amino acid changes that affect protein function" Journal: Nucleic Acids Research Year: 2003 Volume: 31 Number: 13 Pages: 3812-4 DOI: 10.1093/nar/gkg509 PMID: 12824425 URL: http://nar.oxfordjournals.org/content/31/13/3812.abstract eprint: http://nar.oxfordjournals.org/content/31/13/3812.full.pdf+html - Author: Pauline C Ng and Steven Henikoff Title: "Accounting for human polymorphisms predicted to affect protein function" Journal: Genome Research Year: 2002 Volume: 12 Number: 3 Pages: 436-46 DOI: 10.1101/gr.212802 PMID: 11875032 URL: http://genome.cshlp.org/content/12/3/436 eprint: http://genome.cshlp.org/content/12/3/436.full.pdf+html - Author: Pauline C Ng and Steven Henikoff Title: Predicting deleterious amino acid substitutions Journal: Genome Research Year: 2001 Volume: 11 Number: 5 Pages: 863-74 DOI: 10.1101/gr.176601 PMID: 11337480 URL: http://genome.cshlp.org/content/11/5/863 eprint: http://genome.cshlp.org/content/11/5/863.full.pdf+html debian/examples0000644000000000000000000000015211721467043010732 0ustar test/lacI.SIFTprediction test/lacI.alignedfasta test/lacI.fasta test/lacI.fasta.query.out test/lacI.subst debian/control0000644000000000000000000000221111736273456010603 0ustar Source: sift Section: non-free/science XS-Autobuild: yes Priority: extra Maintainer: Debian Med Packaging Team Uploaders: H. Soon Gweon , Laszlo Kajan , Tim Booth Build-Depends: debhelper (>= 8), libblimps3-dev Standards-Version: 3.9.3 Homepage: http://sift.jcvi.org Vcs-Svn: svn://svn.debian.org/debian-med/trunk/packages/sift/trunk/ Vcs-Browser: http://svn.debian.org/wsvn/debian-med/trunk/packages/sift/trunk/ DM-Upload-Allowed: yes Package: sift Architecture: any Depends: ${shlibs:Depends}, ${misc:Depends}, blimps-utils, csh Description: predicts if a substitution in a protein has a phenotypic effect SIFT is a sequence homology-based tool that sorts intolerant from tolerant amino acid substitutions and predicts whether an amino acid substitution in a protein will have a phenotypic effect. SIFT is based on the premise that protein evolution is correlated with protein function. Positions important for function should be conserved in an alignment of the protein family, whereas unimportant positions should appear diverse in an alignment. debian/compat0000644000000000000000000000000211722003531010356 0ustar 8 debian/README.source0000644000000000000000000000045111721501752011346 0ustar sift for Debian ----------------- Patches ....... Patches are documented in the headers. Autobuilding ............ The FHCRC NONCOMMERCIAL LICENSE does not prohibit building the source. It also does not prohibit the distribution of the binary software on a royalty-free basis. # vim:et:ts=2: debian/docs0000644000000000000000000000004311721434376010046 0ustar README README_4.0.3 VERSION_UPDATE debian/copyright0000644000000000000000000001504511730415323011125 0ustar Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: sift Upstream-Contact: Pauline Ng Source: http://sift.jcvi.org, http://sift.bii.a-star.edu.sg/ Disclaimer: This package is not part of Debian because of the non-free FHCRC NONCOMMERCIAL LICENSE license. The license does not restrict auto-building of the package. Distribution on a royalty-free basis is allowed, therefore Debian is allowed to distribute the program. Files: debian/* Copyright: 2009-2012 Laszlo Kajan , 2012 H. Soon Gweon , 2012 Tim Booth , 2012 Andreas Tille License: GPL Packaging is licensed under the GPL, see `/usr/share/common-licenses/GPL'. Files: * Copyright: 1993-2008, Fred Hutchinson Cancer Research Center License: FHCRC LICENSE -- License and copyright information for SIFT and BLIMPS . --------------------------------------------------------------------------- FHCRC NONCOMMERCIAL LICENSE Fred Hutchinson Cancer Research Center 1100 Fairview AV N Seattle, WA 98109-1024 . . This license may be copied and distributed but may not be modified in any way. . RECITALS: . A. Fred Hutchinson Cancer Research Center is a nonprofit corporation dedicated to investigating the nature and causes of cancer and methods of cancer prevention and treatment and for conducting education in all phases of cancer research. . B. This license is designed to permit certain activities with respect to any computer programs to which it applies on the terms stated in this license. . TERMS AND CONDITIONS: . 1. Definitions. . "Program" means the computer software and associated documentation in whatever form or format including source and object code. . "Derivative Work" means any modification, addition, upgrade, update or improvement of the Program and any other work constituting a derivative work under the copyright laws. . "Licensee" means licensee of the Program under this license. . "FHCRC" means Fred Hutchinson Cancer Research Center. . 2. License, Copy, Distribute and Create Derivative Work. FHCRC hereby grants Licensee and Licensee accepts, subject to the terms and conditions of this Agreement, a nonexclusive, royalty-free license to use, copy and sublicense the Program and prepare and sublicense Derivative Works. This license is expressly limited as follows: . a. All copies of the Program must conspicuously show an appropriate copyright notice and disclaimer of warranty and must expressly refer to this license. In addition, Derivative Works must conspicuously state that the Program has been modified and the date of any modification. . b. Licensee may not charge any fee or royalty for copies of the Program or Derivative Work. Licensee is authorized to sublicense the Program and any Derivative Work to third parties on a royalty-free basis subject to the terms of this license but may not otherwise distribute the Program or any Derivative Work. Notwithstanding the foregoing, Licensee may be reimbursed for any actual expenses it incurs in reproducing or delivering a copy of the Program or any Derivative Work. . c. This license does not apply to a work written entirely by Licensee. . d. Licensee agrees to make available to any person to whom it sublicenses this Program or a Derivative Work source code and object code for the Program or Derivative Work. . e. Licensee may not impose any terms in addition to the terms of this license on any third party to whom it distributes the Program, any Derivative Work or any copy thereof. . 3. Reservation of Rights to FHCRC. FHCRC hereby reserves to itself all rights of any nature whatsoever with respect to the Program. Any Derivative Work created by FHCRC or its employees or agents is not subject to the terms of this license. . 4. Termination of license. This license precludes any copying, modification, sublicensing or other distribution of any Program or Derivative Work except as expressly permitted under this license. Violation of the terms of this license by Licensee will be deemed an automatic termination of Licensee's rights under this license; provided, however, that sublicenses derived from Licensee will remain in full force and effect provided that any such sublicensees do not breach the terms of this license. . 5. Acceptance of Terms of License. Licensee agrees that by copying or distributing this Program or by creating, copying or distributing any Derivative Work, Licensee has agreed to the terms of this license. . 6. Limitation of Remedies and Indemnity. Because this Program is licensed on a royalty-free basis, FHCRC and any person distributing this Program or any Derivative Work expressly disclaim all warranties express or implied including, without limitation, any implied warranty of merchantability or fitness for a particular purpose. Neither FHCRC nor any other party distributing the Program or any Derivative Work will be liable for any damages arising from or related to the use of the Program or such Derivative Work, including, without limitation, incidental or consequential damages caused by the use or inability to use the Program or loss or damage to data. . Licensee agrees to indemnify, defend and hold FHCRC harmless from and against any claims, causes of action, damages, liabilities and expenses, including without limitation attorney's fees, arising from or related to Licensee's use, copying or distribution of the and expenses, including without limitation attorney's fees, arising from or related to Licensee's use, copying or distribution of the Program or any Derivative Work. . 7. Severability. If any provision of this license is held invalid under any applicable law or for any other reason, such invalidity shall not effect any other provision of this license that can be given effect without the invalid provision. . 8. Notice. Persons wishing to communicate with FHCRC concerning this license or the Program should address their communications as follows: . Fred Hutchinson Cancer Research Center Attn: Dr. Steven Henikoff 1100 Fairview AV N, A1-162 PO Box 19024 Seattle, WA 98109-1024 . . --------------------------------------------------------------------------- debian/links0000644000000000000000000000117611721712572010243 0ustar usr/lib/sift/bin/info_on_seqs usr/bin/info_on_seqs usr/lib/sift/bin/SIFT_exome_nssnvs.pl usr/bin/SIFT_exome_nssnvs.pl usr/lib/sift/bin/sift_feed_to_chr_coords_batch.pl usr/bin/sift_feed_to_chr_coords_batch.pl usr/lib/sift/bin/snv_db_engine.pl usr/bin/snv_db_engine.pl usr/lib/sift/bin/map_coords_to_bin.pl usr/bin/map_coords_to_bin.pl usr/lib/sift/bin/SIFT_for_submitting_fasta_seq.csh usr/bin/SIFT_for_submitting_fasta_seq.csh usr/lib/sift/bin/SIFT_for_submitting_fasta_seq.csh usr/bin/sift_for_submitting_fasta_seq.csh usr/share/man/man1/SIFT_for_submitting_fasta_seq.csh.1.gz usr/share/man/man1/sift_for_submitting_fasta_seq.csh.1.gz debian/patches/0000755000000000000000000000000012034577547010633 5ustar debian/patches/series0000644000000000000000000000011012034577522012031 0ustar fix_perl_interpreter_and_lib_path ld-as-needed fix_gcc_warnings jarfile debian/patches/jarfile0000644000000000000000000000057312034577522012170 0ustar Author: Laszlo Kajan Description: path to IntersectFeatures.jar IntersectFeatures.jar file now in /usr/share, patch for calling script. --- a/bin/IntersectLocations.sh +++ b/bin/IntersectLocations.sh @@ -1,3 +1,3 @@ #!/bin/sh -bin="/usr/local/projects/SIFT/sift4.0/bin/" +bin="/usr/share/sift" java -Xmx500m -jar $bin/IntersectFeatures.jar $1 $2 $3 $4 $5 debian/patches/fix_gcc_warnings0000644000000000000000000000673411757234275014101 0ustar Description: Fix gcc warnings. Author: Michael Bienia --- a/src/Alignment.c +++ b/src/Alignment.c @@ -766,7 +766,7 @@ void read_psiblast_header_until_first_alignment (FILE* fp) { - char line[LARGE_BUFF_LENGTH]; + char line[LINE_LEN > LARGE_BUFF_LENGTH ? LINE_LEN : LARGE_BUFF_LENGTH]; read_psiblast_header_until_first (fp); @@ -783,7 +783,7 @@ int read_psiblast_header_until_first_no_error (FILE* fp,int return_error ) { - char line[MAXLEN]; + char line[LINE_LEN]; while (!feof (fp) && fgets (line, LINE_LEN, fp) != NULL && strstr (line, "Sequences producing significant alignments") == NULL) @@ -1694,7 +1694,7 @@ assert (fp != NULL); fgets (line, LARGE_BUFF_LENGTH, fp); if (strstr (line, "Query") == NULL) { - fprintf (stderr,"%s line should have Query:\n"); + fprintf (stderr,"'%s' line should have 'Query'\n", line); exit (-1); } strptr = strtok (line, " \t\r\n"); /* this should be Query */ --- a/src/PN_convert.c +++ b/src/PN_convert.c @@ -2184,7 +2184,7 @@ else { fprintf (stderr, "Can't open file '%s': %s.\n", qijname, strerror(errno) ); - fprintf (stderr, "Current dir:\n", qijname ); + fprintf (stderr, "Current dir: %s\n", qijname ); system( "pwd" ); exit (-1); } --- a/src/Psiblast.c +++ b/src/Psiblast.c @@ -77,14 +77,14 @@ buff = strstr(Buffer, "Score="); if (buff != NULL) { - sscanf (buff, "Score = %d", alignment->score); + sscanf (buff, "Score = %d", &alignment->score); } else { fprintf (errorfp, "Unable to read Score, parsing incorrect"); exit (-1); } buff = strstr (Buffer, "Expect = "); if (buff != NULL) { - sscanf (buff, "Expect = %lf", alignment->evalue); + sscanf (buff, "Expect = %lf", &alignment->evalue); } else { fprintf (errorfp, "Unable to read e-value, parsing incorrect"); exit (-1); @@ -95,7 +95,7 @@ fgets (Buffer, LARGE_BUFF_LENGTH, fp); /* get blank line */ fgets (Buffer, LARGE_BUFF_LENGTH, fp); /* get first line */; get_start_pos = TRUE; - while (Buffer != "\n") { + while ( strcmp( Buffer, "\n" ) != 0 ) { read_4_alignment_lines (Buffer, alignment, fp, get_start_pos); get_start_pos = FALSE; /* read 1rst 4 alignment lines*/ /* already got the starting pos*/ @@ -274,7 +274,7 @@ FILE* psiblastfp; char name[KEY_WIDTH]; char *strptr; - char line[LARGE_BUFF_LENGTH]; + char line[LINE_LEN]; if ((psiblastfp = fopen (psiblastres_file, "r")) == NULL) --- a/src/clump_output_alignedseq.c +++ b/src/clump_output_alignedseq.c @@ -137,7 +137,7 @@ char queryfilename[LARGE_BUFF_LENGTH]; if (argc < 4) { - printf ("clump.c : Clusters sequences into % clus"); + printf ("clump.c : Clusters sequences into %% clus"); } if (argc > 1 ) strcpy (seqfilename, argv[1]); --- a/src/Matrix_Info.c +++ b/src/Matrix_Info.c @@ -467,7 +467,7 @@ case 2: /* +- phenotype is - */ if ( matrix->weights[aa_atob[c]][l] >= 0) { pos_error_fp++; error_fp++; - printf ("incorrect less severe %d %d %c\n", l, matrix->weights[aa_atob[c]][l], c); + printf ("incorrect less severe %d %g %c\n", l, matrix->weights[aa_atob[c]][l], c); print_type = print_incorrect; } else if (matrix->weights[aa_atob[c]][l] < 0){ pos_notobserved_intolerant++; debian/patches/fix_perl_interpreter_and_lib_path0000644000000000000000000025674711722743721017512 0ustar * Fix perl interpreter paths. * Support parallel build of binaries. * Enhance verbosity in csh scripts for bug hunting. * Add blast processors argument SIFT_for_submitting_fasta_seq.csh . * Trailing whitespace removed from lines. --- a/bin/SIFT_exome_nssnvs.pl +++ b/bin/SIFT_exome_nssnvs.pl @@ -1,4 +1,8 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl + +#Location of all the SIFT grot on Debian. +use lib '/usr/lib/sift/bin'; + use List::Util qw[min max]; use File::Copy; use Getopt::Std; @@ -12,7 +16,7 @@ $| = 1; system("umask 006"); -$ENV{'SIFT_HOME'} = '/usr/local/projects/SIFT/sift4.0.2/'; +$ENV{'SIFT_HOME'} = '/usr/lib/sift/'; my $SIFT_HOME = $ENV{'SIFT_HOME'}; use vars qw($opt_i $opt_d $opt_o $opt_A $opt_B $opt_C $opt_D $opt_E $opt_F $opt_G $opt_H $opt_I $opt_J $opt_K $opt_L); getopts("i:d:o:A:B:C:D:E:F:G:H:I:J:K:L:"); @@ -62,7 +66,7 @@ my $oo12 = defined($opt_L) ? 1:0; my $pid = $$; my $bin = "$SIFT_HOME/bin"; -my $tmp = defined($opt_o) ? "$opt_o/$pid" : "$SIFT_HOME/tmp/$pid"; +my $tmp = defined($opt_o) ? "$opt_o/$pid" : "/tmp/sift_$pid"; my $Variation_db_dir = $opt_d; mkdir $tmp; my $num_coords_per_split = 1000; --- a/bin/SIFT_exome_indels.pl +++ b/bin/SIFT_exome_indels.pl @@ -1,4 +1,6 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl + +use lib '/usr/lib/sift/bin'; # This program is licensed to you under the Fred # Hutchinos Cancer Research Center (FHCRC) @@ -12,7 +14,7 @@ use Tie::IxHash; use File::Copy; use Getopt::Std; -$ENV{'SIFT_HOME'} = '/usr/local/projects/SIFT/sift4.0.1'; +$ENV{'SIFT_HOME'} = '/usr/lib/sift'; $SIFT_HOME = $ENV{'SIFT_HOME'}; use vars qw($opt_i $opt_c $opt_d $opt_o); getopts("i:c:d:o:"); @@ -38,7 +40,7 @@ system("umask 006"); my $bin = "$SIFT_HOME/bin"; my $pid = $$; -my $tmp = defined($opt_o) ? "$opt_o/$pid" : "$SIFT_HOME/tmp/$pid"; +my $tmp = defined($opt_o) ? "$opt_o/$pid" : "/tmp/sift_$pid"; require "$bin/SIFT_subroutines.pm"; mkdir $tmp; chmod 0777, $tmp; --- a/bin/perlscripts/get_BLINK_seq.pl +++ b/bin/perlscripts/get_BLINK_seq.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl #use strict; use LWP::Simple; use LWP::UserAgent; --- a/bin/seqs_chosen_via_median_info.csh +++ b/bin/seqs_chosen_via_median_info.csh @@ -1,7 +1,8 @@ -#!/bin/csh +#!/bin/csh -e # seqs_chosen_via_median_info.csh # Arg1 = protein sequence in fasta format # Arg2 = blastpgp database +# Arg3 = blastpgp processors (-a) # This program is licensed to you under the Fred # Hutchinos Cancer Research Center (FHCRC) @@ -9,14 +10,16 @@ # http://blocks.fhcrc.org/sift/license.html and should be attached # to this software -set tmpdir = $SIFT_DIR/tmp -set bindir = $SIFT_DIR/bin - -unalias cp -unalias rm +unalias cp || true +unalias rm || true set seq_database = $2 set median_threshold = 2.75; +if ( $3 == "" ) then + set blastprocessors = 1 +else + set blastprocessors = $3 +endif # Cleanup query sequence. NCBI stuff is intolerant of format # deviations. @@ -24,22 +27,24 @@ set tail = $1:t set pid = $tail:r echo tail is $tail -set query = $tmpdir/$tail.query +set query = $TMPDIR/$tail.query echo query is $query -$bindir/fastaseqs $1 $query >& /dev/null +echo "fastaseqs $1 $query >& /dev/null" +fastaseqs $1 $query >& /dev/null # (1) PSIBLAST - set iterations = 1 +set iterations = 2 # the query is called $$.fasta, get pid from that ####### FILTER ############################ #$BLASTFILTER/seg $query -x > $query.imp +echo "cp $query $query.unfiltered" cp $query $query.unfiltered -#mv -f $query.imp $query # this has the filtered query +#mv -f $query.imp $query # this has the filtered query # b option, # of alignments to show (else it is 250) # m option, type of alignment 0-pairwise, 6-flat master slave w/out identites @@ -47,10 +52,18 @@ # make error file & remove only if PSI-BLAST is finished within time allotted # not filtering query because Jorja says new statistics take care of that -echo "PSI-BLAST time limit exceeded." > $tmpdir/$pid.time.error +echo "PSI-BLAST time limit exceeded." > $TMPDIR/$pid.time.error limit cputime 30m -$NCBI/blastpgp -d $seq_database -i $query -o $query.out -m 0 -j $iterations -e .0001 -h 0.002 -b 399 +if ( $?SIFT_DEBUG ) then + echo "if ! ( -e "/tmp/savethis.out" ) blastpgp -a $blastprocessors -d $seq_database -i $query -o /tmp/savethis.out -m 0 -j $iterations -e .0001 -h 0.002 -b 399" + if ! ( -e "/tmp/savethis.out" ) blastpgp -a $blastprocessors -d $seq_database -i $query -o /tmp/savethis.out -m 0 -j $iterations -e .0001 -h 0.002 -b 399; + echo "cp -a /tmp/savethis.out $query.out" + cp -a /tmp/savethis.out $query.out +else + echo "blastpgp -a $blastprocessors -d $seq_database -i $query -o $query.out -m 0 -j $iterations -e .0001 -h 0.002 -b 399" + blastpgp -a $blastprocessors -d $seq_database -i $query -o $query.out -m 0 -j $iterations -e .0001 -h 0.002 -b 399 +endif if ($status != 0) then echo "exiting because stauts not equal to 0" @@ -59,45 +72,50 @@ unlimit cputime # finished within 10 minutes, remove error file -rm -f $tmpdir/$pid.time.error +echo "rm -f $TMPDIR/$pid.time.error" +rm -f $TMPDIR/$pid.time.error ##echo done psiblast # get the psiblast alignment of all the sequences found. -$bindir/psiblast_res_to_fasta_dbpairwise $query.out $query.globalX $iterations $query.unfiltered +echo "__SIFT_BINDIR__/psiblast_res_to_fasta_dbpairwise $query.out $query.globalX $iterations $query.unfiltered" + __SIFT_BINDIR__/psiblast_res_to_fasta_dbpairwise $query.out $query.globalX $iterations $query.unfiltered if ($status != 0) then exit (-1) endif -$bindir/clump_output_alignedseq $query.globalX $tmpdir/$pid.clumped .9 0 >& /dev/null +echo "__SIFT_BINDIR__/clump_output_alignedseq $query.globalX $TMPDIR/$pid.clumped .9 0 >& /dev/null" + __SIFT_BINDIR__/clump_output_alignedseq $query.globalX $TMPDIR/$pid.clumped .9 0 >& /dev/null -echo "choose seqs time limit exceeded." > $tmpdir/$pid.time.error +echo "choose seqs time limit exceeded." > $TMPDIR/$pid.time.error limit cputime 30m -$bindir/choose_seqs_via_psiblastseedmedian $query.unfiltered $tmpdir/$pid.clumped $query.selectedclumped 1.0 $pid $median_threshold >& /dev/null +echo "__SIFT_BINDIR__/choose_seqs_via_psiblastseedmedian $query.unfiltered $TMPDIR/$pid.clumped $query.selectedclumped 1.0 $pid $median_threshold" + __SIFT_BINDIR__/choose_seqs_via_psiblastseedmedian $query.unfiltered $TMPDIR/$pid.clumped $query.selectedclumped 1.0 $pid $median_threshold if ($status != 0) then echo "exiting because stauts not equal to 0" exit (-1) endif unlimit cputime -rm -f $tmpdir/$pid.time.error +rm -f $TMPDIR/$pid.time.error -$bindir/consensus_to_seq $query.selectedclumped $tmpdir/$pid.clumped.consensuskey $tmpdir/$pid.selected >& /dev/null +echo "__SIFT_BINDIR__/consensus_to_seq $query.selectedclumped $TMPDIR/$pid.clumped.consensuskey $TMPDIR/$pid.selected >& /dev/null" + __SIFT_BINDIR__/consensus_to_seq $query.selectedclumped $TMPDIR/$pid.clumped.consensuskey $TMPDIR/$pid.selected >& /dev/null -$bindir/seqs_from_psiblast_res $query.out $tmpdir/$pid.selected $iterations $query.unfiltered $tmpdir/$pid.alignedfasta $pid +echo "__SIFT_BINDIR__/seqs_from_psiblast_res $query.out $TMPDIR/$pid.selected $iterations $query.unfiltered $TMPDIR/$pid.alignedfasta $pid" + __SIFT_BINDIR__/seqs_from_psiblast_res $query.out $TMPDIR/$pid.selected $iterations $query.unfiltered $TMPDIR/$pid.alignedfasta $pid - rm $tmpdir/$pid.clumped* - rm $query.selectedclumped* - rm $tmpdir/$pid.selected - rm $tmpdir/$pid.unaligned - rm $query.globalX - rm $tmpdir/$pid.psiblastout - rm $pid.TEMP* - rm $query - rm $query.unfiltered +echo "rm -f" + rm -f $TMPDIR/$pid.clumped* + rm -f $query.selectedclumped* + rm -f $TMPDIR/$pid.selected + rm -f $TMPDIR/$pid.unaligned + rm -f $query.globalX + rm -f $TMPDIR/$pid.psiblastout + rm -f $pid.TEMP* + rm -f $query + rm -f $query.unfiltered exit (0) - - --- a/bin/SIFT_for_submitting_fasta_seq.csh +++ b/bin/SIFT_for_submitting_fasta_seq.csh @@ -1,4 +1,4 @@ -#!/bin/csh +#!/bin/csh -e # SIFT.csh # This program is licensed to you under the Fred # Hutchinos Cancer Research Center (FHCRC) @@ -10,30 +10,37 @@ # Argument 1: the protein sequence file in fasta format # Argument 2: the pathname to the protein sequence database # Argument 3: the substitution file (file containing amino acid substitutions to be predicted on. +# Argument 4: blast processors ### Set these for your installation # Location of blastpgp -setenv NCBI /usr/local/packages/blast-2.2.18/bin/ +#setenv NCBI /usr/local/packages/blast-2.2.18/bin/ # Location of BLIMPS -setenv BLIMPS_DIR /opt/www/sift/sift3.0/blimps/ - -# Location of SIFT -setenv SIFT_DIR /opt/www/sift/sift3.0 +setenv BLIMPS_DIR __MAKE_PREFIX__/share/sift/blimps # SIFT's output files are written here -set tmpdir = /opt/www/sift/sift3.0/tmp +setenv TMPDIR `mktemp -d` || exit 1 + +setenv SIFT_BINDIR __SIFT_BINDIR__ +setenv SIFT_SCRIPTDIR __SIFT_SCRIPTDIR__ +setenv BLASTMAT __MAKE_PREFIX__/share/ncbi/data + +if ( -e "$HOME/.siftrc" ) source "$HOME/.siftrc"; ### Shouldn't need to make any more changes, look for output in $tmpsift -set bindir = $SIFT_DIR/bin set root_file = $1:r set tail_of_root = $root_file:t -set tmpfasta = $tmpdir/$tail_of_root.alignedfasta -set tmpsift = $tmpdir/$tail_of_root.SIFTprediction - -$bindir/seqs_chosen_via_median_info.csh $1 $2 $4 -$bindir/info_on_seqs $tmpfasta $3 $tmpsift -echo "Output in $tmpsift" +set tmpfasta = $TMPDIR/$tail_of_root.alignedfasta +#set tmpsift = $TMPDIR/$tail_of_root.SIFTprediction +set tmpsift = $tail_of_root.SIFTprediction + +echo "$SIFT_SCRIPTDIR/seqs_chosen_via_median_info.csh $1 $2 $4" + $SIFT_SCRIPTDIR/seqs_chosen_via_median_info.csh $1 $2 $4 +echo "$SIFT_SCRIPTDIR/info_on_seqs $tmpfasta $3 $tmpsift" + $SIFT_SCRIPTDIR/info_on_seqs $tmpfasta $3 $tmpsift +rm -rf $TMPDIR +echo "Output in ($PWD/)$tmpsift" exit --- a/bin/SIFT_for_submitting_NCBI_gi_id.csh +++ b/bin/SIFT_for_submitting_NCBI_gi_id.csh @@ -1,4 +1,4 @@ -#!/bin/csh +#!/bin/csh -e # SIFT.csh # This program is licensed to you under the Fred # Hutchinos Cancer Research Center (FHCRC) @@ -6,53 +6,56 @@ # http://blocks.fhcrc.org/sift/license.html and should be attached # to this software -# Argument 1: NCBI protein gi #. Just the number, nothing else. -# Argument 2: The substitution file (file containing amino acid substitutions to be predicted on. -# Argument 3: Type of sequences to get from NCBI BLink. "BEST" or "ALL" -# BEST is recommended if high-confidence scores can be obtained +# Argument 1: NCBI protein gi #. Just the number, nothing else. +# Argument 2: The substitution file (file containing amino acid substitutions to be predicted on. +# Argument 3: Type of sequences to get from NCBI BLink. "BEST" or "ALL" +# BEST is recommended if high-confidence scores can be obtained ### Set these for your installation # Location of blastpgp -setenv NCBI ../../blast -setenv NCBI /usr/local/blast/ +#setenv NCBI ../../blast # Location of BLIMPS -setenv BLIMPS_DIR ../blimps - -# Location of SIFT -setenv SIFT_DIR ../ +setenv BLIMPS_DIR __MAKE_PREFIX__/share/sift/blimps # SIFT's output files are written here -set tmpdir = ../tmp +setenv TMPDIR `mktemp -d` || exit 1 + +setenv SIFT_SCRIPTDIR __SIFT_SCRIPTDIR__ +setenv BLASTMAT __MAKE_PREFIX__/share/ncbi/data + +if ( -e "$HOME/.siftrc" ) source "$HOME/.siftrc"; -### Shouldn't need to make any more changes, look for output in $tmpdir -set bindir = $SIFT_DIR/bin +### Shouldn't need to make any more changes, look for output in $tmpdir set gi = $1 set polymorphism_file = $2 -set outfile = $tmpdir/$1.SIFTprediction -set bestORall = $3 # if set to BEST, takes best reciprocal hits from BLink, +#set outfile = $TMPDIR/$1.SIFTprediction +set outfile = $1.SIFTprediction +set bestORall = $3 # if set to BEST, takes best reciprocal hits from BLink, # otherwise it will take all hits from BLINK # get unaligned sequences from NCBI -set unalignedseqs = $tmpdir/$gi.unaligned -perl $bindir/perlscripts/get_BLINK_seq.pl $gi $tmpdir/$gi.unaligned $bestORall +set unalignedseqs = $TMPDIR/$gi.unaligned +$SIFT_SCRIPTDIR/get_BLINK_seq.pl $gi $TMPDIR/$gi.unaligned $bestORall # getting the alignment from PSI-BLAST, quick and dirty cat $unalignedseqs | perl -pe 's/\|//' | perl -pe 's/\|/\t/' | perl -pe 's/^\n//' | awk '{if ($1 ~ /^>/) { print $1} else { print;}}' > $unalignedseqs.2 set alignedseqs = $unalignedseqs.aligned - perl $bindir/perlscripts/separate_query_from_rest_of_seqs.pl $unalignedseqs.2 $unalignedseqs.queryseq $unalignedseqs.database - $NCBI/formatdb -i $unalignedseqs.database -o T -p T + $SIFT_SCRIPTDIR/separate_query_from_rest_of_seqs.pl $unalignedseqs.2 $unalignedseqs.queryseq $unalignedseqs.database + formatdb -i $unalignedseqs.database -o T -p T # extremely large evalues and multipass threshold because want to make sure get #all the # sequences the user submits - $NCBI/blastpgp -d $unalignedseqs.database -i $unalignedseqs.queryseq -o $unalignedseqs.psiblastout -m 0 -j 4 -e 10 -h 1 -b 399 + blastpgp -d $unalignedseqs.database -i $unalignedseqs.queryseq -o $unalignedseqs.psiblastout -m 0 -j 4 -e 10 -h 1 -b 399 echo QUERY > $unalignedseqs.listseq grep ">" $unalignedseqs.database | cut -d" " -f1 | cut -c2- >> $unalignedseqs.listseq - $bindir/seqs_from_psiblast_res $unalignedseqs.psiblastout $unalignedseqs.listseq 4 $unalignedseqs.queryseq $alignedseqs $$ + seqs_from_psiblast_res $unalignedseqs.psiblastout $unalignedseqs.listseq 4 $unalignedseqs.queryseq $alignedseqs $$ -# final scoring -$bindir/info_on_seqs $alignedseqs $polymorphism_file $outfile +# final scoring +info_on_seqs $alignedseqs $polymorphism_file $outfile +rm -rf $TMPDIR +echo "Output in ($PWD/)$outfile" exit --- a/src/Alignment.c +++ b/src/Alignment.c @@ -17,7 +17,7 @@ #define _ALIGNMENT_C_ #include "PN_convert.c" -#define MAXSEQ 400 +#define MAXSEQ 20000 #include "Alignment.h" @@ -478,6 +478,7 @@ fprintf(errorfp, "ERROR! Why haven't %s and %s been freed?\n", prev_name, prev_residues); fprintf (errorfp, "Current %s %s\n", name, residues); + fprintf (stderr, "Current %s %s\n", name, residues); exit (-1); } } /* end of if (line[0] != ' ') */ @@ -752,7 +753,8 @@ } if (feof (fp)) { fclose (fp); - fprintf (errorfp, "ERROR! PSI-BLAST found no hits. Program terminating.\n"); + fprintf (errorfp, "ERROR! PSI-BLAST found no hits. Program terminating.\n"); + fprintf (stderr, "%s:%d ERROR! PSI-BLAST found no hits. Program terminating.\n", __FILE__, __LINE__ ); exit (-1); } @@ -792,7 +794,8 @@ fclose (fp); if (return_error) { fprintf (errorfp, "ERROR! PSI-BLAST found no hits. Program terminating.\n"); - exit (-1); + fprintf (stderr, "%s:%d ERROR! PSI-BLAST found no hits. Program terminating.\n", __FILE__, __LINE__ ); + exit (-1); } else { return (-1); } @@ -827,6 +830,7 @@ if (feof (fp)) { fclose (fp); fprintf (errorfp, "ERROR! PSI-BLAST found no hits. Program terminating.\n"); + fprintf (stderr, "%s:%d ERROR! PSI-BLAST found no hits. Program terminating.\n", __FILE__, __LINE__ ); exit (-1); } sscanf (line, "Results from round %d", &iteration); @@ -1273,6 +1277,7 @@ } } fprintf (errorfp, "EXITING -- couldn't find %s in get_index_from_seqs\n", name); + fprintf (stderr, "EXITING -- couldn't find %s in get_index_from_seqs\n", name); exit (-1); } @@ -1405,8 +1410,9 @@ sscanf (strptr, "Expect = %lf", &evalue); } else { fprintf (errorfp, "Unable to read e-value, parsing incorrect"); - exit (-1); - } + fprintf (stderr, "Unable to read e-value, parsing incorrect"); + exit (-1); + } /* printf ("evalue is %lf\n", evalue); */ /* end get evalue */ @@ -1491,6 +1497,7 @@ assert (seq1->length == seq2->length); if (strcmp (seq1->name, seq2->name ) != 0) { fprintf (errorfp, "seeing whether %s and %s overlap, should be same name\n", seq1->name, seq2->name); + fprintf (stderr, "seeing whether %s and %s overlap, should be same name\n", seq1->name, seq2->name); exit (-1); } for (i = 0; i < seq1->length; i++) { @@ -1595,7 +1602,7 @@ } if (strstr (line, "Query") == NULL) { - printf ("%s line should have Query:\n", line); + fprintf (stderr, "%s line should have Query:\n", line); exit (-1); } strptr = strtok (line, " \t\r\n"); /* this should be Query */ @@ -1687,7 +1694,7 @@ assert (fp != NULL); fgets (line, LARGE_BUFF_LENGTH, fp); if (strstr (line, "Query") == NULL) { - printf ("%s line should have Query:\n"); + fprintf (stderr,"%s line should have Query:\n"); exit (-1); } strptr = strtok (line, " \t\r\n"); /* this should be Query */ @@ -2107,10 +2114,10 @@ } - printf ("Clumped %d sequences into %d clusters\n", nseq, i2); + printf("Clumped %d sequences into %d clusters\n", nseq, i2); /* keep this print because get # of clustersin awk statement */ free(pairs); -printf ("exiting clump\n"); +fprintf (stderr, "%s:%d exiting clump\n", __FILE__, __LINE__); return no_of_clumps; } /* end of cluster */ --- a/src/blocklist.c +++ b/src/blocklist.c @@ -9,7 +9,7 @@ #include #include "blocklist.h" -#define MAXSEQ 400 +#define MAXSEQ 20000 /*====================================================================== routines for a list of blocks --- a/src/choose_seqs_via_psiblastseedmedian.c +++ b/src/choose_seqs_via_psiblastseedmedian.c @@ -1,6 +1,6 @@ /* (C) Copyright 2000, Fred Hutchinson Cancer Research Center */ /* Use, modification or distribution of these programs is subject to */ -/* the terms of the non-commercial licensing agreement in license.h. */ +/* the terms of the non-commercial licensing agreement in license.h. */ /* choose_seqs_via_psiblast.c */ /* 02-14-01 use median threshold 05-18-01, changed to add PSIBLAST_TURN sequences at once, if threshold has not @@ -8,7 +8,7 @@ recalculate median threshold for the 1 .. PSIBLAST_TURN sequences that have been added. Should reduce time for calculation of median threshold O (length * seq) - by PSIBLAST_TURN x fold + by PSIBLAST_TURN x fold */ #define EXTERN #include "blocksprogs.h" @@ -20,7 +20,7 @@ #include "Matrix_Info.c" #include "Protdist.c" #include "stringhash.c" -#include "Psiblast.c" +#include "Psiblast.c" #include "PN_blocks.c" #include "Clumping.c" @@ -29,9 +29,9 @@ void getargs (int argc, char* argv[], char query_seq_file[LARGE_BUFF_LENGTH], - FILE** seqfp, + FILE** seqfp, FILE** outfp, - double* clumping_threshold, + double* clumping_threshold, char pid[SMALL_BUFF_LENGTH], double* median_threshold); @@ -53,10 +53,10 @@ FILE* outfp; FILE* temp_chk_fp; FILE* seqfp; int nseqs, db_type, seq_type; - int i, k, index, nseqs_written; double dtemp; + int i, k, index = 0, nseqs_written; double dtemp; double clumping_threshold; double info_median; - Block* workingblock; + Block* workingblock; HashTable seqnamehash; char seqname[KEY_WIDTH]; char pid[SMALL_BUFF_LENGTH]; @@ -77,22 +77,22 @@ init_frq_qij(); /* matrix conversion in order to calculate R */ - getargs (argc, argv, query_seq_file, &seqfp, &outfp, + getargs (argc, argv, query_seq_file, &seqfp, &outfp, &clumping_threshold, pid, &median_threshold); seqnamehash = InitializeTable (10); - + i = 0; /**************** READ IN SEQUENCES (to get info comments) ***************/ nseqs = 0; db_type = type_dbs (seqfp, DbInfo); seq_type = AA_SEQ; rewind (seqfp); - while ( nseqs < MAXSEQ && + while ( nseqs < MAXSEQ && (seqs[nseqs] = read_a_sequence(seqfp, db_type, seq_type)) != NULL) { nseqs++; } fclose (seqfp); -printf ("clumping teshodl is %.2f\n",clumping_threshold); +fprintf( stderr, "clumping teshodl is %.2f\n",clumping_threshold); /**************** GET SEED BLOCK BY PERCENTAGE IDENTITY ******************/ /* cluster_of_seqs = cluster_seqs (clumping_threshold, seqs, nseqs, &num_clusters); @@ -101,13 +101,13 @@ */ /*only copy query sequence (argumen 3 is 1 =>copy 1rst seq. only )*/ - workingblock = make_block (seqs[0]->length, 0, 1, seqs, FALSE); + workingblock = make_block (seqs[0]->length, 0, 1, seqs, FALSE); assert (workingblock != NULL); for (i = 0; i < workingblock->num_sequences; i++) { - printf ("inserting %s in hash\n", workingblock->sequences[i].name); + fprintf( stderr, "inserting %s in hash\n", workingblock->sequences[i].name); Insert (workingblock->sequences[i].name, seqnamehash); } - median = calculate_median_information_of_block (workingblock, FALSE, TRUE); + median = calculate_median_information_of_block (workingblock, FALSE, TRUE); /* write the first sequence into a file that will be used as the query for the psiblast search */ @@ -116,20 +116,20 @@ write_seq_to_file (query_filename, seqs[0]); assert (workingblock != NULL); -printf ("median is %.2f\n", median); +fprintf( stderr, "median is %.2f\n", median); /* printf ("finished extracting\n"); */ -/* fprintf (outfp, "*********SEED BLOCK********\n"); +/* fprintf (outfp, "*********SEED BLOCK********\n"); fprintf (outfp, "median %.3f \n", median); - output_block_s (workingblock, outfp, FLOAT_OUTPUT); + output_block_s (workingblock, outfp, FLOAT_OUTPUT); fflush (outfp); */ - done = FALSE; + done = FALSE; /**********ITERATIVE LOOP *************************************/ -while ( median > median_threshold && !done) { +while ( median > median_threshold && !done) { /* before add next sequence, copy this block, which we know that passes criterion, to finalblock */ - /****PSIBLAST CALL do it only every PSIBLAST_TURN round */ + /****PSIBLAST CALL do it only every PSIBLAST_TURN round */ strcpy (temp_chk_filename, pid); strcat (temp_chk_filename, ".TEMP"); strcpy (psiblastres_file, pid); @@ -138,31 +138,31 @@ strcat (query_database_file, ".TEMP_DB"); /*strcpy (temp_chk_filename,"tmpout3pid.TEMP"); */ - nseqs_written = write_seqs_not_in_hash (query_database_file, + nseqs_written = write_seqs_not_in_hash (query_database_file, seqs, nseqs, seqnamehash); if (nseqs_written == 0) { fprintf (stderr, "wrote 0 seqs to database\n"); done = TRUE; } else { - formatdb_system_call (query_database_file) ; + formatdb_system_call (query_database_file) ; if ((temp_chk_fp = fopen (temp_chk_filename, "w")) == NULL) { fprintf (errorfp, "couldn't open checkpoint file %s when choosing sequences\n", temp_chk_filename); exit (-1); } - block_to_checkpoint_file (workingblock, temp_chk_fp, logfp); - fclose (temp_chk_fp); - printf ("prior to psiblast system call"); + block_to_checkpoint_file (workingblock, temp_chk_fp, logfp); + fclose (temp_chk_fp); + fprintf( stderr, "prior to psiblast system call"); psiblast_system_call (temp_chk_filename, query_database_file, - psiblastres_file, query_filename, logfp); - printf ("finished psiblast system call"); + psiblastres_file, query_filename, logfp); + fprintf( stderr, "finished psiblast system call"); } /* end of if seqs written to database file */ /****END OF PSIBLAST CALL ****/ /* GET TOP PSIBLAST_TURN HITS ******/ - for (k = 0; k < PSIBLAST_TURN && !done; k++) { - get_top_seq (seqname, seqnamehash, psiblastres_file); - printf ( "top sequence is %s\n", seqname); + for (k = 0; k < PSIBLAST_TURN && !done; k++) { + get_top_seq (seqname, seqnamehash, psiblastres_file); + fprintf( stderr, "top sequence is %s\n", seqname); if (seqname[0] != '\0' && seqname[0] != ' ') { /**** ADD TOP SEQ to BLOCK *******************/ index = get_index_from_seqs (seqname, seqs, nseqs); @@ -170,28 +170,28 @@ /* block not weighted */ add_sequence_to_block (workingblock,seqs[index], FALSE); if (workingblock->num_sequences == nseqs) { - /* added all the possible sequences to the block + /* added all the possible sequences to the block have chosen all of the sequences, done*/ done = TRUE; } - Insert (seqname, seqnamehash); /* update hash */ + Insert (seqname, seqnamehash); /* update hash */ } else { fprintf (stderr, "done\n"); done = TRUE; } } /* done adding PSIBLAST_TURN sequences ***/ - median = calculate_median_information_of_block (workingblock, - FALSE, FALSE); + median = calculate_median_information_of_block (workingblock, + FALSE, FALSE); fprintf (logfp, "added %s %s median:%.3f median %d seq total\n", workingblock->sequences[workingblock->num_sequences-1].name, - seqs[index]->info, median, workingblock->num_sequences); + seqs[index]->info, median, workingblock->num_sequences); fflush (logfp); fflush (stderr); - fflush (errorfp); fflush (outfp); + fflush (errorfp); fflush (outfp); /* prepare to run psiblast on next iteration */ fprintf (stderr, "rm search files\n"); - rm_file (temp_chk_filename); + rm_file (temp_chk_filename); rm_file (psiblastres_file); strcat (query_database_file, ".*"); rm_file (query_database_file); @@ -208,15 +208,15 @@ /* print_block_ids (workingblock, outfp, TRUE); */ /* should free memeory, clean up later */ /* fclose (outfp); fclose (errorfp); fclose (logfp); - exit (0);*/ + exit (0);*/ /* exit 0 else will generate an error in shell that calls this program*/ } /* now of the last PSIBLAST_TURN sequences added, find out which one gives - approaches the right median */ + approaches the right median */ num_seqs_removed = 0; while (median < median_threshold) { workingblock->num_sequences--; - median = calculate_median_information_of_block (workingblock, + median = calculate_median_information_of_block (workingblock, FALSE, FALSE); num_seqs_removed++; } @@ -225,30 +225,30 @@ fprintf (logfp, "********FINAL BLOCK ****** "); fprintf (logfp, "last block : median %.3f with %d sequences\n", median, - workingblock->num_sequences); -/* output_block_s (workingblock, outfp, FLOAT_OUTPUT); */ + workingblock->num_sequences); +/* output_block_s (workingblock, outfp, FLOAT_OUTPUT); */ -/* print_block_sequences (workingblock, outfp); */ - print_block_ids (workingblock, outfp, TRUE); +/* print_block_sequences (workingblock, outfp); */ + print_block_ids (workingblock, outfp, TRUE); - workingblock->num_sequences+= num_seqs_removed; + workingblock->num_sequences+= num_seqs_removed; /* increase it back to deallocate - memory properly */ + memory properly */ free_seqs (seqs, nseqs); free_block (workingblock); DestroyTable (seqnamehash); fprintf (logfp, "SUCCESSFUL\n"); - fclose (outfp); + fclose (outfp); fclose (errorfp); - fclose (logfp); + fclose (logfp); rm_file (errorfilename); exit (0); } /* end of main */ /* return the number of sequences written */ int -write_seqs_not_in_hash (char query_database_file[LARGE_BUFF_LENGTH], +write_seqs_not_in_hash (char query_database_file[LARGE_BUFF_LENGTH], Sequence* seqs[MAXSEQ], int nseqs, HashTable hash) { @@ -259,7 +259,7 @@ nseqs_written = 0; if ( (outfp = fopen (query_database_file, "w")) == NULL) { - fprintf (errorfp, "couldn't write to %s \n", + fprintf (errorfp, "couldn't write to %s \n", query_database_file); exit (-1); } @@ -273,15 +273,15 @@ fclose (outfp); return nseqs_written; - + } /* end write_seqs_not_in_block */ - + void -getargs (int argc, char* argv[], +getargs (int argc, char* argv[], char query_seq_file[LARGE_BUFF_LENGTH], - FILE** seqfp, - FILE** outfp, - double* clumping_threshold, + FILE** seqfp, + FILE** outfp, + double* clumping_threshold, char pid[SMALL_BUFF_LENGTH], double* median_threshold) { @@ -297,7 +297,7 @@ if (argc > 1) strcpy (query_seq_file, argv[1]); else { printf ("Enter query sequence file\n"); - gets (query_seq_file); + fgets(query_seq_file, LARGE_BUFF_LENGTH, stdin ); } @@ -306,7 +306,7 @@ else { printf ("Enter sequence file \n"); - gets (seqfile); + fgets(seqfile, LARGE_BUFF_LENGTH, stdin); } if ( (*seqfp = fopen (seqfile, "r")) == NULL ) { @@ -315,10 +315,10 @@ } if ( argc > 3) strcpy (outfile, argv[3]); - else + else { printf ("Enter outfile \n"); - gets (outfile); + fgets( outfile, LARGE_BUFF_LENGTH, stdin ); } strcpy (errorfilename, outfile); @@ -341,7 +341,7 @@ } if (argc > 4) *clumping_threshold = atof (argv[4]); - else + else { printf ("Enter clumping threshold\n"); scanf ("%lf\n", clumping_threshold); @@ -351,10 +351,10 @@ if (argc > 5) strcpy (pid, argv[5]); else { printf ("Enter process identification number\n"); - gets (pid); - printf ("gets %s\n", pid); + fgets( pid, SMALL_BUFF_LENGTH, stdin ); + printf ("fgets %s\n", pid); } - + if (argc > 6) *median_threshold = atof (argv[6]); else { printf ("Enter median threshold\n"); @@ -363,10 +363,10 @@ } -} /* end of getargs */ +} /* end of getargs */ void -write_seq_to_file (char filename[LARGE_BUFF_LENGTH], Sequence* seq) +write_seq_to_file (char filename[LARGE_BUFF_LENGTH], Sequence* seq) { FILE* fp; --- a/src/clump_output_alignedseq.c +++ b/src/clump_output_alignedseq.c @@ -25,7 +25,7 @@ #include "Clumping.c" #define MINSEQ 2 -#define MAXSEQ 400 +#define MAXSEQ 20000 #define INDEX(n, col, row) (col*n - (col*(col+3))/2 - 1 + row) #define MIN_CLUSTERS 3 @@ -144,7 +144,7 @@ else { printf ("Enter filename with sequences:\n"); - gets (seqfilename); + fgets( seqfilename, LARGE_BUFF_LENGTH, stdin ); } if ((*seqfp = fopen (seqfilename, "r")) == NULL) { @@ -156,7 +156,7 @@ else { printf ("Enter name of outfile\n"); - gets (outfilename); + fgets( outfilename, LARGE_BUFF_LENGTH, stdin ); } --- a/src/consensus_to_seq.c +++ b/src/consensus_to_seq.c @@ -120,7 +120,7 @@ else { printf ("Enter filename with sequences:\n"); - gets (seqfilename); + fgets( seqfilename, LINE_LEN, stdin ); } if ((*seqfp = fopen (seqfilename, "r")) == NULL) @@ -133,7 +133,7 @@ else { printf ("Enter file with keys and elements\n"); - gets (keyfilename); + fgets( keyfilename, LINE_LEN, stdin ); } if ((*keyfp = fopen (keyfilename, "r")) == NULL) { @@ -144,7 +144,7 @@ else { printf ("Enter name of outfile\n"); - gets (outfilename); + fgets( outfilename, LINE_LEN, stdin ); } if ( (*outfp = fopen (outfilename, "w")) == NULL) { --- a/src/info_on_seqs.c +++ b/src/info_on_seqs.c @@ -11,7 +11,7 @@ #include "Psiblast.c" /* #include "Information.c */ -#define MAXSEQ 400 /* Maximum number of sequences */ +#define MAXSEQ 20000 /* Maximum number of sequences */ #define LINE_LEN 800 #define FALSE 0 #define TRUE 1 @@ -198,10 +198,10 @@ int substitution, original_aa; int substitution_exists; int pos; - double median; + double median = 0; AAnode current; - Block* block_with_seqs_at_pos; - double* info_array; + Block* block_with_seqs_at_pos = NULL; + double* info_array = NULL; int block_constructed; substitution_exists = FALSE; @@ -337,7 +337,7 @@ fprintf (outfp, "%d\n", residues_stored[pos]); */ /* the array will be sorted in call for median_of_array*/ -printf ("looking for median\n"); +fprintf( stderr, "looking for median\n"); median = median_of_array (info_array, prot_length); printf ("found mediant %.2f\n", median); fprintf (outfp, "%.3f\t%d\t%d\n", @@ -352,7 +352,7 @@ assert (oldblock->sequences[0].sequence != NULL); assert (oldblock->sequences != NULL); assert (oldblock != NULL); -printf ("finished comments oninfo\n"); +fprintf( stderr, "finished comments oninfo\n"); } double @@ -524,23 +524,23 @@ else { printf ("Enter filename with sequences:\n"); - gets (seqfilename); + fgets( seqfilename, LINE_LEN, stdin ); } -printf ("fawegwa\n"); +fprintf( stderr, "%s:%d\n", __FILE__, __LINE__ ); if ((*seqfp = fopen (seqfilename, "r")) == NULL) { printf ("cannot open file %s \n", seqfilename); exit (-1); } -printf ("eaegrtjkl\n"); +fprintf( stderr, "%s:%d\n", __FILE__, __LINE__ ); if (argc > 2) strcpy (substfilename, argv[2]); else { - printf ("Enter file with substitutions.\n"); - printf ("format:\n"); - printf ("M1Y\n"); - printf ("S2K\n"); - printf ("...\n\n"); - gets (substfilename); + fprintf( stderr, "Enter file with substitutions.\n"); + fprintf( stderr, "format:\n"); + fprintf( stderr, "M1Y\n"); + fprintf( stderr, "S2K\n"); + fprintf( stderr, "...\n\n"); + fgets( substfilename, LARGE_BUFF_LENGTH, stdin ); } if (substfilename[0] == '-') { *polymorphfp = NULL; } else if ((*polymorphfp = fopen (substfilename, "r")) == NULL) @@ -553,8 +553,8 @@ if (argc > 3) strcpy (outfilename, argv[3]); else { - printf ("Enter name of outfile\n"); - gets (outfilename); + fprintf( stderr, "Enter name of outfile\n"); + fgets( outfilename, LARGE_BUFF_LENGTH, stdin ); } strcpy (errorfilename, outfilename); --- /dev/null +++ b/src/Makefile @@ -0,0 +1,45 @@ +include-prefix := $(prefix) +lib-prefix := $(prefix) + +BIN := choose_seqs_via_psiblastseedmedian \ + clump_output_alignedseq \ + consensus_to_seq \ + info_on_seqs \ + psiblast_res_to_fasta_dbpairwise \ + seqs_from_psiblast_res + +CC := gcc +CFLAGS := -I$(include-prefix)/include/blimps -O2 -D__MAKE_PREFIX__=\"$(prefix)\" $(CFLAGS) +LDFLAGS := -L$(lib-prefix)/lib -lblimps $(LDFLAGS) + +all: bin + +bin: $(BIN) + +clean: + rm -f *.o $(BIN) + +distclean: clean + rm -f deps.mk + +install: + mkdir -p $(DESTDIR)$(prefix)/lib/sift/bin && \ + cp -a \ + choose_seqs_via_psiblastseedmedian \ + clump_output_alignedseq \ + consensus_to_seq \ + info_on_seqs \ + psiblast_res_to_fasta_dbpairwise \ + seqs_from_psiblast_res \ + $(DESTDIR)$(prefix)/lib/sift/bin/ + +help: + +.PHONY: all bin clean distclean help install + +deps.mk: + $(CC) -MM $(CFLAGS) $(addsuffix .c,$(BIN)) > $@ + +include deps.mk + +# vim:ai: --- a/src/Matrix_Info.c +++ b/src/Matrix_Info.c @@ -384,7 +384,7 @@ int pos_observed_tolerant, pos_error_tn, pos_notobserved_intolerant, pos_error_fp, pos_total; char c; int l; - int print_correct, print_incorrect, print_X, print_type; + int print_correct, print_incorrect, print_X, print_type = 0; print_correct = 1; print_incorrect = 2; print_X =3; correct = 0; error_fp=0; error_tn = 0, total=0; @@ -438,7 +438,7 @@ observed_tolerant++; pos_observed_tolerant++; print_type = print_correct; } - else { printf ("missing somethingdsaff\n"); } + else { fprintf( stderr, "%s:%d missing something\n", __FILE__, __LINE__ ); } } else if (info_matrix->info[aa_atob[c]][l] == not_allowed) { if ( matrix->weights[aa_atob[c]][l] >= 0 ) { /* ERROR of OBSERVED, but INTOLERANT false positive */ @@ -449,7 +449,7 @@ notobserved_intolerant++; pos_notobserved_intolerant++; print_type = print_correct; - } else {printf ("dfafaw\n"); } + } else {fprintf( stderr, "%s:%d\n", __FILE__, __LINE__ ); } } else if (info_matrix->info[aa_atob[c]][l] == less_severe) { switch (severity_option) { case 1: /* include +- phenotype with + */ @@ -520,7 +520,7 @@ pos_error_tn++; error_tn++; print_type = print_incorrect; - } else {printf ("something is wrong\n");} + } else {fprintf( stderr, "%s:%d something is wrong\n", __FILE__, __LINE__ );} break; case 2: /* +- phenotype is - */ if ( matrix->weights[aa_atob[c]][l] > 0.0) { @@ -531,7 +531,7 @@ pos_notobserved_intolerant++; notobserved_intolerant++; print_type = print_correct; - }else {printf ("what?\n"); } + }else {fprintf( stderr, "%s:%d what?\n", __FILE__, __LINE__ ); } break; default: fprintf(errorfp, "severity option error %d\n", severity_option); @@ -612,7 +612,7 @@ int pos_observed_tolerant, pos_error_tn, pos_notobserved_intolerant, pos_error_fp, pos_total; char c; int l; - int print_correct, print_incorrect, print_X, print_type; + int print_correct, print_incorrect, print_X, print_type = 0; print_correct = 1; print_incorrect = 2; print_X =3; correct = 0; error_fp=0; error_tn = 0, total=0; @@ -657,7 +657,7 @@ observed_tolerant++; pos_observed_tolerant++; print_type = print_correct; } - else { printf ("missing somethingdsaff\n"); } + else { fprintf( stderr, "%s:%d missing something\n", __FILE__, __LINE__ ); } } else if (info_matrix->info[aa_atob[c]][l] == not_allowed) { if ( matrix->weights[aa_atob[c]][l] >= 0.01 ) { /* ERROR of OBSERVED, but INTOLERANT false positive */ @@ -735,7 +735,7 @@ pos_error_tn++; error_tn++; print_type = print_incorrect; - } else {printf ("something is wrong\n");} + } else {fprintf( stderr, "%s:%d something is wrong\n", __FILE__, __LINE__ );} break; case 2: /* +- phenotype is - */ if ( matrix->weights[aa_atob[c]][l] > 0.01) { @@ -847,10 +847,10 @@ */ len = 0; -printf ("matrix width %d\n", matrix->width); +fprintf( stderr, "matrix width %d\n", matrix->width); /* skip the letters */ fgets(Buffer, LARGE_BUFF_LENGTH, mfp); -printf ("skiipping letters %s\n", Buffer); +fprintf( stderr, "skipping letters %s\n", Buffer); /* read in the numbers */ while (fgets(Buffer, LARGE_BUFF_LENGTH, mfp) && !blank_line(Buffer) && @@ -924,7 +924,7 @@ min_aa_in_column (Matrix * matrix, int pos) { double min; - int min_aa, aa; + int min_aa = 0, aa; min = 1000; for (aa = 0; aa < AAS; aa++) { @@ -1043,7 +1043,7 @@ case 'W': return 18; case 'Y' : return 19; - default: printf ("unknown character %c\n", c ); + default: fprintf( stderr, "unknown character %c\n", c ); } /* end switch */ } @@ -1060,7 +1060,7 @@ int pos_observed_tolerant, pos_error_tn, pos_notobserved_intolerant, pos_error_fp, pos_total; char c; int l; - int print_correct, print_incorrect, print_X, print_type; + int print_correct, print_incorrect, print_X, print_type = 0; int uncertain_aa, uncertain_tolerant, uncertain_intolerant; int pos_uncertain_aa, pos_uncertain_tolerant, pos_uncertain_intolerant; int total_correct, total_counted; @@ -1124,7 +1124,7 @@ ; print_type = print_correct; } - else { printf ("missing somethingdsaff\n"); } + else { fprintf( stderr, "%s:%d missing something\n", __FILE__, __LINE__ ); } } else if (info_matrix->info[aa_atob[c]][l] == not_allowed) { if ( matrix->weights[aa_atob[c]][l] >= THRESHOLD ) { /* ERROR of OBSERVED, but INTOLERANT false positive */ @@ -1506,9 +1506,9 @@ polymorph_data[i]->aa = aa_atob['-']; polymorph_data[i]->next = NULL; } - if (infofp == NULL) { printf ("no substitutions read\n"); + if (infofp == NULL) { fprintf( stderr, "no substitutions read\n"); return polymorph_data;} /* no file to read*/ -printf ("before seg fault?\n"); +fprintf( stderr, "%s:%d before seg fault?\n", __FILE__, __LINE__ ); while (fgets (line, LARGE_BUFF_LENGTH, infofp) != NULL) { word1[0] = '\0'; word2[0] = '\0'; stringp = strtok (line, " \r\n\t"); --- a/src/PN_blocks.c +++ b/src/PN_blocks.c @@ -671,7 +671,7 @@ Block* newblock; Residue* residue_pointer; int i, pos; - int non_X_length, newblockpos; + int non_X_length, newblockpos = 0; non_X_length = 0; for (i=0; i < block->width; i++) { --- a/src/PN_convert.c +++ b/src/PN_convert.c @@ -1,6 +1,6 @@ /* (C) Copyright 2000, Fred Hutchinson Cancer Research Center */ /* Use, modification or distribution of these programs is subject to */ -/* the terms of the non-commercial licensing agreement in license.h. */ +/* the terms of the non-commercial licensing agreement in license.h. */ /* convert.c: functions for different methods of converting a block into a */ /* matrix */ @@ -8,8 +8,8 @@ /* Change log information is at the end of the file. */ /* PN_convert.c copied from convert.c to allow for large block widths */ -/*07-13-00 changed in counts() so that if an X is not observed, counts is NOT -evenly distributed throughout. For gaps, counts is evenly distributed +/*07-13-00 changed in counts() so that if an X is not observed, counts is NOT +evenly distributed throughout. For gaps, counts is evenly distributed throughout. */ /* system headers not in global.h */ @@ -18,6 +18,9 @@ #include #include +#include +#include +#include "sift_blimps.h" #include "PN_convert.h" #include "queue.c" #include "SortList.c" @@ -47,7 +50,7 @@ void pb_weights(/*block*/); /* - * Matrix construction methods. Build matricies from blocks and + * Matrix construction methods. Build matricies from blocks and * sequence weights. */ @@ -133,7 +136,7 @@ break; case 22: /* instead of adding RTot * diffaas*/ /* as in option 20, add RTot * (diffass - 1), thus not adding*/ - /* any pseudocounts to completely conserved columns */ + /* any pseudocounts to completely conserved columns */ /* qij's as pseudocounts */ PN_altschul_data_dependent_conversion_method (block, matrix, 22); break; @@ -141,13 +144,13 @@ /* exp. pseudocounts*/ PN_altschul_data_dependent_conversion_method (block, matrix, 23); break; - case 24: + case 24: PN_altschul_data_dependent_conversion_method (block, matrix, 24); break; case 25: PN_altschul_data_dependent_conversion_method (block, matrix, 25); break; - case 26: + case 26: PN_altschul_data_dependent_conversion_method (block, matrix, 26); break; case 27: @@ -172,7 +175,7 @@ qij_matrix_profile (block, matrix, Qij); break; default: /* the default case */ - sprintf(ErrorBuffer, + sprintf(ErrorBuffer, "Invalid block to matrix conversion method specified, %d.", conversion_method); ErrorReport(WARNING_ERR_LVL); @@ -213,10 +216,10 @@ ErrorReport(WARNING_ERR_LVL); sprintf(ErrorBuffer, /* ^^^^----------------vvvvvvvvvvvvvvvvvvv */ "Using the default conversion method of Altschul's data-dependent method.\n"); - + ErrorReport(WARNING_ERR_LVL); altschul_data_dependent_conversion_method(block, matrix, 0); - } + } /* return the matrix */ return matrix; @@ -241,11 +244,11 @@ { struct working *col; int aa; - + CheckMem( col = (struct working *) malloc(sizeof(struct working)) ); - + col->totcnt = col->totreg = 0.0; for (aa=0; aa < AASALL; aa++) { col->cnt[aa] = col->reg[aa] = 0.0; @@ -262,7 +265,7 @@ struct work_pssm *pssm; int pos, aa; double* double_pointer; - + CheckMem( pssm = (struct work_pssm *) malloc(sizeof(struct work_pssm)) ); @@ -270,7 +273,7 @@ double_pointer = (double*) calloc (length * AASALL, sizeof (double)); pssm->value = (double **) calloc (length, sizeof (double*)); pssm->sum = (double *) calloc (length, sizeof (double)); - + for (pos = 0; pos < length; pos++) { pssm->value[pos] = &(double_pointer[pos * AASALL]); } @@ -306,7 +309,7 @@ } } /* end of for */ } /* end of subroutine */ - + /*======================================================================*/ static void counts(block, col, pos) Block *block; @@ -316,7 +319,7 @@ int seq, aa, aa1; col->totcnt = col->totraw = col->totreg = 0.0; - for (aa = 0; aa < AASALL; aa++) { + for (aa = 0; aa < AASALL; aa++) { col->cnt[aa] = col->raw[aa] = col->reg[aa] = 0.0; } @@ -358,7 +361,7 @@ struct working *col; { int aa, nr; - + nr = 0; for (aa = 1; aa < AAS; aa++) { if (col->cnt[aa] > 0.0) nr++; @@ -375,9 +378,9 @@ double beta; { int aa, row; - + col->totreg = 0.0; - + /*---------- get the pseudo counts -------------------------------*/ for (aa=1; aa < AAS; aa++) { col->reg[aa] = 0.0; @@ -395,7 +398,7 @@ int find_min_aa_in_pssm (struct working* col, struct work_pssm* pssm, const int pos) { - int aa, min_aa; + int aa, min_aa = 0; double min_value; min_value = 1000; @@ -422,7 +425,7 @@ min_aa = find_min_aa_in_pssm(col, pssm, pos); min = pssm->value[pos][min_aa] * threshold; for (aa = 0; aa < AAS; aa++) { - pssm->value[pos][aa] -= min; + pssm->value[pos][aa] -= min; } @@ -438,14 +441,14 @@ FILE* matrixfp; char matrix_file[LARGE_BUFF_LENGTH]; int pos, original_aa, aa; - char* blimps_dir; + const char* blimps_dir; /* read matrix */ - blimps_dir = getenv ("BLIMPS_DIR"); + blimps_dir = getblimpsdir(); sprintf (matrix_file, "%s/docs/blosum62.bla.new", blimps_dir); -/* strcpy (matrix_file, "/tuna/blocks_5.0/blosum/blosum62.bla.new");*/ - +/* strcpy (matrix_file, "/tuna/blocks_5.0/blosum/blosum62.bla.new");*/ + if ( (matrixfp = fopen (matrix_file, "r")) == NULL) { fprintf (errorfp, "can't read the matrix %s\n", matrix_file); exit (-1); @@ -479,7 +482,7 @@ } /* end of qij_matrix_profile */ - + static void Pauline_pseudo_alts (col, qij, epsilon) struct working* col; struct float_qij *qij; @@ -487,7 +490,7 @@ { int aa, row; - col->totreg = 0.0; + col->totreg = 0.0; /*---------- get the pseudo counts -------------------------------*/ for (aa=1; aa < AAS; aa++) { col->reg[aa] = 0.0; @@ -508,13 +511,13 @@ struct work_pssm *pssm; { int aa; - - pssm->sum[pos] = 0.0; + + pssm->sum[pos] = 0.0; for (aa=1; aa < AAS; aa++) { pssm->value[pos][aa] = col->cnt[aa] + col->reg[aa]; pssm->value[pos][aa] /= (col->totcnt + col->totreg); - + /* compute the odds ratio */ if (freqs[aa] > 0.0) { pssm->value[pos][aa] /= freqs[aa]; @@ -524,7 +527,7 @@ if (pssm->value[pos][aa] > 0.0) { pssm->value[pos][aa] = log(pssm->value[pos][aa]); } - + pssm->sum[pos] += pssm->value[pos][aa]; } } /* end of log_odds */ @@ -553,17 +556,17 @@ /* * find the partitions of D, N, E, and Q for B and Z */ - part_D = frequency[aa_atob['D']] / + part_D = frequency[aa_atob['D']] / ( frequency[aa_atob['D']] + frequency[aa_atob['N']] ); - part_N = frequency[aa_atob['N']] / + part_N = frequency[aa_atob['N']] / ( frequency[aa_atob['D']] + frequency[aa_atob['N']] ); - part_E = frequency[aa_atob['E']] / + part_E = frequency[aa_atob['E']] / ( frequency[aa_atob['E']] + frequency[aa_atob['Q']] ); - part_Q = frequency[aa_atob['Q']] / + part_Q = frequency[aa_atob['Q']] / ( frequency[aa_atob['E']] + frequency[aa_atob['Q']] ); /* fill in the matrix for B, Z, X, gap, stop and non */ - matrix->weights[aa_atob['B']][col] = + matrix->weights[aa_atob['B']][col] = (part_D * matrix->weights[aa_atob['D']][col] + part_N * matrix->weights[aa_atob['N']][col]); matrix->weights[aa_atob['Z']][col] = @@ -589,7 +592,7 @@ /*========================================================================= Adds negative minval to give all positive matrix, - then multiplies by 99/maxval to give scores ranging from 0 to 99 + then multiplies by 99/maxval to give scores ranging from 0 to 99 NOTE: Not 0 to 100 because "output_matrix" routine might not leave enough space. ===========================================================================*/ @@ -609,7 +612,7 @@ if (pssm->value[pos][aa] > maxval) maxval = pssm->value[pos][aa]; } } - + if (minval < 0.0) { factor = 99.0 / (maxval - minval); } @@ -638,7 +641,7 @@ { int aa; -/* old version. changed July 2, 2001 to make it linux compatible +/* old version. changed July 2, 2001 to make it linux compatible assert (oldfreqs[MATRIX_AA_WIDTH] != NULL); assert (newfreqs[MATRIX_AA_WIDTH] != NULL); */ @@ -653,7 +656,7 @@ /* pseudocounts from Altschul's 1997 NAR gapped blast and PSI-BLAST paper */ -void +void psiblast_alts (Block* block, Matrix* matrix, double* freqs, struct float_qij* qij) { double beta; @@ -703,14 +706,14 @@ free (pssm->value); free(pssm->sum); free(pssm); -} +} -int +int find_max_aa_in_col (struct working* col) { - int aa, max_aa; + int aa, max_aa = 0; double max = 0.0; - + for (aa = 1; aa < AAS; aa++) { if (col->cnt[aa] > max) { max = col->cnt[aa]; @@ -724,7 +727,7 @@ int find_max_aa_in_pssm (struct work_pssm* pssm , int pos) { - int aa, max_aa; + int aa, max_aa = 0; double max; max = 0.0; @@ -736,7 +739,7 @@ } return max_aa; } - + /*========================================================================== Uses Altschul's method of getting pseudo-counts with a qij matrix, ===========================================================================*/ @@ -756,12 +759,12 @@ int original_aa; struct dirichlet* diric; int min_aa, max_aa; - double min_freq, no_of_gaps; + double min_freq = 0, no_of_gaps; double cutoff_freq[MATRIX_AA_WIDTH]; int rank_matrix[AAS][AAS]; double number_of_observed_aas; int diri_pseudo; - char* blimps_dir; + const char* blimps_dir; char diri_file[LARGE_BUFF_LENGTH]; if (scale == 30) { @@ -774,15 +777,15 @@ col = make_col(); pssm = PN_make_pssm(block->width); - blimps_dir = getenv ("BLIMPS_DIR"); - printf ("blimps dir is %s\n", blimps_dir); - sprintf (diri_file , "%s/docs/default.diri", blimps_dir); - printf ("diri file is %s\n", diri_file); + blimps_dir = getblimpsdir(); + fprintf( stderr, "blimps dir is %s\n", blimps_dir); + sprintf (diri_file , "%s/docs/default.diri", blimps_dir); + fprintf( stderr, "diri file is %s\n", diri_file); /* diric = load_diri_file ("/howard2/pauline/src/merge-opt.13compnew"); */ diric = load_diri_file (diri_file); construct_rank_matrix ( rank_matrix); -printf ("diri file loaded\n"); +fprintf( stderr, "diri file loaded\n"); /*-------------- Do one position at a time -------------------*/ for (pos = 0; pos < block->width; pos++) { @@ -796,8 +799,8 @@ if (itemp ==1 ) {epsilon = 0; } else { original_aa = block->residues[0][pos]; - dtemp = similarity_dependent_scale_0 ( col, rank_matrix, original_aa); - epsilon = exp (dtemp); + dtemp = similarity_dependent_scale_0 ( col, rank_matrix, original_aa); + epsilon = exp (dtemp); } } else if (scale == 20) { @@ -813,7 +816,7 @@ /*---------- get the pseudo counts -------------------------------*/ if (scale == 27 || scale == 29 || scale == 30 || scale == 26) { diri_pseudo = TRUE; - pseudo_diric (col, diric, epsilon ); + pseudo_diric (col, diric, epsilon ); } else { pseudo_alts(col, qij, epsilon); } @@ -832,8 +835,8 @@ /* Count+pseudo proportions; returned if scale == 20 */ if (scale <= 20 || scale == 22 || scale == 25 || scale == 23 - || scale == 26) - + || scale == 26) + { pssm->value[pos][aa] = col->cnt[aa] + col->reg[aa]; if ( (col->totcnt + col->totreg) > 0.0) @@ -890,7 +893,7 @@ /* by subtracting the min. value for all positions ----- */ if (!scale) positive_matrix(freqs, pssm, matrix); -printf ("left make _alts\n"); +fprintf( stderr, "left make _alts\n"); free (diric); free(col); free_struct_work_pssm (pssm ); @@ -903,7 +906,7 @@ option: diri_pseudocounts TRUE: use 13-component Dirichlet FALSE use BLOSUM62 qij's -option: gap: TRUE -allow everything +option: gap: TRUE -allow everything FALSE - just look at 20 amino acids option: exp (m) m=0 # of amino acids at pos (default) @@ -919,9 +922,9 @@ Matrix* pssm; pssm = copy_block_info_to_matrix (block); - + if (Qij == NULL) { - printf ("qij matrix missng\n"); + fprintf( stderr, "qij matrix missing\n"); exit (-1); } normalize (block); @@ -963,13 +966,13 @@ for (aa = 1; aa < AAS; aa++) { printf ("pos %d aa %c counts %.2f weight %.2f\n", pos, aa_btoa[aa], col->cnt[aa], col->totcnt); - } + } } } -static void SIFT_alts(Block* block, Matrix* matrix, double* freqs, - struct float_qij* qij, - int diri_pseudocounts, int gap_option, +static void SIFT_alts(Block* block, Matrix* matrix, double* freqs, + struct float_qij* qij, + int diri_pseudocounts, int gap_option, int exp_option, int subtract_threshold) { double dtemp, dtemp2, epsilon; @@ -984,7 +987,7 @@ int div_by_max; int rank_matrix[AAS][AAS]; FILE* rfp; - char* blimps_dir; + const char* blimps_dir; char diri_file[LARGE_BUFF_LENGTH]; div_by_max = TRUE; @@ -992,25 +995,25 @@ col = make_col(); pssm = PN_make_pssm(block->width); - blimps_dir = getenv ("BLIMPS_DIR"); + blimps_dir = getblimpsdir(); sprintf (diri_file , "%s/docs/default.diri", blimps_dir); diric = load_diri_file (diri_file); if (exp_option == 1) { - - construct_rank_matrix ( rank_matrix); + + construct_rank_matrix ( rank_matrix); } /*-------------- Do one position at a time -------------------*/ - for (pos = 0; pos < block->width; pos++) + for (pos = 0; pos < block->width; pos++) { /*-------- count the number of each aa in this position ------------*/ if (gap_option) { counts (block,col, pos); } else { - counts_no_gaps(block, col, pos); - } + counts_no_gaps(block, col, pos); + } /*-------- determine total number of pseudo-counts in column ------*/ epsilon = 0; @@ -1020,24 +1023,24 @@ if (exp_option == 1) { /*printf ("pos %d : ", pos); */ dtemp = similarity_dependent_scale_0(col, rank_matrix, original_aa); - if (itemp == 1) { - epsilon = 0; - } else { + if (itemp == 1) { + epsilon = 0; + } else { epsilon = exp (dtemp); } - } else { + } else { if (itemp == 1) { epsilon = 0; } else if (itemp > 7) { epsilon = 1000; } else { epsilon = exp( (double) itemp) ; } - - } - + + } + /*---------- get the pseudo counts -------------------------------*/ if (diri_pseudocounts) { - pseudo_diric (col, diric, epsilon ); + pseudo_diric (col, diric, epsilon ); } else { pseudo_alts(col, qij, epsilon); - } + } /*--------- Fill in the matrix entries --------------------*/ pssm->sum[pos] = 0.0; @@ -1058,11 +1061,11 @@ for (aa = 1; aa < AAS; aa++) { pssm->value[pos][aa] /= min_freq; } - } + } if (subtract_threshold) { for (aa = 1; aa < AAS; aa++) { - pssm->value[pos][aa] -= TOLERANCE_PROB_THRESHOLD; + pssm->value[pos][aa] -= TOLERANCE_PROB_THRESHOLD; } } @@ -1078,7 +1081,7 @@ if ( (matrix->weights[original_aa][pos] < TOLERANCE_PROB_THRESHOLD && (!subtract_threshold)) || - (matrix->weights[original_aa][pos] < 0.0 && subtract_threshold) + (matrix->weights[original_aa][pos] < 0.0 && subtract_threshold) ) { printf ("WARNING!!! Amino acid %c at pos %d in your original sequence had score %.3f and was not allowed by the prediction.
\n", aa_btoa[original_aa], pos + 1, matrix->weights[original_aa][pos]); } @@ -1111,7 +1114,7 @@ int div_by_max; int rank_matrix[AAS][AAS]; FILE* rfp; - char* blimps_dir; + const char* blimps_dir; char diri_file[LARGE_BUFF_LENGTH]; char matrix_file[LARGE_BUFF_LENGTH]; @@ -1121,7 +1124,7 @@ col = make_col(); pssm = PN_make_pssm(block->width); diri_pseudocounts = TRUE; - blimps_dir = getenv ("BLIMPS_DIR"); + blimps_dir = getblimpsdir(); sprintf (diri_file , "%s/docs/default.diri", blimps_dir); @@ -1132,8 +1135,8 @@ construct_rank_matrix ( rank_matrix); } - sprintf (matrix_file, "%s/docs/default.qij"); - init_frq_qij_for_matrix (matrix_file); + sprintf (matrix_file, "%s/docs/default.qij", blimps_dir ); + init_frq_qij_for_matrix (matrix_file); /*-------------- Do one position at a time -------------------*/ for (pos = 0; pos < block->width; pos++) @@ -1262,7 +1265,7 @@ } -void +void ratio_mutated_to_normal (struct working* col, int standard_aa) { int aa; @@ -1280,7 +1283,7 @@ { int gaps; int seq; - + assert (matrix->block != NULL); /* printf ("entering gap_positive scores\n"); */ gaps = 0; @@ -1292,7 +1295,7 @@ if (gaps == 0) { /* printf ("no gaps\n"); */return; } else { /* printf ("gaps %d\n", gaps) ; */ assign_positive_scores_to_column (matrix, pos, gaps); } - + } /* end of assign_gap_positive_scores */ void @@ -1305,7 +1308,7 @@ for (aa = 1; aa < AAS; aa++) { matrix->weights[aa][pos] = 0.05; } -} /* end of positive_column */ +} /* end of positive_column */ /*========================================================================= Normalize sequence weights to add up to the number of sequences @@ -1337,11 +1340,11 @@ } /* check to see if the block has sequence weights */ -printf ("enter normalize block\n"); +fprintf( stderr, "enter normalize block\n"); normalize(block); /* Make weights sum to number of sequences */ -printf ("enter PN_make_alts\n"); - PN_make_alts(block, matrix, frequency, Qij, RTot, scale); -printf ("exit PN_make_alts\n"); +fprintf( stderr, "enter PN_make_alts\n"); + PN_make_alts(block, matrix, frequency, Qij, RTot, scale); +fprintf( stderr, "exit PN_make_alts\n"); } /* @@ -1383,7 +1386,7 @@ for (clust=0; clustnum_clusters; clust++) { weight = 1.0/(double)block->clusters[clust].num_sequences; /* fill each sequence in the cluster */ - for (seq=seq_num; seqclusters[clust].num_sequences; + for (seq=seq_num; seqclusters[clust].num_sequences; seq++) { seq_weight[seq] = weight; } @@ -1402,7 +1405,7 @@ * Return codes: a pointer to the sequence weights array. * Error codes: NULL if all of the weights in the block are <= zero. */ - + static double *pre_weighted_sequences(block) Block* block; { @@ -1422,9 +1425,9 @@ CheckMem( seq_weight = (double *) calloc(block->num_sequences, sizeof(double)) ); - + num_seqs = block->num_sequences; - + /* fill each sequence weight from the block */ max_weight = 0.0; for (seq=0; seqweights[aa_atob['B']][col] = + matrix->weights[aa_atob['B']][col] = (part_D * matrix->weights[aa_atob['D']][col] + part_N * matrix->weights[aa_atob['N']][col]); matrix->weights[aa_atob['Z']][col] = @@ -1637,7 +1640,7 @@ int pos; seq = (Sequence *) malloc (sizeof (Sequence)); - seq->sequence = (Residue *) calloc (matrix->width, sizeof (Residue) ); + seq->sequence = (Residue *) calloc (matrix->width, sizeof (Residue) ); strcpy (seq->name, "CONSENSUS"); seq->info[0] = '\0'; seq->position = 0; @@ -1647,7 +1650,7 @@ for (pos = 0; pos < matrix->width; pos++) { res = find_max_aa_in_pos (matrix, pos); seq->sequence[pos] = res; - printf ("res %c pos %d\n", aa_btoa[seq->sequence[pos]], pos); + //fprintf( stderr, "res %c pos %d\n", aa_btoa[seq->sequence[pos]], pos); } return seq; @@ -1709,7 +1712,7 @@ { int i; double index; - + index = 0; @@ -1721,7 +1724,7 @@ } -/* returns a state , given a cumulative probability vector P with +/* returns a state , given a cumulative probability vector P with MATRIX_AA_WIDTH states ranging from 0 to 1 */ int random_pick (double* P) @@ -1732,13 +1735,13 @@ /* printf ("random pick: "); */ random = (double) rand() / (double) RAND_MAX * 100 ; while (random > P[i] && i < MATRIX_AA_WIDTH) { -/* printf ("| %.3f, %c |", P[i], aa_btoa[i]); */ +/* printf ("| %.3f, %c |", P[i], aa_btoa[i]); */ i++; } /* printf ("picked %c with rand %.3f prob %.3f\n", aa_btoa[i], random, P[i]); */ if (i == MATRIX_AA_WIDTH) { /* no aa in this column, return an X */ return (aa_atob['X']); - } else { + } else { return (i); } } @@ -1924,21 +1927,21 @@ /*=====================================================================*/ /* This routine initializes the blimps global variables - Qij, RTot and frequency[] + Qij, RTot and frequency[] Input Parameters .qij file .out file (Jorja's format, to get frq) */ -void init_frq_qij_for_matrix(char qijname[LARGE_BUFF_LENGTH]) +void init_frq_qij_for_matrix(char qijname[LARGE_BUFF_LENGTH]) { FILE *fqij; - + if (Qij != NULL) { free (Qij); Qij = NULL; } if ( (fqij=fopen(qijname, "r")) != NULL) { Qij = load_qij(fqij); fclose(fqij); } - else {printf ("could not open %s qij file\n", qijname); } + else {fprintf( stderr, "could not open %s qij file\n", qijname); } RTot = LOCAL_QIJ_RTOT; } /* end of frq_qij */ @@ -1995,7 +1998,7 @@ { col->probj[j] = log(diric->q[j]) + col->probn[j] - denom; /* printf("j=%d probn[j]=%f probj[j]=%f\n", - j, exp(col->probn[j]), exp(col->probj[j])); */ + j, exp(col->probn[j]), exp(col->probj[j])); */ } @@ -2004,31 +2007,31 @@ for (aa = 1; aa < AAS; aa++) { for (j = 0; j < diric->ncomp; j++) { - col->reg[aa] += + col->reg[aa] += (exp(col->probj[j]) * diric->alpha[j][aa]); - /* * epsilon); */ + /* * epsilon); */ } col->totreg += col->reg[aa]; } /* printf (" total prob. %.2f\n", col->totreg); */ -/* scale dirichlet to probabilities */ +/* scale dirichlet to probabilities */ total = 0.0; for (aa = 1; aa < AAS; aa ++) { col->reg[aa] /= col->totreg; total += col->reg[aa]; } /* printf ("%.2f total\n", total); */ - + /* printf ("%c %.3f", aa_btoa[aa], col->reg[aa]); - if (col->cnt[aa] > 0.0) {printf ("*\n"); } else { printf ("\n");} */ + if (col->cnt[aa] > 0.0) {printf ("*\n"); } else { printf ("\n");} */ } /* end of pseudo_diric */ /* aa observed in block that has small pseudocount */ -int +int find_min_aa_in_col_reg (struct working* col) { - int aa, min_aa; + int aa, min_aa = 0; double min; min = 10000; @@ -2042,10 +2045,10 @@ return min_aa; } -char +char find_min_aa_in_col (struct working* col) { - int aa, min_aa; + int aa, min_aa = 0; double min; min = 1000; @@ -2073,7 +2076,7 @@ struct dirichlet *diric; double denom; double background_frequency[21]; - printf ("filename is %s\n", filename); + fprintf( stderr, "filename is %s\n", filename); if ((fin = fopen (filename, "r")) == NULL) { fprintf (errorfp, "Cannot open dirichlet file %s\n", filename); exit (-1); @@ -2124,7 +2127,7 @@ for (aa = 1; aa < AAS; aa++) { denom = 0.0; for (i = 0; i < numc; i++) { - denom += (diric->q[i] * diric->alpha[i][aa] / + denom += (diric->q[i] * diric->alpha[i][aa] / diric->altot[i]); } background_frequency[aa] = denom; @@ -2132,7 +2135,7 @@ for (i =0; i < numc; i++) { /* printf ("Component %d ratio of aa relative to background \n", i); */ for (aa = 1; aa < AAS; aa++) { - diric->frequency_to_background_ratio[i][aa] = + diric->frequency_to_background_ratio[i][aa] = diric->alpha[i][aa]/(diric->altot[i] * background_frequency[aa]); } } @@ -2142,7 +2145,7 @@ -void +void cutoff_target_freq (double* cutoff_freq, int original_aa, struct float_qij *qij) { @@ -2160,8 +2163,9 @@ void init_frq_qij() { FILE *fqij; - char *blimps_dir, qijname[80], frqname[80]; - blimps_dir = getenv ("BLIMPS_DIR"); + const char *blimps_dir; + char qijname[80], frqname[80]; + blimps_dir = getblimpsdir(); /*"/howard/servers/blimps/blimps";*/ /* getenv("BLIMPS_DIR"); */ if (blimps_dir != NULL) { @@ -2177,6 +2181,13 @@ Qij = NULL; if ( (fqij=fopen(qijname, "r")) != NULL) { Qij = load_qij(fqij); fclose(fqij); } + else + { + fprintf (stderr, "Can't open file '%s': %s.\n", qijname, strerror(errno) ); + fprintf (stderr, "Current dir:\n", qijname ); + system( "pwd" ); + exit (-1); + } RTot = LOCAL_QIJ_RTOT; } /* end of frq_qij */ @@ -2195,7 +2206,7 @@ original_aa = max_aa; /* change to max aa 10/24/00 */ n = 0; - sum = 0.0; + sum = 0.0; for (aa = 1; aa <= 20 ; aa++) { if (col->cnt[aa] > 0.0) { n++; @@ -2217,20 +2228,20 @@ struct float_qij* rank_Qij; FILE* qijfp; int aa; - char* blimps_dir; + const char* blimps_dir; char filename[80]; - blimps_dir = getenv ("BLIMPS_DIR"); + blimps_dir = getblimpsdir(); sprintf (filename, "%s/docs/default.sij", blimps_dir); - + if ( (qijfp = fopen (filename, "r")) == NULL) { fprintf (stderr, "Can't opeen blosum62.sij file in construct_rank_matrix looked in %s\n", blimps_dir); exit (-1); } - + rank_Qij = load_qij (qijfp); - fclose (qijfp); + fclose (qijfp); for (original_aa = 0; original_aa < AAS; original_aa++) { copy_values_to_ranklist (aalist, original_aa, rank_Qij); @@ -2248,14 +2259,14 @@ for (rank = 0; rank < AAS; rank++) { aa = aalist[rank].aa; - rank_matrix[original_aa][aa] = rank + 1; + rank_matrix[original_aa][aa] = rank + 1; /* otherwise rank starts from 0 to AAS-1 rather than 1 to AAS */ } } void -copy_values_to_ranklist (struct rank_cell aalist[AAS], int original_aa, +copy_values_to_ranklist (struct rank_cell aalist[AAS], int original_aa, struct float_qij* rankQij) { int aa; @@ -2263,8 +2274,8 @@ /* 01/03/00 rankQij has -1 for gap, which will mess up ranks. Assign it a really negative value so it will have the lowest rank */ - - aalist[0].aa = 0; + + aalist[0].aa = 0; aalist[0].value = -50; for (aa = 1; aa < AAS; aa++) { @@ -2332,7 +2343,7 @@ count = 0; for (seq = 0; seq < block->num_sequences; seq++) { - if (block->residues[seq][pos] >= 1 && + if (block->residues[seq][pos] >= 1 && block->residues[seq][pos] <=20) { count++; } @@ -2341,7 +2352,7 @@ } /* end of number_of_real_aminoacids */ double* -calculate_basic_aa_fraction (Block* block) +calculate_basic_aa_fraction (Block* block) { double* fraction_stored; double percent_obsaa_div_seq; @@ -2371,15 +2382,15 @@ } void -free_struct_work_pssm (struct work_pssm* pssm) +free_struct_work_pssm (struct work_pssm* pssm) { free (pssm->value[0]); /* holy cow, Ithink this works. This points to space allocated in double pointer*/ free (pssm->value); /* this points to an array of block length . - each cell points to values allocated by double pointer*/ + each cell points to values allocated by double pointer*/ free (pssm->sum); free (pssm); - pssm = NULL; + pssm = NULL; } --- a/src/Protdist.c +++ b/src/Protdist.c @@ -124,7 +124,7 @@ { double max_dist; int i; - int index_of_max_seq; + int index_of_max_seq = 0; max_dist = 0; --- a/src/Psiblast.c +++ b/src/Psiblast.c @@ -1,6 +1,6 @@ /* (C) Copyright 2000, Fred Hutchinson Cancer Research Center */ /* Use, modification or distribution of these programs is subject to */ -/* the terms of the non-commercial licensing agreement in license.h. */ +/* the terms of the non-commercial licensing agreement in license.h. */ #ifndef _PSIBLAST_C_ #define _PSIBLAST_C_ @@ -9,6 +9,7 @@ #include "Alignment.c" /* for the function read_psiblast_header_until_first*/ #include "PN_convert.c" #include "stringhash.c" +#include void process_sequence_line(Sequence* seq, char* buff); @@ -18,7 +19,7 @@ /* passing first line in entry that begins with >seq_id in Buffer */ Aligned_Pair* alignment, *new_alignment; Aligned_Pair * cursor; - + alignment = initialize_Aligned_Pair (Buffer); cursor = alignment; @@ -33,14 +34,14 @@ new_alignment =initialize_Aligned_Pair (Buffer); cursor->next = new_alignment; cursor = new_alignment; - reading_alignment_at_score_line (cursor, Buffer, fp); + reading_alignment_at_score_line (cursor, Buffer, fp); } return (alignment); } /* end of read_psiblast entry*/ Aligned_Pair* -initialize_Aligned_Pair (char Buffer[LARGE_BUFF_LENGTH]) +initialize_Aligned_Pair (char Buffer[LARGE_BUFF_LENGTH]) { Aligned_Pair* alignment; char* stringptr; @@ -68,7 +69,7 @@ } /* end of initialize_Aligned_Pair*/ void -reading_alignment_at_score_line (Aligned_Pair* alignment, +reading_alignment_at_score_line (Aligned_Pair* alignment, char Buffer[LARGE_BUFF_LENGTH], FILE* fp) { char* buff; @@ -165,7 +166,7 @@ /* Note: The buff may have characters that are not residues, this is */ /* ok, there will just be a little extra space */ while ((saved_length + estimated_length) > seq->max_length) { - resize_sequence(seq); + resize_sequence(seq); } /* temporarily increase the room to put the sequence into */ @@ -192,23 +193,23 @@ free (alignment); } alignment = NULL; - + } -void block_to_checkpoint_file (Block* block, FILE* fp, FILE* outfp) +void block_to_checkpoint_file (Block* block, FILE* fp, FILE* outfp) { Matrix* pssm; - long ltemp; + int32_t ltemp; int pos; char ctemp[12]; int aa; double dtemp; -printf ("in block to checkpoint file\n"); +fprintf( stderr, "in block to checkpoint file\n"); pssm = PN_block_to_matrix (block, 20); -/* print_matrix (pssm);*/ -printf ("after conversion\n"); - ltemp = (long) block->width; - fwrite (<emp, sizeof (long), 1, fp); +/* print_matrix (pssm);*/ +fprintf( stderr, "after conversion\n"); + ltemp = (int32_t) block->width; + fwrite (<emp, sizeof(ltemp), 1, fp); for (pos = 0; pos < block->width; pos++) { ctemp[0] = toupper (aa_btoa[block->sequences[0].sequence[pos]]); @@ -222,24 +223,24 @@ } } free_matrix (pssm); -printf ("finished converting to checkpointsuccessful\n"); - +fprintf( stderr, "finished converting to checkpoint successful\n"); + } /* end block_to_checkpoint_file */ void psiblast_system_call (char chkpoint_filename[], char database[], char result_file[], char query_seq_file[], FILE* outfp) { + int ret; char command_line[LARGE_BUFF_LENGTH * 4]; - char* ncbi_dir; - - ncbi_dir = getenv ("NCBI"); - sprintf (command_line, "%s/blastpgp -d %s -i %s -o %s -m 6 -R %s\n", - ncbi_dir, database, query_seq_file, result_file, + sprintf (command_line, "blastpgp -d %s -i %s -o %s -m 6 -R %s", + database, query_seq_file, result_file, chkpoint_filename); - system (command_line); + fprintf( stderr, "%s:%d About to call '%s'\n", __FILE__, __LINE__, command_line ); + ret = system (command_line); + if( ret ) { fprintf( stderr, "%s:%d Error calling '%s'!\n", __FILE__, __LINE__, command_line ); exit(-1); } } void @@ -247,29 +248,27 @@ { char command_line[LARGE_BUFF_LENGTH]; - sprintf (command_line, "rm -f %s\n", filename); - system ("unalias rm"); + sprintf (command_line, "\\rm -f %s\n", filename); + //system ("unalias rm"); system (command_line); } -void +void formatdb_system_call (char database[]) { char command_line[LARGE_BUFF_LENGTH *2]; - char* ncbi_dir; - ncbi_dir = getenv ("NCBI"); - sprintf (command_line, "%s/formatdb -i %s -o T -p T\n", ncbi_dir, database); - printf ("formatting database command_line %s\n", command_line); + sprintf (command_line, "formatdb -i %s -o T -p T\n", database); + fprintf( stderr, "formatting database command_line %s\n", command_line); system (command_line); - printf ("finished formatting database\n"); + fprintf( stderr, "finished formatting database\n"); } /* end formatdb_system_call */ /* opens psiblast result file and reads the top-scoring matches. The top match that isn't already in the sequence name hash is returned in next_seq_to_add. */ void -get_top_seq (char next_seq_to_add[], HashTable namehash, +get_top_seq (char next_seq_to_add[], HashTable namehash, char psiblastres_file[]) { FILE* psiblastfp; @@ -286,7 +285,7 @@ if (read_psiblast_header_until_first_no_error (psiblastfp, FALSE) == -1) { - next_seq_to_add[0] = '\0'; + next_seq_to_add[0] = '\0'; } else { fgets (line, LINE_LEN, psiblastfp); @@ -295,24 +294,24 @@ assert ( strlen(strptr) < KEY_WIDTH); strncpy (name, strptr, KEY_WIDTH); name[KEY_WIDTH-1] = '\0'; - } + } while (Exists(name, namehash)) { - fgets (line, LINE_LEN, psiblastfp); + fgets (line, LINE_LEN, psiblastfp); strptr = strtok(line, " \t\n"); if (strptr == NULL) { - printf ("entered no line\n"); + fprintf( stderr, "entered no line\n"); name[0] = '\0'; } else { strncpy (name, strptr, KEY_WIDTH); /* strptr[strlen(strptr)] = '\0' ; */ - name[KEY_WIDTH-1] = '\0'; + name[KEY_WIDTH-1] = '\0'; } } strncpy (next_seq_to_add, name, KEY_WIDTH); next_seq_to_add[KEY_WIDTH -1] = '\0'; fclose (psiblastfp); - + } /* end get_top_seq */ #endif --- a/src/psiblast_res_to_fasta_dbpairwise.c +++ b/src/psiblast_res_to_fasta_dbpairwise.c @@ -32,8 +32,8 @@ #include "Protdist.c" #include "Psiblast.c" -#define MAXSEQ 400 /* Maximum number of sequences */ -#define MIN_SEQ 5 +#define MAXSEQ 20000 /* Maximum number of sequences */ +#define MIN_SEQ 2 #define LINE_LEN 800 #define FALSE 0 #define TRUE 1 @@ -113,17 +113,17 @@ free_seqs (alignedseqs, nseqs); if (nseqs < MIN_SEQ) { - fprintf (errorfp, "ERROR! Not enough sequences (only %d) found by the PSI-BLAST search! Program terminated.\n", nseqs -1); - exit (-1); + fprintf (stderr, "Warning: few sequences (only %d, MIN_SEQ = %d) found by the PSI-BLAST search.\n", nseqs -1, MIN_SEQ); + fprintf (errorfp, "Warning: few sequences (only %d, MIN_SEQ = %d) found by the PSI-BLAST search.\n", nseqs -1, MIN_SEQ); + exit(0); } else if (nseqs > MAXSEQ) { - fprintf (errorfp, "WARNING: exceeded maximum # of alignments in processing PSIBLAST result\n"); - exit (-1); + fprintf (stderr, "ERROR: exceeded maximum number of alignments (%d > %d) in processing PSIBLAST result\n", nseqs, MAXSEQ ); + fprintf (errorfp, "ERROR: exceeded maximum number of alignments (%d > %d) in processing PSIBLAST result\n", nseqs, MAXSEQ ); + exit(-1); } fclose (errorfp); rm_file (errorfilename); exit (0); - - } /* end main */ void getargs (int argc, char* argv[], FILE** psiblastfp, @@ -147,33 +147,33 @@ else { printf ("Enter name of psiblast outfile with pairwise alignments:\n"); - gets (psiblastfilename); + fgets( psiblastfilename, LARGE_BUFF_LENGTH, stdin ); } if ((*psiblastfp = fopen (psiblastfilename, "r")) == NULL) { - printf ("cannot open file %s \n", psiblastfilename); + fprintf(stderr, "cannot open file %s \n", psiblastfilename); exit (-1); } if (argc > 2) strcpy (queryaligned_outfilename, argv[2]); else { - printf ("Enter name of output file for which the psiblast \n"); - printf ("alignment to the query will be printed out in \n"); - printf ("fasta format\n"); - gets (queryaligned_outfilename); + fprintf( stderr, "Enter name of output file for which the psiblast \n"); + fprintf( stderr, "alignment to the query will be printed out in \n"); + fprintf( stderr, "fasta format\n"); + fgets( queryaligned_outfilename, LARGE_BUFF_LENGTH, stdin ); } if ((*outfpqueryalign = fopen (queryaligned_outfilename, "w")) == NULL) { - printf ("cannot open file %s\n", queryaligned_outfilename); + fprintf( stderr, "cannot open file %s\n", queryaligned_outfilename); exit (-1); } strcpy (errorfilename, queryaligned_outfilename); strcat (errorfilename, ".error"); if ((errorfp = fopen (errorfilename, "w")) == NULL) { - printf ("couldn't open file %s\n", errorfilename); + fprintf (stderr, "couldn't open file %s\n", errorfilename); exit (-1); } @@ -201,6 +201,7 @@ Sequence* seq; if ((fp = fopen (filename, "r")) == NULL) { + fprintf (stderr, "couldn't open %s in processing PSIBLAST results\n", filename); fprintf (errorfp, "couldn't open %s in processing PSIBLAST results\n", filename); exit (-1); } --- a/src/seqs_from_psiblast_res.c +++ b/src/seqs_from_psiblast_res.c @@ -23,7 +23,7 @@ #include "stringhash.c" #include "Psiblast.c" -#define MAXSEQ 400 /* Maximum number of sequences */ +#define MAXSEQ 20000 /* Maximum number of sequences */ #define LINE_LEN 800 #define FALSE 0 #define TRUE 1 @@ -154,8 +154,8 @@ if (argc > 1) strcpy (psiblastfilename, argv[1]); else { - printf ("Enter name of psiblast outfile with pairwise alignments:\n"); - gets (psiblastfilename); + fprintf( stderr, "Enter name of psiblast outfile with pairwise alignments:\n"); + fgets( psiblastfilename, LARGE_BUFF_LENGTH, stdin ); } if ((*psiblastfp = fopen (psiblastfilename, "r")) == NULL) @@ -167,9 +167,9 @@ if (argc > 2) strcpy (seqfilename, argv[2]); else { - printf ("Enter name of seq file for which the psiblast \n"); - printf ("sequences will be printed \n"); - gets (seqfilename); + fprintf( stderr, "Enter name of seq file for which the psiblast \n"); + fprintf( stderr, "sequences will be printed \n"); + fgets( seqfilename, LARGE_BUFF_LENGTH, stdin ); } if ((*seqfp = fopen (seqfilename, "r")) == NULL) --- /dev/null +++ b/src/sift_blimps.h @@ -0,0 +1,12 @@ +#ifndef SIFT_BLIMPS_H +#define SIFT_BLIMPS_H + +inline const char* getblimpsdir() +{ + const char* blimps_dir = getenv ("BLIMPS_DIR"); + if( blimps_dir == NULL ) blimps_dir = __MAKE_PREFIX__ "/share/sift/blimps"; + + return blimps_dir; +} + +#endif --- a/src/SortList.c +++ b/src/SortList.c @@ -13,7 +13,7 @@ lowest_scoring_aa (Matrix* matrix, int pos) { struct working* col; - int aa, min_aa; + int aa, min_aa = 0; double min; min = 10000; --- /dev/null +++ b/SIFT_for_submitting_fasta_seq.csh.pod @@ -0,0 +1,55 @@ +=head1 NAME + +SIFT_for_submitting_fasta_seq.csh - predict effect of an amino acid substitution on protein function + +=head1 SYNOPSIS + +SIFT_for_submitting_fasta_seq.csh [BLAST_PROCESSORS] + +=head1 DESCRIPTION + +SIFT predicts whether an amino acid substitution affects protein function based on sequence homology and the physical properties of amino acids. + +Results are stored in F<< ./.SIFTprediction >>. + +This program is used for FASTA input and is part of the SIFT suite. + +=head1 OPTIONS + +=over + +=item + +Protein sequence in fasta format. + +=item + +Protein database to search. These sequences are assumed to be functional. + +=item + +File of substitutions to be predicted. See /usr/share/doc/sift/examples/lacI.subst for an example of the format. If you give '-', scores for all mutations of the entire protein sequence are printed. + +=item [BLAST_PROCESSORS] + +Number of processors/cores to use when running blast via the B<-a> argument. + +=back + +=head1 EXAMPLES + + SIFT_for_submitting_fasta_seq.csh /usr/share/doc/sift/examples/lacI.fasta [BLAST_DB] /usr/share/doc/sift/examples/lacI.subst + +=head1 AUTHOR + +Pauline Ng + +=head1 COPYRIGHT AND LICENSE + +(C) Copyright 1993-2001, Fred Hutchinson Cancer Research Center + +Noncommercial license. See /usr/share/doc/sift/copyright for details. + +=head1 SEE ALSO + +L --- a/bin/Classify_SNPs_Indels.pl +++ b/bin/Classify_SNPs_Indels.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl $SIFT_HOME = $ENV{'SIFT_HOME'}; $bin = "$SIFT_HOME/bin"; --- a/bin/SIFT_intersect_cds.pl +++ b/bin/SIFT_intersect_cds.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl $| = 1; use Getopt::Std; use File::Basename; --- a/bin/SNPClassifier/Classify_SNPs.pl +++ b/bin/SNPClassifier/Classify_SNPs.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl ############################################################################### # # --- a/bin/SNPClassifier/Extract_Coding_Info.pl +++ b/bin/SNPClassifier/Extract_Coding_Info.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl ############################################################################### # # --- a/bin/detect_indel.pl +++ b/bin/detect_indel.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl -w +#!/usr/bin/perl -w use strict; # This program is licensed to you under the Fred --- a/bin/detect_repeat.pl +++ b/bin/detect_repeat.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl # This program is licensed to you under the Fred # Hutchinos Cancer Research Center (FHCRC) --- a/bin/get_sequences.pl +++ b/bin/get_sequences.pl @@ -1,4 +1,4 @@ -#! /opt/local/bin/perl5 -w +#!/usr/bin/perl -w ##################################################################### # --- a/bin/indelfile_to_gff.pl +++ b/bin/indelfile_to_gff.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl # This program is licensed to you under the Fred # Hutchinos Cancer Research Center (FHCRC) # NONCOMMERICAL LICENSE. A copy of the license may be found at --- a/bin/map_coords_to_bin.pl +++ b/bin/map_coords_to_bin.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl # This program is licensed to you under the Fred # Hutchinos Cancer Research Center (FHCRC) --- a/bin/map_coords_to_bin_indels.pl +++ b/bin/map_coords_to_bin_indels.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl # This program is licensed to you under the Fred --- a/bin/model_transcript.pl +++ b/bin/model_transcript.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl -w +#!/usr/bin/perl -w use strict; use DBI; my $SIFT_HOME = $ENV{'SIFT_HOME'}; --- a/bin/reformat_chrfile.pl +++ b/bin/reformat_chrfile.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl # This program is licensed to you under the Fred # Hutchinos Cancer Research Center (FHCRC) --- a/bin/sift_feed_to_chr_coords_batch.pl +++ b/bin/sift_feed_to_chr_coords_batch.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl my $SIFT_HOME = $ENV{'SIFT_HOME'}; my $bin = "$SIFT_HOME/bin"; --- a/bin/snv_db_engine.pl +++ b/bin/snv_db_engine.pl @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/perl use Class::Struct; # @@ -17,7 +17,8 @@ system("umask 006"); my $bin = "$SIFT_HOME/bin"; $snp_classifier = "$bin/Classify_SNPs.pl"; -require 'SIFT_subroutines.pm'; +use lib '/usr/lib/sift/bin'; +use SIFT_subroutines; my $tmp = $ARGV[0]; my $Variation_db_dir = $ARGV[1]; my $master_pid = $ARGV[2]; --- a/src/Clumping.c +++ b/src/Clumping.c @@ -183,7 +183,7 @@ printf ("Clumped %d sequences into %d clusters\n", nseq, iclus); /* keep this print because get # of clustersin awk statement */ free(pairs); -printf ("exiting clump\n"); +fprintf( stderr, "%s:%d exiting clump\n", __FILE__, __LINE__); return clusters_of_seqs; } /* end of cluster */ @@ -440,7 +440,7 @@ printf ("Clumped %d sequences into %d clusters\n", nseq, i2); /* keep this print because get # of clustersin awk statement */ free(pairs); -printf ("exiting clump\n"); +fprintf( stderr, "%s:%d exiting clump\n", __FILE__, __LINE__); return no_of_clumps; } /* end of cluster */ --- /dev/null +++ b/info_on_seqs.pod @@ -0,0 +1,46 @@ +=head1 NAME + +info_on_seqs - perform final step of SIFT prediction + +=head1 SYNOPSIS + +info_on_seqs + +=head1 DESCRIPTION + +You should use L instead of this program unless you know what you are doing. + +This program prepares the final SIFT prediction. In addition to the command line arguments, it expects certain files present I the FASTA_FILE. + +=head1 OPTIONS + +=over + +=item + +Protein sequence in fasta format. + +=item + +File of substitutions to be predicted. See /usr/share/doc/sift/examples/lacI.subst for an example of the format. If you give '-', scores for all mutations of the entire protein sequence are printed. + +=item + +Output file. + +=back + +=head1 AUTHOR + +Pauline Ng + +=head1 COPYRIGHT AND LICENSE + +(C) Copyright 1993-2001, Fred Hutchinson Cancer Research Center + +Noncommercial license. See /usr/share/doc/sift/copyright for details. + +=head1 SEE ALSO + +L + debian/patches/ld-as-needed0000644000000000000000000000204612034577522012773 0ustar Description: src/Makefile patch [Michael Bienia] As sift fails to link with ld --as-needed, I had to patch src/Makefile to fix it. While at it I've also added support for dpkg-buildflags to use in CFLAGS and LDFLAGS. [Laszlo Kajan] Added dpkg-buildflags to CPPFLAGS. Michael's explicit $(BIN) rule is made unnecessary by using LDLIBS instead of his LIBS, as LDLIBS get used by the implicit make rules and achieves the same result. Author: Michael Bienia Author: Laszlo Kajan --- a/src/Makefile +++ b/src/Makefile @@ -9,8 +9,10 @@ seqs_from_psiblast_res CC := gcc -CFLAGS := -I$(include-prefix)/include/blimps -O2 -D__MAKE_PREFIX__=\"$(prefix)\" $(CFLAGS) -LDFLAGS := -L$(lib-prefix)/lib -lblimps $(LDFLAGS) +CPPFLAGS := $(shell dpkg-buildflags --get CPPFLAGS) +CFLAGS := $(shell dpkg-buildflags --get CFLAGS) -Wno-unused-result -I$(include-prefix)/include/blimps -O2 -D__MAKE_PREFIX__=\"$(prefix)\" +LDFLAGS := $(shell dpkg-buildflags --get LDFLAGS) -L$(lib-prefix)/lib +LDLIBS := -lblimps -lm all: bin debian/source/0000755000000000000000000000000012034577550010476 5ustar debian/source/format0000644000000000000000000000001411722003531011666 0ustar 3.0 (quilt) debian/dirs0000644000000000000000000000001011717544051010046 0ustar usr/bin