pax_global_header 0000666 0000000 0000000 00000000064 13536643722 0014525 g ustar 00root root 0000000 0000000 52 comment=d6667e78bbde00232ff25d3b6f16964cc7639378
wtdbg2-2.5/ 0000775 0000000 0000000 00000000000 13536643722 0012564 5 ustar 00root root 0000000 0000000 wtdbg2-2.5/.gitignore 0000664 0000000 0000000 00000000026 13536643722 0014552 0 ustar 00root root 0000000 0000000 *.o
*.a
.*.swp
*.dSYM
wtdbg2-2.5/.travis.yml 0000664 0000000 0000000 00000000065 13536643722 0014676 0 ustar 00root root 0000000 0000000 language: c
compiler:
- gcc
- clang
script: make
wtdbg2-2.5/LICENSE.txt 0000664 0000000 0000000 00000104513 13536643722 0014413 0 ustar 00root root 0000000 0000000 GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc.
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.
wtdbg2-2.5/Makefile 0000664 0000000 0000000 00000003210 13536643722 0014220 0 ustar 00root root 0000000 0000000 VERSION=2.5
# Release stamp passed to the compiler as -DRELEASE.
RELEASE=20190621
CC := gcc
# Installation directory used by the `install` target.
BIN := /usr/local/bin
ifeq (0, ${MAKELEVEL})
TIMESTAMP=$(shell date)
endif
# `make DEBUG=1` builds without optimisation and defines DEBUG.
ifeq (1, ${DEBUG})
CFLAGS=-g3 -W -Wall -Wno-unused-but-set-variable -O0 -DDEBUG=1 -DVERSION="$(VERSION)" -DRELEASE="$(RELEASE)" -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -mpopcnt -msse4.2
else
CFLAGS=-g3 -W -Wall -Wno-unused-but-set-variable -O4 -DVERSION="$(VERSION)" -DRELEASE="$(RELEASE)" -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -mpopcnt -msse4.2
endif
GLIBS=-lm -lrt -lpthread -lz
# Headers shared by most programs (each header is listed once).
GENERIC_SRC=mem_share.h chararray.h sort.h list.h pgzf.h dna.h thread.h filereader.h filewriter.h bitvec.h bit2vec.h bitsvec.h hashset.h
PROGS=kbm2 wtdbg2 wtdbg-cns wtpoa-cns pgzf
# These targets do not produce files of the same name; always run them.
.PHONY: all clean clear install
all: $(PROGS)
kbm2: $(GENERIC_SRC) kbm.c kbm.h kbmpoa.h wtpoa.h tripoa.h poacns.h kswx.h ksw.h ksw.c
	$(CC) $(CFLAGS) -o $@ kbm.c ksw.c $(GLIBS)
wtdbg2: $(GENERIC_SRC) wtdbg.c wtdbg-graph.h wtdbg.h kbm.h kswx.h ksw.h ksw.c kbmpoa.h wtpoa.h tripoa.h poacns.h
	$(CC) $(CFLAGS) -o $@ wtdbg.c ksw.c $(GLIBS)
wtdbg-cns: $(GENERIC_SRC) wtdbg-cns.c kswx.h ksw.h ksw.c dbgcns.h dagcns.h queue.h general_graph.h
	$(CC) $(CFLAGS) -o wtdbg-cns wtdbg-cns.c ksw.c $(GLIBS)
wtpoa-cns: $(GENERIC_SRC) wtpoa.h wtpoa-cns.c poacns.h tripoa.h ksw.h ksw.c
	$(CC) $(CFLAGS) -o $@ wtpoa-cns.c ksw.c $(GLIBS)
pgzf: mem_share.h sort.h list.h thread.h pgzf.h pgzf.c
	$(CC) $(CFLAGS) -o $@ pgzf.c $(GLIBS)
# Optional tool, not part of $(PROGS); build with `make best_sam_hits4longreads`.
best_sam_hits4longreads: $(GENERIC_SRC) best_sam_hits4longreads.c
	$(CC) $(CFLAGS) -o $@ best_sam_hits4longreads.c $(GLIBS)
# Remove intermediate files and the built programs.
clean:
	rm -f *.o *.gcda *.gcno *.gcov gmon.out $(PROGS)
# Remove intermediate files only, keeping the built programs.
clear:
	rm -f *.o *.gcda *.gcno *.gcov gmon.out
install: $(PROGS)
	mkdir -p $(BIN) && cp -fvu $(PROGS) $(BIN)
wtdbg2-2.5/README-ori.md 0000664 0000000 0000000 00000024104 13536643722 0014633 0 ustar 00root root 0000000 0000000 # NEWS
* |2018-09-21| Rename wtdbg-1.2.8 to wtdbg2
* |2018-09-19| GFA supports
```sh
wtdbg-dot2gfa.pl dbg.3.dot >dbg.3.gfa
```
* |2018-09-01| New consensus module: POACNS
```sh
wtpoa-cns -t 0 -i dbg.ctg.lay -fo dbg.ctg.poacns.fa
```
1, wtpoa-cns implements POA to generate an MSA from read fragments within an edge. It uses SSE instructions, but its speed can probably still be improved.
2, performs realignment on MSA.
3, recalibrates homopolymers, and makes a consensus sequence for an edge. Most remaining small errors come from homopolymers; more effort is needed.
4, joins edges' sequences into a contig.
Welcome to test POACNS and feedback.
# WTDBG
A fuzzy Bruijn graph (FBG) approach to long noisy reads assembly
# Introduction
A challenge in assembling long noisy reads from third generation sequencing (TGS) is reducing its requirement of computing resource, especially for large genomes.
To address this issue, I developed a novel sequence alignment algorithm and a new assembly graph for efficiently assembling large genomes using TGS data.
* Alignment
KBM: Kmer-BIN-Mapping.
KBM groups k-mers from each non-overlapped sliding 256 bp fragments in long reads into bins.
Bins of which most k-mers are high frequency, are filtered as highly repetitive ones.
Then, KBM searches synteny of matched bin pairs in sequences in a dynamic programming way.
A matched bin pair in two sequences is defined as two bins different by original but share a set of k-mers.
The result of alignments in KBM have the same features of traditional sequence alignment, excepting the unit of KBM alignments is 256 bp bin instead of single base.
* Assembly
FBG: Fuzzy Bruijn Graph.
FBG is composed of vertices in length of 1024 bp from reads, and edges connecting vertices in their order on read paths.
Compared with DBG, the vertices in FBG are much bigger, and thus won't be sensitive to small repeats.
To tolerate high sequencing errors, FBG's vertices are found using gapped sequence alignments from KBM or other aligners, comparing with searching identical k-mers in DBG.
* Why choose wtdbg in genome assembly
There are many assemblers for long noisy reads assembly, e.g. FALCON, CANU, miniasm, and SMARTdenovo (progenitor of wtdbg). If you have a genome of 10G bp or bigger in size,
wtdbg is your first or even the only option. For small but complicated genomes (< 3 G), wtdbg was often reported to yield better assembly by my friends.
Besides, KBM is easy to use when you are setting up a web-server for long reads mapping (see Example 2).
* Limitation
Max read length is 0x0003FFFFU (256 Kb), longer reads will be split.
Max number of reads is 0x03FFFFFFU (64 M). If your data volume exceeds, please filter relative shorter reads.
In KBM, max read length is 0xFFFFFFFFU (4 Gb), max number of reads is 0x0FFFFFFFU (256 M).
Max number of threads is 4096.
Cannot run in parallel across multiple nodes. But you can implement it simply using `kbm` and `wtdbg --load-alignments`
Developed and tested in Linux-GCC only.
Only accepts fasta/fastq format for input, '.gz' suffixed files will be piped by `gzip -dc`.
# Installation
```sh
git clone https://github.com/ruanjue/wtdbg2.git
cd wtdbg2
make
```
# Long reads mapping
Supposes you have `hg19.fa` as reference sequences, and `reads.fq.gz` as query sequences.
## Example 1
```sh
kbm -t 64 -d hg19.fa -i reads.fa.gz -o reads.kbmap
```
### output format
* COL1 `qry_name`
* COL2 `qry_strand`
* COL3 `qry_length`
* COL4 `qry_beg`
* COL5 `qry_end`
* COL6 `ref_name`
* COL7 `ref_strand` (always equals `+`)
* COL8 `ref_length`
* COL9 `ref_beg`
* COL10 `ref_end`
* COL11 `match_len` (length of matched k-mers)
* COL12 `align_len` (length of aligned)
* COL13 `#kcnt` (number of matched k-mers)
* COL14 `#gap` (number of gapped BINs)
* COL15 `cigar` (256 x SAM's cigar)
## Example 2
Suitable for online tools, or frequently used references
### Build KBM-INDEX
```sh
kbm -t 64 -d hg19.fa -i /dev/null -W hg19.kbmidx
```
### Set up KBM server
```sh
kbm -R hg19.kbmidx start
```
Now, hg19.kbmidx is cached in memory for further call.
### Mapping with KBM-INDEX
```sh
kbm -R hg19.kbmidx -t 64 -i reads.fa -o reads.kbmap
```
Please note that hg19.kbmidx can be used simultaneously by multiple processes on the same computer.
### Shutdown KBM server
```sh
kbm -R hg19.kbmidx stop
```
# Long reads assembling
## Quick Start
```sh
echo "manual"
run_wtdbg_assembly.sh -h
echo "generating shell script"
run_wtdbg_assembly.sh -t 0 -i reads.fa.gz -o dbg -T >run.sh
```
`run.sh` is ready to be invoked
## Play with wtdbg
```sh
wtdbg2 -h
wtdbg2 --help
```
### options
```sh
-t Number of threads, 0: all cores, [0]
-i Long reads sequences file, + *
-I Error-free sequences file, +
-o Prefix of output files, *
-f Force overwrite
-k Kmer fsize, 0 <= k <= 25, [0]
-p Kmer psize, 0 <= p <= 25, [21]
k + p <= 25, seed is +
-K Filter high frequency kmers, maybe repetitive, [1000]
if K >= 1, take the integer value as cutoff
else, mask the top fraction part high frequency kmers
-E Min kmer frequency, [2]
-F Filter low frequency kmers by a 4G-bytes array (max_occ=3 2-bits). Here, -E must greater than 1
-S Subsampling kmers, 1/(<-S>) kmers are indexed, [4]
-S is very useful in saving memeory and speeding up
please note that subsampling kmers will have less matched length
-X Max number of bin(256bp) in one gap, [4]
-Y Max number of bin(256bp) in one deviation, [4]
-x penalty for BIN gap, [-7]
-y penalty for BIN deviation, [-21]
-l Min length of alignment, [2048]
-m Min matched, [200]
-s Max length variation of two aligned fragments, [0.2]
-q Quiet
-v Verbose, +
--help Show more options
```
### For higher error rate long sequences
Decrease `-p`. Try `-p 19` or `-p 17`
Decrease `-S`. Try `-S 2` or `-S 1`
Both will increase computing time.
### For very high coverage
Increase `--edge-min`. Try `--edge-min 4`, or higher.
### For low coverage
Decrease `--edge-min`. Try `--edge-min 2 --rescue-low-cov-edges`.
### Filter reads
`--tidy-reads 5000`. Will filter out shorter sequences. If names are in the format `\/\d+_\d+$`, will select the longest subread.
### output
Suppose the prefix is `dbg`
* dbg.1.dot
DOT file for initialized graph
* dbg.1.nodes
nodes and their positions in reads
* dbg.1.reads
reads and their nodes
* dbg.2.dot
DOT file after transitive reduction
* dbg.3.dot
DOT file after merging bubble and remove tips
* dbg.alignments
KBMAP file, all vs all alignments
* dbg.binkmer
Distribution of number of k-mers in a BIN
* dbg.closed\_bins
Filtered BINs
* dbg.clps
Reads clip information.
COL1 read\_name
COL2 read\_length
COL3 keep\_offset
COL4 keep\_length
* dbg.ctg.dot
DOT file for contigs
* dbg.ctg.lay
Contigs layout file. Will be read by `wtdbg-cns`. This file is the main result of `wtdbg`
**Format**:
```
>ctg(\d+) nodes=(\d+) len=(\d+)
E
S
S
S
...
E ...
...
```
One contig contains many edges (starting with 'E'), each edge contains many regions inside reads.
Please note that one read often contains many REGs.
* dbg.events
Log file of graph simplification
* dbg.frg.dot
DOT file for unitigs
* dbg.frg.nodes
unitigs and their nodes
* dbg.kmerdep
Distribution of k-mer depth
* STDERR stream
wtdbg prints runtime information on the program's STDERR stream. Use `--quiet` to disable it
## Consensus
```sh
wtdbg-cns -t 64 -i dbg.ctg.lay -o dbg.ctg.lay.fa
```
The output file `dbg.ctg.lay.fa` is ready for further polished by `PILON` or `QUIVER`.
```sh
wtpoa-cns -t 64 -i dbg.ctg.lay -o dbg.ctg.lay.fa
```
wtpoa-cns is slower than wtdbg-cns, but offer more accurate consensus sequences.
I will update it in following development.
# Performance
## Human (3G) CHM1 PacBio P5C3 dataset, 65.5 core.hours
* Data Source
http://datasets.pacb.com/2014/Human54x/fasta.html
* Command
```sh
wtdbg2 -t 96 -i pb.fa -fo dbg --tidy-reads 5000 --edge-min 2 --rescue-low-cov-edges
```
* Contigs
`TOT 2978536704, CNT 8752, AVG 340327, MAX 11662848, N50 1925120, L50 453, N90 400128, L90 1727, Min 5120`
* Runtime
`real 6131.803 sec, user 201836.200 sec, sys 33956.790 sec, maxrss 117281672.0 kB, maxvsize 202422172.0 kB`
## Human (3G) CHM1 PacBio P6C4 dataset, 211.3 core.hours
* Data Source
http://www.ebi.ac.uk/ena/data/view/PRJNA246220
* Command
```sh
wtdbg2 -t 96 -i wt.fa -fo dbg --tidy-reads 5000 --edge-min 4 --rescue-low-cov-edges
```
* Contigs
`TOT 2964872448, CNT 1909, AVG 1553103, MAX 105310208, N50 23586816, L50 34, N90 3326976, L90 158, Min 5120`
* Runtime
`real 16806.534 sec, user 681278.770 sec, sys 79371.630 sec, maxrss 264956752.0 kB, maxvsize 443356532.0 kB`
## Axolotl (32G) PacBio dataset, 32 X, 3053 core.hours
* Command
```sh
wtdbg2 -t 96 -i ../rawdata/pacbio.fa.gz -p 21 -S 2 --aln-noskip --rescue-low-cov-edges --tidy-reads 5000 -fo axolotl
```
* Contigs
`TOT 27375160576, CNT 115355, AVG 237313, MAX 7812608, N50 606976, L50 12527, N90 144896, L90 47295, Min 5120`
* Runtime
`real 190237.591 sec, user 10994200.800 sec, sys 488715.030 sec, maxrss 1671005352.0 kB, maxvsize 2365400208.0 kB`
## Human (3G) NA12878 ONT dataset, 197.5 core.hours
* Data Source
https://github.com/nanopore-wgs-consortium/NA12878
* Command
```sh
wtdbg2 -t 64 -i NA12878-ONT.fa.gz -fo dbg -S 2 --edge-min 2 --rescue-low-cov-edges
```
* Contigs
`TOT 2827644928, CNT 19473, AVG 145209, MAX 31366400, N50 4540672, L50 162, N90 172800, L90 1111, Min 5120`
* Runtime
`real 14992.925 sec, user 649202.270 sec, sys 61638.300 sec, maxrss 256840096.0 kB, maxvsize 356668088.0 kB`
# Citation
To be published.
URL **https://github.com/ruanjue/wtdbg2/**
# Contact
Jue Ruan
Jue Ruan
wtdbg2-2.5/README.md 0000664 0000000 0000000 00000015645 13536643722 0014056 0 ustar 00root root 0000000 0000000 ## Getting Started
```sh
git clone https://github.com/ruanjue/wtdbg2
cd wtdbg2 && make
#quick start with wtdbg2.pl
./wtdbg2.pl -t 16 -x rs -g 4.6m -o dbg reads.fa.gz
# Step by step commandlines
# assemble long reads
./wtdbg2 -x rs -g 4.6m -i reads.fa.gz -t 16 -fo dbg
# derive consensus
./wtpoa-cns -t 16 -i dbg.ctg.lay.gz -fo dbg.raw.fa
# polish consensus, not necessary if you want to polish the assemblies using other tools
minimap2 -t16 -ax map-pb -r2k dbg.raw.fa reads.fa.gz | samtools sort -@4 >dbg.bam
samtools view -F0x900 dbg.bam | ./wtpoa-cns -t 16 -d dbg.raw.fa -i - -fo dbg.cns.fa
# Additional polishing using short reads
bwa mem -t 16 dbg.cns.fa sr.1.fa sr.2.fa | samtools sort -O SAM | ./wtpoa-cns -t 16 -x sam-sr -d dbg.cns.fa -i - -fo dbg.srp.fa
```
## Introduction
Wtdbg2 is a *de novo* sequence assembler for long noisy reads produced by
PacBio or Oxford Nanopore Technologies (ONT). It assembles raw reads without
error correction and then builds the consensus from intermediate assembly
output. Wtdbg2 is able to assemble the human and even the 32Gb
[Axolotl][Axolotl] genome at a speed tens of times faster than [CANU][canu] and
[FALCON][falcon] while producing contigs of comparable base accuracy.
During assembly, wtdbg2 chops reads into 1024bp segments, merges similar
segments into a vertex and connects vertices based on the segment adjacency on
reads. The resulting graph is called fuzzy Bruijn graph (FBG). It is akin to De
Bruijn graph but permits mismatches/gaps and keeps read paths when collapsing
k-mers. The use of FBG distinguishes wtdbg2 from the majority of long-read
assemblers.
## Installation
Wtdbg2 only works on 64-bit Linux. To compile, please type `make` in the source
code directory. You can then copy `wtdbg2` and `wtpoa-cns` to your `PATH`.
Wtdbg2 also comes with an approximate read mapper `kbm`, a faster but less
accurate consensus tool `wtdbg-cns` and many auxiliary scripts in the `scripts`
directory.
## Usage
Wtdbg2 has two key components: an assembler **wtdbg2** and a consenser
**wtpoa-cns**. Executable **wtdbg2** assembles raw reads and generates the
contig layout and edge sequences in a file "*prefix*.ctg.lay.gz". Executable
**wtpoa-cns** takes this file as input and produces the final consensus in
FASTA. A typical workflow looks like this:
```sh
./wtdbg2 -x rs -g 4.6m -t 16 -i reads.fa.gz -fo prefix
./wtpoa-cns -t 16 -i prefix.ctg.lay.gz -fo prefix.ctg.fa
```
where `-g` is the estimated genome size and `-x` specifies the sequencing
technology, which could take value "rs" for PacBio RSII, "sq" for PacBio
Sequel, "ccs" for PacBio CCS reads and "ont" for Oxford Nanopore. This option
sets multiple parameters and should be **applied before other parameters**.
When you are unable to get a good assembly, you may need to tune other
parameters as follows.
Wtdbg2 combines normal k-mers and homopolymer-compressed (HPC) k-mers to find
read overlaps. Option `-k` specifies the length of normal k-mers, while `-p`
specifies the length of HPC k-mers. By default, wtdbg2 samples a fourth of all
k-mers by their hashcodes. For data of relatively low coverage, you may
increase this sampling rate by reducing `-S`. This will greatly increase the
peak memory as a cost, though. Option `-e`, which defaults to 3, specifies the
minimum read coverage of an edge in the assembly graph. You may adjust this
option according to the overall sequencing depth, too. Option `-A` also helps
relatively low coverage data at the cost of performance. For PacBio data,
`-L5000` often leads to better assemblies empirically, so is recommended.
Please run `wtdbg2 --help` for a complete list of available options or consult
[README-ori.md](README-ori.md) for more help.
The following table shows various command lines and their resource usage for
the assembly step:
|Dataset |GSize |Cov |Asm options |CPU asm |CPU cns |Real tot| RAM|
|:-----------------------|-----:|-------:|:------------------|-------:|-------:|-------:|-------:|
|[E. coli][pbcr] |4.6Mb |PB x20 |-x rs -g4.6m -t16 | 53s| 8m54s| 42s| 1.0G|
|[C. elegans][ce] |100Mb |PB x80 |-x rs -g100m -t32 | 1h07m| 5h06m| 13m42s| 11.6G|
|[D. melanogaster A4][dm2]| 144m|PB x120 |-x rs -g144m -t32 | 2h06m| 5h11m| 26m17s| 19.4G|
|[D. melanogaster ISO1][dm1]|144m|ONT x32|-xont -g144m -t32 | 5h12m| 4h30m| 25m59s| 17.3G|
|[A. thaliana][at] |125Mb |PB x75 |-x sq -g125m -t32 | 11h26m| 4h57m| 49m35s| 25.7G|
|[Human NA12878][na12878]|3Gb |ONT x36 |-x ont -g3g -t31 | 793h11m| 97h46m| 31h03m| 221.8G|
|[Human NA19240][na19240]|3Gb |ONT x35 |-x ont -g3g -t31 | 935h31m| 89h17m| 35h20m| 215.0G|
|[Human HG00733][hg00733]|3Gb |PB x93 |-x sq -g3g -t47 |2114h26m| 152h24m| 52h22m| 338.1G|
|[Human NA24385][na24385]|3Gb |CCS x28 |-x ccs -g3g -t31 | 231h25m| 58h48m| 10h14m| 112.9G|
|[Human CHM1][chm1] |3Gb |PB x60 |-x rs -g3g -t96 | 105h33m| 139h24m| 5h17m| 225.1G|
|[Axolotl][axosra] |32Gb |PB x32 |-x rs -g32g -t96 |2806h40m|1456h13m| 110h16m| 1788.1G|
The timing was obtained on three local servers with different hardware
configurations. There are also run-to-run fluctuations. Exact timing on your
machines may differ. The assembled contigs can be found at the following FTP:
```txt
ftp://ftp.dfci.harvard.edu/pub/hli/wtdbg/
```
## Limitations
* For Nanopore data, wtdbg2 may produce an assembly smaller than the true
genome.
* When inputting multiple files of both fasta and fastq format, please put fastq first, then fasta.
Otherwise, program cannot find '>' in fastq, and append all fastq in one read.
## Citing wtdbg2
If you use wtdbg2, please cite:
> Ruan, J. and Li, H. (2019) Fast and accurate long-read assembly with wtdbg2. *bioRxiv*. doi:10.1101/530972
## Getting Help
Please use the [GitHub's Issues page][issue] if you have questions. You may
also directly contact Jue Ruan at ruanjue@gmail.com.
[miniasm]: https://github.com/lh3/miniasm
[canu]: https://github.com/marbl/canu
[falcon]: https://github.com/PacificBiosciences/FALCON
[Axolotl]: https://www.nature.com/articles/nature25458
[chm1]: https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP044331
[na12878]: https://github.com/nanopore-wgs-consortium/NA12878/blob/master/rel5.md
[na19240]: https://www.ebi.ac.uk/ena/data/view/PRJEB26791
[pbcr]: http://www.cbcb.umd.edu/software/PBcR/data/selfSampleData.tar.gz
[ce]: https://github.com/PacificBiosciences/DevNet/wiki/C.-elegans-data-set
[axosra]: https://www.ncbi.nlm.nih.gov/bioproject/?term=PRJNA378970
[issue]: https://github.com/ruanjue/wtdbg2/issues
[at]: https://downloads.pacbcloud.com/public/SequelData/ArabidopsisDemoData/
[dm1]: https://www.ebi.ac.uk/ena/data/view/SRR6702603
[dm2]: https://www.ebi.ac.uk/ena/data/view/SRR5439404
[hg00733]: https://www.ebi.ac.uk/ena/data/view/SRR7615963
[na24385]: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/PacBio_CCS_15kb/
wtdbg2-2.5/best_sam_hits4longreads.c 0000664 0000000 0000000 00000012322 13536643722 0017537 0 ustar 00root root 0000000 0000000
#include "mem_share.h"
#include "filereader.h"
#include "list.h"
#include "hashset.h"
/* One SAM alignment record of a read, as filled in by main(). */
typedef struct {
// offset and length of the record's text inside the shared `lines` buffer
u4i stroff, strlen;
// length of the query-name column, and the SAM FLAG value
u4i taglen, flag;
// query length summed from the CIGAR, and start/end of the aligned part on the query
u4i qlen, qb, qe;
// index of the reference among the parsed @SQ entries, and that reference's length
u4i refidx, reflen;
} lr_hit_t;
// NOTE(review): presumably declares the list type `lrhitv` together with the
// init_/push_/ref_/clear_/free_lrhitv helpers used below -- confirm in list.h.
define_list(lrhitv, lr_hit_t);
/*
 * Print the selected hits of one read to `out`, then empty `hits` and `lines`.
 *
 * Hits are sorted on aligned query length (qe - qb) and visited in that order.
 * The scan stops at the first hit that is shorter than `minlen` or that covers
 * less than `mincov` of its query length. A remaining hit is printed unless its
 * query interval overlaps an earlier-ranked hit by at least (1 - mincov) of its
 * own aligned length.
 *
 * Returns the number of records printed.
 */
int select_best_hit(lrhitv *hits, String *lines, u4i minlen, float mincov, FILE *out){
	lr_hit_t *h1, *h2;
	u4i i, j, pass;
	int x, y, ret;
	sort_array(hits->buffer, hits->size, lr_hit_t, num_cmpgt(b.qe - b.qb, a.qe - a.qb));
	ret = 0;
	for(i=0;i<hits->size;i++){
		h1 = ref_lrhitv(hits, i);
		if(h1->qe - h1->qb < minlen) break;
		if(h1->qe - h1->qb < UInt(mincov * h1->qlen)) break;
		pass = 1;
		// compare against every hit ranked before this one
		for(j=0;j<i;j++){
			h2 = ref_lrhitv(hits, j);
			// [x, y) is the intersection of the two query intervals (y < x when disjoint)
			x = num_max(h1->qb, h2->qb);
			y = num_min(h1->qe, h2->qe);
			if(y - x >= (1 - mincov) * (h1->qe - h1->qb)){
				pass = 0;
				break;
			}
		}
		if(pass){
			fprintf(out, "%s\n", lines->string + h1->stroff);
			ret ++;
		}
	}
	clear_lrhitv(hits);
	clear_string(lines);
	return ret;
}
/* Print the one-line command synopsis to stdout; always returns 1 so that
 * main() can use the result directly as its exit status. */
int usage(char *prog){
	static const char synopsis[] =
		"Usage: %s [-h] [-v] [-B:retain secondary aligment] [-l min_map_len:100] [-f min_map_cov:0.70]\n";
	printf(synopsis, prog);
	return 1;
}
/*
 * Filter SAM records so that only the best hits of each read are kept.
 *
 * Input is read through open_filereader(NULL, 1) (presumably standard input)
 * and output goes to stdout. Header lines ('@') are echoed; @SQ lines also
 * populate the reference-name -> length table. Alignment lines that share the
 * same query name are collected and handed to select_best_hit() whenever the
 * query name changes, and once more at end of input.
 *
 * Options: -l minimum aligned length, -f minimum covered fraction of the read,
 *          -B keep secondary/supplementary records, -v print per-hit
 *          coordinates as '#' lines, -h usage.
 */
int main(int argc, char **argv){
	FileReader *fr;
	FILE *out;
	cuhash *refs;
	cplist *reftags;
	u4v *reflens;
	String *lines;
	lrhitv *hits;
	lr_hit_t *hit, HIT;
	char *str, *reftag;
	u4i minlen, i, reflen;
	float mincov;
	int c, primary_hit, verbose;
	u1i movs[256];
	minlen = 100;
	mincov = 0.70;
	primary_hit = 1;
	verbose = 0;
	out = stdout;
	while((c = getopt(argc, argv, "hvBl:f:")) != -1){
		switch(c){
			case 'l': minlen = atoi(optarg); break;
			case 'f': mincov = atof(optarg); break;
			case 'B': primary_hit = 0; break;
			case 'v': verbose = 1; break;
			default: return usage(argv[0]);
		}
	}
	fr = open_filereader(NULL, 1);
	refs = init_cuhash(13);
	reftags = init_cplist(8);
	reflens = init_u4v(8);
	hits = init_lrhitv(4);
	lines = init_string(1024);
	// CIGAR operator table: bit 0 = advances on the reference, bit 1 = advances on the query
	memset(movs, 0, 256);
	movs[(int)'M'] = 0b11;
	movs[(int)'I'] = 0b10;
	movs[(int)'D'] = 0b01;
	movs[(int)'N'] = 0b01;
	movs[(int)'S'] = 0b10;
	movs[(int)'H'] = 0b10;
	movs[(int)'P'] = 0b00;
	movs[(int)'='] = 0b11;
	movs[(int)'X'] = 0b11;
	while((c = readline_filereader(fr))){
		if(fr->line->string[0] == '@'){
			fprintf(out, "%s\n", fr->line->string);
			if(fr->line->string[1] == 'S' && fr->line->string[2] == 'Q'){
				if((c = split_line_filereader(fr, '\t')) > 2){
					reftag = NULL;
					reflen = 0;
					for(i=1;i<3;i++){
						if(get_col_len(fr, i) <= 3){
							continue;
						}
						str = get_col_str(fr, i);
						if(str[0] == 'S' && str[1] == 'N' && str[2] == ':'){
							free(reftag); // drop an earlier SN: copy, if any
							reftag = strdup(str + 3);
						} else if(str[0] == 'L' && str[1] == 'N' && str[2] == ':'){
							reflen = atol(str + 3);
						}
					}
					// reftag stays NULL when the line has no SN: field (or strdup failed)
					if(reftag && reftag[0] && reflen){
						push_cplist(reftags, reftag);
						push_u4v(reflens, reflen);
						put_cuhash(refs, (cuhash_t){reftag, reftags->size - 1});
					} else {
						// not registered, so nothing else owns the copy
						free(reftag);
					}
				}
			}
		} else {
			hit = &HIT;
			str = index(fr->line->string, '\t');
			if(str == NULL){
				fprintf(stderr, "[WARNNING:too_few_column] %s\n", fr->line->string);
				continue;
			}
			hit->taglen = str - fr->line->string;
			// a different query name starts a new read: flush the previous one
			if(hits->size && (hits->buffer[0].taglen != hit->taglen || strncmp(lines->string + hits->buffer[0].stroff, fr->line->string, hit->taglen))){
				select_best_hit(hits, lines, minlen, mincov, out);
			}
			// keep the untouched record text; split_line_filereader() is applied to fr->line afterwards
			hit->stroff = lines->size;
			hit->strlen = fr->line->size;
			append_string(lines, fr->line->string, fr->line->size);
			add_char_string(lines, '\0');
			if((c = split_line_filereader(fr, '\t')) < 11){
				fprintf(stderr, "[WARNNING:too_few_columns] %s\n", lines->string + hit->stroff);
				continue;
			}
			hit->taglen = get_col_len(fr, 0);
			hit->flag = atol(get_col_str(fr, 1));
			// 0x900 = secondary (0x100) | supplementary (0x800)
			if(primary_hit && (hit->flag & 0x900)){
				continue;
			}
			if(get_col_str(fr, 2)[0] == '*'){
				continue;
			}
			hit->refidx = getval_cuhash(refs, get_col_str(fr, 2));
			if(hit->refidx == MAX_U4){
				fprintf(stderr, "[WARNNING:unknown_refname] %s\n", lines->string + hit->stroff);
				continue;
			}
			hit->reflen = reflens->buffer[hit->refidx];
			hit->qlen = 0;
			u4i tb, te, qb, qe, len, cnt, tmp;
			u4i ln;
			char op;
			tb = atoi(get_col_str(fr, 3));
			te = tb;
			qb = qe = 0;
			len = 0;
			tmp = cnt = 0;
			str = get_col_str(fr, 5); // CIGAR
			ln = 0;
			op = 0;
			while(str[0]){
				if(str[0] >= '0' && str[0] <= '9'){
					ln = ln * 10 + str[0] - '0';
				} else {
					// index as unsigned char: a byte >= 0x80 must not become a negative index
					op = movs[(unsigned char)str[0]];
					if(op & 0b01){
						te += ln;
						// query-only bases seen since the last reference-consuming op now lie inside the hit
						qe += tmp;
						tmp = 0;
						if(cnt == 0){
							// first reference-consuming op: everything before it is the unaligned prefix
							qb = qe;
							cnt = 1;
						}
						if(op & 0b10){
							qe += ln;
							len += ln;
						}
					} else if(op & 0b10){
						tmp += ln;
						len += ln;
					}
					ln = 0;
				}
				str ++;
			}
			// flag 0x10: reverse strand, so mirror the interval within the query length
			if(hit->flag & 0x10){
				tmp = len - qb;
				qb = len - qe;
				qe = tmp;
			}
			hit->qlen = len;
			hit->qb = qb;
			hit->qe = qe;
			if(verbose){
				fprintf(out, "#%s\t%u\t+\t%u\t%u\t%s\t%c\t%u\t%u\n", get_col_str(fr, 0), len, qb, qe, get_col_str(fr, 2), "+-"[(hit->flag & 0x10) >> 4], tb, te);
			}
			push_lrhitv(hits, HIT);
		}
	}
	// flush the last read
	select_best_hit(hits, lines, minlen, mincov, out);
	free_string(lines);
	free_lrhitv(hits);
	free_cuhash(refs);
	// the reference names were strdup'ed and are owned by reftags
	for(i=0;i<reftags->size;i++){
		free(reftags->buffer[i]);
	}
	free_cplist(reftags);
	free_u4v(reflens);
	close_filereader(fr);
	return 0;
}
wtdbg2-2.5/bit2vec.h 0000664 0000000 0000000 00000005136 13536643722 0014300 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __BIT2_VEC_RJ_H
#define __BIT2_VEC_RJ_H
#include
#include
#include
#include
#include "mem_share.h"
/* Growable vector of 2-bit values, packed four per byte. */
typedef struct {
// packed storage: element idx lives in bits[idx >> 2] at bit offset (idx & 3) * 2
uint8_t *bits;
// number of elements currently stored
uint64_t size;
// number of elements the allocation has room for
uint64_t cap;
} Bit2Vec;
/* Allocate an empty Bit2Vec with room for `size` elements; storage is zeroed. */
static inline Bit2Vec* init_bit2vec(uint64_t size){
	Bit2Vec *v = malloc(sizeof(Bit2Vec));
	v->bits = calloc(1, (size * 2 + 7) / 8);
	v->cap = size;
	v->size = 0;
	return v;
}
static inline size_t bit2vec_obj_desc_cnt(void *obj, int idx){
return (((Bit2Vec*)obj)->cap * 2 + 7) / 8;
idx = idx;
}
static const obj_desc_t bit2vec_obj_desc = {"bit2vec_obj_desc", sizeof(Bit2Vec), 1, {1}, {offsetof(Bit2Vec, bits)}, {(obj_desc_t*)&OBJ_DESC_DATA}, bit2vec_obj_desc_cnt, NULL};
/* Empty the vector and zero its whole storage; the capacity is kept. */
static inline void clear_bit2vec(Bit2Vec *vec){
	const uint64_t nbytes = (vec->cap * 2 + 7) / 8;
	vec->size = 0;
	memset(vec->bits, 0, nbytes);
}
/* Release the vector and its storage. Passing NULL is a no-op, like free(). */
static inline void free_bit2vec(Bit2Vec *vec){
	if(vec == NULL) return;
	free(vec->bits);
	free(vec);
}
/*
 * Make room for `n` more elements.
 *
 * Capacity doubles while below 1M elements and then grows in 1M-element steps.
 * Newly added bytes are zeroed. A vector created with capacity 0 starts
 * growing from 1; doubling 0 would never terminate.
 *
 * Returns 0 if the current capacity already suffices, 1 if the storage was
 * grown. Aborts if the reallocation fails, so callers never see a NULL buffer.
 */
static inline int encap_bit2vec(Bit2Vec *vec, uint32_t n){
	uint64_t old_cap, new_cap, old_bytes, new_bytes;
	uint8_t *bits;
	if(vec->size + n <= vec->cap) return 0;
	old_cap = vec->cap;
	new_cap = old_cap? old_cap : 1;
	while(vec->size + n > new_cap){
		if(new_cap < 1024 * 1024){
			new_cap <<= 1;
		} else {
			new_cap += 1024 * 1024;
		}
	}
	old_bytes = (old_cap * 2 + 7) / 8;
	new_bytes = (new_cap * 2 + 7) / 8;
	// go through a temporary so the old buffer is not lost if realloc fails
	bits = realloc(vec->bits, new_bytes);
	if(bits == NULL) abort();
	memset(bits + old_bytes, 0, new_bytes - old_bytes);
	vec->bits = bits;
	vec->cap = new_cap;
	return 1;
}
/* Store the low two bits of `dat` at position `idx`. No bounds check. */
static inline void set_bit2vec(Bit2Vec *vec, uint64_t idx, uint8_t dat){
	const uint64_t shift = (idx & 0x03U) << 1;
	uint8_t *slot = vec->bits + (idx >> 2);
	// clear the 2-bit field, then OR in the new value
	*slot = (*slot & (~(3U << shift))) | ((dat & 0x03) << shift);
}
/* Append the low two bits of `dat`, growing the storage when needed. */
static inline void push_bit2vec(Bit2Vec *vec, uint8_t dat){
	encap_bit2vec(vec, 1);
	set_bit2vec(vec, vec->size ++, dat);
}
/* Return the 2-bit value stored at position `idx`. No bounds check. */
static inline uint8_t get_bit2vec(Bit2Vec *vec, uint64_t idx){
	const uint64_t shift = (idx & 0x03U) << 1;
	const uint8_t byte = vec->bits[idx >> 2];
	return (byte >> shift) & 0x03;
}
/* Remove and return the last element (0..3), or -1 when the vector is empty. */
static inline int pop_bit2vec(Bit2Vec *vec){
	if(vec->size){
		vec->size --;
		return get_bit2vec(vec, vec->size);
	}
	return -1;
}
#endif
wtdbg2-2.5/bitsvec.h 0000664 0000000 0000000 00000012150 13536643722 0014373 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __BITS_VEC_RJ_H
#define __BITS_VEC_RJ_H
#include
#include
#include
#include
#include "mem_share.h"
/* Useful functions when n_bit > 8 */
/*
 * Write the low `size` bits of `val` into the bit array `dat`, starting at bit
 * position `offset`. Bit i of `val` goes to bit (offset + i) of `dat`, where
 * bit k of `dat` is bit (k & 7) of dat[k >> 3].
 *
 * Bits are taken with shifts, so the result does not depend on host byte order
 * or on the __BYTE_ORDER macro. `val` has 64 bits; positions with i >= 64 are
 * written as 0.
 */
static inline void u8byte2bits(uint64_t val, uint8_t *dat, uint64_t offset, uint8_t size){
	uint8_t i;
	for(i=0;i<size;i++){
		if(i < 64 && ((val >> i) & 1U)) dat[offset >> 3] |= 1U << (offset & 0x7U);
		else dat[offset >> 3] &= ~(1U << (offset & 0x7U));
		offset ++;
	}
}
/*
 * Read `size` bits from the bit array `dat`, starting at bit position
 * `offset`, and return them as an integer: bit (offset + i) of `dat` becomes
 * bit i of the result. Bit k of `dat` is bit (k & 7) of dat[k >> 3].
 *
 * The value is assembled with shifts, so the result does not depend on host
 * byte order or on the __BYTE_ORDER macro. The result has 64 bits; input bits
 * with i >= 64 are ignored.
 */
static inline uint64_t bits2u8byte(uint8_t *dat, uint64_t offset, uint8_t size){
	uint64_t ret;
	uint8_t i;
	ret = 0;
	for(i=0;i<size;i++){
		if(i < 64 && (dat[offset >> 3] & (1U << (offset & 0x7U)))) ret |= ((uint64_t)1) << i;
		offset ++;
	}
	return ret;
}
/* Growable vector of fixed-width bit fields (1 to 8 bits each), packed back to back. */
typedef struct {
// packed storage; element idx starts at bit idx * n_bit
uint8_t *bits;
// number of elements currently stored
uint64_t size;
// number of elements the allocation has room for
uint64_t cap;
// width of one element in bits (init_bitsvec clamps it to 1..8)
uint8_t n_bit;
// (1U << n_bit) - 1, selects the bits of one element
uint32_t mask;
} BitsVec;
/*
 * Allocate an empty BitsVec of `n_bit`-wide elements with room for `size` of
 * them. `n_bit` is clamped to 1..8 and `size` is raised to at least 8.
 * Storage is zeroed and sized with one spare byte (the "+ 15"), which the
 * two-byte accesses in set_bitsvec()/gets_bitsvec() rely on.
 */
static inline BitsVec* init_bitsvec(uint64_t size, uint32_t n_bit){
	const uint32_t width = (n_bit == 0)? 1 : ((n_bit > 8)? 8 : n_bit);
	const uint64_t capacity = (size < 8)? 8 : size;
	BitsVec *vec = calloc(1, sizeof(BitsVec));
	vec->n_bit = width;
	vec->mask = (1U << width) - 1U;
	vec->size = 0;
	vec->cap = capacity;
	vec->bits = calloc((capacity * vec->n_bit + 15) / 8, 1);
	return vec;
}
static inline size_t bitsvec_obj_desc_cnt(void *obj, int idx){
return (((BitsVec*)obj)->cap * ((BitsVec*)obj)->n_bit + 15) / 8;
idx = idx;
}
/*
 * Object descriptor for BitsVec: names the type, gives sizeof(BitsVec), and lists
 * one pointer member (bits) whose length is reported by bitsvec_obj_desc_cnt.
 * NOTE(review): the exact meaning of each initializer follows obj_desc_t in
 * mem_share.h, which is not visible here -- confirm there.
 */
static const obj_desc_t bitsvec_obj_desc = {"bitsvec_obj_desc", sizeof(BitsVec), 1, {1}, {offsetof(BitsVec, bits)}, {(obj_desc_t*)&OBJ_DESC_DATA}, bitsvec_obj_desc_cnt, NULL};
/* Drop all elements; capacity and storage contents are left untouched. */
static inline void clear_bitsvec(BitsVec *vec){
	vec->size = 0;
}
/* Release the packed storage first, then the BitsVec object itself. */
static inline void free_bitsvec(BitsVec *vec){
	uint8_t *storage;
	storage = vec->bits;
	free(storage);
	free(vec);
}
/*
 * Make room for n more elements.
 * Returns 0 when the current capacity already suffices, 1 after growing.
 * Below 0x3FFFFFFF elements the capacity is rounded up with roundup_power2();
 * above that it is rounded up to a multiple of 2^30. The realloc result is not
 * checked and newly added bytes are not initialised.
 */
static inline int encap_bitsvec(BitsVec *vec, u8i n){
	if(vec->size + n <= vec->cap){
		return 0;
	}
	if(vec->size + n >= 0x3FFFFFFFLLU){
		vec->cap = (vec->size + n + 0x3FFFFFFFLLU) & (MAX_U8 << 30);
	} else {
		vec->cap = roundup_power2(vec->size + n);
	}
	vec->bits = realloc(vec->bits, (vec->cap * vec->n_bit + 15) / 8);
	return 1;
}
/*
 * Store the low n_bit bits of dat as element idx.
 * The element may straddle a byte boundary, so two adjacent bytes are combined,
 * patched under the shifted mask and written back. No bounds check is done; the
 * caller must have reserved space (see encap_bitsvec).
 */
static inline void set_bitsvec(BitsVec *vec, u8i idx, u1i dat){
	u8i bit_pos, byte_idx;
	u2i shift, pair;
	bit_pos = idx * vec->n_bit;
	shift = bit_pos & 0x07;
	byte_idx = bit_pos >> 3;
	pair = vec->bits[byte_idx + 0] | (((u2i)vec->bits[byte_idx + 1]) << 8);
	pair = (pair & (~(vec->mask << shift))) | ((UInt(dat) & vec->mask) << shift);
	vec->bits[byte_idx + 1] = pair >> 8;
	vec->bits[byte_idx] = pair;
}
/* Append one element, growing the storage first when needed. */
static inline void push_bitsvec(BitsVec *vec, u1i dat){
	u8i slot;
	encap_bitsvec(vec, 1);
	slot = vec->size;
	set_bitsvec(vec, slot, dat);
	vec->size = slot + 1;
}
// TODO: need to be optimized
/*
 * Append `len` elements that all carry the value `dat`: space is reserved with
 * encap_bitsvec and size grows by len.
 * NOTE(review): the loop line is truncated in this copy of the file; it
 * presumably calls set_bitsvec(vec, vec->size + i, dat) for i in [0, len) --
 * confirm against the upstream source.
 */
static inline void pushs_bitsvec(BitsVec *vec, u1i dat, u8i len){
u8i i;
encap_bitsvec(vec, len);
for(i=0;isize + i, dat);
}
vec->size += len;
}
/*
 * Return the two storage bytes holding element idx, shifted right so that the
 * element starts at bit 0. Bits above the element are NOT masked off; use
 * get_bitsvec for a single clean value.
 */
static inline u2i gets_bitsvec(BitsVec *vec, u8i idx){
	u8i bit_pos, byte_idx;
	bit_pos = idx * vec->n_bit;
	byte_idx = bit_pos >> 3;
	return ((((u2i)vec->bits[byte_idx + 1]) << 8) | vec->bits[byte_idx + 0]) >> (bit_pos & 0x07);
}
/* Read element idx: the raw window from gets_bitsvec reduced to n_bit bits. */
static inline uint8_t get_bitsvec(BitsVec *vec, u8i idx){
	u2i window;
	window = gets_bitsvec(vec, idx);
	return window & vec->mask;
}
/*
 * Append `len` elements of `src`, starting at element `off`, to the end of `dst`.
 * Both vectors are assumed to use the same n_bit; the consistency check below is
 * compiled out with if(0).
 *
 * Outline as far as the visible code shows:
 *   1. p is the number of elements per aligned group (8 for odd n_bit, otherwise
 *      8 / n_bit) and n is how many elements are missing to bring dst->size up
 *      to a multiple of p.
 *   2. If len <= n, everything is copied in the first loop and the function returns.
 *   3. Otherwise the first n elements are copied in the second loop, after which
 *      the rest is copied one destination byte at a time, each byte taken from
 *      two adjacent source bytes shifted right by sd.
 * NOTE(review): the two element-wise loop lines are truncated in this copy of the
 * file; they presumably call set_bitsvec(dst, dst->size + i, get_bitsvec(src, off + i))
 * -- confirm against the upstream source.
 */
static inline void append_bitsvec(BitsVec *dst, BitsVec *src, u8i off, u8i len){
u8i i, di, si, se;
u2i x;
u1i p, n, sd;
if(0){ // Assume dst->n_bit == src->n_bit
if(dst->n_bit != src->n_bit){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
encap_bitsvec(dst, len);
p = (dst->n_bit & 0x01)? 8 : (8 / dst->n_bit);
n = dst->size % p;
if(n) n = p - n;
if(len <= n){
for(i=0;isize + i, get_bitsvec(src, off + i));
}
dst->size += len;
return;
} else {
for(i=0;isize + i, get_bitsvec(src, off + i));
}
dst->size += n;
}
// byte index in dst where the bulk copy starts
di = (dst->size * dst->n_bit) >> 3;
// bit position of the first remaining source element, split into byte index si and bit shift sd
si = ((off + i) * src->n_bit);
sd = si & 0x07;
si >>= 3;
// one past the last source byte that holds requested elements
se = ((off + len) * src->n_bit + 7) >> 3;
while(si < se){
x = ((src->bits[si + 1] << 8) | src->bits[si]) >> sd;
dst->bits[di++] = x;
si ++;
}
dst->size += len - i;
}
/* Remove and return the last element, or -1 when the vector is empty. */
static inline int pop_bitsvec(BitsVec *vec){
	if(vec->size != 0){
		vec->size --;
		return get_bitsvec(vec, vec->size);
	}
	return -1;
}
#endif
wtdbg2-2.5/bitvec.h 0000664 0000000 0000000 00000035401 13536643722 0014214 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __BIT_VEC_RJ_H
#define __BIT_VEC_RJ_H
#include
#include
#include
#include
#include "mem_share.h"
/* byte_ones_table[b] is the number of set bits in the byte value b (0..255). */
static const u1i byte_ones_table[256] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
/*
 * Round v up to the nearest power of two (v itself when it already is one).
 * 0 maps to 0. The bit-smearing covers shifts 1, 2, 4, 8 and 16.
 */
static inline unsigned int _bitvec_roundup_power2(unsigned int v){
	unsigned int step;
	if(v == 0){
		return 0;
	}
	v -= 1;
	for(step = 1; step <= 16; step <<= 1){
		v |= v >> step;
	}
	return v + 1;
}
/* Growable bit vector with an optional rank/select index. */
typedef struct {
u8i *bits; // storage, 64 bits per word; bit idx lives in bits[idx >> 6] at position (idx & 63)
u8i n_bit; // number of bits appended so far (advanced by one2bitvec / zero2bitvec / push_2bitvec)
u8i n_cap; // capacity in bits
u8i *sums; // rank index built by index_bitvec_core, NULL until then
u8i sum_size; // number of 512-bit blocks covered by sums
u8i n_ones; // total number of set bits counted by index_bitvec_core
u8i *hash; // select accelerator built by index_bitvec_core, NULL until then
u8i hash_size; // number of entries in hash
u8i hash_mod; // divisor that maps a rank to a hash slot
int64_t iter_idx; // cursor used by begin_iter_bitvec / iter_bitvec
} BitVec;
/*
 * Population count helpers. The portable bit-twiddling versions are compiled out
 * (#if 0); the active definitions map to the compiler builtins.
 */
#if 0
static inline u4i count_ones_bit32(u4i v){
v = v - ((v >> 1) & 0x55555555U); // reuse input as temporary
v = (v & 0x33333333U) + ((v >> 2) & 0x33333333U); // temp
return (((v + (v >> 4)) & 0xF0F0F0FU) * 0x1010101U) >> 24; // count
}
#define ONES_STEP_4 0x1111111111111111ULL
#define ONES_STEP_8 0x0101010101010101ULL
static inline int count_ones_bit64(const u8i x){
register u8i byte_sums = x - ((x & 0xa * ONES_STEP_4) >> 1);
byte_sums = (byte_sums & 3 * ONES_STEP_4) + ((byte_sums >> 2) & 3 * ONES_STEP_4);
byte_sums = (byte_sums + (byte_sums >> 4)) & 0x0f * ONES_STEP_8;
return byte_sums * ONES_STEP_8 >> 56;
}
#else
// number of set bits in a 32-bit / 64-bit value
#define count_ones_bit32(v) __builtin_popcount(v)
#define count_ones_bit64(v) __builtin_popcountll(v)
#endif
/*
 * Reverse the bit order of the low byte of v (bit 0 <-> bit 7, ...), using the
 * multiply / mask / modulus trick. The argument is parenthesised before the cast
 * so that an expression argument such as `a + b` is truncated as a whole.
 */
#define reverse_u1i(v) (((((u1i)(v)) * 0x0202020202ULL) & 0x010884422010ULL) % 1023)
static inline size_t bitvec_obj_desc_cnt(void *bitv, int idx){
switch(idx){
case 0: return ((BitVec*)bitv)->n_cap / 64 * 8;
case 1: return ((BitVec*)bitv)->sums? (((BitVec*)bitv)->sum_size * 2 + 1) * 8 : 0;
case 2: return ((BitVec*)bitv)->hash? (((BitVec*)bitv)->hash_size) * 8 : 0;
default: return 0;
}
}
/*
 * Object descriptor for BitVec: names the type, gives sizeof(BitVec), and lists
 * three pointer members (bits, sums, hash) whose lengths are reported by
 * bitvec_obj_desc_cnt.
 * NOTE(review): the exact meaning of each initializer follows obj_desc_t in
 * mem_share.h, which is not visible here -- confirm there.
 */
static const obj_desc_t bitvec_obj_desc = {"bitvec_obj_desc", sizeof(BitVec), 3, {1, 1, 1}, {offsetof(BitVec, bits), offsetof(BitVec, sums), offsetof(BitVec, hash)}, {(obj_desc_t*)&OBJ_DESC_DATA, (obj_desc_t*)&OBJ_DESC_DATA, (obj_desc_t*)&OBJ_DESC_DATA}, bitvec_obj_desc_cnt, NULL};
/*
 * Create an empty, zero-filled BitVec able to hold at least n_bit bits
 * (512 when n_bit is 0). The capacity is rounded up to a multiple of 64 * 8 bits.
 * One extra word is allocated past the capacity and set to 1; it acts as a stop
 * bit for the next_one_bitvec* scans. Allocation results are not checked.
 */
static inline BitVec* init_bitvec(u8i n_bit){
	BitVec *bv;
	u8i n_word;
	if(n_bit == 0){
		n_bit = 64 * 8;
	}
	bv = (BitVec*)malloc(sizeof(BitVec));
	bv->n_bit = 0;
	bv->n_cap = (((n_bit + 63) / 64) + 7) / 8 * 64 * 8;
	n_word = bv->n_cap / 64;
	bv->bits = (u8i*)calloc(n_word + 1, 8);
	bv->bits[n_word] = 0x0000000000000001LLU;
	// no index yet
	bv->sums = NULL;
	bv->sum_size = 0;
	bv->n_ones = 0;
	bv->hash = NULL;
	bv->hash_size = 0;
	bv->hash_mod = 0;
	bv->iter_idx = 0;
	return bv;
}
/*
 * Write n_bit, n_cap and then the n_cap / 64 storage words to `out`.
 * Returns the number of bytes this layout occupies; fwrite results are not
 * checked, so the return value does not prove the data was written.
 */
static inline size_t dump_bitvec(BitVec *bitv, FILE *out){
	u8i n_word;
	n_word = bitv->n_cap / 64;
	fwrite(&bitv->n_bit, sizeof(u8i), 1, out);
	fwrite(&bitv->n_cap, sizeof(u8i), 1, out);
	fwrite(bitv->bits, sizeof(u8i), n_word, out);
	return sizeof(u8i) * (2 + n_word);
}
static inline BitVec* load_bitvec(FILE *inp){
BitVec *bitv;
size_t n;
bitv = (BitVec*)malloc(sizeof(BitVec));
if((n = fread(&bitv->n_bit, sizeof(u8i), 1, inp)) != 1){
free(bitv); return NULL;
}
if((n = fread(&bitv->n_cap, sizeof(u8i), 1, inp)) != 1){
free(bitv); return NULL;
}
bitv->bits = (u8i*)malloc(bitv->n_cap / 8);
if(bitv->bits == NULL){
fprintf(stderr, " Out of memeory in load_bitvec\n "); fflush(stderr); exit(1);
}
if((n = fread(bitv->bits, sizeof(u8i), bitv->n_cap / 64, inp)) != bitv->n_cap / 64){
free(bitv); free(bitv->bits); return NULL;
}
bitv->sums = NULL;
bitv->hash = NULL;
bitv->hash_size = 0;
return bitv;
}
/*
 * Disabled (#if 0): variant of load_bitvec that builds the BitVec inside a
 * caller-supplied memory region, placing the bit storage right after the
 * 8-byte-aligned struct.
 */
#if 0
static inline BitVec* mem_load_bitvec(void *mem, FILE *inp){
BitVec *bitv;
size_t off, n;
bitv = mem;
off = ((sizeof(BitVec) + 7) / 8) * 8;
if((n = fread(&bitv->n_bit, sizeof(u8i), 1, inp)) != 1) return NULL;
if((n = fread(&bitv->n_cap, sizeof(u8i), 1, inp)) != 1) return NULL;
bitv->sums = NULL;
bitv->hash = NULL;
bitv->hash_size = 0;
bitv->bits = mem + off;
off += (bitv->n_cap / 64) * 8;
if((n = fread(bitv->bits, sizeof(u8i), bitv->n_cap / 64, inp)) != bitv->n_cap / 64) return NULL;
return bitv;
}
#endif
/* Reset the append position to 0; the stored bits themselves are not cleared. */
static inline void clear_bitvec(BitVec *bitv){
	bitv->n_bit = 0;
}
/* Clear every bit within the capacity (n_cap / 8 bytes). */
static inline void zeros_bitvec(BitVec *bitv){
	memset(bitv->bits, 0, bitv->n_cap / 8);
}
/*
 * Clear the bits in [beg, end) -- end is exclusive. Nothing happens when
 * beg >= end. Note that when the range spans several words, the word containing
 * `end` is touched even if end is a multiple of 64.
 */
static inline void reg_zeros_bitvec(BitVec *bitv, u8i beg, u8i end){
	u8i w, w_last, lo, hi;
	if(end <= beg){
		return;
	}
	w = beg >> 6;
	w_last = end >> 6;
	lo = beg & 0x3FU;
	hi = end & 0x3FU;
	if(w != w_last){
		// keep only the bits below beg in the first word
		bitv->bits[w] &= ~(MAX_U8 << lo);
		for(w++; w < w_last; w++){
			bitv->bits[w] = 0;
		}
		// keep only the bits at or above end in the last word
		bitv->bits[w] &= MAX_U8 << hi;
	} else {
		// single word: the mask is 1 everywhere except inside [lo, hi)
		bitv->bits[w] &= (MAX_U8 << lo) ^ (MAX_U8 >> (64 - hi));
	}
}
/* Set every bit within the capacity (n_cap / 8 bytes). */
static inline void ones_bitvec(BitVec *bitv){
	memset(bitv->bits, 0xFFU, bitv->n_cap / 8);
}
/*
 * Set the bits in [beg, end) -- end is exclusive. Nothing happens when
 * beg >= end. Note that when the range spans several words, the word containing
 * `end` is touched even if end is a multiple of 64.
 */
static inline void reg_ones_bitvec(BitVec *bitv, u8i beg, u8i end){
	u8i w, w_last, lo, hi;
	if(end <= beg){
		return;
	}
	w = beg >> 6;
	w_last = end >> 6;
	lo = beg & 0x3FU;
	hi = end & 0x3FU;
	if(w != w_last){
		// bits at or above beg in the first word
		bitv->bits[w] |= MAX_U8 << lo;
		for(w++; w < w_last; w++){
			bitv->bits[w] = MAX_U8;
		}
		// bits below end in the last word
		bitv->bits[w] |= ~(MAX_U8 << hi);
	} else {
		// single word: exactly the bits inside [lo, hi)
		bitv->bits[w] |= (MAX_U8 << lo) & (MAX_U8 >> (64 - hi));
	}
}
/* Toggle bit idx. */
static inline void flip_bitvec(BitVec *bitv, u8i idx){
	u8i bit;
	bit = 1LLU << (idx & 0x3FU);
	bitv->bits[idx >> 6] ^= bit;
}
/* Set bit idx to 1. */
static inline void one_bitvec(BitVec *bitv, u8i idx){
	u8i bit;
	bit = 1LLU << (idx & 0x3FU);
	bitv->bits[idx >> 6] |= bit;
}
/* Set bit idx to 0. */
static inline void zero_bitvec(BitVec *bitv, u8i idx){
	u8i bit;
	bit = 1LLU << (idx & 0x3FU);
	bitv->bits[idx >> 6] &= ~bit;
}
/* Set bit idx to 1 when v is non-zero, to 0 otherwise. */
static inline void set_bitvec(BitVec *bitv, u8i idx, int v){
	if(!v){
		zero_bitvec(bitv, idx);
		return;
	}
	one_bitvec(bitv, idx);
}
/* Return bit idx as 0 or 1. */
static inline u8i get_bitvec(BitVec *bitv, u8i idx){
	u8i word;
	word = bitv->bits[idx >> 6];
	return (word >> (idx & 0x3FU)) & 0x01LLU;
}
/*
 * Return the 64 bits starting at bit offset `off`; bit off + i of the vector
 * ends up as bit i of the result, matching the bit order of get_bitvec.
 * When off is not word-aligned the following word is read as well.
 *
 * Fix: the previous version shifted the first word by (64 - n) and the second
 * by n, which returns the wanted window only for n == 0 or n == 32.
 */
static inline u8i get64_bitvec(BitVec *bitv, u8i off){
	u8i m, n;
	m = off >> 6;
	n = off & 0x3F;
	if(n){
		// upper (64 - n) bits of word m become the low part, low n bits of word m + 1 the high part
		return (bitv->bits[m] >> n) | (bitv->bits[m + 1] << (64 - n));
	} else {
		return bitv->bits[m];
	}
}
/*
 * Store the 64 bits of `val` at bit offset `off`; bit i of val becomes bit
 * off + i of the vector, matching the bit order of get_bitvec. When off is not
 * word-aligned the following word is modified as well.
 *
 * Fix: the first word used to receive val << (64 - n), which does not match the
 * val >> (64 - n) written to the second word and could overlap the preserved
 * low n bits; it must be val << n.
 */
static inline void set64_bitvec(BitVec *bitv, u8i off, u8i val){
	u8i m, n;
	m = off >> 6;
	n = off & 0x3F;
	if(n){
		// keep the low n bits of word m, put the low (64 - n) bits of val above them
		bitv->bits[m] = ((bitv->bits[m] << (64 - n)) >> (64 - n)) | (val << n);
		m ++;
		// keep the bits at or above n of the next word, put the high n bits of val below them
		bitv->bits[m] = ((bitv->bits[m] >> n) << n) | (val >> (64 - n));
	} else {
		bitv->bits[m] = val;
	}
}
/*
 * Make sure at least `num` more bits can be appended after n_bit.
 * The capacity doubles while it is below 8 Mbit and then grows in 8 Mbit steps.
 * The added region is zero-filled and the stop word (value 1) is placed right
 * after the new capacity, as init_bitvec and recap_bitvec do.
 *
 * Fixes over the previous version:
 *   - the stop word was written at the OLD capacity, which set data bit `cap`
 *     and left no stop bit after the new end;
 *   - a capacity of 0 made the doubling loop spin forever;
 *   - pointer arithmetic is done on char* instead of void*.
 * The realloc result is still not checked.
 */
static inline void encap_bitvec(BitVec *bitv, u8i num){
	u8i old_cap;
	if(bitv->n_bit + num < bitv->n_cap) return;
	old_cap = bitv->n_cap;
	while(bitv->n_bit + num >= bitv->n_cap){
		if(bitv->n_cap == 0){
			bitv->n_cap = 64 * 8; // doubling 0 would never terminate
		} else if(bitv->n_cap < 1024 * 1024 * 8){
			bitv->n_cap <<= 1;
		} else {
			bitv->n_cap += 1024 * 1024 * 8;
		}
	}
	bitv->bits = (u8i*)realloc(bitv->bits, bitv->n_cap / 8 + 8);
	// clears the old stop word and everything up to and including the new one
	memset(((char*)bitv->bits) + old_cap / 8, 0, (bitv->n_cap - old_cap) / 8 + 8);
	bitv->bits[bitv->n_cap / 64] = 0x0000000000000001LLU;
}
/*
 * Resize the storage to new_cap bits, rounded up to a multiple of 64.
 * When growing, the added region is zero-filled. The stop word (value 1) is
 * rewritten right after the new capacity. n_bit is not adjusted and the realloc
 * result is not checked.
 */
static inline void recap_bitvec(BitVec *bitv, u8i new_cap){
	u8i old_cap;
	if(new_cap & 0x3FU){
		new_cap = (new_cap & 0xFFFFFFFFFFFFFFC0LLU) + 0x40U;
	}
	old_cap = bitv->n_cap;
	if(old_cap == new_cap){
		return;
	}
	bitv->bits = (u8i*)realloc(bitv->bits, new_cap / 8 + 8);
	if(old_cap < new_cap){
		memset(((char*)bitv->bits) + old_cap / 8, 0, (new_cap - old_cap) / 8 + 8);
	}
	bitv->n_cap = new_cap;
	bitv->bits[new_cap / 64] = 0x0000000000000001LLU;
}
/* Append a 1 bit at position n_bit. */
static inline void one2bitvec(BitVec *bitv){
	encap_bitvec(bitv, 1);
	one_bitvec(bitv, bitv->n_bit);
	bitv->n_bit ++;
}
/* Append a 0 bit at position n_bit. */
static inline void zero2bitvec(BitVec *bitv){
	encap_bitvec(bitv, 1);
	zero_bitvec(bitv, bitv->n_bit);
	bitv->n_bit ++;
}
/* Read the idx-th 2-bit field (32 fields per 64-bit word). */
static inline u8i get_2bitvec(BitVec *bitv, u8i idx){
	u8i shift;
	shift = (idx & 0x1FU) << 1;
	return (bitv->bits[idx >> 5] >> shift) & 0x03LLU;
}
/* Overwrite the idx-th 2-bit field with the low two bits of v. */
static inline void set_2bitvec(BitVec *bitv, u8i idx, u8i v){
	u8i shift, word;
	shift = (idx & 0x1FU) << 1;
	word = bitv->bits[idx >> 5];
	word = (word & (~(0x03LLU << shift))) | ((v & 0x03LLU) << shift);
	bitv->bits[idx >> 5] = word;
}
/*
 * Append a 2-bit field. The field index is n_bit / 2 (rounded down), and n_bit
 * is then advanced to the start of the following field.
 */
static inline void push_2bitvec(BitVec *bitv, u8i v){
	u8i field;
	encap_bitvec(bitv, 2);
	field = bitv->n_bit >> 1;
	set_2bitvec(bitv, field, v);
	bitv->n_bit = (field + 1) << 1;
}
/* Write a 1 bit at position n_bit without advancing n_bit. */
static inline void end_bitvec(BitVec *bitv){
	encap_bitvec(bitv, 1);
	one_bitvec(bitv, bitv->n_bit);
}
/*
 * Return the position of the first set bit at or after idx.
 * The scan has no upper bound of its own; it stops at the first set bit it
 * meets, so it relies on one existing (e.g. the stop word after the capacity).
 */
static inline u8i next_one_bitvec(BitVec *bitv, u8i idx){
	u8i word_idx, rest;
	u4i bit_idx;
	word_idx = idx >> 6;
	bit_idx = idx & 0x3F;
	for(;;){
		rest = bitv->bits[word_idx] >> bit_idx;
		if(rest) break;
		word_idx ++;
		bit_idx = 0;
	}
	bit_idx += __builtin_ctzll(rest);
	return (word_idx << 6) + bit_idx;
}
/* Count the set bits in [beg, end) -- end is exclusive; 0 when beg >= end. */
static inline u8i reg_count_bitvec(BitVec *bitv, u8i beg, u8i end){
	u8i total, w, w_last, part;
	if(end <= beg){
		return 0;
	}
	w = beg >> 6;
	w_last = end >> 6;
	if(w == w_last){
		// single word: drop the bits at or above end, then the bits below beg
		part = (bitv->bits[w] & (MAX_U8 >> (64 - (end & 0x3F)))) >> (beg & 0x3F);
		total = count_ones_bit64(part);
		return total;
	}
	total = count_ones_bit64(bitv->bits[w] >> (beg & 0x3F));
	for(w++; w < w_last; w++){
		total += count_ones_bit64(bitv->bits[w]);
	}
	// the last word contributes only when end is not word-aligned
	if(end & 0x3F){
		total += count_ones_bit64(bitv->bits[w] & (MAX_U8 >> (64 - (end & 0x3F))));
	}
	return total;
}
/*
 * Lookup used by next_one_bitvec2: indexed with (lowest set bit of a 32-bit
 * value) % 37, it yields the position of that bit.
 */
static const int Mod37BitPosition[] = // map a bit value mod 37 to its position
{
32, 0, 1, 26, 2, 23, 27, 0, 3, 16,
24, 30, 28, 11, 0, 13, 4, 7, 17, 0,
25, 22, 31, 15, 29, 10, 12, 6, 0, 21,
14, 9, 5, 20, 8, 19, 18
};
/*
 * Same result as next_one_bitvec, but the bit inside the word is located with
 * the Mod37BitPosition table instead of a count-trailing-zeros builtin.
 */
static inline u8i next_one_bitvec2(BitVec *bitv, u8i idx){
	u8i word_idx;
	u4i bit_idx, low;
	word_idx = idx >> 6;
	bit_idx = idx & 0x3F;
	while((bitv->bits[word_idx] >> bit_idx) == 0){
		word_idx ++;
		bit_idx = 0;
	}
	// the table handles 32 bits, so skip an all-zero lower half first
	if(((bitv->bits[word_idx] >> bit_idx) & 0xFFFFFFFFU) == 0){
		bit_idx += 32;
	}
	low = bitv->bits[word_idx] >> bit_idx;
	bit_idx += Mod37BitPosition[(-low & low) % 37];
	return (word_idx << 6) + bit_idx;
}
/*
 * Same result as next_one_bitvec, found by plain stepping: whole words first,
 * then bytes, then single bits.
 */
static inline u8i next_one_bitvec3(BitVec *bitv, u8i idx){
	u8i word_idx;
	u4i bit_idx;
	word_idx = idx >> 6;
	bit_idx = idx & 0x3F;
	while((bitv->bits[word_idx] >> bit_idx) == 0){
		word_idx ++;
		bit_idx = 0;
	}
	while(((bitv->bits[word_idx] >> bit_idx) & 0xFFU) == 0){
		bit_idx += 8;
	}
	while(((bitv->bits[word_idx] >> bit_idx) & 0x01U) == 0){
		bit_idx ++;
	}
	return (word_idx << 6) + bit_idx;
}
//n_cap MUST be times of 64 * 8
/*
 * Build the rank/select index over the first n_cap bits.
 *
 * sums holds two words per block of 8 storage words (512 bits):
 *   sums[2k]     - number of set bits before block k
 *   sums[2k + 1] - seven 9-bit fields; field j is the number of set bits in the
 *                  first j + 1 words of block k
 * plus one trailing word with the grand total, which is also stored in n_ones.
 *
 * hash is then filled so that hash[r / hash_mod] gives a block index from which
 * select_bitvec starts its forward search for rank r.
 * Any previous sums / hash arrays are freed first.
 * NOTE(review): the for-loop header of the block loop is truncated in this copy
 * of the file; it presumably steps i over [0, n_cap) in strides of 64 * 8 and
 * computes k = ((i >> 6) >> 3) << 1 -- confirm against the upstream source.
 */
static inline void index_bitvec_core(BitVec *bitv, size_t n_cap){
u8i i, k, s, t, m;
m = ((n_cap + 63) / 64 + 7) / 8;
if(bitv->sums) free(bitv->sums);
bitv->sums = (u8i*)calloc((m * 2 + 1), 8);
t = 0;
for(i=0;i>6) >> 3) << 1;
bitv->sums[k] = t;
s = 0;
s += count_ones_bit64(bitv->bits[(i>>6)+0]);
bitv->sums[k+1] |= s << 0;
s += count_ones_bit64(bitv->bits[(i>>6)+1]);
bitv->sums[k+1] |= s << 9;
s += count_ones_bit64(bitv->bits[(i>>6)+2]);
bitv->sums[k+1] |= s << 18;
s += count_ones_bit64(bitv->bits[(i>>6)+3]);
bitv->sums[k+1] |= s << 27;
s += count_ones_bit64(bitv->bits[(i>>6)+4]);
bitv->sums[k+1] |= s << 36;
s += count_ones_bit64(bitv->bits[(i>>6)+5]);
bitv->sums[k+1] |= s << 45;
s += count_ones_bit64(bitv->bits[(i>>6)+6]);
bitv->sums[k+1] |= s << 54;
// the eighth word only feeds the running total
s += count_ones_bit64(bitv->bits[(i>>6)+7]);
t += s;
}
bitv->sums[((i>>6) >> 3) << 1] = t;
bitv->n_ones = t;
bitv->sum_size = m;
bitv->hash_size = (n_cap / 64 / 8) / 2;
if(bitv->hash_size == 0) bitv->hash_size = 1;
bitv->hash_mod = (t + bitv->hash_size) / bitv->hash_size;
if(bitv->hash_mod == 0) bitv->hash_mod = 1;
if(bitv->hash) free(bitv->hash);
bitv->hash = (u8i*)malloc(sizeof(u8i) * bitv->hash_size);
s = 0;
t = 0;
for(i=0;i<=m;i++){
k = bitv->sums[i*2] / bitv->hash_mod;
if(s < k){
while(s < k){ bitv->hash[s] = t; s ++; }
t = i? i - 1 : 0;
}
}
bitv->hash[bitv->sums[m*2] / bitv->hash_mod] = t;
}
/* Build the rank/select index over the whole capacity of the vector. */
static inline void index_bitvec(BitVec *bitv){
	u8i n_cap;
	n_cap = bitv->n_cap;
	index_bitvec_core(bitv, n_cap);
}
/*
 * Number of set bits strictly before position idx.
 * Reads bitv->sums, so the index must have been built beforehand.
 */
static inline u8i rank_bitvec(BitVec *bitv, u8i idx){
	u8i word_idx, block, in_block, in_word, total;
	word_idx = idx >> 6;
	block = word_idx >> 3;
	in_block = word_idx & 0x07U;
	in_word = idx & 0x3FU;
	// set bits before the block
	total = bitv->sums[block << 1];
	// set bits in the full words of the block that precede idx's word
	if(in_block){
		total += (bitv->sums[(block << 1) + 1] >> (9 * (in_block - 1))) & 0x1FFU;
	}
	// set bits below idx inside its own word
	if(in_word){
		total += count_ones_bit64(bitv->bits[word_idx] << (64 - in_word));
	}
	return total;
}
/*
 * Find the n_one-th set bit (counting from 1) of `word`, scanning from bit 0.
 * The return value is the bit position plus one, because idx is incremented
 * before each bit is tested in the final loop.
 * The search narrows in steps: 32-bit half, byte, nibble, single bits.
 * The byte loop only exits once enough set bits have been seen, so the caller
 * must pass a word that contains at least n_one set bits.
 */
static inline u1i select_8bytes(u8i word, u1i n_one){
u1i idx, n, m;
// n = set bits in the low half; decides which half holds the target
n = count_ones_bit32((u4i)word);
if(n >= n_one){
n = 0;
idx = 0;
word = word & 0xFFFFFFFFU;
} else {
idx = 32;
word = word >> 32;
}
// skip whole bytes that end before the target
while(1){
m = byte_ones_table[(u1i)word];
if(n + m >= n_one) break;
n += m;
idx += 8;
word >>= 8;
}
// skip the low nibble when the target is in the high one
m = byte_ones_table[(u1i)(word & 0xF)];
if(n + m < n_one){
idx += 4;
word >>= 4;
n += m;
}
while(word){
idx ++;
if(word & 0x01){
n ++;
if(n == n_one) break;
}
word >>= 1;
}
return idx;
}
/*
 * To select the 1'st one, use select_bitvec(bitv, 1) - 1
 * */
/*
 * Locate the idx-th set bit (counting from 1) using the index built by
 * index_bitvec_core; the result is that bit's position plus one.
 * Steps: start at the block suggested by hash, advance block by block while the
 * next block still starts below idx, pick the word inside the block from the
 * packed 9-bit counts, then finish inside that word with select_8bytes.
 */
static inline u8i select_bitvec(BitVec *bitv, u8i idx){
u8i i, p, s, sum, t;
p = bitv->hash[idx / bitv->hash_mod];
while(p + 1 < bitv->sum_size && bitv->sums[(p + 1) << 1] < idx) p ++;
sum = bitv->sums[p << 1];
i = 0;
// t tracks the number of set bits before word i of the block
t = sum;
while(i < 7){
s = (bitv->sums[(p << 1) + 1] >> (9 * i)) & 0x1FFU;
if(sum + s >= idx) break;
t = sum + s;
i ++;
}
p = p * 8 + i;
// rank of the target inside the chosen word
s = idx - t;
return p * 64 + select_8bytes(bitv->bits[p], s);
}
/* Rewind the iterator so that the next iter_bitvec call starts at bit 0. */
static inline void begin_iter_bitvec(BitVec *bitv){
	bitv->iter_idx = -1;
}
/*
 * Advance the iterator to the next set bit after the current one and return its
 * position. Returns 0xFFFFFFFFFFFFFFFF once the next start position is beyond
 * n_cap.
 */
static inline u8i iter_bitvec(BitVec *bitv){
	u8i start;
	start = (u8i)(bitv->iter_idx + 1);
	if(start > bitv->n_cap){
		return 0xFFFFFFFFFFFFFFFFLLU;
	}
	bitv->iter_idx = next_one_bitvec(bitv, bitv->iter_idx + 1);
	return (u8i)bitv->iter_idx;
}
/* Release the bit storage, both index arrays (they may be NULL) and the object. */
static inline void free_bitvec(BitVec *bitv){
	free(bitv->bits);
	free(bitv->sums); // free(NULL) is a no-op
	free(bitv->hash);
	free(bitv);
}
/*
 * Disabled (#if 0): helpers that compute the size of, and copy, a BitVec with
 * its bits / sums / hash arrays into one contiguous memory region, laid out in
 * that order after the 8-byte-aligned struct.
 */
#if 0
static inline size_t mem_size_bitvec(BitVec *bitv){
size_t m;
m = (sizeof(BitVec) + 7) / 8 * 8 + ((bitv->n_cap / 64) * 8);
if(bitv->sums){
m += (bitv->sum_size * 2 + 1) * 8;
}
if(bitv->hash){
m += bitv->hash_size * 8;
}
return m;
}
static inline size_t mem_dump_bitvec(BitVec *bitv, void *mem){
BitVec *clone;
size_t off;
clone = mem;
memcpy(clone, bitv, sizeof(BitVec));
off = ((sizeof(BitVec) + 7) / 8) * 8;
clone->bits = mem + off;
memcpy(clone->bits, bitv->bits, (bitv->n_cap / 64) * 8);
off += (bitv->n_cap / 64) * 8;
if(bitv->sums){
clone->sums = mem + off;
memcpy(clone->sums, bitv->sums, (bitv->sum_size * 2 + 1) * 8);
off += (bitv->sum_size * 2 + 1) * 8;
}
if(bitv->hash){
clone->hash = mem + off;
memcpy(clone->hash, bitv->hash, bitv->hash_size * 8);
off += bitv->hash_size * 8;
}
return off;
}
#endif
#endif
wtdbg2-2.5/chararray.h 0000664 0000000 0000000 00000020376 13536643722 0014721 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __STRING_RJ_H
#define __STRING_RJ_H
#include
#include
#include
#include
#include "list.h"
#include "mem_share.h"
/**
* String
*/
/* Signed type used for string lengths and capacities; widened to long int when HUGE_STRING is defined. */
#ifdef HUGE_STRING
typedef long int string_size_t;
#else
typedef int string_size_t;
#endif
/* Growable, NUL-terminated character buffer. */
typedef struct {
union { char *string; char *buffer; }; // two names for the same pointer
string_size_t size; // number of characters, not counting the terminating NUL
string_size_t capacity; // number of bytes allocated behind string
} String;
/*
 * Pointer-plus-length view of characters stored elsewhere; free_vstring releases
 * only the view object, never the characters.
 */
typedef struct {
char *string; // first character of the view
string_size_t size; // number of characters in the view
} VString;
// list-of-VString container generated by the define_list macro from list.h
define_list(VStrv, VString);
/*
 * Create an empty String with room for at least `cap` characters plus the NUL.
 * The capacity is cap + 1 for odd cap and cap + 2 for even cap.
 * Allocation results are not checked.
 */
static inline String* init_string(string_size_t cap){
	String *s;
	string_size_t room;
	if(cap & 0x1){
		room = cap + 1;
	} else {
		room = cap + 2;
	}
	s = (String*)malloc(sizeof(String));
	s->size = 0;
	s->capacity = room;
	s->string = (char*)malloc(sizeof(char) * (s->capacity));
	s->string[0] = 0;
	return s;
}
static inline size_t string_obj_desc_cnt(void *obj, int idx){
return ((String*)obj)->size + 1;
idx = idx;
}
/*
 * Object descriptor for String: names the type, gives sizeof(String), and lists
 * one pointer member (string) whose length is reported by string_obj_desc_cnt.
 * NOTE(review): the exact meaning of each initializer follows obj_desc_t in
 * mem_share.h, which is not visible here -- confirm there.
 */
static const obj_desc_t string_obj_desc = {"string_obj_desc", sizeof(String), 1, {1}, {offsetof(String, string)}, {(obj_desc_t*)&OBJ_DESC_DATA}, string_obj_desc_cnt, NULL};
/*
 * Round v up to the nearest power of two (v itself when it already is one).
 * There is no special case for 0: the decrement wraps and the result is 0.
 */
static inline unsigned int _string_size_roundup_power2(unsigned int v){
	unsigned int step;
	v -= 1;
	for(step = 1; step <= 16; step <<= 1){
		v |= v >> step;
	}
	return v + 1;
}
/*
 * Make sure `inc` more characters plus the terminating NUL fit.
 * Required sizes below 0xFFFFFFF are rounded up to a power of two; larger ones
 * grow the capacity in whole steps of 0xFFFFFFF bytes. The realloc result is
 * not checked.
 */
static inline void encap_string(String *str, string_size_t inc){
	if(inc + str->size + 1 <= str->capacity){
		return;
	}
	if(inc + str->size + 1 >= 0xFFFFFFF){
		str->capacity += (((inc + str->size + 1) - str->capacity + 0xFFFFFFF - 1) / 0xFFFFFFF) * 0xFFFFFFF;
	} else {
		str->capacity = _string_size_roundup_power2(inc + str->size + 1);
	}
	str->string = (char*)realloc(str->string, str->capacity);
}
/*
 * Set the capacity to exactly `cap` bytes. Requests that would not leave room
 * for the current content plus NUL (cap <= size) are ignored.
 */
static inline void recap_string(String *str, string_size_t cap){
	if(cap > str->size){
		str->string = (char*)realloc(str->string, cap);
		str->capacity = cap;
	}
}
/*
 * Convert the ASCII letters 'a'..'z' of the string to upper case in place.
 * NOTE(review): the for-loop header is truncated in this copy of the file;
 * presumably i runs over [0, str->size) -- confirm against the upstream source.
 */
static inline void uc_string(String *str){
string_size_t i;
for(i=0;isize;i++){
if(str->string[i] >= 'a' && str->string[i] <= 'z') str->string[i] = str->string[i] + 'A' - 'a';
}
}
/*
 * Convert the ASCII letters 'A'..'Z' of the string to lower case in place.
 * NOTE(review): the for-loop header is truncated in this copy of the file;
 * presumably i runs over [0, str->size) -- confirm against the upstream source.
 */
static inline void lc_string(String *str){
string_size_t i;
for(i=0;isize;i++){
if(str->string[i] >= 'A' && str->string[i] <= 'Z') str->string[i] = str->string[i] + 'a' - 'A';
}
}
/*
 * Copy up to `len` characters of `string` into a NUL-terminated buffer.
 *   len < 0     - copy the whole string
 *   dst != NULL - write into dst (it must hold len + 1 bytes)
 *   dst == NULL - a buffer of len + 1 bytes is malloc'ed; the caller frees it
 * Returns the buffer written to.
 */
static inline char* substr(char *string, string_size_t len, char *dst){
	char *out;
	if(len < 0){
		len = strlen(string);
	}
	out = dst;
	if(out == NULL){
		out = (char*)malloc(sizeof(char) * (len + 1));
	}
	strncpy(out, string, len);
	out[len] = '\0';
	return out;
}
/*
 * NOTE(review): this span is damaged in this copy of the file. The line that
 * starts the catstr() loop runs straight into what looks like the body of a
 * separate chomp_string() function (it strips one trailing '\n' from a String
 * and re-terminates it). The remainder of catstr and the head of chomp_string
 * are missing -- restore both from the upstream source before relying on them.
 */
static inline char* catstr(string_size_t n_str, ...){
char *str, *s;
string_size_t i, len, inc;
va_list params;
len = 0;
str = NULL;
va_start(params, n_str);
for(i=0;isize && str->string[str->size - 1] == '\n'){
str->size --;
str->string[str->size] = 0;
}
}
/* Drop one trailing '\n' from the view, if present; the characters are not modified. */
static inline void chomp_vstring(VString *str){
	if(str->size == 0){
		return;
	}
	if(str->string[str->size - 1] == '\n'){
		str->size --;
	}
}
/*
 * Remove leading and trailing '\n', '\t' and ' ' characters in place.
 * Trailing blanks are cut by shrinking size; leading blanks are removed by
 * shifting the remaining characters to the front. The result is NUL-terminated.
 * NOTE(review): the header of the shifting loop is truncated in this copy of
 * the file; presumably j runs over [i, str->size) -- confirm against the
 * upstream source.
 */
static inline void trim_string(String *str){
string_size_t i, j;
i = str->size - 1;
while(i >= 0 && (str->string[i] == '\n' || str->string[i] == '\t' || str->string[i] == ' ')) i--;
str->size = i + 1;
i = 0;
while(i < str->size && (str->string[i] == '\n' || str->string[i] == '\t' || str->string[i] == ' ')) i++;
if(i){
for(j=i;jsize;j++){ str->string[j-i] = str->string[j]; }
str->size -= i;
}
str->string[str->size] = 0;
}
/*
 * Narrow the view so that it excludes leading and trailing '\n', '\t' and ' '
 * characters. The underlying characters are not modified.
 *
 * Fix: after advancing the start pointer past the leading blanks, size is now
 * reduced by the same amount (as trim_string does); previously the view kept
 * its old length and reached past its real end.
 */
static inline void trim_vstring(VString *str){
	string_size_t i;
	// cut trailing blanks
	i = str->size - 1;
	while(i >= 0 && (str->string[i] == '\n' || str->string[i] == '\t' || str->string[i] == ' ')) i--;
	str->size = i + 1;
	// skip leading blanks
	i = 0;
	while(i < str->size && (str->string[i] == '\n' || str->string[i] == '\t' || str->string[i] == ' ')) i++;
	str->string += i;
	str->size -= i;
}
/* Append `offlen` bytes taken from src and re-terminate the string. */
static inline void append_string(String *str, char *src, string_size_t offlen){
	char *tail;
	encap_string(str, offlen);
	tail = str->string + str->size;
	memcpy(tail, src, offlen);
	str->size += offlen;
	str->string[str->size] = 0;
}
/* Append the character c `num` times (nothing for num <= 0) and re-terminate. */
static inline void append_char_string(String *str, char c, string_size_t num){
	string_size_t k;
	encap_string(str, num);
	for(k = 0; k < num; k++){
		str->string[str->size ++] = c;
	}
	str->string[str->size] = 0;
}
/* Create a new String holding a copy of the first `len` bytes of chs. */
static inline String* as_string(char *chs, string_size_t len){
	String *copy;
	copy = init_string(len);
	memcpy(copy->string, chs, len);
	copy->string[len] = 0;
	copy->size = len;
	return copy;
}
/*
 * Wrap the NUL-terminated string chs in a newly malloc'ed VString.
 * The characters are referenced, not copied.
 */
static inline VString* as_vstring(char *chs){
	VString *view;
	string_size_t n;
	n = strlen(chs);
	view = malloc(sizeof(VString));
	view->size = n;
	view->string = chs;
	return view;
}
/* Append a single character and re-terminate the string. */
static inline void add_char_string(String *str, char ch){
	string_size_t at;
	encap_string(str, 1);
	at = str->size;
	str->string[at] = ch;
	str->size = at + 1;
	str->string[str->size] = 0;
}
/* Alias: push_string(str, ch) appends one character via add_char_string. */
#define push_string(str, ch) add_char_string(str, ch)
/*
 * Append the decimal representation of val (with a leading '-' for negative
 * values) and re-terminate the string.
 *
 * Fix: the magnitude is computed in unsigned arithmetic. The previous
 * `val = -val` overflowed (undefined behaviour) for the most negative value.
 */
static inline void add_int_string(String *str, long long val){
	unsigned long long mag, rest;
	string_size_t n_digit, pos;
	// 30 bytes cover a sign and the 20 digits of any 64-bit value
	encap_string(str, 30);
	if(val < 0){
		str->string[str->size++] = '-';
		mag = 0ULL - (unsigned long long)val;
	} else {
		mag = (unsigned long long)val;
	}
	// count the digits; zero still needs one
	n_digit = 0;
	rest = mag;
	do {
		n_digit ++;
		rest /= 10;
	} while(rest);
	// fill the digits from the least significant one backwards
	pos = str->size + n_digit;
	str->size = pos;
	do {
		str->string[--pos] = '0' + (char)(mag % 10);
		mag /= 10;
	} while(mag);
	str->string[str->size] = 0;
}
/* Make the string empty; the allocation is kept. */
static inline void clear_string(String *str){
	str->string[0] = 0;
	str->size = 0;
}
/*
 * Split str at every `separator`, in place. Each separator that closes a
 * non-empty field is overwritten with '\0', and a VString pointing into str is
 * appended to vstrs for that field. Empty fields are skipped. The last field is
 * handled after the loop. Returns the number of fields appended.
 * NOTE(review): the for-loop header is truncated in this copy of the file;
 * presumably i runs over [0, str->size) -- confirm against the upstream source.
 */
static inline string_size_t split_string(String *str, char separator, VStrv *vstrs){
VString *vstr;
string_size_t n_tab, i, s;
for(i=s=n_tab=0;isize;i++){
if(str->string[i] == separator){
if(i > s){
str->string[i] = '\0';
vstr = next_ref_VStrv(vstrs);
vstr->string = str->string + s;
n_tab ++;
vstr->size = i - s;
}
// the next field starts right after the separator
s = i + 1;
}
}
if(i > s){
str->string[i] = '\0';
vstr = next_ref_VStrv(vstrs);
vstr->string = str->string + s;
n_tab ++;
vstr->size = i - s;
}
return n_tab;
}
/*
 * Split the view str at every `separator`. For each non-empty field a VString
 * pointing into str is appended to vstrs; empty fields are skipped. When `cut`
 * is non-zero the character ending each field is overwritten with '\0',
 * otherwise the characters are left untouched. Returns the number of fields
 * appended.
 * NOTE(review): the for-loop header is truncated in this copy of the file;
 * presumably i runs over [0, str->size) -- confirm against the upstream source.
 */
static inline string_size_t split_vstring(VString *str, char separator, VStrv *vstrs, string_size_t cut){
VString *vstr;
string_size_t n_tab, i, s;
for(i=s=n_tab=0;isize;i++){
if(str->string[i] == separator){
if(i > s){
if(cut) str->string[i] = '\0';
vstr = next_ref_VStrv(vstrs);
vstr->string = str->string + s;
n_tab ++;
vstr->size = i - s;
}
// the next field starts right after the separator
s = i + 1;
}
}
if(i > s){
if(cut) str->string[i] = '\0';
vstr = next_ref_VStrv(vstrs);
vstr->string = str->string + s;
n_tab ++;
vstr->size = i - s;
}
return n_tab;
}
/* Reverse the characters of the string in place. */
static inline void reverse_string(String *str){
	string_size_t lo, hi;
	char tmp;
	for(lo = 0, hi = str->size - 1; lo < hi; lo++, hi--){
		swap_tmp(str->string[lo], str->string[hi], tmp);
	}
}
/* Reverse the first `len` characters of str in place. */
static inline void reverse_str(char *str, string_size_t len){
	string_size_t lo, hi;
	char tmp;
	for(lo = 0, hi = len - 1; lo < hi; lo++, hi--){
		swap_tmp(str[lo], str[hi], tmp);
	}
}
/*
 * Append to dst every character of src that differs from ch, then re-terminate
 * dst. Space for src->size characters is reserved up front.
 * NOTE(review): the for-loop header is truncated in this copy of the file;
 * presumably i runs over [0, src->size) -- confirm against the upstream source.
 */
static inline void tidy_string(String *src, String *dst, char ch){
string_size_t i;
encap_string(dst, src->size);
for(i=0;isize;i++){
if(src->string[i] != ch){
dst->string[dst->size ++] = src->string[i];
}
}
dst->string[dst->size] = 0;
}
/*
 * NOTE(review): this span is damaged in this copy of the file. The line that
 * starts the occ_str() loop runs straight into what looks like the body of a
 * separate function that shortens a String to `size` characters (ignoring
 * negative or not-smaller sizes) and re-terminates it. The remainder of occ_str
 * and the head of that second function are missing -- restore both from the
 * upstream source before relying on them.
 */
static inline string_size_t occ_str(char *str, string_size_t len, char c){
string_size_t i, ret;
for(i=ret=0;i= str->size || size < 0) return;
str->size = size;
str->string[size] = 0;
}
/* Return a newly allocated String with the same content as str. */
static inline String* clone_string(String *str){
	String *copy;
	string_size_t n;
	n = str->size;
	copy = init_string(n);
	append_string(copy, str->string, n);
	return copy;
}
/* Release the character buffer, then the String object. */
static inline void free_string(String *str){
	free(str->string);
	free(str);
}
/* Release the view object only; the characters it points at are not freed. */
static inline void free_vstring(VString *str){
	free(str);
}
#endif
wtdbg2-2.5/dagcns.h 0000664 0000000 0000000 00000061517 13536643722 0014206 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "dna.h"
#include "chararray.h"
#include "list.h"
#include "queue.h"
#include "hashset.h"
#include "general_graph.h"
#include
#include
#ifndef PACBIO_PROBS_DAGCNS_RJ_H
#define PACBIO_PROBS_DAGCNS_RJ_H
// Length limit constant (0x3FFF = 16383). NOTE(review): its users are outside the visible part of the file.
#define DAGCNS_MAX_LEN 0x3FFF // 16k
// Debug switch, off by default. NOTE(review): presumably gates diagnostic output -- its users are not visible here.
static int dagcns_debug = 0;
/* Per-node payload of the alignment graph built by dp_matrix2alignment_graph. */
typedef struct {
int x, y; // DP matrix cell of the node (x indexes the query, y the target)
u4i dnidx; // initialised to MAX_VALUE_U4; NOTE(review): presumably the index of a linked DAG node -- confirm
f8i probs[2]; // printed as two probabilities per node by fprint_dot_bdpgraph
} bdp_node_t;
define_list(bdpnodev, bdp_node_t);
/* Per-edge payload of the alignment graph. */
typedef struct {
f8i prob; // edge probability as printed by fprint_dot_bdpgraph
u1i cigar, base; // cigar indexes the letters "MIDX" when printed; base: NOTE(review) presumably the base carried by the edge -- confirm
} bdp_edge_t;
define_list(bdpedgev, bdp_edge_t);
// generates the bdp_* callbacks that tie these payload lists to a GEGraph (macro from general_graph.h)
define_simple_geg_callback(bdp, bdpnodev, bdp_node_t, bdpedgev, bdp_edge_t);
/*
 * Pair of node indices. NOTE(review): judging by the names, gnidx refers to an
 * alignment-graph node and dnidx to a DAG node -- the users are not visible here.
 */
typedef struct {
u4i gnidx, dnidx;
} bdp_link_t;
define_list(bdplinkv, bdp_link_t);
/*
 * Edge of the consensus DAG. An edge is a member of two singly linked lists, one
 * per direction; 0xFFFFFFFF marks the end of a list.
 */
typedef struct {
uint32_t nodes[2]; // end nodes; add_edge_dagcns(nid1, nid2, dir) stores nid2 in nodes[dir] and nid1 in nodes[!dir]
uint32_t links[2]; // links[d]: next edge in the edges[d] list of node nodes[!d]
uint32_t cov:28, visit:1, closed:1, cns:1, alt:1; // cov counts how often the edge was requested; the flags start at 0
double score; // starts at 0
} dagedge_t;
define_list(dagedgev, dagedge_t);
// value of dagnode_t.fw_edge for a freshly prepared node
#define NODE_MAX_FW_EDGE 0xFFFFFFFFU
/* Node of the consensus DAG. */
typedef struct dagnode_t {
uint32_t pos:28, base:2, cns:1, visit:1; // pos and base are given at creation; cns and visit start at 0
uint32_t fw_edge; // starts as NODE_MAX_FW_EDGE
uint32_t edges[2]; // heads of the per-direction edge lists, 0xFFFFFFFF when empty
f8i aux; // starts at 0
} dagnode_t;
define_list(dagnodev, dagnode_t);
/*
 * A position with four per-base values. NOTE(review): presumably per-base counts
 * at a variant site -- the users are not visible here.
 */
typedef struct {
uint32_t pos;
uint32_t bases[4];
} dagsnp_t;
define_list(dagsnpv, dagsnp_t);
/* State of one DAG-based consensus computation. */
typedef struct {
u8list *cns; // current consensus/backbone bases; gen_pregraph_dagcns creates one node per entry
u32list *deps; // one value per backbone position, reset to 0 by gen_pregraph_dagcns
dagnodev *nodes; // DAG nodes
dagedgev *edges; // DAG edges
u32list *trash; // indices of removed edges, reused by add_edge_dagcns
String *alns[2]; // two text buffers; NOTE(review): presumably alignment strings -- confirm
int W, M, X, I, D, E; // integer parameters passed to init_dagcns; NOTE(review): presumably band width and alignment scores -- confirm
f4i pM, pX, pI, pD; // log(prob_M)
double ref_penalty, alt_penalty; // 0.5 and 0.2
double cns_score; // reset to 0
uint32_t cns_head, backbone_size; // cns_head is 0xFFFFFFFF when unset; backbone_size is cns->size at graph setup
} DAGCNS;
/*
 * Allocate a DAGCNS with empty containers (initial capacity 1024 each) and store
 * the given parameters. ref_penalty and alt_penalty get their defaults of 0.5
 * and 0.2. The malloc result is not checked.
 */
static inline DAGCNS* init_dagcns(int W, int M, int X, int I, int D, int E, f4i pM, f4i pX, f4i pI, f4i pD){
	DAGCNS *dag;
	dag = malloc(sizeof(DAGCNS));
	// containers
	dag->cns = init_u8list(1024);
	dag->deps = init_u32list(1024);
	dag->nodes = init_dagnodev(1024);
	dag->edges = init_dagedgev(1024);
	dag->trash = init_u32list(1024);
	dag->alns[0] = init_string(1024);
	dag->alns[1] = init_string(1024);
	// caller-supplied parameters
	dag->W = W; dag->M = M; dag->X = X;
	dag->I = I; dag->D = D; dag->E = E;
	dag->pM = pM; dag->pX = pX;
	dag->pI = pI; dag->pD = pD;
	// defaults and empty result state
	dag->ref_penalty = 0.5;
	dag->alt_penalty = 0.2;
	dag->cns_score = 0;
	dag->cns_head = 0xFFFFFFFFU;
	dag->backbone_size = 0;
	return dag;
}
/* Release every container owned by g, then g itself. */
static inline void free_dagcns(DAGCNS *g){
	free_u8list(g->cns);
	free_u32list(g->deps);
	free_dagnodev(g->nodes);
	free_dagedgev(g->edges);
	free_u32list(g->trash);
	free_string(g->alns[1]);
	free_string(g->alns[0]);
	free(g);
}
/*
 * Empty the graph, the consensus and the bookkeeping lists, and reset the
 * result fields. Parameters and the alns buffers are left as they are.
 */
static inline void reset_dagcns(DAGCNS *g){
	clear_u8list(g->cns);
	clear_u32list(g->deps);
	clear_dagnodev(g->nodes);
	clear_dagedgev(g->edges);
	clear_u32list(g->trash);
	g->backbone_size = 0;
	g->cns_head = 0xFFFFFFFFU;
	g->cns_score = 0;
}
/*
 * Append a new node with the given position and base; it has no edges and all
 * flags are cleared. Returns the index of the new node.
 */
static uint32_t prepare_node_dagcns(DAGCNS *g, uint32_t pos, uint8_t base){
	dagnode_t *node;
	node = next_ref_dagnodev(g->nodes);
	node->pos = pos;
	node->base = base;
	node->cns = 0;
	node->visit = 0;
	node->aux = 0;
	node->fw_edge = NODE_MAX_FW_EDGE;
	node->edges[0] = 0xFFFFFFFFU;
	node->edges[1] = 0xFFFFFFFFU;
	return g->nodes->size - 1;
}
/*
 * Walk the direction-`dir` edge list of node nid1 and return the edge whose
 * nodes[dir] is nid2, or NULL when there is none.
 */
static inline dagedge_t* find_edge_by_node_dagcns(DAGCNS *g, uint32_t nid1, uint32_t nid2, int dir){
	dagnode_t *node;
	dagedge_t *edge;
	uint32_t eid;
	node = ref_dagnodev(g->nodes, nid1);
	eid = node->edges[dir];
	while(eid != 0xFFFFFFFFU){
		edge = ref_dagedgev(g->edges, eid);
		if(edge->nodes[dir] == nid2){
			return edge;
		}
		eid = edge->links[dir];
	}
	return NULL;
}
/*
 * Create an edge between nid1 and nid2 (nid2 is stored in nodes[dir], nid1 in
 * nodes[!dir]) with cov = 1 and all flags cleared. A slot from the trash list is
 * reused when one is available. The edge is pushed onto the front of nid1's
 * edges[dir] list and of nid2's edges[!dir] list. No duplicate check is made.
 */
static inline dagedge_t* add_edge_dagcns(DAGCNS *g, uint32_t nid1, uint32_t nid2, int dir){
	dagnode_t *from, *to;
	dagedge_t *edge;
	uint32_t eid;
	from = ref_dagnodev(g->nodes, nid1);
	if(pop_u32list(g->trash, &eid)){
		edge = ref_dagedgev(g->edges, eid);
	} else {
		eid = g->edges->size;
		edge = next_ref_dagedgev(g->edges);
	}
	edge->nodes[!dir] = nid1;
	edge->nodes[dir] = nid2;
	edge->cov = 1;
	edge->visit = 0;
	edge->closed = 0;
	edge->cns = 0;
	edge->alt = 0;
	edge->score = 0;
	// front of nid1's list for this direction
	edge->links[dir] = from->edges[dir];
	from->edges[dir] = eid;
	// front of nid2's list for the opposite direction
	to = ref_dagnodev(g->nodes, nid2);
	edge->links[!dir] = to->edges[!dir];
	to->edges[!dir] = eid;
	return edge;
}
/* Return the edge nid1 -> nid2 in direction dir: an existing one with cov increased by one, or a new one. */
static inline dagedge_t* prepare_edge_dagcns(DAGCNS *g, uint32_t nid1, uint32_t nid2, int dir){
	dagedge_t *found;
	found = find_edge_by_node_dagcns(g, nid1, nid2, dir);
	if(found == NULL){
		return add_edge_dagcns(g, nid1, nid2, dir);
	}
	found->cov ++;
	return found;
}
/*
 * Rebuild the graph as a plain chain over the current consensus: nodes, edges,
 * trash and deps are cleared, backbone_size is set to cns->size, and for every
 * backbone position a node and a deps entry of 0 are added. Consecutive nodes
 * are joined by an edge whose cov is reset to 0.
 * NOTE(review): the for-loop header is truncated in this copy of the file;
 * presumably i runs over [0, g->cns->size) -- confirm against the upstream source.
 */
static inline void gen_pregraph_dagcns(DAGCNS *g){
dagedge_t *e;
uint32_t i;
clear_dagnodev(g->nodes);
clear_dagedgev(g->edges);
clear_u32list(g->trash);
clear_u32list(g->deps);
g->backbone_size = g->cns->size;
for(i=0;icns->size;i++){
push_u32list(g->deps, 0);
prepare_node_dagcns(g, i, g->cns->buffer[i]);
if(i){ // keep the graph connected even when an alignment is partial
e = add_edge_dagcns(g, i - 1, i, 0);
e->cov = 0;
}
}
}
/*
 * Unlink edge eid from both per-direction lists it belongs to and put its index
 * on the trash list for reuse.
 * Returns 1 on success and 0 when the edge is not found in one of the lists; in
 * that case the edge is not trashed, and a list processed earlier stays unlinked.
 */
static inline int remove_edge_dagcns(DAGCNS *g, uint32_t eid){
	dagnode_t *owner;
	dagedge_t *edge;
	uint32_t side, next, cur;
	for(side = 0; side < 2; side++){
		edge = ref_dagedgev(g->edges, eid);
		next = edge->links[side];
		owner = ref_dagnodev(g->nodes, edge->nodes[!side]);
		cur = owner->edges[side];
		if(cur == eid){
			// the edge is the list head
			owner->edges[side] = next;
			continue;
		}
		// otherwise find the predecessor and bypass the edge
		for(;;){
			if(cur == 0xFFFFFFFFU){
				return 0;
			}
			edge = ref_dagedgev(g->edges, cur);
			if(edge->links[side] == eid){
				edge->links[side] = next;
				break;
			}
			cur = edge->links[side];
		}
	}
	push_u32list(g->trash, eid);
	return 1;
}
// Large negative constant (-268435455), parenthesised so it expands safely inside expressions.
#define MIN_SCORE (-0x0FFFFFFF)
/*
 * Given a = log(A) and b = log(B), return log(A + B).
 * The larger argument is factored out before exponentiating; when the two
 * differ by 10 or more the larger one is returned unchanged.
 */
static inline f8i log_sum(f8i a, f8i b){
	f8i top;
	top = num_max(a, b);
	if(top - a >= 10 || top - b >= 10){
		return top;
	}
	return logl(expl(a - top) + expl(b - top)) + top;
}
/* Work-stack entry of dp_matrix2alignment_graph: a DP cell (x, y) and the state d in which tracing resumes from it. */
typedef struct {
int x, y, d;
} bdp_beg_t;
define_list(bdpbegv, bdp_beg_t);
/*
 * Write the alignment graph as a Graphviz "digraph" to the file opened with
 * open_file_for_write(prefix, suffix, 1). Closed nodes and edges are skipped.
 * Every node is printed as a record with its index, x, y and both probabilities.
 * Every edge is printed twice, once per orientation, labelled with strand signs,
 * coverage, cigar letter and probability; the colour depends on the two
 * direction flags.
 * NOTE(review): both for-loop headers are truncated in this copy of the file;
 * presumably they run i up to g->nodes->size and g->edges->size (the edge loop
 * starts at 1) -- confirm against the upstream source.
 */
static inline void fprint_dot_bdpgraph(GEGraph *g, bdpnodev *bnodes, bdpedgev *bedges, char *prefix, char *suffix){
static const char *colors[2][2] = {{"blue", "green"}, {"red", "gray"}};
FILE *out;
ge_node_t *n;
bdp_node_t *bn;
ge_edge_t *e;
bdp_edge_t *be;
u8i i;
out = open_file_for_write(prefix, suffix, 1);
fprintf(out, "digraph {\nnode [shape=record]\nrankdir=LR\n");
for(i=0;inodes->size;i++){
n = ref_genodev(g->nodes, i);
if(n->closed) continue;
bn = ref_bdpnodev(bnodes, offset_genodev(g->nodes, n));
fprintf(out, " N%llu [label=\"{N%llu|%d|%d}|{%.4Lf|%.4Lf}\"]\n", i, i, bn->x, bn->y, bn->probs[0], bn->probs[1]);
}
for(i=1;iedges->size;i++){
e = ref_geedgev(g->edges, i);
if(e->closed) continue;
be = ref_bdpedgev(bedges, offset_geedgev(g->edges, e));
// forward orientation
fprintf(out, " N%llu -> N%llu [label=\"%c%c:%d:%c:%.4Lf\" color=%s]\n", (u8i)e->node1, (u8i)e->node2, "+-"[e->dir1], "+-"[e->dir2], e->cov, "MIDX"[be->cigar], be->prob, colors[e->dir1][e->dir2]);
// reverse orientation: end nodes swapped, strand signs flipped
fprintf(out, " N%llu -> N%llu [label=\"%c%c:%d:%c:%.4Lf\" color=%s]\n", (u8i)e->node2, (u8i)e->node1, "-+"[e->dir2], "-+"[e->dir1], e->cov, "MIDX"[be->cigar], be->prob, colors[!e->dir2][!e->dir1]);
}
fprintf(out, "}\n");
fclose(out);
}
/*
 * Turn the traceback matrix z of a banded alignment into an alignment graph.
 *
 * dag     supplies the target (dag->cns) and the four log-probabilities pM/pI/pD/pX
 * query   query bases
 * z       traceback bytes, row-major with dag->cns->size columns; bits 0-1, 2-3 and 4-5
 *         hold three direction fields, bit 6 is used here as a "visited" mark
 * x, y    cell where the traceback starts
 * g, bnodes, bedges  output graph and its per-node / per-edge DP records (reset here)
 *
 * Phase 1: walk the main path (BEG.d == 0). At every not-yet-marked matching diagonal
 *          cell on it, two alternative walks (BEG.d == 1 and 2) are queued. An
 *          alternative walk is kept only if it runs into a marked cell (f == 1);
 *          otherwise the coverage it added is rolled back.
 * Phase 2: nodes without any edge are closed.
 * Phase 3: log-probabilities are propagated from both ends (probs[0], probs[1]) and
 *          each open edge gets prob = exp(log-prob - best edge log-prob).
 *
 * Returns nbegs[0], the node the k == 0 propagation starts from. When both start nodes
 * coincide, the number of accepted walks is returned instead and no probabilities are
 * computed.
 * NOTE(review): the hash key is ((b8i)x << 32) | y with int x, y that may be -1 here;
 * confirm that the resulting key sharing for y == -1 is intended.
 */
static inline u4i dp_matrix2alignment_graph(DAGCNS *dag, u1i *query, u1i *z, int x, int y, GEGraph *g, bdpnodev *bnodes, bdpedgev *bedges){
	ge_node_t *gn, *gn2;
	ge_edge_ref_t *gf;
	ge_edge_t *ge;
	bdp_node_t *bn, *bn2;
	bdp_edge_t *be;
	bdp_beg_t BEG;
	UUhash *bnhash;
	UUhash_t *u;
	bdpbegv *stack;
	f8i cigar_probs[4], curator;
	u8v *idxs;
	u1i *target;
	u4i i, k, nidx, nlst, nbegs[2], visit, ret;
	int d, f, n_col, ops[2], base, exists;
	// indexed by edge cigar: 0 = M, 1 = I, 2 = D, 3 = X
	cigar_probs[0] = dag->pM;
	cigar_probs[1] = dag->pI;
	cigar_probs[2] = dag->pD;
	cigar_probs[3] = dag->pX;
	n_col = dag->cns->size;
	target = dag->cns->buffer;
	bdp_set_callbacks_gegraph(g, bnodes, bedges);
	reset_gegraph(g);
	stack = init_bdpbegv(4);
	push_bdpbegv(stack, (bdp_beg_t){x, y, 0});
	idxs = init_u8v(4);
	nbegs[0] = nbegs[1] = 0;
	bnhash = init_UUhash(1023); // (x, y) -> node index
	ret = 0;
	while(stack->size){
		BEG = stack->buffer[--stack->size];
		u = prepare_UUhash(bnhash, (((b8i)BEG.x) << 32) | BEG.y, &exists);
		if(exists){
			nidx = u->val;
		} else {
			gn = add_node_gegraph(g);
			nidx = offset_genodev(g->nodes, gn);
			u->key = (((b8i)BEG.x) << 32) | BEG.y;
			u->val = nidx;
			bn = ref_bdpnodev(bnodes, nidx);
			bn->x = BEG.x; bn->y = BEG.y;
			bn->dnidx = MAX_VALUE_U4;
		}
		nlst = nidx;
		x = BEG.x; y = BEG.y;
		d = BEG.d;
		f = 0;
		clear_u8v(idxs);
		while(x >= 0 && y >= 0){
			// the previous direction selects which 2-bit field of the cell is read
			d = (z[x * n_col + y] >> (d << 1)) & 0x03;
			if(d == 0){
				if(query[x] == target[y]){
					if(z[x * n_col + y] & (1 << 6)){
						// reached a cell that was already marked
						f = 1;
						break;
					} else if(BEG.d == 0){
						// main path: mark the cell and queue both alternative walks
						z[x * n_col + y] |= 1 << 6;
						push_bdpbegv(stack, (bdp_beg_t){x, y, 1});
						push_bdpbegv(stack, (bdp_beg_t){x, y, 2});
					}
					ops[0] = 0; ops[1] = 3;
				} else {
					// a mismatch is recorded as an insertion step followed by a deletion step
					ops[0] = 1; ops[1] = 2;
				}
				//x --; y --;
			} else if(d == 1){
				ops[0] = 1; ops[1] = 3;
				//x --;
			} else {
				ops[0] = 2; ops[1] = 3;
				//y --;
			}
			for(i=0;i<2;i++){
				if(ops[i] == 3) break; // 3 terminates the op list
				base = query[x];
				if(ops[i] == 0){ x --; y --; }
				else if(ops[i] == 1){ x --; }
				else { y --; }
				u = prepare_UUhash(bnhash, (((b8i)x) << 32) | y, &exists);
				if(exists){
					nidx = u->val;
				} else {
					gn = add_node_gegraph(g);
					nidx = offset_genodev(g->nodes, gn);
					u->key = (((b8i)x) << 32) | y;
					u->val = nidx;
					bn = ref_bdpnodev(bnodes, nidx);
					bn->x = x; bn->y = y;
					bn->dnidx = MAX_VALUE_U4;
				}
				ge = prepare_edge_gegraph(g, nlst, 1, nidx, 1, &exists);
				if(exists){
					ge->cov ++;
				} else {
					ge->cov = 1;
					be = ref_bdpedgev(bedges, offset_geedgev(g->edges, ge));
					be->cigar = ops[i];
					be->base = base;
				}
				nlst = nidx;
				// remember the edges of an alternative walk so they can be rolled back
				if(BEG.d) push_u8v(idxs, offset_geedgev(g->edges, ge));
			}
		}
		if(BEG.d){
			if(f == 0){
				// not a bubble
				for(i=0;i<idxs->size;i++){
					ge = ref_geedgev(g->edges, idxs->buffer[i]);
					ge->cov --;
					if(ge->cov == 0) cut_edge_gegraph(g, ge);
				}
			} else ret ++;
		} else {
			ret ++;
			nbegs[0] = g->nodes->size > 1? g->nodes->size - 2 : 0;
			nbegs[1] = 0;
		}
	}
	free_bdpbegv(stack);
	free_UUhash(bnhash);
	// close nodes that ended up without any edge
	for(i=0;i<g->nodes->size;i++){
		gn = ref_genodev(g->nodes, i);
		if(gn->edges[0].cnt || gn->edges[1].cnt) continue;
		gn->closed = 1;
	}
	// calculate probabilities (forward + backward)
	if(nbegs[0] == nbegs[1]){
		free_u8v(idxs);
		return ret;
	}
	for(k=0;k<2;k++){
		bn = ref_bdpnodev(bnodes, nbegs[k]);
		bn->probs[k] = 0;
		clear_u8v(idxs);
		push_u8v(idxs, nbegs[k]);
		visit = k + 1;
		while(idxs->size){
			gn = ref_genodev(g->nodes, idxs->buffer[idxs->size - 1]);
			bn = ref_bdpnodev(bnodes, idxs->buffer[idxs->size - 1]);
			idxs->size --;
			geg_beg_iter_edges(g, gn, k, gf, ge);
			if(ge->closed) continue;
			be = ref_bdpedgev(bedges, offset_geedgev(g->edges, ge));
			gn2 = ref_genodev(g->nodes, gf->flg? ge->node1 : ge->node2);
			bn2 = ref_bdpnodev(bnodes, gf->flg? ge->node1 : ge->node2);
			if(gn2->bt_visit == visit){
				bn2->probs[k] = log_sum(bn2->probs[k], bn->probs[k] + cigar_probs[be->cigar]); // p1 + p2
			} else {
				gn2->bt_visit = visit;
				gn2->unvisit = gn2->edges[!k].cnt;
				bn2->probs[k] = bn->probs[k] + cigar_probs[be->cigar]; // p1 * p2
			}
			// a node is expanded only after all of its incoming edges were seen
			if(gn2->unvisit) gn2->unvisit --;
			if(gn2->unvisit == 0){
				push_u8v(idxs, offset_genodev(g->nodes, gn2));
			}
			geg_end_iter_edges();
		}
	}
	free_u8v(idxs);
	// calculate edge prob
	curator = -FLT_MAX;
	for(i=1;i<g->edges->size;i++){
		ge = ref_geedgev(g->edges, i);
		if(ge->closed) continue;
		be = ref_bdpedgev(bedges, i);
		be->prob = cigar_probs[be->cigar] + ref_bdpnodev(bnodes, ge->node1)->probs[ge->dir1] + ref_bdpnodev(bnodes, ge->node2)->probs[!ge->dir2];
		if(be->prob > curator) curator = be->prob;
	}
	// scale so that the best edge gets probability 1
	for(i=1;i<g->edges->size;i++){
		ge = ref_geedgev(g->edges, i);
		if(ge->closed) continue;
		be = ref_bdpedgev(bedges, i);
		be->prob = expl(be->prob - curator);
	}
	//fprint_dot_bdpgraph(g, bnodes, bedges, "geg.dot", NULL);
	return nbegs[0];
}
/*
 * Banded global alignment of query[0..ql) against the current consensus (g->cns),
 * followed by conversion of the traceback matrix into an alignment graph.
 *
 * g           supplies the target sequence and the scoring parameters W, M, X, I, D, E
 * query, ql   query bases and their count
 * geg, bnodes, bedges  passed through to dp_matrix2alignment_graph
 * mem_buffer  scratch space; the DP rows and the traceback matrix live past its current size
 *
 * Returns 0 when ql or the target length is not positive, or when no cell was scored
 * (max == MIN_SCORE); otherwise the value returned by dp_matrix2alignment_graph.
 *
 * NOTE(review): three lines of this function (marked below) have lost text in this copy
 * of the file; the DP loop cannot be read completely from here.
 */
static inline u4i branched_dynamic_programming_alignment(DAGCNS *g, u1i *query, int ql, GEGraph *geg, bdpnodev *bnodes, bdpedgev *bedges, u1v *mem_buffer){
u4i nbeg;
int *rh, *re;
u1i *z, *zi, d, *target;
int tl, n_col;
int i, j, jc, jb, je, h1, h, m, e, f, t;
int mi, mj, max;
int W, M, X, I, D, E;
tl = g->cns->size;
target = g->cns->buffer;
if(ql <= 0 || tl <= 0){ return 0; }
n_col = tl;
// reserve two int rows of tl + 2 entries (rh, re) and a ql x n_col byte matrix (z)
encap_u1v(mem_buffer, kswx_roundup8x((tl + 2) * sizeof(int)) + kswx_roundup8x((tl + 2) * sizeof(int)) + kswx_roundup8x(((long long)ql) * n_col));
rh = (int*)(mem_buffer->buffer + mem_buffer->size);
re = (int*)(mem_buffer->buffer + mem_buffer->size + kswx_roundup8x((tl + 2) * sizeof(int)));
z = (mem_buffer->buffer + mem_buffer->size + kswx_roundup8x((tl + 2) * sizeof(int)) + kswx_roundup8x((tl + 2) * sizeof(int)));
W = g->W;
M = g->M;
X = g->X;
I = g->I;
D = g->D;
E = g->E;
// banded DP, global alignment
rh[0] = 0;
re[1] = 0 + D + E;
for(j=2;j<=tl&&j<=W;j++) rh[j] = rh[j-1] + E;
// NOTE(review): the next line is truncated in this copy (text between "j" and "W?" is missing);
// presumably it finished the row initialisation, opened the loop over query rows and set jc/jb — restore from upstream
for(;j W? jc - W : 0;
// right end of the band (exclusive), clipped to the target length
je = jc + W + 1 < tl? jc + W + 1 : tl;
h1 = jb == 0? (I + E * (i + 1)) : MIN_SCORE;
zi = &z[i * n_col];
// NOTE(review): the next line is truncated in this copy (the inner-loop header and the match/mismatch
// score computation are missing) — restore from upstream
for(j=jb;j= e? 0 : 1;
// d bits 0-1: source of the best score (0 = m, 1 = e, 2 = f)
h = m >= e? m : e;
d = h >= f? d : 2;
h = h >= f? h : f;
h1 = h;
// d bits 2-3: set to 1 when e continues a gap instead of opening one from m
t = m + I + E;
e = e + E;
d |= e > t? 1<<2 : 0;
e = e > t? e : t;
re[j] = e;
// d bits 4-5: set to 2 when f continues a gap instead of opening one from m
t = m + D + E;
f = f + E;
d |= f > t? 2<<4 : 0;
f = f > t? f : t;
zi[j] = d;
}
rh[j] = h1; re[j] = MIN_SCORE;
if(i + 1 == ql){
// NOTE(review): the next line is truncated in this copy (loop bound and comparison against max are missing)
for(j=jb;j max){
max = rh[j + 1];
mi = i; mj = j;
}
}
} else if(je == tl){
// the band touches the last target column: the row may end the alignment there
if(h1 > max){
max = h1;
mi = i; mj = tl - 1;
}
}
}
if(max == MIN_SCORE) return 0;
nbeg = dp_matrix2alignment_graph(g, query, z, mi, mj, geg, bnodes, bedges);
return nbeg;
}
/*
 * Thread an alignment graph into the consensus DAG.
 *
 * dg      consensus DAG that receives nodes, edges and depth counts
 * gg      alignment graph
 * bnodes, bedges  per-node / per-edge DP records of gg
 * nbeg    index of the alignment-graph node the walk starts from
 * stack   caller-provided work list (cleared here)
 *
 * The graph is walked along direction 0 in topological order: a node is pushed only
 * after all of its edges[1] have been consumed. `open` counts nodes that were reached
 * but still wait for other incoming edges. Each alignment-graph node is mapped to a DAG
 * node once (bn->dnidx); each traversed edge adds its probability to the score of the
 * corresponding DAG edge.
 * Finally the depth counter of every target position in [beg, end) is incremented, where
 * beg is the start node's y and end the largest y popped during the walk.
 */
static inline void bdpgraph2dagcns(DAGCNS *dg, GEGraph *gg, bdpnodev *bnodes, bdpedgev *bedges, u4i nbeg, bdplinkv *stack){
	dagnode_t *dn, *dn2;
	dagedge_t *de;
	ge_node_t *gn, *gn2;
	ge_edge_ref_t *gf;
	ge_edge_t *ge;
	bdp_node_t *bn, *bn2;
	bdp_edge_t *be;
	bdp_link_t T;
	u4i i, j, beg, end;
	int open;
	for(i=0;i<gg->nodes->size;i++) gg->nodes->buffer[i].bt_visit = 0;
	clear_bdplinkv(stack);
	// 0xFFFFFFFFU: the start node has no DAG node attached yet
	push_bdplinkv(stack, (bdp_link_t){nbeg, 0xFFFFFFFFU});
	beg = bnodes->buffer[nbeg].y;
	end = beg;
	open = 0;
	while(stack->size){
		T = stack->buffer[--stack->size];
		gn = ref_genodev(gg->nodes, T.gnidx);
		bn = ref_bdpnodev(bnodes, T.gnidx);
		if(bn->y > (int)end) end = bn->y;
		dn = (T.dnidx == 0xFFFFFFFFU)? NULL : ref_dagnodev(dg->nodes, T.dnidx);
		geg_beg_iter_edges(gg, gn, 0, gf, ge);
		if(ge->closed) continue;
		be = ref_bdpedgev(bedges, offset_geedgev(gg->edges, ge));
		gn2 = ref_genodev(gg->nodes, gf->flg? ge->node1 : ge->node2);
		bn2 = ref_bdpnodev(bnodes, gf->flg? ge->node1 : ge->node2);
		if(gn2->bt_visit == 0){
			gn2->bt_visit = 1;
			open ++;
			gn2->unvisit = gn2->edges[1].cnt;
		}
		if(gn2->unvisit) gn2->unvisit --;
		if(gn2->unvisit == 0){
			open --;
		}
		if(open < 0){
			fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
		}
		if(bn2->dnidx == MAX_VALUE_U4){
			if(be->cigar == 2){
				// cigar 2 consumes no query base: stay on the current DAG node
				dn2 = dn;
			} else if(dn && (open || be->cigar == 1)){
				dn2 = ref_dagnodev(dg->nodes, prepare_node_dagcns(dg, bn2->y, be->base));
				// re-fetch dn after prepare_node_dagcns (same pattern as e1 in merge_nodes_core_dagcns)
				dn = dn? ref_dagnodev(dg->nodes, T.dnidx) : NULL;
			} else if((dn || open == 0) && be->cigar == 0){ // cigar == 0 && open == 0
				// DAG node bn2->y is taken directly as the node for target position y
				dn2 = ref_dagnodev(dg->nodes, bn2->y);
			} else dn2 = NULL;
			bn2->dnidx = dn2? offset_dagnodev(dg->nodes, dn2) : MAX_VALUE_U4;
		} else {
			dn2 = ref_dagnodev(dg->nodes, bn2->dnidx);
		}
		if(dn && dn2 && dn != dn2){
			de = prepare_edge_dagcns(dg, offset_dagnodev(dg->nodes, dn), offset_dagnodev(dg->nodes, dn2), 0);
			de->score += be->prob;
		}
		if(gn2->unvisit == 0){
			push_bdplinkv(stack, (bdp_link_t){offset_genodev(gg->nodes, gn2), dn2? offset_dagnodev(dg->nodes, dn2) : 0xFFFFFFFFU});
		}
		geg_end_iter_edges();
	}
	for(j=beg;j<end;j++) dg->deps->buffer[j] ++;
	return;
}
/*
 * Merge sibling nodes that carry the same base, starting at nid and following the
 * merged nodes in direction dir.
 *
 * g      consensus DAG
 * nid    node whose dir-edges are examined first
 * stack  caller-provided work list (cleared here)
 * cache  four caller-provided lists, one per base, used to group candidate edges
 * dir    edge direction to work in (index into edges[] / links[] / nodes[])
 *
 * For each popped node, its dir-edges are grouped by the base of the node they lead to,
 * considering only target nodes whose first !dir edge has no successor in its list
 * (i.e. nodes with a single edge on that side). In every group with two or more edges
 * the first edge is kept: coverage and score of the others are added to it, their own
 * edges are removed, and the dir-edges of their target nodes are re-created on the kept
 * target. The kept target is then pushed for another round.
 */
static inline void merge_nodes_core_dagcns(DAGCNS *g, uint32_t nid, u32list *stack, u32list *cache[4], int dir){
	dagnode_t *n0, *n2, *n;
	dagedge_t *e, *e2, *e1;
	uint32_t base, eid, nid1, i;
	clear_u32list(stack);
	push_u32list(stack, nid);
	while(pop_u32list(stack, &nid)){
		n0 = ref_dagnodev(g->nodes, nid);
		if((eid = n0->edges[dir]) == 0xFFFFFFFFU) continue;
		clear_u32list(cache[0]);
		clear_u32list(cache[1]);
		clear_u32list(cache[2]);
		clear_u32list(cache[3]);
		while(1){
			e = ref_dagedgev(g->edges, eid);
			n = ref_dagnodev(g->nodes, e->nodes[dir]);
			e2 = ref_dagedgev(g->edges, n->edges[!dir]);
			if(e2->links[!dir] == 0xFFFFFFFFU){ // check whether there is only one edge from n -(!dir)-> n0
				push_u32list(cache[n->base], eid);
			}
			if((eid = e->links[dir]) == 0xFFFFFFFFU) break;
		}
		for(base=0;base<4;base++){
			if(cache[base]->size < 2) continue;
			for(i=0;i<cache[base]->size;i++) ref_dagedgev(g->edges, cache[base]->buffer[i])->visit = 1;
			e1 = ref_dagedgev(g->edges, cache[base]->buffer[0]);
			n = ref_dagnodev(g->nodes, e1->nodes[dir]);
			eid = n->edges[dir];
			nid1 = e1->nodes[dir];
			// mark the kept target's own dir-edges as visited
			while(eid != 0xFFFFFFFFU){
				e = ref_dagedgev(g->edges, eid);
				e->visit = 1;
				eid = e->links[dir];
			}
			for(i=1;i<cache[base]->size;i++){
				e2 = ref_dagedgev(g->edges, cache[base]->buffer[i]);
				n2 = ref_dagnodev(g->nodes, e2->nodes[dir]);
				e1->cov += e2->cov;
				e1->score += e2->score;
				remove_edge_dagcns(g, cache[base]->buffer[i]);
				// re-create the dir-edges of the redundant target on the kept target
				eid = n2->edges[dir];
				while(eid != 0xFFFFFFFFU){
					e2 = ref_dagedgev(g->edges, eid);
					e = prepare_edge_dagcns(g, nid1, e2->nodes[dir], dir);
					{
						e1 = ref_dagedgev(g->edges, cache[base]->buffer[0]); // memory referred by e1 may be freed in prepare_edge_dagcns
					}
					e->cov = e->cov - 1 + e2->cov;
					e->score = e->score + e2->score;
					e->visit = 1;
					eid = e2->links[dir];
				}
				eid = n2->edges[dir];
				while(eid != 0xFFFFFFFFU){
					e2 = ref_dagedgev(g->edges, eid);
					remove_edge_dagcns(g, eid); // e2->links retain the same values after removing
					eid = e2->links[dir];
				}
			}
			//n = ref_dagnodev(g->nodes, e1->nodes[dir]);
			//eid = n->edges[dir];
			//if(eid != 0xFFFFFFFFU && g->edges->buffer[eid].links[dir] == 0xFFFFFFFFU) continue; // we had merged a bubble branch1:A->C->T, branch2:A->C->T
			push_u32list(stack, nid1);
		}
	}
}
/*
 * Report whether node nid still has an unvisited edge in direction dir.
 * Walks the node's edge list (terminated by 0xFFFFFFFFU) and returns 1 at the first
 * edge whose visit flag is 0, or 0 when every edge is visited or the list is empty.
 */
static inline int has_non_visited_edge_dagcns(DAGCNS *g, uint32_t nid, int dir){
	dagnode_t *node;
	dagedge_t *edge;
	uint32_t cur;
	node = ref_dagnodev(g->nodes, nid);
	for(cur = node->edges[dir]; cur != 0xFFFFFFFFU; cur = edge->links[dir]){
		edge = ref_dagedgev(g->edges, cur);
		if(!edge->visit){
			return 1;
		}
	}
	return 0;
}
/*
 * Print the neighbourhood of node nid as a Graphviz digraph (debug aid).
 *
 * Starting at nid, edges are followed in direction 0 only. Every edge met is printed;
 * its target is expanded further only when its pos lies within `distance` of the start
 * node's pos and it has not been expanded before. Visited edges are drawn blue, others black.
 */
static inline void print_local_dot_dagcns(DAGCNS *g, uint32_t nid, int distance, FILE *out){
	u32list *todo;
	u32hash *seen;
	dagnode_t *center, *src, *dst;
	dagedge_t *edge;
	uint32_t src_id, dst_id, eid, *slot;
	int pos_min, pos_max, dir, exists;
	center = ref_dagnodev(g->nodes, nid);
	todo = init_u32list(32);
	seen = init_u32hash(1023);
	pos_min = center->pos - distance;
	pos_max = center->pos + distance;
	push_u32list(todo, nid);
	put_u32hash(seen, nid);
	fprintf(out, "digraph {\nrankdir=LR\n");
	while(todo->size){
		todo->size --;
		src_id = todo->buffer[todo->size];
		src = ref_dagnodev(g->nodes, src_id);
		for(dir=0;dir<1;dir++){
			for(eid = src->edges[dir]; eid != 0xFFFFFFFFU; eid = edge->links[dir]){
				edge = ref_dagedgev(g->edges, eid);
				dst_id = edge->nodes[dir];
				dst = ref_dagnodev(g->nodes, dst_id);
				fprintf(out, "N%d_%d_%c -> N%d_%d_%c [label=\"%d:%0.6f\" color=%s]\n", src_id, src->pos, "ACGT"[src->base], dst_id, dst->pos, "ACGT"[dst->base], edge->cov, edge->score, edge->visit? "blue" : "black");
				if(dst->pos < pos_min || dst->pos > pos_max){
					continue;
				}
				slot = prepare_u32hash(seen, dst_id, &exists);
				if(!exists){
					*slot = dst_id;
					push_u32list(todo, dst_id);
				}
			}
		}
	}
	fprintf(out, "}\n");
	fflush(out);
	free_u32list(todo);
	free_u32hash(seen);
}
/*
 * File-based wrapper around print_local_dot_dagcns: opens the file named by
 * prefix/suffix through open_file_for_write, dumps the neighbourhood of nid
 * (radius `level`) into it and closes it.
 */
static inline void fprint_local_dot_dagcns(DAGCNS *g, u4i nid, int level, char *prefix, char *suffix){
	FILE *dot = open_file_for_write(prefix, suffix, 1);
	print_local_dot_dagcns(g, nid, level, dot);
	fclose(dot);
}
/*
 * Merge equivalent nodes over the whole DAG.
 *
 * All visit flags are cleared, then every node without edges[1] is queued. Each
 * dequeued node is processed once: merge_nodes_core_dagcns is run in direction 1 and
 * then 0, its edges[0] are marked visited, and every edges[0] neighbour whose edges[1]
 * are all visited is queued in turn.
 */
static inline void merge_nodes_dagcns(DAGCNS *g){
	dagnode_t *n;
	dagedge_t *e;
	u32list *stack, *cache[4];
	u32fifo *queue;
	uint32_t i, nid, eid;
	stack = init_u32list(1024);
	cache[0] = init_u32list(4);
	cache[1] = init_u32list(4);
	cache[2] = init_u32list(4);
	cache[3] = init_u32list(4);
	for(i=0;i<g->edges->size;i++) g->edges->buffer[i].visit = 0;
	for(i=0;i<g->nodes->size;i++) g->nodes->buffer[i].visit = 0;
	queue = init_u32fifo();
	for(i=0;i<g->nodes->size;i++){
		n = ref_dagnodev(g->nodes, i);
		if(n->edges[1] != 0xFFFFFFFFU) continue;
		push_u32fifo(queue, i);
	}
	//dagcns_debug = 2;
	while(pop_u32fifo(queue, &nid)){
		if(dagcns_debug > 1) fprintf(stdout, "\npop(%u) %u\n", (u4i)queue->size, nid);
		n = ref_dagnodev(g->nodes, nid);
		if(n->visit) continue;
		n->visit = 1;
		merge_nodes_core_dagcns(g, nid, stack, cache, 1);
		merge_nodes_core_dagcns(g, nid, stack, cache, 0);
		// re-fetch: n was obtained before the merge calls modified the graph
		n = ref_dagnodev(g->nodes, nid);
		eid = n->edges[0];
		while(eid != 0xFFFFFFFFU){
			e = ref_dagedgev(g->edges, eid);
			e->visit = 1;
			eid = e->links[0];
		}
		eid = n->edges[0];
		while(eid != 0xFFFFFFFFU){
			e = ref_dagedgev(g->edges, eid);
			if(!has_non_visited_edge_dagcns(g, e->nodes[0], 1)){
				if(dagcns_debug > 1) fprintf(stdout, "push %u\n", e->nodes[0]);
				push_u32fifo(queue, e->nodes[0]);
			}
			eid = e->links[0];
		}
	}
	free_u32fifo(queue);
	free_u32list(stack);
	free_u32list(cache[0]);
	free_u32list(cache[1]);
	free_u32list(cache[2]);
	free_u32list(cache[3]);
}
/*
 * Write the current consensus (g->cns) to out as letters, translated through
 * bit_base_table. Output is emitted in chunks of 99 characters; no newline is added.
 */
static inline void print_seq_dagcns(DAGCNS *g, FILE *out){
	char buffer[100];
	uint32_t i, j;
	for(i=j=0;i<g->cns->size;i++){
		buffer[j++] = bit_base_table[g->cns->buffer[i]];
		if(j == 99){
			// chunk full: leave room for the terminator and flush
			buffer[j] = '\0';
			fprintf(out, "%s", buffer);
			j = 0;
		}
	}
	buffer[j] = '\0';
	fprintf(out, "%s", buffer);
}
/*
 * Pick the best-scoring path through the DAG and store it as the new consensus.
 *
 * g    consensus DAG; g->cns and g->deps are rebuilt, g->cns_head and g->cns_score are set
 * map  optional (may be NULL); if given it is cleared and filled with one consensus
 *      offset per backbone index up to and including g->backbone_size
 *
 * Scoring pass: nodes that have edges[1] but no edges[0] seed a queue with aux = 0.
 * A node is scored once all of its edges[0] neighbours have been processed; its aux
 * becomes the best of (neighbour aux + edge score - penalty * depth at the node's pos),
 * with ref_penalty for edges into backbone nodes and alt_penalty otherwise, and fw_edge
 * records the winning edge.
 * Path pass: starting at node 0 (cns_head), fw_edge is followed until a node carries
 * NODE_MAX_FW_EDGE; each node on the path contributes its base and a depth of 0.
 */
static inline void gen_consensus_dagcns(DAGCNS *g, u32list *map){
	dagnode_t *n1, *n2;
	dagedge_t *e;
	u32fifo *queue;
	uint32_t i, lst, nid, eid, best_e;
	f8i best_s, score;
	queue = init_u32fifo();
	if(queue == NULL){ // un-reachable, but is used to call fprint_local_dot_dagcns in gdb Debug
		fprint_local_dot_dagcns(g, 0, 10, "test.dot", NULL);
	}
	for(i=0;i<g->nodes->size;i++){
		n1 = ref_dagnodev(g->nodes, i);
		if(n1->edges[0] == 0xFFFFFFFFU && n1->edges[1] != 0xFFFFFFFFU){
			push_u32fifo(queue, i);
			n1->fw_edge = NODE_MAX_FW_EDGE;
			n1->aux = 0;
		}
	}
	for(i=0;i<g->edges->size;i++) g->edges->buffer[i].visit = 0;
	while(pop_u32fifo(queue, &nid)){
		best_s = - FLT_MAX;
		best_e = 0xFFFFFFFFU;
		n1 = ref_dagnodev(g->nodes, nid);
		eid = n1->edges[0];
		while(eid != 0xFFFFFFFFU){
			e = ref_dagedgev(g->edges, eid);
			n2 = ref_dagnodev(g->nodes, e->nodes[0]);
			if(e->nodes[0] < g->backbone_size){
				//score = n2->aux + e->cov - g->ref_penalty * g->deps->buffer[n1->pos];
				score = n2->aux + e->score - g->ref_penalty * g->deps->buffer[n1->pos];
			} else {
				//score = n2->aux + e->cov - g->alt_penalty * g->deps->buffer[n1->pos];
				score = n2->aux + e->score - g->alt_penalty * g->deps->buffer[n1->pos];
			}
			if(score > best_s){
				best_s = score;
				best_e = eid;
			}
			eid = e->links[0];
		}
		// seed nodes have no edges[0]: keep their aux, fw_edge becomes the 0xFFFFFFFFU terminator
		if(best_s > - FLT_MAX) n1->aux = best_s;
		n1->fw_edge = best_e;
		eid = n1->edges[1];
		while(eid != 0xFFFFFFFFU){
			e = ref_dagedgev(g->edges, eid);
			e->visit = 1;
			// release a predecessor once all of its edges[0] have been scored
			if(!has_non_visited_edge_dagcns(g, e->nodes[1], 0)){
				push_u32fifo(queue, e->nodes[1]);
			}
			eid = e->links[1];
		}
	}
	free_u32fifo(queue);
	clear_u8list(g->cns);
	clear_u32list(g->deps);
	if(map) clear_u32list(map);
	g->cns_head = 0;
	if(g->nodes->size == 0) return;
	n1 = ref_dagnodev(g->nodes, g->cns_head);
	g->cns_score = n1->aux;
	n1->cns = 1;
	lst = 0;
	if(map && g->cns_head < g->backbone_size){
		while(lst < g->cns_head){ push_u32list(map, g->cns->size); lst ++; }
	}
	push_u8list(g->cns, n1->base);
	push_u32list(g->deps, 0);
	while(n1->fw_edge != NODE_MAX_FW_EDGE){
		e = ref_dagedgev(g->edges, n1->fw_edge);
		e->cns = 1;
		if(map && e->nodes[0] < g->backbone_size){
			// backbone indices skipped by the path map to the current consensus length
			while(lst < e->nodes[0]){ push_u32list(map, g->cns->size); lst ++; }
		}
		n1 = ref_dagnodev(g->nodes, e->nodes[0]);
		n1->cns = 1;
		push_u8list(g->cns, n1->base);
		push_u32list(g->deps, 0);
	}
	if(map) while(lst <= g->backbone_size){ push_u32list(map, g->cns->size); lst ++; }
}
#endif
wtdbg2-2.5/dbgcns.h 0000664 0000000 0000000 00000116465 13536643722 0014212 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __DBGCNS_CNS_RJ_H
#define __DBGCNS_CNS_RJ_H
#include "dna.h"
#include "list.h"
#include "hashset.h"
#include "thread.h"
#include "ksw.h"
#include "chararray.h"
/*
#ifndef DBGCNS_DEBUG
#define DBGCNS_DEBUG 0
#endif
*/
// Debug switch; being static in a header, every translation unit gets its own copy.
static int DBGCNS_DEBUG = 0;
// Largest k accepted by init_dbg; 21 bases * 2 bits = the 42-bit `mer` field of dbgcns_kmer_t.
#define DBGCNS_KMER_MAX_SIZE 21
// Saturation limit of the 10-bit `cov` field of dbgcns_kmer_t.
#define DBGCNS_KMER_MAX_NODE_COV 0x3FF
// Saturation limit of the uint8_t edge counters of dbgcns_kmer_t.
#define DBGCNS_KMER_MAX_EDGE_COV 0xFF
// Modulus applied to uid in kmer_cov_seq_dbg before it is stored in the 12-bit `visit` field.
#define DBGCNS_MAX_UID 0xFFF
// One k-mer of the de Bruijn graph.
typedef struct {
// mer: 2-bit packed k-mer; cov: saturating occurrence count; visit: tag of the last sequence that touched it
uint64_t mer:42, cov:10, visit:12;
// edges[0][b] / edges[1][b]: saturating counters per neighbouring base b, maintained by add_kmer_dbg
// ([0] is fed from its fbase argument, [1] from the complemented rbase argument)
uint8_t edges[2][4];
} dbgcns_kmer_t;
define_list(dbgcnskmerv, dbgcns_kmer_t);
// Hash set of uint32_t indices into a dbgcnskmerv; elements are hashed and compared by
// the k-mer stored at that index. UD(E) reads the k-mer through the set's userdata pointer
// (`set` is expected to be in scope wherever define_hashset expands these macros).
#define UD(E) ((dbgcnskmerv*)set->userdata)->buffer[E].mer
#define kmer_hashcode(E) u64hashcode(UD(E))
// parenthesized (like fbgkmer_equals) so the comparison survives any surrounding operator
#define kmer_hashequals(E1, E2) (UD(E1) == UD(E2))
define_hashset(dbgcnskmerhash, uint32_t, kmer_hashcode, kmer_hashequals);
#undef UD
// de Bruijn graph over k-mers of length ksize.
typedef struct {
// k-mer length (init_dbg rejects values above DBGCNS_KMER_MAX_SIZE)
u4i ksize;
// mask keeping the low 2 * ksize bits of a rolling k-mer
u8i kmask;
// when non-zero, sequences are homopolymer-compressed into zseq before use
int hz;
// k-mer records; slot 0 is a scratch entry that holds the key of the current hash lookup
dbgcnskmerv *kmers;
// set of indices into kmers, keyed by k-mer value
dbgcnskmerhash *khash;
// output buffer of homopolymer_compress_dbg
u1v *zseq;
} DBG;
// Lowest backtrace score (presumably the initial value of fbg_kmer_t.bt_score — confirm at the use sites).
// Parenthesized so the unary minus always stays attached to the constant.
#define BT_SCORE_MIN (-0x0FFFFFFF)
// Node of the FBG graph, stored in a hash set keyed by its k-mer value.
typedef struct {
// mer: packed k-mer (hash key); off: printed as the node's "K<off>" label in debug output; closed: flag
u8i mer:42, off:21, closed:1;
u2i n_in, n_visit;
// edges: presumably the head index of this node's edge list in FBG.edges — confirm; bt_*: backtrace state
u4i edges, bt_node, bt_edge;
int bt_score;
u8i ptr; // refer to dbg->kmers
} fbg_kmer_t;
#define fbgkmer_hashcode(E) u64hashcode((E).mer)
#define fbgkmer_equals(E1, E2) ((E1).mer == (E2).mer)
define_hashset(fbgkmerh, fbg_kmer_t, fbgkmer_hashcode, fbgkmer_equals);
// Edge of the FBG graph.
typedef struct {
// node: index of the target k-mer in FBG.kmers; cov: coverage; most/key/select: flags
u4i node, cov:28, most:1, key:2, select:1;
u4i dist;
// head of this edge's chain of fbg_link_t in FBG.links (0 ends the chain)
u4i link;
u4i next;
} fbg_edge_t;
define_list(fbgedgev, fbg_edge_t);
// One supporting observation of an edge; links of the same edge are chained through `next`.
typedef struct {
// rlen is the length that hierarchical_clustering_edge_links clusters on
u4i ridx, roff:15, rlen:15, key:1, select:1;
// index of the next link of the same edge (0 ends the chain)
u4i next;
} fbg_link_t;
define_list(fbglinkv, fbg_link_t);
// Packed record of 16 + 30 + 17 + 1 bits, stored in FBG.mats.
// NOTE(review): field meanings are not visible in this part of the file; by name they
// look like read index, k-mer index and k-mer offset — confirm at the use sites.
typedef struct {
u8i ridx:16, kidx:30, koff:17, closed:1;
} rd_kmer_t;
define_list(rdkmerv, rd_kmer_t);
// Work record of hierarchical_clustering_edge_links: one entry per link of an edge.
typedef struct {
// lidx: index of the link in FBG.links; len: the link's rlen; gidx: cluster id assigned by the clustering
u4i lidx, len, gidx;
} link_grp_t;
define_list(linkgrpv, link_grp_t);
// Container of the FBG graph; allocated in init_cns and released in free_cns.
typedef struct {
// nodes, hashed by k-mer value
fbgkmerh *kmers;
// edges; slot 0 is a zero-filled placeholder (see init_cns / reset_cns)
fbgedgev *edges;
// links; slot 0 is a zero-filled placeholder, so index 0 can end a chain
fbglinkv *links;
// scratch list for link clustering
linkgrpv *grps;
rdkmerv *mats;
u1v *starseq;
} FBG;
// Lowest DP score; initial value of CNS.max_score and of every CNS.qmaxs entry.
// Parenthesized so the unary minus always stays attached to the constant.
#define DBGCNS_DP_SCORE_MIN (-0x7FFFFFFF)
// Values of dbgcns_dp_t.path (2-bit field): how a DP state was reached.
#define DBGCNS_PATH_M 0
#define DBGCNS_PATH_X 1
#define DBGCNS_PATH_I 2
#define DBGCNS_PATH_D 3
// Return codes of dbg_cns_core.
#define DBGCNS_CNS_NON 0
#define DBGCNS_CNS_TIP 1
#define DBGCNS_CNS_CUT 2
#define DBGCNS_CNS_EXT 3
#define DBGCNS_CNS_HIT 4
// One state of the best-first search over the de Bruijn graph.
typedef struct {
// (kidx, path, qpos) overlaid with `identifier`, which serves as the hash key of the state
union {
// kidx: index of the current k-mer in DBG.kmers; path: DBGCNS_PATH_*; qpos: query position
struct { uint32_t kidx:30, path:2; uint32_t qpos; };
uint64_t identifier;
};
int score;
// index of the predecessor state in CNS.dps (0 ends the traceback)
uint32_t bt_idx;
} dbgcns_dp_t;
define_list(dbgcnsdpv, dbgcns_dp_t);
// Hash set of uint32_t indices into a dbgcnsdpv; elements are hashed and compared by the
// `identifier` of the state stored at that index. DD(E) reads the state through the set's
// userdata pointer (`set` is expected to be in scope wherever define_hashset expands these macros).
#define DD(E) ((dbgcnsdpv*)set->userdata)->buffer[E]
#define dp_hashcode(E) u64hashcode(DD(E).identifier)
// parenthesized (like fbgkmer_equals) so the comparison survives any surrounding operator
#define dp_hashequals(E1, E2) (DD(E1).identifier == DD(E2).identifier)
define_hashset(dbgcnsdphash, uint32_t, dp_hashcode, dp_hashequals);
#undef DD
// One input sequence inside CNS.qseqs: start offset, length and the caller-supplied `solid` flag.
typedef struct {uint64_t off:40, len:23, solid:1;} blk_t;
define_list(blkv, blk_t);
// State of one consensus run: the graphs, the scoring parameters, the input sequences
// and the working data of the best-first search.
typedef struct {
DBG *g;
FBG *fbg;
// Scoring, as used by dbg_cns_core: M match, X mismatch, I / D gap open, E gap extend,
// H bonus for edges whose coverage exceeds the cutoff, L divisor turning avg_cov into that cutoff.
// C is set to 1 by init_cns.
int C, M, X, I, D, E, H, L;
// Z scales X in the per-position score bound (qmaxs); W is how far a state may trail qtop
// before it is restricted to match-only extension or dropped from the heap.
int Z, W;
// concatenated 2-bit-coded input sequences and their block descriptors
u1v *qseqs;
blkv *qblks;
// query of the current run_core_cns call (not owned)
uint8_t *qry;
uint32_t qlen;
u4i qidx;
// ceil(total added bases / reflen), computed by ready_core_cns
int avg_cov;
// all DP states created so far; slot 0 is a zeroed placeholder so that index 0 can mean "none"
dbgcnsdpv *dps;
// priority queue of indices into dps, ordered by score
u4v *heap;
// set of expanded states, keyed by dbgcns_dp_t.identifier
dbgcnsdphash *hash;
// per query position: score bound below which states are limited to match-only extension
b4v *qmaxs;
// furthest query position reached so far
uint32_t qtop;
int max_score;
// index of the best finished state in dps (0 = none)
uint32_t best_idx;
// results of run_core_cns: consensus as text, as 2-bit codes, and the path operations
String *seq;
u1v *cns;
u1v *cigars;
// number of path operations per DBGCNS_PATH_* value in the result
int alns[4];
} CNS;
/*
 * Allocate an empty de Bruijn graph for k-mers of length ksize.
 *
 * Terminates the process with exit(1) when ksize exceeds DBGCNS_KMER_MAX_SIZE or when
 * the graph structure cannot be allocated.
 * Slot 0 of the k-mer vector is reserved and zeroed: it is the scratch entry through
 * which lookup keys are handed to the hash set.
 * The caller releases the graph with free_dbg.
 */
static inline DBG* init_dbg(uint32_t ksize){
	DBG *g;
	if(ksize > DBGCNS_KMER_MAX_SIZE){
		fprintf(stderr, " -- ksize MUST be no greater than %d, but is %u in %s -- %s:%d --\n", DBGCNS_KMER_MAX_SIZE, (unsigned)ksize, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
		exit(1);
	}
	g = malloc(sizeof *g);
	if(g == NULL){
		fprintf(stderr, " -- out of memory in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
		exit(1);
	}
	g->hz = 0;
	g->ksize = ksize;
	// keep the low 2 * ksize bits; ksize == 0 would shift by the full width (undefined), so it gets an empty mask
	g->kmask = ksize? (0xFFFFFFFFFFFFFFFFLLU >> ((32 - ksize) << 1)) : 0;
	g->kmers = init_dbgcnskmerv(32);
	next_ref_dbgcnskmerv(g->kmers);
	memset(g->kmers->buffer, 0, sizeof(dbgcns_kmer_t));
	g->khash = init_dbgcnskmerhash(1023);
	set_userdata_dbgcnskmerhash(g->khash, g->kmers);
	g->zseq = init_u1v(64);
	return g;
}
/*
 * Release a graph created by init_dbg: its hash set, its k-mer vector, its
 * compression buffer and finally the structure itself.
 */
static inline void free_dbg(DBG *g){
	free_dbgcnskmerhash(g->khash);
	free_dbgcnskmerv(g->kmers);
	free_u1v(g->zseq);
	free(g);
}
/*
 * Empty the graph for reuse: drop all k-mers, re-create the zeroed scratch
 * entry in slot 0 and clear the hash set. Allocations are kept.
 */
static inline void clear_dbg(DBG *g){
	dbgcnskmerv *kmers = g->kmers;
	clear_dbgcnskmerv(kmers);
	next_ref_dbgcnskmerv(kmers);
	memset(&kmers->buffer[0], 0, sizeof(dbgcns_kmer_t));
	clear_dbgcnskmerhash(g->khash);
}
/*
 * Insert one k-mer occurrence into the graph, or update the existing record.
 *
 * kmer    packed k-mer
 * ridx    index of the sequence it comes from; ridx + 1 is stored in `visit`, and the
 *         node coverage is bumped only when the stored tag differs from it
 * fbase   base following the k-mer (>= 4: none), counted in edges[0]
 * rbase   base preceding the k-mer (>= 4: none), counted complemented in edges[1]
 * inc     non-zero: counters increase (saturating); zero: counters are only raised from 0 to 1
 *
 * Returns non-zero when the k-mer was already present.
 * The canonical-strand branch is compiled out (if(0)), so `dir` is always 0.
 */
static inline int add_kmer_dbg(DBG *g, u8i kmer, u4i ridx, uint8_t fbase, uint8_t rbase, int inc){
	dbgcns_kmer_t *rec;
	u8i krev;
	uint32_t *slot;
	uint8_t *cnt;
	int dir, exists;
	dir = 0;
	if(0){
		krev = dna_rev_seq(kmer, g->ksize);
		if(!(kmer < krev)){ kmer = krev; dir = 1; }
	}
	// hand the key to the hash set through scratch slot 0
	g->kmers->buffer[0].mer = kmer;
	slot = prepare_dbgcnskmerhash(g->khash, 0, &exists);
	if(!exists){
		*slot = g->kmers->size;
		rec = next_ref_dbgcnskmerv(g->kmers);
		memset(rec, 0, sizeof(dbgcns_kmer_t));
		rec->mer = kmer;
		rec->cov = 0;
		rec->visit = 0;
	} else {
		rec = ref_dbgcnskmerv(g->kmers, *slot);
	}
	if(rec->visit != ridx + 1 && rec->cov < DBGCNS_KMER_MAX_NODE_COV && (inc || rec->cov == 0)){
		rec->cov ++;
	}
	rec->visit = ridx + 1;
	if(dir){
		if(fbase < 4){
			fbase = (~fbase) & 0x03;
			cnt = &rec->edges[1][fbase];
			if(*cnt < DBGCNS_KMER_MAX_EDGE_COV && (inc || *cnt == 0)) (*cnt) ++;
		}
		if(rbase < 4){
			cnt = &rec->edges[0][rbase];
			if(*cnt < DBGCNS_KMER_MAX_EDGE_COV && (inc || *cnt == 0)) (*cnt) ++;
		}
	} else {
		if(fbase < 4){
			cnt = &rec->edges[0][fbase];
			if(*cnt < DBGCNS_KMER_MAX_EDGE_COV && (inc || *cnt == 0)) (*cnt) ++;
		}
		if(rbase < 4){
			rbase = (~rbase) & 0x03;
			cnt = &rec->edges[1][rbase];
			if(*cnt < DBGCNS_KMER_MAX_EDGE_COV && (inc || *cnt == 0)) (*cnt) ++;
		}
	}
	return exists;
}
/*
 * Homopolymer-compress seq[0..len) into g->zseq: every run of identical codes is
 * reduced to a single code. g->zseq is cleared first.
 */
static inline void homopolymer_compress_dbg(DBG *g, u1i *seq, u4i len){
	u4i i;
	u1i b;
	clear_u1v(g->zseq);
	b = 4; // 4 is outside the base codes 0..3, so the first base is always emitted
	for(i=0;i<len;i++){
		if(seq[i] == b) continue;
		b = seq[i];
		push_u1v(g->zseq, b);
	}
}
/*
 * Add every k-mer of a sequence to the graph.
 *
 * ridx     index of the sequence, forwarded to add_kmer_dbg
 * seq,len  2-bit base codes; replaced by their homopolymer-compressed form when g->hz is set
 * cov_inc  forwarded to add_kmer_dbg as `inc`
 *
 * Each k-mer is added together with the base that follows it and the base that
 * precedes it; 4 is passed where the sequence has no such base.
 */
static inline void add_seq_dbg(DBG *g, u4i ridx, uint8_t *seq, uint32_t len, int cov_inc){
	u8i kmer;
	uint32_t i;
	uint8_t b, f, r;
	if(g->hz){
		homopolymer_compress_dbg(g, seq, len);
		seq = g->zseq->buffer;
		len = g->zseq->size;
	}
	kmer = 0;
	for(i=0;i<len;){
		b = seq[i];
		kmer = ((kmer << 2) | b) & g->kmask;
		i ++; // from here on i is the index just past the current k-mer
		if(i < g->ksize) continue;
		f = (i < len)? seq[i] : 4;
		r = (i > g->ksize)? seq[i - g->ksize] : 4;
		add_kmer_dbg(g, kmer, ridx, f, r, cov_inc);
	}
}
/*
 * Score how well a sequence is supported by the graph.
 *
 * seq,len  2-bit base codes; replaced by their homopolymer-compressed form when g->hz is set
 * uid      tag written into the `visit` field of every k-mer found (reduced modulo
 *          DBGCNS_MAX_UID first), so repeated k-mers of the same sequence are recognised
 *
 * For every k-mer of the sequence that exists in the graph: -1 if it was already tagged
 * with uid, cov + 1 if its coverage is above 1, -1 otherwise. K-mers that are not in
 * the graph contribute nothing. Returns the sum.
 */
static inline int kmer_cov_seq_dbg(DBG *g, u1i *seq, u4i len, u4i uid){
	dbgcns_kmer_t *k;
	u8i kmer;
	uint32_t i, *u;
	uint8_t b;
	int cov;
	uid = uid % DBGCNS_MAX_UID;
	if(g->hz){
		homopolymer_compress_dbg(g, seq, len);
		seq = g->zseq->buffer;
		len = g->zseq->size;
	}
	cov = 0;
	kmer = 0;
	for(i=0;i<len;){
		b = seq[i];
		kmer = ((kmer << 2) | b) & g->kmask;
		i ++;
		if(i < g->ksize) continue;
		// look the k-mer up through scratch slot 0
		g->kmers->buffer[0].mer = kmer;
		u = get_dbgcnskmerhash(g->khash, 0);
		if(u){
			k = ref_dbgcnskmerv(g->kmers, *u);
			if(k->visit == uid){ cov --; continue; }
			k->visit = uid;
			cov += k->cov > 1? k->cov + 1 : - 1;
		}
	}
	return cov;
}
/*
 * Dump all k-mers as tab-separated text, one per line: sequence, coverage, the four
 * edges[0] counters and the four edges[1] counters. Scratch slot 0 is skipped.
 */
static inline void print_kmers_dbg(DBG *g, FILE *out){
	dbgcns_kmer_t *k;
	uint64_t i;
	char seq[DBGCNS_KMER_MAX_SIZE + 1];
	for(i=1;i<g->kmers->size;i++){
		k = ref_dbgcnskmerv(g->kmers, i);
		kmer2seq(seq, k->mer, g->ksize);
		fprintf(out, "%s\t%d\t%d,%d,%d,%d\t%d,%d,%d,%d\n", seq, k->cov,
			k->edges[0][0], k->edges[0][1], k->edges[0][2], k->edges[0][3],
			k->edges[1][0], k->edges[1][1], k->edges[1][2], k->edges[1][3]);
	}
}
static inline CNS* init_cns(uint32_t ksize, int Z, int W, int M, int X, int I, int D, int E, int H, int L){
CNS *cns;
cns = malloc(sizeof(CNS));
cns->g = init_dbg(ksize);
cns->fbg = malloc(sizeof(FBG));
cns->fbg->kmers = init_fbgkmerh(1023);
cns->fbg->edges = init_fbgedgev(32);
memset(next_ref_fbgedgev(cns->fbg->edges), 0, sizeof(fbg_edge_t));
cns->fbg->links = init_fbglinkv(32);
memset(next_ref_fbglinkv(cns->fbg->links), 0, sizeof(fbg_link_t));
cns->fbg->grps = init_linkgrpv(32);
cns->fbg->mats = init_rdkmerv(32);
cns->fbg->starseq = init_u1v(32);
cns->qseqs = init_u1v(32);
cns->qblks = init_blkv(32);
cns->Z = Z;
cns->W = W;
cns->C = 1;
cns->M = M;
cns->X = X;
cns->I = I;
cns->D = D;
cns->E = E;
cns->H = H;
cns->L = L;
cns->qlen = 0;
cns->qidx = 0;
cns->avg_cov = 0;
cns->dps = init_dbgcnsdpv(32);
next_ref_dbgcnsdpv(cns->dps);
memset(cns->dps->buffer, 0, sizeof(dbgcns_dp_t)); // no need, it is always zero-filled
cns->heap = init_u4v(32);
cns->hash = init_dbgcnsdphash(1023);
set_userdata_dbgcnsdphash(cns->hash, cns->dps);
cns->qmaxs = init_b4v(32);
cns->qtop = 0;
cns->max_score = DBGCNS_DP_SCORE_MIN;
cns->best_idx = 0;
cns->seq = init_string(32);
cns->cns = init_u1v(32);
cns->cigars = init_u1v(32);
return cns;
}
/*
 * Release a context created by init_cns: the de Bruijn graph, every container of
 * the FBG graph, the input buffers, the search state, the result buffers and the
 * context itself. The query pointer (qry) is not owned and is left alone.
 */
static inline void free_cns(CNS *cns){
	FBG *fbg = cns->fbg;
	free_dbg(cns->g);
	// FBG containers, then the FBG itself
	free_fbgkmerh(fbg->kmers);
	free_fbgedgev(fbg->edges);
	free_fbglinkv(fbg->links);
	free_linkgrpv(fbg->grps);
	free_rdkmerv(fbg->mats);
	free_u1v(fbg->starseq);
	free(fbg);
	// input sequences
	free_u1v(cns->qseqs);
	free_blkv(cns->qblks);
	// search state
	free_dbgcnsdpv(cns->dps);
	free_u4v(cns->heap);
	free_dbgcnsdphash(cns->hash);
	free_b4v(cns->qmaxs);
	// results
	free_string(cns->seq);
	free_u1v(cns->cns);
	free_u1v(cns->cigars);
	free(cns);
}
/*
 * Prepare the context for a new set of input sequences: empties the de Bruijn graph,
 * the stored sequences and the FBG nodes, edges and links (re-creating the zeroed
 * slot 0 of the edge and link vectors), and detaches the current query.
 */
static inline void reset_cns(CNS *cns){
	FBG *fbg = cns->fbg;
	fbg_edge_t *edge0;
	fbg_link_t *link0;
	clear_dbg(cns->g);
	clear_u1v(cns->qseqs);
	clear_blkv(cns->qblks);
	clear_fbgkmerh(fbg->kmers);
	clear_fbgedgev(fbg->edges);
	edge0 = next_ref_fbgedgev(fbg->edges);
	memset(edge0, 0, sizeof(fbg_edge_t));
	clear_fbglinkv(fbg->links);
	link0 = next_ref_fbglinkv(fbg->links);
	memset(link0, 0, sizeof(fbg_link_t));
	cns->qry = NULL;
	cns->qlen = 0;
}
/*
 * Append one input sequence to the context.
 *
 * seq, len  sequence as text; each character is converted through base_bit_table
 * solid     stored in the block's `solid` flag
 *
 * A block descriptor (offset, length, solid) is added to qblks and the converted bases
 * are appended to qseqs.
 */
static inline void add_seq_cns(CNS *cns, char *seq, int len, int solid){
	blk_t *b;
	int i;
	b = next_ref_blkv(cns->qblks);
	b->off = cns->qseqs->size;
	b->len = len;
	b->solid = solid;
	// index with unsigned char: a plain (possibly signed) char would give a negative
	// index for bytes >= 0x80 (assumes base_bit_table covers all 256 byte values — confirm in dna.h)
	for(i=0;i<len;i++) push_u1v(cns->qseqs, base_bit_table[(unsigned char)seq[i]]);
}
// Currently a no-op: the argument is only marked as unused.
// The commented-out body below used to add every stored block to the de Bruijn graph;
// ready_core_cns in this file does that now (note that the disabled call still uses
// an older three-argument form of add_seq_dbg).
static inline void ready_cns(CNS *cns){
UNUSED(cns);
//blk_t *b;
//u4i i;
//for(i=0;i<cns->qblks->size;i++){
//b = ref_blkv(cns->qblks, i);
//add_seq_dbg(cns->g, cns->qseqs->buffer + b->off, b->len);
//}
}
/*
 * One step of the best-first search: take the best state from the heap and expand it
 * along the outgoing edges of its k-mer.
 *
 * Returns
 *   DBGCNS_CNS_NON  the heap is empty
 *   DBGCNS_CNS_HIT  the best state has consumed the whole query; it is left on the heap
 *                   for the caller to remove
 *   DBGCNS_CNS_CUT  the state was dropped (an equal-or-better state with the same
 *                   identifier was expanded before) or produced no successor
 *   DBGCNS_CNS_TIP  the state's k-mer has no outgoing edge
 *   DBGCNS_CNS_EXT  at least one successor state was pushed
 *
 * The heap comparator expressions refer to `a` and `b`, which are presumably the element
 * names used inside the array_heap_* macros — confirm in the macro definitions.
 */
static inline int dbg_cns_core(CNS *cns){
dbgcns_dp_t *dp, *dp2;
dbgcns_kmer_t *k;
u8i kmer, knew;
uint32_t i, dp_idx, kidx, *u;
int sum, cov, cut;
uint8_t b, q;
int exists, score, nadd, mat_only;
if(cns->heap->size == 0) return DBGCNS_CNS_NON;
dp_idx = cns->heap->buffer[0]; //array_heap_pop(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
// reserve room for all successors up front (at most 4 bases x 2 states + 1 insertion = 9),
// presumably so that `dp` stays valid while new states are appended
encap_dbgcnsdpv(cns->dps, 9);
dp = ref_dbgcnsdpv(cns->dps, dp_idx);
mat_only = 0;
if(dp->qpos >= cns->qlen) return DBGCNS_CNS_HIT;
if(dp->qpos + cns->W < cns->qtop){
// the state trails the search front by more than W: only matches may extend it
mat_only = 1;
//array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, 0, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
//return DBGCNS_CNS_CUT;
} else if(dp->qpos > cns->qtop){
// new search front: drop every queued state that now trails it by more than W
cns->qtop = dp->qpos;
for(i=cns->heap->size;i>0;i--){
if(cns->dps->buffer[cns->heap->buffer[i-1]].qpos + cns->W < cns->qtop){
array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, i - 1, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
}
}
}
// qmaxs[qpos] holds the best score seen at this query position plus Z * X
if(dp->score < cns->qmaxs->buffer[dp->qpos]){
mat_only = 1;
//array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, 0, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
//return DBGCNS_CNS_CUT;
} else if(dp->score + cns->Z * cns->X > cns->qmaxs->buffer[dp->qpos]){
cns->qmaxs->buffer[dp->qpos] = dp->score + cns->Z * cns->X;
}
// expand each (kidx, path, qpos) identifier only for its best-scoring state
u = prepare_dbgcnsdphash(cns->hash, dp_idx, &exists);
if(exists){
dp2 = ref_dbgcnsdpv(cns->dps, *u);
if(dp->score > dp2->score){
*u = dp_idx;
} else {
array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, 0, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
return DBGCNS_CNS_CUT;
}
} else {
*u = dp_idx;
}
// pop the state that is being expanded
array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, 0, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
k = ref_dbgcnskmerv(cns->g->kmers, dp->kidx);
kmer = k->mer;
q = cns->qry[dp->qpos];
sum = k->edges[0][0] + k->edges[0][1] + k->edges[0][2] + k->edges[0][3];
if(sum == 0) return DBGCNS_CNS_TIP;
// coverage cutoff: ceil(avg_cov / L), at least 1
cut = num_max((cns->avg_cov + cns->L - 1) / cns->L, 1);
//cut = num_max((cns->avg_cov + 2) / 3, 1);
nadd = 0;
for(b=0;b<4;b++){
if((cov = k->edges[0][b]) == 0) continue;
if(b == q){
score = dp->score + cns->M;
} else if(mat_only){
continue;
} else {
score = dp->score + cns->X;
}
//score += (cov > 1)? (cov >= cut? 1 : 0) : -1;
// well-covered edges earn H (plus 1 on a match); weak edges are charged their shortfall
score += (cov > cut)? (cns->H + (b == q? 1 : 0)) : cov - cut;
//score += (cov > cut)? cns->H : cov - cut;
knew = ((kmer << 2) | b) & cns->g->kmask;
cns->g->kmers->buffer[0].mer = knew;
// NOTE(review): the lookup result is dereferenced without a NULL check (run_core_cns does check its
// own lookup); this relies on every counted edge leading to a stored k-mer — confirm
u = get_dbgcnskmerhash(cns->g->khash, 0);
kidx = *u;
// match / mismatch: consume one query base and one graph base
dp2 = next_ref_dbgcnsdpv(cns->dps);
dp2->kidx = kidx;
dp2->path = (b == q)? DBGCNS_PATH_M : DBGCNS_PATH_X;
dp2->qpos = dp->qpos + 1;
dp2->score = score;
dp2->bt_idx = dp_idx;
array_heap_push(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, cns->dps->size - 1, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
nadd ++;
// deletion
// (consumes a graph base only; not allowed directly after an insertion)
if(dp->path != DBGCNS_PATH_I){
dp2 = next_ref_dbgcnsdpv(cns->dps);
score = dp->score + (cns->E + (dp->path == DBGCNS_PATH_D? 0 : cns->D));
if(dp->path != DBGCNS_PATH_D) score += (cov > cut)? 0 : cov - cut;
else score += (cov > cut)? cns->H : cov - cut;
dp2->kidx = kidx;
dp2->path = DBGCNS_PATH_D;
dp2->qpos = dp->qpos;
dp2->score = score;
dp2->bt_idx = dp_idx;
array_heap_push(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, cns->dps->size - 1, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
nadd ++;
}
}
// insertion
// (consumes a query base only and stays on the same k-mer; not allowed directly after a deletion)
if(mat_only == 0 && dp->path != DBGCNS_PATH_D){
dp2 = next_ref_dbgcnsdpv(cns->dps);
score = dp->score + (cns->E + (dp->path == DBGCNS_PATH_I? 0 : cns->I));
//if(dp->qpos && cns->qry[dp->qpos] == cns->qry[dp->qpos - 1]) score += 1; // homopolymer merge
score += 1 - cut;
dp2->kidx = dp->kidx;
dp2->path = DBGCNS_PATH_I;
dp2->qpos = dp->qpos + 1;
dp2->score = score;
dp2->bt_idx = dp_idx;
array_heap_push(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, cns->dps->size - 1, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
nadd ++;
}
return nadd? DBGCNS_CNS_EXT : DBGCNS_CNS_CUT;
}
/*
 * Build the de Bruijn graph from the stored blocks and derive the average coverage.
 *
 * candidate_mode  when 3, block 0 is held back from the main pass and added afterwards
 *                 with cov_inc = 0 (its k-mers only raise counters that are still 0);
 *                 for any other value all blocks are added normally
 * reflen          reference length used as the divisor; must not be 0
 *
 * Sets cns->avg_cov = ceil(total added bases / reflen).
 */
static inline void ready_core_cns(CNS *cns, int candidate_mode, int reflen){
	blk_t *b;
	u4i i;
	int tot;
	if(candidate_mode == 3) i = 1;
	else i = 0;
	tot = 0;
	for(;i<cns->qblks->size;i++){
		b = ref_blkv(cns->qblks, i);
		add_seq_dbg(cns->g, i, cns->qseqs->buffer + b->off, b->len, 1);
		tot += b->len;
	}
	if(candidate_mode == 3){
		// i equals qblks->size here and serves as the sequence index of block 0
		b = ref_blkv(cns->qblks, 0);
		add_seq_dbg(cns->g, i, cns->qseqs->buffer + b->off, b->len, 0);
		tot += b->len;
	}
	cns->avg_cov = (tot + reflen - 1) / reflen;
}
/*
 * Correct a query against the de Bruijn graph and store the result in the context.
 *
 * qry, qlen  query as 2-bit base codes; the pointer is kept in cns->qry
 *
 * The search is seeded with the first k-mer of the query and driven by dbg_cns_core
 * until the heap is empty; among the states that consumed the whole query the
 * best-scoring one is traced back.
 *
 * Results: cns->seq (text; lower case for bases taken from k-mers with coverage 1),
 * cns->cns (2-bit codes), cns->cigars (one DBGCNS_PATH_* per step) and cns->alns
 * (count per path type).
 *
 * Returns the length of cns->seq, or 0 when the query is shorter than k, its first
 * k-mer is not in the graph, or no state reached the end of the query.
 */
static inline int run_core_cns(CNS *cns, uint8_t *qry, uint32_t qlen){
	dbgcns_dp_t *dp;
	dbgcns_kmer_t *k;
	u8i kmer; // up to 42 bits (k <= 21), so a 32-bit variable would truncate it
	uint32_t i, kidx, dp_idx, *u;
	int status;
	{
		cns->qry = qry;
		cns->qlen = qlen;
		// reset auxiliaries
		clear_dbgcnsdpv(cns->dps);
		next_ref_dbgcnsdpv(cns->dps);
		memset(cns->dps->buffer, 0, sizeof(dbgcns_dp_t));
		clear_u4v(cns->heap);
		clear_dbgcnsdphash(cns->hash);
		clear_b4v(cns->qmaxs);
		for(i=0;i<qlen;i++) push_b4v(cns->qmaxs, DBGCNS_DP_SCORE_MIN);
		cns->qtop = 0;
		cns->max_score= DBGCNS_DP_SCORE_MIN;
		cns->best_idx = 0;
		clear_u1v(cns->cns);
		clear_string(cns->seq);
		clear_u1v(cns->cigars);
		cns->alns[0] = cns->alns[1] = cns->alns[2] = cns->alns[3] = 0;
	}
	// the seed needs a full k-mer; do not read past the end of a shorter query
	if(qlen < cns->g->ksize) return 0;
	// set first kmer
	kmer = 0;
	for(cns->qtop=0;cns->qtop<cns->g->ksize;cns->qtop++){
		kmer = (kmer << 2) | qry[cns->qtop];
	}
	cns->g->kmers->buffer[0].mer = kmer;
	u = get_dbgcnskmerhash(cns->g->khash, 0);
	if(u == NULL) return 0;
	kidx = *u;
	dp = next_ref_dbgcnsdpv(cns->dps);
	dp->kidx = kidx;
	dp->path = DBGCNS_PATH_M;
	dp->qpos = cns->qtop;
	dp->score = 0;
	dp->bt_idx = 0;
	array_heap_push(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, cns->dps->size - 1, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
	// dbg traversing
	while(cns->heap->size){
		status = dbg_cns_core(cns);
		if(status == DBGCNS_CNS_HIT){
			// a finished state stays on top of the heap: remove it here and keep the best one
			dp_idx = cns->heap->buffer[0];
			array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, 0, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
			dp = ref_dbgcnsdpv(cns->dps, dp_idx);
			if(dp->score > cns->max_score){
				cns->max_score = dp->score;
				cns->best_idx = dp_idx;
			}
		}
	}
	if(cns->best_idx == 0) return 0;
	// traceback to get cns seq
	dp_idx = cns->best_idx;
	while(dp_idx){
		dp = ref_dbgcnsdpv(cns->dps, dp_idx);
		push_u1v(cns->cigars, dp->path);
		cns->alns[dp->path] ++;
		// insertions consume no graph base, so they emit nothing
		if(dp->path != DBGCNS_PATH_I){
			k = ref_dbgcnskmerv(cns->g->kmers, dp->kidx);
			push_u1v(cns->cns, k->mer & 0x03);
			if(k->cov == 1){
				add_char_string(cns->seq, "acgt"[k->mer & 0x03]);
			} else {
				add_char_string(cns->seq, "ACGT"[k->mer & 0x03]);
			}
		}
		dp_idx = dp->bt_idx;
	}
	// first ksize - 1 bases may be not corrected, truncated
	reverse_string(cns->seq);
	reverse_u1v(cns->cns);
	reverse_u1v(cns->cigars);
	return cns->seq->size;
}
/*
 * Group the links of edge `e` by length.
 *
 * The links are copied into `grps`, sorted by ascending length and swept
 * once: a new group starts whenever the current length, or the first length
 * of the running group, deviates from the running group average by more
 * than `max_var` times that average.
 *
 * On return each grps->buffer[i].gidx holds the group number (0-based).
 * Returns the number of groups, or 0 when the edge has no links.
 */
static inline int hierarchical_clustering_edge_links(FBG *fbg, fbg_edge_t *e, linkgrpv *grps, double max_var){
	fbg_link_t *lnk;
	u4i lidx, gidx;
	u4i i, b;
	double sum, avg;
	clear_linkgrpv(grps);
	// link index 0 terminates the list
	lidx = e->link;
	while(lidx){
		lnk = ref_fbglinkv(fbg->links, lidx);
		push_linkgrpv(grps, (link_grp_t){lidx, lnk->rlen, 0});
		lidx = lnk->next;
	}
	if(grps->size == 0) return 0;
	sort_array(grps->buffer, grps->size, link_grp_t, num_cmpgt(a.len, b.len));
	gidx = 0;
	b = 0; // index of the first member of the current group
	sum = avg = grps->buffer[0].len;
	for(i=1;i<grps->size;i++){
		if(grps->buffer[i].len - avg > avg * max_var){
			// too far above the current group: open a new one
			gidx ++;
			sum = avg = grps->buffer[i].len;
			b = i;
		} else {
			sum += grps->buffer[i].len;
			avg = sum / (i - b + 1);
			// re-check both ends against the updated average
			if(avg - grps->buffer[b].len > avg * max_var || grps->buffer[i].len - avg > avg * max_var){
				gidx ++;
				sum = avg = grps->buffer[i].len;
				b = i;
			}
		}
		grps->buffer[i].gidx = gidx;
	}
	return gidx + 1;
}
/*
 * Split edge `eidx` (leaving k-mer `kidx`) into one edge per link-length
 * group, as computed by hierarchical_clustering_edge_links().
 *
 * The original edge is reused for the first group; further groups get new
 * edges that are chained right after it. Each resulting edge receives the
 * group's size as cov and the rounded mean link length as dist. The single
 * edge with the strictly largest cov (if there is no tie) is flagged `most`.
 * An edge keeps key == 1 only if one of its links is a key link; otherwise a
 * previously keyed edge becomes key == 2.
 *
 * Returns the number of groups processed (0 when the edge has no links).
 */
static inline int revise_edge_fbg(FBG *fbg, u4i kidx, u4i eidx, linkgrpv *grps, double max_var){
	fbg_kmer_t *k;
	fbg_edge_t *e, *p;
	fbg_link_t *lnk;
	u4i eidx2, eidx3;
	u4i i, b, j, ng, avg, gidx;
	u4i max_cov, max_eidx, key;
	double sum;
	e = ref_fbgedgev(fbg->edges, eidx);
	ng = hierarchical_clustering_edge_links(fbg, e, grps, max_var);
	// nothing to split; also avoids the unsigned wrap of ng - 1 below
	if(ng == 0) return 0;
	// reserve room first so that pointers taken below stay valid
	encap_fbgedgev(fbg->edges, ng - 1);
	k = ref_fbgkmerh(fbg->kmers, kidx);
	e = ref_fbgedgev(fbg->edges, eidx);
	e->key = key = e->key? 2 : 0;
	e->most = 0;
	if(DBGCNS_DEBUG){
		if(ng > 1){
			fprintf(stderr, "REVISE K%d -> K%d cov = %d into %d edges --\n", k->off, ref_fbgkmerh(fbg->kmers, e->node)->off, e->cov, ng); fflush(stderr);
		}
	}
	// the target node gains one incoming edge per extra group
	ref_fbgkmerh(fbg->kmers, e->node)->n_in += ng - 1;
	p = e;
	p->link = 0;
	eidx3 = e->next; // original successor, re-attached behind every new edge
	gidx = 0;
	sum = 0;
	max_cov = 0;
	max_eidx = 0;
	// i runs one past the end so that the last group is flushed too
	for(i=b=0;i<=grps->size;i++){
		if(i == grps->size || grps->buffer[i].gidx != gidx){
			avg = sum / (i - b) + 0.5;
			if(p == NULL){
				eidx2 = fbg->edges->size;
				p = next_ref_fbgedgev(fbg->edges);
				p->node = e->node;
				e->next = eidx2;
				p->next = eidx3;
			}
			p->link = 0;
			p->cov = i - b;
			p->dist = avg;
			p->key = key;
			p->select = 0;
			p->most = 0;
			if(p->cov > max_cov){
				max_cov = p->cov;
				max_eidx = offset_fbgedgev(fbg->edges, p);
			} else if(p->cov == max_cov){
				// tie: no edge is the unique "most" one
				max_eidx = 0;
			}
			if(DBGCNS_DEBUG){
				if(ng > 1){
					fprintf(stderr, "+ %d %d --\n", p->cov, p->dist); fflush(stderr);
				}
			}
			// move the links of group [b, i) onto edge p
			for(j=b;j<i;j++){
				lnk = ref_fbglinkv(fbg->links, grps->buffer[j].lidx);
				lnk->next = p->link;
				p->link = grps->buffer[j].lidx;
				if(lnk->key) p->key = 1;
			}
			e = p;
			p = NULL;
			b = i;
			gidx ++;
			sum = 0;
		}
		if(i < grps->size) sum += grps->buffer[i].len;
	}
	if(max_eidx){
		ref_fbgedgev(fbg->edges, max_eidx)->most = 1;
	}
	return gidx;
}
/*
 * Recompute cov and dist of edge `e` from its links, ignoring outliers.
 *
 * Pass 1 gathers mean and standard deviation of the link lengths. The
 * tolerance is the standard deviation, replaced by 0.2 * (longest link)
 * when it is both > 10 and > 20% of the mean, and never below 1.
 * Pass 2 keeps the links within that tolerance of the mean: their count
 * becomes e->cov and their rounded mean length becomes e->dist.
 * An edge without links is left untouched.
 */
static inline void revise_edge_cov_fbg(FBG *fbg, fbg_edge_t *e){
	fbg_link_t *link;
	u4i cur, n_links, n_kept;
	double total, sq_total, longest, mean, tol, kept_total;
	total = 0;
	sq_total = 0;
	longest = 0;
	n_links = 0;
	for(cur=e->link;cur;cur=link->next){
		link = ref_fbglinkv(fbg->links, cur);
		if((double)link->rlen > longest) longest = link->rlen;
		total += link->rlen;
		sq_total += link->rlen * link->rlen;
		n_links ++;
	}
	if(n_links == 0) return;
	mean = total / n_links;
	tol = sqrt(sq_total / n_links - mean * mean);
	if(tol > 10 && tol > 0.2 * mean) tol = 0.2 * longest;
	if(tol < 1) tol = 1;
	n_kept = 0;
	kept_total = 0;
	for(cur=e->link;cur;cur=link->next){
		link = ref_fbglinkv(fbg->links, cur);
		if(num_diff((double)link->rlen, mean) <= tol){
			kept_total += link->rlen;
			n_kept ++;
		}
	}
	// avoid dividing by zero when every link was rejected
	if(n_kept == 0) n_kept = 1;
	mean = kept_total / n_kept;
	e->cov = n_kept;
	e->dist = mean + 0.5;
}
/*
 * Build the "direct fuzzy Bruijn graph" (cns->fbg) around read `ridx`.
 *
 * 1. Pick anchor k-mers on read `ridx`:
 *      cov_cutoff != 0: every k-mer of the read whose graph coverage is
 *                       >= cov_cutoff; a k-mer seen twice is closed.
 *      cov_cutoff == 0: k-mers with coverage >= 4 that occur once on the
 *                       read, keeping the best-covered one per 10 bp window.
 * 2. Locate the open anchors on every read (cns->fbg->mats); an anchor that
 *    occurs twice on one read is closed.
 * 3. For each read, connect every open anchor to the next open anchor that
 *    lies further along the reference, creating edges and per-read links.
 * 4. For non-reference reads, add links that skip 1..4 intermediate anchors,
 *    but only onto edges that already exist.
 * 5. Split every edge by link length via revise_edge_fbg(max_dist_var).
 *
 * Edge/link index 0 is used as the list terminator throughout.
 */
static inline void build_DirectFuzzyBruijnGraph(CNS *cns, u4i ridx, u4i cov_cutoff, double max_dist_var){
	DBG *g;
	dbgcns_kmer_t *k;
	fbg_kmer_t A, *a;
	fbg_edge_t *e;
	fbg_link_t *l;
	rd_kmer_t *rk;
	u8i kmer;
	u4i r, rr, i, j, beg, end, c, len, idx1, off1, idx2, off2, *u;
	u4i eidx;
	u1i b, *seq;
	int exists;
	g = cns->g;
	// select high cov kmers
	if(cov_cutoff){
		seq = cns->qseqs->buffer + cns->qblks->buffer[ridx].off;
		len = cns->qblks->buffer[ridx].len;
		memset(&A, 0, sizeof(fbg_kmer_t));
		kmer = 0;
		j = 0x0000FFFFU;
		for(i=0;i<len;){
			// roll the k-mer one base forward
			b = seq[i];
			kmer = ((kmer << 2) | b) & g->kmask;
			i ++;
			if(i < g->ksize) continue;
			// kmers->buffer[0] is the probe key for the hash lookup
			g->kmers->buffer[0].mer = kmer;
			u = get_dbgcnskmerhash(g->khash, 0);
			if(u == NULL) continue;
			k = ref_dbgcnskmerv(g->kmers, *u);
			if(k->cov < cov_cutoff) continue;
			//if(j + 1 == i){ j = i; continue; }
			A.mer = kmer;
			a = prepare_fbgkmerh(cns->fbg->kmers, A, &exists);
			if(exists){
				// repeated on the reference read: unusable as anchor
				a->closed = 1;
			} else {
				a->mer = kmer;
				a->off = i - g->ksize;
				a->closed = 0;
				a->n_in = 0;
				a->n_visit = 0;
				a->edges = 0;
				a->ptr = *u;
				j = i;
			}
		}
	} else {
		// select best cov per 10 bp, but cov >= 4
		clear_rdkmerv(cns->fbg->mats);
		seq = cns->qseqs->buffer + cns->qblks->buffer[ridx].off;
		len = cns->qblks->buffer[ridx].len;
		memset(&A, 0, sizeof(fbg_kmer_t));
		kmer = 0;
		for(i=0;i<len;){
			b = seq[i];
			kmer = ((kmer << 2) | b) & g->kmask;
			i ++;
			if(i < g->ksize) continue;
			g->kmers->buffer[0].mer = kmer;
			u = get_dbgcnskmerhash(g->khash, 0);
			if(u == NULL) continue;
			if(g->kmers->buffer[*u].cov < 4) continue;
			push_rdkmerv(cns->fbg->mats, (rd_kmer_t){ridx, *u, i - g->ksize, 0});
		}
		// bring equal k-mers next to each other, then close the duplicates
		sort_array(cns->fbg->mats->buffer, cns->fbg->mats->size, rd_kmer_t, num_cmpgtx(g->kmers->buffer[b.kidx].cov, g->kmers->buffer[a.kidx].cov, a.kidx, b.kidx));
		for(i=1;i<cns->fbg->mats->size;i++){
			rk = ref_rdkmerv(cns->fbg->mats, i);
			if(rk->kidx == ref_rdkmerv(cns->fbg->mats, i - 1)->kidx){
				rk->closed = 1;
				ref_rdkmerv(cns->fbg->mats, i - 1)->closed = 1;
			}
		}
		sort_array(cns->fbg->mats->buffer, cns->fbg->mats->size, rd_kmer_t, num_cmpgt(a.koff, b.koff));
		beg = 0;
		j = 0xFFFFFFFFU; // best candidate of the current window, none yet
		for(i=0;i<cns->fbg->mats->size;i++){
			rk = ref_rdkmerv(cns->fbg->mats, i);
			if(rk->closed) continue;
			if(j == 0xFFFFFFFFU) j = i;
			if(rk->koff - beg < 10){
				if(g->kmers->buffer[rk->kidx].cov > g->kmers->buffer[cns->fbg->mats->buffer[j].kidx].cov){
					j = i;
				}
				continue;
			}
			// window finished: emit its best k-mer, then revisit entry i
			rk = ref_rdkmerv(cns->fbg->mats, j);
			beg = rk->koff;
			j = i;
			i --;
			k = ref_dbgcnskmerv(g->kmers, rk->kidx);
			kmer = k->mer;
			A.mer = kmer;
			a = prepare_fbgkmerh(cns->fbg->kmers, A, &exists);
			if(exists){
				a->closed = 1;
			} else {
				a->mer = kmer;
				a->off = rk->koff;
				a->closed = 0;
				a->n_in = 0;
				a->n_visit = 0;
				a->edges = 0;
				a->ptr = rk->kidx;
			}
		}
	}
	// locate the open anchors on every read
	clear_rdkmerv(cns->fbg->mats);
	for(r=0;r<cns->qblks->size;r++){
		seq = cns->qseqs->buffer + cns->qblks->buffer[r].off;
		len = cns->qblks->buffer[r].len;
		memset(&A, 0, sizeof(fbg_kmer_t));
		kmer = 0;
		for(i=0;i<len;){
			b = seq[i];
			kmer = ((kmer << 2) | b) & g->kmask;
			i ++;
			if(i < g->ksize) continue;
			A.mer = kmer;
			a = get_fbgkmerh(cns->fbg->kmers, A);
			if(a == NULL) continue;
			if(a->closed) continue;
			push_rdkmerv(cns->fbg->mats, (rd_kmer_t){r, offset_fbgkmerh(cns->fbg->kmers, a), i - g->ksize, 0});
		}
	}
	// an anchor hit twice by the same read is ambiguous: close it everywhere
	sort_array(cns->fbg->mats->buffer, cns->fbg->mats->size, rd_kmer_t, num_cmpgtx(a.ridx, b.ridx, a.kidx, b.kidx));
	for(i=1;i<cns->fbg->mats->size;i++){
		if(cns->fbg->mats->buffer[i-1].ridx == cns->fbg->mats->buffer[i].ridx && cns->fbg->mats->buffer[i-1].kidx == cns->fbg->mats->buffer[i].kidx){
			ref_fbgkmerh(cns->fbg->kmers, cns->fbg->mats->buffer[i].kidx)->closed = 1;
		}
	}
	for(i=0;i<cns->fbg->mats->size;i++){
		if(ref_fbgkmerh(cns->fbg->kmers, cns->fbg->mats->buffer[i].kidx)->closed){
			cns->fbg->mats->buffer[i].closed = 1;
		}
	}
	sort_array(cns->fbg->mats->buffer, cns->fbg->mats->size, rd_kmer_t, num_cmpgtx(a.ridx, b.ridx, a.koff, b.koff));
	// add edges
	for(i=0;i+1<cns->fbg->mats->size;i++){
		if(cns->fbg->mats->buffer[i].closed) continue;
		idx1 = cns->fbg->mats->buffer[i].kidx;
		off1 = cns->fbg->mats->buffer[i].koff;
		a = ref_fbgkmerh(cns->fbg->kmers, idx1);
		// c<1: stop after the first acceptable successor on the same read
		for(j=i+1,c=0;j<cns->fbg->mats->size&&c<1;j++){
			if(cns->fbg->mats->buffer[j].ridx != cns->fbg->mats->buffer[i].ridx) break;
			if(cns->fbg->mats->buffer[j].closed) continue;
			idx2 = cns->fbg->mats->buffer[j].kidx;
			off2 = cns->fbg->mats->buffer[j].koff;
			// only connect forward along the reference
			if(a->off >= ref_fbgkmerh(cns->fbg->kmers, idx2)->off) continue;
			c ++;
			eidx = a->edges;
			while(eidx){
				e = ref_fbgedgev(cns->fbg->edges, eidx);
				if(e->node == idx2) break;
				eidx = e->next;
			}
			if(eidx == 0){
				eidx = cns->fbg->edges->size;
				e = next_ref_fbgedgev(cns->fbg->edges);
				e->node = idx2;
				e->link = 0;
				e->next = a->edges;
				e->cov = 0;
				e->dist = 0;
				e->key = 0;
				e->select = 0;
				a->edges = eidx;
				ref_fbgkmerh(cns->fbg->kmers, idx2)->n_in ++;
			}
			e = ref_fbgedgev(cns->fbg->edges, eidx);
			if(cns->fbg->mats->buffer[i].ridx == ridx) e->key = 1;
			e->cov ++;
			l = next_ref_fbglinkv(cns->fbg->links);
			l->ridx = cns->fbg->mats->buffer[i].ridx;
			l->roff = off1;
			l->rlen = off2 - off1;
			l->key = (cns->fbg->mats->buffer[i].ridx == ridx);
			l->select = 0;
			l->next = e->link;
			e->link = cns->fbg->links->size - 1;
		}
	}
	// second pass per read: links that jump over intermediate anchors
	rr = 0xFFFFFFFFU;
	for(beg=end=0;beg<cns->fbg->mats->size;beg=end){
		// [beg, end) becomes the run of entries belonging to read r
		r = rr;
		for(;end<cns->fbg->mats->size;end++){
			if(cns->fbg->mats->buffer[end].closed) continue;
			if(r == 0xFFFFFFFFU){
				r = cns->fbg->mats->buffer[end].ridx;
			} else if(cns->fbg->mats->buffer[end].ridx == r) continue;
			else { rr = cns->fbg->mats->buffer[end].ridx; break; }
		}
		if(r == ridx) continue; // reference seq
		for(i=beg;i+2<end;i++){
			if(cns->fbg->mats->buffer[i].closed) continue;
			idx1 = cns->fbg->mats->buffer[i].kidx;
			off1 = cns->fbg->mats->buffer[i].koff;
			a = ref_fbgkmerh(cns->fbg->kmers, idx1);
			c = 0;
			for(j=i+1;j<end;j++){
				if(cns->fbg->mats->buffer[j].closed) continue;
				idx2 = cns->fbg->mats->buffer[j].kidx;
				off2 = cns->fbg->mats->buffer[j].koff;
				if(a->off >= ref_fbgkmerh(cns->fbg->kmers, idx2)->off) continue;
				c ++;
				if(c < 2) continue; // the direct successor was handled above
				if(c > 5) break;
				eidx = a->edges;
				while(eidx){
					e = ref_fbgedgev(cns->fbg->edges, eidx);
					if(e->node == idx2) break;
					eidx = e->next;
				}
				// never create new edges here, only support existing ones
				if(eidx == 0) continue;
				e = ref_fbgedgev(cns->fbg->edges, eidx);
				e->cov ++;
				l = next_ref_fbglinkv(cns->fbg->links);
				l->ridx = r;
				l->roff = off1;
				l->rlen = off2 - off1;
				l->key = 0;
				l->select = 0;
				l->next = e->link;
				e->link = cns->fbg->links->size - 1;
			}
		}
	}
	if(0){
		for(i=1;i<cns->fbg->edges->size;i++){
			revise_edge_cov_fbg(cns->fbg, ref_fbgedgev(cns->fbg->edges, i));
		}
	} else {
		reset_iter_fbgkmerh(cns->fbg->kmers);
		while((a = ref_iter_fbgkmerh(cns->fbg->kmers))){
			eidx = a->edges;
			while(eidx){
				e = ref_fbgedgev(cns->fbg->edges, eidx);
				// read the successor first: revise_edge_fbg inserts new edges after e
				eidx = e->next;
				revise_edge_fbg(cns->fbg, offset_fbgkmerh(cns->fbg->kmers, a), offset_fbgedgev(cns->fbg->edges, e), cns->fbg->grps, max_dist_var);
			}
		}
	}
}
/*
 * Dump the fuzzy Bruijn graph of `cns` to `out` in Graphviz DOT format.
 *
 * Only edges leaving open k-mers are considered, and of those only edges
 * with cov >= 3, or flagged key, or flagged select. One DOT edge is written
 * per link: key links are blue, links of a "most" edge green, others black;
 * selected links are dashed. Does nothing when `out` is NULL.
 */
static inline void print_dot_DirectFuzzyBruijnGraph(CNS *cns, FILE *out){
	FBG *graph;
	fbg_kmer_t *src, *dst;
	fbg_edge_t *edge;
	fbg_link_t *link;
	u4i ei, li;
	if(out == NULL) return;
	graph = cns->fbg;
	fprintf(out, "digraph {\n\trankdir=LR\n");
	reset_iter_fbgkmerh(graph->kmers);
	while((src = ref_iter_fbgkmerh(graph->kmers))){
		if(src->closed) continue;
		for(ei=src->edges;ei;ei=edge->next){
			edge = ref_fbgedgev(graph->edges, ei);
			// skip weakly supported edges that are neither key nor selected
			if(!(edge->cov >= 3 || edge->key || edge->select)) continue;
			dst = ref_fbgkmerh(graph->kmers, edge->node);
			for(li=edge->link;li;li=link->next){
				link = ref_fbglinkv(graph->links, li);
				fprintf(out, "\tK%d -> K%d [label=\"R%04d_%d_%d(%d:%d)\" color=%s%s]\n", src->off, dst->off, link->ridx, link->roff, link->rlen, edge->cov, edge->dist, link->key? "blue" : (edge->most? "green" : "black"), link->select? " style=dashed" : "");
			}
		}
	}
	fprintf(out, "}\n");
}
/*
 * Find the best-scoring path through the fuzzy Bruijn graph, from the open
 * k-mer with the smallest reference offset to the open k-mer with the
 * largest one.
 *
 * Nodes are relaxed once all their incoming edges have been seen
 * (n_visit >= n_in). Edge scoring:
 *   +2     if the edge is flagged `most`
 *   key 1: +1, and -50 when cov <= 1
 *   key 2: -1000 when cov <= 1
 *   other: -0.5 per base of |(offset difference) - dist|, -1000 when
 *          cov <= 1; the score is forced to -1000000 when that difference
 *          exceeds 100
 * Afterwards the backtrack pointers along the best path are reversed, so
 * that bt_node / bt_edge lead forward from the start node.
 * Dumps "debug.dot" and exits when the end node was never reached.
 */
static inline void DP_best_path_DirectFuzzyBruijnGraph(CNS *cns){
	const int ref_score = 1;
	const int ref_one = -50;
	const int alt_one = -1000;
	const int most_score = 2;
	const int max_dist_var = 100;
	const float dist_var = -0.5;
	FBG *fbg;
	fbg_kmer_t *cur, *nxt;
	fbg_edge_t *edge;
	u4v *todo;
	u4i cur_idx, first, last, first_off, last_off;
	u4i eidx, etmp;
	int var, score;
	fbg = cns->fbg;
	first = last = 0xFFFFFFFFU;
	first_off = 0xFFFFFFFFU;
	last_off = 0;
	// reset DP state and locate both ends of the path
	reset_iter_fbgkmerh(fbg->kmers);
	while((cur = ref_iter_fbgkmerh(fbg->kmers))){
		cur->n_visit = 0;
		cur->bt_node = 0xFFFFFFFFU;
		cur->bt_edge = 0;
		cur->bt_score = BT_SCORE_MIN;
		if(cur->closed) continue;
		if(cur->off < first_off){
			first = offset_fbgkmerh(fbg->kmers, cur);
			first_off = cur->off;
		}
		if(cur->off >= last_off){
			last = offset_fbgkmerh(fbg->kmers, cur);
			last_off = cur->off;
		}
	}
	if(first == 0xFFFFFFFFU) return;
	// `todo` is used as a stack of nodes whose inputs are complete
	todo = init_u4v(32);
	cur = ref_fbgkmerh(fbg->kmers, first);
	cur->bt_score = 0;
	push_u4v(todo, first);
	while(todo->size){
		cur_idx = todo->buffer[--todo->size];
		cur = ref_fbgkmerh(fbg->kmers, cur_idx);
		for(eidx=cur->edges;eidx;eidx=edge->next){
			edge = ref_fbgedgev(fbg->edges, eidx);
			nxt = ref_fbgkmerh(fbg->kmers, edge->node);
			score = cur->bt_score + edge->most * most_score;
			var = nxt->off - cur->off;
			var = num_diff(var, (int)edge->dist);
			switch(edge->key){
				case 1:
					score += ref_score + (edge->cov <= 1? ref_one : 0);
					break;
				case 2:
					score += (edge->cov <= 1? alt_one : 0);
					break;
				default:
					if(var > max_dist_var) score = -1000000;
					else score += var * dist_var + (edge->cov <= 1? alt_one : 0);
					break;
			}
			if(score > nxt->bt_score){
				nxt->bt_score = score;
				nxt->bt_node = cur_idx;
				nxt->bt_edge = eidx;
			}
			nxt->n_visit ++;
			if(nxt->n_visit >= nxt->n_in){
				push_u4v(todo, edge->node);
			}
		}
	}
	free_u4v(todo);
	cur = ref_fbgkmerh(fbg->kmers, last);
	if(last != first && cur->bt_edge == 0){
		FILE *out = open_file_for_write("debug.dot", NULL, 1);
		print_dot_DirectFuzzyBruijnGraph(cns, out);
		fclose(out);
		fprintf(stderr, " -- something wrong, nb = %d(K%04d), ne = %d(K%04d) in %s -- %s:%d --\n", first, fbg->kmers->array[first].off, last, fbg->kmers->array[last].off, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
		exit(1);
	}
	// reverse the backtrack chain in place, walking from the end to the start
	cur_idx = 0xFFFFFFFFU;
	eidx = 0;
	do {
		cur = ref_fbgkmerh(fbg->kmers, last);
		last = cur->bt_node;
		swap_tmp(eidx, cur->bt_edge, etmp);
		cur->bt_node = cur_idx;
		cur_idx = offset_fbgkmerh(fbg->kmers, cur);
	} while(cur_idx != first);
}
/*
 * Assemble the structure-corrected sequence (fbg->starseq) for read `ridx`
 * by following the forward path prepared by
 * DP_best_path_DirectFuzzyBruijnGraph().
 *
 * Starting at the open k-mer with the smallest offset, each path edge
 * contributes the bases of one of its links: the key link if the edge has
 * one, otherwise the link whose length is closest to the edge's mean link
 * length. Stretches of the read before the first anchor and after the last
 * one are copied from read `ridx` itself. When the graph has no open k-mer
 * the read is copied unchanged.
 *
 * Marks the used edges and links with select = 1. Always returns 0.
 */
static inline int correct_struct_DirectFuzzyBruijnGraph(CNS *cns, u4i ridx){
	FBG *fbg;
	fbg_kmer_t *k, *n;
	fbg_edge_t *e;
	fbg_link_t *lnk;
	int chg;
	u4i off, kidx, koff, eidx, lidx, key, upd;
	double sum, cnt, avg, min;
	fbg = cns->fbg;
	// find the leftmost open k-mer: the start of the path
	reset_iter_fbgkmerh(fbg->kmers);
	kidx = 0xFFFFFFFFU;
	koff = 0xFFFFFFFFU;
	while((k = ref_iter_fbgkmerh(fbg->kmers))){
		if(k->closed) continue;
		if(k->off < koff){ kidx = offset_fbgkmerh(fbg->kmers, k); koff = k->off; }
	}
	clear_u1v(fbg->starseq);
	if(DBGCNS_DEBUG){
		fprintf(stderr, " -- select seq[%d] len=%d in %s -- %s:%d --\n", ridx, cns->qblks->buffer[ridx].len, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
	}
	if(kidx == 0xFFFFFFFFU){
		append_array_u1v(fbg->starseq, cns->qseqs->buffer + cns->qblks->buffer[ridx].off, cns->qblks->buffer[ridx].len);
		return 0;
	}
	chg = 0; // never modified below; kept for the return value
	off = 0; // reference offset up to which starseq has been filled
	while(1){
		k = ref_fbgkmerh(fbg->kmers, kidx);
		if(off < k->off){
			if(DBGCNS_DEBUG){
				fprintf(stderr, "-- %d + %d bases from seq[%d], offset %d -> %d\n", (int)fbg->starseq->size, k->off - off, ridx, off, k->off);
			}
			append_array_u1v(fbg->starseq, cns->qseqs->buffer + cns->qblks->buffer[ridx].off + off, k->off - off);
			off = k->off;
		}
		// bt_edge == 0 marks the end of the path
		if(k->bt_edge == 0) break;
		eidx = k->bt_edge;
		e = ref_fbgedgev(fbg->edges, eidx);
		e->select = 1;
		key = 0;
		sum = 0;
		cnt = 0;
		lidx = e->link;
		while(lidx){
			lnk = ref_fbglinkv(fbg->links, lidx);
			sum += lnk->rlen;
			cnt ++;
			if(lnk->key) key = lidx;
			lidx = lnk->next;
		}
		if(key){
			// prefer the link that comes from the reference read
			upd = key;
		} else {
			// otherwise take the link closest to the mean length
			avg = sum / cnt;
			lidx = e->link;
			min = 100000;
			upd = key;
			while(lidx){
				lnk = ref_fbglinkv(fbg->links, lidx);
				if(num_diff((double)lnk->rlen, avg) < min){
					upd = lidx;
					min = num_diff((double)lnk->rlen, avg);
				}
				lidx = lnk->next;
			}
		}
		lnk = ref_fbglinkv(fbg->links, upd);
		lnk->select = 1;
		kidx = k->bt_node;
		n = ref_fbgkmerh(fbg->kmers, kidx);
		if(DBGCNS_DEBUG){
			fprintf(stderr, "-- %d + %d bases from seq[%d], offset %d + %d -> %d\n", (int)fbg->starseq->size, lnk->rlen, lnk->ridx, off, n->off - off, n->off);
		}
		append_array_u1v(fbg->starseq, cns->qseqs->buffer + cns->qblks->buffer[lnk->ridx].off + lnk->roff, lnk->rlen);
		off = n->off;
	}
	// tail of the read behind the last anchor
	if(off < cns->qblks->buffer[ridx].len){
		if(DBGCNS_DEBUG){
			fprintf(stderr, "-- %d + %d bases from seq[%d], offset %d -> %d\n", (int)fbg->starseq->size, cns->qblks->buffer[ridx].len - off, ridx, off, cns->qblks->buffer[ridx].len);
		}
		append_array_u1v(fbg->starseq, cns->qseqs->buffer + cns->qblks->buffer[ridx].off + off, cns->qblks->buffer[ridx].len - off);
	}
	return chg;
}
/*
 * Re-examine homopolymer runs of the consensus (cns->cns) against k-mer
 * coverage and lengthen or shorten each run by at most one base.
 *
 * For every run of >= 3 identical bases (and <= ksize - 2) that lies at
 * least ksize bases away from both ends, three k-mers centred on the run
 * are looked up: with run length brun - 1, brun and brun + 1. If coverage
 * falls strictly from the first to the second and does not rise to the
 * third, one base is removed; if it rises the same way, one base is added.
 *
 * When anything changed, cns->cns and cns->seq are rewritten (cns->seq in
 * upper case via bit_base_table). Returns the number of changed runs.
 *
 * NOTE(review): the statement that sets l and r for the right flank was
 * reconstructed from the k-mer length arithmetic; confirm against upstream.
 */
static inline int homopolymer_analysis_cns(CNS *cns){
	u8i kmer, xmask[2];
	u4i chg, i, j, l, r, c, brun, *u, kcnts[3];
	u1i b;
	char kstr[64];
	UNUSED(kstr); // only for compile warning
	// xmask[1]: lower half of the k-mer, xmask[0]: the remaining upper part
	xmask[1] = 0xFFFFFFFFFFFFFFFFLLU >> (64 - (cns->g->ksize / 2 * 2));
	xmask[0] = (~xmask[1]) & cns->g->kmask;
	b = 4; brun = 0; // 4 is not a valid base, so the first base opens a run
	clear_u1v(cns->g->zseq);
	chg = 0;
	// a consensus shorter than k cannot be analysed; do not read past its end
	if(cns->cns->size < cns->g->ksize) return 0;
	for(i=0;i<cns->g->ksize;i++) push_u1v(cns->g->zseq, cns->cns->buffer[i]);
	for(;i+cns->g->ksize<cns->cns->size;i++){
		if(cns->cns->buffer[i] == b){
			brun ++;
		} else {
			if(brun >= 3 && brun + 2 <= cns->g->ksize){
				if(DBGCNS_DEBUG){
					fprintf(stderr, "POLY %c(%d) at pos %d\n", "ACGT"[b], brun, i);
				}
				// left flank [l, r), then brun - 1 copies of b, then right flank
				r = i - brun;
				c = (cns->g->ksize - brun) / 2;
				l = r - c;
				kmer = 0;
				for(j=l;j<r;j++) kmer = (kmer << 2) | cns->cns->buffer[j];
				for(j=1;j<brun;j++) kmer = (kmer << 2) | b;
				l = i;
				r = i + (cns->g->ksize - brun - c) + 1;
				for(j=l;j<r;j++) kmer = (kmer << 2) | cns->cns->buffer[j];
				cns->g->kmers->buffer[0].mer = kmer;
				u = get_dbgcnskmerhash(cns->g->khash, 0);
				kcnts[0] = u? ref_dbgcnskmerv(cns->g->kmers, *u)->cov : 0;
				if(DBGCNS_DEBUG){
					kmer2seq(kstr, kmer, cns->g->ksize);
					fprintf(stderr, "%s\t%d\n", kstr, kcnts[0]);
				}
				// insert one more b in the middle, dropping the last base
				kmer = (kmer & xmask[0]) | (((u8i)b) << (cns->g->ksize / 2 * 2 - 2)) | ((kmer & xmask[1]) >> 2);
				cns->g->kmers->buffer[0].mer = kmer;
				u = get_dbgcnskmerhash(cns->g->khash, 0);
				kcnts[1] = u? ref_dbgcnskmerv(cns->g->kmers, *u)->cov : 0;
				if(DBGCNS_DEBUG){
					kmer2seq(kstr, kmer, cns->g->ksize);
					fprintf(stderr, "%s\t%d\n", kstr, kcnts[1]);
				}
				kmer = (kmer & xmask[0]) | (((u8i)b) << (cns->g->ksize / 2 * 2 - 2)) | ((kmer & xmask[1]) >> 2);
				cns->g->kmers->buffer[0].mer = kmer;
				u = get_dbgcnskmerhash(cns->g->khash, 0);
				kcnts[2] = u? ref_dbgcnskmerv(cns->g->kmers, *u)->cov : 0;
				if(DBGCNS_DEBUG){
					kmer2seq(kstr, kmer, cns->g->ksize);
					fprintf(stderr, "%s\t%d\n", kstr, kcnts[2]);
				}
				if(kcnts[0] > kcnts[1] && kcnts[1] >= kcnts[2]){ // there is a insertion base
					brun --;
					chg ++;
					if(DBGCNS_DEBUG){
						fprintf(stderr, "#HOMO ins\n");
					}
				} else if(kcnts[2] > kcnts[1] && kcnts[1] >= kcnts[0]){ // deletion
					brun ++;
					chg ++;
					if(DBGCNS_DEBUG){
						fprintf(stderr, "#HOMO del\n");
					}
				}
			}
			// flush the finished run with its possibly adjusted length
			for(j=0;j<brun;j++) push_u1v(cns->g->zseq, b);
			b = cns->cns->buffer[i];
			brun = 1;
		}
	}
	for(j=0;j<brun;j++) push_u1v(cns->g->zseq, b);
	for(;i<cns->cns->size;i++) push_u1v(cns->g->zseq, cns->cns->buffer[i]);
	if(chg){
		clear_u1v(cns->cns);
		append_u1v(cns->cns, cns->g->zseq);
		clear_string(cns->seq);
		encap_string(cns->seq, cns->g->zseq->size);
		for(i=0;i<cns->g->zseq->size;i++){
			cns->seq->string[i] = bit_base_table[cns->g->zseq->buffer[i]];
		}
		cns->seq->size = i;
		cns->seq->string[i] = '\0';
	}
	return chg;
}
/*
 * Run one complete consensus round on the blocks stored in `cns`.
 *
 *   candidate_mode  how the backbone read (cns->qidx) is chosen:
 *                     4  longest solid block      5  shortest solid block
 *                     1  solid block of median length
 *                     0  block with the best k-mer coverage per base
 *                     2, 3  block 0 (3: see ready_core_cns)
 *                     <0 block number -1 - candidate_mode, clamped to the last
 *                   any other value is reported on stderr and block 0 is used
 *   corr_struct     non-zero: first correct the backbone structurally with
 *                   the fuzzy Bruijn graph, then call the consensus on that
 *
 * Returns the length of the final consensus (0 when there are no blocks).
 */
static inline int run_cns(CNS *cns, int candidate_mode, int corr_struct){
	u4i i;
	if(cns->qblks->size == 0) return 0;
	cns->qidx = 0;
	if(candidate_mode == 4){ // longest
		for(i=0;i<cns->qblks->size;i++){
			if(cns->qblks->buffer[i].solid == 0) continue;
			if(cns->qblks->buffer[i].len > cns->qblks->buffer[cns->qidx].len) cns->qidx = i;
		}
	} else if(candidate_mode == 5){ // shortest
		for(i=0;i<cns->qblks->size;i++){
			if(cns->qblks->buffer[i].solid == 0) continue;
			if(cns->qblks->buffer[i].len < cns->qblks->buffer[cns->qidx].len) cns->qidx = i;
		}
	} else if(candidate_mode == 3){ // first and but not increase coverage
		cns->qidx = 0;
	} else if(candidate_mode == 0){ // kmer coverage
		cns->qidx = 0;
	} else if(candidate_mode == 2){ // first and include
		cns->qidx = 0;
	} else if(candidate_mode < 0){
		cns->qidx = num_min(-1 - candidate_mode, (int)(cns->qblks->size - 1));
	} else if(candidate_mode == 1){ // median
		u2v *idxs;
		idxs = init_u2v(cns->qblks->size);
		for(i=0;i<cns->qblks->size;i++){
			if(cns->qblks->buffer[i].solid == 0) continue;
			push_u2v(idxs, i);
		}
		// without any solid block keep the default qidx = 0
		if(idxs->size){
			cns->qidx = quick_median_array(idxs->buffer, idxs->size, u2i, num_cmpgt(cns->qblks->buffer[a].len, cns->qblks->buffer[b].len));
		}
		free_u2v(idxs); // was leaked on every call
	} else {
		fprintf(stderr, " -- Unknown candidate mode %d in %s -- %s:%d --\n", candidate_mode, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
	}
	ready_core_cns(cns, candidate_mode, cns->qblks->buffer[cns->qidx].len);
	if(candidate_mode == 0){
		// the graph exists now, so the backbone can be picked by coverage
		double max, cov;
		max = - 1000000;
		for(i=0;i<cns->g->kmers->size;i++) cns->g->kmers->buffer[i].visit = 0;
		for(i=cns->qblks->size;i>0;i--){
			if(cns->qblks->buffer[i - 1].solid == 0) continue;
			cov = kmer_cov_seq_dbg(cns->g, cns->qseqs->buffer + cns->qblks->buffer[i - 1].off, cns->qblks->buffer[i - 1].len, (i - 1 + 1) % 255) * 1.0 / cns->qblks->buffer[i - 1].len;
			if(cov > max){
				max = cov;
				cns->qidx = i - 1;
			}
		}
		// revise cns->avg_cov
		// it was computed against block 0; rescale to the chosen block
		cns->avg_cov = cns->avg_cov * cns->qblks->buffer[0].len / cns->qblks->buffer[cns->qidx].len;
	}
	if(corr_struct){
		build_DirectFuzzyBruijnGraph(cns, cns->qidx, 5, 0.2);
		//build_DirectFuzzyBruijnGraph(cns, cns->qidx, 0, 0.2);
		DP_best_path_DirectFuzzyBruijnGraph(cns);
		correct_struct_DirectFuzzyBruijnGraph(cns, cns->qidx);
		if(DBGCNS_DEBUG){
			FILE *dotf = open_file_for_write("debug.dot", NULL, 1);
			print_dot_DirectFuzzyBruijnGraph(cns, dotf);
			fclose(dotf);
		}
		run_core_cns(cns, cns->fbg->starseq->buffer, cns->fbg->starseq->size);
	} else {
		run_core_cns(cns, cns->qseqs->buffer + cns->qblks->buffer[cns->qidx].off, cns->qblks->buffer[cns->qidx].len);
	}
	homopolymer_analysis_cns(cns);
	return cns->seq->size;
}
#endif
wtdbg2-2.5/dna.h 0000664 0000000 0000000 00000115152 13536643722 0013504 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __DNA_RJ_H
#define __DNA_RJ_H
#include
#include
#include
#include
#include "list.h"
#include "bitvec.h"
#include "hashset.h"
#include "thread.h"
// Maps a character code (0..255) to its 2-bit base code:
// 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3, anything else -> 4.
static const u1i base_bit_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
// Maps a character code (0..255) to a 4-bit ambiguity code in which
// A = 1, C = 2, G = 4 and T = 8 act as bit flags (e.g. 'M' -> 3, 'R' -> 5).
// Upper and lower case letters map alike; every other character,
// including 'N', yields 15. bit4_base_table is the inverse mapping.
static const u1i base_bit4_table[256] = {
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 1, 14, 2, 13, 15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15,
15, 15, 5, 6, 8, 15, 7, 9, 15, 10, 15, 15, 15, 15, 15, 15,
15, 1, 14, 2, 13, 15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15,
15, 15, 5, 6, 8, 15, 7, 9, 15, 10, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15
};
// 4-bit flag code -> 2-bit base code: only the single-base flags 1, 2, 4, 8
// map to 0..3, every other (ambiguous or empty) code maps to 4.
static const u1i bit4_bit_table[16] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
// Base code -> character. The array has exactly 12 elements, so it carries
// no terminating NUL and must not be used as a C string.
static const char bit_base_table[12] = "ACGTN-acgtn*";
// 4-bit flag code -> ambiguity character; exactly 16 elements, no NUL.
static const char bit4_base_table[16] = "-ACMGRSVTWYHKDBN";
// u8i = 0|1|2|3|4|5|6|...
// Packed layout: 32 bases per 64-bit word, base 0 in the two most
// significant bits. bits2bit reads the base at position `off`;
// bits2revbit reads its complement (bitwise NOT of the 2-bit code).
#define bits2bit(bits, off) (((bits)[(off) >> 5] >> (((~(off)) & 0x1FU) << 1)) & 0x03U)
#define bits2revbit(bits, off) ((~((bits)[(off) >> 5] >> (((~(off)) & 0x1FU) << 1))) & 0x03U)
// Collapse every 2-bit field of `seq` into its low bit: the result has a 1
// in the low bit of each field in which at least one of the two bits was
// set, and 0 in every high bit.
static inline u8i dna_xor2ones(u8i seq){
	const u8i high_moved = (seq & 0xAAAAAAAAAAAAAAAALLU) >> 1;
	const u8i low_kept = seq & 0x5555555555555555LLU;
	return high_moved | low_kept;
}
// Reverse-complement a full 32-base word (2 bits per base).
// Complement by bitwise NOT, then reverse the order of the 2-bit fields:
// swap neighbouring fields, swap neighbouring nibbles, and finally reverse
// the bytes with the compiler's byte-swap builtin.
static inline u8i dna_rev_seq32(u8i seq){
	u8i rc;
	rc = ~seq;
	rc = ((rc << 2) & 0xCCCCCCCCCCCCCCCCLLU) | ((rc >> 2) & 0x3333333333333333LLU);
	rc = ((rc << 4) & 0xF0F0F0F0F0F0F0F0LLU) | ((rc >> 4) & 0x0F0F0F0F0F0F0F0FLLU);
	return __builtin_bswap64(rc);
}
// Reverse-complement the lowest `seq_size` bases of `seq`: the full 32-base
// reverse complement is shifted down so the result is right-aligned again.
// seq_size is expected to be in 1..32 (0 would shift by 64 bits).
static inline u8i dna_rev_seq(u8i seq, u1i seq_size){
	const u8i full = dna_rev_seq32(seq);
	return full >> (64 - (seq_size<<1));
}
// order of 2-bit in this->seqs is different with that in dna_rev_seq->seq
// Reverse, in place, a packed sequence of `seq_size` bases spanning
// j = ceil(seq_size / 32) words: reverse inside each word, reverse the word
// order, then shift everything left so the sequence is left-aligned again.
// NOTE(review): three loop headers below are truncated in this copy (the
// text between '<' and the next '>' is missing). Whether the per-word step
// also complements the bases cannot be told from here -- restore from upstream.
static inline void dna_rev_seqs(u8i *seqs, u8i seq_size){
register u8i t;
int i, j;
register u1i d, e;
j = (seq_size + 31) >> 5;
// Swap within 64bit
// NOTE(review): truncated line; presumably a loop over i < j whose first
// statements (including the 2-bit field swap) are missing
for(i=0;i> 2);
seqs[i] = ((seqs[i] & 0x0F0F0F0F0F0F0F0FLLU)<< 4) | ((seqs[i] & 0xF0F0F0F0F0F0F0F0LLU)>> 4);
seqs[i] = __builtin_bswap64(seqs[i]);
}
// Swap 64bit blocks
// NOTE(review): truncated loop condition; presumably i < (j >> 1)
for(i=0;i>1;i++){
t = seqs[i]; seqs[i] = seqs[j - i - 1]; seqs[j - i - 1] = t;
}
// left-align seqs
// d = number of unused bits in the last word
if((d = ((j << 5) - seq_size) << 1)){
e = 64 - d;
// NOTE(review): truncated line; presumably shifts each word left by d and
// pulls the top d bits of the following word in
for(i=0;i> e);
}
seqs[i] = (seqs[i] << d) | 0;
}
}
//shift one base, and append one base, useful to build big-kmer
// The packed sequence (32 bases per word, left-aligned) is shifted left by
// one base across all of its words; `base_val` (low 2 bits) is stored as
// the new last base of the sequence.
static inline void dna_shl_seqs(u8i *seqs, u8i seq_size, u1i base_val){
	const u1i d = 2;
	const u1i e = 62;
	int i, j;
	j = (seq_size + 31) >> 5;
	for(i=0;i<j-1;i++){
		// carry the first base of the next word into this one
		seqs[i] = (seqs[i] << d) | (seqs[i+1] >> e);
	}
	// bit position of the last base inside the last word; the mask turns the
	// former shift by 64 (undefined when seq_size is a multiple of 32) into 0
	seqs[i] = (seqs[i] << d) | (((u8i)(base_val & 0x03U)) << (((32 - (seq_size & 0x1FU)) & 0x1FU) << 1));
}
// Compare two packed sequences of `seq_size` bases word by word.
// Returns -1, 0 or 1 as seqs1 is smaller than, equal to or greater than
// seqs2. All ceil(seq_size / 32) words are compared in full, so unused bits
// of the last word must match for the sequences to compare equal.
static inline int dna_cmp_seqs(u8i *seqs1, u8i *seqs2, u8i seq_size){
	int i, j;
	j = (seq_size + 31) >> 5;
	for(i=0;i<j;i++){
		if(seqs1[i] < seqs2[i]) return -1;
		if(seqs1[i] > seqs2[i]) return 1;
	}
	return 0;
}
// Compares a value `a` with a value `b` over j = ceil(seq_size / 2)
// iterations and returns 1 as soon as a > b, 0 if the loop finishes.
// NOTE(review): the loop line is truncated in this copy (text between '<'
// and '>' is missing), so how a and b are derived from `seqs`, and any
// "a < b" result, cannot be read from here -- restore from upstream.
static inline int dna_cmpx_seqs(u8i *seqs, u8i seq_size){
register int i, j;
register u1i a, b;
j = (seq_size + 1) >> 1;
for(i=0;i b) return 1;
}
return 0;
}
// NOTE(review): this span is damaged. The loop line below is truncated
// (text between '<' and the next '>' is missing) and apparently fuses the
// body of seq2kmer with the tail of a later function that writes k-mer
// bases into `seq`, most significant base first, and NUL-terminates it
// (print_kmer_seq calls a kmer2seq that is not defined elsewhere in view).
// Restore the missing lines from upstream before relying on this code.
static inline u8i seq2kmer(char *seq, u4i ksize){
u8i kmer;
u4i i;
kmer = 0;
for(i=0;i> ((ksize - 1 - i) << 1)) & 0x03];
}
seq[i] = 0;
}
// Write the reverse complement of `kmer` (ksize bases, last base in the
// lowest two bits) into `seq` as text and NUL-terminate it.
// `seq` must provide room for ksize + 1 characters.
static inline void kmer2revseq(char *seq, u8i kmer, u4i ksize){
	u4i i;
	// bitwise NOT complements every 2-bit base code
	kmer = ~kmer;
	// reading from the lowest base upwards reverses the order
	for(i=0;i<ksize;i++){
		seq[i] = bit_base_table[(kmer >> (i << 1)) & 0x03];
	}
	seq[i] = 0;
}
// Print `kmer` (ksize bases) to `out` as text, without a trailing newline.
static inline void print_kmer_seq(u8i kmer, u4i ksize, FILE *out){
	char text[33]; // up to 32 bases plus the terminating NUL
	kmer2seq(text, kmer, ksize);
	fputs(text, out);
}
// Print the reverse complement of `kmer` (ksize bases) to `out` as text,
// without a trailing newline.
static inline void print_kmer_revseq(u8i kmer, u4i ksize, FILE *out){
	char text[33]; // up to 32 bases plus the terminating NUL
	kmer2revseq(text, kmer, ksize);
	fputs(text, out);
}
// Mask with the lowest 2 * ksize bits set; ksize must be >= 1 (ksize == 0
// would shift by 64 bits).
#define kmer_mask(ksize) (0xFFFFFFFFFFFFFFFFLLU >> ((32 - (ksize)) * 2))
// beg_seq2kmers ... end_seq2kmers bracket a loop body that is executed once
// per k-mer of the text sequence `seq`, left to right. Inside the body
// `kmer` holds the current k-mer (masked with `kmask`) and `idx` its start
// position. Characters that are not A/C/G/T are replaced by a random base
// via lrand48(). The opening macro leaves two braces open; end_seq2kmers
// closes them.
#define beg_seq2kmers(seq, seqlen, ksize, kmask, kmer, idx) { \
u1i beg_seq2kmers_v; \
kmer = 0; \
for(idx=0;(int)idx+1<(int)ksize;idx++){ \
beg_seq2kmers_v = base_bit_table[(int)(seq)[idx]]; \
if(beg_seq2kmers_v == 4) beg_seq2kmers_v = lrand48() & 0x03; \
kmer = (((kmer) << 2) | beg_seq2kmers_v); \
} \
for(idx=0;(int)idx<=(int)(seqlen-ksize);idx++){ \
beg_seq2kmers_v = base_bit_table[(int)(seq)[idx + (ksize) - 1]]; \
if(beg_seq2kmers_v == 4) beg_seq2kmers_v = lrand48() & 0x03; \
kmer = ((kmer << 2) | beg_seq2kmers_v) & kmask;
#define end_seq2kmers } }
// Same as beg_seq2kmers / end_seq2kmers, but the sequence is consumed from
// its last character towards the first: `idx` counts k-mers from the end.
// The bases are taken as they are; no complement is applied here.
// NOTE(review): the second loop bound casts only `seqlen`
// ("(int)seqlen-ksize"), unlike beg_seq2kmers, which casts the difference;
// with an unsigned `ksize` and seqlen < ksize the comparison would be done
// in unsigned arithmetic -- confirm the intended bound.
#define beg_seq2revkmers(seq, seqlen, ksize, kmask, kmer, idx) { \
u1i beg_seq2revkmers_v; \
kmer = 0; \
for(idx=0;(int)idx+1<(int)ksize;idx++){ \
beg_seq2revkmers_v = base_bit_table[(int)(seq)[seqlen - 1 - idx]]; \
if(beg_seq2revkmers_v == 4) beg_seq2revkmers_v = lrand48() & 0x03; \
kmer = (((kmer) << 2) | beg_seq2revkmers_v); \
} \
for(idx=0;(int)idx<=(int)seqlen-ksize;idx++){ \
beg_seq2revkmers_v = base_bit_table[(int)(seq)[seqlen - idx - (ksize)]]; \
if(beg_seq2revkmers_v == 4) beg_seq2revkmers_v = lrand48() & 0x03; \
kmer = ((kmer << 2) | beg_seq2revkmers_v) & kmask;
#define end_seq2revkmers } }
// Complement a single nucleotide character, preserving its case.
// Anything other than a/c/g/t (either case) yields 'N'.
static inline char reverse_dna_base(char b){
	static const char bases[] = "aAcCgGtT";
	static const char mates[] = "tTgGcCaA";
	int pos;
	for(pos=0;bases[pos];pos++){
		if(bases[pos] == b) return mates[pos];
	}
	return 'N';
}
// Reverses the `len` characters of `seq` in place by swapping from both ends.
// NOTE(review): the last line of this span is damaged. The text between
// '<' and the next '>' is missing, so the remainder of this function
// (presumably a per-character complement loop) and the beginning of a
// following bit-packing macro are fused into one line -- restore from upstream.
static inline void reverse_dna(char *seq, int len){
int i, j;
char c;
i = 0;
j = len - 1;
while(i < j){
c = seq[i]; seq[i] = seq[j]; seq[j] = c;
i ++; j --;
}
for(i=0;i> 5] = 0; (bits)[(off) >> 5] |= ((u8i)(bit)) << (((~(off)) & 0x1FU) << 1); }
// Store the 2-bit value `bit` at base position `off` of the packed array
// `bits` (32 bases per word, base 0 in the most significant bits). The old
// value of that field is cleared first; other fields are left untouched.
// `bit` is not masked, so it must already be in 0..3.
#define bit2bits(bits, off, bit) { \
u8i __off1; \
u4i __off2; \
__off1 = (off) >> 5; \
__off2 = (((~(off)) & 0x1FU) << 1); \
(bits)[__off1] = ((bits)[__off1] & (~(0x3LLU << __off2))) | (((u8i)(bit)) << __off2); \
}
// NOTE(review): this span is damaged. The loop line below is truncated
// (text between '<' and the next '>' is missing) and fuses the start of
// seq2bits with the tail of a later function that unpacks `len` bases
// starting at `off`, writes their complements into `seq` in reverse order
// and NUL-terminates the string. The names `off`, `len` and the array being
// read are not declared in the visible part -- restore from upstream.
static inline void seq2bits(u8i *bits, u8i bitoff, char *seq, u4i seqlen){
u8i i, c;
for(i=0;i>5] >> (((~(off + i)) & 0x1FU) << 1)) & 0x03;
seq[len - i - 1] = bit_base_table[(~c)&0x03];
}
seq[i] = 0;
}
// Extract the 32 bases starting at base offset `off` from the packed array
// `src` as one left-aligned 64-bit word.
// The word after the one containing `off` is always read, even when `off`
// is word-aligned, so `src` must have one readable word of slack. The
// right shift is split in two (62 - n, then 2) so that an aligned offset
// never shifts by 64 bits.
static inline u8i sub32seqbits(u8i *src, u8i off){
	const u8i word = off >> 5;
	const u4i lead = (off & 0x1F) << 1;
	const u8i head = src[word] << lead;
	const u8i tail = (src[word + 1] >> (62 - lead)) >> 2;
	return head | tail;
}
// Extract the 8 bases starting at base offset `off` from the packed array
// `src`, right-aligned in the low 16 bits of the result.
// Always reads the word following the one that contains `off`.
static inline u8i sub8seqbits(u8i *src, u8i off){
	const u8i word = off >> 5;
	const u4i lead = (off & 0x1FU) << 1;
	const u8i head = src[word] << lead;
	const u8i tail = (src[word + 1] >> (62 - lead)) >> 2;
	return (head | tail) >> 48;
}
// Extract the 4 bases starting at base offset `off` from the packed array
// `src`, right-aligned in the low 8 bits of the result.
// Always reads the word following the one that contains `off`.
static inline u8i sub4seqbits(u8i *src, u8i off){
	const u8i word = off >> 5;
	const u4i lead = (off & 0x1FU) << 1;
	const u8i head = src[word] << lead;
	const u8i tail = (src[word + 1] >> (62 - lead)) >> 2;
	return (head | tail) >> 56;
}
// Extract the 2 bases starting at base offset `off` from the packed array
// `src`, right-aligned in the low 4 bits of the result.
// Always reads the word following the one that contains `off`.
static inline u8i sub2seqbits(u8i *src, u8i off){
	const u8i word = off >> 5;
	const u4i lead = (off & 0x1FU) << 1;
	const u8i head = src[word] << lead;
	const u8i tail = (src[word + 1] >> (62 - lead)) >> 2;
	return (head | tail) >> 60;
}
// Extract `len` bases starting at base offset `off` from the packed array
// `src`, right-aligned in the result. `len` is expected to be in 1..32
// (0 would shift by 64 bits). Always reads the word following the one that
// contains `off`.
static inline u8i sub_seqbits(u8i *src, u8i off, u1i len){
	const u8i word = off >> 5;
	const u4i lead = (off & 0x1FU) << 1;
	const u8i head = src[word] << lead;
	const u8i tail = (src[word + 1] >> (62 - lead)) >> 2;
	return (head | tail) >> ((32 - len) << 1);
}
#define subseqbits(src, off, len) sub_seqbits(src, off, len)
// Compares two stretches of the packed array `bits`, starting at base
// offsets off1 and off2, one 32-base window at a time. Returns 1 as soon as
// the window at off1 is greater than the one at off2, and 0 when it is
// smaller or when no difference is found.
// `_len` is rounded with roundup_times(_len, 32) before the loop.
// NOTE(review): the loop header and the start of the v[0] assignment are
// truncated in this copy (text between '<' and the next '>' is missing);
// the loop step cannot be read from here -- restore from upstream.
static inline int cmpgt_seqbits(u8i *bits, u8i off1, u8i off2, u4i _len){
u8i idxs[2], v[2];
u4i offs[2], i, len;
idxs[0] = off1 >> 5;
idxs[1] = off2 >> 5;
offs[0] = (off1 & 0x1FU) << 1;
offs[1] = (off2 & 0x1FU) << 1;
len = roundup_times(_len, 32);
for(i=0;i> (62 - offs[0])) >> 2));
// 32-base window at the current position of the second stretch
v[1] = (bits[idxs[1]] << offs[1]) | (((bits[idxs[1] + 1] >> (62 - offs[1])) >> 2));
if(v[0] > v[1]){
return 1;
} else if(v[0] < v[1]){
return 0;
}
idxs[0] ++;
idxs[1] ++;
}
return 0;
}
#if __BYTE_ORDER == 1234
static const u4i spare_2bits_table[256] = {
0, 16777216, 33554432, 50331648, 65536, 16842752, 33619968, 50397184,
131072, 16908288, 33685504, 50462720, 196608, 16973824, 33751040, 50528256,
256, 16777472, 33554688, 50331904, 65792, 16843008, 33620224, 50397440,
131328, 16908544, 33685760, 50462976, 196864, 16974080, 33751296, 50528512,
512, 16777728, 33554944, 50332160, 66048, 16843264, 33620480, 50397696,
131584, 16908800, 33686016, 50463232, 197120, 16974336, 33751552, 50528768,
768, 16777984, 33555200, 50332416, 66304, 16843520, 33620736, 50397952,
131840, 16909056, 33686272, 50463488, 197376, 16974592, 33751808, 50529024,
1, 16777217, 33554433, 50331649, 65537, 16842753, 33619969, 50397185,
131073, 16908289, 33685505, 50462721, 196609, 16973825, 33751041, 50528257,
257, 16777473, 33554689, 50331905, 65793, 16843009, 33620225, 50397441,
131329, 16908545, 33685761, 50462977, 196865, 16974081, 33751297, 50528513,
513, 16777729, 33554945, 50332161, 66049, 16843265, 33620481, 50397697,
131585, 16908801, 33686017, 50463233, 197121, 16974337, 33751553, 50528769,
769, 16777985, 33555201, 50332417, 66305, 16843521, 33620737, 50397953,
131841, 16909057, 33686273, 50463489, 197377, 16974593, 33751809, 50529025,
2, 16777218, 33554434, 50331650, 65538, 16842754, 33619970, 50397186,
131074, 16908290, 33685506, 50462722, 196610, 16973826, 33751042, 50528258,
258, 16777474, 33554690, 50331906, 65794, 16843010, 33620226, 50397442,
131330, 16908546, 33685762, 50462978, 196866, 16974082, 33751298, 50528514,
514, 16777730, 33554946, 50332162, 66050, 16843266, 33620482, 50397698,
131586, 16908802, 33686018, 50463234, 197122, 16974338, 33751554, 50528770,
770, 16777986, 33555202, 50332418, 66306, 16843522, 33620738, 50397954,
131842, 16909058, 33686274, 50463490, 197378, 16974594, 33751810, 50529026,
3, 16777219, 33554435, 50331651, 65539, 16842755, 33619971, 50397187,
131075, 16908291, 33685507, 50462723, 196611, 16973827, 33751043, 50528259,
259, 16777475, 33554691, 50331907, 65795, 16843011, 33620227, 50397443,
131331, 16908547, 33685763, 50462979, 196867, 16974083, 33751299, 50528515,
515, 16777731, 33554947, 50332163, 66051, 16843267, 33620483, 50397699,
131587, 16908803, 33686019, 50463235, 197123, 16974339, 33751555, 50528771,
771, 16777987, 33555203, 50332419, 66307, 16843523, 33620739, 50397955,
131843, 16909059, 33686275, 50463491, 197379, 16974595, 33751811, 50529027
};
#else
/*
 * Lookup table used by spare_2bits(): the index is one packed byte (four
 * 2-bit values) and the entry is a u4i in which every 2-bit value has been
 * moved into a byte of its own.
 * In this variant the lowest two bits of the index end up in the least
 * significant byte of the entry (table[1] == 1, table[4] == 256,
 * table[16] == 65536, table[64] == 16777216).
 * NOTE(review): the #if that selects between the two table variants is not
 * visible here -- presumably a byte-order switch; confirm.
 */
static const u4i spare_2bits_table[256] = {
0, 1, 2, 3, 256, 257, 258, 259,
512, 513, 514, 515, 768, 769, 770, 771,
65536, 65537, 65538, 65539, 65792, 65793, 65794, 65795,
66048, 66049, 66050, 66051, 66304, 66305, 66306, 66307,
131072, 131073, 131074, 131075, 131328, 131329, 131330, 131331,
131584, 131585, 131586, 131587, 131840, 131841, 131842, 131843,
196608, 196609, 196610, 196611, 196864, 196865, 196866, 196867,
197120, 197121, 197122, 197123, 197376, 197377, 197378, 197379,
16777216, 16777217, 16777218, 16777219, 16777472, 16777473, 16777474, 16777475,
16777728, 16777729, 16777730, 16777731, 16777984, 16777985, 16777986, 16777987,
16842752, 16842753, 16842754, 16842755, 16843008, 16843009, 16843010, 16843011,
16843264, 16843265, 16843266, 16843267, 16843520, 16843521, 16843522, 16843523,
16908288, 16908289, 16908290, 16908291, 16908544, 16908545, 16908546, 16908547,
16908800, 16908801, 16908802, 16908803, 16909056, 16909057, 16909058, 16909059,
16973824, 16973825, 16973826, 16973827, 16974080, 16974081, 16974082, 16974083,
16974336, 16974337, 16974338, 16974339, 16974592, 16974593, 16974594, 16974595,
33554432, 33554433, 33554434, 33554435, 33554688, 33554689, 33554690, 33554691,
33554944, 33554945, 33554946, 33554947, 33555200, 33555201, 33555202, 33555203,
33619968, 33619969, 33619970, 33619971, 33620224, 33620225, 33620226, 33620227,
33620480, 33620481, 33620482, 33620483, 33620736, 33620737, 33620738, 33620739,
33685504, 33685505, 33685506, 33685507, 33685760, 33685761, 33685762, 33685763,
33686016, 33686017, 33686018, 33686019, 33686272, 33686273, 33686274, 33686275,
33751040, 33751041, 33751042, 33751043, 33751296, 33751297, 33751298, 33751299,
33751552, 33751553, 33751554, 33751555, 33751808, 33751809, 33751810, 33751811,
50331648, 50331649, 50331650, 50331651, 50331904, 50331905, 50331906, 50331907,
50332160, 50332161, 50332162, 50332163, 50332416, 50332417, 50332418, 50332419,
50397184, 50397185, 50397186, 50397187, 50397440, 50397441, 50397442, 50397443,
50397696, 50397697, 50397698, 50397699, 50397952, 50397953, 50397954, 50397955,
50462720, 50462721, 50462722, 50462723, 50462976, 50462977, 50462978, 50462979,
50463232, 50463233, 50463234, 50463235, 50463488, 50463489, 50463490, 50463491,
50528256, 50528257, 50528258, 50528259, 50528512, 50528513, 50528514, 50528515,
50528768, 50528769, 50528770, 50528771, 50529024, 50529025, 50529026, 50529027
};
#endif
/*
 * Expand the 32 two-bit values packed in `v` into the 32 bytes of `bs`.
 *
 * The bytes of `v` are consumed from the most significant to the least
 * significant; each one is looked up in spare_2bits_table and the resulting
 * 4-byte entry is written to the next four bytes of `bs`.
 *
 * The entry is copied with memcpy instead of being stored through a
 * `(u4i*)bs` cast: `bs` is a byte array with no alignment guarantee, and
 * writing it through a u4i lvalue violates the effective-type rules.
 * The bytes written are the same as with the direct store.
 */
static inline void spare_2bits(u1i bs[32], u8i v){
	u4i quad;
	int k;
	for(k = 0; k < 8; k++){
		quad = spare_2bits_table[(v >> (56 - 8 * k)) & 0xFF];
		memcpy(bs + 4 * k, &quad, sizeof(quad));
	}
}
// Growable array of 2-bit values packed 32 per u8i word.
typedef struct {
u8i *bits; // packed storage; allocated as (cap / 32 + 1) words by init/encap/pack
u8i size; // number of 2-bit values currently stored
u8i cap; // capacity counted in 2-bit values (kept a multiple of 32 by the code here)
} BaseBank;
static inline size_t basebank_obj_desc_cnt(void *obj, int idx){ return ((((BaseBank*)obj)->size + 31) / 32 + 1) * 8; idx = idx; }
static inline void basebank_obj_desc_post_load(void *obj, size_t aux_data){
BaseBank *bnk;
UNUSED(aux_data);
bnk = (BaseBank*)obj;
bnk->cap = ((bnk->size + 31) / 32) * 32;
}
// Object descriptor for BaseBank: one described child (the `bits` array at
// offsetof(BaseBank, bits)), sized by basebank_obj_desc_cnt and fixed up by
// basebank_obj_desc_post_load.
// NOTE(review): obj_desc_t's field layout is declared elsewhere -- confirm the meaning of the `1, {1}` members.
static const obj_desc_t basebank_obj_desc = {"BaseBank", sizeof(BaseBank), 1, {1}, {offsetof(BaseBank, bits)}, {(obj_desc_t*)&OBJ_DESC_DATA}, basebank_obj_desc_cnt, basebank_obj_desc_post_load};
/*
 * Allocate an empty BaseBank with room for 256 values (plus one spare,
 * zeroed word). Release it with free_basebank().
 *
 * Aborts with a message on allocation failure, matching the policy already
 * used by encap_basebank(); previously a failed allocation was dereferenced.
 */
static inline BaseBank* init_basebank(){
	BaseBank *bnk;
	bnk = malloc(sizeof(BaseBank));
	if(bnk == NULL){
		fprintf(stderr, " -- Out of memory, try to allocate %llu bytes, in %s -- %s:%d --\n", (unsigned long long)sizeof(BaseBank), __FUNCTION__, __FILE__, __LINE__);
		abort();
	}
	bnk->size = 0;
	bnk->cap = 256;
	bnk->bits = calloc(bnk->cap / 32 + 1, 8);
	if(bnk->bits == NULL){
		fprintf(stderr, " -- Out of memory, try to allocate %llu bytes, in %s -- %s:%d --\n", (unsigned long long)((bnk->cap / 32 + 1) * 8), __FUNCTION__, __FILE__, __LINE__);
		abort();
	}
	return bnk;
}
/*
 * Release a BaseBank and its bit array.
 * A NULL `bnk` is accepted and ignored, mirroring free().
 */
static inline void free_basebank(BaseBank *bnk){
	if(bnk == NULL) return;
	free(bnk->bits);
	free(bnk);
}
/*
 * Make sure the bank can hold `inc` more values beyond `size`, growing the
 * bit array if necessary. Newly obtained words are zero-filled.
 * Aborts (after printing a backtrace) on 64-bit overflow or when realloc fails.
 */
static inline void encap_basebank(BaseBank *bnk, u8i inc){
u8i old;
u8i *bits;
// Fast path: the free room already covers the request.
if(bnk->cap - bnk->size >= inc) return;
old = bnk->cap;
// size + inc would not fit in 64 bits.
if(MAX_U8 - inc <= bnk->size){
fprintf(stderr, " -- Overflow(64bits) %llu + %llu, in %s -- %s:%d --\n", (u8i)bnk->size, (u8i)inc, __FUNCTION__, __FILE__, __LINE__);
print_backtrace(stderr, 20);
abort();
}
// Guards the "+ 0x3FFFFFFF" rounding used in the large-size branch below.
if(MAX_U8 - inc < 0x3FFFFFFFLLU){
fprintf(stderr, " -- Overflow(64bits) %llu + %llu, in %s -- %s:%d --\n", (u8i)bnk->size, (u8i)inc, __FUNCTION__, __FILE__, __LINE__);
print_backtrace(stderr, 20);
abort();
}
if(bnk->size + inc <= 0x3FFFFFFFLLU){
// Small banks: double the required size (roundup_times presumably rounds up to a multiple of 32 -- confirm).
bnk->cap = roundup_times(2 * (bnk->size + inc), 32);
} else {
//bnk->cap = ((bnk->size + inc + 0xFFFFFFFLLU - 1LLU) / 0xFFFFFFFLLU) * 0xFFFFFFFLLU;
// Large banks: round the required size up to a multiple of 2^30 values.
bnk->cap = (bnk->size + inc + 0x3FFFFFFFLLU) & (MAX_U8 << 30);
}
if(bnk->cap < 32) bnk->cap = 32;
// cap / 32 words plus one spare word, 8 bytes each.
bits = realloc(bnk->bits, ((bnk->cap >> 5) + 1) << 3);
if(bits == NULL){
fprintf(stderr, " -- Out of memory, try to allocate %llu bytes, old size %llu, in %s -- %s:%d --\n", (u8i)bnk->cap >> 2, old >> 2, __FUNCTION__, __FILE__, __LINE__);
print_backtrace(stderr, 20);
abort();
}
bnk->bits = bits;
// Zero everything from the word at the old capacity to the end of the new
// allocation (including the spare word): (cap + 32 - old) / 4 bytes.
memset(bnk->bits + (old / 32), 0, (bnk->cap + 32 - old) / 4);
}
// Empty the bank by resetting `size` to 0. The bit array is neither zeroed
// nor shrunk (the memset below is deliberately disabled).
static inline void clear_basebank(BaseBank *bnk){
//memset(bnk->bits, 0, ((bnk->size + 31) / 32) * 8);
bnk->size = 0;
}
/*
 * Clear the unused low-order bits of the last, partially filled word so that
 * only the `size & 31` stored values remain set in it.
 * Does nothing when the bank is full (size >= cap) or when `size` is a
 * multiple of 32.
 */
static inline void normalize_basebank(BaseBank *bnk){
	u8i used, word;
	if(bnk->size >= bnk->cap) return;
	used = bnk->size & 0x1FU;
	if(used == 0) return;
	word = bnk->size >> 5;
	bnk->bits[word] = bnk->bits[word] & (MAX_U8 << (64 - (used << 1)));
}
/*
 * Shrink the bit array to the smallest capacity (a multiple of 32, at least
 * 32) that still holds `size` values, keeping one zeroed spare word at the
 * end. No-op when the bank is already that small.
 *
 * The realloc result is checked before it replaces bnk->bits: if shrinking
 * fails, the bank is left untouched with its larger but still valid buffer,
 * instead of losing the pointer and writing through NULL.
 */
static inline void pack_basebank(BaseBank *bnk){
	u8i size;
	u8i *bits;
	size = (bnk->size + 31) & (~0x1FLLU);
	if(size == 0) size = 32;
	if(size >= bnk->cap) return;
	bits = realloc(bnk->bits, ((size >> 5) + 1) << 3);
	if(bits == NULL) return;
	bnk->bits = bits;
	bnk->cap = size; // size is already a multiple of 32
	memset(bnk->bits + (bnk->cap >> 5), 0, 8);
}
/*
 * Append one value to the bank. Only the two low bits of `v` are stored.
 */
static inline void bit2basebank(BaseBank *bnk, u1i v){
	u1i code;
	code = v & 0x03;
	encap_basebank(bnk, 1);
	bit2bits(bnk->bits, bnk->size, code);
	bnk->size += 1;
}
/*
 * Append `len` values taken from the packed array `bits`, starting at value
 * offset `off`, one value at a time.
 *
 * The loop header had been truncated ("for(offset=off;offsetbits, ...");
 * it is restored here so the function compiles again.
 */
static inline void bits2basebank(BaseBank *bnk, u8i *bits, u8i off, u8i len){
	u8i offset;
	encap_basebank(bnk, len);
	for(offset=off;offset<off+len;offset++){
		bit2bits(bnk->bits, bnk->size, bits2bit(bits, offset));
		bnk->size ++;
	}
}
#define fwdbits2basebank(bnk, bits, off, len) bits2basebank(bnk, bits, off, len)
/*
 * Append `len` values from the packed array `bits` (starting at value offset
 * `off`), working a whole word at a time instead of value by value.
 * NOTE(review): relies on subseqbits() returning the requested values
 * right-aligned and sub32seqbits() returning 32 values starting at `off` --
 * both are defined elsewhere; confirm.
 */
static inline void fast_bits2basebank(BaseBank *bnk, u8i *bits, u8i off, u8i len){
u8i end, dat;
u4i gap;
encap_basebank(bnk, len);
if(len == 0) return;
// Step 1: the destination ends inside a word -- fill its free slots first.
if(bnk->size & 0x1FU){
gap = 32 - (bnk->size & 0x1FU); // free slots left in the current word
if(len <= gap){
// Everything fits into the current word; OR it in at the right position.
dat = subseqbits(bits, off, len);
bnk->bits[bnk->size >> 5] |= dat << ((gap - len) << 1);
bnk->size += len;
return;
} else {
// Top up the current word (OR assumes its free slots are zero), then continue word-aligned.
dat = subseqbits(bits, off, gap);
bnk->bits[bnk->size >> 5] |= dat;
bnk->size += gap;
off += gap;
len -= gap;
}
}
// Step 2: copy whole words.
end = off + len;
for(;off+32<=end;off+=32){
dat = sub32seqbits(bits, off);
bnk->bits[bnk->size >> 5] = dat;
bnk->size += 32;
}
// Step 3: remaining tail; mask off the slots beyond the requested range.
if(off < end){
dat = sub32seqbits(bits, off);
bnk->bits[bnk->size >> 5] = dat & (MAX_U8 << ((32 - (end - off)) << 1));
bnk->size += end - off;
} else {
// Ended on a word boundary: clear the next word so later ORs start from zero.
bnk->bits[bnk->size >> 5] = 0;
}
}
#define fast_fwdbits2basebank(bnk, bits, off, len) fast_bits2basebank(bnk, bits, off, len)
/*
 * Append the `len` values of `bits` in [off, off + len) in reverse order,
 * each one passed through bits2revbit(), one value at a time.
 */
static inline void revbits2basebank(BaseBank *bnk, u8i *bits, u8i off, u8i len){
	u8i left;
	encap_basebank(bnk, len);
	left = len;
	while(left > 0){
		left --;
		bit2bits(bnk->bits, bnk->size, bits2revbit(bits, (off + left)));
		bnk->size ++;
	}
}
/*
 * Word-at-a-time counterpart of revbits2basebank(): append the `len` values
 * of `bits` in [off, off + len) in reversed order, transformed by
 * dna_rev_seq()/dna_rev_seq32().
 * NOTE(review): dna_rev_seq*, subseqbits and sub32seqbits are defined
 * elsewhere; the comments below assume they reverse-complement / extract
 * right-aligned values -- confirm.
 */
static inline void fast_revbits2basebank(BaseBank *bnk, u8i *bits, u8i off, u8i len){
u8i end, dat;
u4i gap;
if(len == 0) return;
encap_basebank(bnk, len);
// Step 1: destination ends inside a word -- fill its free slots first.
if(bnk->size & 0x1FU){
gap = 32 - (bnk->size & 0x1FU); // free slots left in the current word
if(len <= gap){
dat = subseqbits(bits, off, len);
dat = dna_rev_seq(dat, len);
bnk->bits[bnk->size >> 5] |= dat << ((gap - len) << 1);
bnk->size += len;
return;
} else {
// Reversed output starts with the END of the source range.
dat = subseqbits(bits, off + len - gap, gap);
dat = dna_rev_seq(dat, gap);
bnk->bits[bnk->size >> 5] |= dat;
bnk->size += gap;
//off += gap;
// `off` stays put: the consumed values were taken from the tail.
len -= gap;
}
}
// Step 2: whole words, walking the source backwards from `end`.
end = off + len;
for(;off+32<=end;){
end -= 32;
dat = sub32seqbits(bits, end);
dat = dna_rev_seq32(dat);
bnk->bits[bnk->size >> 5] = dat;
bnk->size += 32;
}
// Step 3: the remaining (end - off) values at the front of the source range.
if(off < end){
dat = sub32seqbits(bits, off);
dat = dna_rev_seq32(dat);
//bnk->bits[bnk->size >> 5] = dat & (MAX_U8 << ((32 - (end - off)) << 1));
// After reversal the wanted values sit in the low bits; shift them to the top of the word.
bnk->bits[bnk->size >> 5] = dat << ((32 - (end - off)) << 1);
bnk->size += end - off;
} else {
// Ended on a word boundary: clear the next word.
bnk->bits[bnk->size >> 5] = 0;
}
}
/*
 * Append `len` characters of `seq`, each translated through base_bit_table
 * and reduced to its two low bits.
 *
 * idx1 is the destination word and idx2 the bit offset inside it; values are
 * written from the top of the word downwards (shift 62 - idx2). Every word is
 * zeroed before its first value is OR-ed in.
 *
 * The loop header had been truncated ("for(i=0;ibits[idx1] ..."); it is
 * restored here. The "& 0x03" mask keeps a table value of 4 (see
 * seq2basebank2) from spilling into the neighbouring slot.
 */
static inline void seq2basebank(BaseBank *bnk, char *seq, u8i len){
	u8i idx1, i, c;
	u1i idx2;
	encap_basebank(bnk, len);
	idx1 = bnk->size >> 5;
	idx2 = ((bnk->size) & 0x1FU) << 1;
	bnk->size += len;
	if(idx2 == 0) bnk->bits[idx1] = 0;
	for(i=0;i<len;i++){
		c = base_bit_table[(int)seq[i]] & 0x03;
		bnk->bits[idx1] |= c << (62 - idx2);
		idx2 = (idx2 + 2) & 0x3F;
		if(idx2 == 0){
			// moved on to a fresh word: start it from zero
			bnk->bits[++idx1] = 0;
		}
	}
}
#define fwdseq2basebank(bnk, seq, len) seq2basebank(bnk, seq, len)
/*
 * Append `seq[0..len)` back to front, storing for each character the bitwise
 * complement (two low bits) of its base_bit_table code.
 */
static inline void revseq2basebank(BaseBank *bnk, char *seq, u8i len){
	u8i left;
	u1i code;
	encap_basebank(bnk, len);
	for(left = len; left > 0; left--){
		code = base_bit_table[(int)seq[left - 1]];
		code = (~code) & 0x03;
		bit2bits(bnk->bits, bnk->size, code);
		bnk->size ++;
	}
}
/*
 * Append `seq[0..len)` front to back. A character whose base_bit_table code
 * is 4 is replaced by a random 2-bit value drawn from lrand48().
 */
static inline void seq2basebank2(BaseBank *bnk, char *seq, u8i len){
	u8i pos;
	u1i code;
	encap_basebank(bnk, len);
	for(pos = 0; pos < len; pos++){
		code = base_bit_table[(int)seq[pos]];
		if(code == 4){
			code = lrand48() & 0x03;
		}
		bit2bits(bnk->bits, bnk->size, code);
		bnk->size ++;
	}
}
/*
 * Append `seq[0..len)` back to front, complemented. A character whose
 * base_bit_table code is 4 is first replaced by a random 2-bit value from
 * lrand48(); the (possibly random) code is then complemented and stored.
 */
static inline void revseq2basebank2(BaseBank *bnk, char *seq, u8i len){
	u8i left;
	u1i code;
	encap_basebank(bnk, len);
	for(left = len; left > 0; left--){
		code = base_bit_table[(int)seq[left - 1]];
		if(code == 4){
			code = lrand48() & 0x03;
		}
		code = (~code) & 0x03;
		bit2bits(bnk->bits, bnk->size, code);
		bnk->size ++;
	}
}
// Return the value stored at position `off` (via bits2bit). No bounds check against `size`.
static inline u1i get_basebank(BaseBank *bnk, u8i off){ return bits2bit(bnk->bits, off); }
/*
 * Decode `len` values starting at position `off` into characters (via
 * bit_base_table) and NUL-terminate the result.
 * `seq` must have room for len + 1 bytes.
 *
 * The loop header had been truncated ("for(i=0;ibits, off + i)];"); restored.
 */
static inline void seq_basebank(BaseBank *bnk, u8i off, u8i len, char *seq){
	u8i i;
	for(i=0;i<len;i++){
		seq[i] = bit_base_table[bits2bit(bnk->bits, off + i)];
	}
	seq[i] = 0;
}
#define fwdseq_basebank(bnk, off, len, seq) seq_basebank(bnk, off, len, seq)
/*
 * Copy `len` values starting at position `off` into `seq`, one 2-bit value
 * per byte. No terminator is written.
 *
 * The loop header had been truncated ("for(i=0;ibits, off + i);"); restored.
 */
static inline void bitseq_basebank(BaseBank *bnk, u8i off, u8i len, u1i *seq){
	u8i i;
	for(i=0;i<len;i++){
		seq[i] = bits2bit(bnk->bits, off + i);
	}
}
/*
 * Decode the `len` values in [off, off + len) in reverse order, complementing
 * each 2-bit value before translating it through bit_base_table, and
 * NUL-terminate the result. `seq` must have room for len + 1 bytes.
 *
 * The loop header had been truncated; restored.
 */
static inline void revseq_basebank(BaseBank *bnk, u8i off, u8i len, char *seq){
	u8i i;
	for(i=0;i<len;i++){
		seq[i] = bit_base_table[(~bits2bit(bnk->bits, off + len - 1 - i)) & 0x03];
	}
	seq[i] = 0;
}
/*
 * Copy the `len` values in [off, off + len) into `seq` in reverse order,
 * each complemented (two low bits), one value per byte.
 *
 * The loop header had been truncated; restored.
 */
static inline void revbitseq_basebank(BaseBank *bnk, u8i off, u8i len, u1i *seq){
	u8i i;
	for(i=0;i<len;i++){
		seq[i] = (~bits2bit(bnk->bits, off + len - 1 - i)) & 0x03;
	}
}
/*
 * Replace the bank's content by its reversed form (as produced by
 * fast_revbits2basebank) in place.
 * The statement order matters: the old content is parked behind itself in the
 * same buffer and then read back while the front is being overwritten.
 */
static inline void reverse_basebank(BaseBank *bnk){
u8i size, rsize;
size = bnk->size;
rsize = (bnk->size + 31) & (~0x1FLLU); // size rounded up to whole words
// Reserve room for a word-aligned copy of the content plus 32 extra slots.
encap_basebank(bnk, rsize + 32);
// Park the current content right after itself, starting at word rsize / 32.
memcpy(bnk->bits + (rsize >> 5), bnk->bits + 0, (rsize >> 5) << 3);
bnk->size = 0;
// Re-append from the parked copy (value offset rsize) in reversed order.
fast_revbits2basebank(bnk, bnk->bits, rsize, size);
}
/*
 * Write the `len` values starting at position `off` to `out` as characters
 * (via bit_base_table), with no line breaks. Output is produced in chunks of
 * at most 100 characters through a stack buffer.
 *
 * The outer loop header, the chunk-end computation and the inner loop header
 * had been lost ("for(b=off;bbits, i)];"). They are restored from what is
 * left: buf[101], "buf[e - b] = '\0'" and "b = e".
 */
static inline void print_seq_basebank(BaseBank *bnk, u8i off, u8i len, FILE *out){
	u8i i, b, e;
	char buf[101];
	for(b=off;b<off+len;){
		e = num_min(b + 100, off + len);
		for(i=b;i<e;i++){
			buf[i - b] = bit_base_table[bits2bit(bnk->bits, i)];
		}
		buf[e - b] = '\0';
		fputs(buf, out);
		//fputc('\n', out);
		b = e;
	}
}
/*
 * Write the `len` values starting at position `off` to `out` as characters,
 * `linewidth` per line, each line terminated by '\n'.
 * A `linewidth` below 1 falls back to 100.
 *
 * The loop headers had been lost ("for(b=off;bbits, i)];") and are restored
 * from what is left: the linewidth + 1 buffer, "buf[e - b] = '\0'" and
 * "b = e". The line buffer allocation is now checked; on failure the function
 * reports it and aborts, as encap_basebank() does.
 */
static inline void print_lines_basebank(BaseBank *bnk, u8i off, u8i len, FILE *out, int linewidth){
	u8i i, b, e;
	char *buf;
	if(linewidth < 1) linewidth = 100;
	buf = malloc(linewidth + 1);
	if(buf == NULL){
		fprintf(stderr, " -- Out of memory, try to allocate %llu bytes, in %s -- %s:%d --\n", (unsigned long long)linewidth + 1, __FUNCTION__, __FILE__, __LINE__);
		abort();
	}
	for(b=off;b<off+len;){
		e = num_min(b + linewidth, off + len);
		for(i=b;i<e;i++){
			buf[i - b] = bit_base_table[bits2bit(bnk->bits, i)];
		}
		buf[e - b] = '\0';
		fputs(buf, out);
		fputc('\n', out);
		b = e;
	}
	free(buf);
}
// Alias: "fwd" printing is plain print_seq_basebank().
#define print_fwdseq_basebank(bnk, off, len, out) print_seq_basebank(bnk, off, len, out)
// Same as print_seq_basebank(), followed by a single '\n'.
static inline void println_seq_basebank(BaseBank *bnk, u8i off, u8i len, FILE *out){
print_seq_basebank(bnk, off, len, out);
fputc('\n', out);
}
// Alias: "fwd" line printing is plain println_seq_basebank().
#define println_fwdseq_basebank(bnk, off, len, out) println_seq_basebank(bnk, off, len, out)
/*
 * Write the `len` values in [off, off + len) to `out` in reverse order,
 * each passed through bits2revbit() and bit_base_table, with no line breaks.
 * Output is flushed to `out` every 64 characters and once more for the tail.
 *
 * The loop header had been truncated ("for(i=0;ibits, off + len - 1 - i)];");
 * restored. `i` is advanced inside the body, so the header has no increment.
 */
static inline void print_revseq_basebank(BaseBank *bnk, u8i off, u8i len, FILE *out){
	u8i i;
	char buf[65];
	buf[64] = '\0';
	for(i=0;i<len;){
		buf[i & 0x3F] = bit_base_table[bits2revbit(bnk->bits, off + len - 1 - i)];
		i ++;
		if((i & 0x3F) == 0){
			fprintf(out, "%s", buf);
		}
	}
	if(i & 0x3F){
		buf[i & 0x3F] = '\0';
		fprintf(out, "%s", buf);
	}
}
// Thin wrappers that apply the sub*seqbits helpers to the bank's bit array.
// NOTE(review): the helpers are defined elsewhere; presumably they return 32,
// 4 and `len` consecutive values starting at `off` -- confirm.
static inline u8i sub32_basebank(BaseBank *bnk, u8i off){ return sub32seqbits(bnk->bits, off); }
static inline u8i sub4_basebank(BaseBank *bnk, u8i off){ return sub4seqbits(bnk->bits, off); }
// assert(len > 0 && len <= 32)
static inline u8i subbits_basebank(BaseBank *bnk, u8i off, u1i len){ return sub_seqbits(bnk->bits, off, len); }
// Same as print_revseq_basebank(), followed by a single '\n'.
static inline void println_revseq_basebank(BaseBank *bnk, u8i off, u8i len, FILE *out){
print_revseq_basebank(bnk, off, len, out);
fputc('\n', out);
}
/*
 * Build a word from `len` run-compressed values starting at position `off`:
 * consecutive repeats of the same value are skipped, and each new value is
 * shifted into the result from the right.
 * The scan advances `off` until `len` distinct-from-previous values have been
 * collected; there is no upper bound on `off` in this function.
 *
 * The loop header had been truncated ("for(i=0;ibits, off);"); restored.
 * `off` must advance in the header because the body uses `continue`.
 */
static inline u8i hzsubbits_basebank(BaseBank *bnk, u8i off, u1i len){
	u8i k;
	u1i i, b, c;
	k = 0;
	b = 4; // 4 is not a valid 2-bit value, so the first value always counts
	for(i=0;i<len;off++){
		c = bits2bit(bnk->bits, off);
		if(c == b) continue;
		i ++;
		b = c;
		k = (k << 2) | b;
	}
	return k;
}
/*
 * Scan [*_off, *_off + len) for windows of `size` values equal to `bits`.
 *
 * `_off` is both input and output: its first element gives the start
 * position; on return its first `ret` elements hold the start positions of
 * the matches. The caller must provide room for `max_occ` entries.
 * Returns the number of matches stored; the scan stops once `max_occ` is reached.
 *
 * The window is maintained as a rolling value: it is primed with the first
 * size - 1 values and then extended by one value per step.
 *
 * The loop header had been truncated ("for(;offbits, off);"); restored.
 */
static inline int bitsearch_basebank(BaseBank *bnk, u8i *_off, u8i len, u8i bits, u1i size, int max_occ){
	u8i off, end, k, mask;
	u1i b;
	int ret;
	off = *_off;
	end = off + len;
	mask = MAX_U8 >> ((32 - size) << 1); // keeps the low 2 * size bits
	k = subbits_basebank(bnk, off, size - 1);
	off += size - 1;
	ret = 0;
	for(;off<end;off++){
		b = bits2bit(bnk->bits, off);
		k = ((k << 2) | b) & mask;
		if(k == bits){
			_off[ret++] = off - (size - 1);
			if(ret >= max_occ) break;
		}
	}
	return ret;
}
/*
 * Run-compressed variant of bitsearch_basebank(): repeats of the same value
 * are skipped while scanning [*_off, *_off + len), and the rolling window of
 * the last `size` distinct-from-previous values is compared with `bits`.
 *
 * `_off` is both input (start position in its first element) and output
 * (match positions); the caller must provide room for `max_occ` entries.
 * Each stored position is "off - (size - 1)", where `off` is the position of
 * the window's last value. Returns the number of matches stored.
 *
 * The loop header had been truncated ("for(;offbits, off);"); restored.
 */
static inline int hzbitsearch_basebank(BaseBank *bnk, u8i *_off, u8i len, u8i bits, u1i size, int max_occ){
	u8i off, h, end, k, mask;
	u1i b, c;
	int ret;
	off = *_off;
	end = off + len;
	mask = MAX_U8 >> ((32 - size) << 1); // keeps the low 2 * size bits
	k = 0;
	h = 0; // number of values shifted into the window so far
	b = 4; // 4 is not a valid 2-bit value, so the first value always counts
	ret = 0;
	for(;off<end;off++){
		c = bits2bit(bnk->bits, off);
		if(c == b) continue;
		b = c;
		h ++;
		k = ((k << 2) | b) & mask;
		if(h >= size && k == bits){
			_off[ret++] = off - (size - 1);
			if(ret >= max_occ) break;
		}
	}
	return ret;
}
/*
 * Count the positions at which the `len`-value stretches starting at `off1`
 * and `off2` differ. Full 32-value words are compared first; a final partial
 * word is shifted right so that only its leading `len - pos` values count.
 */
static inline u4i mismatch_basebank(BaseBank *bnk, u8i off1, u8i off2, u4i len){
	u8i lhs, rhs;
	u4i total, pos;
	total = 0;
	pos = 0;
	while(pos + 32 <= len){
		lhs = sub32seqbits(bnk->bits, off1 + pos);
		rhs = sub32seqbits(bnk->bits, off2 + pos);
		total += count_ones_bit64(dna_xor2ones(lhs ^ rhs));
		pos += 32;
	}
	if(pos >= len){
		return total;
	}
	lhs = sub32seqbits(bnk->bits, off1 + pos);
	rhs = sub32seqbits(bnk->bits, off2 + pos);
	total += count_ones_bit64((dna_xor2ones(lhs ^ rhs)) >> ((32 - (len - pos)) << 1));
	return total;
}
// Per-worker state of the multi-threaded MSD radix sort (see the _mradix
// worker function and msd_radix_sort_u4_basebank).
// NOTE(review): thread_beg_def/thread_end_def come from thread.h; presumably
// they wrap these fields in a struct that also carries n_cpu / t_idx -- confirm.
thread_beg_def(_mradix);
BaseBank *bb; // bank whose positions are being sorted
u4i *counts[2]; // bucket arrays: per-worker in task 1, shared in task 11
u4i *offs; // positions (or the slice of them) this task works on
u1v *lcps; // output of task 3
u4i size, klen; // number of positions in `offs` / sort depth in values
int task; // 1, 11, 2 or 3 -- selects the branch in the worker
FILE *log; // optional progress output, may be NULL
thread_end_def(_mradix);
/*
 * Worker body of the multi-threaded MSD radix sort. Each wake-up executes the
 * branch selected by _mradix->task:
 *   1  - count, for this worker's share of positions, how often each 8-value
 *        prefix occurs (into the worker's private counts[1]);
 *   11 - scatter positions into `offs` by 8-value prefix; a worker only
 *        handles prefixes v with v % n_cpu == t_idx, so bucket cursors are
 *        never shared between workers;
 *   2  - sort one prefix bucket on the values after the first 8;
 *   3  - compute, for this worker's slice, the common-prefix length of each
 *        position with its predecessor in `offs`.
 *
 * Four loop headers (tasks 1, 11, 2 and 3) had been truncated; they are
 * restored here. Everything else is unchanged.
 */
thread_beg_func(_mradix);
	BaseBank *bb;
	u4i *offs, *counts[2];
	u4i i, j, size, klen, m, n, v, t;
	u4i ncpu, tidx;
	bb = _mradix->bb;
	counts[0] = calloc((MAX_U2 + 1), sizeof(u4i)); // used in twice
	counts[1] = calloc((MAX_U2 + 1), sizeof(u4i));
	ncpu = _mradix->n_cpu;
	tidx = _mradix->t_idx;
thread_beg_loop(_mradix);
	if(_mradix->task == 1){
		size = _mradix->size;
		// positions are interleaved across workers: t_idx, t_idx + n_cpu, ...
		for(i=_mradix->t_idx;i<size;i+=_mradix->n_cpu){
			v = sub8seqbits(bb->bits, i);
			counts[1][v] ++;
		}
		// publish the private arrays so the caller can sum them up
		_mradix->counts[0] = counts[0];
		_mradix->counts[1] = counts[1];
	} else if(_mradix->task == 11){
		offs = _mradix->offs;
		size = _mradix->size;
		for(i=0;i<size;i++){
			v = sub8seqbits(bb->bits, i);
			if((v % _mradix->n_cpu) == (u4i)_mradix->t_idx){
				offs[_mradix->counts[0][v]++] = i;
			}
			if(_mradix->t_idx == 0 && _mradix->log && (i % 1000000) == 0){
				fprintf(_mradix->log, "\r%u", i); fflush(_mradix->log);
			}
		}
		if(_mradix->t_idx == 0 && _mradix->log){
			fprintf(_mradix->log, "\r%u\n", size);
		}
	} else if(_mradix->task == 2) {
		offs = _mradix->offs;
		size = _mradix->size;
		klen = _mradix->klen - 8;
		if(size <= MAX_U1){
			sort_array(offs, size, u4i, cmpgt_seqbits(bb->bits, a + 8, b + 8, klen)); // 8 bp already sorted
		} else {
			// large bucket: one more radix pass on the next 4 values
			memset(counts[1], 0, (MAX_U1 + 1) * sizeof(u4i));
			for(i=0;i<size;i++){
				v = sub4seqbits(bb->bits, offs[i] + 8);
				counts[1][v] ++;
			}
			// counts[0][x] = start of sub-bucket x, counts[1][x] = its end
			m = 0;
			for(i=0;i<=MAX_U1;i++){
				counts[0][i] = m;
				m += counts[1][i];
				counts[1][i] = m;
			}
			// in-place permutation: keep swapping an element into its own
			// sub-bucket until one that belongs to sub-bucket m turns up
			for(m=0;m<=MAX_U1;m++){
				while(counts[0][m] < counts[1][m]){
					v = offs[counts[0][m]];
					n = sub4seqbits(bb->bits, v + 8);
					while(n > m){
						t = offs[counts[0][n]];
						offs[counts[0][n]] = v;
						counts[0][n] ++;
						v = t;
						n = sub4seqbits(bb->bits, v + 8);
					}
					offs[counts[0][m]++] = v;
				}
			}
			// counts[0][m] now marks the end of sub-bucket m; finish each
			// sub-bucket with a comparison sort on the remaining values
			n = 0;
			klen -= 4;
			for(m=0;m<=MAX_U1;m++){
				if(counts[0][m] - n < 2){
					// nothing to do
				} else {
					sort_array(offs + n, counts[0][m] - n, u4i, cmpgt_seqbits(bb->bits, a + 8 + 4, b + 8 + 4, klen));
				}
				n = counts[0][m];
			}
		}
	} else if(_mradix->task == 3){
		u1v *lcps;
		u8i mask;
		u4i beg, end;
		u1i lcp;
		offs = _mradix->offs;
		lcps = _mradix->lcps;
		// contiguous slice [beg, end) for this worker
		beg = ((_mradix->size + ncpu - 1) / ncpu) * tidx;
		end = ((_mradix->size + ncpu - 1) / ncpu) * (tidx + 1);
		if(end > _mradix->size) end = _mradix->size;
		klen = _mradix->klen;
		if(beg == 0){
			// the first entry has no predecessor
			lcps->buffer[0] = 0;
			beg = 1;
		}
		for(i=beg;i<end;i++){
			lcp = 0;
			for(j=0;j<klen;j+=32){
				mask = sub32seqbits(bb->bits, offs[i - 1] + j) ^ sub32seqbits(bb->bits, offs[i] + j);
				if(mask == 0){
					lcp += 32;
				} else {
					// leading zero bits / 2 = number of equal leading values
					lcp += __builtin_clzll(mask) >> 1;
					break;
				}
			}
			lcps->buffer[i] = lcp;
		}
	}
thread_end_loop(_mradix);
	free(counts[0]);
	free(counts[1]);
thread_end_func(_mradix);
// bucket size = 4^8 = (MAX_U2 + 1)
/*
 * Sort the start positions of `bb` by the sequence found at each position,
 * using `ncpu` _mradix workers, and store them in `offs`.
 * The sort depth is `_klen` rounded up by roundup_times(_klen, 32); positions
 * in the last `klen` values of the bank are left out (size = max(bb->size,
 * klen) - klen). If `lcps` is non-NULL it receives, per sorted position, the
 * value computed by worker task 3. `log` may be NULL.
 * NOTE(review): the thread_* macros come from thread.h; the comments below
 * describe the apparent choreography -- confirm against that header.
 */
static inline void msd_radix_sort_u4_basebank(BaseBank *bb, u4v *offs, u1v *lcps, u1i _klen, u4i ncpu, FILE *log){
u4i *counts[3]; // off, end, off
u4i klen, i, j, size, m, n;
thread_preprocess(_mradix);
klen = roundup_times(_klen, 32);
size = num_max(bb->size, klen) - klen;
clear_u4v(offs);
encap_u4v(offs, size);
offs->size = size;
counts[0] = calloc(MAX_U2 + 1, sizeof(u4i));
counts[1] = calloc(MAX_U2 + 1, sizeof(u4i));
if(log) fprintf(log, "[%s] msd_radix_sort length=%u depth=%u\n", date(), (u4i)bb->size, klen);
thread_beg_init(_mradix, ncpu);
_mradix->bb = bb;
_mradix->counts[0] = NULL;
_mradix->counts[1] = NULL;
_mradix->size = size;
_mradix->offs = NULL;
_mradix->klen = klen;
_mradix->task = 0;
_mradix->log = log;
thread_end_init(_mradix);
// Phase 1: every worker counts 8-value prefixes for its share of positions.
thread_apply_all(_mradix, _mradix->task = 1);
// Sum the per-worker counts into counts[1].
thread_beg_iter(_mradix);
for(j=0;j<=MAX_U2;j++){
counts[1][j] += _mradix->counts[1][j];
}
thread_end_iter(_mradix);
// Prefix sums: counts[0][x] = bucket start, counts[1][x] = bucket end.
m = 0;
for(i=0;i<=MAX_U2;i++){
counts[0][i] = m;
m += counts[1][i];
counts[1][i] = m;
}
// Phase 2 (task 11): scatter positions into their buckets; the workers share
// these two arrays but each one only advances its own buckets' cursors.
thread_beg_iter(_mradix);
_mradix->offs = offs->buffer;
_mradix->counts[0] = counts[0];
_mradix->counts[1] = counts[1];
_mradix->task = 11;
thread_wake(_mradix);
thread_end_iter(_mradix);
thread_wait_all(_mradix);
if(log) fprintf(log, "[%s] msd_radix_sort sorted by first 8 bp\n", date());
// Phase 3 (task 2): after the scatter counts[0][m] is the END of bucket m;
// hand each bucket with at least two entries to an idle worker.
n = 0;
for(m=0;m<=MAX_U2;m++){
if(log && (m % 100) == 0){
fprintf(log, "\r%u", counts[0][m]); fflush(log);
}
if(counts[0][m] - n < 2){
// nothing to do
} else {
thread_wait_one(_mradix);
_mradix->offs = offs->buffer + n;
_mradix->size = counts[0][m] - n;
_mradix->task = 2;
thread_wake(_mradix);
}
n = counts[0][m];
}
thread_wait_all(_mradix);
if(log) fprintf(log, "\r%u\n", n);
if(log) fprintf(log, "[%s] msd_radix_sort sorted %u bases\n", date(), klen - 8);
free(counts[0]);
free(counts[1]);
// Optional phase 4 (task 3): fill `lcps`, one contiguous slice per worker.
if(lcps){
clear_u1v(lcps);
encap_u1v(lcps, size);
lcps->size = size;
thread_beg_iter(_mradix);
_mradix->offs = offs->buffer;
_mradix->size = size;
_mradix->lcps = lcps;
_mradix->task = 3;
thread_wake(_mradix);
thread_end_iter(_mradix);
thread_wait_all(_mradix);
if(log) fprintf(log, "[%s] msd_radix_sort calculated LCP\n", date());
}
thread_beg_close(_mradix);
thread_end_close(_mradix);
}
/*
* Sequence DB
*/
// A set of named sequences stored back to back in one BaseBank.
// Entry i is described by rdtags[i], rdoffs[i] and rdlens[i].
typedef struct {
u4i nseq; // number of sequences pushed so far
BaseBank *rdseqs; // concatenated packed sequences
cplist *rdtags; // per-sequence tag (heap copy owned by the bank), or NULL
u8v *rdoffs; // per-sequence start position inside rdseqs
u4v *rdlens; // per-sequence length
cuhash *rdhash; // tag -> sequence index; untagged sequences are not entered
} SeqBank;
// Forward declaration: used as the post-load hook in the descriptor below.
static inline void rebuild_rdhash_seqbank(void *sb, size_t aux);
// Object descriptor for SeqBank: its five pointer members, each with the
// descriptor of its own type; rebuild_rdhash_seqbank runs after loading.
// NOTE(review): obj_desc_t's field layout is declared elsewhere -- confirm the meaning of the `{1, 1, 1, 1, 1}` member.
static const obj_desc_t seqbank_obj_desc = {"SeqBank", sizeof(SeqBank), 5, {1, 1, 1, 1, 1},
{offsetof(SeqBank, rdseqs), offsetof(SeqBank, rdtags), offsetof(SeqBank, rdoffs), offsetof(SeqBank, rdlens), offsetof(SeqBank, rdhash)},
{&basebank_obj_desc, &cplist_deep_obj_desc, &u8v_obj_desc, &u4v_obj_desc, &cuhash_obj_desc},
NULL, rebuild_rdhash_seqbank};
/*
 * Allocate an empty SeqBank together with all of its containers.
 * Release it with free_seqbank().
 *
 * Aborts with a message if the SeqBank itself cannot be allocated
 * (previously the NULL result was dereferenced).
 */
static inline SeqBank* init_seqbank(){
	SeqBank *sb;
	sb = malloc(sizeof(SeqBank));
	if(sb == NULL){
		fprintf(stderr, " -- Out of memory, try to allocate %llu bytes, in %s -- %s:%d --\n", (unsigned long long)sizeof(SeqBank), __FUNCTION__, __FILE__, __LINE__);
		abort();
	}
	sb->nseq = 0;
	sb->rdseqs = init_basebank();
	sb->rdtags = init_cplist(16);
	sb->rdoffs = init_u8v(16);
	sb->rdlens = init_u4v(16);
	sb->rdhash = init_cuhash(1023);
	return sb;
}
/*
 * Release a SeqBank: the heap copies of all tags first, then every container
 * and finally the SeqBank itself.
 *
 * The loop bound had been truncated ("for(i=0;irdtags->size;i++)"); restored.
 */
static inline void free_seqbank(SeqBank *sb){
	u4i i;
	for(i=0;i<sb->rdtags->size;i++) if(sb->rdtags->buffer[i]) free(sb->rdtags->buffer[i]);
	free_basebank(sb->rdseqs);
	free_cplist(sb->rdtags);
	free_u8v(sb->rdoffs);
	free_u4v(sb->rdlens);
	free_cuhash(sb->rdhash);
	free(sb);
}
/*
 * Empty a SeqBank for reuse: free the heap copies of all tags, clear every
 * container and reset the sequence counter. The containers stay allocated.
 *
 * The loop bound had been truncated ("for(i=0;irdtags->size;i++)"); restored.
 */
static inline void clear_seqbank(SeqBank *sb){
	u4i i;
	for(i=0;i<sb->rdtags->size;i++) if(sb->rdtags->buffer[i]) free(sb->rdtags->buffer[i]);
	clear_basebank(sb->rdseqs);
	clear_cplist(sb->rdtags);
	clear_u8v(sb->rdoffs);
	clear_u4v(sb->rdlens);
	clear_cuhash(sb->rdhash);
	sb->nseq = 0;
}
// SeqBank's rdhash is wrongly loaded, need to be corrected
static inline void rebuild_rdhash_seqbank(void *_sb, size_t aux){
SeqBank *sb;
u4i i;
UNUSED(aux);
sb = (SeqBank*)_sb;
clear_cuhash(sb->rdhash); // hash size is not changed, thus there won't have hash re-size
for(i=0;irdtags->size;i++){
put_cuhash(sb->rdhash, (cuhash_t){get_cplist(sb->rdtags, i), i});
}
}
/*
 * Helper for the three push functions below: return a NUL-terminated heap
 * copy of the first `tag_len` bytes of `tag`, or NULL when there is no tag
 * (`tag` is NULL or `tag_len` is not positive).
 * Aborts with a message when the copy cannot be allocated.
 */
static inline char* seqbank_dup_tag(char *tag, int tag_len){
	char *ptr;
	if(tag == NULL || tag_len <= 0) return NULL;
	ptr = malloc((size_t)tag_len + 1);
	if(ptr == NULL){
		fprintf(stderr, " -- Out of memory, try to allocate %llu bytes, in %s -- %s:%d --\n", (unsigned long long)tag_len + 1, __FUNCTION__, __FILE__, __LINE__);
		abort();
	}
	memcpy(ptr, tag, tag_len);
	ptr[tag_len] = 0;
	return ptr;
}
/*
 * Append a sequence given as text (`seq`, `seq_len` characters) under an
 * optional tag. The tag is copied; untagged sequences are stored with a NULL
 * tag and are not entered into the tag hash.
 */
static inline void push_seqbank(SeqBank *sb, char *tag, int tag_len, char *seq, int seq_len){
	char *ptr;
	ptr = seqbank_dup_tag(tag, tag_len);
	push_cplist(sb->rdtags, ptr);
	push_u8v(sb->rdoffs, sb->rdseqs->size);
	seq2basebank(sb->rdseqs, seq, seq_len);
	push_u4v(sb->rdlens, seq_len);
	if(ptr) put_cuhash(sb->rdhash, (cuhash_t){ptr, sb->nseq});
	sb->nseq ++;
}
/*
 * Append `len` values taken from the packed array `bits` at value offset
 * `off`, in forward order, under an optional tag (see push_seqbank).
 */
static inline void fwdbitpush_seqbank(SeqBank *sb, char *tag, int tag_len, u8i *bits, u8i off, u4i len){
	char *ptr;
	ptr = seqbank_dup_tag(tag, tag_len);
	push_cplist(sb->rdtags, ptr);
	push_u8v(sb->rdoffs, sb->rdseqs->size);
	fast_fwdbits2basebank(sb->rdseqs, bits, off, len);
	push_u4v(sb->rdlens, len);
	if(ptr) put_cuhash(sb->rdhash, (cuhash_t){ptr, sb->nseq});
	sb->nseq ++;
}
/*
 * Append `len` values taken from the packed array `bits` at value offset
 * `off`, in reversed form (fast_revbits2basebank), under an optional tag
 * (see push_seqbank).
 */
static inline void revbitpush_seqbank(SeqBank *sb, char *tag, int tag_len, u8i *bits, u8i off, u4i len){
	char *ptr;
	ptr = seqbank_dup_tag(tag, tag_len);
	push_cplist(sb->rdtags, ptr);
	push_u8v(sb->rdoffs, sb->rdseqs->size);
	fast_revbits2basebank(sb->rdseqs, bits, off, len);
	push_u4v(sb->rdlens, len);
	if(ptr) put_cuhash(sb->rdhash, (cuhash_t){ptr, sb->nseq});
	sb->nseq ++;
}
/*
 * Look up a sequence by tag. Returns its index, or MAX_U4 when the tag is
 * not in the hash.
 */
static inline u4i find_seqbank(SeqBank *sb, char *tag){
	cuhash_t *hit;
	hit = get_cuhash(sb->rdhash, tag);
	if(hit == NULL){
		return MAX_U4;
	}
	return hit->val;
}
// Map a position inside rdseqs to a sequence index by binary search over the
// start offsets; a search result of 0 is returned as 0, anything else as ret - 1.
// NOTE(review): bsearch_array is a macro defined elsewhere; presumably `ret`
// receives the first index whose offset does not satisfy `a < off`. If so, a
// position equal to a sequence's start offset maps to the PREVIOUS sequence -- confirm.
static inline u4i off2idx_seqbank(SeqBank *sb, u8i off){
u4i ret;
bsearch_array(sb->rdoffs->buffer, sb->rdoffs->size, u8i, ret, a < off);
return ret? ret - 1 : 0;
}
/*
 * Compute length statistics for `lens` and return the N50.
 *
 * Side effect: `lens` is sorted in place (the code treats buffer[0] as the
 * maximum and the last element as the minimum).
 * When `out` is non-NULL a one-line summary (total, count, average, max,
 * N50/L50, N90/L90, min) is written to it WITHOUT a trailing newline and the
 * stream is flushed. Returns 0 for an empty vector.
 *
 * The header of the summing loop had been truncated ("for(i=0;isize;i++){");
 * restored.
 */
static inline u4i num_n50(u4v *lens, FILE *out){
	u8i tot, cum;
	u4i i, max, min, n50, l50, n90, l90, avg;
	if(lens->size == 0) return 0;
	sort_array(lens->buffer, lens->size, u4i, num_cmpgt(b, a));
	tot = 0;
	max = lens->buffer[0];
	min = lens->buffer[lens->size - 1];
	for(i=0;i<lens->size;i++){
		tot += lens->buffer[i];
	}
	avg = (tot + lens->size - 1) / lens->size; // rounded up
	// N50: first length at which the running sum reaches half of the total
	cum = 0;
	i = 0;
	while(i < lens->size){
		cum += lens->buffer[i];
		if((b8i)cum >= tot * 0.5) break;
		i ++;
	}
	n50 = i < lens->size? lens->buffer[i] : min;
	l50 = i < lens->size? i + 1 : i;
	// N90: keep accumulating from the next element until 90% is reached
	i ++;
	while(i < lens->size){
		cum += lens->buffer[i];
		if((b8i)cum >= tot * 0.9) break;
		i ++;
	}
	n90 = i < lens->size? lens->buffer[i] : min;
	l90 = i < lens->size? i + 1 : i;
	if(out){
		fprintf(out, "TOT %llu, CNT %u, AVG %u, MAX %u, N50 %u, L50 %u, N90 %u, L90 %u, Min %u", tot, (u4i)lens->size, avg, max, n50, l50, n90, l90, min);
		fflush(out);
	}
	return n50;
}
#endif
wtdbg2-2.5/filereader.h 0000664 0000000 0000000 00000041332 13536643722 0015042 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __FILEREADER_RJ_H
#define __FILEREADER_RJ_H
#include
#include
#include
#include
#include
#include
#include
#include "chararray.h"
#include "mem_share.h"
#include "list.h"
#include "thread.h"
#include "pgzf.h"
#define BIOSEQ_ATTR_NULL 0
#define BIOSEQ_ATTR_TAG 1
#define BIOSEQ_ATTR_SEQ 2
#define BIOSEQ_ATTR_QLT 4
#define BIOSEQ_ATTR_FULL 7
// One sequence record: four String buffers plus an attribute bit mask.
typedef struct {
String *tag, *seq, *dsc, *qlt; // NOTE(review): presumably name, residues, description and quality string -- confirm against the parser
u4i attr; // BIOSEQ_ATTR_* bits; init_biosequence() sets BIOSEQ_ATTR_FULL
} BioSequence;
#define FILEREADER_TYPE_NULL 0
#define FILEREADER_TYPE_FASTA 1
#define FILEREADER_TYPE_FASTQ 2
#define FILEREADER_TYPE_TEXT 3
#define FILEREADER_ATTR_NULL 0
#define FILEREADER_ATTR_NORMAL 1
#define FILEREADER_ATTR_STDIN 2
#define FILEREADER_ATTR_PROC 3
#define FILEREADER_ATTR_TEXT 4
#define FILEREADER_ATTR_USER 5 // defined by user
// Callback that reads up to `len` bytes from the source `obj` into `dat` and returns the number of bytes read.
typedef size_t (*read_data_func)(void *obj, void *dat, size_t len);
// Callback that closes the source `obj`.
typedef void (*close_input_func)(void *obj);
// FILE*-backed implementations of the two callbacks; the close helpers ignore a NULL handle.
static inline size_t _read_data_file(void *obj, void *dat, size_t len){ return fread(dat, 1, len, (FILE*)obj); }
static inline void _close_input_file(void *obj){ if(obj) fclose((FILE*)obj); }
// For streams opened with popen().
static inline void _close_input_proc(void *obj){ if(obj) pclose((FILE*)obj); }
// One input source queued in a FileReader.
typedef struct {
int file_attr; // FILEREADER_ATTR_* kind of source
char *filename; // path, shell command (PROC) or the text itself (TEXT); NULL for stdin and user sources
void *_file; // open handle handed to _read/_close; filled in when the source is opened
read_data_func _read; // reads bytes from _file
close_input_func _close; // closes _file; may be NULL (stdin)
} file_src_t;
// NOTE(review): define_list_core comes from list.h; presumably declares the `filesrcv` vector of file_src_t with an int size -- confirm.
define_list_core(filesrcv, file_src_t, int, 0xFF);
// Line reader over a queue of input sources, with an optional background
// thread that fills two buffers alternately.
typedef struct {
filesrcv *files; // queued sources
int fidx; // index of the source currently being read
char *buffer[2]; // the two data buffers, bufmax bytes each
int ridx, widx, flag; // buffer being consumed / buffer to fill next / 1 while a filled buffer awaits the consumer
u8i bufmax, bufoff, bufcnt[2]; // buffer capacity / read offset in the current buffer / valid bytes per buffer
#ifdef FR_USE_SPINLOCK
pthread_spinlock_t lock;
#else
pthread_mutex_t lock;
#endif
char line_breaker; // line terminator, '\n' after init
char delimiter; // field separator, '\t' after init
u8i n_char, n_line; // counters of characters / lines consumed
String *line, *line2; // current line buffers
VStrv *tabs; // NOTE(review): presumably the fields of the current line split on `delimiter` -- confirm
int rollback; // line will be re-used in next readline
// thread
pthread_t pid; // background reader thread, 0 when reading directly
int running; // cleared to ask the background thread to stop
int eof; // set by the background thread once all sources are exhausted
} FileReader;
/*
 * Allocate a BioSequence with four empty String buffers and all attribute
 * bits set (BIOSEQ_ATTR_FULL). Release it with free_biosequence().
 *
 * Aborts with a message if the record cannot be allocated (previously the
 * NULL result was dereferenced).
 */
static inline BioSequence* init_biosequence(){
	BioSequence *seq;
	seq = malloc(sizeof(BioSequence));
	if(seq == NULL){
		fprintf(stderr, " -- Out of memory, try to allocate %llu bytes, in %s -- %s:%d --\n", (unsigned long long)sizeof(BioSequence), __FUNCTION__, __FILE__, __LINE__);
		abort();
	}
	seq->tag = init_string(32);
	seq->seq = init_string(32);
	seq->dsc = init_string(32);
	seq->qlt = init_string(32);
	seq->attr = BIOSEQ_ATTR_FULL;
	return seq;
}
/*
 * Empty the four String buffers of a record (tag, seq, dsc, qlt, in that
 * order) so it can be reused. `attr` is left as it is.
 */
static inline void reset_biosequence(BioSequence *seq){
	String *parts[4];
	int k;
	parts[0] = seq->tag;
	parts[1] = seq->seq;
	parts[2] = seq->dsc;
	parts[3] = seq->qlt;
	for(k = 0; k < 4; k++){
		clear_string(parts[k]);
	}
}
/*
 * Release a record: its four String buffers (tag, seq, dsc, qlt, in that
 * order) and then the record itself.
 */
static inline void free_biosequence(BioSequence *seq){
	String *parts[4];
	int k;
	parts[0] = seq->tag;
	parts[1] = seq->seq;
	parts[2] = seq->dsc;
	parts[3] = seq->qlt;
	for(k = 0; k < 4; k++){
		free_string(parts[k]);
	}
	free(seq);
}
static inline void* file_src_thread_func(void *obj){
FileReader *fr;
file_src_t *fc;
void *_file;
read_data_func _read;
close_input_func _close;
size_t off, cnt, len;
fr = (FileReader*)obj;
while(fr->running){
if(fr->fidx >= fr->files->size){
fr->eof = 1;
microsleep(1);
} else {
fr->eof = 0;
fc = ref_filesrcv(fr->files, fr->fidx);
_file = NULL;
_read = NULL;
_close = NULL;
switch(fc->file_attr){
case FILEREADER_ATTR_TEXT:
len = strlen(fc->filename);
off = 0;
while(fr->running && len){
while(fr->flag == 1 && fr->running){ nano_sleep(1); }
cnt = num_min(len, fr->bufmax);
memcpy(fr->buffer[fr->widx], fc->filename + off, cnt);
fr->flag = 1;
off += cnt;
len -= cnt;
fr->widx = !fr->widx;
}
break;
case FILEREADER_ATTR_STDIN:
if(_file == NULL){
_file = fc->_file = stdin;
_read = fc->_read = _read_data_file;
_close = fc->_close = NULL;
}
case FILEREADER_ATTR_PROC:
if(_file == NULL){
_file = fc->_file = popen(fc->filename, "r");
_read = fc->_read = _read_data_file;
_close = fc->_close = _close_input_proc;
}
case FILEREADER_ATTR_USER:
if(_file == NULL){
_file = fc->_file;
_read = fc->_read;
_close = fc->_close;
}
default:
if(_file == NULL){
_file = fc->_file = open_file_for_read(fc->filename, NULL);
_read = fc->_read = _read_data_file;
_close = fc->_close = _close_input_file;
}
while(fr->running){
while(fr->flag == 1){
nano_sleep(1);
if(fr->running == 0){
break;
}
}
if(fr->flag == 1) break;
fr->bufcnt[fr->widx] = _read(_file, fr->buffer[fr->widx], fr->bufmax);
fr->widx = !fr->widx;
fr->flag = 1;
if(fr->bufcnt[!fr->widx] == 0) break;
}
}
if(_file && _close){
_close(_file);
}
fr->fidx ++;
}
}
return NULL;
}
/*
 * Allocate a FileReader with no sources queued, two 128 KiB buffers,
 * '\n' as line breaker and '\t' as delimiter. No background thread is
 * started here (pid == 0); see beg_asyn_filereader().
 * Release it with free_filereader().
 *
 * Aborts with a message when the reader or one of its buffers cannot be
 * allocated (previously the NULL results were used unchecked).
 */
static inline FileReader* init_filereader(){
	FileReader *fr;
	fr = malloc(sizeof(FileReader));
	if(fr == NULL){
		fprintf(stderr, " -- Out of memory, try to allocate %llu bytes, in %s -- %s:%d --\n", (unsigned long long)sizeof(FileReader), __FUNCTION__, __FILE__, __LINE__);
		abort();
	}
	fr->files = init_filesrcv(4);
	fr->fidx = 0;
	fr->bufmax = 128 * 1024;
	fr->bufoff = 0;
	fr->bufcnt[0] = 0;
	fr->bufcnt[1] = 0;
	// the consumer starts on buffer 0, the producer fills buffer 1 first
	fr->ridx = 0;
	fr->widx = 1;
	fr->flag = 0;
#ifdef FR_USE_SPINLOCK
	pthread_spin_init(&fr->lock, 0);
#else
	pthread_mutex_init(&fr->lock, NULL);
#endif
	fr->buffer[0] = malloc(fr->bufmax);
	fr->buffer[1] = malloc(fr->bufmax);
	if(fr->buffer[0] == NULL || fr->buffer[1] == NULL){
		fprintf(stderr, " -- Out of memory, try to allocate %llu bytes, in %s -- %s:%d --\n", (unsigned long long)fr->bufmax * 2, __FUNCTION__, __FILE__, __LINE__);
		abort();
	}
	fr->line_breaker = '\n';
	fr->delimiter = '\t';
	fr->n_char = 0;
	fr->n_line = 0;
	fr->line = init_string(32);
	fr->line2 = init_string(32);
	fr->tabs = init_VStrv(16);
	fr->rollback = 0;
	fr->pid = 0;
	fr->running = 1;
	fr->eof = 0;
	return fr;
}
/*
 * Start the background reader thread for `fr`.
 * If the thread cannot be created a message is printed and fr->pid is reset
 * to 0, which leaves the reader in direct (non-threaded) mode.
 */
static inline void beg_asyn_filereader(FileReader *fr){
	int err;
	err = pthread_create(&fr->pid, NULL, file_src_thread_func, fr);
	if(err == 0){
		return;
	}
	fprintf(stderr, " -- Failed to create thread [%s] in %s -- %s:%d --\n", "file_src_thread_func", __FUNCTION__, __FILE__, __LINE__);
	fr->pid = 0; // switch to directed read
}
/*
 * Rewind the reader to its first source. A running background thread is
 * stopped and joined first, all cursors and counters are cleared, the lock
 * is re-created, and the thread is started again if there was one.
 *
 * Fix: widx is now reset to 1, as in init_filereader(). It used to be reset
 * to 0, the same buffer as ridx. The consumer switches to the other buffer
 * when it takes a filled one, so starting both on buffer 0 can put the
 * producer and the consumer on the same buffer at the same time.
 */
static inline void reset_filereader(FileReader *fr){
	if(fr->pid){
		fr->running = 0;
		pthread_join(fr->pid, NULL);
	}
	fr->fidx = 0;
	fr->bufoff = 0;
	fr->bufcnt[0] = 0;
	fr->bufcnt[1] = 0;
	fr->ridx = 0;
	fr->widx = 1; // must differ from ridx, see init_filereader()
	fr->flag = 0;
#ifdef FR_USE_SPINLOCK
	pthread_spin_destroy(&fr->lock);
	pthread_spin_init(&fr->lock, 0);
#else
	pthread_mutex_destroy(&fr->lock);
	pthread_mutex_init(&fr->lock, NULL);
#endif
	clear_string(fr->line);
	clear_VStrv(fr->tabs);
	fr->rollback = 0;
	fr->n_line = 0;
	fr->n_char = 0;
	fr->running = 1;
	fr->eof = 0;
	if(fr->pid){
		fr->pid = 0;
		beg_asyn_filereader(fr);
	}
}
/*
 * Stop the background thread (if any) and release the reader, including the
 * `filename` string of every queued source.
 * NOTE(review): push_text_filereader() stores the caller's string in
 * `filename` without copying, so it is freed here as well -- confirm that
 * callers expect to hand over ownership.
 *
 * The loop bound had been truncated ("for(i=0;ifiles->size;i++){"); restored.
 */
static inline void free_filereader(FileReader *fr){
	file_src_t *f;
	int i;
	if(fr->pid){
		fr->running = 0;
		pthread_join(fr->pid, NULL);
	}
	for(i=0;i<fr->files->size;i++){
		f = ref_filesrcv(fr->files, i);
		if(f->filename) free(f->filename);
	}
#ifdef FR_USE_SPINLOCK
	pthread_spin_destroy(&fr->lock);
#else
	pthread_mutex_destroy(&fr->lock);
#endif
	free(fr->buffer[0]);
	free(fr->buffer[1]);
	free_filesrcv(fr->files);
	free_string(fr->line);
	free_string(fr->line2);
	free_VStrv(fr->tabs);
	free(fr);
}
/*
 * Queue one input source described by `filename` and return the
 * FILEREADER_ATTR_* kind chosen for it:
 *   - NULL, empty (after ignoring trailing spaces) or "-": standard input;
 *   - ending in '|': a shell command whose output is read (the '|' is dropped);
 *   - ending in ".gz" or ".pgzf": opened right away through the pgzf reader;
 *   - anything else: a regular file, opened when it is first read.
 * The name is copied; the copy is released by free_filereader().
 *
 * Fix: in the command case the copy was made with strncpy(dst, src, len - 1)
 * into a len-byte buffer. The source is longer than len - 1, so strncpy wrote
 * no terminator and the command string ran into uninitialised memory. The
 * copy is now terminated explicitly.
 */
static inline int push_filereader(FileReader *fr, char *filename){
	file_src_t *f;
	int len;
	f = next_ref_filesrcv(fr->files);
	f->_file = NULL;
	f->_read = NULL;
	f->_close = NULL;
	len = filename? strlen(filename) : 0;
	while(len && filename[len-1] == ' ') len --; // ignore trailing spaces
	if(len == 0 || strcmp(filename, "-") == 0){
		f->filename = NULL;
		f->file_attr = FILEREADER_ATTR_STDIN;
	} else if(filename[len-1] == '|'){
		f->filename = malloc(len);
		memcpy(f->filename, filename, len - 1);
		f->filename[len - 1] = '\0';
		f->file_attr = FILEREADER_ATTR_PROC;
	} else if(len > 3 && strcmp(filename + len - 3, ".gz") == 0){
		//f->filename = malloc(len + 20);
		//sprintf(f->filename, "gzip -dc %s", filename);
		//f->file_attr = FILEREADER_ATTR_PROC;
		f->filename = strdup(filename);
		f->file_attr = FILEREADER_ATTR_USER;
		f->_file = open_pgzf_reader(open_file_for_read(f->filename, NULL), 0, 4);
		f->_read = read_pgzf4filereader;
		f->_close = close_pgzf4filereader;
	} else if(len > 5 && strcmp(filename + len - 5, ".pgzf") == 0){
		f->filename = strdup(filename);
		f->file_attr = FILEREADER_ATTR_USER;
		f->_file = open_pgzf_reader(open_file_for_read(f->filename, NULL), 0, 4);
		f->_read = read_pgzf4filereader;
		f->_close = close_pgzf4filereader;
	} else {
		f->filename = strdup(filename);
		f->file_attr = FILEREADER_ATTR_NORMAL;
	}
	return f->file_attr;
}
/*
 * Queue an in-memory text as an input source. `str` is stored as is (not
 * copied) in the source's `filename` field; `len` is not used.
 * Returns FILEREADER_ATTR_TEXT.
 */
static inline int push_text_filereader(FileReader *fr, char *str, size_t len){
	file_src_t *src;
	UNUSED(len);
	src = next_ref_filesrcv(fr->files);
	src->file_attr = FILEREADER_ATTR_TEXT;
	src->filename = str;
	src->_file = NULL;
	src->_read = NULL;
	src->_close = NULL;
	return src->file_attr;
}
/*
 * Queue a caller-supplied source: an opaque handle plus the callbacks used
 * to read from it and to close it. No file name is recorded.
 * Returns FILEREADER_ATTR_USER.
 */
static inline int push_user_filereader(FileReader *fr, void *_file, read_data_func _read, close_input_func _close){
	file_src_t *src;
	src = next_ref_filesrcv(fr->files);
	src->file_attr = FILEREADER_ATTR_USER;
	src->filename = NULL;
	src->_file = _file;
	src->_read = _read;
	src->_close = _close;
	return src->file_attr;
}
// NOTE(review): this span is damaged. The text between "for(i=0;i" and
// "rollback){" is missing, so the rest of push_all_filereader and the header
// and local declarations of the following function are gone. What remains
// from "rollback){" onwards reads like the body of the line reader used with
// the background thread (it consumes fr->buffer[fr->ridx] and waits on
// fr->flag); its name and signature are not visible here. Restore from the
// upstream file rather than guessing.
static inline void push_all_filereader(FileReader *fr, int nfile, char **filenames){
int i;
for(i=0;irollback){
// A rolled-back line is handed out again without reading anything.
fr->rollback = 0;
return line->size + 1; // in case of end of file and not terminated by line_breaker, the return value is bigger by 1
} else if(fr->eof && fr->bufoff == fr->bufcnt[fr->ridx]){
// Producer finished and the current buffer is exhausted: end of input.
return 0;
} else {
clear_string(line);
nc = fr->n_char;
while(1){
buffer = fr->buffer[fr->ridx];
ret = 0;
// NOTE(review): the loop condition below is truncated as well (presumably "i < fr->bufcnt[fr->ridx]").
for(i=fr->bufoff;ibufcnt[fr->ridx];){
if(buffer[i++] == fr->line_breaker){
ret = 1;
break;
}
}
// Append what was scanned, minus the line breaker itself when one was found.
fr->n_char += i - fr->bufoff;
encap_string(line, i - fr->bufoff);
append_string(line, buffer + fr->bufoff, i - fr->bufoff - ret);
fr->bufoff = i;
if(ret){
return fr->n_char - nc;
} else if(fr->eof){
return fr->n_char - nc;
}
// Current buffer exhausted without a line breaker: release it and wait
// for the producer to hand over the other one.
fr->bufoff = 0;
fr->bufcnt[fr->ridx] = 0;
while(fr->flag == 0){
nano_sleep(1);
if(fr->eof){
if(fr->flag) break;
else {
return fr->n_char - nc;
}
}
}
fr->flag = 0;
fr->ridx = !fr->ridx;
}
return 0;
}
}
/*
 * Synchronous (same-thread) line reader.
 *
 * Reads the next line from the current source of `fr` into `line`, moving on
 * to the next queued source whenever the current one yields no more bytes.
 * A source is opened lazily the first time it is reached (fr->flag == 0) and
 * closed (when it has a handle and a close callback) once a read attempt
 * consumes nothing from it.
 *
 * @return number of bytes consumed for this line, line breaker included;
 *         line->size + 1 when a rolled-back line is re-delivered;
 *         0 once every source is exhausted (fr->eof is then set).
 */
static inline int directed_readline_filereader(FileReader *fr, String *line){
	file_src_t *fc;
	void *_file;
	read_data_func _read;
	close_input_func _close;
	u8i i, nc;
	int ch;
	int ret;
	if(fr->eof) return 0;
	else if(fr->rollback){
		fr->rollback = 0;
		return line->size + 1; // in case of end of file and not terminated by line_breaker, the return value is bigger by 1
	}
	clear_string(line);
	nc = fr->n_char;
	while(fr->fidx < fr->files->size){
		fc = ref_filesrcv(fr->files, fr->fidx);
		_file = NULL;
		_read = NULL;
		_close = NULL;
		if(fr->flag == 0){
			// first visit of this source: open it and reset the buffer state
			switch(fc->file_attr){
				case FILEREADER_ATTR_TEXT:
					break;
				case FILEREADER_ATTR_STDIN:
					_file = fc->_file = stdin;
					_read = fc->_read = _read_data_file;
					_close = fc->_close = NULL;
					break;
				case FILEREADER_ATTR_PROC:
					_file = fc->_file = popen(fc->filename, "r");
					_read = fc->_read = _read_data_file;
					_close = fc->_close = _close_input_proc;
					break;
				case FILEREADER_ATTR_USER:
					_file = fc->_file;
					_read = fc->_read;
					_close = fc->_close;
					break;
				default:
					_file = fc->_file = open_file_for_read(fc->filename, NULL);
					_read = fc->_read = _read_data_file;
					_close = fc->_close = _close_input_file;
					break;
			}
			fr->flag = 1;
			fr->bufoff = 0;
			fr->bufcnt[0] = fr->bufcnt[1] = 0;
		} else {
			_file = fc->_file;
			_read = fc->_read;
			_close = fc->_close;
		}
		switch(fc->file_attr){
			case FILEREADER_ATTR_TEXT:
				// the text itself lives in fc->filename; bufoff is the read cursor
				ret = 0;
				for(i=fr->bufoff;fc->filename[i];){
					if(fc->filename[i++] == fr->line_breaker){
						ret = 1;
						break;
					}
				}
				fr->n_char += i - fr->bufoff;
				encap_string(line, i - fr->bufoff);
				// `ret` drops the line breaker from the stored line
				append_string(line, fc->filename + fr->bufoff, i - fr->bufoff - ret);
				fr->bufoff = i;
				break;
			case FILEREADER_ATTR_STDIN:
				while((ch = fgetc(stdin)) != EOF){
					fr->n_char ++;
					if(ch == fr->line_breaker){
						break;
					}
					add_char_string(line, ch);
				}
				break;
			default:
				while(1){
					if(fr->bufoff >= fr->bufcnt[0]){
						// buffer drained: refill, stop when the source has no more data
						fr->bufoff = 0;
						fr->bufcnt[0] = _read(_file, fr->buffer[0], fr->bufmax);
						if(fr->bufcnt[0] == 0) break;
					}
					ret = 0;
					for(i=fr->bufoff;i<fr->bufcnt[0];){
						if(fr->buffer[0][i++] == fr->line_breaker){
							ret = 1;
							break;
						}
					}
					fr->n_char += i - fr->bufoff;
					encap_string(line, i - fr->bufoff);
					append_string(line, fr->buffer[0] + fr->bufoff, i - fr->bufoff - ret);
					fr->bufoff = i;
					if(ret){
						break;
					}
				}
				break;
		}
		if(fr->n_char > nc){
			return fr->n_char - nc;
		} else {
			// nothing consumed: this source is finished, advance to the next one
			if(_file && _close){
				_close(_file);
			}
			fr->flag = 0;
			fr->fidx ++;
		}
	}
	fr->eof = 1;
	return 0;
}
/*
 * Read the next line into fr->line, using the asynchronous reader when a
 * reader thread is attached (fr->pid is set) and the direct one otherwise.
 * fr->n_line is incremented for every line delivered.
 * @return the value of the underlying reader (0 means no more input)
 */
int readline_filereader(FileReader *fr){
	int nread;
	if(fr->pid){
		nread = asyn_readline_filereader(fr, fr->line);
	} else {
		nread = directed_readline_filereader(fr, fr->line);
	}
	if(nread > 0) fr->n_line ++;
	return nread;
}
/*
 * Push the current line back: the next read call returns it again
 * without touching the input, and the line counter is rewound by one.
 */
static inline void rollback_filereader(FileReader *fr){
	fr->n_line --;
	fr->rollback = 1;
}
/*
 * Split the current line (fr->line) into fields separated by `delimiter`.
 * The fields are recorded in fr->tabs as (pointer, length) views into the
 * line buffer; nothing is copied and the line is not modified.
 * An empty line yields one empty field.
 * @return number of fields
 */
static inline int split_line_filereader(FileReader *fr, char delimiter){
	VString *vs;
	int i;
	clear_VStrv(fr->tabs);
	vs = next_ref_VStrv(fr->tabs);
	vs->string = fr->line->string;
	vs->size = 0;
	for(i=0;i<fr->line->size;i++){
		if(fr->line->string[i] == delimiter){
			// close the current field and open the next one right after the delimiter
			vs->size = fr->line->string + i - vs->string;
			vs = next_ref_VStrv(fr->tabs);
			vs->string = fr->line->string + i + 1;
			vs->size = 0;
		}
	}
	// the last field runs to the end of the line
	vs->size = fr->line->string + fr->line->size - vs->string;
	return (int)fr->tabs->size;
}
/*
 * Read one line and split it on fr->delimiter.
 * @return number of fields, or -1 when no line could be read
 */
static inline int readtable_filereader(FileReader *fr){
	int nread = readline_filereader(fr);
	if(nread == 0){
		return -1;
	}
	return split_line_filereader(fr, fr->delimiter);
}
/* Length of field `col` of the last split line (no bounds check on `col`). */
static inline int get_col_len(FileReader *fr, int col){
	VString *field = fr->tabs->buffer + col;
	return field->size;
}
/*
 * Return field `col` of the last split line as a C string.
 * The byte just past the field is overwritten with '\0' inside the line
 * buffer, so the line is modified in place.
 */
static inline char* get_col_str(FileReader *fr, int col){
	VString *field = ref_VStrv(fr->tabs, col);
	char *text = field->string;
	text[field->size] = '\0';
	return text;
}
/*
 * Return the current line as one string again.
 * Every NUL byte inside the line (such as those written by get_col_str)
 * is replaced with fr->delimiter.
 */
static inline char* get_line_str(FileReader *fr){
	int i;
	for(i=0;i<fr->line->size;i++){
		if(fr->line->string[i] == 0){
			fr->line->string[i] = fr->delimiter;
		}
	}
	return fr->line->string;
}
/*
 * Read the next sequence record into `seq`.
 *
 * The first character of the next line selects the format:
 *   '>'  FASTA: following lines are appended to seq->seq until a line
 *        starting with '>' is met (that line is rolled back) or input ends.
 *   '@'  FASTQ: exactly three more lines are read (sequence, separator,
 *        quality); a record cut short by end of input is still reported
 *        as FASTQ.
 *   else the whole line is stored in seq->dsc.
 * Which parts are stored is controlled by seq->attr (BIOSEQ_ATTR_TAG / _SEQ /
 * _QLT). With BIOSEQ_ATTR_TAG, the header is cut at the first space or tab:
 * the part before goes to seq->tag, the remainder (separator included) to
 * seq->dsc.
 *
 * @return FILEREADER_TYPE_NULL (end of files), _FASTA, _FASTQ, or _TEXT (cannot parse sequence type)
 */
static inline int readseq_filereader(FileReader *fr, BioSequence *seq){
	int n, i;
	if((n = readline_filereader(fr)) == 0) return FILEREADER_TYPE_NULL;
	reset_biosequence(seq);
	if(fr->line->string[0] == '>'){
		if(seq->attr & BIOSEQ_ATTR_TAG){
			for(i=1;i<fr->line->size;i++){
				if(fr->line->string[i] == ' ' || fr->line->string[i] == '\t') break;
			}
			append_string(seq->tag, fr->line->string + 1, i - 1);
			append_string(seq->dsc, fr->line->string + i, fr->line->size - i);
		}
		while((n = readline_filereader(fr))){
			if(fr->line->string[0] == '>'){
				// header of the next record: hand it back for the next call
				rollback_filereader(fr);
				break;
			} else if(seq->attr & BIOSEQ_ATTR_SEQ){
				append_string(seq->seq, fr->line->string, fr->line->size);
			}
		}
		return FILEREADER_TYPE_FASTA;
	} else if(fr->line->string[0] == '@'){
		if(seq->attr & BIOSEQ_ATTR_TAG){
			for(i=1;i<fr->line->size;i++){
				if(fr->line->string[i] == ' ' || fr->line->string[i] == '\t') break;
			}
			append_string(seq->tag, fr->line->string + 1, i - 1);
			append_string(seq->dsc, fr->line->string + i, fr->line->size - i);
		}
		if((n = readline_filereader(fr))){
			if(seq->attr & BIOSEQ_ATTR_SEQ) append_string(seq->seq, fr->line->string, fr->line->size);
		} else {
			return FILEREADER_TYPE_FASTQ;
		}
		if((n = readline_filereader(fr))){
			// expected '+'
		} else {
			return FILEREADER_TYPE_FASTQ;
		}
		if((n = readline_filereader(fr))){
			if(seq->attr & BIOSEQ_ATTR_QLT) append_string(seq->qlt, fr->line->string, fr->line->size);
		} else {
			return FILEREADER_TYPE_FASTQ;
		}
		return FILEREADER_TYPE_FASTQ;
	} else {
		append_string(seq->dsc, fr->line->string, fr->line->size);
		return FILEREADER_TYPE_TEXT;
	}
}
#endif
wtdbg2-2.5/filewriter.h 0000664 0000000 0000000 00000012207 13536643722 0015113 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __FILEWRITER_RJ_H
#define __FILEWRITER_RJ_H
#include "mem_share.h"
#include "thread.h"
#include "pgzf.h"
// Sink callback: write `len` bytes starting at `dat` to the opaque target `obj`; returns a byte count
typedef size_t (*write_data_func)(void *obj, void *dat, size_t len);
// Sink callback: close the opaque target `obj`
typedef void (*close_output_func)(void *obj);
/* write_data_func backed by fwrite(): `obj` is a FILE*; returns the number of bytes fwrite reports as written. */
static inline size_t _write_data_file(void *obj, void *dat, size_t len){
	FILE *fp = (FILE*)obj;
	size_t written = fwrite(dat, 1, len, fp);
	return written;
}
/* close_output_func backed by fclose(): `obj` is a FILE*; a NULL `obj` is ignored. */
static inline void _close_output_file(void *obj){
	if(obj == NULL) return;
	fclose((FILE*)obj);
}
/**
* BufferedWriter
*
* Double-buffered output: callers print into one of two in-memory streams
* while a background thread hands the other one to the sink callback.
*/
typedef struct {
// the two in-memory streams (created with open_memstream over buffs/blens)
FILE *bios[2];
// stream callers should print to; set by beg_bufferedwriter/flush_bufferedwriter, NULL otherwise
FILE *out;
// sink handle and callbacks (_close may be NULL)
void *_file;
write_data_func _write;
close_output_func _close;
// index of the stream currently handed out to callers; the thread drains the other one
int bidx;
// fill level at which the thread takes the lock and swaps streams
size_t buf_size;
// storage managed by the memstreams
char *buffs[2];
size_t blens[2];
// total number of bytes passed to _write so far
size_t nbytes;
// held by the caller between beg_ and end_bufferedwriter, and by the thread while swapping
pthread_mutex_t lock;
// background thread; set to 0 when thread creation failed
pthread_t pid;
// running: 1 while the thread should keep looping
// flush: 0 = idle, 1 = flush requested, 2 = thread reports both streams empty
int running, flush;
} BufferedWriter;
/*
 * Body of the background thread started by open2_bufferedwriter().
 * `obj` is the BufferedWriter. While bw->running stays set, the loop drains
 * the stream callers are NOT using into the sink and swaps the two streams
 * under bw->lock. After the loop, whatever is left in both streams is written
 * out. Always returns NULL.
 */
static inline void* _buffered_writer_thread_func(void *obj){
BufferedWriter *bw;
size_t bsize[2];
int bidx, lock;
bw = (BufferedWriter*)obj;
// tells the creator, which polls `running`, that the thread is up
bw->running = 1;
bw->flush = 0;
bw->nbytes = 0;
while(bw->running){
bidx = bw->bidx;
// stream position == number of bytes pending in each buffer
bsize[0] = ftell(bw->bios[0]);
bsize[1] = ftell(bw->bios[1]);
// active stream is full, or a flush was requested and it holds data:
// take the lock before draining so callers cannot keep writing into it
if(bsize[bidx] >= bw->buf_size || (bsize[bidx] && bw->flush == 1)){
lock = 1;
pthread_mutex_lock(&bw->lock);
} else {
lock = 0;
}
// drain the inactive stream into the sink and rewind it
if(bsize[!bidx]){
fflush(bw->bios[!bidx]);
bw->_write(bw->_file, bw->buffs[!bidx], bsize[!bidx]);
bw->nbytes += bsize[!bidx];
fseek(bw->bios[!bidx], 0, SEEK_SET);
}
// swap streams: the one just drained becomes the active one
if(lock){
bw->bidx = !bidx;
pthread_mutex_unlock(&bw->lock);
} else if(bsize[bidx]){
pthread_mutex_lock(&bw->lock);
bw->bidx = !bidx;
pthread_mutex_unlock(&bw->lock);
}
// flush handshake: both streams were empty when sampled, so report it (2)
// and wait until the requester changes the flag
if(bw->flush && bsize[0] == 0 && bsize[1] == 0){
bw->flush = 2;
while(bw->flush == 2){
nano_sleep(1);
}
bw->flush = 0;
}
nano_sleep(10);
}
// shutdown: write out what is left, inactive stream first
{
bsize[0] = ftell(bw->bios[0]);
bsize[1] = ftell(bw->bios[1]);
fflush(bw->bios[0]);
fflush(bw->bios[1]);
bidx = bw->bidx;
if(bsize[!bidx]){
bw->_write(bw->_file, bw->buffs[!bidx], bsize[!bidx]);
bw->nbytes += bsize[!bidx];
}
if(bsize[bidx]){
bw->_write(bw->_file, bw->buffs[bidx], bsize[bidx]);
bw->nbytes += bsize[bidx];
}
}
return NULL;
}
/*
 * Create a BufferedWriter on top of an arbitrary sink.
 *
 * @param obj      opaque sink handle passed to `_write` and `_close`
 * @param _write   callback receiving the buffered bytes
 * @param _close   callback invoked by close_bufferedwriter(), may be NULL
 * @param buf_size fill level that triggers a buffer swap; 0 selects 4 KiB
 * @return the new writer. If the background thread cannot be created, a
 *         message is printed, bw->pid is set to 0 and the writer is still
 *         returned; beg_bufferedwriter() then reports the error.
 */
static inline BufferedWriter* open2_bufferedwriter(void *obj, write_data_func _write, close_output_func _close, size_t buf_size){
	BufferedWriter *bw;
	bw = malloc(sizeof(BufferedWriter));
	bw->_file = obj;
	bw->_write = _write;
	bw->_close = _close;
	bw->buffs[0] = NULL;
	bw->buffs[1] = NULL;
	bw->blens[0] = 0;
	bw->blens[1] = 0;
	bw->bios[0] = open_memstream(bw->buffs + 0, bw->blens + 0);
	bw->bios[1] = open_memstream(bw->buffs + 1, bw->blens + 1);
	bw->out = NULL;
	bw->bidx = 0;
	bw->buf_size = buf_size? buf_size : 4 * 1024;
	bw->nbytes = 0;
	bw->lock = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
	bw->running = 0;
	bw->flush = 0;
	if(pthread_create(&bw->pid, NULL, _buffered_writer_thread_func, bw) != 0){
		fprintf(stderr, " -- Failed to create thread [%s] in %s -- %s:%d --\n", "_buffered_writer_thread_func", __FUNCTION__, __FILE__, __LINE__);
		bw->pid = 0;
	} else {
		// only a started thread ever sets `running`; waiting after a failed
		// pthread_create would spin forever
		while(bw->running != 1){ nano_sleep(1); }
	}
	return bw;
}
/*
 * Create a BufferedWriter that writes to the stdio stream `out` via fwrite.
 * No close callback is installed, so `out` is not closed by close_bufferedwriter().
 */
static inline BufferedWriter* open_bufferedwriter(FILE *out, size_t buf_size){
	BufferedWriter *bw;
	bw = open2_bufferedwriter(out, _write_data_file, NULL, buf_size);
	return bw;
}
/*
 * Create a BufferedWriter whose sink is a PGZF writer opened on `out`
 * with the given buffer size, thread count and compression level.
 * The writer's swap threshold is the PGZF object's own bufsize.
 */
static inline BufferedWriter* zopen_bufferedwriter(FILE *out, size_t buf_size, int ncpu, int level){
	PGZF *pgz = open_pgzf_writer(out, buf_size, ncpu, level);
	size_t threshold = pgz->bufsize;
	return open2_bufferedwriter(pgz, write_pgzf4filewriter, close_pgzf4filewriter, threshold);
}
/*
 * Begin a write section: wait for any flush in progress, take the writer's
 * lock and publish the active in-memory stream in bw->out.
 * @return 0 on success; 1 (with bw->out = NULL) when the writer has no thread
 */
static inline int beg_bufferedwriter(BufferedWriter *bw){
	if(!bw->pid){
		bw->out = NULL;
		return 1; // error
	}
	while(bw->flush){
		nano_sleep(1);
	}
	pthread_mutex_lock(&bw->lock);
	bw->out = bw->bios[bw->bidx];
	return 0;
}
/*
 * End a write section: release the writer's lock (when a thread exists)
 * and withdraw bw->out. Always returns 0.
 */
static inline int end_bufferedwriter(BufferedWriter *bw){
	if(bw->pid){ pthread_mutex_unlock(&bw->lock); }
	bw->out = NULL;
	return 0;
}
/*
 * Force everything written so far through to the sink.
 * NOTE(review): the function unlocks bw->lock first and re-locks it before
 * returning, so it appears to be meant for use inside a
 * beg_bufferedwriter()/end_bufferedwriter() section - confirm with callers.
 * @return bw->nbytes after the flush, or 0 when the writer has no thread
 */
static inline size_t flush_bufferedwriter(BufferedWriter *bw){
size_t ret;
if(bw->pid){
// let the writer thread take the lock and swap buffers
pthread_mutex_unlock(&bw->lock);
// wait for an earlier request to be picked up, then post ours
while(bw->flush == 1){ nano_sleep(1); }
bw->flush = 1;
// the thread moves the flag to 2 once it has seen both buffers empty
while(bw->flush == 1){
nano_sleep(1);
}
pthread_mutex_lock(&bw->lock);
// clearing the flag releases the thread from its wait on flush == 2
bw->flush = 0;
// the active stream may have changed during the flush
bw->out = bw->bios[bw->bidx];
ret = bw->nbytes;
} else {
ret = 0;
}
return ret;
}
/*
 * Stop the writer thread (which writes out the remaining data on its way
 * out), release both in-memory streams and their storage, invoke the sink's
 * close callback if one was given, and free the writer.
 * @return total number of bytes that were passed to the sink
 */
static inline size_t close_bufferedwriter(BufferedWriter *bw){
	size_t total;
	int k;
	if(bw->pid){
		bw->running = 0;
		pthread_join(bw->pid, NULL);
	}
	for(k=0;k<2;k++){
		fclose(bw->bios[k]);
	}
	for(k=0;k<2;k++){
		free(bw->buffs[k]);
	}
	if(bw->_close){
		bw->_close(bw->_file);
	}
	total = bw->nbytes;
	free(bw);
	return total;
}
#endif
wtdbg2-2.5/general_graph.h 0000664 0000000 0000000 00000026143 13536643722 0015541 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __GENERAL_GRAPH_RJ_H
#define __GENERAL_GRAPH_RJ_H
#include "list.h"
#include "hashset.h"
// Limits implied by the bit-field widths below:
// node index 40 bits, edge count 18 bits, coverage 19 bits, offset signed 24 bits
#define GEG_MAX_NODE 0xFFFFFFFFFFLLU
#define GEG_MAX_EDGE_CNT 0x3FFFF
#define GEG_MAX_EDGE_COV 0x7FFFF
#define GEG_MAX_EDGE_OFF 0x7FFFFF
#define GEG_MIN_EDGE_OFF -0x7FFFFF
// One edge between (node1, dir1) and (node2, dir2).
// closed: non-zero once the edge has been cut; cov / off: payload reset to 0 when an edge is prepared;
// visit: flag reset to 0 when an edge is prepared.
typedef struct {
u8i node1:40, dir1:1, dir2:1, closed:2, cov:19, visit:1;
u8i node2:40; b8i off:24;
} ge_edge_t;
define_list(geedgev, ge_edge_t);
/*
 * Hash an edge by its two endpoints. Each endpoint contributes one key,
 * (node << 1) | dir; the two keys are mixed in turn into the running hash
 * with the same multiply / xor-shift steps, followed by a final avalanche.
 * cov, off, closed and visit do not take part.
 */
static inline uint64_t _ge_edge_hashcode(ge_edge_t e){
	const uint64_t mul = 0xc6a4a7935bd1e995LLU;
	const int shift = 47;
	uint64_t ends[2];
	uint64_t acc;
	int j;
	ends[0] = (e.node1 << 1) | e.dir1;
	ends[1] = (e.node2 << 1) | e.dir2;
	acc = 1023 ^ (16 * mul);
	for(j=0;j<2;j++){
		uint64_t k = ends[j];
		k *= mul;
		k ^= k >> shift;
		k *= mul;
		acc ^= k;
		acc *= mul;
	}
	acc ^= acc >> shift;
	acc *= mul;
	acc ^= acc >> shift;
	return acc;
}
// The edge hash stores edge indices (u8i), not edges. These macros resolve an
// index to the edge itself through the hash set's userdata, which must point
// to the geedgev holding the edges. They rely on a variable named `set` being
// in scope at the point of expansion.
#define GEEDGEHASH(idx) ((geedgev *)set->userdata)->buffer[idx]
#define ge_edge_hashcode(E) _ge_edge_hashcode(GEEDGEHASH(E))
#define ge_edge_hashequals(E1, E2) (GEEDGEHASH(E1).node1 == GEEDGEHASH(E2).node1 && GEEDGEHASH(E1).node2 == GEEDGEHASH(E2).node2 \
&& GEEDGEHASH(E1).dir1 == GEEDGEHASH(E2).dir1 && GEEDGEHASH(E1).dir2 == GEEDGEHASH(E2).dir2)
define_hashset(geedgehash, u8i, ge_edge_hashcode, ge_edge_hashequals);
// One link of a node's edge chain.
// idx: index of the edge; flg: 0 when the owning node is the edge's node1, 1 when it is node2;
// next: index of the next link in the chain, 0 ends the chain.
typedef struct {
u8i idx:63, flg:1;
u8i next;
} ge_edge_ref_t;
define_list(geedgerefv, ge_edge_ref_t);
// Head of an edge chain: idx = first link, cnt = number of edges currently counted for it
typedef struct { uint64_t idx:46, cnt:18; } ge_ptr_ref_t;
static const ge_ptr_ref_t GE_PTR_REF_NULL = (ge_ptr_ref_t){0, 0};
define_list(geptrrefv, ge_ptr_ref_t);
// Same layout as ge_ptr_ref_t; not used elsewhere in this part of the file
typedef struct { uint64_t idx:46, cnt:18; } ge_vec_ref_t;
static const ge_vec_ref_t GE_VEC_REF_NULL = (ge_vec_ref_t){0, 0};
define_list(gevecrefv, ge_vec_ref_t);
// Graph node.
// closed: set when the node is deleted; edges[d]: chain of edges attached on side d (0 or 1).
// NOTE(review): bt_visit / bt_dir / bt_idx / status / unvisit / aux are not touched by the
// functions in this part of the file; presumably scratch fields for traversals - confirm.
typedef struct {
u8i closed:1, bt_visit:40, bt_dir:1, bt_idx:18, status:4;
u8i unvisit:18, aux:46;
ge_ptr_ref_t edges[2];
} ge_node_t;
define_list(genodev, ge_node_t);
// Result codes written to `info` by single_edge_gegraph():
// ZERO = no open edge, ONE = exactly one, MORE = more than one.
// VISITED and UNDEF are not produced by the functions in this part of the file.
#define GEG_TRACE_MSG_ZERO 0
#define GEG_TRACE_MSG_ONE 1
#define GEG_TRACE_MSG_MORE 2
#define GEG_TRACE_MSG_VISITED 3
#define GEG_TRACE_MSG_UNDEF 4
// Optional hooks that let a caller keep side arrays in step with the graph.
// `userdata` is the nodeaux / edgeaux pointer registered with set_callbacks_gegraph().
typedef void (*geg_clr_node_callback)(void *userdata);
typedef void (*geg_add_node_callback)(void *userdata, u8i nidx);
typedef void (*geg_del_node_callback)(void *userdata, u8i nidx);
typedef void (*geg_clr_edge_callback)(void *userdata);
typedef void (*geg_add_edge_callback)(void *userdata, u8i eidx);
typedef void (*geg_del_edge_callback)(void *userdata, u8i eidx);
// Generates a default set of graph callbacks for two side lists
// (node_tag holding node_tag_t, edge_tag holding edge_tag_t):
//   <tag>nodeclr / <tag>edgeclr  clear the list
//   <tag>nodeadd / <tag>edgeadd  zero the element at idx, appending a zeroed element when idx is past the end
//   <tag>nodedel / <tag>edgedel  do nothing
// plus <tag>_set_callbacks_gegraph(), which registers all six on a graph.
#define define_simple_geg_callback(tag, node_tag, node_tag_t, edge_tag, edge_tag_t) \
void tag##nodeclr(void *aux){ clear_##node_tag((node_tag*)aux); } \
void tag##nodeadd(void *aux, u8i idx){ \
node_tag *nodes = (node_tag*)aux; \
if(idx < nodes->size){ \
memset(ref_##node_tag(nodes, idx), 0, sizeof(node_tag_t)); \
} else { \
memset(next_ref_##node_tag(nodes), 0, sizeof(node_tag_t)); \
} \
} \
void tag##nodedel(void *aux, u8i idx){ UNUSED(aux); UNUSED(idx); } \
void tag##edgeclr(void *aux){ clear_##edge_tag((edge_tag*)aux); } \
void tag##edgeadd(void *aux, u8i idx){ \
edge_tag *edges = (edge_tag*)aux; \
if(idx < edges->size){ \
memset(ref_##edge_tag(edges, idx), 0, sizeof(edge_tag_t)); \
} else { \
memset(next_ref_##edge_tag(edges), 0, sizeof(edge_tag_t)); \
} \
} \
void tag##edgedel(void *aux, u8i idx){ UNUSED(aux); UNUSED(idx); } \
static inline void tag##_set_callbacks_gegraph(GEGraph *g, node_tag *nodeaux, edge_tag *edgeaux){ \
set_callbacks_gegraph(g, (void*)nodeaux, (void*)edgeaux, tag##nodeclr, tag##nodeadd, tag##nodedel, tag##edgeclr, tag##edgeadd, tag##edgedel); \
}
// General graph: nodes, edges, a hash from endpoint pairs to edge index,
// and the per-node chains of edge references.
typedef struct {
genodev *nodes;
// slot 0 is used as a scratch key by prepare_edge_gegraph(); real edges start at 1
geedgev *edges;
// stores edge indices; hashes/compares through `edges` (see GEEDGEHASH)
geedgehash *ehash;
// chain links; index 0 acts as the end-of-chain marker
geedgerefv *erefs;
// user pointers handed to the node / edge callbacks
void *nodeaux;
void *edgeaux;
// optional callbacks, NULL when unset
geg_clr_node_callback nodeclr;
geg_add_node_callback nodeadd;
geg_del_node_callback nodedel;
geg_clr_edge_callback edgeclr;
geg_add_edge_callback edgeadd;
geg_del_edge_callback edgedel;
} GEGraph;
static inline GEGraph* init_gegraph(){
GEGraph *g;
g = malloc(sizeof(GEGraph));
g->nodes = init_genodev(32);
g->edges = init_geedgev(32);
g->ehash = init_geedgehash(1023);
set_userdata_geedgehash(g->ehash, g->edges);
g->erefs = init_geedgerefv(32);
g->nodeaux = NULL;
g->edgeaux = NULL;
g->nodeclr = NULL;
g->nodeadd = NULL;
g->nodedel = NULL;
g->edgeclr = NULL;
g->edgeadd = NULL;
g->edgedel = NULL;
return g;
}
/*
 * Register the user pointers and the six optional callbacks on `g`.
 * Any callback may be NULL.
 */
static inline void set_callbacks_gegraph(GEGraph *g, void *nodeaux, void *edgeaux, geg_clr_node_callback nodeclr, geg_add_node_callback nodeadd, geg_del_node_callback nodedel, geg_clr_edge_callback edgeclr, geg_add_edge_callback edgeadd, geg_del_edge_callback edgedel){
	/* node side */
	g->nodeaux = nodeaux;
	g->nodeclr = nodeclr;
	g->nodeadd = nodeadd;
	g->nodedel = nodedel;
	/* edge side */
	g->edgeaux = edgeaux;
	g->edgeclr = edgeclr;
	g->edgeadd = edgeadd;
	g->edgedel = edgedel;
}
/* Release the graph and the four containers it owns (user aux data is not touched). */
static inline void free_gegraph(GEGraph *g){
	free_geedgerefv(g->erefs);
	free_geedgehash(g->ehash);
	free_geedgev(g->edges);
	free_genodev(g->nodes);
	free(g);
}
/*
 * Empty the graph for reuse.
 * Nodes, edges, the edge hash and the chain links are cleared, and the
 * reserved element 0 of the edge list and of the link list is re-created
 * (zeroed). Registered clear callbacks are invoked; the edge-add callback is
 * invoked for the reserved edge 0 so that a side array stays index-aligned.
 */
static inline void reset_gegraph(GEGraph *g){
	clear_genodev(g->nodes);
	if(g->nodeclr) g->nodeclr(g->nodeaux);
	clear_geedgev(g->edges);
	// edge 0 is the scratch slot used by prepare_edge_gegraph()
	memset(next_ref_geedgev(g->edges), 0, sizeof(ge_edge_t));
	if(g->edgeclr){
		g->edgeclr(g->edgeaux);
		// callbacks are optional individually: do not assume edgeadd is set
		if(g->edgeadd) g->edgeadd(g->edgeaux, 0);
	}
	clear_geedgehash(g->ehash);
	clear_geedgerefv(g->erefs);
	// link 0 is the end-of-chain marker
	memset(next_ref_geedgerefv(g->erefs), 0, sizeof(ge_edge_ref_t));
}
/*
 * Append a zero-initialised node and notify the node-add callback (if any)
 * with the new node's index.
 * @return pointer to the new node
 */
static inline ge_node_t* add_node_gegraph(GEGraph *g){
	ge_node_t *node = next_ref_genodev(g->nodes);
	memset(node, 0, sizeof(ge_node_t));
	if(g->nodeadd != NULL){
		g->nodeadd(g->nodeaux, offset_genodev(g->nodes, node));
	}
	return node;
}
/*
 * Look up the edge (node1, dir1) -> (node2, dir2), creating it when absent.
 * `exists` must not be NULL; it is set to 1 when the edge was already present.
 * @return the existing or newly created edge
 */
static inline ge_edge_t* prepare_edge_gegraph(GEGraph *g, u8i node1, int dir1, u8i node2, int dir2, int *exists){
ge_node_t *n;
ge_edge_t *e;
ge_edge_ref_t *f;
u8i *u;
// build the lookup key in the reserved scratch slot 0 of the edge list
e = ref_geedgev(g->edges, 0);
// canonical orientation: the smaller node index goes first; when the ends are
// swapped both directions are complemented
if(node1 <= node2){
e->node1 = node1;
e->dir1 = dir1;
e->node2 = node2;
e->dir2 = dir2;
} else {
e->node1 = node2;
e->dir1 = !dir2;
e->node2 = node1;
e->dir2 = !dir1;
}
e->cov = 0;
e->off = 0;
e->visit = 0;
e->closed = 0;
// the hash holds edge indices, so index 0 stands for the scratch key
u = prepare_geedgehash(g->ehash, 0, exists);
if(*exists){
return g->edges->buffer + *u;
} else {
// new edge: store its index in the hash and copy the key into a fresh slot
*u = g->edges->size;
e = next_ref_geedgev(g->edges);
*e = g->edges->buffer[0];
// push a link at the head of node1's chain on side dir1
n = g->nodes->buffer + e->node1;
f = next_ref_geedgerefv(g->erefs);
f->idx = *u;
f->flg = 0;
f->next = n->edges[e->dir1].idx;
n->edges[e->dir1].idx = g->erefs->size - 1;
n->edges[e->dir1].cnt ++;
// and one at the head of node2's chain on side !dir2
n = g->nodes->buffer + e->node2;
f = next_ref_geedgerefv(g->erefs);
f->idx = *u;
f->flg = 1;
f->next = n->edges[!e->dir2].idx;
n->edges[!e->dir2].idx = g->erefs->size - 1;
n->edges[!e->dir2].cnt ++;
if(g->edgeadd) g->edgeadd(g->edgeaux, offset_geedgev(g->edges, e));
return e;
}
}
/*
 * Mark an open edge as closed with `closed_val` and decrement the edge
 * counters of both endpoints; then notify the edge-del callback (if any).
 * An edge that is already closed is left untouched.
 */
static inline void cut_edge_core_gegraph(GEGraph *g, ge_edge_t *e, int closed_val){
	ge_node_t *end;
	if(e->closed){
		return;
	}
	e->closed = closed_val;
	end = ref_genodev(g->nodes, e->node1);
	end->edges[e->dir1].cnt --;
	end = ref_genodev(g->nodes, e->node2);
	end->edges[!e->dir2].cnt --;
	if(g->edgedel){
		g->edgedel(g->edgeaux, offset_geedgev(g->edges, e));
	}
}
#define cut_edge_gegraph(g, e) cut_edge_core_gegraph(g, e, 1)
/*
 * Re-open a closed edge: clear its closed mark, increment the edge counters
 * of both endpoints and notify the edge-add callback (if any).
 * An edge that is already open is left untouched.
 */
static inline void revive_edge_gegraph(GEGraph *g, ge_edge_t *e){
	if(e->closed == 0) return;
	e->closed = 0;
	ref_genodev(g->nodes, e->node1)->edges[e->dir1].cnt ++;
	ref_genodev(g->nodes, e->node2)->edges[!e->dir2].cnt ++;
	// guard the callback that is actually invoked (the original tested edgedel)
	if(g->edgeadd) g->edgeadd(g->edgeaux, offset_geedgev(g->edges, e));
}
/*
 * Return the only open edge attached to side `dir` of node `n`.
 * @param info optional; receives GEG_TRACE_MSG_ZERO, _ONE or _MORE
 * @return the link of that edge, or NULL when there is none or more than one
 */
static inline ge_edge_ref_t* single_edge_gegraph(GEGraph *g, ge_node_t *n, int dir, int *info){
	ge_edge_ref_t *cur, *only;
	uint64_t link;
	only = NULL;
	if(info) *info = GEG_TRACE_MSG_ZERO;
	if(n->edges[dir].cnt == 0) return NULL;
	for(link = n->edges[dir].idx; link; ){
		cur = ref_geedgerefv(g->erefs, link);
		link = cur->next;
		if(g->edges->buffer[cur->idx].closed) continue;
		if(only){
			// a second open edge: the answer is ambiguous
			if(info) *info = GEG_TRACE_MSG_MORE;
			return NULL;
		}
		if(info) *info = GEG_TRACE_MSG_ONE;
		only = cur;
	}
	return only;
}
#define count_edges_gegraph(g, n, dir) (n)->edges[dir].cnt
// dir = 2 means either strand
/*
 * Find a link from node1 (leaving on side dir1) to node2 (arriving with dir2).
 * A direction value greater than 1 means "either". Closed edges are not
 * filtered out.
 * @return the first matching link in node1's chain(s), or NULL
 */
static inline ge_edge_ref_t* edge_node2node_gegraph(GEGraph *g, u8i node1, int dir1, u8i node2, int dir2){
	ge_node_t *src;
	ge_edge_ref_t *ref;
	ge_edge_t *edge;
	uint64_t cur;
	int side, side_end, hit;
	src = ref_genodev(g->nodes, node1);
	if(dir1 > 1){
		side = 0;
		side_end = 2;
	} else {
		side = dir1;
		side_end = dir1 + 1;
	}
	for(; side < side_end; side++){
		cur = src->edges[side].idx;
		while(cur){
			ref = ref_geedgerefv(g->erefs, cur);
			cur = ref->next;
			edge = ref_geedgev(g->edges, ref->idx);
			if(ref->flg){
				// node1 is stored as the edge's second end: the far end is edge->node1, seen reversed
				hit = (edge->node1 == node2) && (dir2 > 1 || dir2 == (!edge->dir1));
			} else {
				hit = (edge->node2 == node2) && (dir2 > 1 || dir2 == edge->dir2);
			}
			if(hit) return ref;
		}
	}
	return NULL;
}
/* Cut every edge attached to either side of node `n`, marking each with `closed_val`. */
static inline void del_node_edges_gegraph(GEGraph *g, ge_node_t *n, int closed_val){
	ge_edge_ref_t *ref;
	uint64_t cur;
	uint32_t side;
	for(side = 0; side < 2; side++){
		cur = n->edges[side].idx;
		while(cur){
			ref = ref_geedgerefv(g->erefs, cur);
			cur = ref->next;
			cut_edge_core_gegraph(g, g->edges->buffer + ref->idx, closed_val);
		}
	}
}
/*
 * Delete node `n`: cut all of its edges, mark the node closed and notify the
 * node-del callback (if any) with the node's index.
 */
static inline void del_node_gegraph(GEGraph *g, ge_node_t *n){
	del_node_edges_gegraph(g, n, 1);
	n->closed = 1;
	// a deletion must be reported through nodedel (the original invoked nodeadd here)
	if(g->nodedel) g->nodedel(g->nodeaux, offset_genodev(g->nodes, n));
}
// Loop over the edge chain on side `dir` of node `n`.
// Inside the loop `f` is the current link and `e` the edge it refers to
// (closed edges are included). Usage:
//   geg_beg_iter_edges(g, n, dir, f, e);
//       ... body ...
//   geg_end_iter_edges();
// The next link is fetched before the body runs.
#define geg_beg_iter_edges(g, n, dir, f, e) \
{ \
u8i _geg_iter_idx; \
_geg_iter_idx = (n)->edges[dir].idx; \
while(_geg_iter_idx){ \
(f) = ref_geedgerefv((g)->erefs, _geg_iter_idx); \
_geg_iter_idx = (f)->next; \
(e) = (g)->edges->buffer + (f)->idx
#define geg_end_iter_edges() \
} \
}
/*
 * Write the graph to `out` in Graphviz DOT format.
 * Closed nodes and closed edges are skipped. Every edge is emitted twice,
 * once per traversal direction, labelled "<strands>:<cov>:<off>" and
 * coloured by its strand combination. The stream is flushed, not closed.
 */
static inline void print_dot_gegraph(GEGraph *g, FILE *out){
	static const char *colors[2][2] = {{"blue", "green"}, {"red", "gray"}};
	ge_node_t *n;
	ge_edge_t *e;
	u8i i;
	fprintf(out, "digraph {\n");
	for(i=0;i<g->nodes->size;i++){
		n = ref_genodev(g->nodes, i);
		if(n->closed) continue;
		fprintf(out, " N%llu\n", i);
	}
	// edge 0 is the scratch slot, real edges start at 1
	for(i=1;i<g->edges->size;i++){
		e = ref_geedgev(g->edges, i);
		if(e->closed) continue;
		fprintf(out, " N%llu -> N%llu [label=\"%c%c:%d:%d\" color=%s]\n", (u8i)e->node1, (u8i)e->node2, "+-"[e->dir1], "+-"[e->dir2], (int)e->cov, (int)e->off, colors[e->dir1][e->dir2]);
		fprintf(out, " N%llu -> N%llu [label=\"%c%c:%d:%d\" color=%s]\n", (u8i)e->node2, (u8i)e->node1, "-+"[e->dir2], "-+"[e->dir1], (int)e->cov, (int)e->off, colors[!e->dir2][!e->dir1]);
	}
	fprintf(out, "}\n");
	fflush(out);
}
/* Write the graph in DOT format to the file opened from `prefix` and `suffix`, then close it. */
static inline void fprint_dot_gegraph(GEGraph *g, char *prefix, char *suffix){
	FILE *fp = open_file_for_write(prefix, suffix, 1);
	print_dot_gegraph(g, fp);
	fclose(fp);
}
#endif
wtdbg2-2.5/hashset.h 0000664 0000000 0000000 00000062057 13536643722 0014406 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __HASH_SET_RJ
#define __HASH_SET_RJ
#include
#include
#include
#include
#include
#include "mem_share.h"
#include "bitvec.h"
/* Table of hash-table capacities in increasing order, each about twice the previous one. */
static const uint64_t sys_prime_list[61] = {
0x7LLU, 0xfLLU, 0x1fLLU, 0x43LLU, 0x89LLU,
0x115LLU, 0x22dLLU, 0x45dLLU, 0x8bdLLU, 0x1181LLU,
0x2303LLU, 0x4609LLU, 0x8c17LLU, 0x1183dLLU, 0x2307bLLU,
0x460fdLLU, 0x8c201LLU, 0x118411LLU, 0x230833LLU, 0x461069LLU,
0x8c20e1LLU, 0x11841cbLLU, 0x2308397LLU, 0x461075bLLU, 0x8c20ecbLLU,
0x11841da5LLU, 0x23083b61LLU, 0x461076c7LLU, 0x8c20ed91LLU, 0x11841db31LLU,
0x23083b673LLU, 0x461076d1bLLU, 0x8c20eda41LLU, 0x11841db48dLLU, 0x23083b6937LLU,
0x461076d27fLLU, 0x8c20eda50dLLU, 0x11841db4a59LLU, 0x23083b694ebLLU, 0x461076d29f1LLU,
0x8c20eda5441LLU, 0x11841db4a887LLU, 0x23083b69511fLLU, 0x461076d2a2c1LLU, 0x8c20eda54591LLU,
0x11841db4a8b55LLU, 0x23083b69516c1LLU, 0x461076d2a2da5LLU, 0x8c20eda545b55LLU, 0x11841db4a8b6b5LLU,
0x23083b69516d91LLU, 0x461076d2a2db3bLLU, 0x8c20eda545b69dLLU, 0x11841db4a8b6d5dLLU, 0x23083b69516daf5LLU,
0x461076d2a2db5edLLU, 0x8c20eda545b6c5fLLU, 0x11841db4a8b6d8ebLLU, 0x23083b69516db1ffLLU, 0x461076d2a2db643fLLU,
0x8c20eda545b6c8f3LLU
};
/*
 * Return the smallest table entry that is >= n.
 * When n exceeds every entry, the last (largest) entry is returned.
 */
static inline uint64_t _rj_hashset_find_prime(uint64_t n){
	uint32_t idx;
	for(idx = 0; idx < 60; idx++){
		if(n <= sys_prime_list[idx]) break;
	}
	return sys_prime_list[idx];
}
#define init_hashset_macro(hash_type, hash_ele_type) \
typedef struct { hash_ele_type *array; BitVec *ones, *dels; size_t e_size; size_t ocp; size_t size; size_t count; size_t max; float load_factor; size_t iter_ptr; void *userdata; } hash_type; \
static inline size_t hash_type##_obj_desc_cnt(void *obj, int idx){ \
hash_type *set; \
set = (hash_type*)obj; \
if(set->dels){ \
switch(idx){ \
case 0: return ((hash_type*)obj)->size * sizeof(hash_ele_type); \
default: return 1; \
} \
} else { \
switch(idx){ \
case 0: return ((hash_type*)obj)->count * sizeof(hash_ele_type); \
case 1: return 1; \
default: return 0; \
} \
} \
} \
static const obj_desc_t hash_type##_obj_desc = {TOSTR(_hashset_##hash_type), sizeof(hash_type), 3, {1, 1, 1}, {offsetof(hash_type, array), offsetof(hash_type, ones), offsetof(hash_type, dels)}, {(obj_desc_t*)&OBJ_DESC_DATA, (obj_desc_t*)&bitvec_obj_desc, (obj_desc_t*)&bitvec_obj_desc}, hash_type##_obj_desc_cnt, NULL}; \
static inline int hash_type##_is_prime(uint64_t num){ \
uint64_t i, max; \
if(num < 4) return 1; \
if(num % 2 == 0) return 0; \
max = (uint64_t)sqrt((double)num); \
for(i=3;ie_size = sizeof(hash_ele_type); \
set->size = _rj_hashset_find_prime(size); \
set->count = 0; \
set->ocp = 0; \
set->load_factor = factor; \
set->max = set->size * set->load_factor; \
set->iter_ptr = 0; \
set->array = calloc(set->size, set->e_size); \
set->ones = init_bitvec(set->size); \
set->dels = init_bitvec(set->size); \
set->userdata = NULL; \
return set; \
} \
static inline void set_userdata_##hash_type(hash_type *set, void *userdata){ set->userdata = userdata; } \
static inline hash_type* init_##hash_type(uint32_t size){ return init2_##hash_type(size, 0.67f); }
// Generates the lookup functions of a hash set type:
//   get_<type>(set, key)      pointer to the stored element equal to `key`, or NULL
//   offset_<type>(set, ptr)   index of an element pointer inside set->array
//   ref_<type>(set, off)      element pointer for an index
//   getval_<type>(set, key)   hash_ele2val() applied to the result of get (which may be NULL)
// Two layouts are handled by get:
//   - normal (set->dels != NULL): open addressing with linear probing; `ones` marks
//     slots that were ever occupied, `dels` marks deleted ones, which are skipped;
//   - frozen (set->dels == NULL): elements are packed in set->array and the array
//     position of the first candidate is obtained with rank_bitvec() on `ones`.
// The user-supplied hash/equals macros may refer to the local variable `set`.
#define get_hashset_macro(hash_type, hash_ele_type, hash_key_type, hash_key_code, hash_key_equal, hash_val_type, hash_ele2val) \
static inline hash_ele_type* get_##hash_type(hash_type *set, hash_key_type key){\
hash_ele_type *e; \
size_t hc, hi; \
hc = hash_key_code(key) % set->size; \
if(set->dels){ \
while(1){ \
if(get_bitvec(set->ones, hc) == 0){ \
return NULL; \
} else if(get_bitvec(set->dels, hc)){ \
} else { \
e = ((hash_ele_type*)set->array) + hc; \
if(hash_key_equal((key), (*e))) return e; \
} \
hc = (hc + 1) % set->size; \
} \
} else { \
hi = MAX_U8; \
while(1){ \
if(get_bitvec(set->ones, hc)){ \
if(hi == MAX_U8){ \
hi = rank_bitvec(set->ones, hc); \
} \
e = ((hash_ele_type*)set->array) + hi; \
if(hash_key_equal((key), (*e))) return e; \
} else { \
return NULL; \
} \
hc ++; \
hi ++; \
} \
} \
return NULL; \
} \
static inline size_t offset_##hash_type(hash_type *set, hash_ele_type *ptr){ \
return ptr - set->array; \
} \
static inline hash_ele_type* ref_##hash_type(hash_type *set, size_t off){ return set->array + off; } \
static inline hash_val_type getval_##hash_type(hash_type *set, hash_key_type key){ \
hash_ele_type *e; \
e = get_##hash_type(set, key); \
return hash_ele2val(e); \
}
/*
 * Generates prepare_<type>(set, key, exists): find the slot of `key`, or
 * reserve one for it.
 *   - key present: *exists = 1, returns the stored element;
 *   - key absent:  *exists = 0, returns an uninitialised slot that the caller
 *     must fill with an element matching `key`; set->count is incremented.
 *     The first deleted slot met while probing is reused when there is one.
 *   - frozen set (set->dels == NULL): *exists = 0, returns NULL.
 * `exists` may be NULL on every path.
 * The user-supplied hash/equals macros may refer to the local variable `set`.
 */
#define prepare_hashset_macro(hash_type, hash_ele_type, hash_key_type, hash_key_code, hash_key_equal) \
static inline void encap_##hash_type(hash_type *set, size_t num); \
static inline hash_ele_type* prepare_##hash_type(hash_type *set, hash_key_type key, int *exists){\
	hash_ele_type *e; \
	size_t hc, d; \
	if(set->dels == NULL){ \
		if(exists) *exists = 0; \
		return NULL; \
	} \
	encap_##hash_type(set, 1); \
	hc = hash_key_code((key)) % set->size; \
	d = set->size; /* set->size means: no deleted slot seen yet */ \
	while(1){ \
		if(get_bitvec(set->ones, hc) == 0){ \
			if(d == set->size){ \
				one_bitvec(set->ones, hc); \
				set->ocp ++; \
			} else { \
				hc = d; \
				zero_bitvec(set->dels, hc); \
			} \
			if(exists) *exists = 0; \
			set->count ++; \
			e = ((hash_ele_type*)set->array) + hc; \
			return e; \
		} else if(get_bitvec(set->dels, hc)){ \
			if(d == set->size) d = hc; \
		} else { \
			e = ((hash_ele_type*)set->array) + hc; \
			if(hash_key_equal((key), (*e))){ \
				if(exists) *exists = 1; \
				return e; \
			} \
		} \
		hc = (hc + 1) % set->size; \
	} \
	return NULL; \
}
/* Generates exists_<type>(set, key): 1 when `key` is stored in the set, 0 otherwise. */
#define exists_hashset_macro(hash_type, hash_ele_type, hash_key_type, hash_key_code, hash_key_equal) \
static inline int exists_##hash_type(hash_type *set, hash_key_type key){ \
	hash_ele_type *found = get_##hash_type(set, key); \
	return found != NULL; \
}
// Generates add_<type>(set, ele): store `ele`, overwriting an equal element
// when one is already present. Returns the stored element, or NULL on a frozen
// set (set->dels == NULL). The table is NOT grown here; put_<type> grows it
// first. `d` remembers the first deleted slot met while probing (set->size
// means none) so that it can be reused for a new element.
// The user-supplied hash/equals macros may refer to the local variable `set`.
#define add_hashset_macro(hash_type, hash_ele_type, hash_code_macro, hash_equal_macro) \
static inline hash_ele_type* add_##hash_type(hash_type *set, hash_ele_type ele){ \
hash_ele_type *e; \
size_t d, hc; \
if(set->dels == NULL) return NULL; \
hc = hash_code_macro(ele) % set->size; \
d = set->size; \
do { \
if(get_bitvec(set->ones, hc) == 0){ \
if(d == set->size){ \
one_bitvec(set->ones, hc); \
set->ocp ++; \
} else { \
hc = d; \
zero_bitvec(set->dels, hc); \
} \
set->count ++; \
e = ((hash_ele_type*)set->array) + hc; \
*e = ele; \
return e; \
} else if(get_bitvec(set->dels, hc)){ \
if(d == set->size) d = hc; \
} else { \
e = ((hash_ele_type*)set->array) + hc; \
if(hash_equal_macro((ele), (*e))){ \
*e = ele; \
return e; \
} \
} \
hc = (hc + 1) % set->size; \
} while(1); \
return NULL; \
}
/* Generates put_<type>(set, ele): make room for one more element, then add `ele`. */
#define put_hashset_macro(hash_type, hash_ele_type) \
static inline hash_ele_type* put_##hash_type(hash_type *set, hash_ele_type ele){ \
	hash_ele_type *stored; \
	encap_##hash_type(set, 1); \
	stored = add_##hash_type(set, ele); \
	return stored; \
}
// Generates the removal functions of a hash set type:
//   delete_<type>(set, ele)  remove the element `ele` points at (must be a live slot of set->array)
//   remove_<type>(set, key)  remove the element equal to `key`; returns 1 if found, 0 otherwise
// Both return 0 on a frozen set (set->dels == NULL).
// A removed slot is fully released (its `ones` bit cleared) only when the following
// slot was never occupied, i.e. no probe sequence can run through it; otherwise it
// is marked in `dels` so that lookups keep probing past it.
// The user-supplied hash/equals macros may refer to the local variable `set`.
#define remove_hashset_macro(hash_type, hash_ele_type, hash_key_type, hash_key_code, hash_key_equal) \
static inline int delete_##hash_type(hash_type *set, hash_ele_type *ele){ \
size_t hc; \
if(set->dels == NULL) return 0; \
hc = offset_##hash_type(set, ele); \
if(get_bitvec(set->ones, (hc + 1) % set->size) == 0){ \
zero_bitvec(set->ones, hc); \
set->ocp --; \
} else { \
one_bitvec(set->dels, hc); \
} \
set->count --; \
return 1; \
} \
\
static inline int remove_##hash_type(hash_type *set, hash_key_type key){ \
hash_ele_type *e; \
size_t hc; \
if(set->dels == NULL) return 0; \
hc = hash_key_code(key) % set->size; \
while(1){ \
if(get_bitvec(set->ones, hc) == 0){ \
return 0; \
} else if(get_bitvec(set->dels, hc)){ \
} else { \
e = ((hash_ele_type*)set->array) + hc; \
if(hash_key_equal((key), (*e))){ \
if(get_bitvec(set->ones, (hc + 1) % set->size) == 0){ \
zero_bitvec(set->ones, hc); \
set->ocp --; \
} else { \
one_bitvec(set->dels, hc); \
} \
set->count --; \
return 1; \
} \
} \
hc = (hc + 1) % set->size; \
} \
return 0; \
}
// Generates reset_iter_<type>(set): rewind the set's built-in iterator to the first slot
#define reset_iter_hashset_macro(hash_type) static inline void reset_iter_##hash_type(hash_type *set){ set->iter_ptr = 0; }
/*
 * Generates the iteration functions of a hash set type:
 *   ref_iter2_<type>(set, &pos)  next live element at or after *pos, advancing
 *                                *pos past it; NULL when the end is reached
 *   ref_iter_<type>(set)         same, using the set's built-in position
 * On a frozen set (set->dels == NULL) the elements are packed, so positions
 * 0 .. count-1 are returned in order.
 */
#define ref_iter_hashset_macro(hash_type, hash_ele_type) \
static inline hash_ele_type* ref_iter2_##hash_type(hash_type *set, size_t *iter_ptr){ \
	if(set->dels == NULL){ \
		if((*iter_ptr) < set->count){ \
			return (((hash_ele_type*)set->array) + (*iter_ptr)++); \
		} \
		return NULL; \
	} \
	while(((*iter_ptr) = next_one_bitvec(set->ones, (*iter_ptr))) < set->size){ \
		if(get_bitvec(set->dels, (*iter_ptr)) == 0){ \
			return (((hash_ele_type*)set->array) + (*iter_ptr)++); \
		} \
		(*iter_ptr) ++; \
	} \
	return NULL; \
} \
static inline hash_ele_type* ref_iter_##hash_type(hash_type *set){ \
	return ref_iter2_##hash_type(set, &(set->iter_ptr)); \
}
// Generates count_<type>(set): number of live elements in the set
#define count_hashset_macro(hash_type) static inline int64_t count_##hash_type(hash_type *set){ return set->count; }
/*
 * Generates freeze_<type>(set, load_factor): convert the set into its compact,
 * read-only layout.
 *   1. live elements are packed to the front of set->array;
 *   2. the array is shrunk to count + 1 elements (the extra one is zeroed);
 *   3. elements are sorted by their hash slot for the new size `sz`;
 *   4. a new `ones` bit vector is built in which element i occupies the first
 *      free bit at or after its hash slot, and is indexed for rank queries.
 * `dels` is released and set to NULL, which marks the set as frozen: add,
 * prepare, remove, clear and encap become no-ops afterwards.
 * A load_factor of 0 keeps the set's current load factor.
 * @return 1 on success, 0 when the set is already frozen
 * The user-supplied hash macro may refer to the local variable `set`.
 */
#define freeze_hashset_macro(hash_type, hash_ele_type, hash_code_macro) \
static inline int freeze_##hash_type(hash_type *set, float load_factor){ \
	size_t *hvs, i, j, sz; \
	if(set->dels == NULL) return 0; \
	if(load_factor == 0) load_factor = set->load_factor; \
	sz = set->count / load_factor; \
	sz = _rj_hashset_find_prime(sz); \
	for(i=j=0;(i=next_one_bitvec(set->ones, i))<set->size;i++){ \
		if(get_bitvec(set->dels, i)) continue; \
		if(j < i){ \
			set->array[j] = set->array[i]; \
		} \
		j ++; \
	} \
	free_bitvec(set->ones); \
	set->ones = NULL; \
	free_bitvec(set->dels); \
	set->dels = NULL; \
	set->size = sz; \
	set->load_factor = load_factor; \
	set->ocp = set->count; \
	set->array = realloc(set->array, (set->count + 1) * sizeof(hash_ele_type)); \
	memset(set->array + set->count, 0, sizeof(hash_ele_type)); \
	hvs = malloc(set->count * sizeof(size_t)); \
	for(i=0;i<set->count;i++){ \
		hvs[i] = hash_code_macro(set->array[i]) % sz; \
	} \
	sort_array_adv(set->count, hvs[a] > hvs[b], swap_var(hvs[a], hvs[b]); swap_var(set->array[a], set->array[b])); \
	/* first pass: find how far the occupied bits extend past sz */ \
	for(i=j=0;i<set->count;i++){ \
		if(j < hvs[i]) j = hvs[i]; \
		j ++; \
	} \
	if(j < sz) j = sz; \
	set->ones = init_bitvec(j + 1); \
	/* second pass: assign each element its bit */ \
	for(i=j=0;i<set->count;i++){ \
		if(j < hvs[i]) j = hvs[i]; \
		one_bitvec(set->ones, j); \
		j ++; \
	} \
	free(hvs); \
	index_bitvec(set->ones); \
	return 1; \
}
/*
 * Generates clear_<type>(set): remove every element and rewind the iterator.
 * The table keeps its current capacity. A frozen set (set->dels == NULL) is
 * left unchanged.
 */
#define clear_hashset_macro(hash_type) \
static inline void clear_##hash_type(hash_type *set){ \
	if(set->dels != NULL){ \
		zeros_bitvec(set->ones); \
		zeros_bitvec(set->dels); \
		set->count = 0; \
		set->ocp = 0; \
		set->iter_ptr = 0; \
	} \
}
/* Generates free_<type>(set): release the element array, whichever bit vectors exist, and the set itself. */
#define free_hashset_macro(hash_type) \
static inline void free_##hash_type(hash_type *set){ \
	free(set->array); \
	if(set->ones != NULL){ free_bitvec(set->ones); } \
	if(set->dels != NULL){ free_bitvec(set->dels); } \
	free(set); \
}
/*
 * Generates encap_<hash_type>(set, num) and offsetof_<hash_type>(set, ptr).
 *
 * encap_<hash_type>() makes room for `num` more elements. It returns at once
 * when set->dels is NULL or when set->ocp + num still fits under set->max.
 * Otherwise it picks a new prime size n (doubling until
 * n * load_factor >= count + num), reallocates the array and rehashes every
 * live element IN PLACE:
 *   - `ones`/`dels` are fresh bitmaps for the new layout;
 *   - the old set->dels doubles as a "slot already vacated" marker, so an old
 *     slot is live only while its old `ones` bit is set and its old `dels`
 *     bit is clear;
 *   - when the probed destination still holds a live old element, that
 *     element is swapped out and becomes the next key to place.
 * The statement order inside the loop is significant; do not reorder.
 * Terminates the process if the array cannot be reallocated.
 *
 * offsetof_<hash_type>() returns the index of `ptr` within set->array.
 */
#define encap_hashset_macro(hash_type, hash_ele_type, hash_code_macro) \
static inline void encap_##hash_type(hash_type *set, size_t num){ \
	BitVec *ones, *dels; \
	size_t i, n, hc; \
	hash_ele_type key; \
	if(set->dels == NULL) return; \
	if(set->ocp + num <= set->max) return; \
	n = set->size; \
	do{ n = _rj_hashset_find_prime(n * 2); } while(n * set->load_factor < set->count + num); \
	set->array = realloc(set->array, n * set->e_size); \
	if(set->array == NULL){ \
		fprintf(stderr, "-- Out of memory --\n"); \
		print_backtrace(stderr, 20); \
		exit(1); \
	} \
	ones = init_bitvec(n); \
	dels = init_bitvec(n); \
	set->ocp = set->count; \
	set->max = n * set->load_factor; \
	for(i=0;(i=next_one_bitvec(set->ones, i))<set->size;i++){ \
		if(get_bitvec(set->dels, i)) continue; \
		key = ((hash_ele_type*)set->array)[i]; \
		/* Mark the old slot as vacated before re-inserting its element. */ \
		one_bitvec(set->dels, i); \
		while(1){ \
			hc = hash_code_macro(key) % n; \
			while(get_bitvec(ones, hc)){ \
				hc = (hc + 1) % n; \
			} \
			one_bitvec(ones, hc); \
			if(hc < set->size && get_bitvec(set->ones, hc) && get_bitvec(set->dels, hc) == 0){ \
				/* Destination still holds an unmoved old element: evict it and place it next. */ \
				swap_var(key, ((hash_ele_type*)set->array)[hc]); \
				one_bitvec(set->dels, hc); \
			} else { \
				((hash_ele_type*)set->array)[hc] = key; \
				break; \
			} \
		} \
	} \
	swap_var(ones, set->ones); \
	swap_var(dels, set->dels); \
	set->size = n; \
	free_bitvec(ones); \
	free_bitvec(dels); \
} \
static inline size_t offsetof_##hash_type(hash_type *set, hash_ele_type *ptr){ return ptr - set->array; }
/* Identity mapping; define_hashset passes it as the element-to-value accessor. */
#define ITSELF(E) (E)
/* Plain == comparison of two expressions. */
#define NUM_EQUALS(E1, E2) ((E1) == (E2))
/*
 * Instantiates the full family of generated functions for one hash table type.
 *   hash_type        name of the container type, also the suffix of every generated function
 *   hash_ele_type    stored element type
 *   hash_code_macro  hash of an element
 *   hash_equal_macro equality of two elements
 *   hash_key_type    type of a lookup key
 *   hash_key_code    hash of a lookup key
 *   hash_key_equal   equality of a key and an element
 *   hash_val_type    type handed to get_hashset_macro as the value type
 *   hash_ele2val     accessor handed to get_hashset_macro to turn an element into a value
 * Each line below forwards a subset of these arguments to one generator macro.
 */
#define define_hashtable(hash_type, hash_ele_type, hash_code_macro, hash_equal_macro, hash_key_type, hash_key_code, hash_key_equal, hash_val_type, hash_ele2val) \
init_hashset_macro(hash_type, hash_ele_type); \
get_hashset_macro(hash_type, hash_ele_type, hash_key_type, hash_key_code, hash_key_equal, hash_val_type, hash_ele2val); \
prepare_hashset_macro(hash_type, hash_ele_type, hash_key_type, hash_key_code, hash_key_equal); \
exists_hashset_macro(hash_type, hash_ele_type, hash_key_type, hash_key_code, hash_key_equal); \
add_hashset_macro(hash_type, hash_ele_type, hash_code_macro, hash_equal_macro); \
put_hashset_macro(hash_type, hash_ele_type); \
remove_hashset_macro(hash_type, hash_ele_type, hash_key_type, hash_key_code, hash_key_equal); \
ref_iter_hashset_macro(hash_type, hash_ele_type); \
reset_iter_hashset_macro(hash_type); \
count_hashset_macro(hash_type); \
clear_hashset_macro(hash_type); \
freeze_hashset_macro(hash_type, hash_ele_type, hash_code_macro); \
free_hashset_macro(hash_type); \
encap_hashset_macro(hash_type, hash_ele_type, hash_code_macro);
/*
 * Set flavour of define_hashtable: the element is its own key, so the element
 * hash/equality macros are reused for keys, the value type is a pointer to
 * the element type, and ITSELF is the accessor.
 */
#define define_hashset(hash_type, hash_ele_type, hash_code_macro, hash_equal_macro) define_hashtable(hash_type, hash_ele_type, hash_code_macro, hash_equal_macro, hash_ele_type, hash_code_macro, hash_equal_macro, hash_ele_type*, ITSELF)
/* ------------------ Useful functions ------------------------------------- */
/*
 * 32-bit integer mixer: four rounds, each an "add left-shifted copy" followed
 * by an "xor right-shifted copy". All arithmetic wraps modulo 2^32.
 */
static inline uint32_t __lh3_Jenkins_hash_int(uint32_t key){
	const unsigned char add_shift[4] = {12, 4, 10, 7};
	const unsigned char xor_shift[4] = {22, 9, 2, 12};
	uint32_t h = key;
	int round;
	for(round = 0; round < 4; round++){
		h = h + (h << add_shift[round]);
		h = h ^ (h >> xor_shift[round]);
	}
	return h;
}
/*
 * 64-bit integer mixer: alternating additive and xor-shift steps.
 * All arithmetic wraps modulo 2^64.
 */
static inline uint64_t __lh3_Jenkins_hash_64(uint64_t key){
	uint64_t h = key;
	h = h + ~(h << 32);
	h = h ^ (h >> 22);
	h = h + ~(h << 13);
	h = h ^ (h >> 8);
	h = h + (h << 3);
	h = h ^ (h >> 15);
	h = h + ~(h << 27);
	h = h ^ (h >> 31);
	return h;
}
/*
 * Jenkins one-at-a-time hash over `len` bytes starting at `key`.
 *
 * The byte index is a size_t so that it can reach every value of `len`; a
 * 32-bit index would wrap and never terminate for len > UINT32_MAX.
 *
 * NOTE: bytes are read through plain `char`, so values >= 0x80 are
 * sign-extended on platforms where char is signed. This is kept as-is so
 * that existing hash values do not change.
 */
static inline uint32_t jenkins_one_at_a_time_hash(char *key, size_t len){
	uint32_t hash = 0;
	size_t i;
	for(i = 0; i < len; ++i){
		hash += key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	/* Final avalanche. */
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return hash;
}
/*
 * Mixes x with the same step sequence as hash64shift(), but masks the result
 * to the low p bits after every additive step.
 * The mask is built with a right shift by (64 - p), so p must be in 1..64;
 * p == 0 would shift by the full width, which is undefined in C.
 */
static inline u8i invertible_hashcode(u8i x, int p){
	u8i mask;
	u8i v;
	mask = 0xFFFFFFFFFFFFFFFFLLU >> (64 - p);
	v = x;
	v = ((~v) + (v << 21)) & mask;
	v ^= v >> 24;
	v = (v + (v << 3) + (v << 8)) & mask;
	v ^= v >> 14;
	v = (v + (v << 2) + (v << 4)) & mask;
	v ^= v >> 28;
	v = (v + (v << 31)) & mask;
	return v;
}
/*
 * 64-bit shift/multiply mixer. Every step wraps modulo 2^64; the
 * multiplications are the closed forms of the shift-and-add sums.
 */
static inline uint64_t hash64shift(uint64_t key){
	uint64_t h = key;
	h = (h << 21) - h - 1;   /* same as (~h) + (h << 21) */
	h ^= h >> 24;
	h *= 265;                /* h + (h << 3) + (h << 8) */
	h ^= h >> 14;
	h *= 21;                 /* h + (h << 2) + (h << 4) */
	h ^= h >> 28;
	h += h << 31;
	return h;
}
/*
 * MurmurHash2, 64-bit variant "A".
 *
 * key   start of the data to hash (any alignment)
 * len   number of bytes; expected to be >= 0
 * seed  hash seed
 *
 * Full 8-byte words are loaded with memcpy() instead of dereferencing a
 * casted uint64_t pointer, so the function is safe for unaligned buffers
 * and does not violate strict aliasing. Words are still read in native
 * byte order, so results match the pointer-cast version.
 */
static inline uint64_t MurmurHash64A(const void * key, int len, uint32_t seed){
	const uint64_t m = 0xc6a4a7935bd1e995LLU;
	const int r = 47;
	uint64_t h = seed ^ (len * m);
	const unsigned char *data = (const unsigned char *)key;
	const unsigned char *end = data + (len / 8) * 8;
	while(data != end){
		uint64_t k;
		memcpy(&k, data, sizeof(k));
		data += sizeof(k);
		k *= m;
		k ^= k >> r;
		k *= m;
		h ^= k;
		h *= m;
	}
	/* Fold in the trailing 1..7 bytes; each case deliberately falls into the next. */
	switch(len & 7){
	case 7: h ^= ((uint64_t)data[6]) << 48; /* fallthrough */
	case 6: h ^= ((uint64_t)data[5]) << 40; /* fallthrough */
	case 5: h ^= ((uint64_t)data[4]) << 32; /* fallthrough */
	case 4: h ^= ((uint64_t)data[3]) << 24; /* fallthrough */
	case 3: h ^= ((uint64_t)data[2]) << 16; /* fallthrough */
	case 2: h ^= ((uint64_t)data[1]) << 8; /* fallthrough */
	case 1: h ^= ((uint64_t)data[0]);
		h *= m;
		break;
	default:
		break;
	}
	h ^= h >> r;
	h *= m;
	h ^= h >> r;
	return h;
}
/* Integer hash aliases: 32-bit and 64-bit keys map to the two Jenkins mixers. */
#define u32hashcode(key) __lh3_Jenkins_hash_int(key)
#define u64hashcode(key) __lh3_Jenkins_hash_64(key)
/*
 * Hash of a NUL-terminated string: starts from the first character and folds
 * in each following one as h = h * 31 + c (modulo 2^32).
 * The empty string hashes to 0.
 */
static inline uint32_t __string_hashcode(const char *s){
	uint32_t code = *s;
	if(code == 0){
		return code;
	}
	s++;
	while(*s != '\0'){
		code = 31u * code + *s;
		s++;
	}
	return code;
}
/*
 * Predefined hash sets (element == key):
 *   u32hash  uint32_t elements
 *   u64hash  uint64_t elements
 *   i32hash  int elements, hashed through a cast to uint32_t
 *   chash    char* elements, hashed and compared as NUL-terminated strings
 */
#define u32hash_code(e) u32hashcode(e)
#define u64hash_code(e) u64hashcode(e)
#define uxxhash_equals(e1, e2) ((e1) == (e2))
define_hashset(u32hash, uint32_t, u32hash_code, uxxhash_equals);
define_hashset(u64hash, uint64_t, u64hash_code, uxxhash_equals);
#define i32hash_code(e) u32hashcode((uint32_t)(e))
#define i32hash_equals(e1, e2) ((e1) == (e2))
define_hashset(i32hash, int, i32hash_code, i32hash_equals);
#define chash_code(e) __string_hashcode(e)
#define chash_equals(e1, e2) (strcmp(e1, e2) == 0)
define_hashset(chash, char*, chash_code, chash_equals);
/*
 * Element-to-value accessor for the key/value tables: yields e->val, or
 * MAX_U8 cast to the value type when e is NULL.
 * The whole replacement list and every use of `e` are parenthesized so the
 * macro stays a single expression wherever it is expanded.
 */
#define KV_HASH_GET_VAL(e) ((e)? (e)->val : ((typeof((e)->val))MAX_U8))
/* uuhash: u4i key -> u4i val. Elements are hashed and compared by key only. */
typedef struct { u4i key, val; } uuhash_t;
#define uuhash_code(e) u32hashcode((e).key)
#define uuhash_equals(e1, e2) ((e1).key == (e2).key)
#define uuhash_key_equals(e1, e2) ((e1) == (e2).key)
define_hashtable(uuhash, uuhash_t, uuhash_code, uuhash_equals, u4i, u32hashcode, uuhash_key_equals, u4i, KV_HASH_GET_VAL);
/*
 * uihash: u4i key -> int val. Elements are hashed and compared by key only.
 * The comparison macros are fully parenthesized so they remain one
 * expression when expanded next to other operators (for example after `!`).
 */
typedef struct { u4i key; int val; } uihash_t;
#define uihashcode(E) u32hashcode((E).key)
#define uihashequals(E1, E2) ((E1).key == (E2).key)
#define uihashkeyequals(E1, E2) ((E1) == (E2).key)
define_hashtable(uihash, uihash_t, uihashcode, uihashequals, u4i, u32hashcode, uihashkeyequals, b4i, KV_HASH_GET_VAL);
/*
 * UUhash: u8i key -> u8i val. Elements are hashed and compared by key only.
 * The comparison macros are fully parenthesized so they remain one
 * expression when expanded next to other operators (for example after `!`).
 */
typedef struct { u8i key, val; } UUhash_t;
#define UUhashcode(E) u64hashcode((E).key)
#define UUhashequals(E1, E2) ((E1).key == (E2).key)
#define UUhashkeyequals(E1, E2) ((E1) == (E2).key)
define_hashtable(UUhash, UUhash_t, UUhashcode, UUhashequals, u8i, u64hashcode, UUhashkeyequals, u8i, KV_HASH_GET_VAL);
/* cuhash: string key -> u4i val. Keys are hashed with __string_hashcode and compared with strcmp. */
typedef struct { char *key; u4i val; } cuhash_t;
#define cuhash_code(e) __string_hashcode((e).key)
#define cuhash_equals(e1, e2) (strcmp((e1).key, (e2).key) == 0)
#define cuhash_key_equals(e1, e2) (strcmp((char*)(e1), (e2).key) == 0)
define_hashtable(cuhash, cuhash_t, cuhash_code, cuhash_equals, char*, __string_hashcode, cuhash_key_equals, u4i, KV_HASH_GET_VAL);
/* Object descriptor for one cuhash_t element: one child, the `key` string, described by OBJ_DESC_CHAR_ARRAY. */
static const obj_desc_t cuhash_struct_deep_obj_desc = {"cuhash_struct_deep_obj_desc", sizeof(cuhash_t), 1, {1}, {offsetof(cuhash_t, key)}, {(obj_desc_t*)&OBJ_DESC_CHAR_ARRAY}, NULL, NULL};
/* Object descriptor for a whole cuhash: three children (array, ones, dels); array elements use the descriptor above. */
/* NOTE(review): the third child descriptor is passed without the (obj_desc_t*) cast the first two use - confirm this is intended. */
static const obj_desc_t cuhash_deep_obj_desc = {"cuhash_deep_obj_desc", sizeof(cuhash), 3, {1, 1, 1}, {offsetof(cuhash, array), offsetof(cuhash, ones), offsetof(cuhash, dels)}, {(obj_desc_t*)&cuhash_struct_deep_obj_desc, (obj_desc_t*)&bitvec_obj_desc, &bitvec_obj_desc}, cuhash_obj_desc_cnt, NULL};
/* cihash: string key -> int val. Keys are hashed with __string_hashcode and compared with strcmp. */
typedef struct { char *key; int val; } cihash_t;
#define cihash_code(e) __string_hashcode((e).key)
#define cihash_equals(e1, e2) (strcmp((e1).key, (e2).key) == 0)
#define cihash_key_equals(e1, e2) (strcmp((char*)(e1), (e2).key) == 0)
define_hashtable(cihash, cihash_t, cihash_code, cihash_equals, char*, __string_hashcode, cihash_key_equals, b4i, KV_HASH_GET_VAL);
/* clhash: string key -> unsigned long long val. Keys are hashed with __string_hashcode and compared with strcmp. */
typedef struct { char *key; unsigned long long val; } clhash_t;
#define clhash_code(e) __string_hashcode((e).key)
#define clhash_equals(e1, e2) (strcmp((e1).key, (e2).key) == 0)
#define clhash_key_equals(e1, e2) (strcmp((char*)(e1), (e2).key) == 0)
define_hashtable(clhash, clhash_t, clhash_code, clhash_equals, char*, __string_hashcode, clhash_key_equals, u8i, KV_HASH_GET_VAL);
/* cchash: string key -> string val. Keys are hashed with __string_hashcode and compared with strcmp. */
typedef struct { char *key; char *val; } cchash_t;
#define cchash_code(e) __string_hashcode((e).key)
#define cchash_equals(e1, e2) (strcmp((e1).key, (e2).key) == 0)
#define cchash_key_equals(e1, e2) (strcmp((char*)(e1), (e2).key) == 0)
/* Accessor for cchash: yields e->val, or NULL when e is NULL. */
#define KV_CCHASH_GET_VAL(e) ((e)? (e)->val : NULL)
define_hashtable(cchash, cchash_t, cchash_code, cchash_equals, char*, __string_hashcode, cchash_key_equals, char*, KV_CCHASH_GET_VAL);
/**
* Example of using userdata in thread-safe mode
* char **strs;
* ... code that initializes strs
* #define test_hc(E) __string_hashcode(((char**)set->userdata)[E])
* #define test_he(E1, E2) (strcmp(((char**)set->userdata)[E1], ((char**)set->userdata)[E2]) == 0)
* define_hashset(testhash, uint32_t, test_hc, test_he);
* testhash *hash = init_testhash(13);
* set_userdata_testhash(hash, strs);
* ... now, the key of testhash is uint32_t, but refer to strs
*/
#endif
wtdbg2-2.5/kbm.c 0000664 0000000 0000000 00000063020 13536643722 0013502 0 ustar 00root root 0000000 0000000 /*
*
* Copyright (c) 2011, Jue Ruan
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "kbm.h"
#include "kbmpoa.h"
#include
/* Fallback version/release identifiers, used only when the build does not define them. */
#ifndef VERSION
#define VERSION 0.0
#endif
#ifndef RELEASE
#define RELEASE 19830203
#endif
/*
 * Prints the kbm command-line help to stdout.
 * The -T line is emitted only when TEST_MODE is defined and the -v line only
 * when __DEBUG__ is non-zero.
 * Always returns 1.
 */
int kbm_usage(void){
	fprintf(stdout, "Program: kbm is a simple instance which implemented kmer-binmap\n");
	fprintf(stdout, " it maps query sequence against reference by kmer matching\n");
	fprintf(stdout, " matched kmer-pairs are binned (256bp) and counted in a matrix\n");
	fprintf(stdout, " dynamic programming is used to search the best path\n");
	fprintf(stdout, "Version: %s (%s)\n", TOSTR(VERSION), TOSTR(RELEASE));
	fprintf(stdout, "Author: Jue Ruan \n");
	fprintf(stdout, "Usage: kbm [start|list|stop]\n");
	fprintf(stdout, "Options:\n");
	fprintf(stdout, " -i File(s) of query sequences, +, [STDIN]\n");
	fprintf(stdout, " -d File(s) of reference sequences, +, [<-i>]\n");
	fprintf(stdout, " -L Choose the longest subread and drop reads shorter than (5000 recommended for PacBio) [0]\n");
	fprintf(stdout, " Negative integer indicate keeping read names, e.g. -5000.\n");
	fprintf(stdout, " -o Output file, [STDOUT]\n");
	fprintf(stdout, " -I Interactive mode\n");
	fprintf(stdout, " e.g. `mkfifo pipe` then `while true; do cat pipe && sleep 1; done | kbm -t 8 -I -d ref.fa -i - -Hk 21 -S 4`\n");
	fprintf(stdout, " then `cat 1.fq >pipe; cat 2.fq >pipe`, fastq format is better in interaction\n");
	fprintf(stdout, " -f Force overwrite\n");
	fprintf(stdout, " -t Number of threads, 0: all cores, [1]\n");
	fprintf(stdout, " -k Kmer-f size, <= %d, [0]\n", KBM_MAX_KSIZE);
	fprintf(stdout, " -p Kmer-p size, <= %d, [21]\n", KBM_MAX_KSIZE);
	fprintf(stdout, " -K Filter high frequency kmers, maybe repetitive, [1000]\n");
	fprintf(stdout, " if K >= 1, take the integer value as cutoff, MUST <= 65535\n");
	fprintf(stdout, " else, mask the top fraction part high frequency kmers\n");
	fprintf(stdout, " -E Min kmer frequency, [1]\n");
	fprintf(stdout, " -O Filter low complexity bins (#indexed_kmer less than <-O>), [2]\n");
	fprintf(stdout, " -S Subsampling kmers, 1/(<-S>) kmers are indexed, [4.00]\n");
	fprintf(stdout, " -S is very useful in saving memory and speeding up\n");
	fprintf(stdout, " please note that subsampling kmers will have less matched length\n");
	fprintf(stdout, " -B Select no more than n seeds in a query bin, [256]\n");
	// Obsolete
	//fprintf(stdout, " -G Recognize error kmers in a bin when be aligned >= <-G> times, [0]\n");
	fprintf(stdout, " If you are using shared kbmidx by other process using -D too, it will bring wrong behavior\n");
	fprintf(stdout, " -D Strand of alignment, 1: forward, 2: reverse, 3: both, [3]\n");
	fprintf(stdout, " -X Max number of bin(256bp) in one gap, [4]\n");
	fprintf(stdout, " -Y Max number of bin(256bp) in one deviation, [4]\n");
	fprintf(stdout, " -Z Max fraction of gapped BINs / aligned BINs, [0.6]\n");
	fprintf(stdout, " -x penalty for BIN gap, [-7]\n");
	fprintf(stdout, " -y penalty for BIN deviation, [-21]\n");
	fprintf(stdout, " -z Enable refine alignment with -p <-z> [0]\n");
	fprintf(stdout, " -l Min alignment length, [2048]\n");
	fprintf(stdout, " -m Min matched length, [200]\n");
	fprintf(stdout, " -s Min similarity, calculated by kmer matched length / aligned length, [0.05]\n");
	fprintf(stdout, " -r Max length variation of two aligned fragments, [0.25]\n");
	fprintf(stdout, " -c Insist to query contained reads against all\n");
	fprintf(stdout, " -C Chaining alignments\n");
	fprintf(stdout, " -n Max hits per query, [1000]\n");
#ifdef TEST_MODE
	fprintf(stdout, " -T For debug, [0]\n");
#endif
	fprintf(stdout, " -W Dump kbm index to file, [NULL]\n");
	fprintf(stdout, " -R Load kbm index from file, [NULL]\n");
	fprintf(stdout, " -q Quiet\n");
	fprintf(stdout, " -V Print version information and then exit\n");
#if __DEBUG__
	fprintf(stdout, " -v Verbose, +\n");
#endif
	fprintf(stdout, "Server start: {kbm -R start}, will mmap wt.fa.kbmidx into memory\n");
	fprintf(stdout, "Server list: {kbm -R list [10]}, will list the object tree in file\n");
	fprintf(stdout, "Server stop: {kbm -R stop}, will remove the mmap object\n");
	return 1;
}
/*
 * Per-thread state of the "maln" (mapping/alignment) worker, declared through
 * the project's thread_beg_def/thread_end_def macros.
 * Field notes reflect how the worker function directly below uses them;
 * fields it does not visibly use are marked for review.
 */
thread_beg_def(maln);
CTGCNS *cc; /* passed to map_kbmpoa() in correction mode */
KBMAux *aux; /* query/alignment workspace; its `hits` and `cigars` are read and cleared by the worker */
String *rdtag; /* read name; treated as absent (NULL) when its size is 0 */
BaseBank *rdseqs; /* sequence store holding the query read */
u4i qidx; /* query index forwarded to the mapping calls */
u8i rdoff; /* offset of the read, forwarded together with rdseqs */
u4i rdlen; /* read length; 0 makes the worker leave its loop */
int corr_mode; /* non-zero: run map_kbmpoa(); zero: run query_index_kbm() + map_kbm() */
float corr_cov; /* forwarded to map_kbmpoa() */
u4i corr_min, corr_max; /* forwarded to map_kbmpoa() */
FILE *out, *lay; /* `out` receives hit logging; `lay` is forwarded to map_kbmpoa() */
int chainning; /* NOTE(review): presumably set from the -C option - confirm against the caller */
int interactive; /* NOTE(review): presumably set from the -I option - confirm against the caller */
int refine; /* non-zero enables the refine pass; also used as psize of the refine parameters */
thread_end_def(maln);
thread_beg_func(maln);
KBMPar *rpar;
KBM *rkbm;
KBMAux *raux;
kbm_map_t HIT;
u4v *tidxs;
{
rpar = init_kbmpar();
rpar->ksize = 0;
rpar->psize = maln->refine;
rpar->min_bin_degree = 0;
rpar->kmin = 1;
rpar->kmax = 1000;
rpar->kmer_mod = KBM_N_HASH;
rkbm = init_kbm(rpar);
raux = init_kbmaux(rkbm);
}
tidxs = init_u4v(16);
thread_beg_loop(maln);
if(maln->rdlen == 0) break;
if(maln->corr_mode){
if(map_kbmpoa(maln->cc, maln->aux, maln->rdtag->size? maln->rdtag->string : NULL, maln->qidx, maln->rdseqs, maln->rdoff, maln->rdlen, maln->corr_min, maln->corr_max, maln->corr_cov, maln->lay) == 0){
clear_kbmmapv(maln->aux->hits);
break;
}
} else {
query_index_kbm(maln->aux, maln->rdtag->size? maln->rdtag->string : NULL, maln->qidx, maln->rdseqs, maln->rdoff, maln->rdlen);
map_kbm(maln->aux);
if(maln->refine && maln->aux->hits->size){
kbm_read_t *rd;
kbm_map_t *hit;
u4i i, j, tidx;
clear_kbm(rkbm);
bitpush_kbm(rkbm, maln->rdtag->size? maln->rdtag->string : NULL, maln->rdtag->size, maln->rdseqs->bits, 0, maln->rdlen);
ready_kbm(rkbm);
simple_index_kbm(rkbm, 0, rkbm->bins->size);
clear_u4v(tidxs);
for(i=0;iaux->hits->size;i++){
hit = ref_kbmmapv(maln->aux->hits, i);
if(tidxs->size == 0 || hit->tidx != tidxs->buffer[tidxs->size - 1]){
push_u4v(tidxs, hit->tidx);
}
if(KBM_LOG){
fprintf(maln->out, "#");
fprint_hit_kbm(maln->aux, i, maln->out);
}
}
clear_kbmmapv(maln->aux->hits);
clear_bitsvec(maln->aux->cigars);
for(i=0;i