pax_global_header00006660000000000000000000000064147601214720014516gustar00rootroot0000000000000052 comment=1973f1e6a44fc8d8e90a016fb3ef53eee6d6c5c2 vsearch-2.30.0/000077500000000000000000000000001476012147200132335ustar00rootroot00000000000000vsearch-2.30.0/.dockerignore000066400000000000000000000000341476012147200157040ustar00rootroot00000000000000.git .gitignore .travis.yml vsearch-2.30.0/.github/000077500000000000000000000000001476012147200145735ustar00rootroot00000000000000vsearch-2.30.0/.github/workflows/000077500000000000000000000000001476012147200166305ustar00rootroot00000000000000vsearch-2.30.0/.github/workflows/jekyll-gh-pages.yml000066400000000000000000000027741476012147200223500ustar00rootroot00000000000000# Sample workflow for building and deploying a Jekyll site to GitHub Pages name: Deploy Jekyll with GitHub Pages dependencies preinstalled on: # Runs on pushes targeting the dev branch push: branches: ["dev"] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages permissions: contents: read pages: write id-token: write # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
concurrency: group: "pages" cancel-in-progress: false jobs: # Build job build: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 - name: generate markdown files run: | cd man bash ./scripts/generate_online_documentation.sh - name: Setup Pages uses: actions/configure-pages@v5 - name: Build with Jekyll uses: actions/jekyll-build-pages@v1 with: source: ./ destination: ./_site - name: Upload artifact uses: actions/upload-pages-artifact@v3 # Deployment job deploy: environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} runs-on: ubuntu-latest needs: build steps: - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v4 vsearch-2.30.0/.gitignore000066400000000000000000000004251476012147200152240ustar00rootroot00000000000000*.a *.clang *.json *.o *.pdf *.sh *.texinfo *~ .cache/ .deps .dirstamp /autom4te.cache /bin /config.h /config.log /config.status /config.guess /config.sub /stamp-h1 Makefile aclocal.m4 config.h.in configure compile depcomp install-sh missing Makefile.in .vscode .DS_Store .Tpo vsearch-2.30.0/.travis.yml000066400000000000000000000004671476012147200153530ustar00rootroot00000000000000language: - cpp arch: - arm64 os: - linux addons: apt: packages: - ghostscript - groff compiler: - g++ - clang script: - ./autogen.sh - ./configure - make - export PATH=$PWD/bin:$PATH - git clone https://github.com/frederic-mahe/vsearch-tests.git - cd vsearch-tests - bash ./run_all_tests.sh vsearch-2.30.0/CITATION.cff000066400000000000000000000026171476012147200151330ustar00rootroot00000000000000cff-version: 1.2.0 message: "If you use this software, please cite it as below." 
authors: - family-names: "Rognes" given-names: "Torbjørn" orcid: "https://orcid.org/0000-0002-9329-9974" - family-names: "Flouri" given-names: "Tomas" orcid: "https://orcid.org/0000-0002-8474-9507" - family-names: "Nichols" given-names: "Benjamin" - family-names: "Quince" given-names: "Christopher" orcid: "https://orcid.org/0000-0003-1884-8440" - family-names: "Mahé" given-names: "Frédéric" orcid: "https://orcid.org/0000-0002-2808-0984" title: "VSEARCH: versatile open-source tool for microbiome analysis" version: 2.22.1 date-released: 2022-09-19 url: "https://github.com/torognes/vsearch" preferred-citation: type: article authors: - family-names: "Rognes" given-names: "Torbjørn" orcid: "https://orcid.org/0000-0002-9329-9974" - family-names: "Flouri" given-names: "Tomas" orcid: "https://orcid.org/0000-0002-8474-9507" - family-names: "Nichols" given-names: "Ben" - family-names: "Quince" given-names: "Christopher" orcid: "https://orcid.org/0000-0003-1884-8440" - family-names: "Mahé" given-names: "Frédéric" orcid: "https://orcid.org/0000-0002-2808-0984" doi: "10.7717/peerj.2584" journal: "Peer Journal" day: 18 month: 10 start: e2584 # First page number title: "VSEARCH: a versatile open source tool for metagenomic" volume: 4 year: 2016 vsearch-2.30.0/Dockerfile000066400000000000000000000006261476012147200152310ustar00rootroot00000000000000FROM alpine:latest WORKDIR /opt/vsearch COPY . . RUN apk add --no-cache \ libstdc++ zlib-dev bzip2-dev \ autoconf automake make g++ && \ ./autogen.sh && \ ./configure CFLAGS="-O2" CXXFLAGS="-O2" && \ make clean && \ make && \ make install && \ make clean && \ apk del autoconf automake make g++ && \ rm -rf /opt/vsearch ENTRYPOINT ["/usr/local/bin/vsearch"] vsearch-2.30.0/LICENSE.txt000066400000000000000000000046441476012147200150660ustar00rootroot00000000000000 VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vsearch-2.30.0/LICENSE_GNU_GPL3.txt000066400000000000000000001045131476012147200163600ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. 
Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. 
To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. 
If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. 
For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". 
c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . vsearch-2.30.0/Makefile.am000066400000000000000000000001051476012147200152630ustar00rootroot00000000000000AUTOMAKE_OPTIONS = foreign SUBDIRS = src man EXTRA_DIST = autogen.sh vsearch-2.30.0/README.md000066400000000000000000000573421476012147200145250ustar00rootroot00000000000000[![Build Status](https://app.travis-ci.com/torognes/vsearch.svg?branch=master)](https://app.travis-ci.com/torognes/vsearch) # VSEARCH ## Introduction The aim of this project is to create an alternative to the [USEARCH](https://www.drive5.com/usearch/) tool developed by Robert C. Edgar (2010). The new tool should: * have open source code with an appropriate open source license * be free of charge, gratis * have a 64-bit design that handles very large databases and much more than 4GB of memory * be as accurate or more accurate than usearch * be as fast or faster than usearch We have implemented a tool called VSEARCH which supports *de novo* and reference based chimera detection, clustering, full-length and prefix dereplication, rereplication, reverse complementation, masking, all-vs-all pairwise global alignment, exact and global alignment searching, shuffling, subsampling and sorting. 
It also supports FASTQ file analysis, filtering, conversion and merging of paired-end reads. VSEARCH stands for vectorized search, as the tool takes advantage of parallelism in the form of SIMD vectorization as well as multiple threads to perform accurate alignments at high speed. VSEARCH uses an optimal global aligner (full dynamic programming Needleman-Wunsch), in contrast to USEARCH which by default uses a heuristic seed and extend aligner. This usually results in more accurate alignments and overall improved sensitivity (recall) with VSEARCH, especially for alignments with gaps. [VSEARCH binaries](https://github.com/torognes/vsearch/releases/latest) are provided for GNU/Linux on five 64-bit processor architectures: x86_64, POWER8 (ppc64le), ARMv8 (aarch64), little-endian 64-bit RISC-V (riscv64), and little-endian 64-bit MIPS (mips64el). Binaries are also provided for macOS (version 10.9 Mavericks or later) on Intel (x86_64) and Apple Silicon (ARMv8), as well as Windows (64-bit, version 7 or higher, on x86_64). VSEARCH contains native SIMD code for three processor architectures (SSE2/SSSE3, AltiVec/VMX/VSX, Neon). In addition, VSEARCH uses the SIMD Everywhere (SIMDe) library to enable building on riscv64, mips64el, and other little-endian architectures, but the performance may be lower than a native implementation. | CPU \ OS | GNU/Linux | macOS | Windows | | ------------- | :-----------: | :----: | :-------: | | x86_64 | ✔ | ✔ | ✔ | | ARMv8 | ✔ | ✔ | | | POWER8 | ✔ | | | | RISC-V 64 LE | ✔ | | | | MIPS 64 LE | not tested | | | Various packages, plugins and wrappers for VSEARCH are also available from other sources - see [below](https://github.com/torognes/vsearch#packages-plugins-and-wrappers). The source code compiles correctly with `gcc` (versions 4.8.5 to 14.0) and `llvm-clang` (3.8 to 19.0). The source code should also compile on [FreeBSD](https://www.freebsd.org/) and [NetBSD](https://www.netbsd.org/) systems. 
VSEARCH can directly read input query and database files that are compressed using gzip (.gz) and bzip2 (.bz2) if the zlib and bzip2 libraries are available. Most of the nucleotide-based commands and options in USEARCH version 7 are supported, as well as some in version 8. The same option names as in USEARCH version 7 have been used in order to make VSEARCH an almost drop-in replacement. VSEARCH does not support amino acid sequences or local alignments. These features may be added in the future. ## Getting Help If you can't find an answer in the [VSEARCH documentation](https://github.com/torognes/vsearch/releases/download/v2.30.0/vsearch_manual.pdf), please visit the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) to post a question or start a discussion. ## Example In the example below, VSEARCH will identify sequences in the file database.fsa that are at least 90% identical on the plus strand to the query sequences in the file queries.fsa and write the results to the file alnout.txt. `./vsearch --usearch_global queries.fsa --db database.fsa --id 0.9 --alnout alnout.txt` ## Download and install **Source distribution** To download the source distribution from a [release](https://github.com/torognes/vsearch/releases) and build the executable and the documentation, use the following commands: ``` wget https://github.com/torognes/vsearch/archive/v2.30.0.tar.gz tar xzf v2.30.0.tar.gz cd vsearch-2.30.0 ./autogen.sh ./configure CFLAGS="-O2" CXXFLAGS="-O2" make ARFLAGS="cr" sudo make install ``` You may customize the installation directory using the `--prefix=DIR` option to `configure`. If the compression libraries [zlib](https://www.zlib.net) and/or [bzip2](https://www.sourceware.org/bzip2/) are installed on the system, they will be detected automatically and support for compressed files will be included in vsearch (see section **Dependencies** below).
Support for compressed files may be disabled using the `--disable-zlib` and `--disable-bzip2` options to `configure`. A PDF version of the manual will be created from the `vsearch.1` manual file if `ps2pdf` is available, unless disabled using the `--disable-pdfman` option to `configure`. It is recommended to run configure with the options `CFLAGS="-O2"` and `CXXFLAGS="-O2"`. Other options may also be applied to `configure`, please run `configure -h` to see them all. GNU autoconf (version 2.63 or later), automake and the GCC C++ (`g++`) compiler are required to build vsearch. Version 3.82 or later of `make` may be required on Linux, while version 3.81 is sufficient on macOS. Warning: Compiling the `align_simd.cc` file on x86_64 systems using the GNU C++ compiler version 9 or later with the `-O3` optimization option may result in incorrect code that may cause bad alignments in some circumstances. This was due to the `-ftree-partial-pre` optimization enabled by `-O3`. A compiler pragma has been inserted in the code to specifically turn off this optimization for the affected code. Using `-O3` should now be safe. To build VSEARCH on Debian and similar Linux distributions (Ubuntu etc) you'll need the following packages: autoconf, automake, g++, ghostscript, groff, libbz2-dev, make, zlib1g-dev. Include libsimde-dev to build on riscv64 or mips64el. To build VSEARCH on Fedora and similar Linux distributions (RHEL, CentOS etc) you'll need the following packages: autoconf, automake, bzip2-devel, gcc-c++, ghostscript, groff-base, make, zlib-devel. Instead of downloading the source distribution as a compressed archive, you could clone the repo and build it as shown below. The options to `configure` as described above are still valid.
``` git clone https://github.com/torognes/vsearch.git cd vsearch ./autogen.sh ./configure CFLAGS="-O2" CXXFLAGS="-O2" make ARFLAGS="cr" sudo make install ``` **Binary distribution**: Starting with version 1.4.0, binary distribution files containing pre-compiled binaries as well as the documentation will be made available as part of each [release](https://github.com/torognes/vsearch/releases). The executables include support for input files compressed by zlib and bzip2 (with files usually ending in `.gz` or `.bz2`). Binary distributions are provided for x86-64 systems running GNU/Linux, macOS (version 10.9 or higher) or Windows (64-bit, version 7 or higher), 64-bit ARMv8 (aarch64) systems running GNU/Linux or macOS, as well as POWER8 (ppc64le), 64-bit little-endian RISC-V (riscv64), and 64-bit little-endian MIPS (mips64el) systems running GNU/Linux. A universal macOS binary is also provided. In addition, an x86_64 binary built for the discontinued RHEL 7 and CentOS 7 Linux distributions is provided. The other Linux binaries are built on Debian 11 (oldstable, Bullseye). Static binaries are available for all Linux architectures except x86_64, these can be used on systems that do not have all the necessary libraries installed. The Windows binary was built with cross compilation using [Mingw-w64](http://mingw-w64.org/). Download the appropriate executable for your system using the following commands if you are using a Linux or macOS system: ```sh wget https://github.com/torognes/vsearch/releases/download/v{VERSION}/vsearch-{VERSION}-{OS}-{ARCH}.tar.gz tar xzf vsearch-{VERSION}-{OS}-{ARCH}.tar.gz ``` Replace `{VERSION}` with the VSEARCH version number (e.g. `2.30.0`), `{OS}` with the target operating system (`linux` or `macos`), and `{ARCH}` with the architecture (`x86_64`, `aarch64`, `ppc64le`, `riscv64`, or `mips64el`). You could add `-static` after `{ARCH}` to get a statically compiled version for Linux (except x86_64).
The name of the binary for the RHEL 7 and CentOS 7 Linux distributions ends in `-ubi7`. Or, if you are using Windows, download and extract (unzip) the contents of this file: ``` https://github.com/torognes/vsearch/releases/download/v{VERSION}/vsearch-{VERSION}-win-x86_64.zip ``` **Linux and Mac**: You will now have the binary distribution in a folder called `vsearch-{VERSION}-{OS}-{ARCH}` in which you will find three subfolders `bin`, `man` and `doc`. We recommend making a copy or a symbolic link to the vsearch binary `bin/vsearch` in a folder included in your `$PATH`, and a copy or a symbolic link to the vsearch man page `man/vsearch.1` in a folder included in your `$MANPATH`. The PDF version of the manual is available in `doc/vsearch_manual.pdf`. **Windows**: You will now have the binary distribution in a folder called `vsearch-{VERSION}-win-x86_64`. The vsearch executable is called `vsearch.exe`. The manual in PDF format is called `vsearch_manual.pdf`. If you want to be able to call `vsearch.exe` from any command prompt window, you can put the VSEARCH executable in a folder (for instance `C:\Users\<yourname>\bin`), and add the new folder to the user `Path`: open the `Environment Variables` window by searching for it in the Start menu, `Edit` user variables, add `;C:\Users\<yourname>\bin` to the end of the `Path` variable, and save your changes. The Windows distribution also includes the `libbz2.dll` and `zlib1.dll` files required for reading compressed input files. These DLLs have been obtained for mingw-w64 from the MSYS2 platform. **Documentation:** The VSEARCH user's manual is available in the `man` folder in the form of a [man page](https://github.com/torognes/vsearch/blob/master/man/vsearch.1). A pdf version ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.30.0/vsearch_manual.pdf)) will be generated by `make`.
To install the manpage manually, copy the `vsearch.1` file or create a symbolic link to `vsearch.1` in a folder included in your `$MANPATH`. The manual in both formats is also available with the binary distribution. The manual in PDF form ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.30.0/vsearch_manual.pdf)) is also attached to the latest [release](https://github.com/torognes/vsearch/releases). ## Packages, plugins, and wrappers **Conda package** Thanks to the [BioConda](https://bioconda.github.io/) team, there is now a [vsearch package](https://anaconda.org/bioconda/vsearch) in [Conda](https://conda.io/). **Debian package** Thanks to the [Debian Med](https://www.debian.org/devel/debian-med/) team, there is now a [vsearch](https://packages.debian.org/sid/vsearch) package in [Debian](https://www.debian.org/). **FreeBSD ports package** Thanks to [Jason Bacon](https://github.com/outpaddling), a [vsearch](https://www.freebsd.org/cgi/ports.cgi?query=vsearch&stype=all) [FreeBSD ports](https://www.freebsd.org/ports/) package is available. Install the binary package with `pkg install vsearch`, or build from source with additional optimizations. **Galaxy wrapper** Thanks to the work of the [Intergalactic Utilities Commission](https://wiki.galaxyproject.org/IUC) members, VSEARCH is now part of the [Galaxy ToolShed](https://toolshed.g2.bx.psu.edu/view/iuc/vsearch/). **Homebrew package** Thanks to [Torsten Seemann](https://github.com/tseemann), a [vsearch package](https://formulae.brew.sh/formula/vsearch) for [Homebrew](http://brew.sh/) has been made. **Pkgsrc package** Thanks to [Jason Bacon](https://github.com/outpaddling), a vsearch [pkgsrc](https://www.pkgsrc.org) package is available for NetBSD and other UNIX-like systems. Install the binary package with `pkgin install vsearch`, or build from source with additional optimizations.
**QIIME 2 plugin** Thanks to the [QIIME 2](https://github.com/qiime2) team, there is now a plugin called [q2-vsearch](https://github.com/qiime2/q2-vsearch) for [QIIME 2](https://qiime2.org). ## Converting output to a biom file for use in QIIME and other software With the `from-uc` command in [biom](http://biom-format.org/) 2.1.5 or later, it is possible to convert data in a `.uc` file produced by vsearch into a biom file that can be read by QIIME and other software. It is described [here](https://gist.github.com/gregcaporaso/f3c042e5eb806349fa18). Please note that VSEARCH version 2.2.0 and later are able to directly output OTU tables in biom 1.0 format as well as the classic and mothur formats. ## Implementation details and initial assessment Please see the paper for details: Rognes T, Flouri T, Nichols B, Quince C, Mahé F. (2016) VSEARCH: a versatile open source tool for metagenomics. PeerJ 4:e2584 doi: [10.7717/peerj.2584](https://doi.org/10.7717/peerj.2584) ## Dependencies Compiling VSEARCH requires either GCC (`g++`) or `clang`, `make` and the autotools (`ui-auto` on Debian-based distributions). Optionally, the header files for the following two optional libraries are required if support for gzip and bzip2 compressed FASTA and FASTQ input files is needed: * libz (zlib library) (`zlib.h` header file, available as `zlib1g-dev` on Debian-based distributions) (optional) * libbz2 (bzip2lib library) (`bzlib.h` header file, available as `libbz2-dev` on Debian-based distributions) (optional) VSEARCH will automatically check whether these libraries are available and load them dynamically. On Windows these libraries are called `zlib1.dll` and `libbz2.dll`. These DLLs are included with the released distribution of vsearch 2.27.0 and later. To create the PDF file with the manual the ps2pdf tool is required. It is part of the `ghostscript` package.
## VSEARCH license and third party licenses The VSEARCH code is dual-licensed either under the GNU General Public License version 3 or under the BSD 2-clause license. Please see LICENSE.txt for details. VSEARCH includes code from several other projects. We thank the authors for making their source code available. VSEARCH includes code from Google's [CityHash project](https://github.com/google/cityhash) by Geoff Pike and Jyrki Alakuijala, providing some excellent hash functions available under a MIT license. VSEARCH includes code derived from Tatusov and Lipman's DUST program that is in the public domain. VSEARCH includes public domain code written by Alexander Peslyak for the MD5 message digest algorithm. VSEARCH includes public domain code written by Steve Reid and others for the SHA1 message digest algorithm. The VSEARCH distribution includes code from GNU Autoconf which normally is available under the GNU General Public License, but may be distributed with the special autoconf configure script exception. VSEARCH may include code from the [zlib](https://www.zlib.net) library copyright Jean-loup Gailly and Mark Adler, distributed under the [zlib license](https://www.zlib.net/zlib_license.html). VSEARCH may include code from the [bzip2](https://www.sourceware.org/bzip2/) library copyright Julian R. Seward, distributed under a BSD-style license. ## Code The code is written mostly in C++. File | Description ---|--- **align_simd.cc** | SIMD parallel global alignment of 1 query with 8 database sequences **allpairs.cc** | All-vs-all optimal global pairwise alignment (no heuristics) **arch.cc** | Architecture specific code (Mac/Linux) **attributes.cc** | Extraction and printing of attributes in FASTA headers **bitmap.cc** | Implementation of bitmaps **chimera.cc** | Chimera detection **city.cc** | CityHash code **cluster.cc** | Clustering (cluster\_fast and cluster\_smallmem) **cpu.cc** | Code dependent on specific cpu features (e.g. 
ssse3) **cut.cc** | Restriction site cutting **db.cc** | Handles the database file read, access etc **dbhash.cc** | Database hashing for exact searches **dbindex.cc** | Indexes the database by identifying unique kmers in the sequences **derep.cc** | Dereplication, full-length **derep_prefix.cc** | Dereplication, prefix **derep_smallmem.cc** | Dereplication, small memory usage **dynlibs.cc** | Dynamic loading of compression libraries **eestats.cc** | Produce statistics for fastq_eestats command **fasta.cc** | FASTA file parser **fasta2fastq.cc** | FASTA to FASTQ conversion **fastq.cc** | FASTQ file parser **fastq_chars.cc** | FASTQ statistics **fastq_join.cc** | FASTQ paired-end reads joining **fastqops.cc** | FASTQ file statistics etc **fastx.cc** | Detection of FASTA and FASTQ files, wrapper for FASTA and FASTQ parsers **filter.cc** | Trimming and filtering of sequences in FASTA and FASTQ files **getseq.cc** | Extraction of sequences based on header labels **kmerhash.cc** | Hash for kmers used by paired-end read merger **linmemalign.cc** | Linear memory global sequence aligner **maps.cc** | Various character mapping arrays **mask.cc** | Masking (DUST) **md5.c** | MD5 message digest **mergepairs.cc** | Paired-end read merging **minheap.cc** | A minheap implementation for the list of top kmer matches **msa.cc** | Simple multiple sequence alignment and consensus sequence computation for clusters **orient.cc** | Orient direction of sequences based on reference database **otutable.cc** | Generate OTU tables in various formats **rereplicate.cc** | Rereplication **results.cc** | Output results in various formats (alnout, userout, blast6, uc) **search.cc** | Implements search using global alignment **search_exact.cc** | Exact search functions **searchcore.cc** | Core search functions for searching, clustering and chimera detection **sff_convert.cc** | SFF to FASTQ file conversion **sha1.c** | SHA1 message digest **showalign.cc** | Output an alignment in a human-readable 
way given a CIGAR-string and the sequences **shuffle.cc** | Shuffle sequences **sintax.cc** | Taxonomic classification using Sintax method **sortbylength.cc** | Code for sorting by length **sortbysize.cc** | Code for sorting by size (abundance) **subsample.cc** | Subsampling reads from a FASTA file **tax.cc** | Taxonomy information parsing **udb.cc** | UDB database file handling **unique.cc** | Find unique kmers in a sequence **userfields.cc** | Code for parsing the userfields option argument **util.cc** | Various common utility functions **vsearch.cc** | Main program file, general initialization, reads arguments and parses options, writes info. **utils/maps.cc** | Utilities, maps for encoding of nucleotides **utils/seqcmp.cc** | Utilities, sequence comparison VSEARCH may be compiled with zlib or bzip2 integration that allows it to read compressed FASTA files. The [zlib](http://www.zlib.net/) and the [bzip2](https://www.sourceware.org/bzip2/) libraries are needed for this. ## Bugs All bug reports are highly appreciated. You may submit a bug report here on GitHub as an [issue](https://github.com/torognes/vsearch/issues) (preferred), you could post a message on the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) or you could send an email to [torognes@ifi.uio.no](mailto:torognes@ifi.uio.no?subject=bug_in_vsearch). ## Limitations VSEARCH is designed for rather short sequences, and will be slow when sequences are longer than about 5,000 bp. This is because it always performs optimal global alignment on selected sequences. 
## The VSEARCH team The main contributors to VSEARCH: * Torbjørn Rognes (Coding, testing, documentation, evaluation) * Frédéric Mahé (Documentation, testing, feature suggestions) * Tomáš Flouri (Coding, testing) * Christopher Quince (Initiator, feature suggestions, evaluation) * Ben Nichols (Evaluation) ## Acknowledgements Special thanks to the following people for patches, suggestions, computer access etc: * Davide Albanese * Colin Brislawn * Michael R. Crusoe * Jeff Epler * Christopher M. Sullivan * Andreas Tille * Sarah Westcott ## Citing VSEARCH Please cite the following publication if you use VSEARCH: Rognes T, Flouri T, Nichols B, Quince C, Mahé F. (2016) VSEARCH: a versatile open source tool for metagenomics. PeerJ 4:e2584. doi: [10.7717/peerj.2584](https://doi.org/10.7717/peerj.2584) Please note that citing any of the underlying algorithms, e.g. UCHIME, may also be appropriate. ## Test datasets Test datasets (found in the separate vsearch-data repository) were obtained from the BioMarks project (Logares et al. 2014), the [TARA OCEANS project](https://oceans.taraexpeditions.org/en/) (Karsenti et al. 2011) and the [Protist Ribosomal Reference Database (PR2)](https://github.com/pr2database/pr2database) (Guillou et al. 2013). ## References * Edgar RC (2010) **Search and clustering orders of magnitude faster than BLAST.** *Bioinformatics*, 26 (19): 2460-2461. doi:[10.1093/bioinformatics/btq461](https://doi.org/10.1093/bioinformatics/btq461) * Edgar RC (2016) **SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences.** *bioRxiv*. doi:[10.1101/074161](https://doi.org/10.1101/074161) * Edgar RC (2016) **UNOISE2: improved error-correction for Illumina 16S and ITS amplicon sequencing.** *bioRxiv*. doi:[10.1101/081257](https://doi.org/10.1101/081257) * Edgar RC, Flyvbjerg H (2015) **Error filtering, pair assembly and error correction for next-generation sequencing reads.** *Bioinformatics*, 31 (21): 3476-3482. 
doi:[10.1093/bioinformatics/btv401](https://doi.org/10.1093/bioinformatics/btv401) * Edgar RC, Haas BJ, Clemente JC, Quince C, Knight R (2011) **UCHIME improves sensitivity and speed of chimera detection.** *Bioinformatics*, 27 (16): 2194-2200. doi:[10.1093/bioinformatics/btr381](https://doi.org/10.1093/bioinformatics/btr381) * Guillou L, Bachar D, Audic S, Bass D, Berney C, Bittner L, Boutte C, Burgaud G, de Vargas C, Decelle J, del Campo J, Dolan J, Dunthorn M, Edvardsen B, Holzmann M, Kooistra W, Lara E, Lebescot N, Logares R, Mahé F, Massana R, Montresor M, Morard R, Not F, Pawlowski J, Probert I, Sauvadet A-L, Siano R, Stoeck T, Vaulot D, Zimmermann P & Christen R (2013) **The Protist Ribosomal Reference database (PR2): a catalog of unicellular eukaryote Small Sub-Unit rRNA sequences with curated taxonomy.** *Nucleic Acids Research*, 41 (D1), D597-D604. doi:[10.1093/nar/gks1160](https://doi.org/10.1093/nar/gks1160) * Karsenti E, González Acinas S, Bork P, Bowler C, de Vargas C, Raes J, Sullivan M B, Arendt D, Benzoni F, Claverie J-M, Follows M, Jaillon O, Gorsky G, Hingamp P, Iudicone D, Kandels-Lewis S, Krzic U, Not F, Ogata H, Pesant S, Reynaud E G, Sardet C, Sieracki M E, Speich S, Velayoudon D, Weissenbach J, Wincker P & the Tara Oceans Consortium (2011) **A holistic approach to marine eco-systems biology.** *PLoS Biology*, 9(10), e1001177. doi:[10.1371/journal.pbio.1001177](https://doi.org/10.1371/journal.pbio.1001177) * Logares R, Audic S, Bass D, Bittner L, Boutte C, Christen R, Claverie J-M, Decelle J, Dolan J R, Dunthorn M, Edvardsen B, Gobet A, Kooistra W H C F, Mahé F, Not F, Ogata H, Pawlowski J, Pernice M C, Romac S, Shalchian-Tabrizi K, Simon N, Stoeck T, Santini S, Siano R, Wincker P, Zingone A, Richards T, de Vargas C & Massana R (2014) **The patterning of rare and abundant community assemblages in coastal marine-planktonic microbial eukaryotes.** *Current Biology*, 24(8), 813-821. 
doi:[10.1016/j.cub.2014.02.050](https://doi.org/10.1016/j.cub.2014.02.050) * Rognes T (2011) **Faster Smith-Waterman database searches by inter-sequence SIMD parallelisation.** *BMC Bioinformatics*, 12: 221. doi:[10.1186/1471-2105-12-221](https://doi.org/10.1186/1471-2105-12-221) vsearch-2.30.0/autogen.sh000077500000000000000000000000471476012147200152350ustar00rootroot00000000000000#!/bin/sh autoreconf --force --install vsearch-2.30.0/configure.ac000066400000000000000000000062601476012147200155250ustar00rootroot00000000000000# -*- Autoconf -*- # Process this file with autoconf to produce a configure script. AC_PREREQ([2.63]) AC_INIT([vsearch], [2.30.0], [torognes@ifi.uio.no], [vsearch], [https://github.com/torognes/vsearch]) AC_CANONICAL_TARGET AM_INIT_AUTOMAKE([subdir-objects]) AC_LANG([C++]) AC_CONFIG_SRCDIR([src/vsearch.cc]) AC_CONFIG_HEADERS([config.h]) AC_SUBST(MACOSX_DEPLOYMENT_TARGET) MACOSX_DEPLOYMENT_TARGET="10.9" # Checks for programs. AC_PROG_CXX AC_PROG_RANLIB AC_PROG_INSTALL # Checks for libraries. AC_CHECK_LIB([pthread], [pthread_create]) AC_CHECK_LIB([dl], [dlopen]) AC_CHECK_LIB([psapi], [GetProcessMemoryInfo]) # Checks for header files. AC_CHECK_HEADERS([getopt.h fcntl.h float.h regex.h ctype.h locale.h limits.h string.h sys/time.h dlfcn.h pthread.h]) # Checks for typedefs, structures, and compiler characteristics. AC_C_INLINE AC_TYPE_SIZE_T AC_TYPE_UINT32_T AC_TYPE_INT64_T AC_TYPE_UINT64_T AC_TYPE_UINT8_T # Checks for library functions. 
AC_CHECK_FUNCS([memmove memcpy posix_memalign gettimeofday localtime memchr memset pow regcomp strcasecmp strchr strcspn sysinfo]) have_bzip2=no AC_ARG_ENABLE(bzip2, AS_HELP_STRING([--disable-bzip2], [Disable bzip2 support])) AS_IF([test "x$enable_bzip2" != "xno"], [ have_bzip2=yes ]) if test "x${have_bzip2}" = "xyes"; then AC_CHECK_HEADERS([bzlib.h], [], [have_bzip2=no]) fi have_zlib=no AC_ARG_ENABLE(zlib, AS_HELP_STRING([--disable-zlib], [Disable zlib support])) AS_IF([test "x$enable_zlib" != "xno"], [ have_zlib=yes ]) if test "x${have_zlib}" = "xyes"; then AC_CHECK_HEADERS([zlib.h], [], [have_zlib=no]) fi have_ps2pdf=no AC_ARG_ENABLE(pdfman, AS_HELP_STRING([--disable-pdfman], [Disable PDF manual creation])) AS_IF([test "x$enable_pdfman" != "xno"], [ have_ps2pdf=yes AC_CHECK_PROG(HAVE_PS2PDF, ps2pdf, yes, no) if test "x$HAVE_PS2PDF" = "xno"; then AC_MSG_WARN([*** ps2pdf is required to build a PDF version of the manual]) have_ps2pdf=no fi ]) # Check for --enable-profiling option AC_ARG_ENABLE([profiling], [AS_HELP_STRING([--enable-profiling], [Enable profiling build])], [enable_profiling=$enableval], [enable_profiling=no]) # Define AM_CONDITIONAL for profiling AM_CONDITIONAL([ENABLE_PROFILING], [test "x$enable_profiling" = "xyes"]) # Check for --enable-debug option AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [Enable debug build])], [enable_debug=$enableval], [enable_debug=no]) # Define AM_CONDITIONAL for debug AM_CONDITIONAL([ENABLE_DEBUG], [test "x$enable_debug" = "xyes"]) have_man_html=no case $target in aarch64*) target_aarch64="yes" ;; powerpc64*) target_ppc="yes" ;; x86_64*) target_x86_64="yes" ;; esac AC_CHECK_HEADERS([windows.h], [AM_CONDITIONAL(TARGET_WIN, true)], [AM_CONDITIONAL(TARGET_WIN, false)]) AM_CONDITIONAL(HAVE_PS2PDF, test "x${have_ps2pdf}" = "xyes") AM_CONDITIONAL(HAVE_MAN_HTML, test "x${have_man_html}" = "xyes") AM_CONDITIONAL(TARGET_PPC, test "x${target_ppc}" = "xyes") AM_CONDITIONAL(TARGET_AARCH64, test "x${target_aarch64}" = 
"xyes") AM_CONDITIONAL(TARGET_X86_64, test "x${target_x86_64}" = "xyes") AM_PROG_CC_C_O AC_CONFIG_FILES([Makefile src/Makefile man/Makefile]) AC_OUTPUT vsearch-2.30.0/dockerfiles/000077500000000000000000000000001476012147200155255ustar00rootroot00000000000000vsearch-2.30.0/dockerfiles/Dockerfile.debian000066400000000000000000000005171476012147200207430ustar00rootroot00000000000000FROM debian:latest WORKDIR /opt/vsearch COPY . . RUN apt-get update RUN apt-get -y install \ autoconf \ automake \ g++ \ ghostscript \ groff \ libbz2-dev \ make \ zlib1g-dev RUN ./autogen.sh RUN ./configure CFLAGS="-O2" CXXFLAGS="-O2" RUN make clean RUN make ARFLAGS="cr" RUN make install ENTRYPOINT ["/usr/local/bin/vsearch"] vsearch-2.30.0/dockerfiles/Dockerfile.fedora000066400000000000000000000005301476012147200207540ustar00rootroot00000000000000FROM fedora:latest WORKDIR /opt/vsearch COPY . . RUN yum update -y RUN yum -y install \ autoconf \ automake \ bzip2-devel \ gcc-c++ \ ghostscript \ groff-base \ make \ zlib-devel RUN ./autogen.sh RUN ./configure CFLAGS="-O2" CXXFLAGS="-O2" RUN make clean RUN make ARFLAGS="cr" RUN make install ENTRYPOINT ["/usr/local/bin/vsearch"] vsearch-2.30.0/man/000077500000000000000000000000001476012147200140065ustar00rootroot00000000000000vsearch-2.30.0/man/Makefile.am000077500000000000000000000010671476012147200160510ustar00rootroot00000000000000# Makefile for creating PDF manual from man file dist_man_MANS = vsearch.1 doc_DATA = CLEANFILES = if HAVE_MAN_HTML doc_DATA += vsearch_manual.html vsearch_manual.html : vsearch.1 sed -e 's/\\-/-/g' $< | \ iconv -f UTF-8 -t ISO-8859-1 | \ groff -t -m mandoc -m www -Thtml > $@ CLEANFILES += vsearch_manual.html endif if HAVE_PS2PDF doc_DATA += vsearch_manual.pdf vsearch_manual.pdf : vsearch.1 sed -e 's/\\-/-/g' $< | \ iconv -f UTF-8 -t ISO-8859-1 | \ groff -W space -t -m mandoc -T ps -P -pa4 | ps2pdf - $@ CLEANFILES += vsearch_manual.pdf endif 
vsearch-2.30.0/man/vsearch.1000066400000000000000000006216441476012147200155400ustar00rootroot00000000000000.\" import www macros (URL, TAG, MTO) .mso www.tmac .\" ============================================================================ .TH vsearch 1 "February 27, 2025" "version 2.30.0" "USER COMMANDS" .\" ============================================================================ .SH NAME vsearch \(em a versatile open-source tool for microbiome analysis, including chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pairwise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. .\" ============================================================================ .SH SYNOPSIS .\" left justified, ragged right .ad l Chimera detection: .RS \fBvsearch\fR (\-\-uchime_denovo | \-\-uchime2_denovo | \-\-uchime3_denovo) \fIfastafile\fR (\-\-chimeras | \-\-nonchimeras | \-\-uchimealns | \-\-uchimeout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-uchime_ref \fIfastafile\fR (\-\-chimeras | \-\-nonchimeras | \-\-uchimealns | \-\-uchimeout) \fIoutputfile\fR \-\-db \fIfastafile\fR [\fIoptions\fR] .PP .RE Clustering: .RS \fBvsearch\fR (\-\-cluster_fast | \-\-cluster_size | \-\-cluster_smallmem | \-\-cluster_unoise) \fIfastafile\fR (\-\-alnout | \-\-biomout | \-\-blast6out | \-\-centroids | \-\-clusters | \-\-mothur_shared_out | \-\-msaout | \-\-otutabout | \-\-profile | \-\-samout | \-\-uc | \-\-userout) \fIoutputfile\fR \-\-id \fIreal\fR [\fIoptions\fR] .PP .RE Dereplication and rereplication: .RS \fBvsearch\fR \-\-fastx_uniques (\fIfastafile\fR | \fIfastqfile\fR) (\-\-fastaout | \-\-fastqout | \-\-tabbedout | \-\-uc) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR (\-\-derep_fulllength | \-\-derep_id | \-\-derep_prefix) \fIfastafile\fR (\-\-output | \-\-uc) 
\fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-derep_smallmem (\fIfastafile\fR | \fIfastqfile\fR) \-\-fastaout \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-rereplicate \fIfastafile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP .RE Extraction of sequences: .RS \fBvsearch\fR \-\-fastx_getseq \fIfastafile\fR (\-\-fastaout | \-\-fastqout | \-\-notmatched | \-\-notmatchedfq) \fIoutputfile\fR \-\-label \fIlabel\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastx_getseqs \fIfastafile\fR (\-\-fastaout | \-\-fastqout | \-\-notmatched | \-\-notmatchedfq) \fIoutputfile\fR (\-\-label \fIlabel\fR \ \-\-labels \fIlabelfile\fR | \-\-label_word \fIlabel\fR | \-\-label_words \fIlabelfile\fR) [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastx_getsubseq \fIfastafile\fR (\-\-fastaout | \-\-fastqout | \-\-notmatched | \-\-notmatchedfq) \fIoutputfile\fR \-\-label \fIlabel\fR [\-\-subseq_start \fIposition\fR] [\-\-subseq_end \fIposition\fR] [\fIoptions\fR] .PP .RE FASTA/FASTQ/SFF file processing: .RS \fBvsearch\fR \-\-fasta2fastq \fIfastqfile\fR \-\-fastqout \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_chars \fIfastqfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_convert \fIfastqfile\fR \-\-fastqout \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR (\-\-fastq_eestats | \-\-fastq_eestats2) \fIfastqfile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_filter \fIfastqfile\fR [\-\-reverse \fIfastqfile\fR] (\-\-fastaout | \-\-fastaout_discarded | \-\-fastqout | \-\-fastqout_discarded \-\-fastaout_rev | \-\-fastaout_discarded_rev | \-\-fastqout_rev | \-\-fastqout_discarded_rev) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_join \fIfastqfile\fR \-\-reverse \fIfastqfile\fR (\-\-fastaout | \-\-fastqout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_mergepairs \fIfastqfile\fR \-\-reverse \fIfastqfile\fR (\-\-fastaout | \-\-fastqout | \-\-fastaout_notmerged_fwd | \-\-fastaout_notmerged_rev | 
\-\-fastqout_notmerged_fwd | \-\-fastqout_notmerged_rev | \-\-eetabbedout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_stats \fIfastqfile\fR [\-\-log \fIlogfile\fR] [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastx_filter \fIinputfile\fR [\-\-reverse \fIinputfile\fR] (\-\-fastaout | \-\-fastaout_discarded | \-\-fastqout | \-\-fastqout_discarded \-\-fastaout_rev | \-\-fastaout_discarded_rev | \-\-fastqout_rev | \-\-fastqout_discarded_rev) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastx_revcomp \fIinputfile\fR (\-\-fastaout | \-\-fastqout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-sff_convert \fIsff-file\fR \-\-fastqout \fIoutputfile\fR [\fIoptions\fR] .PP .RE Masking: .RS \fBvsearch\fR \-\-fastx_mask \fIfastxfile\fR (\-\-fastaout | \-\-fastqout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-maskfasta \fIfastafile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP .RE Orienting: .RS \fBvsearch\fR \-\-orient \fIfastxfile\fR \-\-db \fIfastxfile\fR (\-\-fastaout | \-\-fastqout | \-\-notmatched | \-\-tabbedout) \fIoutputfile\fR [\fIoptions\fR] .PP .RE Pairwise alignment: .RS \fBvsearch\fR \-\-allpairs_global \fIfastafile\fR (\-\-alnout | \-\-blast6out | \-\-matched | \-\-notmatched | \-\-samout | \-\-uc | \-\-userout) \fIoutputfile\fR (\-\-acceptall | \-\-id \fIreal\fR) [\fIoptions\fR] .PP .RE Restriction site cutting: .RS \fBvsearch\fR \-\-cut \fIfastafile\fR \-\-cut_pattern \fIpattern\fR (\-\-fastaout | \-\-fastaout_rev | \-\-fastaout_discarded | \-\-fastaout_discarded_rev) \fIoutputfile\fR [\fIoptions\fR] .PP .RE Searching: .RS \fBvsearch\fR \-\-search_exact \fIfastafile\fR \-\-db \fIfastafile\fR (\-\-alnout | \-\-biomout | \-\-blast6out | \-\-mothur_shared_out | \-\-otutabout | \-\-samout | \-\-uc | \-\-userout | \-\-lcaout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-usearch_global \fIfastafile\fR \-\-db \fIfastafile\fR (\-\-alnout | \-\-biomout | \-\-blast6out | \-\-mothur_shared_out | \-\-otutabout | 
\-\-samout | \-\-uc | \-\-userout | \-\-lcaout) \fIoutputfile\fR \-\-id \fIreal\fR [\fIoptions\fR] .PP .RE Shuffling and sorting: .RS \fBvsearch\fR (\-\-shuffle | \-\-sortbylength | \-\-sortbysize) \fIfastafile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP .RE Subsampling: .RS \fBvsearch\fR \-\-fastx_subsample \fIfastafile\fR (\-\-fastaout | \-\-fastqout) \fIoutputfile\fR (\-\-sample_pct \fIreal\fR | \-\-sample_size \fIpositive integer\fR) [\fIoptions\fR] .PP .RE Taxonomic classification: .RS \fBvsearch\fR \-\-sintax \fIfastafile\fR \-\-db \fIfastafile\fR \-\-tabbedout \fIoutputfile\fR [\-\-sintax_cutoff \fIreal\fR] [\fIoptions\fR] .PP .RE UDB database handling: .RS \fBvsearch\fR \-\-makeudb_usearch \fIfastafile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-udb2fasta \fIudbfile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR (\-\-udbinfo | \-\-udbstats) \fIudbfile\fR [\fIoptions\fR] .PP .RE .\" left and right justified (default) .ad b .\" ============================================================================ .SH DESCRIPTION Environmental or clinical molecular diversity studies generate large volumes of amplicons (e.g. SSU-rRNA sequences) that need to be checked for chimeras, dereplicated, masked, sorted, searched, clustered or compared to reference sequences. The aim of \fBvsearch\fR is to offer an all-in-one open source tool to perform these tasks, using optimized algorithm implementations and harvesting the full potential of modern computers, thus providing fast and accurate data processing. .PP Comparing nucleotide sequences is at the core of \fBvsearch\fR. To speed up comparisons, \fBvsearch\fR implements an extremely fast Needleman-Wunsch algorithm, making use of the Streaming SIMD Extensions (SSE2) of post-2003 x86-64 CPUs. If SSE2 instructions are not available, \fBvsearch\fR exits with an error message. On Power8 CPUs it will use AltiVec/VSX/VMX instructions, and on ARMv8 CPUs it will use Neon instructions.
On other systems it can use the SIMD Everywhere (simde) library, if available. Memory usage increases rapidly with sequence length: for example comparing two sequences of length 1 kb requires 8 MB of memory per thread, and comparing two 10 kb sequences requires 800 MB of memory per thread. For comparisons involving sequences with a length product greater than 25 million (for example two sequences of length 5 kb), \fBvsearch\fR uses a slower alignment method described by Hirschberg (1975) and Myers and Miller (1988), with much smaller memory requirements. .\" ---------------------------------------------------------------------------- .SS Input \fBvsearch\fR accepts as input fasta or fastq files containing one or several nucleotidic entries. In fasta files, each entry is made of a header and a sequence. The header is defined as the string comprised between the initial '>' symbol and the first space, tab or the end of the line, unless the \-\-notrunclabels option is in effect, in which case the entire line is included. The header should contain printable ascii characters (33-126). The program will terminate with a fatal error if there are unprintable ascii characters. A warning will be issued if non-ascii characters (128-255) are encountered. .PP If the header matches the pattern '>[;]size=\fIinteger\fR;label', the pattern '>label;size=\fIinteger\fR;label', or the pattern '>label;size=\fIinteger\fR[;]', \fBvsearch\fR will interpret \fIinteger\fR as the number of occurrences (or abundance) of the sequence in the study. That abundance information is used or created during chimera detection, clustering, dereplication, sorting and searching. .PP The sequence is defined as a string of IUPAC symbols (ACGTURYSWKMDBHVN), starting after the end of the identifier line and ending before the next identifier line, or the file end. \fBvsearch\fR silently ignores ascii characters 9 to 13, and exits with an error message if ascii characters 0 to 8, 14 to 31, '.' or '-' are present.
All other ascii or non-ascii characters are stripped and complained about in a warning message. .PP In fastq files, each entry is made of a sequence header starting with a symbol '@', a nucleotidic sequence (same rules as for fasta sequences), a quality header starting with a symbol '+' and a string of ASCII characters (offset 33 or 64), each one encoding the quality value of the corresponding position in the nucleotidic sequence. .PP \fBvsearch\fR operations are case insensitive, except when soft masking is activated. Masking is automatically applied during chimera detection, clustering, masking, pairwise alignment and searching. Soft masking is specified with the options '\-\-dbmask soft' (for searching and chimera detection with a reference) or '\-\-qmask soft' (for searching, \fIde novo\fR chimera detection, clustering and masking). When using soft masking, lower case letters indicate masked symbols, while upper case letters indicate regular symbols. Masked symbols are never included in the unique index words used for sequence comparisons, otherwise they are treated as normal symbols. .PP When comparing sequences during chimera detection, dereplication, searching and clustering, T and U are considered identical, regardless of their case. When aligning sequences, identical symbols will receive a positive match score (default +2). If two symbols are not identical, their alignment results in a negative mismatch score (default -4). Aligning a pair of symbols where at least one of them is an ambiguous symbol (BDHKMNRSVWY) will always result in a score of zero. Alignment of two identical ambiguous symbols (for example, R vs R) also receives a score of zero. When computing the amount of similarity by counting matches and mismatches after alignment, ambiguous nucleotide symbols will count as matching to other symbols if they have at least one of the nucleotides (ACGTU) they may represent in common. For example: W will match A and T, but also any of MRVHDN.
When showing alignments (for example with the \-\-alnout option) matches involving ambiguous symbols will be shown with a plus character (+) between them while exact matches between non-ambiguous symbols will be shown with a vertical bar character (|). .PP \fBvsearch\fR can read data from standard files and write to standard files, but it can also read from pipes and write to pipes! For example, multiple fasta files can be piped into \fBvsearch\fR for dereplication. To do so, file names can be replaced with: .RS .IP - 2 the symbol '-', representing '/dev/stdin' for input files or '/dev/stdout' for output files (with an exception for '\-\-db \-', see * below), .IP - a named pipe created with the command mkfifo, .IP - a process substitution '<(command)' as input or '>(command)' as output. .IP * \-\-db \- is not accepted, to prevent potential concurrent reads from stdin. A workaround for advanced users is to call '\-\-db /dev/stdin' directly. .RE .PP \fBvsearch\fR can automatically read compressed gzip or bzip2 files if the appropriate libraries are present during the compilation. \fBvsearch\fR can also read pipes streaming compressed gzip or bzip2 data if the options \-\-gzip_decompress or \-\-bzip2_decompress are selected. When reading from a pipe, the progress indicator is not updated. .\" ---------------------------------------------------------------------------- .SS Options \fBvsearch\fR recognizes a large number of command-line commands and options. For easier navigation, options are grouped below by theme (chimera detection, clustering, dereplication and rereplication, FASTA/FASTQ file processing, masking, pairwise alignment, searching, shuffling, sorting, and subsampling). We start with the general options that apply to all themes. Options start with a double dash (\-\-). A single dash (\-) may also be used, except on NetBSD systems. Option names may be shortened as long as they are not ambiguous (e.g. \-\-derep_f). 
.RE .PP .\" ---------------------------------------------------------------------------- .TAG help-and-version-commands Help and version commands: .PP .RS .TAG help .TAG h .TP 9 .B \-\-help \-\-h Display help text with brief information about all commands and options. .TAG version .TAG v .TP .B \-\-version \-\-v Output version information and a citation for the VSEARCH publication. Show the status of the support for gzip- and bzip2-compressed input files. .RE .PP .\" ---------------------------------------------------------------------------- .TAG general-options General options: .RS .TAG bzip2_decompress .TP 9 .B \-\-bzip2_decompress When reading from a pipe streaming bzip2-compressed data, decompress the data. This option is not needed when reading from a standard bzip2-compressed file. .TAG fasta_width .TP .BI \-\-fasta_width\~ "positive integer" Fasta files produced by \fBvsearch\fR are wrapped (sequences are written on lines of \fIinteger\fR nucleotides, 80 by default). Set the value to zero to eliminate the wrapping. .TAG gzip_decompress .TP .B \-\-gzip_decompress When reading from a pipe streaming gzip-compressed data, decompress the data. This option is not needed when reading from a standard gzip-compressed file. .TAG label_suffix .TP .BI \-\-label_suffix\~ string When writing FASTA or FASTQ files, add the suffix \fIstring\fR to sequence headers. .TAG log .TP .BI \-\-log \0filename Write messages to the specified log file. Information written includes program version, amount of memory available, number of cores and command line options, and if need be, informational messages, warnings and fatal errors. The start and finish times are also recorded as well as the elapsed time and the maximum amount of memory consumed. The different \fBvsearch\fR commands can also write additional information to the log file. 
.TAG maxseqlength .TP .BI \-\-maxseqlength\~ "positive integer" All \fBvsearch\fR operations discard sequences longer than \fIinteger\fR (50,000 nucleotides by default). .TAG minseqlength .TP .BI \-\-minseqlength\~ "positive integer" All \fBvsearch\fR operations discard sequences shorter than \fIinteger\fR: 1 nucleotide by default for sorting or shuffling, 32 nucleotides for clustering and dereplication as well as the commands \-\-makeudb_usearch, \-\-sintax, and \-\-usearch_global. .\" note: minseqlength can be set to zero (keep empty entries) .TAG no_progress .TP .B \-\-no_progress Do not show the gradually increasing progress indicator. .TAG notrunclabels .TP .B \-\-notrunclabels Do not truncate sequence labels at first space or tab, but use the full header in output files. Turned off by default for all commands except the sintax command. .TAG quiet .TP .B \-\-quiet Suppress all messages to stdout and stderr except for warnings and fatal error messages. .TAG sample .TP .BI \-\-sample\~ string When writing FASTA or FASTQ files, add the given sample identifier \fIstring\fR to sequence headers. For instance, if the given string is ABC, the text ";sample=ABC" will be added to the header. Note that \fIstring\fR will be truncated at the first ';' or blank character. Other characters (alphabetical, numerical and punctuation) are accepted. .TAG threads .TP .BI \-\-threads\~ "positive integer" Number of computation threads to use (1 to 1024). The number of threads should be less than or equal to the number of available CPU cores. The default is to use all available resources and to launch one thread per core. The following commands are multi-threaded: allpairs_global, cluster_fast, cluster_size, cluster_smallmem, cluster_unoise, fastq_mergepairs, fastx_mask, maskfasta, search_exact, sintax, uchime_ref, and usearch_global. Only one thread is used for the other commands.
.RE .PP .\" ---------------------------------------------------------------------------- .TAG chimera-detection-options Chimera detection options: .PP .RS Chimera detection is based on a scoring function controlled by five options (\-\-dn, \-\-mindiffs, \-\-mindiv, \-\-minh, \-\-xn). Sequences are first sorted by decreasing abundance, if available, and compared on their \fIplus\fR strand only (case insensitive). .PP Input sequences are masked as specified with the \-\-qmask and \-\-hardmask options. Masking of the database for reference based chimera detection is specified with the \-\-dbmask option. .PP In \fIde novo\fR mode, the input fasta file must present abundance annotations (i.e. a pattern [;]size=\fIinteger\fR[;] in the fasta header). Input order matters for chimera detection, so we recommend sorting sequences by decreasing abundance (default of \-\-derep_fulllength command). If your sequence set needs to be sorted, please see the \-\-sortbysize command in the sorting section. .PP .TAG abskew .TP 9 .BI \-\-abskew \0real When using \-\-uchime_denovo, the abundance skew is used to distinguish in a three-way alignment which sequence is the chimera and which are the parents. The assumption is that chimeras appear later in the PCR amplification process and are therefore less abundant than their parents. For \-\-uchime3_denovo the default value is 16.0. For the other commands, the default value is 2.0, which means that the parents should be at least 2 times more abundant than their chimera. Any positive value equal to or greater than 1.0 can be used. .TAG alignwidth .TP .BI \-\-alignwidth\~ "positive integer" When using \-\-uchimealns, set the width of the three-way alignments (80 nucleotides by default). Set to zero to eliminate wrapping. .TAG borderline .TP .BI \-\-borderline \0filename Output borderline chimeric sequences to \fIfilename\fR, in fasta format.
Borderline chimeric sequences are sequences that have a high enough score but which are not sufficiently different from their closest parent. .TAG chimeras .TP .BI \-\-chimeras \0filename Output chimeric sequences to \fIfilename\fR, in fasta format. Output order may vary when using multiple threads. .TAG db .TP .BI \-\-db \0filename When using \-\-uchime_ref, detect chimeras using the reference sequences contained in \fIfilename\fR. Reference sequences are assumed to be chimera-free. Chimeras cannot be detected if their parents, or sufficiently close relatives, are not present in the database. The file name must refer to a FASTA file or to a UDB file. If a UDB file is used, it should be created using the \-\-makeudb_usearch command with the \-\-dbmask dust option. .TAG dn .TP .BI \-\-dn\~ "strictly positive real number" pseudo-count prior on the number of no votes, corresponding to the parameter \fIn\fR in the chimera scoring function (default value is 1.4). Increasing \-\-dn reduces the likelihood of tagging a sequence as a chimera (less false positives, but also more false negatives). .TAG fasta_score .TP .B \-\-fasta_score Add the chimera score to the headers in the fasta output files for chimeras, non-chimeras and borderline sequences, using the format ';uchime_denovo=\fIfloat\fR;'. .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG mindiffs .TP .BI \-\-mindiffs\~ "positive integer" Minimum number of differences per segment (default value is 3). The parameter is ignored with \-\-uchime2_denovo and \-\-uchime3_denovo. .TAG mindiv .TP .BI \-\-mindiv \0real Minimum divergence from closest parent (default value is 0.8). The parameter is ignored with \-\-uchime2_denovo and \-\-uchime3_denovo. .TAG minh .TP .BI \-\-minh \0real Minimum score (\fIh\fR). Increasing this value tends to reduce the number of false positives and to decrease sensitivity. 
Default value is 0.28, and values ranging from 0.0 to 1.0 included are accepted. The parameter is ignored with \-\-uchime2_denovo and \-\-uchime3_denovo. .TAG nonchimeras .TP .BI \-\-nonchimeras \0filename Output non-chimeric sequences to \fIfilename\fR, in fasta format. Output order may vary when using multiple threads. .TAG relabel .TP .BI \-\-relabel \0string Relabel sequences using the prefix \fIstring\fR and a ticker (1, 2, 3, etc.) to construct the new headers. Use \-\-sizeout to conserve the abundance annotations. .TAG relabel_keep .TP .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TAG relabel_md5 .TP .B \-\-relabel_md5 Relabel sequences using the MD5 message digest algorithm applied to each sequence. Former sequence headers are discarded. The sequence is converted to upper case and each 'U' is replaced by a 'T' before computation of the digest. The MD5 digest is a cryptographic hash function designed to minimize the probability that two different inputs give the same output, even for very similar, but non-identical inputs. Still, there is a very small, but non-zero, probability that two different inputs give the same digest (i.e. a collision). MD5 generates a 128-bit (16-byte) digest that is represented by 16 hexadecimal numbers (using 32 symbols among 0123456789abcdef). Use \-\-sizeout to conserve the abundance annotations. .\" The probablity of collision for two sequences is 1/2^128 .TAG relabel_self .TP .B \-\-relabel_self Relabel sequences using each sequence itself as a label. .TAG relabel_sha1 .TP .B \-\-relabel_sha1 Relabel sequences using the SHA1 message digest algorithm applied to each sequence. It is similar to the \-\-relabel_md5 option but uses the SHA1 algorithm instead of the MD5 algorithm. SHA1 generates a 160-bit (20-byte) digest that is represented by 20 hexadecimal numbers (40 symbols). 
The probability of a collision (two non-identical sequences resulting in the same digest) is smaller for the SHA1 algorithm than it is for the MD5 algorithm. .\" The probablity of collision for two sequences is 1/2^160 .TAG self .TP .B \-\-self When using \-\-uchime_ref, ignore a reference sequence when its label matches the label of the query sequence (useful to estimate false-positive rate in reference sequences). .\" I am not sure the statement above is true. .TAG selfid .TP .B \-\-selfid When using \-\-uchime_ref, ignore a reference sequence when its nucleotide sequence is strictly identical to the nucleotidic sequence of the query. .TP .B \-\-sizein In \fIde novo\fR mode, abundance annotations (pattern '[>;]size=\fIinteger\fR[;]') present in sequence headers are taken into account by default (\-\-sizein is always implied). This option is ignored by \-\-uchime_ref. .TP .TAG sizeout .B \-\-sizeout When relabelling, add abundance annotations to fasta headers (using the format ';size=\fIinteger\fR;'). .TAG uchime_denovo .TP .BI \-\-uchime_denovo \0filename Detect chimeras present in the fasta-formatted \fIfilename\fR, without external references (i.e. \fIde novo\fR). Automatically sort the sequences in \fIfilename\fR by decreasing abundance beforehand (see the sorting section for details). Multithreading is not supported. .TAG uchime2_denovo .TP .BI \-\-uchime2_denovo \0filename Detect chimeras present in the fasta-formatted \fIfilename\fR, using the UCHIME2 algorithm. This algorithm is designed for denoised amplicons (see \-\-cluster_unoise). Automatically sort the sequences in \fIfilename\fR by decreasing abundance beforehand (see the sorting section for details). Multithreading is not supported. .TAG uchime3_denovo .TP .BI \-\-uchime3_denovo \0filename Detect chimeras present in the fasta-formatted \fIfilename\fR, using the UCHIME2 algorithm. 
The only difference from \-\-uchime2_denovo is that the default minimum abundance skew (\-\-abskew) is set to 16.0 rather than 2.0. .TAG uchime_ref .TP .BI \-\-uchime_ref \0filename Detect chimeras present in the fasta-formatted \fIfilename\fR by comparing them with reference sequences (option \-\-db). Multithreading is supported. .TAG uchimealns .TP .BI \-\-uchimealns \0filename Write the three-way global alignments (parentA, parentB, chimera) to \fIfilename\fR using a human-readable format. Use \-\-alignwidth to modify alignment length. Output order may vary when using multiple threads. All sequences are converted to upper case before alignment. Lower case letters indicate disagreement in the alignment. .TAG uchimeout .TP .BI \-\-uchimeout \0filename Write chimera detection results to \fIfilename\fR using an 18-field, tab\-separated uchime\-like format. Use \-\-uchimeout5 to use a format compatible with usearch v5 and earlier versions. Row output order may vary when using multiple threads. .RS .RS .nr step 1 1 .IP \n[step]. 4 score: higher score means a more likely chimeric alignment. .IP \n+[step]. Q: query sequence label. .IP \n+[step]. A: parent A sequence label. .IP \n+[step]. B: parent B sequence label. .IP \n+[step]. T: top parent sequence label (i.e. parent most similar to the query). That field is removed when using \-\-uchimeout5. .IP \n+[step]. idQM: percentage of similarity of query (Q) and model (M) constructed as a part of parent A and a part of parent B. .IP \n+[step]. idQA: percentage of similarity of query (Q) and parent A. .IP \n+[step]. idQB: percentage of similarity of query (Q) and parent B. .IP \n+[step]. idAB: percentage of similarity of parent A and parent B. .IP \n+[step]. idQT: percentage of similarity of query (Q) and top parent (T). .IP \n+[step]. LY: yes votes in the left part of the model. .IP \n+[step]. LN: no votes in the left part of the model. .IP \n+[step]. LA: abstain votes in the left part of the model. .IP \n+[step].
RY: yes votes in the right part of the model. .IP \n+[step]. RN: no votes in the right part of the model. .IP \n+[step]. RA: abstain votes in the right part of the model. .IP \n+[step]. div: divergence, defined as (idQM - idQT). .IP \n+[step]. YN: query is chimeric (Y), or not (N), or is a borderline case (?). .RE .RE .TAG uchimeout5 .TP .B \-\-uchimeout5 When using \-\-uchimeout, write chimera detection results using a 17\-field, tab\-separated uchime\-like format (drop the 5th field of \-\-uchimeout), compatible with usearch version 5 and earlier versions. .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xn .TP .BI \-\-xn\~ "strictly positive real number" weight of no votes, corresponding to the parameter \fIbeta\fR in the scoring function (default value is 8.0). Increasing \-\-xn reduces the likelihood of tagging a sequence as a chimera (fewer false positives, but also more false negatives). .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .RE .PP .\" ---------------------------------------------------------------------------- .TAG clustering-options Clustering options: .RS .PP \fBvsearch\fR implements a single-pass, greedy centroid-based clustering algorithm, similar to the algorithms implemented in usearch, DNAclust and sumaclust for example. Important parameters are the global clustering threshold (\-\-id) and the pairwise identity definition (\-\-iddef). .PP Input sequences are masked as specified with the \-\-qmask and \-\-hardmask options. .TAG biomout .TP 9 .BI \-\-biomout \0filename Generate an OTU table in the biom version 1.0 JSON file format as specified at .URL https://biom-format.org/documentation/format_versions/biom-1.0.html "(link)" . The format describes how to store a sparse matrix containing the abundances of the OTUs in the different samples. 
This format is much more efficient than the classic and mothur OTU table formats available with the \-\-otutabout and \-\-mothur_shared_out options, respectively, and is recommended at least for large tables. The OTUs are represented by the cluster centroids. Taxonomy information will be included for the OTUs if available. Sample identifiers will be extracted from the headers of all sequences in the input file. If the header contains ';sample=abc123;' or ';barcodelabel=abc123;' or a similar string somewhere, then the given sample identifier (here 'abc123') will be used. The semicolon is not mandatory at the beginning or end of the header. The sample identifier may contain any printable character except semicolons. If no such sample label is found, the identifier in the initial part of the header will be used, but only letters, digits and underscores are allowed. OTU identifiers will be extracted from the headers of the cluster centroid sequences. If the header contains ';otu=def789;' or a similar string somewhere, then the given OTU identifier (here 'def789') will be used. The semicolon is not mandatory at the beginning or end of the header. The OTU identifier may contain any printable character except semicolons. If no such OTU label is found, the identifier in the initial part of the header will be used, and all characters except semicolons are allowed. Alternatively, OTU identifiers can be generated using the relabelling options (\-\-relabel, \-\-relabel_self, \-\-relabel_sha1, or \-\-relabel_md5). Taxonomy information, if present, will also be extracted from the headers of the centroid sequences. If the header contains ';tax=Homo_sapiens;' or a similar string somewhere, then the given taxonomy information (here 'Homo_sapiens') will be used. The semicolon is not mandatory at the beginning or end of the header. The taxonomy information may contain any printable character except semicolons. 
If an OTU table in the biom version 2.1 HDF5 file format is required, the biom utility may be used as described at .URL https://biom-format.org/documentation/biom_conversion.html "(link)" . .TAG centroids .TP .BI \-\-centroids \0filename Output cluster centroid sequences to \fIfilename\fR, in fasta format. The centroid is the sequence that seeded the cluster (i.e. the first sequence of the cluster). .TAG clusterout_id .TP .BI \-\-clusterout_id Add cluster identifier information to the output files when using the \-\-centroids, \-\-consout and \-\-profile options. .TAG clusterout_sort .TP .BI \-\-clusterout_sort Sort some output files by decreasing abundance instead of input order. It applies to the \-\-consout, \-\-msaout, \-\-profile, \-\-centroids, and \-\-uc options. For \-\-uc, the sorting applies only to the centroid information part (the C lines). .TAG cluster_fast .TP .BI \-\-cluster_fast \0filename Clusterize the fasta sequences in \fIfilename\fR, automatically sort by decreasing sequence length beforehand. .TAG cluster_size .TP .BI \-\-cluster_size \0filename Clusterize the fasta sequences in \fIfilename\fR, automatically sort by decreasing sequence abundance beforehand. .TAG cluster_smallmem .TP .BI \-\-cluster_smallmem \0filename Clusterize the fasta sequences in \fIfilename\fR without automatically modifying their order beforehand. Sequences are expected to be sorted by decreasing sequence length, unless \-\-usersort is used. .TAG cluster_unoise .TP .BI \-\-cluster_unoise \0filename Perform denoising of the fasta sequences in \fIfilename\fR according to the UNOISE version 3 algorithm by Robert Edgar, but without the \fIde novo\fR chimera removal step, which may be performed afterwards with \-\-uchime3_denovo. The options \-\-minsize (default 8) and \-\-unoise_alpha (default 2.0) may be specified. In this algorithm, clustering of sequences depends on both the sequence distance and the abundance ratio. 
The abundance ratio (skew) is the abundance of a new sequence divided by the abundance of the centroid sequence. This skew must not be larger than beta if the sequences should be clustered together. Beta is calculated as 2 raised to the power of minus 1 minus alpha times the sequence distance. The sequence distance used is the number of mismatches in the alignment, ignoring gaps. This means that the abundance must be exponentially lower as the distance increases from the centroid for a new sequence to be included in the cluster. Nearer sequences with higher abundances will form their own new clusters. .TAG clusters .TP .BI \-\-clusters \0string Output each cluster to a separate fasta file using the prefix \fIstring\fR and a ticker (0, 1, 2, etc.) to construct the path and filenames. .TAG consout .TP .BI \-\-consout \0filename Output cluster consensus sequences to \fIfilename\fR. For each cluster, a center-star multiple sequence alignment is computed with the centroid as the center, using a fast algorithm (not accurate when using low pairwise identity thresholds). A consensus sequence is constructed by taking the majority symbol (nucleotide or gap) from each column of the alignment. Columns containing a majority of gaps are skipped, except for terminal gaps. If the \-\-sizein option is specified, sequence abundances will be taken into account. .TAG cons_truncate .TP .B \-\-cons_truncate This command is ignored. A warning is issued. .\" .TP .\" .B \-\-cons_truncate .\" when using the \-\-consout option to build consensus sequences, .\" do not ignore terminal gaps. That option skips terminal columns .\" if they contain a majority of gaps, yielding shorter consensus .\" sequences than when using \-\-consout alone. .TAG id .TP .BI \-\-id \0real Do not add the target to the cluster if the pairwise identity with the centroid is lower than \fIreal\fR (value ranging from 0.0 to 1.0 included). 
The pairwise identity is defined as the number of (matching columns) / (alignment length - terminal gaps). That definition can be modified by \-\-iddef. .TAG iddef .TP .BI \-\-iddef\~ "0|1|2|3|4" Change the pairwise identity definition used in \-\-id. Values accepted are: .RS .RS .nr step 0 1 .IP \n[step]. 4 CD-HIT definition: (matching columns) / (shortest sequence length). .IP \n+[step]. edit distance: (matching columns) / (alignment length). .IP \n+[step]. edit distance excluding terminal gaps (same as \-\-id). .IP \n+[step]. Marine Biological Lab definition counting each gap opening (internal or terminal) as a single mismatch, whether or not the gap was extended: 1.0 - [(mismatches + gap openings)/(longest sequence length)] .IP \n+[step]. BLAST definition, equivalent to \-\-iddef 1 in a context of global pairwise alignment. .RE .RE .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG minsize .TP .BI \-\-minsize\~ "positive integer" Specify the minimum abundance of sequences for denoising using \-\-cluster_unoise. The default is 8. .TAG msaout .TP .BI \-\-msaout \0filename Output a multiple sequence alignment and a consensus sequence for each cluster to \fIfilename\fR, in fasta format. Be warned that vsearch computes center star multiple sequence alignments using a fast method whose accuracy can decrease significantly when using low pairwise identity thresholds. The consensus sequence is constructed by taking the majority symbol (nucleotide or gap) from each column of the alignment. Columns containing a majority of gaps are skipped, except for terminal gaps. If the \-\-sizein option is specified, sequence abundances will be taken into account when computing the consensus. 
.TAG mothur_shared_out .TP .BI \-\-mothur_shared_out \0filename Output an OTU table in the mothur 'shared' tab-separated plain text format as described at .URL https://www.mothur.org/wiki/Shared_file (link) . The format describes how a matrix containing the abundances of the OTUs in the different samples is stored. The first line will start with the strings 'label', 'group' and 'numOtus' and is followed by a list of all OTU identifiers. The following lines, one for each sample, start with the string 'vsearch' followed by the sample identifier, the total number of OTUs, and a list of abundances for each OTU in that sample, in the order given on the first line. The OTU and sample identifiers are extracted from the FASTA headers of the sequences. The OTUs are represented by the cluster centroids. See the \-\-biomout option for further details. .TAG otutabout .TP .BI \-\-otutabout \0filename Output an OTU table in the classic tab-separated plain text format as a matrix containing the abundances of the OTUs in the different samples. The first line will start with the string '#OTU ID' and is followed by a tab-separated list of all sample identifiers. The following lines, one for each OTU, start with the OTU identifier and are followed by a tab-separated list of abundances for that OTU in each sample, in the order given on the first line. The OTU and sample identifiers are extracted from the FASTA headers of the sequences (see the \-\-sample option). The OTUs are represented by the cluster centroids. An extra column is added to the right of the table if taxonomy information is available for at least one of the OTUs. This column will be labelled 'taxonomy' and each row will then contain the taxonomy information extracted for that OTU. See the \-\-biomout option for further details. .TAG profile .TP .BI \-\-profile \0filename Output a sequence profile to a text file with the frequency of each nucleotide in each position in the multiple alignment for each cluster. 
There is a FASTA-like header line for each cluster, followed by the profile information in a tab-separated format. The eight columns are: position (0-based), consensus nucleotide, number of As, number of Cs, number of Gs, number of Ts or Us, number of gap symbols, and finally the total number of ambiguous nucleotide symbols (B, D, H, K, M, N, R, S, Y, V or W). All numbers are integers. If the \-\-sizein option is specified, sequence abundances will be taken into account. .TAG qmask .TP .BI \-\-qmask\~ "none|dust|soft" Mask regions in sequences using the \fIdust\fR or the \fIsoft\fR methods, or do not mask (\fInone\fR). Warning, when using \fIsoft\fR masking, clustering becomes case sensitive. The default is to mask using \fIdust\fR. .TAG qsegout .TP .BI \-\-qsegout \0filename Write the aligned part of each query sequence to \fIfilename\fR in FASTA format. .TAG relabel .TP .BI \-\-relabel \0string Relabel sequence identifiers in the output files produced by \-\-consout, \-\-profile and \-\-centroids options. Please see the description of the same option under Chimera detection for details. .TAG relabel_keep .TP .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TAG relabel_md5 .TP .B \-\-relabel_md5 Relabel sequence identifiers in the output files produced by \-\-consout, \-\-profile and \-\-centroids options. Please see the description of the same option under Chimera detection for details. .TAG relabel_self .TP .B \-\-relabel_self Relabel sequence identifiers in the output files produced by \-\-consout, \-\-profile and \-\-centroids options. Please see the description of the same option under Chimera detection for details. .TAG relabel_sha1 .TP .B \-\-relabel_sha1 Relabel sequence identifiers in the output files produced by \-\-consout, \-\-profile and \-\-centroids options. Please see the description of the same option under Chimera detection for details. 
.TAG sizein .TP .B \-\-sizein Take into account the abundance annotations present in the input fasta file (search for the pattern '[>;]size=\fIinteger\fR[;]' in sequence headers). .TAG sizeorder .TP .B \-\-sizeorder When an amplicon is close to 2 or more centroids, both within the distance specified with the \-\-id option, resolve the ambiguity by clustering it with the centroid having the highest abundance, not necessarily the closest one. The option only has effect when the value specified with \-\-maxaccepts is higher than one. The \-\-sizeorder option turns on what is sometimes referred to as abundance-based greedy clustering (AGC), in contrast to the default distance-based greedy clustering (DGC). .TAG sizeout .TP .B \-\-sizeout Add abundance annotations to the output fasta files (add the pattern ';size=\fIinteger\fR;' to sequence headers). If \-\-sizein is specified, abundance annotations are reported to output files, and each cluster centroid receives a new abundance value corresponding to the total abundance of the amplicons included in the cluster (\-\-centroids option). If \-\-sizein is not specified, input abundances are set to 1 for amplicons, and to the number of amplicons per cluster for centroids. .TAG strand .TP .BI \-\-strand\~ "plus|both" When comparing sequences with the cluster seed, check the \fIplus\fR strand only (default) or check \fIboth\fR strands. .TAG tsegout .TP .BI \-\-tsegout \0filename Write the aligned part of each target sequence to \fIfilename\fR in FASTA format. .TAG uc .TP .BI \-\-uc \0filename Output clustering results in \fIfilename\fR using a tab-separated uclust-like format with 10 columns and 3 different types of entries (S, H or C). Each fasta sequence in the input file can be either a cluster centroid (S) or a hit (H) assigned to a cluster. Cluster records (C) summarize information (size, centroid label) for each cluster. In the context of clustering, the option \-\-uc_allhits has no effect on the \-\-uc output. 
Column content varies with the type of entry (S, H or C): .RS .RS .nr step 1 1 .IP \n[step]. 4 Record type: S, H, or C. .IP \n+[step]. Cluster number (zero-based). .IP \n+[step]. Centroid length (S), query length (H), or cluster size (C). .IP \n+[step]. Percentage of similarity with the centroid sequence (H), or set to '*' (S, C). .IP \n+[step]. Match orientation + or - (H), or set to '*' (S, C). .IP \n+[step]. Not used, always set to '*' (S, C) or to zero (H). .IP \n+[step]. Not used, always set to '*' (S, C) or to zero (H). .IP \n+[step]. set to '*' (S, C) or, for H, compact representation of the pairwise alignment using the CIGAR format (Compact Idiosyncratic Gapped Alignment Report): M (match/mismatch), D (deletion) and I (insertion). The equal sign '=' indicates that the query is identical to the centroid sequence. .IP \n+[step]. Label of the query sequence (H), or of the centroid sequence (S, C). .IP \n+[step]. Label of the centroid sequence (H), or set to '*' (S, C). .RE .RE .TAG unoise_alpha .TP .BI \-\-unoise_alpha\~ real Specify the alpha parameter to the \-\-cluster_unoise command. The default is 2.0. .TAG usersort .TP .B \-\-usersort When using \-\-cluster_smallmem, allow any sequence input order, not just a decreasing length ordering. .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .TP .B ... 
Most searching options as well as score filtering, gap penalties and masking also apply to clustering (see the Searching section for definitions): \-\-alnout, \-\-blast6out, \-\-fastapairs, \-\-matched, \-\-notmatched, \-\-maxaccepts, \-\-maxrejects, \-\-samout, \-\-userout, \-\-userfields .RE .PP .\" ---------------------------------------------------------------------------- .TAG dereplication-and-rereplication-options Dereplication and rereplication options: .PP .RS VSEARCH can dereplicate sequences with the commands \-\-derep_fulllength, \-\-derep_id, \-\-derep_smallmem, \-\-derep_prefix and \-\-fastx_uniques. The \-\-derep_fulllength command is deprecated and is replaced by the new \-\-fastx_uniques command that can also handle FASTQ files in addition to FASTA files. The \-\-derep_fulllength, \-\-derep_smallmem, and \-\-fastx_uniques commands require strictly identical sequences of the same length, but ignore upper/lower case and treat T and U as identical symbols. The \-\-derep_id command requires both identical sequences and identical headers/labels. The \-\-derep_prefix command will group sequences with a common prefix and does not require them to be equally long. The \-\-derep_smallmem command uses a much smaller amount of memory when dereplicating than the other commands, and may be a bit slower and cannot read the input from a pipe. It takes both FASTA and FASTQ files as input but only writes FASTA output to the file specified with the \-\-fastaout option. The \-\-fastx_uniques command can write FASTQ output (specified with \-\-fastqout) or FASTA output (specified with \-\-fastaout) as well as a special tab-separated column text format (with \-\-tabbedout). The other commands can write FASTA output to the file specified with the \-\-output option. All dereplication commands, except \-\-derep_smallmem, can write output to a special UCLUST-like file specified with the \-\-uc option. 
The \-\-rereplicate command can duplicate sequences in the input file according to the abundance of each input sequence. Other valid options are \-\-fastq_ascii, \-\-fastq_asciiout, \-\-fastq_qmax, \-\-fastq_qmaxout, \-\-fastq_qmin, \-\-fastq_qminout, \-\-fastq_qout_max, \-\-lengthout, \-\-maxuniquesize, \-\-minuniquesize, \-\-relabel, \-\-relabel_keep, \-\-relabel_md5, \-\-relabel_self, \-\-relabel_sha1, \-\-sizein, \-\-sizeout, \-\-strand, \-\-topn, \-\-xlength, and \-\-xsize. .PP .TAG derep_fulllength .TP 9 .BI \-\-derep_fulllength \0filename Merge strictly identical sequences contained in \fIfilename\fR. Identical sequences are defined as having the same length and the same string of nucleotides (case insensitive, T and U are considered the same). See the options \-\-sizein and \-\-sizeout to take into account and compute abundance values. This command does not support multithreading. .TAG derep_id .TP .BI \-\-derep_id \0filename Merge strictly identical sequences contained in \fIfilename\fR, as with the \-\-derep_fulllength command, but the sequence labels (identifiers) on the header line need to be identical too. .TAG derep_smallmem .TP .BI \-\-derep_smallmem \0filename Merge strictly identical sequences contained in \fIfilename\fR, as with the \-\-derep_fulllength command, but using much less memory. The output is written to a FASTA file specified with the \-\-fastaout option. The output is written in the order that the sequences first appear in the input, and not in descending abundance order as with the other dereplication commands. It can read, but not write FASTQ files. This command cannot read from a pipe, it must be a proper file, as it is read twice. Dereplication is performed with a 128 bit hash function and it is not verified that grouped sequences are identical, however the probability that two different sequences are grouped in a dataset of one billion unique sequences is approximately 1e-21. Memory footprint is appr. 
24 bytes times the number of unique sequences. Multithreading and the options \-\-topn, \-\-uc, or \-\-tabbedout are not supported. .TAG derep_prefix .TP .BI \-\-derep_prefix \0filename Merge sequences with identical prefixes contained in \fIfilename\fR. A short sequence identical to an initial segment (prefix) of another sequence is considered a replicate of the longer sequence. If a sequence is identical to the prefix of two or more longer sequences, it is clustered with the shortest of them. If they are equally long, it is clustered with the most abundant. Remaining ties are solved using sequence headers and sequence input order. Sequence comparisons are case insensitive, and T and U are considered identical. This command does not support multithreading. .TAG fastaout .TP .BI \-\-fastaout \0filename Write the dereplicated sequences to \fIfilename\fR, in fasta format and sorted by decreasing abundance. Identical sequences receive the header of the first sequence of their group. If \-\-sizeout is used, the number of occurrences (i.e. abundance) of each sequence is indicated at the end of their fasta header using the pattern ';size=\fIinteger\fR;'. This option is only valid for \-\-fastx_uniques and \-\-derep_smallmem. .TAG fastqout .TP .BI \-\-fastqout \0filename Write the dereplicated sequences to \fIfilename\fR, in fastq format and sorted by decreasing abundance. Identical sequences receive the header of the first sequence of their group. If \-\-sizeout is used, the number of occurrences (i.e. abundance) of each sequence is indicated at the end of their fastq header using the pattern ';size=\fIinteger\fR;'. This option is only valid for \-\-fastx_uniques. .TAG fastq_ascii .TP .BI \-\-fastq_ascii\~ "positive integer" Define the ASCII character number used as the basis for the FASTQ quality score. The default is 33, which is used by the Sanger / Illumina 1.8+ FASTQ format (phred+33). 
The value 64 is used by the Solexa, Illumina 1.3+ and Illumina 1.5+ formats (phred+64). Only 33 and 64 are valid arguments. .TAG fastq_asciiout .TP .BI \-\-fastq_asciiout\~ "positive integer" When using \-\-fastq_convert, \-\-sff_convert or \-\-fasta2fastq, define the ASCII character number used as the basis for the FASTQ quality score when writing FASTQ output files. The default is 33. Only 33 and 64 are valid arguments. .TAG fastq_qmax .TP .BI \-\-fastq_qmax\~ "positive integer" Specify the maximum quality score accepted when reading FASTQ files. The default is 41, which is usual for recent Sanger/Illumina 1.8+ files. .TAG fastq_qmaxout .TP .BI \-\-fastq_qmaxout\~ "positive integer" Specify the maximum quality score used when writing FASTQ files. The default is 41, which is usual for recent Sanger/Illumina 1.8+ files. Older formats may use a maximum quality score of 40. .TAG fastq_qmin .TP .BI \-\-fastq_qmin\~ "positive integer" Specify the minimum quality score accepted for FASTQ files. The default is 0, which is usual for recent Sanger/Illumina 1.8+ files. Older formats may use scores between -5 and 2. .TAG fastq_qminout .TP .BI \-\-fastq_qminout\~ "positive integer" Specify the minimum quality score used when writing FASTQ files. The default is 0, which is usual for Sanger/Illumina 1.8+ files. Older versions of the format may use scores between -5 and 2. .TAG fastq_qout_max .TP .BI \-\-fastq_qout_max For \-\-fastx_uniques, indicate that the new quality scores computed when dereplicating FASTQ files should be equal to the maximum (best) of the input quality scores for each position (corresponding to the lowest error probability). The default is to output a quality score corresponding to the average of the error probabilities for each position. .TAG fastx_uniques .TP .BI \-\-fastx_uniques \0filename Merge strictly identical sequences contained in FASTA or FASTQ file \fIfilename\fR. 
Identical sequences are defined as having the same length and the same string of nucleotides (case insensitive, T and U are considered the same). See the options \-\-sizein and \-\-sizeout to take into account and compute abundance values. This command does not support multithreading. By default, the quality scores in FASTQ output files will correspond to the average error probability of the nucleotides in each position. If the \-\-fastq_qout_max option is given, the quality score will be the highest (best) quality score observed in each position. .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA and FASTQ format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG maxuniquesize .TP .BI \-\-maxuniquesize\~ "positive integer" Discard sequences with a post-dereplication abundance value greater than \fIinteger\fR. .TAG minuniquesize .TP .BI \-\-minuniquesize\~ "positive integer" Discard sequences with a post-dereplication abundance value smaller than \fIinteger\fR. .TAG output .TP .BI \-\-output \0filename Write the dereplicated sequences to \fIfilename\fR, in fasta format and sorted by decreasing abundance. Identical sequences receive the header of the first sequence of their group. If \-\-sizeout is used, the number of occurrences (i.e. abundance) of each sequence is indicated at the end of their fasta header using the pattern ';size=\fIinteger\fR;'. This option is not allowed for \-\-fastx_uniques or \-\-derep_smallmem. .TP .TAG relabel .BI \-\-relabel \0string Please see the description of the same option under Chimera detection for details. .TP .TAG relabel_keep .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TP .TAG relabel_md5 .B \-\-relabel_md5 Please see the description of the same option under Chimera detection for details. .TP .TAG relabel_self .B \-\-relabel_self Please see the description of the same option under Chimera detection for details. 
.TP .TAG relabel_sha1 .B \-\-relabel_sha1 Please see the description of the same option under Chimera detection for details. .TP .TAG rereplicate .BI \-\-rereplicate \0filename Duplicate each sequence the number of times indicated by the abundance of each sequence in the specified file (option \-\-sizein is always implied). The sequence labels are identical for the same sequence, unless \-\-relabel, \-\-relabel_self, \-\-relabel_sha1 or \-\-relabel_md5 is used to create unique labels. Output is written to the file specified with the \-\-output option, in FASTA format. The output file does not contain abundance information unless \-\-sizeout is specified, in which case an abundance of 1 is used. .TAG sizein .TP .B \-\-sizein Take into account the abundance annotations present in the input fasta file (search for the pattern '[>;]size=\fIinteger\fR[;]' in sequence headers). That option is active by default when rereplicating. .TAG sizeout .TP .B \-\-sizeout Add abundance annotations to the output fasta file (add the pattern ';size=\fIinteger\fR;' to sequence headers). If \-\-sizein is specified, each unique sequence receives a new abundance value corresponding to its total abundance (sum of the abundances of its occurrences). If \-\-sizein is not specified, input abundances are set to 1, and each unique sequence receives a new abundance value corresponding to its number of occurrences in the input file. .TAG strand .TP .BI \-\-strand\~ "plus|both" When searching for strictly identical sequences, check the \fIplus\fR strand only (default) or check \fIboth\fR strands. .TAG tabbedout .TP .BI \-\-tabbedout \0filename Output clustering info to the specified tab-separated text file with 6 columns and a row for each input sequence. Column 1 contains the original label/header of the sequence. Column 2 contains the label of the output sequence which is equal to the label/header of the first sequence in each cluster, but potentially relabelled. 
Column 3 contains the cluster number, starting from 0. Column 4 contains the sequence number within each cluster, starting at 0. Column 5 contains the number of sequences in the cluster. Column 6 contains the original label/header of the first sequence in the cluster before any potential relabelling. This option is only valid for the \-\-fastx_uniques command. .TAG topn .TP .BI \-\-topn\~ "positive integer" Output only the top \fIinteger\fR sequences (i.e. the most abundant). .TAG uc .TP .BI \-\-uc \0filename Output full-length or prefix-dereplication results in \fIfilename\fR using a tab-separated uclust-like format with 10 columns and 3 different types of entries (S, H or C). Each fasta sequence in the input file can be either a cluster centroid (S) or a hit (H) assigned to a cluster. Cluster records (C) summarize information (size, centroid label) for each cluster. In the context of dereplication, the option \-\-uc_allhits has no effect on the \-\-uc output. Column content varies with the type of entry (S, H or C): .RS .RS .nr step 1 1 .IP \n[step]. 4 Record type: S, H, or C. .IP \n+[step]. Cluster number (zero-based). .IP \n+[step]. Sequence length (S, H), or cluster size (C). .IP \n+[step]. Percentage of similarity with the centroid sequence (H), or set to '*' (S, C). .IP \n+[step]. Match orientation + or - (H), or set to '*' (S, C). .IP \n+[step]. Not used, always set to '*' (S, C) or 0 (H). .IP \n+[step]. Not used, always set to '*' (S, C) or 0 (H). .IP \n+[step]. Not used, always set to '*'. .IP \n+[step]. Label of the query sequence (H), or of the centroid sequence (S, C). .IP \n+[step]. Label of the centroid sequence (H), or set to '*' (S, C). .RE .RE .RE .PP .RS .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. 
.RE .PP .\" ---------------------------------------------------------------------------- .TAG extraction-options Extraction options: .RS .PP Sequences with headers matching certain criteria can be extracted from FASTA and FASTQ files using the \-\-fastx_getseq, \-\-fastx_getseqs and \-\-fastx_getsubseq commands. .PP The \-\-fastx_getseq command requires the header to match a label specified with the \-\-label option. If the \-\-label_substr_match option is given, the label may be a substring located anywhere in the header, otherwise the entire header must match the label. These matches are not case-sensitive. The headers in the input file are truncated at the first space or tab character unless the \-\-notrunclabels option is given. The matching sequences will be written to the files specified with the \-\-fastaout and \-\-fastqout options, in FASTA and FASTQ format, respectively. Sequences that do not match are written to the files specified with the \-\-notmatched and \-\-notmatchedfq options, respectively. .PP The \-\-fastx_getsubseq command is similar to the \-\-fastx_getseq command, but will extract a subsequence of the matching sequences. The start position is specified with the \-\-subseq_start option and the end position is specified with the \-\-subseq_end option. The positions are 1-based, meaning that the first symbol of the sequence is at position 1. If the start or end position option is not specified, the default is to start at the first position and end at the last position in the sequence. .PP The \-\-fastx_getseqs command is similar to the \-\-fastx_getseq command but allows more flexibility in specifying the label(s) to be matched. A single label may be specified using the \-\-label option as described above. Alternatively, a file containing a list of labels to be matched may be specified with the \-\-labels option. The file must be a plain text file with one label on each line. 
The \-\-label_word and \-\-label_words options may be used to specify either a single word or a file containing a list of words, respectively, to be matched. Words are defined as character sequences delimited either by a character that is not alpha-numeric (A-Z, a-z, or 0-9) or by the beginning or end of the header. Word matching is case-sensitive. The \-\-label_field option will limit the matching of words to a certain field in the header. .PP .TAG fastaout .TP 9 .BI \-\-fastaout \0filename Write the extracted sequences in FASTA format to the file with the given name. .TAG fastqout .TP .BI \-\-fastqout \0filename Write the extracted sequences in FASTQ format to the file with the given name. This option is illegal if the input is in FASTA format. .TAG fastx_getseq .TP .BI \-\-fastx_getseq \0filename Extract sequences from the given FASTA or FASTQ file. Specify a label to match using the \-\-label option. Output files are specified with the \-\-fastaout, \-\-fastqout, \-\-notmatched and \-\-notmatchedfq options. .TAG fastx_getseqs .TP .BI \-\-fastx_getseqs \0filename Extract sequences from the given FASTA or FASTQ file. Specify the label or labels to match using one of the following options: \-\-label, \-\-labels, \-\-label_word, or \-\-label_words. Output files are specified with the \-\-fastaout, \-\-fastqout, \-\-notmatched and \-\-notmatchedfq options. .TAG fastx_getsubseq .TP .BI \-\-fastx_getsubseq \0filename Extract a certain part of some of the sequences in the given FASTA or FASTQ file. Specify labels to match using the \-\-label option. Specify the subsequence range to be extracted with the \-\-subseq_start and \-\-subseq_end options. Output files are specified with the \-\-fastaout, \-\-fastqout, \-\-notmatched and \-\-notmatchedfq options. .TAG label .TP .BI \-\-label \0string Specify the label to match in the sequence header. Unless the \-\-label_substr_match option is given, the label must match the entire header. The comparison is not case-sensitive. 
.TAG label_field .TP .BI \-\-label_field \0string Specify a field name to be used when matching using the \-\-label_word or \-\-label_words option. The field name is a string like "abc" that must precede the word to be matched with an equals sign (=) in between. The field must be delimited by semicolons or the beginning or end of the header. The following header will match the label 123 in the field abc: "seq1;abc=123". .TAG label_substr_match .TP .BI \-\-label_substr_match The labels specified with the \-\-label or the \-\-labels option may match anywhere in the header if this option is given. Otherwise a label needs to match the entire header. .TAG label_word .TP .BI \-\-label_word \0string Specify a word to match in the sequence header. Words are defined as strings delimited by either the start or end of the header or by any symbol that is not a letter (A-Z, a-z) or digit (0-9). The comparison is case-sensitive. .TAG label_words .TP .BI \-\-label_words \0filename Specify a file containing words to be matched against the sequence headers. The plain text file must contain one word on each line. Words are defined as strings delimited by either the start or end of the header or by any symbol that is not a letter (A-Z, a-z) or digit (0-9). The comparison is case-sensitive. .TAG labels .TP .BI \-\-labels \0filename Specify a file containing labels to be matched against the sequence headers. The plain text file must contain one label on each line. Unless the \-\-label_substr_match option is given, a label must match the entire header. The comparison is not case-sensitive. .TAG notmatched .TP .BI \-\-notmatched \0filename Write the sequences that were not extracted to the file with the given name, in FASTA format. .TAG notmatchedfq .TP .BI \-\-notmatchedfq \0filename Write the sequences that were not extracted to the file with the given name, in FASTQ format. This option is illegal if the input is in FASTA format. 
.TAG subseq_end .TP .BI \-\-subseq_end\~ "positive integer" Specify the end position in the sequences when extracting subsequences using the \-\-fastx_getsubseq command. Positions are 1-based, so the sequences start at position 1. The default is to end at the end of the sequence if this option is not specified. .TAG subseq_start .TP .BI \-\-subseq_start\~ "positive integer" Specify the starting position in the sequences when extracting subsequences using the \-\-fastx_getsubseq command. Positions are 1-based, so the sequences start at position 1. The default is to start at the beginning of the sequence (position 1), if this option is not specified. .RE .PP .\" ---------------------------------------------------------------------------- .TAG fasta-fastq-file-processing-options FASTA/FASTQ/SFF file processing options: .RS .PP Analyse, trim, filter, convert, merge, join or reverse complement sequences in FASTA, FASTQ or SFF files. The \-\-fastq_chars command can be used to analyse FASTQ files to identify the quality encoding and the range of quality score values used. To convert between different FASTQ file variants, use the \-\-fastq_convert command. Statistical analysis of the quality and length of the sequences in a FASTQ file may be performed with the \-\-fastq_stats, \-\-fastq_eestats, and \-\-fastq_eestats2 commands. Sequences may be trimmed, filtered and converted by the \-\-fastq_filter or \-\-fastx_filter commands. The \-\-sff_convert command can be used to convert SFF files to FASTQ, while the \-\-fasta2fastq command will convert a FASTA file to a FASTQ file with fake quality scores. Paired-end reads can be merged using the \-\-fastq_mergepairs command or joined with the \-\-fastq_join command. The \-\-fastx_revcomp command will reverse-complement sequences.
.PP .TAG eeout .TP 9 .B \-\-eeout When using \-\-fastq_filter, \-\-fastx_filter or \-\-fastq_mergepairs, include the number of expected errors (ee) in the sequence header of FASTQ and FASTA output files. This option is a synonym of the \-\-fastq_eeout option. Use the \-\-xee option to remove this information from headers. .TAG eetabbedout .TP .BI \-\-eetabbedout \0filename When specified with the \-\-fastq_mergepairs command, write statistics with expected errors of each merged read to the given file. The file is a tab separated file with four columns: The number of expected errors in the forward read, the number of expected errors in the reverse read, the number of observed errors in the forward read, and the number of observed errors in the reverse read. The observed number of errors are the number of differences in the overlap region of the merged sequence relative to each of the reads in the pair. .TAG fasta2fastq .TP .BI \-\-fasta2fastq \0filename Add a fake nucleotide quality score to the sequences in the given FASTA file and write them to the FASTQ file specified with the \-\-fastqout option. The quality score may be adjusted using the \-\-fastq_qmaxout option (default 41). The \-\-fastq_asciiout option may be used to adjust the FASTQ output quality ASCII base character (default 33). .TAG fastaout .TP .BI \-\-fastaout \0filename When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, write to the given FASTA-formatted file the sequences passing the filter, or the merged sequences. .TAG fastaout_rev .TP .BI \-\-fastaout_rev \0filename When using \-\-fastq_filter, or \-\-fastx_filter, write to the given FASTA-formatted file the reverse reads passing the filter. .TAG fastaout_notmerged_fwd .TP .BI \-\-fastaout_notmerged_fwd \0filename When using \-\-fastq_mergepairs, write forward reads not merged to the specified FASTA file. 
.TAG fastaout_notmerged_rev .TP .BI \-\-fastaout_notmerged_rev \0filename When using \-\-fastq_mergepairs, write reverse reads not merged to the specified FASTA file. .TAG fastaout_discarded .TP .BI \-\-fastaout_discarded \0filename Write sequences that do not pass the filter of the \-\-fastq_filter or \-\-fastx_filter command to the given FASTA-formatted file. .TAG fastaout_discarded_rev .TP .BI \-\-fastaout_discarded_rev \0filename Write reverse reads that do not pass the filter of the \-\-fastq_filter or \-\-fastx_filter command to the given FASTA-formatted file. .TAG fastq_allowmergestagger .TP .B \-\-fastq_allowmergestagger When using \-\-fastq_mergepairs, allow merging of staggered read pairs. Staggered pairs are pairs where the 3' end of the reverse read has an overhang to the left of the 5' end of the forward read. This situation can occur when a very short fragment is sequenced. The 3' overhang of the reverse read is not included in the merged sequence. The opposite option is the \-\-fastq_nostagger option. The default is to discard staggered pairs. .TAG fastq_ascii .TP .BI \-\-fastq_ascii\~ "positive integer" Define the ASCII character number used as the basis for the FASTQ quality score. The default is 33, which is used by the Sanger / Illumina 1.8+ FASTQ format (phred+33). The value 64 is used by the Solexa, Illumina 1.3+ and Illumina 1.5+ formats (phred+64). Only 33 and 64 are valid arguments. .TAG fastq_asciiout .TP .BI \-\-fastq_asciiout\~ "positive integer" When using \-\-fastq_convert, \-\-sff_convert or \-\-fasta2fastq, define the ASCII character number used as the basis for the FASTQ quality score when writing FASTQ output files. The default is 33. Only 33 and 64 are valid arguments. .TAG fastq_chars .TP .BI \-\-fastq_chars \0filename Summarize the composition of sequence and quality strings contained in the input FASTQ file. 
For each sequence symbol, \-\-fastq_chars gives the number of occurrences of the symbol, its relative frequency and the length of the longest run of that symbol. For each character present in the quality strings, \-\-fastq_chars gives the ASCII value of the character, its relative frequency, and the number of times a \fIk\fR-mer of that character appears at the end of quality strings. The length of the \fIk\fR-mer can be set using \-\-fastq_tail (4 by default). The command \-\-fastq_chars tries to automatically detect the quality encoding (Solexa, Illumina 1.3+, Illumina 1.5+ or Illumina 1.8+/Sanger) by analyzing the range of observed quality score values. In case of success, \-\-fastq_chars suggests values for the \-\-fastq_ascii (33 or 64), \-\-fastq_qmin and \-\-fastq_qmax options to be used with the other commands that require a FASTQ input file. .TAG fastq_convert .TP .BI \-\-fastq_convert \0filename Convert between the different variants of the FASTQ file format. The quality encoding of the input file must be specified with the \-\-fastq_ascii option (either 33 or 64, the default is 33), and the output quality encoding must be specified with the \-\-fastq_asciiout option (default 33). The minimum and maximum output quality scores may be limited using the \-\-fastq_qminout and \-\-fastq_qmaxout options. The output file is specified with the \-\-fastqout option. .TAG fastq_eeout .TP .B \-\-fastq_eeout When using \-\-fastq_filter, \-\-fastx_filter or \-\-fastq_mergepairs, include the number of expected errors (ee) in the sequence header of FASTQ and FASTA files. This option is a synonym of the \-\-eeout option. Use the \-\-xee option to remove this information from headers. .TAG fastq_eestats .TP .BI \-\-fastq_eestats \0filename Analyze a FASTQ file and report statistics on the distributions of quality scores, error probabilities and expected accumulated errors. 
The report, a table of 21 tab-separated columns, is written to the file specified with the \-\-output option. The first column corresponds to the position in the reads (Pos). The second and third columns correspond to the number of reads (Reads) and percentage of reads (PctRecs) that include this position. The remaining columns include information about the distribution of quality scores in this position (Q), error probabilities in this position (Pe), and finally the expected number of accumulated errors from the beginning of the reads and until the current position (EE). For each of the Q, Pe and EE distributions, the following statistics are included: minimum value (Min), lower quartile (Low), median (Med), mean (Mean), upper quartile (Hi), and maximum value (Max). The quality encoding and the range of quality values may be specified with \-\-fastq_ascii, \-\-fastq_qmin and \-\-fastq_qmax. .TAG fastq_eestats2 .TP .BI \-\-fastq_eestats2 \0filename Analyze the specified FASTQ file and report statistics on the number of sequences that would be retained at a combination of selected cutoffs for length truncation and maximum expected errors, that could potentially be used as arguments to the \-\-fastq_trunclen and \-\-fastq_maxee options to the \-\-fastq_filter command. The result, a table of two or more columns, is written to the file specified with the \-\-output option. There is a line for each length truncation cutoff. The first column on each line contains the selected truncation length, while the following columns contain the number of sequences and, in parentheses, the percentage of sequences that would be retained at the selected EE levels. The truncation length cutoffs may be specified with the \-\-length_cutoffs option, which requires a list of three comma-separated integers indicating the shortest cutoff, the longest cutoff, and the increment between cutoffs.
The longest cutoff may be specified with a star (*) which indicates that the limit is equal to the longest sequence in the input file. The default setting is "50,*,50" meaning that truncation lengths of 50, 100, 150 and so on up to the longest sequence length should be used. The maximum expected error (EE) cutoffs may be specified with the \-\-ee_cutoffs option which requires a comma-separated list of floating point numbers as its argument. The default setting is "0.5,1.0,2.0" that indicates that expected error levels of 0.5, 1.0 and 2.0 should be used. .TAG fastq_filter .TP .BI \-\-fastq_filter \0filename Trim and/or filter sequences in the given FASTQ file. Similar to the \-\-fastx_filter command, but works only on FASTQ files. See \-\-fastx_filter for details. .TAG fastq_join .TP .BI \-\-fastq_join\0 filename Join paired-end sequence reads into one sequence and add a gap between them using a padding sequence. The sequences are not merged as with the \-\-fastq_mergepairs command, but simply joined with a gap. The forward reads are specified as the argument to this option and the reverse reads are specified with the \-\-reverse option. The resulting sequences consist of the forward read, the padding sequence and the reverse complement of the reverse read. The padding sequence is specified with the \-\-join_padgap option and the padding quality is specified with the \-\-join_padgapq option. The default padding sequence string is NNNNNNNN and the default padding quality string is IIIIIIII, corresponding to a base quality score of 40 (a very high quality score with error probability 0.0001). The joined sequences are output to the file(s) specified with the \-\-fastaout or \-\-fastqout options. .TAG fastq_maxdiffs .TP .BI \-\-fastq_maxdiffs\~ "positive integer" When using \-\-fastq_mergepairs, specify the maximum number of non-matching nucleotides allowed in the overlap region. That option has a strong influence on the merging success rate. The default value is 10.
.TAG fastq_maxdiffpct .TP .BI \-\-fastq_maxdiffpct\~ real When using \-\-fastq_mergepairs, specify the maximum percentage of non-matching nucleotides allowed in the overlap region. The default value is 100.0%. There are other more sophisticated rules in the merging algorithm that will discard read pairs with a high fraction of mismatches. .TAG fastq_maxee .TP .BI \-\-fastq_maxee\~ real When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, discard sequences with an expected error greater than the specified number (value ranging from 0.0 to infinity). For a given sequence, the expected error is the sum of error probabilities for all the positions in the sequence. Since error probabilities can be small but not null, the expected error is always greater than zero, and at most equal to the length of the sequence when all positions in the sequence have an error probability of 1.0. Using the expected error as the \fIlambda\fR parameter in the Poisson distribution, it is possible to compute the probability of observing \fIk\fR errors. For instance, a read with an expected error of 1.0 has: .RS .IP - 2 36.8% chance of having zero error, .IP - 36.8% chance of having one error, .IP - 18.4% chance of having two errors, .IP - 6.1% chance of having three errors, .IP - 1.5% chance of having four errors, .IP - 0.3% chance of having five errors, .IP - etc. .RE .PP .TAG fastq_maxee_rate .TP .BI \-\-fastq_maxee_rate\~ real When using \-\-fastq_filter or \-\-fastx_filter, discard sequences with an average expected error greater than the specified number (value ranging from 0.0 to 1.0 included). For a given sequence, the average expected error is the sum of error probabilities for all the positions in the sequence, divided by the length of the sequence. .TAG fastq_maxlen .TP .BI \-\-fastq_maxlen\~ "positive integer" When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, discard sequences with more than the specified number of bases. 
.TAG fastq_maxmergelen .TP .BI \-\-fastq_maxmergelen\~ "positive integer" When using \-\-fastq_mergepairs, specify the maximum length of the merged sequence (default is 1,000,000). .TAG fastq_maxns .TP .BI \-\-fastq_maxns\~ "positive integer" When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, discard sequences with more than the specified number of N's. .TAG fastq_mergepairs .TP .BI \-\-fastq_mergepairs\0 filename Merge paired-end sequence reads into one sequence. The forward reads are specified as the argument to this option and the reverse reads are specified with the \-\-reverse option. Reads with the same index/position in the forward and reverse files are considered to form a pair, even if their labels are different. Thus, forward and reverse reads \fBmust\fR appear in the same order and total number in both files. A warning is emitted if the forward and reverse files contain different numbers of reads. The merged sequences are written to the file(s) specified with the \-\-fastaout or \-\-fastqout options. The non-merged reads can be output to the files specified with the \-\-fastaout_notmerged_fwd, \-\-fastaout_notmerged_rev, \-\-fastqout_notmerged_fwd and \-\-fastqout_notmerged_rev options. Statistics may be output to the file specified with the \-\-eetabbedout option. Sequences are truncated as specified with the \-\-fastq_truncqual option to remove low-quality bases in the 3' end. Sequences shorter than specified with \-\-fastq_minlen (after truncation) are discarded (1 by default). Sequences with too many ambiguous bases (N's), as specified with the \-\-fastq_maxns are also discarded (no limit by default). Staggered reads are not merged unless the \-\-fastq_allowmergestagger option is specified. The minimum length of the overlap region between the reads may be specified with the \-\-fastq_minovlen option (at least 5, default 10). 
The overlap region may not include more mismatches than specified with the \-\-fastq_maxdiffs option (10 by default) or a higher percentage of mismatches than specified with the \-\-fastq_maxdiffpct option (100.0% by default), otherwise the read pair is discarded. Additional rules will avoid merging of reads that cannot be aligned reliably and unambiguously. The minimum and maximum length of the merged sequence may be specified with the \-\-fastq_minmergelen and \-\-fastq_maxmergelen options, respectively. The quality value limits for output files may be specified with the \-\-fastq_qminout and \-\-fastq_qmaxout options, but they apply only to the merged region. Other relevant options are: \-\-fastq_ascii, \-\-fastq_maxee, \-\-fastq_nostagger, \-\-fastq_qmax, \-\-fastq_qmin, and \-\-label_suffix. .TAG fastq_minlen .TP .BI \-\-fastq_minlen\~ "positive integer" When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, discard input sequences with less than the specified number of bases (default 1). .TAG fastq_minmergelen .TP .BI \-\-fastq_minmergelen\~ "positive integer" When using \-\-fastq_mergepairs, specify the minimum length of the merged sequence. The default is 1. .TAG fastq_minovlen .TP .BI \-\-fastq_minovlen\~ "positive integer" When using \-\-fastq_mergepairs, specify the minimum overlap between the merged reads. The default is 10. Must be at least 5. .TAG fastq_minqual .TP .BI \-\-fastq_minqual\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, discard reads having any base with a quality score below the given value. The default is 0, which discards none. .TAG fastq_nostagger .TP .B \-\-fastq_nostagger When using \-\-fastq_mergepairs, forbid the merging of staggered read pairs. This is the default behaviour of \-\-fastq_mergepairs. To change that behaviour, see the \-\-fastq_allowmergestagger option. 
.TAG fastq_qmax .TP .BI \-\-fastq_qmax\~ "positive integer" Specify the maximum quality score accepted when reading FASTQ files. The default is 41, which is usual for recent Sanger/Illumina 1.8+ files. .TAG fastq_qmaxout .TP .BI \-\-fastq_qmaxout\~ "positive integer" When using \-\-fastq_mergepairs, \-\-fastq_convert, \-\-sff_convert or \-\-fasta2fastq, specify the maximum quality score used when writing FASTQ files. For the \-\-fasta2fastq command, the value specified here is the fake quality score used for the FASTQ output file. The default is 41, which is usual for recent Sanger/Illumina 1.8+ files. Older formats may use a maximum quality score of 40. The limit only applies to the merged region when using \-\-fastq_mergepairs. .TAG fastq_qmin .TP .BI \-\-fastq_qmin\~ "positive integer" Specify the minimum quality score accepted for FASTQ files. The default is 0, which is usual for recent Sanger/Illumina 1.8+ files. Older formats may use scores between -5 and 2. .TAG fastq_qminout .TP .BI \-\-fastq_qminout\~ "positive integer" When using \-\-fastq_mergepairs, \-\-fastq_convert or \-\-sff_convert, specify the minimum quality score used when writing FASTQ files. The default is 0, which is usual for Sanger/Illumina 1.8+ files. Older versions of the format may use scores between -5 and 2. The limit applies only to the merged region when using \-\-fastq_mergepairs. .TAG fastq_stats .TP .BI \-\-fastq_stats \0filename Analyze a FASTQ file and report the number of reads it contains. The quality encoding and the range of quality values may be specified with \-\-fastq_ascii, \-\-fastq_qmin and \-\-fastq_qmax. That command requires the \-\-log option and outputs the following detailed statistics on read length, quality score, length vs. quality distributions, and length / quality filtering: .RS .TP Read length distribution: .RS .nr step 1 1 .IP \n[step]. 4 L: read length. .IP \n+[step]. N: number of reads. .IP \n+[step]. Pct: fraction of reads with this length.
.IP \n+[step]. AccPct: fraction of reads with this length or longer. .RE .TP Quality score distribution: .RS .nr step 1 1 .IP \n[step]. 4 ASCII: character encoding the quality score. .IP \n+[step]. Q: Phred quality score. .IP \n+[step]. Pe: probability of error associated with the quality score. .IP \n+[step]. N: number of bases with this quality score. .IP \n+[step]. Pct: fraction of bases with this quality score. .IP \n+[step]. AccPct: fraction of bases with this quality score or higher. .RE .TP Length vs. quality distribution: .RS .nr step 1 1 .IP \n[step]. 4 L: position in reads (starting from position 2). .IP \n+[step]. PctRecs: fraction of reads with at least this length. .IP \n+[step]. AvgQ: average quality score over all reads up to this position. .IP \n+[step]. P(AvgQ): error probability corresponding to AvgQ. .IP \n+[step]. AvgP: average error probability. .IP \n+[step]. AvgEE: average expected error over all reads up to this position. .IP \n+[step]. Rate: growth rate of AvgEE between this position and position - 1. .IP \n+[step]. RatePct: Rate (as explained above) expressed as a percentage. .RE .TP Effect of expected error and length filtering: .RS The first column indicates read lengths (\fIL\fR). The next four columns indicate the number of reads that would be retained by the \-\-fastq_filter command if the reads were truncated at length \fIL\fR (option \-\-fastq_trunclen \fIL\fR) and filtered to have a maximum expected error of 1.0, 0.5, 0.25 or 0.1 (with the option \-\-fastq_maxee \fIfloat\fR). The last four columns indicate the fraction of reads that would be retained by the \-\-fastq_filter command using the same length and maximum expected error parameters. .RE .TP Effect of minimum quality and length filtering: .RS The first column indicates read lengths (\fILen\fR).
The next four columns indicate the fraction of reads that would be retained by the \-\-fastq_filter command if the reads were truncated at length \fILen\fR (option \-\-fastq_trunclen \fILen\fR) or at the first position with a quality \fIQ\fR below 5, 10, 15 or 20 (option \-\-fastq_truncqual \fIQ\fR). .RE .RE .TAG fastq_stripleft .TP .BI \-\-fastq_stripleft\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, strip the specified number of bases from the left end of the reads. If the length of the resulting read is null, then the read is discarded. .TAG fastq_stripright .TP .BI \-\-fastq_stripright\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, strip the specified number of bases from the right end of the reads. If the length of the resulting read is null, then the read is discarded. .TAG fastq_tail .TP .BI \-\-fastq_tail\~ "positive integer" When using \-\-fastq_chars, count the number of times a series of characters of length \fIk\fR appears at the end of quality strings. By default, \fIk\fR = 4. .TAG fastq_truncee .TP .BI \-\-fastq_truncee\~ real When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences so that their total expected error is not higher than the specified value. .TAG fastq_truncee_rate .TP .BI \-\-fastq_truncee_rate\~ real When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences so that their average expected error per base is not higher than the specified value. The truncation will happen at the first occurrence. The average expected error per base is calculated as the total expected number of errors divided by the length of the sequence after truncation. .TAG fastq_trunclen .TP .BI \-\-fastq_trunclen\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences to the specified length. Shorter sequences are discarded.
.TAG fastq_trunclen_keep .TP .BI \-\-fastq_trunclen_keep\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences to the specified length. Shorter sequences are not discarded. .TAG fastq_truncqual .TP .BI \-\-fastq_truncqual\~ "positive integer" When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, truncate sequences starting from the first base with the specified base quality score value or lower. .TAG fastqout .TP .BI \-\-fastqout \0filename When using \-\-fastq_filter, \-\-fastq_mergepairs, \-\-fastx_filter or \-\-fasta2fastq, write to the given FASTQ-formatted file the sequences passing the filter, or the merged or converted sequences. .TAG fastqout_rev .TP .BI \-\-fastqout_rev \0filename When using \-\-fastq_filter or \-\-fastx_filter, write to the given FASTQ-formatted file the reverse reads passing the filter. .TAG fastqout_discarded .TP .BI \-\-fastqout_discarded \0filename When using \-\-fastq_filter or \-\-fastx_filter, write sequences that do not pass the filter to the given FASTQ-formatted file. .TAG fastqout_discarded_rev .TP .BI \-\-fastqout_discarded_rev \0filename When using \-\-fastq_filter or \-\-fastx_filter, write reverse reads that do not pass the filter to the given FASTQ-formatted file. .TAG fastqout_notmerged_fwd .TP .BI \-\-fastqout_notmerged_fwd \0filename When using \-\-fastq_mergepairs, write forward reads not merged to the specified FASTQ file. .TAG fastqout_notmerged_rev .TP .BI \-\-fastqout_notmerged_rev \0filename When using \-\-fastq_mergepairs, write reverse reads not merged to the specified FASTQ file. .TAG fastx_filter .TP .BI \-\-fastx_filter \0filename Trim and/or filter the sequences in the given FASTA or FASTQ file and output the remaining sequences to the FASTQ file specified with the \-\-fastqout option and/or to the FASTA file specified with the \-\-fastaout option. 
Discarded sequences are written to the files specified with the \-\-fastaout_discarded and \-\-fastqout_discarded options. The input format (FASTA or FASTQ) is automatically detected. If the input consists of paired sequences, an input file with reverse reads may be specified with the \-\-reverse option, and corresponding output will be written to the files specified with the \-\-fastqout_rev, \-\-fastaout_rev, \-\-fastqout_discarded_rev, and \-\-fastaout_discarded_rev options. Output can not be written to FASTQ files if the input is in FASTA format. The sequences are first trimmed and then filtered based on the remaining bases. Sequences may be trimmed using the options \-\-fastq_stripleft, \-\-fastq_stripright, \-\-fastq_truncee, \-\-fastq_truncee_rate, \-\-fastq_trunclen, \-\-fastq_trunclen_keep and \-\-fastq_truncqual. The sequences may be filtered using the options \-\-fastq_maxee, \-\-fastq_maxee_rate, \-\-fastq_maxlen, \-\-fastq_maxns, \-\-fastq_minlen (default 1), \-\-fastq_minqual, \-\-fastq_trunclen, \-\-maxsize, and \-\-minsize. Sequences not satisfying the requirements are discarded. For pairs of sequences, both sequences in a pair must satisfy the requirements, otherwise both are discarded. If no shortening or filtering options are given, all sequences are written to the output files, possibly after conversion from FASTQ to FASTA format. The \-\-relabel option may be used to relabel the output sequences. The \-\-eeout option may be used to output the expected number of errors in each sequence. After all sequences have been processed, the number of kept and discarded sequences will be shown, as well as how many of the kept sequences were trimmed. 
When the input is in FASTA format, the following options are not accepted because quality scores are not available: \-\-eeout, \-\-fastq_ascii, \-\-fastq_eeout, \-\-fastq_maxee, \-\-fastq_maxee_rate, \-\-fastq_minqual, \-\-fastqout, \-\-fastq_qmax, \-\-fastq_qmin, \-\-fastq_truncee, \-\-fastq_truncee_rate, \-\-fastq_truncqual, \-\-fastqout_discarded, \-\-fastqout_discarded_rev, \-\-fastqout_rev. .TAG fastx_revcomp .TP .BI \-\-fastx_revcomp \0filename Reverse-complement the sequences in the given FASTA or FASTQ file to a file specified with the \-\-fastaout and/or \-\-fastqout options. If the input file is in FASTA format, the output can not be written back to a FASTQ file due to missing base quality scores. .TAG join_padgap .TP .BI \-\-join_padgap\~ string When running \-\-fastq_join, use the \fIstring\fR as a sequence padding string. The default is NNNNNNNN (8 N's). .TAG join_padgapq .TP .BI \-\-join_padgapq\~ string When running \-\-fastq_join, use the \fIstring\fR as a quality padding string. The default is a string of I's equal in length to the sequence padding string. The letter I corresponds to a base quality score of 40 indicating a very high quality base with error probability of 0.0001. .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA or FASTQ format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG maxsize .TP .BI \-\-maxsize\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, discard sequences with an abundance higher than the specified value. .TAG minsize .TP .BI \-\-minsize\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, discard sequences with an abundance lower than the specified value. .TAG output .TP .BI \-\-output \0filename When using \-\-fastq_eestats or \-\-fastq_eestats2, write tabulated results to \fIfilename\fR. See \-\-fastq_eestats's and \-\-fastq_eestats2's documentation for a complete description of the table.
.TAG relabel_keep .TP .B \-\-relabel_keep When using \-\-relabel, keep the old identifier in the header after a space. .TAG relabel .TP .BI \-\-relabel \0string Please see the description of the same option under Chimera detection for details. .TAG relabel_md5 .TP .BI \-\-relabel_md5 Please see the description of the same option under Chimera detection for details. .TAG relabel_self .TP .BI \-\-relabel_self Please see the description of the same option under Chimera detection for details. .TAG relabel_sha1 .TP .BI \-\-relabel_sha1 Please see the description of the same option under Chimera detection for details. .TAG reverse .TP .BI \-\-reverse \0filename When using \-\-fastq_filter, \-\-fastx_filter, \-\-fastq_mergepairs or \-\-fastq_join, specify the FASTQ file containing the reverse reads. .TAG sff_convert .TP .BI \-\-sff_convert \0filename Convert the given SFF file to FASTQ. The FASTQ output file is specified with the \-\-fastqout option. The sequence may be clipped as specified in the SFF file if the option \-\-sff_clip is specified, otherwise no clipping occurs. Bases that would have been clipped are converted to lower case, while the rest is in upper case. The output quality encoding may be specified with the \-\-fastq_asciiout option (default 33). The minimum and maximum output quality scores may be limited using the \-\-fastq_qminout and \-\-fastq_qmaxout options. .TAG sff_clip .TP .BI \-\-sff_clip Specifies that the sequences converted by the \-\-sff_convert command should be clipped at both ends as indicated in the SFF file. By default no clipping is performed. .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .TAG xee .TP .B \-\-xee Strip information about expected errors (ee) from the output file headers.
This information is added by the \-\-fastq_eeout and \-\-eeout options. .RE .PP .\" ---------------------------------------------------------------------------- .TAG masking-options Masking options: .RS .PP An input sequence can be composed of lower- or uppercase letters. When soft masking is specified, lower case letters are treated as symbols that should be masked. Otherwise the case of the input sequences is ignored. .PP Masking is performed by the commands for chimera detection (uchime_denovo, uchime_ref), clustering (cluster_fast, cluster_smallmem, cluster_size), masking (maskfasta, fastx_mask), pairwise alignment (allpairs_global) and searching (search_exact, usearch_global). .PP Masking is usually specified with the \-\-qmask option, while the \-\-dbmask option is used for the database sequences specified with the \-\-db option with the \-\-usearch_global, \-\-search_exact and \-\-uchime_ref commands. .PP The argument to the \-\-qmask and \-\-dbmask options may be none, soft or dust. If the argument is none, then no masking is performed. If the argument is soft, the lower case symbols are masked. Finally, if the argument is dust, the sequence is masked using the DUST algorithm by Tatusov and Lipman to mask low-complexity regions. .PP If the \-\-hardmask option is specified, all masked regions are converted to N's, otherwise masked regions are indicated by lower case letters. .PP If any sequence is masked, the masked version of the sequence (with lower case letters or N's) is used in all output files. Otherwise the sequence is unmodified. The exception is the sequences in the output file specified with the \-\-uchimealns option, where the input sequences are converted to upper case first and lower case letters indicate disagreement between the aligned sequences. .PP The \-\-qmask option (or \-\-dbmask for database sequences) may be combined with the \-\-hardmask option.
The results of using the none, dust or soft argument to \-\-qmask or \-\-dbmask are presented below, assuming each input sequence contains both lower and uppercase symbols. .PP Results if the \-\-hardmask option is off (default): .RS .TP 9 .B none: no masking, all symbols used, no change .TP .B dust: masked symbols lowercased, rest uppercased .TP .B soft: lowercase symbols masked, no case changes .RE .PP Results if the \-\-hardmask option is on: .RS .TP 9 .B none: no masking, all symbols used, no change .TP .B dust: masked symbols changed to Ns, rest unchanged .TP .B soft: lowercase symbols masked and changed to Ns .RE .PP When a sequence region is masked, words in the region are not included in the indices used in the heuristic search algorithm. In all other aspects, the region is treated as other regions. .PP Regions in sequences that are hardmasked (with N's) have a zero alignment score and do not contribute to an alignment. .RE .PP .RS .TAG fastaout .TP 9 .BI \-\-fastaout \0filename Write the masked sequences to \fIfilename\fR, in fasta format. Applies only to the \-\-fastx_mask command. .TAG fastqout .TP .BI \-\-fastqout \0filename Write the masked sequences to \fIfilename\fR, in fastq format. Applies only to the \-\-fastx_mask command. .TAG fastx_mask .TP .BI \-\-fastx_mask \0filename Mask regions in sequences contained in the specified fasta or fastq file. The default is to mask using DUST (use \-\-qmask to modify that behaviour). The output files are specified with the \-\-fastaout and \-\-fastqout options. The minimum and maximum percentage of unmasked residues may be specified with the \-\-min_unmasked_pct and \-\-max_unmasked_pct options, respectively. .TAG hardmask .TP .B \-\-hardmask Symbols in masked regions are replaced by N's. The default is to replace the masked regions by lower case letters. .TAG maskfasta .TP .BI \-\-maskfasta \0filename Mask regions in sequences contained in the fasta file \fIfilename\fR. 
The default is to mask using \fIdust\fR (use \-\-qmask to modify that behaviour). The output file is specified with the \-\-output option. This command is deprecated, please use \-\-fastx_mask instead. .TAG max_unmasked_pct .TP .BI \-\-max_unmasked_pct \0real Discard sequences with more than the specified maximum percentage of unmasked residues. Works only with \-\-fastx_mask. .TAG min_unmasked_pct .TP .BI \-\-min_unmasked_pct \0real Discard sequences with less than the specified minimum percentage of unmasked residues. Works only with \-\-fastx_mask. .TAG output .TP .BI \-\-output \0filename Write the masked sequences to \fIfilename\fR, in fasta format. Applies only to the \-\-maskfasta command. .TAG qmask .TP .BI \-\-qmask\~ "none|dust|soft" If the argument is dust, mask regions in sequences using the \fIDUST\fR algorithm that detects simple repeats and low-complexity regions. This is the default. If the argument is soft, mask the lower case letters in the input sequence. If the argument is none, do not mask. .RE .PP .\" ---------------------------------------------------------------------------- .TAG orienting-options Orienting options: .RS .PP The \-\-orient command can be used to orient the sequences in a given file in either the forward or the reverse complementary direction based on a reference database specified with the \-\-db option. The two strands of each input sequence are compared to the reference database using nucleotide words. If one of the strands shares many more words with at least one sequence in the database than the other, that strand is chosen. The correctly oriented sequences may be written to a FASTA file specified with the \-\-fastaout option, and to a FASTQ file specified with the \-\-fastqout option (as long as the input was also in FASTQ format). If the result is uncertain, because the number of matching words is too similar, the original sequence is written to the file specified with the \-\-notmatched option.
The results may also be written to a tab-delimited text file specified with the \-\-tabbedout option. This file will contain the query label, the direction (+, - or ?), the number of matching words on the forward strand, and the number of matching words on the reverse complementary strand. By default, a word length of 12 is used for this command. The word length may be adjusted using the \-\-wordlength option. There must be at least 4 times as many matches on one strand as on the other for a strand to be selected. In addition to the common options, the following options may also be specified for this command: \-\-dbmask, \-\-qmask, \-\-relabel, \-\-relabel_keep, \-\-relabel_md5, \-\-relabel_self, \-\-relabel_sha1, \-\-sizein, and \-\-sizeout. .PP .TAG db .TP 9 .BI \-\-db \0filename Read the reference database from the given file. It may be in FASTA, FASTQ or UDB format. If a UDB file is used, it should have been created with a wordlength of 12. .TAG fastaout .TP .BI \-\-fastaout \0filename Write the correctly oriented sequences to \fIfilename\fR, in fasta format. .TAG fastqout .TP .BI \-\-fastqout \0filename Write the correctly oriented sequences to \fIfilename\fR, in fastq format. .TAG notmatched .TP .BI \-\-notmatched \0filename Write the sequences with undetermined direction to \fIfilename\fR, in the original format. .TAG orient .TP .BI \-\-orient \0filename Orient the sequences in the given file. .TAG tabbedout .TP .BI \-\-tabbedout \0filename Write the results to a tab-delimited text file with the specified \fIfilename\fR. This file will contain the query label, the direction (+, - or ?), the number of matching words on the forward strand, and the number of matching words on the reverse complementary strand.
.RE .PP .\" ---------------------------------------------------------------------------- .TAG pairwise-alignment-options Pairwise alignment options: .RS .PP The results of the n * (n-1) / 2 pairwise alignments are written to the result files specified with \-\-alnout, \-\-blast6out, \-\-fastapairs, \-\-matched, \-\-notmatched, \-\-qsegout, \-\-samout, \-\-tsegout, \-\-uc or \-\-userout (see Searching section below). Specify either the \-\-acceptall option to output all pairwise alignments, or specify an identity level with \-\-id to discard weak alignments. Most other accept/reject options (see Searching options below) may also be used. Sequences are aligned on their \fIplus\fR strand only. Masking is performed as usual and specified with \-\-qmask and \-\-hardmask. .TAG acceptall .TP 9 .B \-\-acceptall Write the results of all alignments to output files. This option overrides all other accept/reject options (including \-\-id). .TAG allpairs_global .TP .BI \-\-allpairs_global \0filename Perform optimal global pairwise alignments of the fasta sequences contained in \fIfilename\fR. Each sequence is compared to all sequences that come after it in the file, resulting in a total of n * (n-1) / 2 pairwise alignments, where n is the total number of sequences. This command is multi-threaded. .TAG id .TP .BI \-\-id \0real Reject the sequence match if the pairwise identity is lower than \fIreal\fR (value ranging from 0.0 to 1.0 included). .TAG threads .TP .BI \-\-threads\~ "positive integer" Number of computation threads to use (1 to 1024). The number of threads should be less than or equal to the number of available CPU cores. The default is to use all available resources and to launch one thread per logical core. .TAG uc .TP .BI \-\-uc \0filename Output pairwise alignment results in \fIfilename\fR using a tab-separated uclust-like format with 10 columns.
Each sequence is compared to all other sequences, and all hits (\-\-acceptall) or only some hits (\-\-id \fIfloat\fR) are reported, with one pairwise comparison per line: .RS .RS .nr step 1 1 .IP \n[step]. 4 Record type, always set to 'H'. .IP \n+[step]. Ordinal number of the target sequence (based on input order, starting from zero). .IP \n+[step]. Sequence length. .IP \n+[step]. Percentage of similarity with the target sequence. .IP \n+[step]. Match orientation, always set to '+'. .IP \n+[step]. Not used, always set to zero. .IP \n+[step]. Not used, always set to zero. .IP \n+[step]. Compact representation of the pairwise alignment using the CIGAR format (Compact Idiosyncratic Gapped Alignment Report): M (match/mismatch), D (deletion) and I (insertion). The equal sign '=' indicates that the query is identical to the centroid sequence. .IP \n+[step]. Label of the query sequence. .IP \n+[step]. Label of the target sequence. .RE .RE .RE .PP .\" ---------------------------------------------------------------------------- .TAG restriction-site-cutting-options Restriction site cutting options: .RS .PP The input sequences in the file specified with the \-\-cut command are cut into fragments at all restriction sites matching the pattern given with the \-\-cut_pattern option. The fragments on the forward strand are written to the file specified with the \-\-fastaout file and the fragments on the reverse strand are written to the file specified with the \-\-fastaout_rev option. Input sequences that do not match are written to the file specified with the option \-\-fastaout_discarded, and their reverse complement are also written to the file specified with the \-\-fastaout_discarded_rev option. The relabel options (\-\-relabel, \-\-relabel_self, \-\-relabel_keep, \-\-relabel_md5, and \-\-relabel_sha1) may be used to relabel the output sequences. .TAG cut .TP 9 .BI \-\-cut \0filename Specify the input file with sequences in FASTA format.
.TAG cut_pattern .TP .BI \-\-cut_pattern \0string Specify the restriction site cutting pattern and positions. The pattern is a string of lower- or uppercase letters specifying the nucleotides that must match, and may include ambiguous nucleotide symbols. The special characters "^" (circumflex) and "_" (underscore) are used to indicate the cutting position on the forward and reverse strand, respectively. For example, the pattern "G^AATT_C" is the pattern for the EcoRI restriction site. For such palindromic patterns (identical to its reverse complement) the command will output all possible fragments on both strands. For non-palindromic sites, it may be necessary to run the command also on the reverse complemented input sequences. Exactly one cutting site on each strand must be indicated. .TAG fastaout .TP .BI \-\-fastaout \0filename Specify the output file for the resulting fragments on the forward strand. .TAG fastaout_rev .TP .BI \-\-fastaout_rev \0filename Specify the output file for the resulting fragments on the reverse strand. .TAG fastaout_discarded .TP .BI \-\-fastaout_discarded \0filename Specify the output file for the non-matching sequences. .TAG fastaout_discarded_rev .TP .BI \-\-fastaout_discarded_rev \0filename Specify the output file for the non-matching sequences, reverse complemented. .RE .PP .\" ---------------------------------------------------------------------------- .TAG searching-options Searching options: .RS .TAG alnout .TP 9 .BI \-\-alnout \0filename Write pairwise global alignments to \fIfilename\fR using a human-readable format. Use \-\-rowlen to modify alignment length. Output order may vary when using multiple threads. .TAG biomout .TP .BI \-\-biomout \0filename Write search results to an OTU table in the biom version 1.0 file format. The query file contains the samples, while the database file contains the OTUs. Sample and OTU identifiers are extracted from the header of these sequences. 
See the \-\-biomout option in the Clustering section for further details. .TAG blast6out .TP .BI \-\-blast6out \0filename Write search results to \fIfilename\fR using a blast-like tab-separated format of twelve fields (listed below), with one line per query-target matching (or lack of matching if \-\-output_no_hits is used). Warning, vsearch uses global pairwise alignments, not blast's seed-and-extend algorithm. Therefore, some common blast output values (alignment start and end, evalue, bit score) are reported differently. Output order may vary when using multiple threads. A similar output can be obtained with \-\-userout \fIfilename\fR and \-\-userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits. A complete list and description is available in the section 'Userfields' of this manual. .RS .RS .nr step 1 1 .IP \n[step]. 4 \fIquery\fR: query label. .IP \n+[step]. \fItarget\fR: target (database sequence) label. The field is set to '*' if there is no alignment. .IP \n+[step]. \fIid\fR: percentage of identity (real value ranging from 0.0 to 100.0). The percentage identity is defined as 100 * (matching columns) / (alignment length - terminal gaps). See fields id0 to id4 for other definitions. .IP \n+[step]. \fIalnlen\fR: length of the query-target alignment (number of columns). The field is set to 0 if there is no alignment. .IP \n+[step]. \fImism\fR: number of mismatches in the alignment (zero or positive integer value). .IP \n+[step]. \fIopens\fR: number of columns containing a gap opening (zero or positive integer value, excluding terminal gaps). .IP \n+[step]. \fIqlo\fR: first nucleotide of the query aligned with the target. Always equal to 1 if there is an alignment, 0 otherwise (see \fIqilo\fR to ignore initial gaps). .IP \n+[step]. \fIqhi\fR: last nucleotide of the query aligned with the target. Always equal to the length of the pairwise alignment, 0 otherwise (see \fIqihi\fR to ignore terminal gaps). .IP \n+[step].
\fItlo\fR: first nucleotide of the target aligned with the query. Always equal to 1 if there is an alignment, 0 otherwise (see \fItilo\fR to ignore initial gaps). .IP \n+[step]. \fIthi\fR: last nucleotide of the target aligned with the query. Always equal to the length of the pairwise alignment, 0 otherwise (see \fItihi\fR to ignore terminal gaps). .IP \n+[step]. \fIevalue\fR: expectancy-value (not computed for nucleotide alignments). Always set to -1. .IP \n+[step]. \fIbits\fR: bit score (not computed for nucleotide alignments). Always set to 0. .RE .RE .TAG db .TP .BI \-\-db \0filename Compare query sequences (specified with \-\-usearch_global) to the target sequences contained in \fIfilename\fR in FASTA or FASTQ format, using global pairwise alignment. Alternatively, the name of a preformatted UDB database created using the makeudb_usearch command (see below) may be specified. .TAG dbmask .TP .BI \-\-dbmask\~ "none|dust|soft" Mask regions in the target database sequences using the dust method or the soft method, or do not mask (none). Warning, when using soft masking search commands become case sensitive. The default is to mask using dust. .TAG dbmatched .TP .BI \-\-dbmatched \0filename Write database target sequences matching at least one query sequence to \fIfilename\fR, in fasta format. If the option \-\-sizeout is used, the number of queries that matched each target sequence is indicated using the pattern ";size=\fIinteger\fR;". .TAG dbnotmatched .TP .BI \-\-dbnotmatched \0filename Write database target sequences not matching query sequences to \fIfilename\fR, in fasta format. .TAG fastapairs .TP .BI \-\-fastapairs \0filename Write pairwise alignments of query and target sequences to \fIfilename\fR, in fasta format. .TAG fulldp .TP .B \-\-fulldp Dummy option for compatibility with usearch. 
To maximize search sensitivity, \fBvsearch\fR uses a 8-way 16-bit SIMD vectorized full dynamic programming algorithm (Needleman-Wunsch), whether or not \-\-fulldp is specified. .TAG gapext .TP .BI \-\-gapext \0string Set penalties for a gap extension. See \-\-gapopen for a complete description of the penalty declaration system. The default is to initialize the six gap extending penalties using a penalty of 2 for extending internal gaps and a penalty of 1 for extending terminal gaps, in both query and target sequences (i.e. 2I/1E). .TAG gapopen .TP .BI \-\-gapopen \0string Set penalties for a gap opening. A gap opening can occur in six different contexts: in the query (Q) or in the target (T) sequence, at the left (L) or right (R) extremity of the sequence, or inside the sequence (I). Sequence symbols (Q and T) can be combined with location symbols (L, I, and R), and numerical values to declare penalties for all possible contexts: aQL/bQI/cQR/dTL/eTI/fTR, where abcdef are zero or positive integers, and '/' is used as a separator. .br To simplify declarations, the location symbols (L, I, and R) can be combined, the symbol (E) can be used to treat both extremities (L and R) equally, and the symbols Q and T can be omitted to treat query and target sequences equally. For instance, the default is to declare a penalty of 20 for opening internal gaps and a penalty of 2 for opening terminal gaps (left or right), in both query and target sequences (i.e. 20I/2E). If only a numerical value is given, without any sequence or location symbol, then the penalty applies to all gap openings. To forbid gap-opening, an infinite penalty value can be declared with the symbol '*'. To use \fBvsearch\fR as a semi-global aligner, a null-penalty can be applied to the left (L) or right (R) gaps. .br \fBvsearch\fR always initializes the six gap opening penalties using the default parameters (20I/2E). The user is then free to declare only the values he/she wants to modify. 
The \fIstring\fR is scanned from left to right, accepted symbols are (0123456789/LIREQT*), and later values override previous values. .br Please note that \fBvsearch\fR, in contrast to usearch, only allows integer gap penalties. Because the lowest gap penalties are 0.5 by default in usearch, all default scores and gap penalties in \fBvsearch\fR have been doubled to maintain equivalent penalties and to produce identical alignments. .TAG hardmask .TP .B \-\-hardmask Mask sequence regions by replacing them with Ns instead of setting them to lower case as is the default. For more information, please see the Masking section. .TAG id .TP .BI \-\-id \0real Reject the sequence match if the pairwise identity is lower than \fIreal\fR (value ranging from 0.0 to 1.0 included). The search process sorts target sequences by decreasing number of \fIk\fR-mers they have in common with the query sequence, using that information as a proxy for sequence similarity. That efficient pre-filtering also prevents pairwise alignments with very short, or with weakly matching targets, as there needs to be by default at least 12 shared \fIk\fR-mers to start the pairwise alignment, and at least one out of every 16 \fIk\fR-mers from the query needs to match the target (see options \-\-wordlength and \-\-minwordmatches to change that behaviour). Consequently, using values lower than \-\-id 0.5 is not likely to capture more weakly matching targets. The pairwise identity is by default defined as the number of (matching columns) / (alignment length - terminal gaps). That definition can be modified by \-\-iddef. .TAG iddef .TP .BI \-\-iddef\~ "0|1|2|3|4" Change the pairwise identity definition used in \-\-id. Values accepted are: .RS .RS .nr step 0 1 .IP \n[step]. 4 CD-HIT definition: (matching columns) / (shortest sequence length). .IP \n+[step]. edit distance: (matching columns) / (alignment length). .IP \n+[step]. edit distance excluding terminal gaps (default definition for \-\-id). .IP \n+[step]. 
Marine Biological Lab definition counting each gap opening (internal or terminal) as a single mismatch, whether or not the gap was extended: 1.0 - [(mismatches + gap openings)/(longest sequence length)] .IP \n+[step]. BLAST definition, equivalent to \-\-iddef 1 for global pairwise alignments. .RE .PP The option \-\-userfields accepts the fields id0 to id4, in addition to the field id, to report the pairwise identity values corresponding to the different definitions. .RE .TAG idprefix .TP .BI \-\-idprefix\~ "positive integer" Reject the sequence match if the first \fIinteger\fR nucleotides of the target do not match the query. .TAG idsuffix .TP .BI \-\-idsuffix\~ "positive integer" Reject the sequence match if the last \fIinteger\fR nucleotides of the target do not match the query. .TAG lca_cutoff .TP .BI \-\-lca_cutoff \0real Adjust the fraction of matching hits required for the last common ancestor (LCA) output with the \-\-lcaout option during searches. The default value is 1.0 which requires all hits to match at each taxonomic rank for that rank to be included. If a lower cutoff value is used, e.g. 0.95, a small fraction of non-matching hits are allowed while that rank will still be reported. The argument to this option must be larger than 0.5, but not larger than 1.0. .TAG lcaout .TP .BI \-\-lcaout \0filename Output last common ancestor (LCA) information about the hits of each query to a text file in a tab-separated format. The first column contains the query id, while the second column contains the taxonomic information. The headers of the sequences in the database must contain taxonomic information in the same format as used with the \-\-sintax command, e.g. "tax=k:Archaea,p:Euryarchaeota,c:Halobacteria". Only the initial parts of the taxonomy that are common to a large fraction of the hits of each query will be output. It is necessary to set the \-\-maxaccepts option to a value different from 1 for this information to be useful. 
The \-\-top_hits_only option may also be useful. The fraction of matching hits required may be adjusted by the \-\-lca_cutoff option (default 1.0). .TAG leftjust .TP .B \-\-leftjust Reject the sequence match if the pairwise alignment begins with gaps. .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG match .TP .BI \-\-match\~ "integer" Score assigned to a match (i.e. identical nucleotides) in the pairwise alignment. The default value is 2. .TAG matched .TP .BI \-\-matched \0filename Write query sequences matching database target sequences to \fIfilename\fR, in fasta format. .TAG maxaccepts .TP .BI \-\-maxaccepts\~ "positive integer" Maximum number of matching target sequences to accept before stopping the search for a given query. The default value is 1. This option works in tandem with \-\-maxrejects. The search process sorts target sequences by decreasing number of \fIk\fR-mers they have in common with the query sequence, using that information as a proxy for sequence similarity. After pairwise alignments, if the first target sequence passes the acceptance criteria, it is accepted as best hit and the search process stops for that query. If \-\-maxaccepts is set to a higher value, more matching targets are accepted. If \-\-maxaccepts and \-\-maxrejects are both set to 0, the complete database is searched. See \-\-maxhits option for a control on the number of hits reported per query when search is done on both strands. .TAG maxdiffs .TP .BI \-\-maxdiffs\~ "positive integer" Reject the sequence match if the alignment contains at least \fIinteger\fR substitutions, insertions or deletions. .TAG maxgaps .TP .BI \-\-maxgaps\~ "positive integer" Reject the sequence match if the alignment contains at least \fIinteger\fR insertions or deletions.
.TAG maxhits .TP .BI \-\-maxhits\~ "non-negative integer" Maximum number of hits to show once the search is terminated for a given query (hits are sorted by decreasing identity). When searching only on the plus strand (default situation, see \-\-strand), the number of matching targets (\-\-maxaccepts) and the number of hits (\-\-maxhits) are the same. However, when searching on both strands, there could be two hits per target (one per strand): \-\-maxhits then controls the overall number of reported hits per query. Unlimited by default or if the argument is zero. This option applies to \-\-alnout, \-\-blast6out, \-\-fastapairs, \-\-samout, \-\-uc, or \-\-userout output files. .TAG maxid .TP .BI \-\-maxid \0real Reject the sequence match if the percentage of identity between the two sequences is greater than \fIreal\fR. .TAG maxqsize .TP .BI \-\-maxqsize\~ "positive integer" Reject query sequences with an abundance greater than \fIinteger\fR. .TAG maxqt .TP .BI \-\-maxqt \0real Reject if the query/target sequence length ratio is greater than \fIreal\fR. .TAG maxrejects .TP .BI \-\-maxrejects\~ "positive integer" Maximum number of non-matching target sequences to consider before stopping the search for a given query. The default value is 32. This option works in tandem with \-\-maxaccepts. The search process sorts target sequences by decreasing number of \fIk\fR-mers they have in common with the query sequence, using that information as a proxy for sequence similarity. After pairwise alignments, if none of the first 32 examined target sequences pass the acceptance criteria, the search process stops for that query (no hit). If \-\-maxrejects is set to a higher value, more target sequences are considered. If \-\-maxaccepts and \-\-maxrejects are both set to 0, the complete database is searched. .TAG maxsizeratio .TP .BI \-\-maxsizeratio \0real Reject if the query/target abundance ratio is greater than \fIreal\fR.
.TAG maxsl .TP .BI \-\-maxsl \0real Reject if the shorter/longer sequence length ratio is greater than \fIreal\fR. .TAG maxsubs .TP .BI \-\-maxsubs\~ "positive integer" Reject the sequence match if the pairwise alignment contains more than \fIinteger\fR substitutions. .TAG mid .TP .BI \-\-mid \0real Reject the sequence match if the percentage of identity is lower than \fIreal\fR (ignoring all gaps, internal and terminal). .TAG mincols .TP .BI \-\-mincols\~ "positive integer" Reject the sequence match if the alignment length is shorter than \fIinteger\fR. .TAG minqt .TP .BI \-\-minqt \0real Reject if the query/target sequence length ratio is lower than \fIreal\fR. .TAG minsizeratio .TP .BI \-\-minsizeratio \0real Reject if the query/target abundance ratio is lower than \fIreal\fR. .TAG minsl .TP .BI \-\-minsl \0real Reject if the shorter/longer sequence length ratio is lower than \fIreal\fR. .TAG mintsize .TP .BI \-\-mintsize\~ "positive integer" Reject target sequences with an abundance lower than \fIinteger\fR. .TAG minwordmatches .TP .BI \-\-minwordmatches\~ "non-negative integer" Minimum number of \fIk\fR-mers or word matches required for a sequence to be considered further. Default value is 12 for the default word length 8. For word lengths 3-15, the default minimum word matches are 18, 17, 16, 15, 14, 12, 11, 10, 9, 8, 7, 5 and 3, respectively. If the query sequence has fewer unique words than the number specified, all words in the query must match. If the argument is 0, no word matches are required. .TAG mismatch .TP .BI \-\-mismatch\~ "integer" Score assigned to a mismatch (i.e. different nucleotides) in the pairwise alignment. The default value is -4. .TAG mothur_shared_out .TP .BI \-\-mothur_shared_out \0filename Write search results to an OTU table in the mothur 'shared' tab-separated plain text file format. The query file contains the samples, while the database file contains the OTUs. 
Sample and OTU identifiers are extracted from the header of these sequences. See the \-\-otutabout option in the Clustering section for further details. .TAG notmatched .TP .BI \-\-notmatched \0filename Write query sequences not matching database target sequences to \fIfilename\fR, in fasta format. .TAG otutabout .TP .BI \-\-otutabout \0filename Write search results to an OTU table in the classic tab-separated plain text format. The query file contains the samples, while the database file contains the OTUs. Sample and OTU identifiers are extracted from the header of these sequences (\-\-sample option). See the \-\-mothur_shared_out option in the Clustering section for further details. .TAG output_no_hits .TP .B \-\-output_no_hits Write both matching and non-matching queries to \-\-alnout, \-\-blast6out, \-\-samout or \-\-userout output files. Non-matching queries are labelled 'No hits' in \-\-alnout files. .TAG pattern .TP .B \-\-pattern \fIstring\fR This option is ignored. It is provided for compatibility with usearch. .TAG qmask .TP .BI \-\-qmask\~ "none|dust|soft" Mask regions in the query sequences using the dust or the soft algorithms, or do not mask (none). Warning, when using soft masking search commands become case sensitive. The default is to mask using \fIdust\fR. .TAG qsegout .TP .BI \-\-qsegout \0filename Write the aligned part of each query sequence to \fIfilename\fR in FASTA format. .TAG query_cov .TP .BI \-\-query_cov \0real Reject if the fraction of the query aligned to the target sequence is lower than \fIreal\fR (value ranging from 0.0 to 1.0 included). The query coverage is computed as (matches + mismatches) / query sequence length. Internal or terminal gaps are not taken into account. .TAG rightjust .TP .B \-\-rightjust Reject the sequence match if the pairwise alignment ends with gaps. .TAG rowlen .TP .BI \-\-rowlen\~ "positive integer" Width of alignment lines in \-\-alnout output. The default value is 64. Set to 0 to eliminate wrapping. 
.TAG samheader .TP .B \-\-samheader Include header lines to the SAM file when \-\-samout is specified. The header includes lines starting with @HD, @SQ and @PG, but no @RG lines (see .URL https://github.com/samtools/hts-specs (link) ). By default no header line is written. .TAG samout .TP .BI \-\-samout \0filename Write alignment results to \fIfilename\fR using the SAM format (a tab-separated text file). When using the \-\-samheader option, the SAM file starts with header lines. Each non-header line is a SAM record, which represents either a query-target alignment or the absence of match for a query (output order may vary when using multiple threads). Each record contains 11 mandatory fields and optional fields (see .URL https://github.com/samtools/hts-specs (link) for a complete description of the format): .RS .RS .nr step 1 1 .IP \n[step]. 4 query sequence label. .IP \n+[step]. combination of bitwise flags. Possible values are: 0 (top hit), 4 (no hit), 16 (reverse-complemented hit), 256 (secondary hit, i.e. all hits except the top hit). .IP \n+[step]. target sequence label. .IP \n+[step]. first position of a target aligned with the query (always 1 for global pairwise alignments, 0 if there is no match). .IP \n+[step]. mapping quality (ignored, always set to '*'). .IP \n+[step]. CIGAR string (set to '*' if there is no match). .IP \n+[step]. name of the target sequence matching with the next read of the query (for mate reads only, ignored and always set to '*'). .IP \n+[step]. position of the primary alignment of the next read of the query (for mate reads only, ignored and always set to 0). .IP \n+[step]. target sequence length (for multi-segment targets, ignored and always set to 0). .IP \n+[step]. query sequence (complete, not only the segment aligned to the target as usearch does). .IP \n+[step]. quality string (ignored, always set to '*'). .RE .TP Optional fields for query-target matches (number and order of fields may vary): .RS .nr step 12 1 .IP \n[step]. 
4 AS:i:? alignment score (i.e. percentage of identity). .IP \n+[step]. XN:i:? next best alignment score (always set to 0). .IP \n+[step]. XM:i:? number of mismatches. .IP \n+[step]. XO:i:? number of gap openings (excluding terminal gaps). .IP \n+[step]. XG:i:? number of gap extensions (excluding terminal gaps). .IP \n+[step]. NM:i:? edit distance to the target (sum of XM and XG). .IP \n+[step]. MD:Z:? string for mismatching positions. .IP \n+[step]. YT:Z:UU string representing the alignment type. .RE .RE .TAG search_exact .TP .BI \-\-search_exact \0filename Search for exact full-length matches to the query sequences contained in \fIfilename\fR in the database of target sequences (\-\-db). Only 100% exact matches are reported and this command is much faster than \-\-usearch_global. The \-\-id, \-\-maxaccepts and \-\-maxrejects options are ignored, but the rest of the searching options may be specified. .TAG self .TP .B \-\-self Reject the sequence match if the query and target labels are identical. .TAG selfid .TP .B \-\-selfid Reject the sequence match if the query and target sequences are strictly identical. .TAG sizeout .TP .B \-\-sizeout Add abundance annotations to the output of the option \-\-dbmatched (using the pattern ';size=\fIinteger\fR;'), to report the number of queries that matched each target. .TAG strand .TP .BI \-\-strand\~ "plus|both" When searching for similar sequences, check the \fIplus\fR strand only (default) or check \fIboth\fR strands. .TAG target_cov .TP .BI \-\-target_cov \0real Reject the sequence match if the fraction of the target sequence aligned to the query sequence is lower than \fIreal\fR. The target coverage is computed as (matches + mismatches) / target sequence length. Internal or terminal gaps are not taken into account. 
.TAG top_hits_only .TP .B \-\-top_hits_only Only the top hits with an equally high percentage of identity between the query and database sequence sets are written to the output specified with the options \-\-lcaout, \-\-alnout, \-\-samout, \-\-userout, \-\-blast6out, \-\-uc, \-\-fastapairs, \-\-matched or \-\-notmatched (but not \-\-dbmatched and \-\-dbnotmatched). For each query, the top hit is the one presenting the highest percentage of identity (see the \-\-iddef option to change the way identity is measured). For a given query, if several top hits present exactly the same percentage of identity, the number of matching targets reported is controlled by the \-\-maxaccepts value (1 by default), and the number of hits is controlled by the \-\-maxhits option. .TAG tsegout .TP .BI \-\-tsegout \0filename Write the aligned part of each target sequence to \fIfilename\fR in FASTA format. .TAG uc .TP .BI \-\-uc \0filename Output searching results in \fIfilename\fR using a tab-separated uclust-like format with 10 columns. When using the \-\-search_exact command, the table layout is the same as with the \-\-allpairs_global command. When using the \-\-usearch_global command, the table presents two different types of entries: hit (H) or no hit (N). Each query sequence is compared to all other sequences, and the best hit (\-\-maxaccepts 1) or several hits (\-\-maxaccepts > 1) are reported (H). Output order may vary when using multiple threads. Column content varies with the type of entry (H or N): .RS .RS .nr step 1 1 .IP \n[step]. 4 Record type: H, or N ('hit' or 'no hit'). .IP \n+[step]. Ordinal number of the target sequence (based on input order, starting from zero). Set to '*' for N. .IP \n+[step]. Sequence length. Set to '*' for N. .IP \n+[step]. Percentage of similarity with the target sequence. Set to '*' for N. .IP \n+[step]. Match orientation + or -. . Set to '.' for N. .IP \n+[step]. Not used, always set to zero for H, or '*' for N. .IP \n+[step].
Not used, always set to zero for H, or '*' for N. .IP \n+[step]. Compact representation of the pairwise alignment using the CIGAR format (Compact Idiosyncratic Gapped Alignment Report): M (match/mismatch), D (deletion) and I (insertion). The equal sign '=' indicates that the query is identical to the centroid sequence. Set to '*' for N. .IP \n+[step]. Label of the query sequence. .IP \n+[step]. Label of the target centroid sequence. Set to '*' for N. .RE .RE .TAG uc_allhits .TP .B \-\-uc_allhits When using the \-\-uc option, show all hits, not just the top hit for each query. .TAG usearch_global .TP .BI \-\-usearch_global \0filename Compare target sequences (\-\-db) to the query sequences contained in \fIfilename\fR in FASTA or FASTQ format, using global pairwise alignment. .TAG userfields .TP .BI \-\-userfields \0string When using \-\-userout, select and order the fields written to the output file. Fields are separated by '+' (e.g. query+target+id). See the 'Userfields' section for a complete list of fields. .TAG userout .TP .BI \-\-userout \0filename Write user-defined tab-separated output to \fIfilename\fR. Select the fields with the option \-\-userfields. Output order may vary when using multiple threads. If \-\-userfields is empty or not present, \fIfilename\fR is empty. .TAG weak_id .TP .BI \-\-weak_id \0real Show hits with percentage of identity of at least \fIreal\fR, without terminating the search. A normal search stops as soon as enough hits are found (as defined by \-\-maxaccepts, \-\-maxrejects, and \-\-id). As \-\-weak_id reports weak hits that are not deducted from \-\-maxaccepts (but count towards \-\-maxrejects), high \-\-id values can be used, hence preserving both speed and sensitivity. Logically, \fIreal\fR must be smaller than the value indicated by \-\-id. .TAG wordlength .TP .BI \-\-wordlength\~ "positive integer" Length of words (i.e. \fIk\fR-mers) for database indexing.
The range of possible values goes from 3 to 15, but values near 8 or 9 are generally recommended. Longer words may reduce the sensitivity/recall for weak similarities, but can increase precision. On the other hand, shorter words may increase sensitivity or recall, but may reduce precision. Computation time generally increases with shorter words and decreases with longer words, but it increases again for very long words. Memory requirements for a part of the index increase with a factor of 4 each time word length increases by one nucleotide, and this generally becomes significant for long words (12 or more). The default value is 8. .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .RE .PP .\" ---------------------------------------------------------------------------- .TAG shuffling-options Shuffling options: .RS Fasta entries in the input file are outputted in a pseudo-random order. .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG output .TP 9 .BI \-\-output \0filename Write the shuffled sequences to \fIfilename\fR, in fasta format. .TAG randseed .TP .BI \-\-randseed\~ "positive integer" When shuffling sequence order, use \fIinteger\fR as seed. A given seed always produces the same output order (useful for replicability). Set to 0 to use a pseudo-random seed (default behaviour). .TAG relabel .TP .BI \-\-relabel \0string Relabel sequences using the prefix \fIstring\fR and a ticker (1, 2, 3, etc.) to construct the new headers. Use \-\-sizeout to conserve the abundance annotations. .TAG relabel_keep .TP .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TAG relabel_md5 .TP .B \-\-relabel_md5 Relabel sequences using the MD5 message digest algorithm applied to each sequence. 
Former sequence headers are discarded. The sequence is converted to upper case and U is replaced by T before the digest is computed. The MD5 digest is a cryptographic hash function designed to minimize the probability that two different inputs give the same output, even for very similar, but non-identical inputs. Still, there is always a very small, but non-zero probability that two different inputs give the same result. The MD5 digest generates a 128-bit (16-byte) digest that is represented by 16 hexadecimal numbers (using 32 symbols among 0123456789abcdef). Use \-\-sizeout to conserve the abundance annotations. .TAG relabel_self .TP .B \-\-relabel_self Relabel sequences using the sequence itself as the label. .TAG relabel_sha1 .TP .B \-\-relabel_sha1 Relabel sequences using the SHA1 message digest algorithm applied to each sequence. It is similar to the \-\-relabel_md5 option but uses the SHA1 algorithm instead of the MD5 algorithm. The SHA1 digest generates a 160-bit (20-byte) result that is represented by 20 hexadecimal numbers (40 symbols). The probability of a collision (two non-identical sequences having the same digest) is smaller for the SHA1 algorithm than it is for the MD5 algorithm. Use \-\-sizeout to conserve the abundance annotations. .TAG sizeout .TP .B \-\-sizeout When using \-\-relabel, \-\-relabel_self, \-\-relabel_md5 or \-\-relabel_sha1, preserve and report abundance annotations to the output fasta file (using the pattern ';size=\fIinteger\fR;'). .TAG shuffle .TP .BI \-\-shuffle \0filename Pseudo-randomly shuffle the order of sequences contained in \fIfilename\fR. .TAG topn .TP .BI \-\-topn\~ "positive integer" Output only the first \fIinteger\fR sequences after pseudo-random reordering. .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option.
.TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .RE .PP .\" ---------------------------------------------------------------------------- .TAG sorting-options Sorting options: .RS Fasta entries are sorted by decreasing abundance (\-\-sortbysize) or sequence length (\-\-sortbylength). To obtain a stable sorting order, ties are sorted by decreasing abundance (if present) and label increasing alpha-numerical order (\-\-sortbylength), or just by label increasing alpha-numerical order (\-\-sortbysize). Label sorting assumes that all sequences have unique labels. The same applies to the automatic sorting performed during chimera checking (\-\-uchime_denovo), dereplication (\-\-derep_fulllength), and clustering (\-\-cluster_fast and \-\-cluster_size). .PP .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG maxsize .TP 9 .BI \-\-maxsize\~ "positive integer" When using \-\-sortbysize, discard sequences with an abundance value greater than \fIinteger\fR. .TAG minsize .TP .BI \-\-minsize\~ "positive integer" When using \-\-sortbysize, discard sequences with an abundance value smaller than \fIinteger\fR. .TAG output .TP .BI \-\-output \0filename Write the sorted sequences to \fIfilename\fR, in fasta format. .TAG relabel .TP .BI \-\-relabel \0string Please see the description of the same option under Chimera detection for details. .TAG relabel_keep .TP .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TAG relabel_md5 .TP .BI \-\-relabel_md5 Please see the description of the same option under Chimera detection for details. .TAG relabel_self .TP .BI \-\-relabel_self Please see the description of the same option under Chimera detection for details. 
.TAG relabel_sha1 .TP .BI \-\-relabel_sha1 Please see the description of the same option under Chimera detection for details. .TAG sizeout .TP .B \-\-sizeout When using \-\-relabel, report abundance annotations to the output fasta file (using the pattern ';size=\fIinteger\fR;'). .TAG sortbylength .TP .BI \-\-sortbylength \0filename Sort by decreasing length the sequences contained in \fIfilename\fR. See the general options \-\-minseqlength and \-\-maxseqlength to eliminate short and long sequences. .TAG sortbysize .TP .BI \-\-sortbysize \0filename Sort by decreasing abundance the sequences contained in \fIfilename\fR (missing abundance values are assumed to be ';size=1'). See the options \-\-minsize and \-\-maxsize to eliminate rare and dominant sequences. .TAG topn .TP .BI \-\-topn\~ "positive integer" Output only the top \fIinteger\fR sequences (i.e. the longest or the most abundant). .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .RE .PP .\" ---------------------------------------------------------------------------- .TAG subsampling-options Subsampling options: .RS Subsampling randomly extracts a certain number or a certain percentage of the sequences in the input file. If the \-\-sizein option is in effect, the abundances of the input sequences are taken into account and the sampling is performed as if the input sequences were rereplicated, subsampled and dereplicated before being written to the output file. The extraction is performed as a random sampling with a uniform distribution among the input sequences and is performed without replacement.
The input file is specified with the \-\-fastx_subsample option, the output files are specified with the \-\-fastaout and \-\-fastqout options and the amount of sequences to be sampled is specified with the \-\-sample_pct or \-\-sample_size options. The sequences not sampled may be written to files specified with the options \-\-fastaout_discarded and \-\-fastqout_discarded. The \-\-fastq_ascii, \-\-fastq_qmin and \-\-fastq_qmax options are also available. .PP .TAG fastaout .TP 9 .BI \-\-fastaout \0filename Write the sampled sequences to \fIfilename\fR, in fasta format. .TAG fastaout_discarded .TP .BI \-\-fastaout_discarded \0filename Write the sequences not sampled to \fIfilename\fR, in fasta format. .TAG fastq_ascii .TP .BI \-\-fastq_ascii\~ "positive integer" Define the ASCII character number used as the basis for the FASTQ quality score. The default is 33, which is used by the Sanger / Illumina 1.8+ FASTQ format (phred+33). The value 64 is used by the Solexa, Illumina 1.3+ and Illumina 1.5+ formats (phred+64). Only 33 and 64 are valid arguments. .TAG fastq_qmax .TP .BI \-\-fastq_qmax\~ "positive integer" Specify the maximum quality score accepted when reading FASTQ files. The default is 41, which is usual for recent Sanger/Illumina 1.8+ files. .TAG fastq_qmin .TP .BI \-\-fastq_qmin\~ "positive integer" Specify the minimum quality score accepted for FASTQ files. The default is 0, which is usual for recent Sanger/Illumina 1.8+ files. Older formats may use scores between -5 and 2. .TAG fastqout .TP .BI \-\-fastqout \0filename Write the sampled sequences to \fIfilename\fR, in fastq format. Requires input in fastq format. .TAG fastqout_discarded .TP .BI \-\-fastqout_discarded \0filename Write the sequences not sampled to \fIfilename\fR, in fastq format. Requires input in fastq format. .TAG fastx_subsample .TP .BI \-\-fastx_subsample \0filename Perform subsampling from the sequences in the specified input file that is in FASTA or FASTQ format.
.TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG randseed .TP .BI \-\-randseed\~ "positive integer" Use \fIinteger\fR as a seed for the pseudo-random generator. A given seed always produces the same output, which is useful for replicability. Set to 0 to use a pseudo-random seed (default behaviour). .TAG relabel .TP .BI \-\-relabel \0string Relabel sequences using the prefix \fIstring\fR and a ticker (1, 2, 3, etc.) to construct the new headers. Use \-\-sizeout to conserve the abundance annotations. .TAG relabel_keep .TP .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TAG relabel_md5 .TP .B \-\-relabel_md5 Relabel sequences using the MD5 message digest algorithm applied to each sequence. Former sequence headers are discarded. The sequence is converted to upper case and U is replaced by T before the digest is computed. The MD5 digest is a cryptographic hash function designed to minimize the probability that two different inputs give the same output, even for very similar, but non-identical inputs. Still, there is always a very small, but non-zero probability that two different inputs give the same result. The MD5 digest generates a 128-bit (16-byte) digest that is represented by 16 hexadecimal numbers (using 32 symbols among 0123456789abcdef). Use \-\-sizeout to conserve the abundance annotations. .TAG relabel_self .TP .B \-\-relabel_self Relabel sequences using the sequence itself as the label. .TAG relabel_sha1 .TP .B \-\-relabel_sha1 Relabel sequences using the SHA1 message digest algorithm applied to each sequence. It is similar to the \-\-relabel_md5 option but uses the SHA1 algorithm instead of the MD5 algorithm. The SHA1 digest generates a 160-bit (20-byte) result that is represented by 20 hexadecimal numbers (40 symbols). 
The probability of a collision (two non-identical sequences having the same digest) is smaller for the SHA1 algorithm than it is for the MD5 algorithm. Use \-\-sizeout to conserve the abundance annotations. .TAG sample_pct .TP .BI \-\-sample_pct\~ "real" Subsample the given percentage of the input sequences. Accepted values range from 0.0 to 100.0. .TAG sample_size .TP .BI \-\-sample_size\~ "positive integer" Extract the given number of sequences. .TAG sizein .TP .B \-\-sizein Take the abundance information of the input file into account, otherwise the abundance of each sequence is considered to be 1. .TAG sizeout .TP .B \-\-sizeout Write abundance information to the output file. .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .RE .PP .\" ---------------------------------------------------------------------------- .TAG taxonomic-classification-options Taxonomic classification options: .RS The vsearch command \-\-sintax will classify the input sequences according to the Sintax algorithm as described by Robert Edgar (2016) in SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences, BioRxiv, 074161. Preprint. doi: 10.1101/074161 .URL https://doi.org/10.1101/074161 (link) .PP The name of the fasta file containing the input sequences to be classified is given as an argument to the \-\-sintax command. The reference sequence database is specified with the \-\-db option. The results are written in a tab delimited text file whose name is specified with the \-\-tabbedout option. The \-\-sintax_cutoff option may be used to set a minimum level of bootstrap support for the taxonomic ranks to be reported. The \-\-randseed option may be included to specify a seed for initialisation of the random number generator used by the algorithm. 
Please note that when using multiple threads, the \-\-randseed option may not work as intended, because sequences may be processed in a random order by different threads. To ensure the same results each time, use a single thread (\-\-threads 1) in combination with a fixed random seed specified with \-\-randseed. .PP Multithreading is supported. Databases in UDB files are supported. The strand option may be specified. .PP The reference database must contain taxonomic information in the header of each sequence in the form of a string starting with ";tax=" and followed by a comma-separated list of up to nine taxonomic identifiers. Each taxonomic identifier must start with an indication of the rank by one of the letters d (for domain), k (kingdom), p (phylum), c (class), o (order), f (family), g (genus), s (species), or t (strain). The letter is followed by a colon (:) and the name of that rank. Commas and semicolons are not allowed in the name of the rank. Non-ascii characters should be avoided in the names. .PP Example: >X80725_S000004313;\:tax=d:Bacteria,\:p:Proteobacteria,\:c:Gammaproteobacteria,\:o:Enterobacteriales,\:f:Enterobacteriaceae,\:g:Escherichia/Shigella,\:s:Escherichia_coli,\:t:str._K-12_substr._MG1655 .PP The option \-\-notrunclabels is turned on by default for this command, allowing spaces in the taxonomic identifiers. .PP If two sequences in the reference database have equally many kmer matches with the query, the shortest sequence will be chosen by default. If they are equally long, the sequence appearing first in the database will be chosen. If the recommended option \-\-sintax_random is specified, sequences with an equal number of kmer matches will instead be chosen by a random draw. .PP .TAG db .TP 9 .BI \-\-db \0filename Read the reference sequences from \fIfilename\fR, in FASTA, FASTQ or UDB format. These sequences need to be annotated with taxonomy.
.TAG randseed .TP .BI \-\-randseed\~ "positive integer" Use \fIinteger\fR as seed for the random number generator used in the Sintax algorithm. A given seed always produces the same output order (useful for replicability). Set to 0 to use a pseudo-random seed (default behaviour). Does not work correctly with multiple threads; please use \-\-threads 1 to ensure correct behaviour. .TAG sintax .TP .BI \-\-sintax \0filename Read the input sequences from \fIfilename\fR, in FASTA or FASTQ format. .TAG sintax_cutoff .TP .BI \-\-sintax_cutoff\~ "real" Specify a minimum level of bootstrap support for the taxonomic ranks that will be included in column 4 of the output file. For instance 0.9, corresponding to 90%. .TAG sintax_random .TP .B \-\-sintax_random Break ties between sequences with equally many kmer matches by a random draw. This option is recommended and may be made the default in the future. .TAG tabbedout .TP .BI \-\-tabbedout \0filename Write the results to \fIfilename\fR, in a tab-separated text format. Column 1 contains the query label. Column 2 contains the predicted taxonomy in the same format as for the reference data, with bootstrap support indicated in parentheses after each rank. Column 3 contains the strand. If the \-\-sintax_cutoff option is used, the predicted taxonomy will be repeated in column 4 while omitting the bootstrap values and including only the ranks with support at or above the threshold. .RE .PP .\" ---------------------------------------------------------------------------- .TAG udb-options UDB options: .RS Databases to be used with the \-\-usearch_global command may be prepared from FASTA files and stored to a binary UDB formatted file in order to speed up searching. This may be worthwhile when searching a large database repeatedly. The sequences are indexed and stored in a way that can be quickly loaded into memory. The commands and options below can be used to create and inspect UDB files. 
An UDB file may be specified with the \-\-db option instead of a FASTA formatted file with the \-\-usearch_global command. .PP .TAG dbmask .TP 9 .BI \-\-dbmask\~ "none|dust|soft" Specify the sequence masking method used with the \-\-makeudb_usearch command, either none, dust or soft. No masking is performed when none is specified. When dust is specified, the DUST algorithm will be used for masking low complexity regions (short repeats and skewed composition). Lower case letters in the input file will be masked when soft is specified (soft masking). .TAG hardmask .TP .B \-\-hardmask Mask sequences by replacing letters with N for the \-\-makeudb_usearch command. The default is to use lower case letters (soft masking). .TAG makeudb_usearch .TP .BI \-\-makeudb_usearch \0filename Create an UDB database file from the FASTA-formatted sequences in the file with the given \fIfilename\fR. The UDB database is written to the file specified with the \-\-output option. .TAG output .TP .BI \-\-output \0filename Specify the \fIfilename\fR of a UDB or FASTA output file for the \-\-makeudb_usearch or the \-\-udb2fasta command, respectively. .TAG udb2fasta .TP .BI \-\-udb2fasta \0filename Read the UDB database in the file with the given \fIfilename\fR and output the sequences in FASTA format in the file specified by the \-\-output option. .TAG udbinfo .TP .BI \-\-udbinfo \0filename Show information about the UDB database in the file with the given \fIfilename\fR. .TAG udbstats .TP .BI \-\-udbstats \0filename Report statistics about the indexed words in the UDB database in the file with the given \fIfilename\fR. .TAG wordlength .TP .BI \-\-wordlength\~ "positive integer" Specify the length of the words to be used when creating the UDB database index using the \-\-makeudb_usearch command. Valid numbers range from 3 to 15. The default is 8.
.RE .PP .\" ---------------------------------------------------------------------------- .TAG userfields Userfields (fields accepted by the \-\-userfields option): .RS .TP 9 .B aln Print a string of M (match/mismatch, i.e. not a gap), D (delete, i.e. a gap in the query) and I (insert, i.e. a gap in the target) representing the pairwise alignment. Empty field if there is no alignment. .TP .B alnlen Print the length of the query-target alignment (number of columns). The field is set to 0 if there is no alignment. .TP .B bits Bit score (not computed for nucleotide alignments). Always set to 0. .TP .B caln Compact representation of the pairwise alignment using the CIGAR format (Compact Idiosyncratic Gapped Alignment Report): M (match/mismatch), D (deletion) and I (insertion). Empty field if there is no alignment. .TP .B evalue E-value (not computed for nucleotide alignments). Always set to -1. .TP .B exts Number of columns containing a gap extension (zero or positive integer value). .TP .B gaps Number of columns containing a gap (zero or positive integer value, excluding terminal gaps). .TP .B id The percentage of identity, according to the identity definition specified by the \-\-iddef option. Equal to id0, id1, id2, id3 or id4 below. By default the same as id2. .TP .B id0 CD-HIT definition of the percentage of identity (real value ranging from 0.0 to 100.0) using the length of the shortest sequence in the pairwise alignment as denominator: 100 * (matching columns) / (shortest sequence length). .TP .B id1 The percentage of identity (real value ranging from 0.0 to 100.0) is defined as the edit distance: 100 * (matching columns) / (alignment length). .TP .B id2 The percentage of identity (real value ranging from 0.0 to 100.0) is defined as the edit distance, excluding terminal gaps. 
.TP .B id3 Marine Biological Lab definition of the percentage of identity (real value ranging from 0.0 to 100.0), counting each gap opening (internal or terminal) as a single mismatch, whether or not the gap was extended, and using the length of the longest sequence in the pairwise alignment as denominator: 100 * (1.0 - [(mismatches + gaps) / (longest sequence length)]). .TP .B id4 BLAST definition of the percentage of identity (real value ranging from 0.0 to 100.0), equivalent to \-\-iddef 1 in a context of global pairwise alignment. The field id4 is always equal to the field id1. .TP .B ids Number of matches in the alignment (zero or positive integer value). .TP .B mism Number of mismatches in the alignment (zero or positive integer value). .TP .B opens Number of columns containing a gap opening (zero or positive integer value, excluding terminal gaps). .TP .B pairs Number of columns containing only nucleotides. That value corresponds to the length of the alignment minus the gap-containing columns (zero or positive integer value). .TP .B pctgaps Number of columns containing gaps expressed as a percentage of the alignment length (real value ranging from 0.0 to 100.0). .TP .B pctpv Percentage of positive columns. When working with nucleotide sequences, this is equivalent to the percentage of matches (real value ranging from 0.0 to 100.0). .TP .B pv Number of positive columns. When working with nucleotide sequences, this is equivalent to the number of matches (zero or positive integer value). .TP .B qcov Fraction of the query sequence that is aligned with the target sequence (real value ranging from 0.0 to 100.0). The query coverage is computed as 100.0 * (matches + mismatches) / query sequence length. Internal or terminal gaps are not taken into account. The field is set to 0.0 if there is no alignment. .TP .B qframe Query frame (-3 to +3). That field only concerns coding sequences and is not computed by \fBvsearch\fR. Always set to +0. 
.TP .B qhi Last nucleotide of the query aligned with the target. Always equal to the length of the pairwise alignment, 0 otherwise (see \fIqihi\fR to ignore terminal gaps). .TP .B qihi Last nucleotide of the query aligned with the target (ignoring terminal gaps). Nucleotide numbering starts from 1. The field is set to 0 if there is no alignment. .TP .B qilo First nucleotide of the query aligned with the target (ignoring initial gaps). Nucleotide numbering starts from 1. The field is set to 0 if there is no alignment. .TP .B ql Query sequence length (positive integer value). The field is set to 0 if there is no alignment. .TP .B qlo First nucleotide of the query aligned with the target. Always equal to 1 if there is an alignment, 0 otherwise (see \fIqilo\fR to ignore initial gaps). .TP .B qrow Print the sequence of the query segment as seen in the pairwise alignment (i.e. with gap insertions if need be). Empty field if there is no alignment. .TP .B qs Query segment length. Always equal to query sequence length. .\" The meaning of that field is not clear to us. .TP .B qstrand Query strand orientation (+ or - for nucleotide sequences). Empty field if there is no alignment. .TP .B query Query label. .TP .B raw Raw alignment score (negative, null or positive integer value). The score is the sum of match rewards minus mismatch penalties, gap openings and gap extensions. The field is set to 0 if there is no alignment. .TP .B target Target label. The field is set to '*' if there is no alignment. .TP .B tcov Fraction of the target sequence that is aligned with the query sequence (real value ranging from 0.0 to 100.0). The target coverage is computed as 100.0 * (matches + mismatches) / target sequence length. Internal or terminal gaps are not taken into account. The field is set to 0.0 if there is no alignment. .TP .B tframe Target frame (-3 to +3). That field only concerns coding sequences and is not computed by \fBvsearch\fR. Always set to +0. 
.TP .B thi Last nucleotide of the target aligned with the query. Always equal to the length of the pairwise alignment if there is an alignment, 0 otherwise (see \fItihi\fR to ignore terminal gaps). .TP .B tihi Last nucleotide of the target aligned with the query (ignoring terminal gaps). Nucleotide numbering starts from 1. The field is set to 0 if there is no alignment. .TP .B tilo First nucleotide of the target aligned with the query (ignoring initial gaps). Nucleotide numbering starts from 1. The field is set to 0 if there is no alignment. .TP .B tl Target sequence length (positive integer value). The field is set to 0 if there is no alignment. .TP .B tlo First nucleotide of the target aligned with the query. Always equal to 1 if there is an alignment, 0 otherwise (see \fItilo\fR to ignore initial gaps). .TP .B trow Print the sequence of the target segment as seen in the pairwise alignment (i.e. with gap insertions if need be). Empty field if there is no alignment. .TP .B ts Target segment length. Always equal to target sequence length. The field is set to 0 if there is no alignment. .TP .B tstrand Target strand orientation (+ or - for nucleotide sequences). Always set to '+', so reverse strand matches have tstrand '+' and qstrand '\-'. Empty field if there is no alignment. .RE .PP .\" ============================================================================ .SH DELIBERATE CHANGES If you are a usearch user, our objective is to make you feel at home. That's why \fBvsearch\fR was designed to behave like usearch, to some extent. Like any complex software, usearch is not free from quirks and inconsistencies. We decided not to reproduce some of them, and for complete transparency, to document here the deliberate changes we made. .PP During a search with usearch, when using the options \-\-blast6out and \-\-output_no_hits, for queries with no match the number of fields reported is 13, where it should be 12. This is corrected in \fBvsearch\fR.
.PP The field raw of the \-\-userfields option is not informative in usearch. This is corrected in \fBvsearch\fR. .PP The fields qlo, qhi, tlo, thi now have counterparts (qilo, qihi, tilo, tihi) reporting alignment coordinates ignoring terminal gaps. .PP In usearch, when using the option \-\-output_no_hits, queries that receive no match are reported in the \-\-blast6out file, but not in the alignment output file. This is corrected in \fBvsearch\fR. .PP \fBvsearch\fR introduces a new \-\-cluster_size command that sorts sequences by decreasing abundance before clustering. .PP \fBvsearch\fR reintroduces \-\-iddef alternative pairwise identity definitions that were removed from usearch. .PP \fBvsearch\fR extends the \-\-topn option to sorting commands. .PP \fBvsearch\fR extends the \-\-sizein option to dereplication (\-\-derep_fulllength) and clustering (\-\-cluster_fast). .PP \fBvsearch\fR treats T and U as identical nucleotides during dereplication. .PP \fBvsearch\fR sorting is stabilized by using sequence abundances or sequence labels as secondary or tertiary keys. .PP \fBvsearch\fR by default uses the DUST algorithm for masking low-complexity regions. Masking behaviour is also slightly changed to be more consistent. .PP .\" ============================================================================ .SH NOVELTIES \fBvsearch\fR introduces new commands and new options not present in usearch 7. They are described in the 'Options' section of this manual.
Here is a short list: .RS .IP - 2 uchime2_denovo, uchime3_denovo, alignwidth, borderline, fasta_score (chimera checking) .IP - cluster_size, cluster_unoise, clusterout_id, clusterout_sort, profile (clustering) .IP - fasta_width, gzip_decompress, bzip2_decompress (general option) .IP - iddef (clustering, pairwise alignment, searching) .IP - maxuniquesize (dereplication) .IP - relabel_md5, relabel_self and relabel_sha1 (chimera detection, dereplication, FASTQ processing, shuffling, sorting) .IP - shuffle (shuffling) .IP - fastq_eestats, fastq_eestats2, fastq_maxlen, fastq_truncee (FASTQ processing) .IP - fastaout_discarded, fastqout_discarded (subsampling) .IP - rereplicate (dereplication/rereplication) .RE .PP .\" ============================================================================ .SH EXAMPLES .PP Align all sequences in a database with each other and output all pairwise alignments: .PP .RS \fBvsearch\fR \-\-allpairs_global \fIdatabase.fas\fR \-\-alnout \fIresults.aln\fR \-\-acceptall .RE .PP Check for the presence of chimeras (\fIde novo\fR); parents should be at least 1.5 times more abundant than chimeras. 
Output non-chimeric sequences in fasta format (no wrapping): .PP .RS \fBvsearch\fR \-\-uchime_denovo \fIqueries.fas\fR \-\-abskew 1.5 \-\-nonchimeras \fIresults.fas\fR \-\-fasta_width 0 .RE .PP Cluster with a 97% similarity threshold, collect cluster centroids, and write cluster descriptions using a uclust-like format: .PP .RS \fBvsearch\fR \-\-cluster_fast \fIqueries.fas\fR \-\-id 0.97 \-\-centroids \fIcentroids.fas\fR \-\-uc \fIclusters.uc\fR .RE .PP Dereplicate the sequences contained in \fIqueries.fas\fR, take into account the abundance information already present, write unwrapped fasta sequences to \fIqueries_unique.fas\fR with the new abundance information, discard all sequences with an abundance of 1: .PP .RS \fBvsearch\fR \-\-derep_fulllength \fIqueries.fas\fR \-\-sizein \-\-fasta_width 0 \-\-sizeout \-\-output \fIqueries_unique.fas\fR \-\-minuniquesize 2 .RE .PP Mask simple repeats and low complexity regions in the input fasta file with the DUST algorithm (masked regions are lowercased), and write the results to the output file: .PP .RS \fBvsearch\fR \-\-maskfasta \fIqueries.fas\fR \-\-qmask dust \-\-output \fIqueries_masked.fas\fR .RE .PP Search queries in a reference database, with an 80%-similarity threshold, take terminal gaps into account when calculating pairwise similarities, output pairwise alignments: .PP .RS \fBvsearch\fR \-\-usearch_global \fIqueries.fas\fR \-\-db \fIreferences.fas\fR \-\-id 0.8 \-\-iddef 1 \-\-alnout \fIresults.aln\fR .RE .PP Search a sequence dataset against itself (ignore self hits), get all matches with at least 60% similarity, and collect results in a blast-like tab-separated format.
Accept an unlimited number of hits (\-\-maxaccepts 0), and compare each query to all other sequences, including unlikely candidates (\-\-maxrejects 0): .PP .RS \fBvsearch\fR \-\-usearch_global \fIqueries.fas\fR \-\-db \fIqueries.fas\fR \-\-self \-\-id 0.6 \-\-blast6out \fIresults.blast6\fR \-\-maxaccepts 0 \-\-maxrejects 0 .RE .PP Shuffle the input fasta file (change the order of sequences) in a repeatable fashion (fixed seed), and write unwrapped fasta sequences to the output file: .PP .RS \fBvsearch\fR \-\-shuffle \fIqueries.fas\fR \-\-output \fIqueries_shuffled.fas\fR \-\-randseed 13 \-\-fasta_width 0 .RE .PP Sort by decreasing abundance the sequences contained in \fIqueries.fas\fR (using the 'size=\fIinteger\fR' information), relabel the sequences while preserving the abundance information (with \-\-sizeout), keep only sequences with an abundance equal to or greater than 2: .PP .RS \fBvsearch\fR \-\-sortbysize \fIqueries.fas\fR \-\-output \fIqueries_sorted.fas\fR \-\-relabel sampleA_ \-\-sizeout \-\-minsize 2 .RE .PP .\" .\" ============================================================================ .SH AUTHORS Implementation and documentation by Torbjørn Rognes, Frédéric Mahé and Tomás Flouri. .PP .\" ============================================================================ .SH CITATION .PP Rognes T, Flouri T, Nichols B, Quince C, Mahé F. (2016) VSEARCH: a versatile open source tool for metagenomics. \fIPeerJ\fR 4:e2584 doi: 10.7717/peerj.2584 .URL https://doi.org/10.7717/peerj.2584 (link) .PP .\" ============================================================================ .SH REPORTING BUGS Submit suggestions and bug-reports at .URL https://github.com/torognes/vsearch/issues (link) , send a pull request on .URL https://github.com/torognes/vsearch (link) , or compose a friendly or curmudgeonly e-mail to Torbjørn Rognes .MTO torognes@ifi.uio.no (link) .
.PP .\" ============================================================================ .SH AVAILABILITY Source code and binaries are available at https://github.com/torognes/vsearch. .PP .\" ============================================================================ .SH COPYRIGHT Copyright (C) 2014-2024, Torbjørn Rognes, Frédéric Mahé and Tomás Flouri .PP All rights reserved. .PP Contact: Torbjørn Rognes (torognes@ifi.uio.no), Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway .PP This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. .PP \fBGNU General Public License version 3\fR .PP This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. .PP This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. .PP You should have received a copy of the GNU General Public License along with this program. If not, see .URL https://www.gnu.org/licenses/ (link) . .PP .PP \fBThe BSD 2-Clause License\fR .PP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: .PP 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. .PP 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
.PP THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .PP We would like to thank the authors of the following projects for making their source code available: .RS .IP - 2 \fBvsearch\fR includes code from Google's CityHash project by Geoff Pike and Jyrki Alakuijala, providing some excellent hash functions available under a MIT license. .IP - \fBvsearch\fR includes code derived from Tatusov and Lipman's DUST program that is in the public domain. .IP - \fBvsearch\fR includes public domain code written by Alexander Peslyak for the MD5 message digest algorithm. .IP - \fBvsearch\fR includes public domain code written by Steve Reid and others for the SHA1 message digest algorithm. .IP - \fBvsearch\fR binaries may include code from the zlib library, copyright Jean-Loup Gailly and Mark Adler. .IP - \fBvsearch\fR binaries may include code from the bzip2 library, copyright Julian R. Seward. .RE .PP .\" ============================================================================ .SH SEE ALSO \fBswipe\fR, an extremely fast pairwise local (Smith-Waterman) database search tool by Torbjørn Rognes, available at .URL https://github.com/torognes/swipe "(link)" . 
.PP \fBswarm\fR, a fast and accurate amplicon clustering method by Frédéric Mahé and Torbjørn Rognes, available at .URL https://github.com/torognes/swarm "(link)" . .PP .\" ============================================================================ .SH VERSION HISTORY New features and important modifications of \fBvsearch\fR (short lived or minor bug releases may not be mentioned): .TP .BR v1.0.0\~ "released November 28th, 2014" First public release. .TP .BR v1.0.1\~ "released December 1st, 2014" Bug fixes (sortbysize, semicolon after size annotation in headers) and minor changes (labels as secondary sort key for most sorts, treat T and U as identical for dereplication, only output size in \-\-dbmatched file if \-\-sizeout specified). .TP .BR v1.0.2\~ "released December 6th, 2014" Bug fixes (ssse3/sse4.1 requirement, memory leak). .TP .BR v1.0.3\~ "released December 6th, 2014" Bug fix (now writes help to stdout instead of stderr). .TP .BR v1.0.4\~ "released December 8th, 2014" Added \-\-allpairs_global option. Reduce memory requirements slightly and eliminate memory leaks. .TP .BR v1.0.5\~ "released December 9th, 2014" Fixes a minor bug with \-\-allpairs_global and \-\-acceptall options. .TP .BR v1.0.6\~ "released December 14th, 2014" Fixes a memory allocation bug in chimera detection (\-\-uchime_ref option). .TP .BR v1.0.7\~ "released December 19th, 2014" Fixes a bug in the output from chimera detection with the \-\-uchimeout option. 
.TP .BR v1.0.8\~ "released January 22nd, 2015" Introduces several changes and bug fixes: .RS .IP - 2 a new linear memory aligner for alignment of sequences longer than 5,000 nucleotides, .IP - a new \-\-cluster_size command that sorts sequences by decreasing abundance before clustering, .IP - meaning of userfields qlo, qhi, tlo, thi changed for compatibility with usearch, .IP - new userfields qilo, qihi, tilo, tihi give alignment coordinates ignoring terminal gaps, .IP - in \-\-uc output files, a perfect alignment is indicated with a '=' sign, .IP - the option \-\-cluster_fast now sorts sequences by decreasing length, then by decreasing abundance and finally by sequence identifier, .IP - default \-\-maxseqlength value set to 50,000 nucleotides, .IP - fix for bug in alignment in rare cases, .IP - fix for lack of detection of under- or overflow in SIMD aligner. .RE .TP .BR v1.0.9\~ "released January 22nd, 2015" Fixes a bug in the function sorting sequences by decreasing abundance (\-\-sortbysize). .TP .BR v1.0.10\~ "released January 23rd, 2015" Fixes a bug where the \-\-sizein option was ignored and always treated as on, affecting clustering and dereplication commands. .TP .BR v1.0.11\~ "released February 5th, 2015" Introduces the possibility to output results in SAM format (for clustering, pairwise alignment and searching). .TP .BR v1.0.12\~ "released February 6th, 2015" Temporarily fixes a problem with long headers in FASTA files. .TP .BR v1.0.13\~ "released February 17th, 2015" Fix a memory allocation problem when computing multiple sequence alignments with the \-\-msaout and \-\-consout options, as well as a memory leak. Also increased line buffer for reading FASTA files to 4MB. .TP .BR v1.0.14\~ "released February 17th, 2015" Fix a bug where the multiple alignment and consensus sequence computed after clustering ignored the strand of the sequences. Also decreased size of line buffer for reading FASTA files to 1MB again due to excessive stack memory usage. 
.TP .BR v1.0.15\~ "released February 18th, 2015" Fix bug in calculation of identity metric between sequences when using the MBL definition (\-\-iddef 3). .TP .BR v1.0.16\~ "released February 19th, 2015" Integrated patches from Debian for increased compatibility with various architectures. .TP .BR v1.1.0\~ "released February 20th, 2015" Added the \-\-quiet option to suppress all output to stdout and stderr except for warnings and fatal errors. Added the \-\-log option to write messages to a log file. .TP .BR v1.1.1\~ "released February 20th, 2015" Added info about \-\-log and \-\-quiet options to help text. .TP .BR v1.1.2\~ "released March 18th, 2015" Fix bug with large datasets. Fix format of help info. .TP .BR v1.1.3\~ "released March 18th, 2015" Fix more bugs with large datasets. .TP .BR v1.2.0-1.2.19\~ "released July 6th to September 8th, 2015" Several new commands and options added. Bugs fixed. Documentation updated. .TP .BR v1.3.0\~ "released September 9th, 2015" Changed to autotools build system. .TP .BR v1.3.1\~ "released September 14th, 2015" Several new commands and options. Bug fixes. .TP .BR v1.3.2\~ "released September 15th, 2015" Fixed memory leaks. Added '-h' shortcut for help. Removed extra 'v' in version number. .TP .BR v1.3.3\~ "released September 15th, 2015" Fixed bug in hexadecimal digits of MD5 and SHA1 digests. Added \-\-samheader option. .TP .BR v1.3.4\~ "released September 16th, 2015" Fixed compilation problems with zlib and bzip2lib. .TP .BR v1.3.5\~ "released September 17th, 2015" Minor configuration/makefile changes to compile to native CPU and simplify makefile. .TP .BR v1.4.0\~ "released September 25th, 2015" Added \-\-sizeorder option. .TP .BR v1.4.1\~ "released September 29th, 2015" Inserted public domain MD5 and SHA1 code to eliminate dependency on crypto and openssl libraries and their licensing issues. .TP .BR v1.4.2\~ "released October 2nd, 2015" Dynamic loading of libraries for reading gzip and bzip2 compressed files if available. 
Circumvention of missing gzoffset function in zlib 1.2.3 and earlier. .TP .BR v1.4.3\~ "released October 3rd, 2015" Fix a bug with determining amount of memory on some versions of Apple OS X. .TP .BR v1.4.4\~ "released October 3rd, 2015" Remove debug message. .TP .BR v1.4.5\~ "released October 6th, 2015" Fix memory allocation bug when reading long FASTA sequences. .TP .BR v1.4.6\~ "released October 6th, 2015" Fix subtle bug in SIMD alignment code that reduced accuracy. .TP .BR v1.4.7\~ "released October 7th, 2015" Fixes a problem with searching for or clustering sequences with repeats. In this new version, vsearch looks at all words occurring at least once in the sequences in the initial step. Previously only words occurring exactly once were considered. In addition, vsearch now requires at least 10 words to be shared by the sequences, previously only 6 were required. If the query contains less than 10 words, all words must be present for a match. This change seems to lead to slightly reduced recall, but somewhat increased precision, ending up with slightly improved overall accuracy. .TP .BR v1.5.0\~ "released October 7th, 2015" This version introduces the new option \-\-minwordmatches that allows the user to specify the minimum number of matching unique words before a sequence is considered further. New default values for different word lengths are also set. The minimum word length is increased to 7. .TP .BR v1.6.0\~ "released October 9th, 2015" This version adds the relabeling options (\-\-relabel, \-\-relabel_md5 and \-\-relabel_sha1) to the shuffle command. It also adds the \-\-xsize option to the clustering, dereplication, shuffling and sorting commands. .TP .BR v1.6.1\~ "released October 14th, 2015" Fix bugs and update manual and help text regarding relabelling. Add all relabelling options to the subsampling command. Add the \-\-xsize option to chimera detection, dereplication and fastq filtering commands. Refactoring of code. 
.TP .BR v1.7.0\~ "released October 14th, 2015" Add \-\-relabel_keep option. .TP .BR v1.8.0\~ "released October 19th, 2015" Added \-\-search_exact, \-\-fastx_mask and \-\-fastq_convert commands. Changed most commands to read FASTQ input files as well as FASTA files. Modified \-\-fastx_revcomp and \-\-fastx_subsample to write FASTQ files. .TP .BR v1.8.1\~ "released November 2nd, 2015" Fixes for compatibility with QIIME and older OS X versions. .TP .BR v1.9.0\~ "released November 12th, 2015" Added the \-\-fastq_mergepairs command and associated options. This command has not been tested well yet. Included additional files to avoid dependency on autoconf for compilation. Fixed an error where identifiers in fasta headers were not truncated at tabs, just spaces. Fixed a bug in detection of the file format (FASTA/FASTQ) of a gzip compressed input file. .TP .BR v1.9.1\~ "released November 13th, 2015" Fixed memory leak and a bug in score computation in \-\-fastq_mergepairs, and improved speed. .TP .BR v1.9.2\~ "released November 17th, 2015" Fixed a bug in the computation of some values with \-\-fastq_stats. .TP .BR v1.9.3\~ "released November 19th, 2015" Workaround for missing x86intrin.h with old compilers. .TP .BR v1.9.4\~ "released December 3rd, 2015" Fixed incrementation of counter when relabeling dereplicated sequences. .TP .BR v1.9.5\~ "released December 3rd, 2015" Fixed bug resulting in inferior chimera detection performance. .TP .BR v1.9.6\~ "released January 8th, 2016" Fixed bug in aligned sequences produced with \-\-fastapairs and \-\-userout (qrow, trow) options. .TP .BR v1.9.7\~ "released January 12th, 2016" Masking behaviour is changed somewhat to keep the letter case of the input sequences unchanged when no masking is performed. Masking is now performed also during chimera detection. Documentation updated. .TP .BR v1.9.8\~ "released January 22nd, 2016" Fixed bug causing segfault when chimera detection is performed on extremely short sequences.
.TP .BR v1.9.9\~ "released January 22nd, 2016" Adjusted default minimum number of word matches during searches for improved performance. .TP .BR v1.9.10\~ "released January 25th, 2016" Fixed bug related to masking and lower case database sequences. .TP .BR v1.10.0\~ "released February 11th, 2016" Parallelized and improved merging of paired-end reads and adjusted some defaults. Removed progress indicator when stderr is not a terminal. Added \-\-fasta_score option to report chimera scores in FASTA files. Added \-\-rereplicate and \-\-fastq_eestats commands. Fixed typos. Added relabelling to files produced with \-\-consout and \-\-profile options. .TP .BR v1.10.1\~ "released February 23rd, 2016" Fixed a bug affecting the \-\-fastq_mergepairs command causing FASTQ headers to be truncated at first space (despite the bug fix release 1.9.0 of November 12th, 2015). Full headers are now included in the output (no matter if \-\-notrunclabels is in effect or not). .TP .BR v1.10.2\~ "released March 18th, 2016" Fixed a bug causing a segmentation fault when running \-\-usearch_global with an empty query sequence. Also fixed a bug causing imperfect alignments to be reported with an alignment string of '=' in uc output files. Fixed typos in man file. Fixed fasta/fastq processing code regarding presence or absence of compression library header files. .TP .BR v1.11.1\~ "released April 13th, 2016" Added strand information in UC file for \-\-derep_fulllength and \-\-derep_prefix. Added expected errors (ee) to header of FASTA files specified with \-\-fastaout and \-\-fastaout_discarded when \-\-eeout or \-\-fastq_eeout option is in effect for fastq_filter and fastq_mergepairs. The options \-\-eeout and \-\-fastq_eeout are now equivalent. .TP .BR v1.11.2\~ "released June 21st, 2016" Two bugs were fixed. The first issue was related to the \-\-query_cov option that used a different coverage definition than the qcov userfield. 
The coverage is now defined as the fraction of the whole query sequence length that is aligned with matching or mismatching residues in the target. All gaps are ignored. The other issue was related to the consensus sequences produced during clustering when only N's were present in some positions. Previously these would be converted to A's in the consensus. The behaviour is changed so that N's are produced in the consensus, and it should now be more compatible with usearch. .TP .BR v2.0.0\~ "released June 24th, 2016" This major new version supports reading from pipes. Two new options are added: \-\-gzip_decompress and \-\-bzip2_decompress. One of these options must be specified if reading compressed input from a pipe, but neither is required when reading from ordinary files. The vsearch header that was previously written to stdout is now written to stderr. This enables piping of results for further processing. The file name '\-' now represents standard input (/dev/stdin) or standard output (/dev/stdout) when reading or writing files, respectively. Code for reading FASTA and FASTQ files has been refactored. .TP .BR v2.0.1\~ "released June 30th, 2016" Avoid segmentation fault when masking very long sequences. .TP .BR v2.0.2\~ "released July 5th, 2016" Avoid warnings when compiling with GCC 6. .TP .BR v2.0.3\~ "released August 2nd, 2016" Fixed bad compiler options resulting in Illegal instruction errors when running precompiled binaries. .TP .BR v2.0.4\~ "released September 1st, 2016" Improved error message for bad FASTQ quality values. Improved manual. .TP .BR v2.0.5\~ "released September 9th, 2016" Add options \-\-fastaout_discarded and \-\-fastqout_discarded to output discarded sequences from subsampling to separate files. Updated manual. .TP .BR v2.1.0\~ "released September 16th, 2016" New command: \-\-fastx_filter. New options: \-\-fastq_maxlen, \-\-fastq_truncee. Allow \-\-minwordmatches down to 3.
.TP .BR v2.1.1\~ "released September 23rd, 2016" Fixed bugs in output to UC-files. Improved help text and manual. .TP .BR v2.1.2\~ "released September 28th, 2016" Fixed incorrect abundance output from fastx_filter and fastq_filter when relabelling. .TP .BR v2.2.0\~ "released October 7th, 2016" Added OTU table generation options \-\-biomout, \-\-mothur_shared_out and \-\-otutabout to the clustering and searching commands. .TP .BR v2.3.0\~ "released October 10th, 2016" Allowed zero-length sequences in FASTA and FASTQ files. Added \-\-fastq_trunclen_keep option. Fixed bug with output of OTU tables to pipes. .TP .BR v2.3.1\~ "released November 16th, 2016" Fixed bug where \-\-minwordmatches 0 was interpreted as the default minimum word matches for the given word length instead of zero. When used in combination with \-\-maxaccepts 0 and \-\-maxrejects 0 it will allow complete bypass of kmer-based heuristics. .TP .BR v2.3.2\~ "released November 18th, 2016" Fixed bug where vsearch reported the ordinal number of the target sequence instead of the cluster number in column 2 on H-lines in the uc output file after clustering. For search and alignment commands both usearch and vsearch report the target sequence number here. .TP .BR v2.3.3\~ "released December 5th, 2016" A minor speed improvement. .TP .BR v2.3.4\~ "released December 9th, 2016" Fixed bug in output of sequence profiles and updated documentation. .TP .BR v2.4.0\~ "released February 8th, 2017" Added support for Linux on Power8 systems (ppc64le) and Windows on x86_64. Improved detection of pipes when reading FASTA and FASTQ files. Corrected option for specifying output from fastq_eestats command in help text. .TP .BR v2.4.1\~ "released March 1st, 2017" Fixed an overflow bug in fastq_stats and fastq_eestats affecting analysis of very large FASTQ files. Fixed maximum memory usage reporting on Windows.
.TP .BR v2.4.2\~ "released March 10th, 2017" Default value for fastq_minovlen increased to 16 in accordance with help text and for compatibility with usearch. Minor changes for improved accuracy of paired-end read merging. .TP .BR v2.4.3\~ "released April 6th, 2017" Fixed bug with progress bar for shuffling. Fixed missing N-lines in UC files with usearch_global, search_exact and allpairs_global when the output_no_hits option was not specified. .TP .BR v2.4.4\~ "released August 28th, 2017" Fixed a few minor bugs, improved error messages and updated documentation. .TP .BR v2.5.0\~ "released October 5th, 2017" Support for UDB database files. New commands: fastq_stripright, fastq_eestats2, makeudb_usearch, udb2fasta, udbinfo, and udbstats. New general option: no_progress. New options minsize and maxsize to fastx_filter. Minor bug fixes, error message improvements and documentation updates. .TP .BR v2.5.1\~ "released October 25th, 2017" Fixed bug with bad default value of 1 instead of 32 for minseqlength when using the makeudb_usearch command. .TP .BR v2.5.2\~ "released October 30th, 2017" Fixed bug where '-' as an argument to the fastq_eestats2 option was treated literally instead of equivalent to stdin. .TP .BR v2.6.0\~ "released November 10th, 2017" Rewritten paired-end reads merger with improved accuracy. Decreased default value for fastq_minovlen option from 16 to 10. The default value for the fastq_maxdiffs option is increased from 5 to 10. There are now other more important restrictions that will avoid merging reads that cannot be reliably aligned. .TP .BR v2.6.1\~ "released December 8th, 2017" Improved parallelisation of paired end reads merging. .TP .BR v2.6.2\~ "released December 18th, 2017" Fixed option xsize that was partially inactive for commands uchime_denovo, uchime_ref, and fastx_filter.
.TP .BR v2.7.0\~ "released February 13th, 2018" Added commands cluster_unoise, uchime2_denovo and uchime3_denovo contributed by Davide Albanese based on Robert Edgar's papers. Refactored fasta and fastq print functions as well as code for extraction of abundance and other attributes from the headers. .TP .BR v2.7.1\~ "released February 16th, 2018" Fix several bugs on Windows related to large files, use of "-" as a file name to mean stdin or stdout, alignment errors, missed kmers and corrupted UDB files. Added documentation of UDB-related commands. .TP .BR v2.7.2\~ "released April 20th, 2018" Added the sintax command for taxonomic classification. Fixed a bug with incorrect FASTA headers of consensus sequences after clustering. .TP .BR v2.8.0\~ "released April 24th, 2018" Added the fastq_maxdiffpct option to the fastq_mergepairs command. .TP .BR v2.8.1\~ "released June 22nd, 2018" Fixes for compilation warnings with GCC 8. .TP .BR v2.8.2\~ "released August 21st, 2018" Fix for wrong placement of semicolons in header lines in some cases when using the sizeout or xsize options. Reduced memory requirements for full-length dereplication in cases with many duplicate sequences. Improved wording of fastq_mergepairs report. Updated manual regarding use of sizein and sizeout with dereplication. Changed a compiler option. .TP .BR v2.8.3\~ "released August 31st, 2018" Fix for segmentation fault for \-\-derep_fulllength with \-\-uc. .TP .BR v2.8.4\~ "released September 3rd, 2018" Further reduce memory requirements for dereplication when not using the uc option. Fix output during subsampling when quiet or log options are in effect. .TP .BR v2.8.5\~ "released September 26th, 2018" Fixed a bug in fastq_eestats2 that caused the values for large lengths to be much too high when the input sequences had varying lengths. 
.TP .BR v2.8.6\~ "released October 9th, 2018" Fixed a bug introduced in version 2.8.2 that caused derep_fulllength to include the full FASTA header in its output instead of stopping at the first space (unless the notrunclabels option is in effect). .TP .BR v2.9.0\~ "released October 10th, 2018" Added the fastq_join command. .TP .BR v2.9.1\~ "released October 29th, 2018" Changed compiler options that select the target cpu and tuning to allow the software to run on any 64-bit x86 system, while tuning for more modern variants. Avoid illegal instruction error on some architectures. Update documentation of rereplicate command. .TP .BR v2.10.0\~ "released December 6th, 2018" Added the sff_convert command to convert SFF files to FASTQ. Added some additional option argument checks. Fixed segmentation fault bug after some fatal errors when a log file was specified. .TP .BR v2.10.1\~ "released December 7th, 2018" Improved sff_convert command. It will now read several variants of the SFF format. It is also able to read from a pipe. Warnings are given if there are minor problems. Error messages have been improved. Minor speed and memory usage improvements. .TP .BR v2.10.2\~ "released December 10th, 2018" Fixed bug in sintax with reversed order of domain and kingdom. .TP .BR v2.10.3\~ "released December 19th, 2018" Ported to Linux on ARMv8 (aarch64). Fixed compilation warning with gcc version 8.1.0 and 8.2.0. .TP .BR v2.10.4\~ "released January 4th, 2019" Fixed serious bug in x86_64 SIMD alignment code introduced in version 2.10.3. Added link to BioConda in README. Fixed bug in fastq_stats with sequence length 1. Fixed use of equals symbol in UC files for identical sequences with cluster_fast. .TP .BR v2.11.0\~ "released February 13th, 2019" Added ability to trim and filter paired-end reads using the reverse option with the fastx_filter and fastq_filter commands. Added \-\-xee option to remove ee attributes from FASTA headers.
Minor invisible improvement to the progress indicator. .TP .BR v2.11.1\~ "released February 28th, 2019" Minor change to the handling of the weak_id and id options when using cluster_unoise. .TP .BR v2.12.0\~ "released March 19th, 2019" Take sequence abundance into account when computing consensus sequences or profiles after clustering. Warn when rereplicating sequences without abundance info. Guess offset 33 in more cases with fastq_chars. Stricter checking of option arguments and option combinations. .TP .BR v2.13.0\~ "released April 11th, 2019" Added the \-\-fastx_getseq, \-\-fastx_getseqs and \-\-fastx_getsubseq commands to extract sequences from a FASTA or FASTQ file based on their labels. Improved handling of ambiguous nucleotide symbols. Corrected behaviour of \-\-uchime_ref command with options \-\-self and \-\-selfid. Strict detection of illegal options for each command. .TP .BR v2.13.1\~ "released April 26th, 2019" Minor changes to the allowed options for each command. All commands now allow the log, quiet and threads options. If more than 1 thread is specified for commands that are not multi-threaded, a warning will be issued. Minor changes to the manual. .TP .BR v2.13.2\~ "released April 30th, 2019" Fixed bug related to improper handling of newlines on Windows. Allowed option strand plus to uchime_ref for compatibility. .TP .BR v2.13.3\~ "released April 30th, 2019" Fixed bug in FASTQ parsing introduced in version 2.13.2. .TP .BR v2.13.4\~ "released May 10th, 2019" Added information about support for gzip- and bzip2-compressed input files to the output of the version command. Adapted source code for compilation on FreeBSD and NetBSD systems. .TP .BR v2.13.5\~ "released July 2nd, 2019" Added cut command to fragment sequences at restriction sites. Silenced output from the fastq_stats command if quiet option was given. Updated manual. .TP .BR v2.13.6\~ "released July 2nd, 2019" Added info about cut command to output of help command. 
.TP .BR v2.13.7\~ "released September 2nd, 2019" Fixed bug in consensus sequence introduced in version 2.13.0. .TP .BR v2.14.0\~ "released September 11th, 2019" Added relabel_self option. Made fasta_width, sizein, sizeout and relabelling options valid for certain commands. .TP .BR v2.14.1\~ "released September 18th, 2019" Fixed bug with sequences written to file specified with fastaout_rev for commands fastx_filter and fastq_filter. .TP .BR v2.14.2\~ "released January 28th, 2020" Fixed some issues with the cut, fastx_revcomp, fastq_convert, fastq_mergepairs, and makeudb_usearch commands. Updated manual. .TP .BR v2.15.0\~ "released June 19th, 2020" Update manual and documentation. Turn on notrunclabels option for sintax command by default. Change maxhits 0 to mean unlimited hits, like the default. Allow non-ascii characters in headers, with a warning. Sort centroids and uc too when clusterout_sort specified. Add cluster id to centroids output when clusterout_id specified. Improve error messages when parsing FASTQ files. Add missing fastq_qminout option and fix label_suffix option for fastq_mergepairs. Add derep_id command that dereplicates based on both label and sequence. Remove compilation warnings. .TP .BR v2.15.1\~ "released October 28th, 2020" Fix for dereplication when including reverse complement sequences and headers. Make some extra checks when loading compression libraries and add more diagnostic output about them to the output of the version command. Report an error when fastx_filter is used with FASTA input and options that require FASTQ input. Update manual. .TP .BR v2.15.2\~ "released January 26th, 2021" No real functional changes, but some code and compilation changes. Compiles successfully on macOS running on Apple Silicon (ARMv8). Binaries available. Code updated for C++11. Minor adaptations for Windows compatibility, including the use of the C++ standard library for regular expressions. Minor changes for compatibility with Power8. 
Switch to C++ header files. .TP .BR v2.16.0\~ "released March 22nd, 2021" This version adds the orient command. It also handles empty input files properly. Documentation has been updated. .TP .BR v2.17.0\~ "released March 29th, 2021" The fastq_mergepairs command has been changed. It now allows merging of sequences with overlaps as short as 5 bp if the \-\-fastq_minovlen option has been adjusted down from the default 10. In addition, far fewer pairs of reads should now be rejected with the reason 'multiple potential alignments' as the algorithm for detecting those has been changed. .TP .BR v2.17.1\~ "released June 14th, 2021" Modernized code. Minor changes to help info. .TP .BR v2.18.0\~ "released August 27th, 2021" Added the fasta2fastq command. Fixed search bug on ppc64le. Fixed bug with removal of size and ee info in uc files. Fixed compilation errors in some cases. Made some general code improvements. Updated manual. .TP .BR v2.19.0\~ "released December 21st, 2021" Added the lcaout and lca_cutoff options to enable the output of last common ancestor (LCA) information about hits when searching. The randseed option was added as a valid option to the sintax command. Code improvements. .TP .BR v2.20.0\~ "released January 10th, 2022" Added the fastx_uniques command and the fastq_qout_max option for dereplication of FASTQ files. Some code cleaning. .TP .BR v2.20.1\~ "released January 11th, 2022" Fixes a bug in fastq_mergepairs that caused an occasional hang at the end when using multiple threads. .TP .BR v2.21.0\~ "released January 12th, 2022" This version adds the sample, qsegout and tsegout options. It enables the use of UDB databases with uchime_ref. .TP .BR v2.21.1\~ "released January 18th, 2022" Fix a problem with dereplication of empty input files. Update Altivec code on ppc64le for improved compiler compatibility (vector->__vector). 
.TP .BR v2.21.2\~ "released September 12th, 2022" Fix problems with the lcaout option when using maxaccepts above 1 and either lca_cutoff below 1 or with top_hits_only enabled. Update documentation. Update code to avoid compiler warnings. .TP .BR v2.22.0\~ "released September 19th, 2022" Add the derep_smallmem command for dereplication using little memory. .TP .BR v2.22.1\~ "released September 19th, 2022" Fix compiler warning. .TP .BR v2.23.0\~ "released July 7th, 2023" Update documentation. Add citation file. Modernize and improve code. Fix several minor bugs. Fix compilation with GCC 13. Print stats after fastq_mergepairs to log file instead of stderr. Handle sizein option correctly with dbmatched option for usearch_global. Allow maxseqlength option for makeudb_usearch. Fix memory allocation problem with chimera detection. Add lengthout and xlength options. Increase precision for eeout option. Add warning about sintax algorithm, random seed and multiple threads. Refactor chimera detection code. Add undocumented experimental long_chimeras_denovo command. Fix segfault with clustering. Add more references. .TP .BR v2.24.0\~ "released October 26th, 2023" Update documentation. Improve code. Allow up to 20 parents for the undocumented and experimental chimeras_denovo command. Fix compilation warnings for sha1.c. Compile for release (not debug) by default. .TP .BR v2.25.0\~ "released November 10th, 2023" Allow a given percentage of mismatches between chimeras and parents for the experimental chimeras_denovo command. .TP .BR v2.26.0\~ "released November 24th, 2023" Enable the maxseqlength and minseqlength options for the chimera detection commands. When the usearch_global or search_exact commands are used, OTU tables will include samples and OTUs with no matches. .TP .BR v2.26.1\~ "released November 25th, 2023" No real changes, but the previous version was released without proper updates to the source code. 
.TP .BR v2.27.0\~ "released January 19th, 2024" The usearch_global and search_exact commands now support FASTQ files as well as FASTA files as input. This version of vsearch includes clarifications and updates to the manual. Some code has been refactored. Generic Dockerfiles for major Linux distributions have been included. Some warnings from compilers and other tools have been eliminated. The release for Windows will also include DLLs for the two compression libraries. .TP .BR v2.27.1\~ "released April 6th, 2024" This version fixes the weak_id option and makes searches report weak hits in some cases. It also updates the names of the compression libraries to libz.so.1 and libbz2.so.1 on Linux to make them work on common Linux distributions without installing additional packages. README.md has been updated with information about compression libraries on Windows. .TP .BR v2.28.0\~ "released April 26th, 2024" The sintax command has been improved in several ways in this version of vsearch. Please note that several details of this algorithm are not clearly described in the preprint, and the implementation in vsearch differs from that in usearch. The former vsearch version did not always choose the most common taxonomic entity over the 100 bootstraps among the database sequences with the highest amount of word similarity to the query. Instead, if several sequences had an equal similarity with the query, the sequence encountered in the earliest bootstrap was chosen. The confidence level was calculated based on this sequence compared to the selected sequences from the other 99 bootstraps. This could lead to a suboptimal choice with a low confidence. In the new version, the most common of the sequences with the highest amount of word similarity across the 100 bootstraps will be selected, and ties will be broken randomly. 
Another problem with the old implementation was that if several sequences had the same amount of word similarity, the shortest one in the reference database would be chosen, and if they were equally long, the earliest in the database file would be chosen. A new option called sintax_random has now been introduced. This option will randomly select one of the sequences with the highest number of shared words with the query, without considering their length or position. This avoids a bias towards shorter reference sequences. This option is strongly recommended and will probably soon be the default. Furthermore, a ninth taxonomic rank, strain (letter t), is now recognized. The speed of the sintax command has also been significantly improved at least in some cases. Run vsearch with the randseed option and 1 thread to ensure reproducibility of the random choices in the algorithm. .TP .BR v2.28.1\~ "released April 26th, 2024" Fix a segmentation fault that could occur with the blast6out and output_no_hits options. .TP .BR v2.29.0\~ "released September 26th, 2024" This version fixes seven bugs (see changelog below), adds initial support for RISC-V architectures, and improves code quality and code testing (1,210 new tests): .RS .IP - 2 add: experimental support for RISCV64 and other 64-bit little-endian architectures, thanks to Michael R. 
Crusoe and his fellow Debian developers (issue #566), .IP - add: official support for clang-19 and gcc 14, .IP - add: beta support for clang-20, .IP - remove: unused \-\-output option for command \-\-fastq_stats (issue #572), .IP - fix: bug in \-\-sintax when selecting the best lineage (only low confidence values below 0.5 were affected) (issue #573), .IP - fix: out-of-bounds error in \-\-fastq_stats when processing empty reads (issue #571), .IP - fix: bug in \-\-cut, patterns with multiple cutting sites were not detected (commit 4c4f9fa70f14b28d50185dbf322cf5727087e86a), .IP - fix: memory error (segmentation fault) when using \-\-derep_id and \-\-strand (issue #565), .IP - fix: \-\-fastq_join now obeys \-\-quiet and \-\-log options (commit 87f968b09f17c17ebf8db00aebe86e89b13a3948), .IP - fix: \-\-fastq_join quality padding is now also set to Q40 when quality offset is 64 (commit be0bf9b48d782286c4ce38f0bf1a4c82bd230250), .IP - fix: (partial) \-\-fastq_join's handling of abundance annotations (commit f2bbcb421dc2f4dfa6603b9f31ec3e4598c1b591), .IP - improve: additional safeguards to validate input values and to make sure that they are within acceptable limits. Changes concern options \-\-abskew (commit a530dd8990f8a05cb25fc0b6a5da5a14d28fbedd) and \-\-fastq_maxdiffs (commit 4b254db7f120bfd49e86185ef3cd9070c236f940), .IP - improve: code quality (1.3k+ commits, 6k+ clang-tidy warnings eliminated), .IP - improve: documentation and help messages (issue #568), .IP - improve: complete refactoring and modernization of a subset of commands (\-\-sortbylength, \-\-sortbysize, \-\-shuffle, \-\-rereplicate, \-\-cut, \-\-fastq_join, \-\-fasta2fastq, \-\-fastq_chars), .IP - improve: code-coverage of our test-suite for the above-mentioned commands (1,210 new tests, 4,753 in total) .RE .LP .TP .BR v2.29.1\~ "released October 24th, 2024" Fix a segmentation fault that could occur during alignment in version 2.29.0, for example with \-\-uchime_ref. 
Some improvements to code and documentation. .TP .BR v2.29.2\~ "released December 20th, 2024" Fix a segmentation fault during clustering when the set of clusters is empty. Initial documentation in markdown format available on GitHub Pages. .TP .BR v2.29.3\~ "released February 3rd, 2025" This version is released in order to mitigate a bug that occurs when compiling the `align_simd.cc` file on x86_64 systems with the GNU C++ compiler version 9 or later with the `-O3` optimization option. It results in incorrect code that may cause bad alignments in some circumstances. We are investigating this issue further, but for now we recommend compiling with the `-O2` flag. The README.md file and the Dockerfiles have been updated to reflect this. The binaries released with this version will include this fix. .TP .BR v2.29.4\~ "released February 14th, 2025" Adjust the window size used for chimera detection down from 64 to 32. The window size was by accident increased from 32 to 64 in version 2.23.0, leading to somewhat fewer chimeras being predicted. In addition, a compiler pragma has been included in align_simd.cc to further prevent the compiler from generating wrong code. .TP .BR v2.30.0\~ "released February 27th, 2025" Add options `\-\-n_mismatch`, `\-\-fastq_minqual`, and `\-\-fastq_truncee_rate`. The `\-\-n_mismatch` option will count N's as mismatches in alignments, which may be useful to get sensible alignments for sequences with lots of N's. By default N's are counted as matches. Both the scoring and the counting of matches are affected. The new `\-\-fastq_minqual` option for the `fastq_filter` and `fastx_filter` commands will discard sequences with any base with a quality score below the given value. The new `\-\-fastq_truncee_rate` option for the same commands will truncate sequences at the first position where the number of expected errors per base is below the given value. 
.\" ============================================================================ .\" TODO: .\" .\" NOTES .\" visualize and output to pdf .\" man -l vsearch.1 .\" man -t ./doc/vsearch.1 | ps2pdf - > ./doc/vsearch_manual.pdf vsearch-2.30.0/src/000077500000000000000000000000001476012147200140225ustar00rootroot00000000000000vsearch-2.30.0/src/Makefile.am000066400000000000000000000064141476012147200160630ustar00rootroot00000000000000bin_PROGRAMS = $(top_builddir)/bin/vsearch AM_CFLAGS = -Wall -Wextra -Wpedantic # Conditionally set profiling based on ENABLE_PROFILING if ENABLE_PROFILING AM_CFLAGS += -pg -O1 endif if TARGET_PPC AM_CFLAGS += -mcpu=powerpc64le -maltivec else if TARGET_AARCH64 AM_CFLAGS += -march=armv8-a+simd -mtune=generic else if TARGET_X86_64 AM_CFLAGS += -march=x86-64 -mtune=generic endif endif endif # Conditionally set NDEBUG based on ENABLE_DEBUG if ENABLE_DEBUG AM_CFLAGS += -UNDEBUG -Wcast-align -Wdate-time \ -Wdouble-promotion -Wduplicated-branches -Wduplicated-cond -Wfloat-equal \ -Wformat=1 -Wformat-overflow -Wlogical-op -Wnon-virtual-dtor -Wnull-dereference \ -Woverloaded-virtual -Wuninitialized \ -Wunsafe-loop-optimizations -Wunused -Wvla else AM_CFLAGS += -DNDEBUG endif AM_CXXFLAGS = $(AM_CFLAGS) -std=c++11 export MACOSX_DEPLOYMENT_TARGET=10.9 VSEARCHHEADERS=\ align_simd.h \ allpairs.h \ arch.h \ attributes.h \ bitmap.h \ chimera.h \ city.h \ citycrc.h \ cluster.h \ cpu.h \ cut.h \ db.h \ dbhash.h \ dbindex.h \ derep.h \ derep_prefix.h \ derep_smallmem.h \ dynlibs.h \ eestats.h \ fasta2fastq.h \ fasta.h \ fastq.h \ fastq_chars.h \ fastq_join.h \ fastqops.h \ fastx.h \ filter.h \ getseq.h \ kmerhash.h \ linmemalign.h \ maps.h \ mask.h \ md5.h \ mergepairs.h \ minheap.h \ msa.h \ orient.h \ otutable.h \ rereplicate.h \ results.h \ search.h \ searchcore.h \ search_exact.h \ sff_convert.h \ showalign.h \ sha1.h \ shuffle.h \ sintax.h \ sortbylength.h \ sortbysize.h \ subsample.h \ tax.h \ udb.h \ unique.h \ userfields.h \ util.h \ utils/maps.hpp \ 
utils/seqcmp.h \ vsearch.h \ xstring.h if TARGET_X86_64 libcpu_sse2_a_SOURCES = cpu.cc $(VSEARCHHEADERS) libcpu_sse2_a_CXXFLAGS = $(AM_CXXFLAGS) -msse2 libcpu_ssse3_a_SOURCES = cpu.cc $(VSEARCHHEADERS) libcpu_ssse3_a_CXXFLAGS = $(AM_CXXFLAGS) -mssse3 -DSSSE3 noinst_LIBRARIES = libcpu_sse2.a libcpu_ssse3.a libcityhash.a else libcpu_a_SOURCES = cpu.cc $(VSEARCHHEADERS) noinst_LIBRARIES = libcpu.a libcityhash.a endif libcityhash_a_SOURCES = city.cc city.h if TARGET_WIN libcityhash_a_CXXFLAGS = $(AM_CXXFLAGS) -Wno-sign-compare -D_MSC_VER __top_builddir__bin_vsearch_LDFLAGS = -static __top_builddir__bin_vsearch_LDADD = libcityhash.a libcpu_ssse3.a libcpu_sse2.a else libcityhash_a_CXXFLAGS = $(AM_CXXFLAGS) -Wno-sign-compare if TARGET_X86_64 __top_builddir__bin_vsearch_LDADD = libcityhash.a libcpu_ssse3.a libcpu_sse2.a else __top_builddir__bin_vsearch_LDADD = libcityhash.a libcpu.a endif endif __top_builddir__bin_vsearch_SOURCES = $(VSEARCHHEADERS) \ align_simd.cc \ allpairs.cc \ arch.cc \ attributes.cc \ bitmap.cc \ chimera.cc \ cluster.cc \ cut.cc \ db.cc \ dbhash.cc \ dbindex.cc \ derep.cc \ derep_prefix.cc \ derep_smallmem.cc \ dynlibs.cc \ eestats.cc \ fasta2fastq.cc \ fasta.cc \ fastq.cc \ fastq_chars.cc \ fastq_join.cc \ fastqops.cc \ fastx.cc \ filter.cc \ getseq.cc \ kmerhash.cc \ linmemalign.cc \ maps.cc \ mask.cc \ md5.c \ mergepairs.cc \ minheap.cc \ msa.cc \ orient.cc \ otutable.cc \ rereplicate.cc \ results.cc \ search.cc \ searchcore.cc \ search_exact.cc \ sff_convert.cc \ sha1.c \ showalign.cc \ shuffle.cc \ sintax.cc \ sortbylength.cc \ sortbysize.cc \ subsample.cc \ tax.cc \ udb.cc \ unique.cc \ userfields.cc \ util.cc \ utils/maps.cpp \ utils/seqcmp.cc \ vsearch.cc vsearch-2.30.0/src/align_simd.cc000066400000000000000000001655771476012147200164640ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "maps.h" #include // int64_t, uint64_t #include // std::printf, std::snprintf #include // std::memcpy, std::memmove, std::memset, std::strcpy, std::strlen #include /* Using 16-bit signed values, from -32768 to +32767. match: positive mismatch: negative gap penalties: positive (open, extend, query/target, left/interior/right) optimal global alignment (NW) maximize score */ constexpr auto CHANNELS = 8; constexpr auto CDEPTH = 4; /* Due to memory usage, limit the product of the length of the sequences. If the product of the query length and any target sequence length is above the limit, the alignment will not be computed and a score of SHRT_MAX will be returned as the score. If an overflow occurs during alignment computation, a score of SHRT_MAX will also be returned. The limit is set to 5 000 * 5 000 = 25 000 000. This will allocate up to 200 MB per thread. It will align pairs of sequences less than 5000 nt long using the SIMD implementation, larger alignments will be performed with the linear memory aligner. */ #include "align_simd.h" constexpr auto MAXSEQLENPRODUCT = 25000000LL; static int64_t scorematrix[16][16]; /* The macros below usually operate on 128-bit vectors of 8 signed short 16-bit integers. Additions and subtractions should be saturated. The shift operation should shift left by 2 bytes (one short int) and shift in zeros. 
The v_mask_gt operation should compare two vectors of signed shorts and return a 16-bit bitmask with pairs of 2 bits set for each element greater in the first than in the second argument. */ #ifdef __PPC__ using VECTOR_SHORT = __vector signed short; const __vector unsigned char perm_merge_long_low = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17}; const __vector unsigned char perm_merge_long_high = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; #define v_init(a,b,c,d,e,f,g,h) (const VECTOR_SHORT){a,b,c,d,e,f,g,h} #define v_load(a) vec_ld(0, (VECTOR_SHORT *)(a)) #define v_store(a, b) vec_st((__vector unsigned char)(b), 0, \ (__vector unsigned char *)(a)) #define v_add(a, b) vec_adds((a), (b)) #define v_sub(a, b) vec_subs((a), (b)) #define v_sub_unsigned(a, b) ((VECTOR_SHORT) \ vec_subs((__vector unsigned short) (a), \ (__vector unsigned short) (b))) #define v_max(a, b) vec_max((a), (b)) #define v_min(a, b) vec_min((a), (b)) #define v_dup(a) vec_splat((VECTOR_SHORT){(short)(a), 0, 0, 0, 0, 0, 0, 0}, 0); #define v_zero vec_splat_s16(0) #define v_and(a, b) vec_and((a), (b)) #define v_xor(a, b) vec_xor((a), (b)) #define v_shift_left(a) vec_sld((a), v_zero, 2) #elif defined __aarch64__ using VECTOR_SHORT = int16x8_t; const uint16x8_t neon_mask = {0x0003, 0x000c, 0x0030, 0x00c0, 0x0300, 0x0c00, 0x3000, 0xc000}; #define v_init(a,b,c,d,e,f,g,h) (const VECTOR_SHORT){a,b,c,d,e,f,g,h} #define v_load(a) vld1q_s16((const int16_t *)(a)) #define v_store(a, b) vst1q_s16((int16_t *)(a), (b)) #define v_merge_lo_16(a, b) vzip1q_s16((a),(b)) #define v_merge_hi_16(a, b) vzip2q_s16((a),(b)) #define v_merge_lo_32(a, b) vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(a), vreinterpretq_s32_s16(b))) #define v_merge_hi_32(a, b) vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(a), vreinterpretq_s32_s16(b))) #define v_merge_lo_64(a, b) 
vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(a)), vget_low_s64(vreinterpretq_s64_s16(b)))) #define v_merge_hi_64(a, b) vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s16(a)), vget_high_s64(vreinterpretq_s64_s16(b)))) #define v_add(a, b) vqaddq_s16((a), (b)) #define v_sub(a, b) vqsubq_s16((a), (b)) #define v_sub_unsigned(a, b) vreinterpretq_s16_u16(vqsubq_u16(vreinterpretq_u16_s16(a), vreinterpretq_u16_s16(b))) #define v_max(a, b) vmaxq_s16((a), (b)) #define v_min(a, b) vminq_s16((a), (b)) #define v_dup(a) vdupq_n_s16(a) #define v_zero v_dup(0) #define v_and(a, b) vandq_s16((a), (b)) #define v_xor(a, b) veorq_s16((a), (b)) #define v_shift_left(a) vextq_s16((v_zero), (a), 7) #define v_mask_gt(a, b) vaddvq_u16(vandq_u16((vcgtq_s16((a), (b))), neon_mask)) #elif defined(__x86_64__) || defined(SIMDE_VERSION) using VECTOR_SHORT = __m128i; #define v_init(a,b,c,d,e,f,g,h) _mm_set_epi16(h,g,f,e,d,c,b,a) #define v_load(a) _mm_load_si128((VECTOR_SHORT *)(a)) #define v_store(a, b) _mm_store_si128((VECTOR_SHORT *)(a), (b)) #define v_merge_lo_16(a, b) _mm_unpacklo_epi16((a),(b)) #define v_merge_hi_16(a, b) _mm_unpackhi_epi16((a),(b)) #define v_merge_lo_32(a, b) _mm_unpacklo_epi32((a),(b)) #define v_merge_hi_32(a, b) _mm_unpackhi_epi32((a),(b)) #define v_merge_lo_64(a, b) _mm_unpacklo_epi64((a),(b)) #define v_merge_hi_64(a, b) _mm_unpackhi_epi64((a),(b)) #define v_add(a, b) _mm_adds_epi16((a), (b)) #define v_sub(a, b) _mm_subs_epi16((a), (b)) #define v_sub_unsigned(a, b) _mm_subs_epu16((a), (b)) #define v_max(a, b) _mm_max_epi16((a), (b)) #define v_min(a, b) _mm_min_epi16((a), (b)) #define v_dup(a) _mm_set1_epi16(a) #define v_zero v_dup(0) #define v_and(a, b) _mm_and_si128((a), (b)) #define v_xor(a, b) _mm_xor_si128((a), (b)) #define v_shift_left(a) _mm_slli_si128((a), 2) #define v_mask_gt(a, b) _mm_movemask_epi8(_mm_cmpgt_epi16((a), (b))) #else #error Unknown Architecture #endif struct s16info_s { VECTOR_SHORT matrix[32]; VECTOR_SHORT 
* hearray; VECTOR_SHORT * dprofile; VECTOR_SHORT ** qtable; unsigned short * dir; char * qseq; uint64_t diralloc; char * cigar; char * cigarend; int64_t cigaralloc; int opcount; char op; int qlen; int maxdlen; CELL penalty_gap_open_query_left; CELL penalty_gap_open_target_left; CELL penalty_gap_open_query_interior; CELL penalty_gap_open_target_interior; CELL penalty_gap_open_query_right; CELL penalty_gap_open_target_right; CELL penalty_gap_extension_query_left; CELL penalty_gap_extension_target_left; CELL penalty_gap_extension_query_interior; CELL penalty_gap_extension_target_interior; CELL penalty_gap_extension_query_right; CELL penalty_gap_extension_target_right; }; auto _mm_print(VECTOR_SHORT x) -> void { auto * y = (unsigned short *) &x; for (int i = 0; i < 8; i++) { printf("%s%6d", (i > 0 ? " " : ""), y[7 - i]); } } auto _mm_print2(VECTOR_SHORT x) -> void { auto * y = (signed short *) &x; for (int i = 0; i < 8; i++) { printf("%s%2d", (i > 0 ? " " : ""), y[7 - i]); } } auto dprofile_dump16(CELL * dprofile) -> void { char * s = sym_nt_4bit; printf("\ndprofile:\n"); for (int i = 0; i < 16; i++) { printf("%c: ", s[i]); for (int k = 0; k < CDEPTH; k++) { printf("["); for (int j = 0; j < CHANNELS; j++) { printf(" %3d", dprofile[(CHANNELS * CDEPTH * i) + (CHANNELS * k) + j]); } printf("]"); } printf("\n"); } } auto dumpscorematrix(CELL * m) -> void { for (int i = 0; i < 16; i++) { printf("%2d %c", i, sym_nt_4bit[i]); for (int j = 0; j < 16; j++) { printf(" %2d", m[(16 * i) + j]); } printf("\n"); } } auto dprofile_fill16(CELL * dprofile_word, CELL * score_matrix_word, BYTE * dseq) -> void { #if 0 dumpscorematrix(score_matrix_word); for (int j = 0; j < CDEPTH; j++) { for (int z = 0; z < CHANNELS; z++) fprintf(stderr, " [%c]", sym_nt_4bit[dseq[j * CHANNELS + z]]); fprintf(stderr, "\n"); } #endif for (int j = 0; j < CDEPTH; j++) { int d[CHANNELS]; for (int z = 0; z < CHANNELS; z++) { d[z] = dseq[(j * CHANNELS) + z] << 4U; } for (int i = 0; i < 16; i += 8) { #ifdef 
__PPC__ __vector signed short reg0; __vector signed short reg1; __vector signed short reg2; __vector signed short reg3; __vector signed short reg4; __vector signed short reg5; __vector signed short reg6; __vector signed short reg7; __vector signed int reg8; __vector signed int reg9; __vector signed int reg10; __vector signed int reg11; __vector signed int reg12; __vector signed int reg13; __vector signed int reg14; __vector signed int reg15; __vector signed long long reg16; __vector signed long long reg17; __vector signed long long reg18; __vector signed long long reg19; __vector signed long long reg20; __vector signed long long reg21; __vector signed long long reg22; __vector signed long long reg23; __vector signed long long reg24; __vector signed long long reg25; __vector signed long long reg26; __vector signed long long reg27; __vector signed long long reg28; __vector signed long long reg29; __vector signed long long reg30; __vector signed long long reg31; #else VECTOR_SHORT reg0; VECTOR_SHORT reg1; VECTOR_SHORT reg2; VECTOR_SHORT reg3; VECTOR_SHORT reg4; VECTOR_SHORT reg5; VECTOR_SHORT reg6; VECTOR_SHORT reg7; VECTOR_SHORT reg8; VECTOR_SHORT reg9; VECTOR_SHORT reg10; VECTOR_SHORT reg11; VECTOR_SHORT reg12; VECTOR_SHORT reg13; VECTOR_SHORT reg14; VECTOR_SHORT reg15; VECTOR_SHORT reg16; VECTOR_SHORT reg17; VECTOR_SHORT reg18; VECTOR_SHORT reg19; VECTOR_SHORT reg20; VECTOR_SHORT reg21; VECTOR_SHORT reg22; VECTOR_SHORT reg23; VECTOR_SHORT reg24; VECTOR_SHORT reg25; VECTOR_SHORT reg26; VECTOR_SHORT reg27; VECTOR_SHORT reg28; VECTOR_SHORT reg29; VECTOR_SHORT reg30; VECTOR_SHORT reg31; #endif reg0 = v_load(score_matrix_word + d[0] + i); reg1 = v_load(score_matrix_word + d[1] + i); reg2 = v_load(score_matrix_word + d[2] + i); reg3 = v_load(score_matrix_word + d[3] + i); reg4 = v_load(score_matrix_word + d[4] + i); reg5 = v_load(score_matrix_word + d[5] + i); reg6 = v_load(score_matrix_word + d[6] + i); reg7 = v_load(score_matrix_word + d[7] + i); #ifdef __PPC__ reg8 = 
(__vector signed int) vec_mergeh(reg0, reg1); reg9 = (__vector signed int) vec_mergel(reg0, reg1); reg10 = (__vector signed int) vec_mergeh(reg2, reg3); reg11 = (__vector signed int) vec_mergel(reg2, reg3); reg12 = (__vector signed int) vec_mergeh(reg4, reg5); reg13 = (__vector signed int) vec_mergel(reg4, reg5); reg14 = (__vector signed int) vec_mergeh(reg6, reg7); reg15 = (__vector signed int) vec_mergel(reg6, reg7); reg16 = (__vector signed long long) vec_mergeh(reg8, reg10); reg17 = (__vector signed long long) vec_mergel(reg8, reg10); reg18 = (__vector signed long long) vec_mergeh(reg12, reg14); reg19 = (__vector signed long long) vec_mergel(reg12, reg14); reg20 = (__vector signed long long) vec_mergeh(reg9, reg11); reg21 = (__vector signed long long) vec_mergel(reg9, reg11); reg22 = (__vector signed long long) vec_mergeh(reg13, reg15); reg23 = (__vector signed long long) vec_mergel(reg13, reg15); reg24 = (__vector signed long long) vec_perm (reg16, reg18, perm_merge_long_low); reg25 = (__vector signed long long) vec_perm (reg16, reg18, perm_merge_long_high); reg26 = (__vector signed long long) vec_perm (reg17, reg19, perm_merge_long_low); reg27 = (__vector signed long long) vec_perm (reg17, reg19, perm_merge_long_high); reg28 = (__vector signed long long) vec_perm (reg20, reg22, perm_merge_long_low); reg29 = (__vector signed long long) vec_perm (reg20, reg22, perm_merge_long_high); reg30 = (__vector signed long long) vec_perm (reg21, reg23, perm_merge_long_low); reg31 = (__vector signed long long) vec_perm (reg21, reg23, perm_merge_long_high); #else reg8 = v_merge_lo_16(reg0, reg1); reg9 = v_merge_hi_16(reg0, reg1); reg10 = v_merge_lo_16(reg2, reg3); reg11 = v_merge_hi_16(reg2, reg3); reg12 = v_merge_lo_16(reg4, reg5); reg13 = v_merge_hi_16(reg4, reg5); reg14 = v_merge_lo_16(reg6, reg7); reg15 = v_merge_hi_16(reg6, reg7); reg16 = v_merge_lo_32(reg8, reg10); reg17 = v_merge_hi_32(reg8, reg10); reg18 = v_merge_lo_32(reg12, reg14); reg19 = v_merge_hi_32(reg12, 
reg14); reg20 = v_merge_lo_32(reg9, reg11); reg21 = v_merge_hi_32(reg9, reg11); reg22 = v_merge_lo_32(reg13, reg15); reg23 = v_merge_hi_32(reg13, reg15); reg24 = v_merge_lo_64(reg16, reg18); reg25 = v_merge_hi_64(reg16, reg18); reg26 = v_merge_lo_64(reg17, reg19); reg27 = v_merge_hi_64(reg17, reg19); reg28 = v_merge_lo_64(reg20, reg22); reg29 = v_merge_hi_64(reg20, reg22); reg30 = v_merge_lo_64(reg21, reg23); reg31 = v_merge_hi_64(reg21, reg23); #endif v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 0)) + (CHANNELS * j), reg24); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 1)) + (CHANNELS * j), reg25); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 2)) + (CHANNELS * j), reg26); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 3)) + (CHANNELS * j), reg27); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 4)) + (CHANNELS * j), reg28); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 5)) + (CHANNELS * j), reg29); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 6)) + (CHANNELS * j), reg30); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 7)) + (CHANNELS * j), reg31); } } #if 0 dprofile_dump16(dprofile_word); #endif } /* The direction bits are set as follows: in DIR[0..1] if F>H initially (must go up) (4th pri) in DIR[2..3] if E>max(H,F) (must go left) (3rd pri) in DIR[4..5] if new F>H (must extend up) (2nd pri) in DIR[6..7] if new E>H (must extend left) (1st pri) no bits set: go diagonally */ /* On PPC the fifth parameter is a vector for the result in the lower 64 bits. On x86_64 the fifth parameter is the address to write the result to. */ #ifdef __PPC__ /* Handle differences between GNU and IBM compilers */ #ifdef __IBMCPP__ #define VECTORBYTEPERMUTE vec_bperm #else #define VECTORBYTEPERMUTE vec_vbpermq #endif /* The VSX vec_bperm instruction puts the 16 selected bits of the first source into bits 48-63 of the destination. 
*/ const __vector unsigned char perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 }; #define ALIGNCORE(H, N, F, V, RES, QR_q, R_q, QR_t, R_t, H_MIN, H_MAX) \ { \ __vector unsigned short W, X, Y, Z; \ __vector unsigned int WX, YZ; \ __vector short VV; \ VV = v_load(&V); \ H = v_add(H, VV); \ W = (__vector unsigned short) VECTORBYTEPERMUTE \ ((__vector unsigned char) vec_cmpgt(F, H), perm); \ H = v_max(H, F); \ X = (__vector unsigned short) VECTORBYTEPERMUTE \ ((__vector unsigned char) vec_cmpgt(E, H), perm); \ H = v_max(H, E); \ H_MIN = v_min(H_MIN, H); \ H_MAX = v_max(H_MAX, H); \ N = H; \ HF = v_sub(H, QR_t); \ F = v_sub(F, R_t); \ Y = (__vector unsigned short) VECTORBYTEPERMUTE \ ((__vector unsigned char) vec_cmpgt(F, HF), perm); \ F = v_max(F, HF); \ HE = v_sub(H, QR_q); \ E = v_sub(E, R_q); \ Z = (__vector unsigned short) VECTORBYTEPERMUTE \ ((__vector unsigned char) vec_cmpgt(E, HE), perm); \ E = v_max(E, HE); \ WX = (__vector unsigned int) vec_mergel(W, X); \ YZ = (__vector unsigned int) vec_mergel(Y, Z); \ RES = (__vector unsigned long long) vec_mergeh(WX, YZ); \ } #else /* x86_64 & aarch64 */ #define ALIGNCORE(H, N, F, V, PATH, QR_q, R_q, QR_t, R_t, H_MIN, H_MAX) \ H = v_add(H, V); \ *((PATH)+0) = v_mask_gt(F, H); \ (H) = v_max(H, F); \ *((PATH)+1) = v_mask_gt(E, H); \ (H) = v_max(H, E); \ (H_MIN) = v_min(H_MIN, H); \ (H_MAX) = v_max(H_MAX, H); \ (N) = H; \ HF = v_sub(H, QR_t); \ (F) = v_sub(F, R_t); \ *((PATH)+2) = v_mask_gt(F, HF); \ (F) = v_max(F, HF); \ HE = v_sub(H, QR_q); \ E = v_sub(E, R_q); \ *((PATH)+3) = v_mask_gt(E, HE); \ E = v_max(E, HE); #endif auto aligncolumns_first(VECTOR_SHORT * Sm, VECTOR_SHORT * hep, VECTOR_SHORT ** qp, VECTOR_SHORT QR_q_i, VECTOR_SHORT R_q_i, VECTOR_SHORT QR_q_r, VECTOR_SHORT R_q_r, VECTOR_SHORT QR_t_0, VECTOR_SHORT R_t_0, VECTOR_SHORT QR_t_1, VECTOR_SHORT R_t_1, VECTOR_SHORT QR_t_2, VECTOR_SHORT R_t_2, VECTOR_SHORT QR_t_3, VECTOR_SHORT R_t_3, VECTOR_SHORT h0, VECTOR_SHORT h1, VECTOR_SHORT h2, 
VECTOR_SHORT h3, VECTOR_SHORT f0, VECTOR_SHORT f1, VECTOR_SHORT f2, VECTOR_SHORT f3, VECTOR_SHORT * _h_min, VECTOR_SHORT * _h_max, VECTOR_SHORT Mm, VECTOR_SHORT M_QR_t_left, VECTOR_SHORT M_R_t_left, VECTOR_SHORT M_QR_q_interior, VECTOR_SHORT M_QR_q_right, int64_t ql, unsigned short * dir) -> void { VECTOR_SHORT h4; VECTOR_SHORT h5; VECTOR_SHORT h6; VECTOR_SHORT h7; VECTOR_SHORT h8; VECTOR_SHORT E; VECTOR_SHORT HE; VECTOR_SHORT HF; VECTOR_SHORT * vp = nullptr; VECTOR_SHORT h_min = v_zero; VECTOR_SHORT h_max = v_zero; #ifdef __PPC__ __vector unsigned long long RES1; __vector unsigned long long RES2; __vector unsigned long long RES; #endif int64_t i = 0; f0 = v_sub(f0, QR_t_0); f1 = v_sub(f1, QR_t_1); f2 = v_sub(f2, QR_t_2); f3 = v_sub(f3, QR_t_3); for (i = 0; i < ql - 1; i++) { vp = qp[i + 0]; h4 = hep[(2 * i) + 0]; E = hep[(2 * i) + 1]; /* Initialize selected h and e values for next/this round. First zero those cells where a new sequence starts by using an unsigned saturated subtraction of a huge value to set it to zero. Then use signed subtraction to obtain the correct value. 
*/ h4 = v_sub_unsigned(h4, Mm); h4 = v_sub(h4, M_QR_t_left); E = v_sub_unsigned(E, Mm); E = v_sub(E, M_QR_t_left); E = v_sub(E, M_QR_q_interior); M_QR_t_left = v_add(M_QR_t_left, M_R_t_left); #ifdef __PPC__ ALIGNCORE(h0, h5, f0, vp[0], RES1, QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], RES2, QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16*i + 0), RES); ALIGNCORE(h2, h7, f2, vp[2], RES1, QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], RES2, QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16*i + 8), RES); #else ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+0, QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+4, QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max); ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+8, QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12, QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max); #endif hep[(2 * i) + 0] = h8; hep[(2 * i) + 1] = E; h0 = h4; h1 = h5; h2 = h6; h3 = h7; } /* the final round - using query gap penalties for right end */ vp = qp[i + 0]; E = hep[(2 * i) + 1]; E = v_sub_unsigned(E, Mm); E = v_sub(E, M_QR_t_left); E = v_sub(E, M_QR_q_right); #ifdef __PPC__ ALIGNCORE(h0, h5, f0, vp[0], RES1, QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], RES2, QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16*i + 0), RES); ALIGNCORE(h2, h7, f2, vp[2], RES1, QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], RES2, QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16*i + 8), RES); #else ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+ 0, QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+ 4, QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max); 
ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+ 8, QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12, QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max); #endif hep[(2 * i) + 0] = h8; hep[(2 * i) + 1] = E; Sm[0] = h5; Sm[1] = h6; Sm[2] = h7; Sm[3] = h8; *_h_min = h_min; *_h_max = h_max; } auto aligncolumns_rest(VECTOR_SHORT * Sm, VECTOR_SHORT * hep, VECTOR_SHORT ** qp, VECTOR_SHORT QR_q_i, VECTOR_SHORT R_q_i, VECTOR_SHORT QR_q_r, VECTOR_SHORT R_q_r, VECTOR_SHORT QR_t_0, VECTOR_SHORT R_t_0, VECTOR_SHORT QR_t_1, VECTOR_SHORT R_t_1, VECTOR_SHORT QR_t_2, VECTOR_SHORT R_t_2, VECTOR_SHORT QR_t_3, VECTOR_SHORT R_t_3, VECTOR_SHORT h0, VECTOR_SHORT h1, VECTOR_SHORT h2, VECTOR_SHORT h3, VECTOR_SHORT f0, VECTOR_SHORT f1, VECTOR_SHORT f2, VECTOR_SHORT f3, VECTOR_SHORT * _h_min, VECTOR_SHORT * _h_max, int64_t ql, unsigned short * dir) -> void { VECTOR_SHORT h4; VECTOR_SHORT h5; VECTOR_SHORT h6; VECTOR_SHORT h7; VECTOR_SHORT h8; VECTOR_SHORT E; VECTOR_SHORT HE; VECTOR_SHORT HF; VECTOR_SHORT * vp = nullptr; VECTOR_SHORT h_min = v_zero; VECTOR_SHORT h_max = v_zero; #ifdef __PPC__ __vector unsigned long long RES1; __vector unsigned long long RES2; __vector unsigned long long RES; #endif int64_t i = 0; f0 = v_sub(f0, QR_t_0); f1 = v_sub(f1, QR_t_1); f2 = v_sub(f2, QR_t_2); f3 = v_sub(f3, QR_t_3); for (i = 0; i < ql - 1; i++) { vp = qp[i + 0]; h4 = hep[(2 * i) + 0]; E = hep[(2 * i) + 1]; #ifdef __PPC__ ALIGNCORE(h0, h5, f0, vp[0], RES1, QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], RES2, QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16 * i + 0), RES); ALIGNCORE(h2, h7, f2, vp[2], RES1, QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], RES2, QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16 * i + 8), RES); #else ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+ 0, QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, 
h_max); ALIGNCORE(h1, h6, f1, vp[1], dir + 16 * i + 4, QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max); ALIGNCORE(h2, h7, f2, vp[2], dir + 16 * i + 8, QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], dir + 16 * i + 12, QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max); #endif hep[(2 * i) + 0] = h8; hep[(2 * i) + 1] = E; h0 = h4; h1 = h5; h2 = h6; h3 = h7; } /* the final round - using query gap penalties for right end */ vp = qp[i + 0]; E = hep[(2 * i) + 1]; #ifdef __PPC__ ALIGNCORE(h0, h5, f0, vp[0], RES1, QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], RES2, QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16 * i + 0), RES); ALIGNCORE(h2, h7, f2, vp[2], RES1, QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], RES2, QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16 * i + 8), RES); #else ALIGNCORE(h0, h5, f0, vp[0], dir + 16 * i + 0, QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], dir + 16 * i + 4, QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max); ALIGNCORE(h2, h7, f2, vp[2], dir + 16 * i + 8, QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], dir + 16 * i + 12, QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max); #endif hep[(2 * i) + 0] = h8; hep[(2 * i) + 1] = E; Sm[0] = h5; Sm[1] = h6; Sm[2] = h7; Sm[3] = h8; *_h_min = h_min; *_h_max = h_max; } inline auto pushop(s16info_s * s, char newop) -> void { if (newop == s->op) { s->opcount++; } else { *--s->cigarend = s->op; if (s->opcount > 1) { const int size = 11; char buf[size]; int const len = snprintf(buf, size, "%d", s->opcount); s->cigarend -= len; memcpy(s->cigarend, buf, len); } s->op = newop; s->opcount = 1; } } inline auto finishop(s16info_s * s) -> void { if (s->op && s->opcount) { *--s->cigarend = s->op; if (s->opcount > 1) { const int size = 11; char buf[size]; int const len = 
snprintf(buf, size, "%d", s->opcount); s->cigarend -= len; memcpy(s->cigarend, buf, len); } s->op = 0; s->opcount = 0; } } auto backtrack16(s16info_s * s, char * dseq, uint64_t dlen, uint64_t offset, uint64_t channel, unsigned short * paligned, unsigned short * pmatches, unsigned short * pmismatches, unsigned short * pgaps) -> void { unsigned short * dirbuffer = s->dir; uint64_t const dirbuffersize = s->qlen * s->maxdlen * 4; uint64_t const qlen = s->qlen; char * qseq = s->qseq; uint64_t const maskup = 3ULL << (2 * channel + 0); uint64_t const maskleft = 3ULL << (2 * channel + 16); uint64_t const maskextup = 3ULL << (2 * channel + 32); uint64_t const maskextleft = 3ULL << (2 * channel + 48); #if 0 printf("Dumping backtracking array\n"); for (uint64_t i = 0; i < qlen; i++) { for (uint64_t j = 0; j < dlen; j++) { uint64_t d = *((uint64_t *) (dirbuffer + (offset + 16 * s->qlen * (j / 4) + 16 * i + 4 * (j & 3)) % dirbuffersize)); if (d & maskup) { if (d & maskleft) printf("+"); else printf("^"); } else if (d & maskleft) { printf("<"); } else { printf("\\"); } } printf("\n"); } printf("Dumping gap extension array\n"); for (uint64_t i = 0; i < qlen; i++) { for (uint64_t j = 0; j < dlen; j++) { uint64_t d = *((uint64_t *) (dirbuffer + (offset + 16 * s->qlen * (j / 4) + 16 * i + 4 * (j & 3)) % dirbuffersize)); if (d & maskextup) { if (d & maskextleft) printf("+"); else printf("^"); } else if (d & maskextleft) { printf("<"); } else { printf("\\"); } } printf("\n"); } #endif unsigned short aligned = 0; unsigned short matches = 0; unsigned short mismatches = 0; unsigned short gaps = 0; int64_t i = qlen - 1; int64_t j = dlen - 1; s->cigarend = s->cigar + s->qlen + s->maxdlen + 1; s->op = 0; s->opcount = 1; while ((i >= 0) && (j >= 0)) { ++aligned; uint64_t const d = *((uint64_t *) (dirbuffer + (offset + (16 * s->qlen * (j / 4)) + (16 * i) + (4 * (j & 3))) % dirbuffersize)); if ((s->op == 'I') && (d & maskextleft)) { --j; pushop(s, 'I'); } else if ((s->op == 'D') && (d & 
maskextup)) { --i; pushop(s, 'D'); } else if (d & maskleft) { if (s->op != 'I') { ++gaps; } --j; pushop(s, 'I'); } else if (d & maskup) { if (s->op != 'D') { ++gaps; } --i; pushop(s, 'D'); } else { if (chrmap_4bit[(int) (qseq[i])] & chrmap_4bit[(int) (dseq[j])]) { if (opt_n_mismatch && ((chrmap_4bit[(int) (qseq[i])] == 15) || (chrmap_4bit[(int) (dseq[j])] == 15))) ++mismatches; else ++matches; } else { ++mismatches; } --i; --j; pushop(s, 'M'); } } while(i >= 0) { ++aligned; if (s->op != 'D') { ++gaps; } --i; pushop(s, 'D'); } while(j >= 0) { ++aligned; if (s->op != 'I') { ++gaps; } --j; pushop(s, 'I'); } finishop(s); /* move cigar to beginning of allocated memory area */ int const cigarlen = s->cigar + s->qlen + s->maxdlen - s->cigarend; memmove(s->cigar, s->cigarend, cigarlen + 1); * paligned = aligned; * pmatches = matches; * pmismatches = mismatches; * pgaps = gaps; } auto search16_init(CELL score_match, CELL score_mismatch, CELL penalty_gap_open_query_left, CELL penalty_gap_open_target_left, CELL penalty_gap_open_query_interior, CELL penalty_gap_open_target_interior, CELL penalty_gap_open_query_right, CELL penalty_gap_open_target_right, CELL penalty_gap_extension_query_left, CELL penalty_gap_extension_target_left, CELL penalty_gap_extension_query_interior, CELL penalty_gap_extension_target_interior, CELL penalty_gap_extension_query_right, CELL penalty_gap_extension_target_right) -> struct s16info_s * { (void) score_match; (void) score_mismatch; /* prepare alloc of qtable, dprofile, hearray, dir */ auto * s = (struct s16info_s *) xmalloc(sizeof(struct s16info_s)); s->dprofile = (VECTOR_SHORT *) xmalloc(2 * 4 * 8 * 16); s->qlen = 0; s->qseq = nullptr; s->maxdlen = 0; s->dir = nullptr; s->diralloc = 0; s->hearray = nullptr; s->qtable = nullptr; s->cigar = nullptr; s->cigarend = nullptr; s->cigaralloc = 0; for (int i = 0; i < 16; i++) { for (int j = 0; j < 16; j++) { CELL value = 0; if (opt_n_mismatch && ((i == 15) || (j == 15))) { value = opt_mismatch; } else if 
(ambiguous_4bit[i] or ambiguous_4bit[j]) { value = 0; } else if (i == j) { value = opt_match; } else { value = opt_mismatch; } ((CELL *) (&s->matrix))[(16 * i) + j] = value; scorematrix[i][j] = value; } } s->penalty_gap_open_query_left = penalty_gap_open_query_left; s->penalty_gap_open_query_interior = penalty_gap_open_query_interior; s->penalty_gap_open_query_right = penalty_gap_open_query_right; s->penalty_gap_open_target_left = penalty_gap_open_target_left; s->penalty_gap_open_target_interior = penalty_gap_open_target_interior; s->penalty_gap_open_target_right = penalty_gap_open_target_right; s->penalty_gap_extension_query_left = penalty_gap_extension_query_left; s->penalty_gap_extension_query_interior = penalty_gap_extension_query_interior; s->penalty_gap_extension_query_right = penalty_gap_extension_query_right; s->penalty_gap_extension_target_left = penalty_gap_extension_target_left; s->penalty_gap_extension_target_interior = penalty_gap_extension_target_interior; s->penalty_gap_extension_target_right = penalty_gap_extension_target_right; return s; } auto search16_exit(s16info_s * s) -> void { /* free mem for dprofile, hearray, dir, qtable */ if (s->dir) { xfree(s->dir); } if (s->hearray) { xfree(s->hearray); } if (s->dprofile) { xfree(s->dprofile); } if (s->qtable) { xfree(s->qtable); } if (s->cigar) { xfree(s->cigar); } xfree(s); } auto search16_qprep(s16info_s * s, char * qseq, int qlen) -> void { s->qlen = qlen; s->qseq = qseq; if (s->hearray) { xfree(s->hearray); } s->hearray = (VECTOR_SHORT *) xmalloc(2 * s->qlen * sizeof(VECTOR_SHORT)); memset(s->hearray, 0, 2 * s->qlen * sizeof(VECTOR_SHORT)); if (s->qtable) { xfree(s->qtable); } s->qtable = (VECTOR_SHORT **) xmalloc(s->qlen * sizeof(VECTOR_SHORT*)); for (int i = 0; i < qlen; i++) { s->qtable[i] = s->dprofile + 4 * chrmap_4bit[(int) (qseq[i])]; } } /* Turn off tree-partial-pre optimizations for the rest of the file. GNU C++ 9 or later generates incorrect code on x86_64 if turned on. 
*/ #ifdef __GNUC__ #ifndef __clang__ #pragma GCC optimize ("-fno-tree-partial-pre") #endif #endif auto search16(s16info_s * s, unsigned int sequences, unsigned int * seqnos, CELL * pscores, unsigned short * paligned, unsigned short * pmatches, unsigned short * pmismatches, unsigned short * pgaps, char ** pcigar) -> void { CELL ** q_start = (CELL **) s->qtable; CELL * dprofile = (CELL *) s->dprofile; CELL * hearray = (CELL *) s->hearray; uint64_t const qlen = s->qlen; if (qlen == 0) { for (unsigned int cand_id = 0; cand_id < sequences; cand_id++) { unsigned int const seqno = seqnos[cand_id]; int64_t const length = db_getsequencelen(seqno); paligned[cand_id] = length; pmatches[cand_id] = 0; pmismatches[cand_id] = 0; pgaps[cand_id] = length; if (length == 0) { pscores[cand_id] = 0; } else { pscores[cand_id] = MAX(- s->penalty_gap_open_target_left - (length * s->penalty_gap_extension_target_left), - s->penalty_gap_open_target_right - (length * s->penalty_gap_extension_target_right)); } char * cigar = nullptr; if (length > 0) { int const ret = xsprintf(&cigar, "%ldI", length); if ((ret < 2) or not cigar) { fatal("Unable to allocate enough memory."); } } else { cigar = (char *) xmalloc(1); cigar[0] = 0; } pcigar[cand_id] = cigar; } return; } /* find longest target sequence and reallocate direction buffer */ uint64_t maxdlen = 0; for (int64_t i = 0; i < sequences; i++) { uint64_t const dlen = db_getsequencelen(seqnos[i]); /* skip the very long sequences */ if ((int64_t) (s->qlen) * dlen <= MAXSEQLENPRODUCT) { if (dlen > maxdlen) { maxdlen = dlen; } } } maxdlen = 4 * ((maxdlen + 3) / 4); s->maxdlen = maxdlen; uint64_t const dirbuffersize = s->qlen * s->maxdlen * 4; if (dirbuffersize > s->diralloc) { s->diralloc = dirbuffersize; if (s->dir) { xfree(s->dir); } s->dir = (unsigned short*) xmalloc(dirbuffersize * sizeof(unsigned short)); } unsigned short * dirbuffer = s->dir; if (s->qlen + s->maxdlen + 1 > s->cigaralloc) { s->cigaralloc = s->qlen + s->maxdlen + 1; if (s->cigar) 
{ xfree(s->cigar); } s->cigar = (char *) xmalloc(s->cigaralloc); } VECTOR_SHORT M; VECTOR_SHORT T0; VECTOR_SHORT M_QR_target_left; VECTOR_SHORT M_R_target_left; VECTOR_SHORT M_QR_query_interior; VECTOR_SHORT M_QR_query_right; VECTOR_SHORT R_query_left; VECTOR_SHORT QR_query_interior; VECTOR_SHORT R_query_interior; VECTOR_SHORT QR_query_right; VECTOR_SHORT R_query_right; VECTOR_SHORT QR_target_left; VECTOR_SHORT R_target_left; VECTOR_SHORT QR_target_interior; VECTOR_SHORT R_target_interior; VECTOR_SHORT QR_target_right; VECTOR_SHORT R_target_right; VECTOR_SHORT QR_target[4]; VECTOR_SHORT R_target[4]; VECTOR_SHORT * hep = nullptr; VECTOR_SHORT ** qp = nullptr; BYTE * d_begin[CHANNELS]; BYTE * d_end[CHANNELS]; uint64_t d_offset[CHANNELS]; BYTE * d_address[CHANNELS]; uint64_t d_length[CHANNELS]; int64_t seq_id[CHANNELS]; bool overflow[CHANNELS]; VECTOR_SHORT dseqalloc[CDEPTH]; VECTOR_SHORT S[4]; BYTE * dseq = (BYTE *) & dseqalloc; BYTE zero = 0; uint64_t next_id = 0; uint64_t done = 0; T0 = v_init(-1, 0, 0, 0, 0, 0, 0, 0); R_query_left = v_dup(s->penalty_gap_extension_query_left); QR_query_interior = v_dup((s->penalty_gap_open_query_interior + s->penalty_gap_extension_query_interior)); R_query_interior = v_dup(s->penalty_gap_extension_query_interior); QR_query_right = v_dup((s->penalty_gap_open_query_right + s->penalty_gap_extension_query_right)); R_query_right = v_dup(s->penalty_gap_extension_query_right); QR_target_left = v_dup((s->penalty_gap_open_target_left + s->penalty_gap_extension_target_left)); R_target_left = v_dup(s->penalty_gap_extension_target_left); QR_target_interior = v_dup((s->penalty_gap_open_target_interior + s->penalty_gap_extension_target_interior)); R_target_interior = v_dup(s->penalty_gap_extension_target_interior); QR_target_right = v_dup((s->penalty_gap_open_target_right + s->penalty_gap_extension_target_right)); R_target_right = v_dup(s->penalty_gap_extension_target_right); hep = (VECTOR_SHORT *) hearray; qp = (VECTOR_SHORT **) q_start; for 
(int c = 0; c < CHANNELS; c++) { d_begin[c] = &zero; d_end[c] = d_begin[c]; d_address[c] = nullptr; d_offset[c] = 0; d_length[c] = 0; seq_id[c] = -1; overflow[c] = false; } short gap_penalty_max = 0; gap_penalty_max = MAX(gap_penalty_max, s->penalty_gap_open_query_left + s->penalty_gap_extension_query_left); gap_penalty_max = MAX(gap_penalty_max, s->penalty_gap_open_query_interior + s->penalty_gap_extension_query_interior); gap_penalty_max = MAX(gap_penalty_max, s->penalty_gap_open_query_right + s->penalty_gap_extension_query_right); gap_penalty_max = MAX(gap_penalty_max, s->penalty_gap_open_target_left + s->penalty_gap_extension_target_left); gap_penalty_max = MAX(gap_penalty_max, s->penalty_gap_open_target_interior + s->penalty_gap_extension_target_interior); gap_penalty_max = MAX(gap_penalty_max, s->penalty_gap_open_target_right + s->penalty_gap_extension_target_right); short const score_min = std::numeric_limits::min() + gap_penalty_max; short const score_max = std::numeric_limits::max(); for (int i = 0; i < 4; i++) { S[i] = v_zero; dseqalloc[i] = v_zero; } VECTOR_SHORT H0 = v_zero; VECTOR_SHORT H1 = v_zero; VECTOR_SHORT H2 = v_zero; VECTOR_SHORT H3 = v_zero; VECTOR_SHORT F0 = v_zero; VECTOR_SHORT F1 = v_zero; VECTOR_SHORT F2 = v_zero; VECTOR_SHORT F3 = v_zero; bool easy = false; unsigned short * dir = dirbuffer; while (true) { if (easy) { /* fill all channels with symbols from the database sequences */ for (int c = 0; c < CHANNELS; c++) { for (int j = 0; j < CDEPTH; j++) { if (d_begin[c] < d_end[c]) { dseq[(CHANNELS * j) + c] = chrmap_4bit[*(d_begin[c]++)]; } else { dseq[(CHANNELS * j) + c] = 0; } } if (d_begin[c] == d_end[c]) { easy = false; } } dprofile_fill16(dprofile, (CELL*) s->matrix, dseq); /* create vectors of gap penalties for target depending on whether any of the database sequences ended in these four columns */ if (easy) { for (unsigned int j = 0; j < CDEPTH; j++) { QR_target[j] = QR_target_interior; R_target[j] = R_target_interior; } } else { /* 
one or more sequences ended */ VECTOR_SHORT const QR_diff = v_sub(QR_target_right, QR_target_interior); VECTOR_SHORT const R_diff = v_sub(R_target_right, R_target_interior); for (unsigned int j = 0; j < CDEPTH; j++) { VECTOR_SHORT MM = v_zero; VECTOR_SHORT TT = T0; for (int c = 0; c < CHANNELS; c++) { if ((d_begin[c] == d_end[c]) && (j >= ((d_length[c] + 3) % 4))) { MM = v_xor(MM, TT); } TT = v_shift_left(TT); } QR_target[j] = v_add(QR_target_interior, v_and(QR_diff, MM)); R_target[j] = v_add(R_target_interior, v_and(R_diff, MM)); } } VECTOR_SHORT h_min; VECTOR_SHORT h_max; aligncolumns_rest(S, hep, qp, QR_query_interior, R_query_interior, QR_query_right, R_query_right, QR_target[0], R_target[0], QR_target[1], R_target[1], QR_target[2], R_target[2], QR_target[3], R_target[3], H0, H1, H2, H3, F0, F1, F2, F3, & h_min, & h_max, qlen, dir); VECTOR_SHORT h_min_vector; VECTOR_SHORT h_max_vector; v_store(& h_min_vector, h_min); v_store(& h_max_vector, h_max); for (int c = 0; c < CHANNELS; c++) { if (not overflow[c]) { signed short const h_min_c = ((signed short *) (& h_min_vector))[c]; signed short const h_max_c = ((signed short *) (& h_max_vector))[c]; if ((h_min_c <= score_min) or (h_max_c >= score_max)) { overflow[c] = true; } } } } else { /* One or more sequences ended in the previous block. We have to switch over to a new sequence */ easy = true; M = v_zero; VECTOR_SHORT T = T0; for (int c = 0; c < CHANNELS; c++) { if (d_begin[c] < d_end[c]) { /* this channel has more sequence */ for (int j = 0; j < CDEPTH; j++) { if (d_begin[c] < d_end[c]) { dseq[(CHANNELS * j) + c] = chrmap_4bit[*(d_begin[c]++)]; } else { dseq[(CHANNELS * j) + c] = 0; } } if (d_begin[c] == d_end[c]) { easy = false; } } else { /* sequence in channel c ended. 
change of sequence */ M = v_xor(M, T); int64_t cand_id = seq_id[c]; if (cand_id >= 0) { /* save score */ char * dbseq = (char *) d_address[c]; int64_t const dbseqlen = d_length[c]; int64_t const z = (dbseqlen + 3) % 4; int64_t const score = ((CELL *) S)[(z * CHANNELS) + c]; if (overflow[c]) { pscores[cand_id] = std::numeric_limits::max(); paligned[cand_id] = 0; pmatches[cand_id] = 0; pmismatches[cand_id] = 0; pgaps[cand_id] = 0; pcigar[cand_id] = xstrdup(""); } else { pscores[cand_id] = score; backtrack16(s, dbseq, dbseqlen, d_offset[c], c, paligned + cand_id, pmatches + cand_id, pmismatches + cand_id, pgaps + cand_id); pcigar[cand_id] = (char *) xmalloc(strlen(s->cigar)+1); strcpy(pcigar[cand_id], s->cigar); } done++; } /* get next sequence of reasonable length */ int64_t length = 0; while ((length == 0) && (next_id < sequences)) { cand_id = next_id++; length = db_getsequencelen(seqnos[cand_id]); if ((length == 0) or (s->qlen * length > MAXSEQLENPRODUCT)) { pscores[cand_id] = std::numeric_limits::max(); paligned[cand_id] = 0; pmatches[cand_id] = 0; pmismatches[cand_id] = 0; pgaps[cand_id] = 0; pcigar[cand_id] = xstrdup(""); length = 0; done++; } } if (length > 0) { seq_id[c] = cand_id; char * address = db_getsequence(seqnos[cand_id]); d_address[c] = (BYTE *) address; d_length[c] = length; d_begin[c] = (unsigned char *) address; d_end[c] = (unsigned char *) address + length; d_offset[c] = dir - dirbuffer; overflow[c] = false; ((CELL *) &H0)[c] = 0; ((CELL *) &H1)[c] = - s->penalty_gap_open_query_left - 1 * s->penalty_gap_extension_query_left; ((CELL *) &H2)[c] = - s->penalty_gap_open_query_left - 2 * s->penalty_gap_extension_query_left; ((CELL *) &H3)[c] = - s->penalty_gap_open_query_left - 3 * s->penalty_gap_extension_query_left; ((CELL *) &F0)[c] = - s->penalty_gap_open_query_left - 1 * s->penalty_gap_extension_query_left; ((CELL *) &F1)[c] = - s->penalty_gap_open_query_left - 2 * s->penalty_gap_extension_query_left; ((CELL *) &F2)[c] = - 
s->penalty_gap_open_query_left - 3 * s->penalty_gap_extension_query_left; ((CELL *) &F3)[c] = - s->penalty_gap_open_query_left - 4 * s->penalty_gap_extension_query_left; /* fill channel */ for (int j = 0; j < CDEPTH; j++) { if (d_begin[c] < d_end[c]) { dseq[(CHANNELS * j) + c] = chrmap_4bit[*(d_begin[c]++)]; } else { dseq[(CHANNELS * j) + c] = 0; } } if (d_begin[c] == d_end[c]) { easy = false; } } else { /* no more sequences, empty channel */ seq_id[c] = -1; d_address[c] = nullptr; d_begin[c] = &zero; d_end[c] = d_begin[c]; d_length[c] = 0; d_offset[c] = 0; for (int j = 0; j < CDEPTH; j++) { dseq[(CHANNELS * j) + c] = 0; } } } T = v_shift_left(T); } if (done == sequences) { break; } /* make masked versions of QR and R for gaps in target */ M_QR_target_left = v_and(M, QR_target_left); M_R_target_left = v_and(M, R_target_left); /* make masked versions of QR for gaps in query at target left end */ M_QR_query_interior = v_and(M, QR_query_interior); M_QR_query_right = v_and(M, QR_query_right); dprofile_fill16(dprofile, (CELL *) s->matrix, dseq); /* create vectors of gap penalties for target depending on whether any of the database sequences ended in these four columns */ if (easy) { for (unsigned int j = 0; j < CDEPTH; j++) { QR_target[j] = QR_target_interior; R_target[j] = R_target_interior; } } else { /* one or more sequences ended */ VECTOR_SHORT const QR_diff = v_sub(QR_target_right, QR_target_interior); VECTOR_SHORT const R_diff = v_sub(R_target_right, R_target_interior); for (unsigned int j = 0; j < CDEPTH; j++) { VECTOR_SHORT MM = v_zero; VECTOR_SHORT TT = T0; for (int c = 0; c < CHANNELS; c++) { if ((d_begin[c] == d_end[c]) && (j >= ((d_length[c] + 3) % 4))) { MM = v_xor(MM, TT); } TT = v_shift_left(TT); } QR_target[j] = v_add(QR_target_interior, v_and(QR_diff, MM)); R_target[j] = v_add(R_target_interior, v_and(R_diff, MM)); } } VECTOR_SHORT h_min; VECTOR_SHORT h_max; aligncolumns_first(S, hep, qp, QR_query_interior, R_query_interior, QR_query_right, 
R_query_right, QR_target[0], R_target[0], QR_target[1], R_target[1], QR_target[2], R_target[2], QR_target[3], R_target[3], H0, H1, H2, H3, F0, F1, F2, F3, & h_min, & h_max, M, M_QR_target_left, M_R_target_left, M_QR_query_interior, M_QR_query_right, qlen, dir); VECTOR_SHORT h_min_vector; VECTOR_SHORT h_max_vector; v_store(& h_min_vector, h_min); v_store(& h_max_vector, h_max); for (int c = 0; c < CHANNELS; c++) { if (not overflow[c]) { signed short const h_min_c = ((signed short *) (& h_min_vector))[c]; signed short const h_max_c = ((signed short *) (& h_max_vector))[c]; if ((h_min_c <= score_min) or (h_max_c >= score_max)) { overflow[c] = true; } } } } H0 = v_sub(H3, R_query_left); H1 = v_sub(H0, R_query_left); H2 = v_sub(H1, R_query_left); H3 = v_sub(H2, R_query_left); F0 = v_sub(F3, R_query_left); F1 = v_sub(F0, R_query_left); F2 = v_sub(F1, R_query_left); F3 = v_sub(F2, R_query_left); dir += 4 * 4 * s->qlen; if (dir >= dirbuffer + dirbuffersize) { dir -= dirbuffersize; } } } vsearch-2.30.0/src/align_simd.h000066400000000000000000000073631476012147200163120ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Public interface of the 16-bit SIMD global aligner. */

/* Element types used by the aligner; CELL is the type of alignment scores. */
using CELL = signed short;
using WORD = unsigned short;
using BYTE = unsigned char;

/* Opaque aligner state, created by search16_init and released by
   search16_exit. */
struct s16info_s;

/* Create an aligner state from the match/mismatch scores and the twelve
   gap open/extension penalties (query/target x left/interior/right).
   The returned pointer is handed to the other search16_* functions. */
auto search16_init(CELL score_match,
                   CELL score_mismatch,
                   CELL penalty_gap_open_query_left,
                   CELL penalty_gap_open_target_left,
                   CELL penalty_gap_open_query_interior,
                   CELL penalty_gap_open_target_interior,
                   CELL penalty_gap_open_query_right,
                   CELL penalty_gap_open_target_right,
                   CELL penalty_gap_extension_query_left,
                   CELL penalty_gap_extension_target_left,
                   CELL penalty_gap_extension_query_interior,
                   CELL penalty_gap_extension_target_interior,
                   CELL penalty_gap_extension_query_right,
                   CELL penalty_gap_extension_target_right) -> struct s16info_s *;

/* Release an aligner state obtained from search16_init. */
auto search16_exit(s16info_s * s) -> void;

/* Register the query sequence (qseq, of length qlen) to be used by the
   following search16 call(s). */
auto search16_qprep(s16info_s * s, char * qseq, int qlen) -> void;

/* Align the prepared query against `sequences` database sequences whose
   numbers are listed in seqnos. Each output array receives one entry per
   target, in the same order as seqnos. Callers in this code base treat a
   score equal to the maximum value of CELL as "could not be aligned" and
   redo that alignment with the linear memory aligner; they also take
   ownership of the non-null pcigar strings and release them with xfree. */
auto search16(s16info_s * s,
              unsigned int sequences,
              unsigned int * seqnos,
              CELL * pscores,
              unsigned short * paligned,
              unsigned short * pmatches,
              unsigned short * pmismatches,
              unsigned short * pgaps,
              char * * pcigar) -> void;
vsearch-2.30.0/src/allpairs.cc000066400000000000000000000544401476012147200161470ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "align_simd.h" #include "mask.h" #include // std::min, std::max #include // int64_t #include // std::fprintf, std::FILE, std:fclose, std::size_t #include // std::qsort #include // std::strlen #include #include #include static pthread_t * pthread; /* global constants/data, no need for synchronization */ static int seqcount; /* number of database sequences */ static pthread_attr_t attr; /* global data protected by mutex */ static pthread_mutex_t mutex_input; static pthread_mutex_t mutex_output; static int qmatches; static int queries; static int64_t progress = 0; static FILE * fp_alnout = nullptr; static FILE * fp_samout = nullptr; static FILE * fp_userout = nullptr; static FILE * fp_blast6out = nullptr; static FILE * fp_uc = nullptr; static FILE * fp_fastapairs = nullptr; static FILE * fp_matched = nullptr; static FILE * fp_notmatched = nullptr; static FILE * fp_qsegout = nullptr; static FILE * fp_tsegout = nullptr; static int count_matched = 0; static int count_notmatched = 0; inline auto allpairs_hit_compare_typed(struct hit * x, struct hit * y) -> int { // high id, then low id // early target, then late target if (x->id > y->id) { return -1; } else if (x->id < y->id) { return +1; } else if (x->target < y->target) { return -1; } else if (x->target > y->target) { return +1; } else { return 0; } } auto allpairs_hit_compare(const void * a, const void * b) -> int { return allpairs_hit_compare_typed((struct hit *) a, (struct hit *) b); } auto allpairs_output_results(int hit_count, struct hit * hits, char * query_head, int qseqlen, char * qsequence, char * qsequence_rc) -> void { /* show results */ auto const toreport = std::min(opt_maxhits, static_cast(hit_count)); if (fp_alnout) { results_show_alnout(fp_alnout, hits, toreport, query_head, qsequence, qseqlen); } if (fp_samout) { results_show_samout(fp_samout, hits, toreport, query_head, qsequence, qsequence_rc); } if (toreport) { double const top_hit_id = hits[0].id; for (int t = 0; t < 
toreport; t++) { struct hit * hp = hits + t; if (opt_top_hits_only and (hp->id < top_hit_id)) { break; } if (fp_fastapairs) { results_show_fastapairs_one(fp_fastapairs, hp, query_head, qsequence, qsequence_rc); } if (fp_qsegout) { results_show_qsegout_one(fp_qsegout, hp, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_tsegout) { results_show_tsegout_one(fp_tsegout, hp); } if (fp_uc) { if ((t == 0) or opt_uc_allhits) { results_show_uc_one(fp_uc, hp, query_head, qseqlen, hp->target); } } if (fp_userout) { results_show_userout_one(fp_userout, hp, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out) { results_show_blast6out_one(fp_blast6out, hp, query_head, qseqlen); } } } else { if (fp_uc) { results_show_uc_one(fp_uc, nullptr, query_head, qseqlen, 0); } if (opt_output_no_hits) { if (fp_userout) { results_show_userout_one(fp_userout, nullptr, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out) { results_show_blast6out_one(fp_blast6out, nullptr, query_head, qseqlen); } } } if (hit_count) { ++count_matched; if (opt_matched) { fasta_print_general(fp_matched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), 0, count_matched, -1.0, -1, -1, nullptr, 0.0); } } else { ++count_notmatched; if (opt_notmatched) { fasta_print_general(fp_notmatched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), 0, count_notmatched, -1.0, -1, -1, nullptr, 0.0); } } } auto allpairs_thread_run(int64_t t) -> void { (void) t; struct searchinfo_s searchinfo; struct searchinfo_s * si = & searchinfo; searchinfo.hits_v.resize(seqcount); searchinfo.hits = searchinfo.hits_v.data(); searchinfo.s = search16_init(opt_match, opt_mismatch, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, 
opt_gap_extension_query_right, opt_gap_extension_target_right); LinearMemoryAligner lma; int64_t * scorematrix = lma.scorematrix_create(opt_match, opt_mismatch); lma.set_parameters(scorematrix, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); /* allocate memory for alignment results */ auto const maxhits = static_cast(seqcount); std::vector pseqnos(maxhits); std::vector pscores(maxhits); std::vector paligned(maxhits); std::vector pmatches(maxhits); std::vector pmismatches(maxhits); std::vector pgaps(maxhits); std::vector pcigar(maxhits); std::vector finalhits(maxhits); auto cont = true; while (cont) { xpthread_mutex_lock(&mutex_input); int const query_no = queries; if (query_no < seqcount) { ++queries; /* let other threads read input */ xpthread_mutex_unlock(&mutex_input); /* init search info */ searchinfo.query_no = query_no; searchinfo.qsize = db_getabundance(query_no); searchinfo.query_head_len = db_getheaderlen(query_no); searchinfo.query_head = db_getheader(query_no); searchinfo.qseqlen = db_getsequencelen(query_no); searchinfo.qsequence = db_getsequence(query_no); searchinfo.rejects = 0; searchinfo.accepts = 0; searchinfo.hit_count = 0; for (int target = searchinfo.query_no + 1; target < seqcount; target++) { if (opt_acceptall or search_acceptable_unaligned(si, target)) { pseqnos[searchinfo.hit_count] = target; ++searchinfo.hit_count; } } if (searchinfo.hit_count) { /* perform alignments */ search16_qprep(searchinfo.s, searchinfo.qsequence, searchinfo.qseqlen); search16(searchinfo.s, searchinfo.hit_count, pseqnos.data(), pscores.data(), paligned.data(), pmatches.data(), pmismatches.data(), pgaps.data(), pcigar.data()); /* convert to hit structure */ for 
(int h = 0; h < searchinfo.hit_count; h++) { struct hit * hit = &searchinfo.hits_v[h]; unsigned int const target = pseqnos[h]; int64_t nwscore = pscores[h]; char * nwcigar {nullptr}; int64_t nwalignmentlength {0}; int64_t nwmatches {0}; int64_t nwmismatches {0}; int64_t nwgaps {0}; if (nwscore == std::numeric_limits::max()) { /* In case the SIMD aligner cannot align, perform a new alignment with the linear memory aligner */ char * tseq = db_getsequence(target); int64_t const tseqlen = db_getsequencelen(target); if (pcigar[h] != nullptr) { xfree(pcigar[h]); } nwcigar = xstrdup(lma.align(searchinfo.qsequence, tseq, searchinfo.qseqlen, tseqlen)); lma.alignstats(nwcigar, searchinfo.qsequence, tseq, & nwscore, & nwalignmentlength, & nwmatches, & nwmismatches, & nwgaps); } else { nwcigar = pcigar[h]; nwalignmentlength = paligned[h]; nwmatches = pmatches[h]; nwmismatches = pmismatches[h]; nwgaps = pgaps[h]; } hit->target = target; hit->strand = 0; hit->count = 0; hit->accepted = false; hit->rejected = false; hit->aligned = true; hit->weak = false; hit->nwscore = nwscore; hit->nwdiff = nwalignmentlength - nwmatches; hit->nwgaps = nwgaps; hit->nwindels = nwalignmentlength - nwmatches - nwmismatches; hit->nwalignmentlength = nwalignmentlength; hit->nwid = 100.0 * (nwalignmentlength - hit->nwdiff) / nwalignmentlength; hit->nwalignment = nwcigar; hit->matches = nwalignmentlength - hit->nwdiff; hit->mismatches = hit->nwdiff - hit->nwindels; auto const dseqlen = static_cast(db_getsequencelen(target)); hit->shortest = std::min(searchinfo.qseqlen, dseqlen); hit->longest = std::max(searchinfo.qseqlen, dseqlen); /* trim alignment, compute numbers excluding terminal gaps */ align_trim(hit); /* test accept/reject criteria after alignment */ if (opt_acceptall or search_acceptable_aligned(si, hit)) { finalhits[searchinfo.accepts] = *hit; ++searchinfo.accepts; } } /* sort hits */ qsort(finalhits.data(), searchinfo.accepts, sizeof(struct hit), allpairs_hit_compare); } /* lock mutex for 
update of global data and output */ xpthread_mutex_lock(&mutex_output); /* output results */ allpairs_output_results(searchinfo.accepts, finalhits.data(), searchinfo.query_head, searchinfo.qseqlen, searchinfo.qsequence, nullptr); /* update stats */ if (searchinfo.accepts) { ++qmatches; } /* show progress */ progress += seqcount - query_no - 1; progress_update(progress); xpthread_mutex_unlock(&mutex_output); /* free memory for alignment strings */ for (int i = 0; i < searchinfo.hit_count; i++) { if (searchinfo.hits_v[i].aligned) { xfree(searchinfo.hits_v[i].nwalignment); } } } else { /* let other threads read input */ xpthread_mutex_unlock(&mutex_input); cont = false; } } search16_exit(searchinfo.s); xfree(scorematrix); } auto allpairs_thread_worker(void * void_ptr) -> void * { auto const nth_thread = reinterpret_cast(void_ptr); allpairs_thread_run(nth_thread); return nullptr; } auto allpairs_thread_worker_run() -> void { /* initialize threads, start them, join them and return */ xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); /* init and create worker threads, put them into stand-by mode */ for (int t = 0; t < opt_threads; t++) { xpthread_create(pthread + t, &attr, allpairs_thread_worker, (void *) (int64_t) t); } /* finish and clean up worker threads */ for (int t = 0; t < opt_threads; t++) { xpthread_join(pthread[t], nullptr); } xpthread_attr_destroy(&attr); } auto allpairs_global(char * cmdline, char * progheader) -> void { opt_strand = 1; opt_uc_allhits = 1; /* open output files */ if (opt_alnout) { fp_alnout = fopen_output(opt_alnout); if (not fp_alnout) { fatal("Unable to open alignment output file for writing"); } fprintf(fp_alnout, "%s\n", cmdline); fprintf(fp_alnout, "%s\n", progheader); } if (opt_samout) { fp_samout = fopen_output(opt_samout); if (not fp_samout) { fatal("Unable to open SAM output file for writing"); } } if (opt_userout) { fp_userout = fopen_output(opt_userout); if (not fp_userout) { fatal("Unable to 
open user-defined output file for writing"); } } if (opt_blast6out) { fp_blast6out = fopen_output(opt_blast6out); if (not fp_blast6out) { fatal("Unable to open blast6-like output file for writing"); } } if (opt_uc) { fp_uc = fopen_output(opt_uc); if (not fp_uc) { fatal("Unable to open uc output file for writing"); } } if (opt_fastapairs) { fp_fastapairs = fopen_output(opt_fastapairs); if (not fp_fastapairs) { fatal("Unable to open fastapairs output file for writing"); } } if (opt_qsegout) { fp_qsegout = fopen_output(opt_qsegout); if (not fp_qsegout) { fatal("Unable to open qsegout output file for writing"); } } if (opt_tsegout) { fp_tsegout = fopen_output(opt_tsegout); if (not fp_tsegout) { fatal("Unable to open tsegout output file for writing"); } } if (opt_matched) { fp_matched = fopen_output(opt_matched); if (not fp_matched) { fatal("Unable to open matched output file for writing"); } } if (opt_notmatched) { fp_notmatched = fopen_output(opt_notmatched); if (not fp_notmatched) { fatal("Unable to open notmatched output file for writing"); } } db_read(opt_allpairs_global, 0); results_show_samheader(fp_samout, cmdline, opt_allpairs_global); if (opt_qmask == MASK_DUST) { dust_all(); } else if ((opt_qmask == MASK_SOFT) and (opt_hardmask)) { hardmask_all(); } show_rusage(); seqcount = db_getsequencecount(); /* prepare reading of queries */ qmatches = 0; queries = 0; std::vector pthread_v(opt_threads); pthread = pthread_v.data(); /* init mutexes for input and output */ xpthread_mutex_init(&mutex_input, nullptr); xpthread_mutex_init(&mutex_output, nullptr); progress = 0; progress_init("Aligning", MAX(0, ((int64_t) seqcount) * ((int64_t) seqcount - 1)) / 2); // refactoring: issue with parenthesis? 
allpairs_thread_worker_run(); progress_done(); if (not opt_quiet) { fprintf(stderr, "Matching query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(stderr, "\n"); } if (opt_log) { fprintf(fp_log, "Matching query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(fp_log, "\n\n"); } xpthread_mutex_destroy(&mutex_output); xpthread_mutex_destroy(&mutex_input); // pthread_v not used after this point /* clean up, global */ db_free(); if (opt_matched) { fclose(fp_matched); } if (opt_notmatched) { fclose(fp_notmatched); } if (opt_fastapairs) { fclose(fp_fastapairs); } if (opt_qsegout) { fclose(fp_qsegout); } if (opt_tsegout) { fclose(fp_tsegout); } if (fp_uc) { fclose(fp_uc); } if (fp_blast6out) { fclose(fp_blast6out); } if (fp_userout) { fclose(fp_userout); } if (fp_alnout) { fclose(fp_alnout); } if (fp_samout) { fclose(fp_samout); } show_rusage(); } vsearch-2.30.0/src/allpairs.h000066400000000000000000000047461476012147200160150ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Run the allpairs_global command: align every sequence of the database
   named by opt_allpairs_global against all later sequences and write the
   results to the requested output files. `cmdline` and `progheader` are
   written at the top of the alnout file; `cmdline` is also passed to the
   SAM header writer. */
void allpairs_global(char * cmdline, char * progheader);
vsearch-2.30.0/src/arch.cc000066400000000000000000000176421476012147200152600ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "dynlibs.h" #include // std::FILE #include // uint64_t #include // std::realloc, std::free #include // strcasestr const int memalignment = 16; auto arch_get_memused() -> uint64_t { #ifdef _WIN32 PROCESS_MEMORY_COUNTERS pmc; GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(PROCESS_MEMORY_COUNTERS)); return pmc.PeakWorkingSetSize; #else struct rusage r_usage; getrusage(RUSAGE_SELF, & r_usage); # ifdef __APPLE__ /* Mac: ru_maxrss gives the size in bytes */ return r_usage.ru_maxrss; # else /* Linux: ru_maxrss gives the size in kilobytes */ return r_usage.ru_maxrss * 1024; # endif #endif } auto arch_get_memtotal() -> uint64_t { #ifdef _WIN32 MEMORYSTATUSEX ms; ms.dwLength = sizeof(MEMORYSTATUSEX); GlobalMemoryStatusEx(&ms); return ms.ullTotalPhys; #elif defined(__APPLE__) int mib [] = { CTL_HW, HW_MEMSIZE }; int64_t ram = 0; size_t length = sizeof(ram); if(sysctl(mib, 2, &ram, &length, NULL, 0) == -1) fatal("Cannot determine amount of RAM"); return ram; #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) int64_t const phys_pages = sysconf(_SC_PHYS_PAGES); int64_t const pagesize = sysconf(_SC_PAGESIZE); if ((phys_pages == -1) || (pagesize == -1)) { fatal("Cannot determine amount of RAM"); } return pagesize * phys_pages; #else struct sysinfo si; if (sysinfo(&si)) fatal("Cannot determine amount of RAM"); return si.totalram * si.mem_unit; #endif } auto arch_get_cores() -> long { #ifdef _WIN32 SYSTEM_INFO si; GetSystemInfo(&si); 
return si.dwNumberOfProcessors; #else return sysconf(_SC_NPROCESSORS_ONLN); #endif } auto arch_get_user_system_time(double * user_time, double * system_time) -> void { *user_time = 0; *system_time = 0; #ifdef _WIN32 HANDLE hProcess = GetCurrentProcess(); FILETIME ftCreation, ftExit, ftKernel, ftUser; ULARGE_INTEGER ul; GetProcessTimes(hProcess, &ftCreation, &ftExit, &ftKernel, &ftUser); ul.u.HighPart = ftUser.dwHighDateTime; ul.u.LowPart = ftUser.dwLowDateTime; *user_time = ul.QuadPart * 100.0e-9; ul.u.HighPart = ftKernel.dwHighDateTime; ul.u.LowPart = ftKernel.dwLowDateTime; *system_time = ul.QuadPart * 100.0e-9; #else struct rusage r_usage; getrusage(RUSAGE_SELF, & r_usage); * user_time = r_usage.ru_utime.tv_sec * 1.0 + r_usage.ru_utime.tv_usec * 1.0e-6; * system_time = r_usage.ru_stime.tv_sec * 1.0 + r_usage.ru_stime.tv_usec * 1.0e-6; #endif } auto arch_srandom() -> void { /* initialize pseudo-random number generator */ unsigned int seed = opt_randseed; if (seed == 0) { #ifdef _WIN32 srand(GetTickCount()); #else int const fd = open("/dev/urandom", O_RDONLY); if (fd < 0) { fatal("Unable to open /dev/urandom"); } if (read(fd, & seed, sizeof(seed)) < 0) { fatal("Unable to read from /dev/urandom"); } close(fd); srandom(seed); #endif } else { #ifdef _WIN32 srand(seed); #else srandom(seed); #endif } } auto arch_random() -> uint64_t { #ifdef _WIN32 return rand(); #else return random(); #endif } auto xmalloc(size_t size) -> void * { if (size == 0) { size = 1; } void * t = nullptr; #ifdef _WIN32 t = _aligned_malloc(size, memalignment); #else if (posix_memalign(& t, memalignment, size)) { t = nullptr; } #endif if (!t) { fatal("Unable to allocate enough memory."); } return t; } auto xrealloc(void *ptr, size_t size) -> void * { if (size == 0) { size = 1; } #ifdef _WIN32 void * t = _aligned_realloc(ptr, size, memalignment); #else void * t = realloc(ptr, size); #endif if (not t) { fatal("Unable to reallocate enough memory."); } return t; } auto xfree(void * ptr) -> void { if 
(ptr) { #ifdef _WIN32 _aligned_free(ptr); #else free(ptr); #endif } else { fatal("Trying to free a null pointer"); } } auto xfstat(int file_descriptor, xstat_t * buf) -> int { #ifdef _WIN32 return _fstat64(file_descriptor, buf); #else return fstat(file_descriptor, buf); #endif } auto xstat(const char * path, xstat_t * buf) -> int { #ifdef _WIN32 return _stat64(path, buf); #else return stat(path, buf); #endif } auto xlseek(int file_descriptor, uint64_t offset, int whence) -> uint64_t { #ifdef _WIN32 return _lseeki64(file_descriptor, offset, whence); #else return lseek(file_descriptor, offset, whence); #endif } auto xftello(std::FILE * stream) -> uint64_t { #ifdef _WIN32 return _ftelli64(stream); #else return ftello(stream); #endif } auto xopen_read(const char * path) -> int { #ifdef _WIN32 return _open(path, _O_RDONLY | _O_BINARY); #else return open(path, O_RDONLY); #endif } auto xopen_write(const char * path) -> int { #ifdef _WIN32 return _open(path, _O_WRONLY | _O_CREAT | _O_TRUNC | _O_BINARY, _S_IREAD | _S_IWRITE); #else return open(path, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); #endif } auto xstrcasestr(const char * haystack, const char * needle) -> const char * { #ifdef _WIN32 return StrStrIA(haystack, needle); #else return strcasestr(haystack, needle); #endif } #ifdef _WIN32 auto arch_dlsym(HMODULE handle, const char * symbol) -> FARPROC #else auto arch_dlsym(void * handle, const char * symbol) -> void * #endif { #ifdef _WIN32 return GetProcAddress(handle, symbol); #else return dlsym(handle, symbol); #endif } vsearch-2.30.0/src/arch.h000066400000000000000000000067741476012147200151260ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // uint64_t #include // std::FILE, std::size_t #ifdef _WIN32 using xstat_t = struct __stat64; #else using xstat_t = struct stat; #endif auto arch_get_memused() -> uint64_t; auto arch_get_memtotal() -> uint64_t; auto arch_get_cores() -> long; auto arch_get_user_system_time(double * user_time, double * system_time) -> void; auto arch_srandom() -> void; auto arch_random() -> uint64_t; auto xmalloc(std::size_t size) -> void *; auto xrealloc(void * ptr, std::size_t size) -> void *; auto xfree(void * ptr) -> void; auto xfstat(int file_descriptor, xstat_t * buf) -> int; auto xstat(const char * path, xstat_t * buf) -> int; auto xlseek(int file_descriptor, uint64_t offset, int whence) -> uint64_t; auto xftello(std::FILE * stream) -> uint64_t; auto xopen_read(const char * path) -> int; auto xopen_write(const char * path) -> int; auto xstrcasestr(const char * haystack, const char * needle) -> const char *; #ifdef _WIN32 auto arch_dlsym(HMODULE handle, const char * symbol) -> FARPROC; #else auto arch_dlsym(void * handle, const char * symbol) -> void *; #endif vsearch-2.30.0/src/attributes.cc000066400000000000000000000201771476012147200165260ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include // std::swap #include // int64_t #include // std::FILE, std::fprintf #include // std::atol #include // std::strlen, std::strstr, std::strspn auto header_find_attribute(const char * header, int header_length, const char * attribute, int * start, int * end, bool allow_decimal) -> bool { /* Identify the first occurence of the pattern (^|;)size=([0-9]+)(;|$) in the header string, where "size=" is the specified attribute. If allow_decimal is true, a dot (.) is allowed within the digits. */ const char * digit_chars = "0123456789"; const char * digit_chars_decimal = "0123456789."; if ((not header) or (not attribute)) { return false; } int const hlen = header_length; int const alen = strlen(attribute); int i = 0; while (i < hlen - alen) { char * r = (char *) strstr(header + i, attribute); /* no match */ if (r == nullptr) { break; } i = r - header; /* check for ';' in front */ if ((i > 0) and (header[i - 1] != ';')) { i += alen + 1; continue; } int const digits = (int) strspn(header + i + alen, (allow_decimal ? 
digit_chars_decimal : digit_chars)); /* check for at least one digit */ if (digits == 0) { i += alen + 1; continue; } /* check for ';' after */ if ((i + alen + digits < hlen) and (header[i + alen + digits] != ';')) { i += alen + digits + 2; continue; } /* ok */ * start = i; * end = i + alen + digits; return true; } return false; } auto header_get_size(char * header, int header_length) -> int64_t { /* read size/abundance annotation */ int64_t abundance = 0; int start = 0; int end = 0; if (header_find_attribute(header, header_length, "size=", &start, &end, false)) { int64_t const number = atol(header + start + 5); if (number > 0) { abundance = number; } else { fatal("Invalid (zero) abundance annotation in FASTA file header"); } } return abundance; } auto header_fprint_strip(FILE * output_handle, char * header, int header_length, bool strip_size, bool strip_ee, bool strip_length) -> void { int attributes = 0; int attribute_start[3]; int attribute_end[3]; /* look for size attribute */ int size_start = 0; int size_end = 0; bool size_found = false; if (strip_size) { size_found = header_find_attribute(header, header_length, "size=", & size_start, & size_end, false); } if (size_found) { attribute_start[attributes] = size_start; attribute_end[attributes] = size_end; ++attributes; } /* look for ee attribute */ int ee_start = 0; int ee_end = 0; bool ee_found = false; if (strip_ee) { ee_found = header_find_attribute(header, header_length, "ee=", & ee_start, & ee_end, true); } if (ee_found) { attribute_start[attributes] = ee_start; attribute_end[attributes] = ee_end; ++attributes; } /* look for length attribute */ int length_start = 0; int length_end = 0; bool length_found = false; if (strip_length) { length_found = header_find_attribute(header, header_length, "length=", &length_start, &length_end, true); } if (length_found) { attribute_start[attributes] = length_start; attribute_end[attributes] = length_end; ++attributes; } /* sort */ int last_swap = 0; int limit = attributes 
- 1; while (limit > 0) { for(int i = 0; i < limit; i++) { if (attribute_start[i] > attribute_start[i + 1]) { std::swap(attribute_start[i], attribute_start[i + 1]); std::swap(attribute_end[i], attribute_end[i + 1]); last_swap = i; } } limit = last_swap; } /* print */ if (attributes == 0) { fprintf(output_handle, "%.*s", header_length, header); } else { int prev_end = 0; for (int i = 0; i < attributes; i++) { /* print part of header in front of this attribute */ if (attribute_start[i] > prev_end + 1) { fprintf(output_handle, "%.*s", attribute_start[i] - prev_end - 1, header + prev_end); } prev_end = attribute_end[i]; } /* print the rest, if any */ if (header_length > prev_end + 1) { fprintf(output_handle, "%.*s", header_length - prev_end, header + prev_end); } } } vsearch-2.30.0/src/attributes.h000066400000000000000000000055011476012147200163620ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // int64_t #include // std::FILE auto header_get_size(char * header, int header_length) -> int64_t; auto header_fprint_strip(std::FILE * output_handle, char * header, int header_length, bool strip_size, bool strip_ee, bool strip_length) -> void; vsearch-2.30.0/src/bitmap.cc000066400000000000000000000071251476012147200156120ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "bitmap.h" #include // std::memset auto bitmap_init(unsigned int const size) -> struct bitmap_s * { auto * b = (struct bitmap_s *) xmalloc(sizeof(struct bitmap_s)); b->size = size; b->bitmap = (unsigned char *) xmalloc((size + 7) / 8); return b; } auto bitmap_get(struct bitmap_s * a_bitmap, unsigned int const seed_value) -> unsigned char { constexpr auto mask_111 = 7U; constexpr auto divider = 3U; // divide by 8 return (a_bitmap->bitmap[seed_value >> divider] >> (seed_value & mask_111)) & 1U; } auto bitmap_reset_all(struct bitmap_s * a_bitmap) -> void { constexpr auto n_bits_in_a_byte = 8U; const auto size_in_bytes = (a_bitmap->size + n_bits_in_a_byte - 1) / n_bits_in_a_byte; std::memset(a_bitmap->bitmap, 0, size_in_bytes); } auto bitmap_set(struct bitmap_s * a_bitmap, unsigned int const seed_value) -> void { constexpr auto mask_111 = 7U; constexpr auto divider = 3U; // divide by 8 a_bitmap->bitmap[seed_value >> divider] |= 1U << (seed_value & mask_111); } auto bitmap_free(struct bitmap_s * a_bitmap) -> void { if (a_bitmap->bitmap) { xfree(a_bitmap->bitmap); } xfree(a_bitmap); } vsearch-2.30.0/src/bitmap.h000066400000000000000000000056561476012147200154630ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BITMAP_H #define BITMAP_H struct bitmap_s { unsigned char * bitmap; /* the actual bitmap */ unsigned int size; /* size in bits */ }; auto bitmap_init(unsigned int size) -> struct bitmap_s *; auto bitmap_get(struct bitmap_s * a_bitmap, unsigned int seed_value) -> unsigned char; auto bitmap_reset_all(struct bitmap_s * a_bitmap) -> void; auto bitmap_set(struct bitmap_s * a_bitmap, unsigned int seed_value) -> void; auto bitmap_free(struct bitmap_s * a_bitmap) -> void; #endif // BITMAP_H vsearch-2.30.0/src/chimera.cc000066400000000000000000002152171476012147200157510ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "align_simd.h" #include "attributes.h" #include "chimera.h" #include "dbindex.h" #include "maps.h" #include "mask.h" #include "minheap.h" #include "udb.h" #include "unique.h" #include // std::max #include // std::tolower #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::qsort #include // std::FILE, std::fprintf, std::sscanf #include // std::strlen, std::strncpy, std::strcpy #include #include #include /* This code implements the method described in this paper: Robert C. Edgar, Brian J. Haas, Jose C. Clemente, Christopher Quince and Rob Knight (2011) UCHIME improves sensitivity and speed of chimera detection Bioinformatics, 27, 16, 2194-2200 https://doi.org/10.1093/bioinformatics/btr381 */ /* global constants/data, no need for synchronization */ static int parts = 0; const int maxparts = 100; const int window = 32; const int few = 4; const int maxcandidates = few * maxparts; const int rejects = 16; const double chimera_id = 0.55; static int tophits; static pthread_attr_t attr; static pthread_t * pthread; static fastx_handle query_fasta_h; /* mutexes and global data protected by mutex */ static pthread_mutex_t mutex_input; static pthread_mutex_t mutex_output; static unsigned int seqno = 0; static uint64_t progress = 0; static int chimera_count = 0; static int nonchimera_count = 0; static int borderline_count = 0; static int total_count = 0; static int64_t chimera_abundance = 0; static int64_t nonchimera_abundance = 0; static int64_t borderline_abundance = 0; static int64_t total_abundance = 0; static FILE * fp_chimeras = nullptr; static FILE * fp_nonchimeras = nullptr; static FILE * fp_uchimealns = nullptr; static FILE * fp_uchimeout = nullptr; static FILE * fp_borderline = nullptr; /* information for each query sequence to be checked */ struct chimera_info_s { int query_alloc; /* the longest query sequence allocated memory for */ int head_alloc; /* the longest header allocated memory for */ int 
query_no; char * query_head; int query_head_len; int query_size; char * query_seq; int query_len; struct searchinfo_s si[maxparts]; unsigned int cand_list[maxcandidates]; int cand_count; struct s16info_s * s; CELL snwscore[maxcandidates]; unsigned short snwalignmentlength[maxcandidates]; unsigned short snwmatches[maxcandidates]; unsigned short snwmismatches[maxcandidates]; unsigned short snwgaps[maxcandidates]; int64_t nwscore[maxcandidates]; int64_t nwalignmentlength[maxcandidates]; int64_t nwmatches[maxcandidates]; int64_t nwmismatches[maxcandidates]; int64_t nwgaps[maxcandidates]; char * nwcigar[maxcandidates]; int match_size; int * match; int * insert; int * smooth; int * maxsmooth; double * scan_p; double * scan_q; int parents_found; int best_parents[maxparents]; int best_start[maxparents]; int best_len[maxparents]; int best_target; char * best_cigar; int * maxi; char * paln[maxparents]; char * qaln; char * diffs; char * votes; char * model; char * ignore; struct hit * all_hits; double best_h; }; static struct chimera_info_s * cia; auto realloc_arrays(struct chimera_info_s * ci) -> void { if (opt_chimeras_denovo) { if (opt_chimeras_parts == 0) { parts = (ci->query_len + maxparts - 1) / maxparts; } else { parts = opt_chimeras_parts; } if (parts < 2) { parts = 2; } else if (parts > maxparts) { parts = maxparts; } } else { /* default for uchime, uchime2, and uchime3 */ parts = 4; } const int maxhlen = MAX(ci->query_head_len, 1); if (maxhlen > ci->head_alloc) { ci->head_alloc = maxhlen; ci->query_head = (char *) xrealloc(ci->query_head, maxhlen + 1); } /* realloc arrays based on query length */ const int maxqlen = MAX(ci->query_len, 1); const int maxpartlen = (maxqlen + parts - 1) / parts; if (maxqlen > ci->query_alloc) { ci->query_alloc = maxqlen; ci->query_seq = (char *) xrealloc(ci->query_seq, maxqlen + 1); for (auto & i: ci->si) { i.qsequence = (char *) xrealloc(i.qsequence, maxpartlen + 1); } ci->maxi = (int *) xrealloc(ci->maxi, (maxqlen + 1) * sizeof(int)); 
ci->maxsmooth = (int *) xrealloc(ci->maxsmooth, maxqlen * sizeof(int)); ci->match = (int *) xrealloc(ci->match, maxcandidates * maxqlen * sizeof(int)); ci->insert = (int *) xrealloc(ci->insert, maxcandidates * maxqlen * sizeof(int)); ci->smooth = (int *) xrealloc(ci->smooth, maxcandidates * maxqlen * sizeof(int)); ci->scan_p = (double *) xrealloc(ci->scan_p, (maxqlen + 1) * sizeof(double)); ci->scan_q = (double *) xrealloc(ci->scan_q, (maxqlen + 1) * sizeof(double)); const int maxalnlen = maxqlen + (2 * db_getlongestsequence()); for (int f = 0; f < maxparents ; f++) { ci->paln[f] = (char *) xrealloc(ci->paln[f], maxalnlen + 1); } ci->qaln = (char *) xrealloc(ci->qaln, maxalnlen + 1); ci->diffs = (char *) xrealloc(ci->diffs, maxalnlen + 1); ci->votes = (char *) xrealloc(ci->votes, maxalnlen + 1); ci->model = (char *) xrealloc(ci->model, maxalnlen + 1); ci->ignore = (char *) xrealloc(ci->ignore, maxalnlen + 1); } } auto find_matches(struct chimera_info_s * ci) -> void { /* find the positions with matches for each potential parent */ /* also note the positions with inserts in front */ char * qseq = ci->query_seq; for (int i = 0; i < ci->cand_count; i++) { for (int j = 0; j < ci->query_len; j++) { int const x = (i * ci->query_len) + j; ci->match[x] = 0; ci->insert[x] = 0; } } for (int i = 0; i < ci->cand_count; i++) { char * tseq = db_getsequence(ci->cand_list[i]); int qpos = 0; int tpos = 0; char * p = ci->nwcigar[i]; char * e = p + strlen(p); while (p < e) { int run = 1; int scanlength = 0; sscanf(p, "%d%n", &run, &scanlength); p += scanlength; char const op = *p++; switch (op) { case 'M': for (int k = 0; k < run; k++) { if (chrmap_4bit[(int) (qseq[qpos])] & chrmap_4bit[(int) (tseq[tpos])]) { ci->match[(i * ci->query_len) + qpos] = 1; } ++qpos; ++tpos; } break; case 'I': ci->insert[(i * ci->query_len) + qpos] = run; tpos += run; break; case 'D': qpos += run; break; } } } } struct parents_info_s { int cand; int start; int len; }; auto compare_positions(const void * a, 
const void * b) -> int { const int x = ((const parents_info_s *) a)->start; const int y = ((const parents_info_s *) b)->start; if (x < y) { return -1; } else if (x > y) { return +1; } else { return 0; } } auto scan_matches(struct chimera_info_s * ci, int * matches, int len, double percentage, int * best_start, int * best_len) -> bool { /* Scan matches array of zeros and ones, and find the longest subsequence having a match fraction above or equal to the given percentage (e.g. 2%). Based on an idea of finding the longest positive sum substring: https://stackoverflow.com/questions/28356453/longest-positive-sum-substring If the percentage is 2%, matches are given a score of 2 and mismatches -98. */ double const score_match = percentage; double const score_mismatch = percentage - 100.0; double * p = ci->scan_p; double * q = ci->scan_q; p[0] = 0.0; for (int i = 0; i < len; i++) { p[i + 1] = p[i] + (matches[i] ? score_match : score_mismatch); } q[len] = p[len]; for (int i = len - 1; i >= 0; i--) { q[i] = MAX(q[i + 1], p[i]); } int best_i = 0; int best_d = -1; double best_c = -1.0; int i = 1; int j = 1; while (j <= len) { double const c = q[j] - p[i - 1]; if (c >= 0.0) { int const d = j - i + 1; if (d > best_d) { best_i = i; best_d = d; best_c = c; } j += 1; } else { i += 1; } } if (best_c >= 0.0) { * best_start = best_i - 1; * best_len = best_d; return true; } else { return false; } } auto find_best_parents_long(struct chimera_info_s * ci) -> int { /* Find parents with longest matching regions, without indels, allowing a given percentage of mismatches (specified with --chimeras_diff_pct), and excluding regions matched by previously identified parents. 
*/ find_matches(ci); struct parents_info_s best_parents[maxparents]; for (int f = 0; f < maxparents; f++) { best_parents[f].cand = -1; best_parents[f].start = -1; } std::vector position_used(ci->query_len, false); int pos_remaining = ci->query_len; int parents_found = 0; for (int f = 0; f < opt_chimeras_parents_max; f++) { /* scan each candidate and find longest matching region */ int best_start = 0; int best_len = 0; int best_cand = -1; for (int i = 0; i < ci->cand_count; i++) { int start = 0; int len = 0; int j = 0; while (j < ci->query_len) { start = j; len = 0; while ((j < ci->query_len) && (not position_used[j]) && ((len == 0) or (ci->insert[(i * ci->query_len) + j] == 0))) { ++len; ++j; } if (len > best_len) { int scan_best_start = 0; int scan_best_len = 0; if (scan_matches(ci, ci->match + (i * ci->query_len) + start, len, opt_chimeras_diff_pct, & scan_best_start, & scan_best_len)) { if (scan_best_len > best_len) { best_cand = i; best_start = start + scan_best_start; best_len = scan_best_len; } } } ++j; } } if (best_len >= opt_chimeras_length_min) { best_parents[f].cand = best_cand; best_parents[f].start = best_start; best_parents[f].len = best_len; ++parents_found; #if 0 if (f == 0) printf("\n"); printf("Best parents long: %d %d %d %d %s %s\n", f, best_cand, best_start, best_len, ci->query_head, db_getheader(ci->cand_list[best_cand])); #endif /* mark positions used */ for (int j = best_start; j < best_start + best_len; j++) { position_used[j] = true; } pos_remaining -= best_len; } else { break; } } /* sort parents by position */ qsort(best_parents, parents_found, sizeof(struct parents_info_s), compare_positions); ci->parents_found = parents_found; for (int f = 0; f < parents_found; f++) { ci->best_parents[f] = best_parents[f].cand; ci->best_start[f] = best_parents[f].start; ci->best_len[f] = best_parents[f].len; } #if 0 if (pos_remaining == 0) printf("Fully covered!\n"); else printf("Not covered completely (%d).\n", pos_remaining); #endif return 
(parents_found > 1) and (pos_remaining == 0); } auto find_best_parents(struct chimera_info_s * ci) -> int { find_matches(ci); int best_parent_cand[maxparents]; for (int f = 0; f < 2; f++) { best_parent_cand[f] = -1; ci->best_parents[f] = -1; } std::vector cand_selected(ci->cand_count, false); for (int f = 0; f < 2; f++) { if (f > 0) { /* for all parents except the first */ /* wipe out matches for all candidates in positions covered by the previous parent */ for (int qpos = window - 1; qpos < ci->query_len; qpos++) { int const z = (best_parent_cand[f - 1] * ci->query_len) + qpos; if (ci->smooth[z] == ci->maxsmooth[qpos]) { for (int i = qpos + 1 - window; i <= qpos; i++) { for (int j = 0; j < ci->cand_count; j++) { ci->match[(j * ci->query_len) + i] = 0; } } } } } /* Compute smoothed score in a 32bp window for each candidate. */ /* Record max smoothed score for each position among candidates left. */ for (int j = 0; j < ci->query_len; j++) { ci->maxsmooth[j] = 0; } for (int i = 0; i < ci->cand_count; i++) { if (not cand_selected[i]) { int sum = 0; for (int qpos = 0; qpos < ci->query_len; qpos++) { int const z = (i * ci->query_len) + qpos; sum += ci->match[z]; if (qpos >= window) { sum -= ci->match[z - window]; } if (qpos >= window - 1) { ci->smooth[z] = sum; ci->maxsmooth[qpos] = std::max(ci->smooth[z], ci->maxsmooth[qpos]); } } } } /* find parent with the most wins */ std::vector wins(ci->cand_count, 0); for (int qpos = window - 1; qpos < ci->query_len; qpos++) { if (ci->maxsmooth[qpos] != 0) { for (int i = 0; i < ci->cand_count; i++) { if (not cand_selected[i]) { int const z = (i * ci->query_len) + qpos; if (ci->smooth[z] == ci->maxsmooth[qpos]) { wins[i]++; } } } } } /* select best parent based on most wins */ int maxwins = 0; for (int i = 0; i < ci->cand_count; i++) { int const w = wins[i]; if (w > maxwins) { maxwins = w; best_parent_cand[f] = i; } } /* terminate loop if no parent found */ if (best_parent_cand[f] < 0) { break; } #if 0 printf("Query %d: Best 
parent (%d) candidate: %d. Wins: %d\n", ci->query_no, f, best_parent_cand[f], maxwins); #endif ci->best_parents[f] = best_parent_cand[f]; cand_selected[best_parent_cand[f]] = true; } /* Check if at least 2 candidates selected */ return (best_parent_cand[0] >= 0) and (best_parent_cand[1] >= 0); } auto find_max_alignment_length(struct chimera_info_s * ci) -> int { /* find max insertions in front of each position in the query sequence */ for (int i = 0; i <= ci->query_len; i++) { ci->maxi[i] = 0; } for (int f = 0; f < ci->parents_found; f++) { int const best_parent = ci->best_parents[f]; char * p = ci->nwcigar[best_parent]; char * e = p + strlen(p); int pos = 0; while (p < e) { int run = 1; int scanlength = 0; sscanf(p, "%d%n", &run, &scanlength); p += scanlength; char const op = *p++; switch (op) { case 'M': case 'D': pos += run; break; case 'I': ci->maxi[pos] = std::max(run, ci->maxi[pos]); break; } } } /* find total alignment length */ int alnlen = 0; for (int i = 0; i < ci->query_len + 1; i++) { alnlen += ci->maxi[i]; } alnlen += ci->query_len; return alnlen; } auto fill_alignment_parents(struct chimera_info_s * ci) -> void { /* fill in alignment strings for the parents */ for (int j = 0; j < ci->parents_found; j++) { int const cand = ci->best_parents[j]; int const target_seqno = ci->cand_list[cand]; char * target_seq = db_getsequence(target_seqno); int inserted = 0; int qpos = 0; int tpos = 0; char * t = ci->paln[j]; char * p = ci->nwcigar[cand]; char * e = p + strlen(p); while (p < e) { int run = 1; int scanlength = 0; sscanf(p, "%d%n", &run, &scanlength); p += scanlength; char const op = *p++; if (op == 'I') { for (int x = 0; x < ci->maxi[qpos]; x++) { if (x < run) { *t++ = chrmap_upcase[(int) (target_seq[tpos++])]; } else { *t++ = '-'; } } inserted = 1; } else { for (int x = 0; x < run; x++) { if (not inserted) { for (int y = 0; y < ci->maxi[qpos]; y++) { *t++ = '-'; } } if (op == 'M') { *t++ = chrmap_upcase[(int) (target_seq[tpos++])]; } else { *t++ = '-'; } 
++qpos; inserted = 0; } } } /* add any gaps at the end */ if (not inserted) { for (int x = 0; x < ci->maxi[qpos]; x++) { *t++ = '-'; } } /* end of sequence string */ *t = 0; } } auto eval_parents_long(struct chimera_info_s * ci) -> int { /* always chimeric if called */ int const status = 4; int const alnlen = find_max_alignment_length(ci); fill_alignment_parents(ci); /* fill in alignment string for query */ char * pm = ci->model; int m = 0; char * q = ci->qaln; int qpos = 0; for (int i = 0; i < ci->query_len; i++) { if (qpos >= (ci->best_start[m] + ci->best_len[m])) { ++m; } for (int j = 0; j < ci->maxi[i]; j++) { *q++ = '-'; *pm++ = 'A' + m; } *q++ = chrmap_upcase[(int)(ci->query_seq[qpos++])]; *pm++ = 'A' + m; } for (int j = 0; j < ci->maxi[ci->query_len]; j++) { *q++ = '-'; *pm++ = 'A' + m; } *q = 0; *pm = 0; for (int i = 0; i < alnlen; i++) { unsigned int const qsym = chrmap_4bit[(int) (ci->qaln[i])]; unsigned int psym[maxparents]; for (int f = 0; f < maxparents; f++) { psym[f] = 0; } for (int f = 0; f < ci->parents_found; f++) { psym[f] = chrmap_4bit[(int) (ci->paln[f][i])]; } /* lower case parent symbols that differ from query */ for (int f = 0; f < ci->parents_found; f++) { if (psym[f] and (psym[f] != qsym)) { ci->paln[f][i] = tolower(ci->paln[f][i]); } } /* compute diffs */ char diff = ' '; bool all_defined = qsym; for (int f = 0; f < ci->parents_found; f++) { if (not psym[f]) { all_defined = false; } } if (all_defined) { int z = 0; for (int f = 0; f < ci->parents_found; f++) { if (psym[f] == qsym) { diff = 'A' + f; ++z; } } if (z > 1) { diff = ' '; } } ci->diffs[i] = diff; } ci->diffs[alnlen] = 0; /* count matches */ int match_QP[maxparents]; int cols = 0; for (int f = 0; f < ci->parents_found; f++) { match_QP[f] = 0; } for (int i = 0; i < alnlen; i++) { ++cols; char const qsym = chrmap_4bit[(int) (ci->qaln[i])]; for (int f = 0; f < ci->parents_found; f++) { char const psym = chrmap_4bit[(int) (ci->paln[f][i])]; if (qsym == psym) { match_QP[f]++; } } } int 
const seqno_a = ci->cand_list[ci->best_parents[0]]; int const seqno_b = ci->cand_list[ci->best_parents[1]]; int seqno_c = -1; if (ci->parents_found > 2) { seqno_c = ci->cand_list[ci->best_parents[2]]; } double QP[maxparents]; double QT = 0.0; for (int f = 0; f < maxparents; f++) { if (f < ci->parents_found) { QP[f] = 100.0 * match_QP[f] / cols; } else { QP[f] = 0.0; } QT = std::max(QP[f], QT); } double const QA = QP[0]; double const QB = QP[1]; double const QC = ci->parents_found > 2 ? QP[2] : 0.00; double const QM = 100.00; double const divfrac = 100.00 * (QM - QT) / QT; xpthread_mutex_lock(&mutex_output); if (opt_alnout and (status == 4)) { fprintf(fp_uchimealns, "\n"); fprintf(fp_uchimealns, "----------------------------------------" "--------------------------------\n"); fprintf(fp_uchimealns, "Query (%5d nt) ", ci->query_len); header_fprint_strip(fp_uchimealns, ci->query_head, ci->query_head_len, opt_xsize, opt_xee, opt_xlength); for (int f = 0; f < ci->parents_found; f++) { int const seqno = ci->cand_list[ci->best_parents[f]]; fprintf(fp_uchimealns, "\nParent%c (%5" PRIu64 " nt) ", 'A' + f, db_getsequencelen(seqno)); header_fprint_strip(fp_uchimealns, db_getheader(seqno), db_getheaderlen(seqno), opt_xsize, opt_xee, opt_xlength); } fprintf(fp_uchimealns, "\n\n"); int const width = opt_alignwidth > 0 ? 
opt_alignwidth : alnlen; qpos = 0; int ppos[maxparents]; for (int f = 0; f < ci->parents_found; f++) { ppos[f] = 0; } int rest = alnlen; for (int i = 0; i < alnlen; i += width) { /* count non-gap symbols on current line */ int qnt = 0; int pnt[maxparents]; for (int f = 0; f < ci->parents_found; f++) { pnt[f] = 0; } int const w = MIN(rest, width); for (int j = 0; j < w; j++) { if (ci->qaln[i + j] != '-') { ++qnt; } for (int f = 0; f < ci->parents_found; f++) { if (ci->paln[f][i + j] != '-') { pnt[f]++; } } } fprintf(fp_uchimealns, "Q %5d %.*s %d\n", qpos + 1, w, ci->qaln + i, qpos + qnt); for (int f = 0; f < ci->parents_found; f++) { fprintf(fp_uchimealns, "%c %5d %.*s %d\n", 'A' + f, ppos[f] + 1, w, ci->paln[f] + i, ppos[f] + pnt[f]); } fprintf(fp_uchimealns, "Diffs %.*s\n", w, ci->diffs + i); fprintf(fp_uchimealns, "Model %.*s\n", w, ci->model + i); fprintf(fp_uchimealns, "\n"); rest -= width; qpos += qnt; for (int f = 0; f < ci->parents_found; f++) { ppos[f] += pnt[f]; } } fprintf(fp_uchimealns, "Ids. QA %.2f%%, QB %.2f%%, QC %.2f%%, " "QT %.2f%%, QModel %.2f%%, Div. 
%+.2f%%\n", QA, QB, QC, QT, QM, divfrac); } if (opt_tabbedout) { fprintf(fp_uchimeout, "%.4f\t", 99.9999); header_fprint_strip(fp_uchimeout, ci->query_head, ci->query_head_len, opt_xsize, opt_xee, opt_xlength); fprintf(fp_uchimeout, "\t"); header_fprint_strip(fp_uchimeout, db_getheader(seqno_a), db_getheaderlen(seqno_a), opt_xsize, opt_xee, opt_xlength); fprintf(fp_uchimeout, "\t"); header_fprint_strip(fp_uchimeout, db_getheader(seqno_b), db_getheaderlen(seqno_b), opt_xsize, opt_xee, opt_xlength); fprintf(fp_uchimeout, "\t"); if (seqno_c >= 0) { header_fprint_strip(fp_uchimeout, db_getheader(seqno_c), db_getheaderlen(seqno_c), opt_xsize, opt_xee, opt_xlength); } else { fprintf(fp_uchimeout, "*"); } fprintf(fp_uchimeout, "\t"); fprintf(fp_uchimeout, "%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t" "%d\t%d\t%d\t%d\t%d\t%d\t%.2f\t%c\n", QM, QA, QB, QC, QT, 0, /* ignore, left yes */ 0, /* ignore, left no */ 0, /* ignore, left abstain */ 0, /* ignore, right yes */ 0, /* ignore, right no */ 0, /* ignore, right abstain */ 0.00, status == 4 ? 'Y' : (status == 2 ? 
'N' : '?')); } xpthread_mutex_unlock(&mutex_output); return status; } auto eval_parents(struct chimera_info_s * ci) -> int { int status = 1; ci->parents_found = 2; int const alnlen = find_max_alignment_length(ci); fill_alignment_parents(ci); /* fill in alignment string for query */ char * q = ci->qaln; int qpos = 0; for (int i = 0; i < ci->query_len; i++) { for (int j=0; j < ci->maxi[i]; j++) { *q++ = '-'; } *q++ = chrmap_upcase[(int) (ci->query_seq[qpos++])]; } for (int j = 0; j < ci->maxi[ci->query_len]; j++) { *q++ = '-'; } *q = 0; /* mark positions to ignore in voting */ for (int i = 0; i < alnlen; i++) { ci->ignore[i] = 0; } for (int i = 0; i < alnlen; i++) { unsigned int const qsym = chrmap_4bit[(int) (ci->qaln [i])]; unsigned int const p1sym = chrmap_4bit[(int) (ci->paln[0][i])]; unsigned int const p2sym = chrmap_4bit[(int) (ci->paln[1][i])]; /* ignore gap positions and those next to the gap */ if ((not qsym) or (not p1sym) or (not p2sym)) { ci->ignore[i] = 1; if (i > 0) { ci->ignore[i - 1] = 1; } if (i < alnlen - 1) { ci->ignore[i + 1] = 1; } } /* ignore ambiguous symbols */ if ((ambiguous_4bit[qsym]) or (ambiguous_4bit[p1sym]) or (ambiguous_4bit[p2sym])) { ci->ignore[i] = 1; } /* lower case parent symbols that differ from query */ if (p1sym and (p1sym != qsym)) { ci->paln[0][i] = tolower(ci->paln[0][i]); } if (p2sym and (p2sym != qsym)) { ci->paln[1][i] = tolower(ci->paln[1][i]); } /* compute diffs */ char diff = '\0'; if (qsym and p1sym and p2sym) { if (p1sym == p2sym) { if (qsym == p1sym) { diff = ' '; } else { diff = 'N'; } } else { if (qsym == p1sym) { diff = 'A'; } else if (qsym == p2sym) { diff = 'B'; } else { diff = '?'; } } } else { diff = ' '; } ci->diffs[i] = diff; } ci->diffs[alnlen] = 0; /* compute score */ int sumA = 0; int sumB = 0; int sumN = 0; for (int i = 0; i < alnlen; i++) { if (not ci->ignore[i]) { char const diff = ci->diffs[i]; if (diff == 'A') { ++sumA; } else if (diff == 'B') { ++sumB; } else if (diff != ' ') { ++sumN; } } } int 
/* eval_parents(), continued. The declaration begun at the end of the
   previous line is completed here ("int left_n = 0;").

   left_* count the votes seen up to and including the current column,
   right_* the votes still to the right of it. y = yes, n = no,
   a = abstain, relative to the model "parent A on the left, parent B
   on the right". */
left_n = 0;
int left_a = 0;
int left_y = 0;
/* before the scan all votes are on the right side, where a 'B' diff
   is a yes and an 'A' diff is a no */
int right_n = sumA;
int right_a = sumN;
int right_y = sumB;

/* best score found so far, the column it was found at, and the vote
   counts at that column; best_h stays -1 if no column qualifies */
double best_h = -1;
int best_i = -1;
int best_reverse = 0;
int best_left_y = 0;
int best_right_y = 0;
int best_left_n = 0;
int best_right_n = 0;
int best_left_a = 0;
int best_right_a = 0;

/* try a breakpoint after every informative (non-ignored, non-blank)
   column */
for (int i = 0; i < alnlen; i++) {
  if (not ci->ignore[i]) {
    char const diff = ci->diffs[i];
    if (diff != ' ') {
      /* move this column's vote from the right side to the left side */
      if (diff == 'A') { ++left_y; --right_n; }
      else if (diff == 'B') { ++left_n; --right_y; }
      else { ++left_a; --right_a; }

      double left_h = 0;
      double right_h = 0;
      double h = 0;

      /* each side scores Y / (xn * (N + dn) + A); the column score is
         the product of both sides */
      if ((left_y > left_n) and (right_y > right_n)) {
        left_h = left_y / (opt_xn * (left_n + opt_dn) + left_a);
        right_h = right_y / (opt_xn * (right_n + opt_dn) + right_a);
        h = left_h * right_h;
        if (h > best_h) {
          best_reverse = 0;
          best_h = h;
          best_i = i;
          best_left_n = left_n;
          best_left_y = left_y;
          best_left_a = left_a;
          best_right_n = right_n;
          best_right_y = right_y;
          best_right_a = right_a;
        }
      }
      else if ((left_n > left_y) and (right_n > right_y)) {
        /* swap left/right and yes/no */
        left_h = left_n / (opt_xn * (left_y + opt_dn) + left_a);
        right_h = right_n / (opt_xn * (right_y + opt_dn) + right_a);
        h = left_h * right_h;
        if (h > best_h) {
          best_reverse = 1;
          best_h = h;
          best_i = i;
          best_left_n = left_y;
          best_left_y = left_n;
          best_left_a = left_a;
          best_right_n = right_y;
          best_right_y = right_n;
          best_right_a = right_a;
        }
      }
    }
  }
}

/* the score stored for the caller is never negative */
ci->best_h = best_h > 0 ? best_h : 0.0;

/* everything below runs only when some breakpoint qualified;
   otherwise status keeps its initial value */
if (best_h >= 0.0) {
  status = 2;

  /* flip A and B if necessary */
  if (best_reverse) {
    for (int i = 0; i < alnlen; i++) {
      char const diff = ci->diffs[i];
      if (diff == 'A') { ci->diffs[i] = 'B'; }
      else if (diff == 'B') { ci->diffs[i] = 'A'; }
    }
  }

  /* fill in votes and model */
  for (int i = 0; i < alnlen; i++) {
    /* model: parent A up to and including best_i, parent B after */
    char const m = i <= best_i ? 'A' : 'B';
    ci->model[i] = m;

    /* vote symbol: '+' agrees with the model, '!' contradicts it,
       '0' abstains ('N' or '?'), ' ' for ignored or blank columns */
    char v = ' ';
    if (not ci->ignore[i]) {
      char const d = ci->diffs[i];
      if ((d == 'A') or (d == 'B')) {
        if (d == m) { v = '+'; } else { v = '!'; }
      }
      else if ((d == 'N') or (d == '?')) { v = '0'; }
    }
    ci->votes[i] = v;

    /* lower case diffs for no votes */
    if (v == '!') { ci->diffs[i] = tolower(ci->diffs[i]); }
  }

  /* fill in crossover region */
  for (int i = best_i + 1; i < alnlen; i++) {
    if ((ci->diffs[i] == ' ') or (ci->diffs[i] == 'A')) { ci->model[i] = 'x'; }
    else { break; }
  }

  ci->votes[alnlen] = 0;
  ci->model[alnlen] = 0;

  /* count matches */
  /* index_a/index_b select which stored parent plays the role of A
     and B after a possible reversal */
  int const index_a = best_reverse ? 1 : 0;
  int const index_b = best_reverse ? 0 : 1;
  int match_QA = 0;
  int match_QB = 0;
  int match_AB = 0;
  int match_QM = 0;
  int cols = 0;
  for (int i = 0; i < alnlen; i++) {
    if (not ci->ignore[i]) {
      ++cols;
      char const qsym = chrmap_4bit[(int) (ci->qaln[i])];
      char const asym = chrmap_4bit[(int) (ci->paln[index_a][i])];
      char const bsym = chrmap_4bit[(int) (ci->paln[index_b][i])];
      /* symbol predicted by the model at this column */
      char const msym = (i <= best_i) ? asym : bsym;
      if (qsym == asym) { ++match_QA; }
      if (qsym == bsym) { ++match_QB; }
      if (asym == bsym) { ++match_AB; }
      if (qsym == msym) { ++match_QM; }
    }
  }

  int const seqno_a = ci->cand_list[ci->best_parents[index_a]];
  int const seqno_b = ci->cand_list[ci->best_parents[index_b]];

  /* percent identities over the non-ignored columns */
  /* NOTE(review): cols and QT are used as divisors without a zero
     check — presumably non-zero whenever best_h >= 0; confirm */
  double const QA = 100.0 * match_QA / cols;
  double const QB = 100.0 * match_QB / cols;
  double const AB = 100.0 * match_AB / cols;
  double const QT = MAX(QA, QB);
  double const QM = 100.0 * match_QM / cols;
  double const divdiff = QM - QT;
  double const divfrac = 100.0 * divdiff / QT;

  int const sumL = best_left_n + best_left_a + best_left_y;
  int const sumR = best_right_n + best_right_a + best_right_y;

  /* final classification: the uchime2/uchime3 de novo modes require a
     perfect model match that beats the closest parent; the other modes
     apply the minh, mindiv and mindiffs thresholds */
  if (opt_uchime2_denovo or opt_uchime3_denovo) {
    // fix -Wfloat-equal: if match_QM == cols, then QM == 100.0
    if ((match_QM == cols) and (QT < 100.0)) { status = 4; }
  }
  else if (best_h >= opt_minh) {
    status = 3;
    if ((divdiff >= opt_mindiv) and (sumL >= opt_mindiffs) and (sumR >= opt_mindiffs)) { status = 4; }
  }

  /* print alignment */
  xpthread_mutex_lock(&mutex_output);

  if (opt_uchimealns and (status == 4)) {
    fprintf(fp_uchimealns, "\n");
    fprintf(fp_uchimealns, "----------------------------------------" "--------------------------------\n");
    fprintf(fp_uchimealns, "Query (%5d nt) ", ci->query_len);
    header_fprint_strip(fp_uchimealns, ci->query_head, ci->query_head_len, opt_xsize, opt_xee, opt_xlength);
    fprintf(fp_uchimealns, "\nParentA (%5" PRIu64 " nt) ", db_getsequencelen(seqno_a));
    header_fprint_strip(fp_uchimealns, db_getheader(seqno_a), db_getheaderlen(seqno_a), opt_xsize, opt_xee, opt_xlength);
    fprintf(fp_uchimealns, "\nParentB (%5" PRIu64 " nt) ", db_getsequencelen(seqno_b));
    header_fprint_strip(fp_uchimealns, db_getheader(seqno_b), db_getheaderlen(seqno_b), opt_xsize, opt_xee, opt_xlength);
    fprintf(fp_uchimealns, "\n\n");

    /* an alignwidth of zero or less prints the alignment on one line */
    int const width = opt_alignwidth > 0 ? opt_alignwidth : alnlen;
    qpos = 0;
    int p1pos = 0;
    int p2pos = 0;
    int rest = alnlen;

    for (int i = 0; i < alnlen; i += width) {
      /* count non-gap symbols on current line */
      int qnt = 0;
      int p1nt = 0;
      int p2nt = 0;
      int const w = MIN(rest,width);
      for (int j = 0; j < w; j++) {
        if (ci->qaln[i + j] != '-') { ++qnt; }
        if (ci->paln[0][i + j] != '-') { ++p1nt; }
        if (ci->paln[1][i + j] != '-') { ++p2nt; }
      }

      /* row labels follow the model: after a reversal the second
         stored parent is printed as A and the first as B */
      if (not best_reverse) {
        fprintf(fp_uchimealns, "A %5d %.*s %d\n", p1pos + 1, w, ci->paln[0] + i, p1pos + p1nt);
        fprintf(fp_uchimealns, "Q %5d %.*s %d\n", qpos + 1, w, ci->qaln + i, qpos + qnt);
        fprintf(fp_uchimealns, "B %5d %.*s %d\n", p2pos + 1, w, ci->paln[1] + i, p2pos + p2nt);
      }
      else {
        fprintf(fp_uchimealns, "A %5d %.*s %d\n", p2pos + 1, w, ci->paln[1] + i, p2pos + p2nt);
        fprintf(fp_uchimealns, "Q %5d %.*s %d\n", qpos + 1, w, ci->qaln + i, qpos + qnt);
        fprintf(fp_uchimealns, "B %5d %.*s %d\n", p1pos + 1, w, ci->paln[0] + i, p1pos + p1nt);
      }

      fprintf(fp_uchimealns, "Diffs %.*s\n", w, ci->diffs + i);
      fprintf(fp_uchimealns, "Votes %.*s\n", w, ci->votes + i);
      fprintf(fp_uchimealns, "Model %.*s\n", w, ci->model + i);
      fprintf(fp_uchimealns, "\n");

      qpos += qnt;
      p1pos += p1nt;
      p2pos += p2nt;
      rest -= width;
    }

    fprintf(fp_uchimealns, "Ids. QA %.1f%%, QB %.1f%%, AB %.1f%%, " "QModel %.1f%%, Div. %+.1f%%\n", QA, QB, AB, QM, divfrac);
    fprintf(fp_uchimealns, "Diffs Left %d: N %d, A %d, Y %d (%.1f%%); " "Right %d: N %d, A %d, Y %d (%.1f%%), Score %.4f\n", sumL, best_left_n, best_left_a, best_left_y, 100.0 * best_left_y / sumL, sumR, best_right_n, best_right_a, best_right_y, 100.0 * best_right_y / sumR, best_h);
  }

  /* tab-separated record; written for every status reached here */
  if (opt_uchimeout) {
    fprintf(fp_uchimeout, "%.4f\t", best_h);
    header_fprint_strip(fp_uchimeout, ci->query_head, ci->query_head_len, opt_xsize, opt_xee, opt_xlength);
    fprintf(fp_uchimeout, "\t");
    header_fprint_strip(fp_uchimeout, db_getheader(seqno_a), db_getheaderlen(seqno_a), opt_xsize, opt_xee, opt_xlength);
    fprintf(fp_uchimeout, "\t");
    header_fprint_strip(fp_uchimeout, db_getheader(seqno_b), db_getheaderlen(seqno_b), opt_xsize, opt_xee, opt_xlength);
    fprintf(fp_uchimeout, "\t");

    /* unless opt_uchimeout5 is set, an extra column names the parent
       closest to the query */
    if (not opt_uchimeout5) {
      if (QA >= QB) {
        header_fprint_strip(fp_uchimeout, db_getheader(seqno_a), db_getheaderlen(seqno_a), opt_xsize, opt_xee, opt_xlength);
      }
      else {
        header_fprint_strip(fp_uchimeout, db_getheader(seqno_b), db_getheaderlen(seqno_b), opt_xsize, opt_xee, opt_xlength);
      }
      fprintf(fp_uchimeout, "\t");
    }

    fprintf(fp_uchimeout, "%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t" "%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%c\n", QM, QA, QB, AB, QT, best_left_y, best_left_n, best_left_a, best_right_y, best_right_n, best_right_a, divdiff, status == 4 ? 'Y' : (status == 2 ?
'N' : '?')); } xpthread_mutex_unlock(&mutex_output); } return status; } // refactoring: enum struct status {}; /* new chimeric status: 0: no parents, non-chimeric 1: score < 0 (no alignment), non-chimeric 2: score < minh, non-chimeric 3: score >= minh, suspicious -> not available with uchime2_denovo and uchime3_denovo 4: score >= minh && (divdiff >= opt_mindiv) && ..., chimeric */ auto query_init(struct searchinfo_s * si) -> void { si->qsequence = nullptr; si->kmers = nullptr; si->hits = (struct hit *) xmalloc(sizeof(struct hit) * tophits); si->kmers = (count_t *) xmalloc((db_getsequencecount() * sizeof(count_t)) + 32); si->hit_count = 0; si->uh = unique_init(); si->s = search16_init(opt_match, opt_mismatch, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); si->m = minheap_init(tophits); } auto query_exit(struct searchinfo_s * si) -> void { search16_exit(si->s); unique_exit(si->uh); minheap_exit(si->m); if (si->qsequence) { xfree(si->qsequence); si->qsequence = nullptr; } if (si->hits) { xfree(si->hits); si->hits = nullptr; } if (si->kmers) { xfree(si->kmers); si->kmers = nullptr; } } auto partition_query(struct chimera_info_s * ci) -> void { int rest = ci->query_len; char * p = ci->query_seq; for (int i = 0; i < parts; i++) { int const len = (rest + (parts - i - 1)) / (parts - i); struct searchinfo_s * si = ci->si + i; si->query_no = ci->query_no; si->strand = 0; si->qsize = ci->query_size; si->query_head_len = ci->query_head_len; si->query_head = ci->query_head; si->qseqlen = len; strncpy(si->qsequence, p, len); si->qsequence[len] = 0; rest -= len; p += len; } } auto chimera_thread_init(struct chimera_info_s * ci) -> void { ci->query_alloc = 0; ci->head_alloc = 
0; ci->query_head = nullptr; ci->query_seq = nullptr; ci->maxi = nullptr; ci->maxsmooth = nullptr; ci->match = nullptr; ci->insert = nullptr; ci->smooth = nullptr; ci->qaln = nullptr; ci->diffs = nullptr; ci->votes = nullptr; ci->model = nullptr; ci->ignore = nullptr; ci->scan_p = nullptr; ci->scan_q = nullptr; for (int f = 0; f < maxparents; f++) { ci->paln[f] = nullptr; } for (int i = 0; i < maxparts; i++) { query_init(ci->si + i); } ci->s = search16_init(opt_match, opt_mismatch, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); } auto chimera_thread_exit(struct chimera_info_s * ci) -> void { search16_exit(ci->s); for (int i = 0; i < maxparts; i++) { query_exit(ci->si + i); } if (ci->maxsmooth) { xfree(ci->maxsmooth); } if (ci->match) { xfree(ci->match); } if (ci->insert) { xfree(ci->insert); } if (ci->smooth) { xfree(ci->smooth); } if (ci->diffs) { xfree(ci->diffs); } if (ci->votes) { xfree(ci->votes); } if (ci->model) { xfree(ci->model); } if (ci->ignore) { xfree(ci->ignore); } if (ci->maxi) { xfree(ci->maxi); } if (ci->qaln) { xfree(ci->qaln); } if (ci->query_seq) { xfree(ci->query_seq); } if (ci->query_head) { xfree(ci->query_head); } if (ci->scan_p) { xfree(ci->scan_p); } if (ci->scan_q) { xfree(ci->scan_q); } for (int f = 0; f < maxparents; f++) { if (ci->paln[f]) { xfree(ci->paln[f]); } } } auto chimera_thread_core(struct chimera_info_s * ci) -> uint64_t { chimera_thread_init(ci); auto * allhits_list = (struct hit *) xmalloc(maxcandidates * sizeof(struct hit)); LinearMemoryAligner lma; int64_t * scorematrix = lma.scorematrix_create(opt_match, opt_mismatch); lma.set_parameters(scorematrix, opt_gap_open_query_left, opt_gap_open_target_left, 
/* chimera_thread_core(), continued: remaining arguments of the
   lma.set_parameters(...) call begun on the previous line */
opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right);

/* main loop: one query per iteration until the input is exhausted */
while (true) {
  /* get next sequence */
  xpthread_mutex_lock(&mutex_input);

  if (opt_uchime_ref) {
    /* reference mode: queries are read from the FASTA file */
    if (fasta_next(query_fasta_h, not opt_notrunclabels, chrmap_no_change)) {
      ci->query_head_len = fasta_get_header_length(query_fasta_h);
      ci->query_len = fasta_get_sequence_length(query_fasta_h);
      ci->query_no = fasta_get_seqno(query_fasta_h);
      ci->query_size = fasta_get_abundance(query_fasta_h);

      /* if necessary expand memory for arrays based on query length */
      realloc_arrays(ci);

      /* copy the data locally (query seq, head) */
      strcpy(ci->query_head, fasta_get_header(query_fasta_h));
      strcpy(ci->query_seq, fasta_get_sequence(query_fasta_h));
    }
    else {
      xpthread_mutex_unlock(&mutex_input);
      break; /* end while loop */
    }
  }
  else {
    /* de novo modes: queries are the database sequences, taken in
       order through the shared counter seqno */
    if (seqno < db_getsequencecount()) {
      ci->query_no = seqno;
      ci->query_head_len = db_getheaderlen(seqno);
      ci->query_len = db_getsequencelen(seqno);
      ci->query_size = db_getabundance(seqno);

      /* if necessary expand memory for arrays based on query length */
      realloc_arrays(ci);

      strcpy(ci->query_head, db_getheader(seqno));
      strcpy(ci->query_seq, db_getsequence(seqno));
    }
    else {
      xpthread_mutex_unlock(&mutex_input);
      break; /* end while loop */
    }
  }

  xpthread_mutex_unlock(&mutex_input);

  int status = 0;

  /* partition query */
  partition_query(ci);

  /* perform searches and collect candidate parents */
  ci->cand_count = 0;
  int allhits_count = 0;

  /* queries shorter than the number of parts are not searched */
  if (ci->query_len >= parts) {
    for (int i = 0; i < parts; i++) {
      struct hit * hits = nullptr;
      int hit_count = 0;
      search_onequery(ci->si + i, opt_qmask);
      search_joinhits(ci->si + i, nullptr, & hits, & hit_count);
      /* keep only the accepted hits of this part */
      for (int j = 0; j < hit_count; j++) {
        if (hits[j].accepted) {
          allhits_list[allhits_count++] = hits[j];
        }
      }
      xfree(hits);
    }
  }

  /* reduce the hits of all parts to a list of distinct targets */
  for (int i = 0; i < allhits_count; i++) {
    unsigned int const target = allhits_list[i].target;

    /* skip duplicates */
    int k {0};
    for (k = 0; k < ci->cand_count; k++) {
      if (ci->cand_list[k] == target) { break; }
    }
    if (k == ci->cand_count) {
      ci->cand_list[ci->cand_count++] = target;
    }

    /* deallocate cigar */
    if (allhits_list[i].nwalignment) {
      xfree(allhits_list[i].nwalignment);
      allhits_list[i].nwalignment = nullptr;
    }
  }

  /* align full query to each candidate */
  search16_qprep(ci->s, ci->query_seq, ci->query_len);
  search16(ci->s, ci->cand_count, ci->cand_list, ci->snwscore, ci->snwalignmentlength, ci->snwmatches, ci->snwmismatches, ci->snwgaps, ci->nwcigar);

  /* copy the aligner results into the nw* arrays, re-aligning the
     candidates the 16-bit aligner could not handle */
  for (int i = 0; i < ci->cand_count; i++) {
    int64_t const target = ci->cand_list[i];
    int64_t nwscore = ci->snwscore[i];
    char * nwcigar = nullptr;
    int64_t nwalignmentlength = 0;
    int64_t nwmatches = 0;
    int64_t nwmismatches = 0;
    int64_t nwgaps = 0;

    /* NOTE(review): std::numeric_limits has no template argument in
       this text and will not compile as written; the argument was
       presumably lost together with other <...> text — restore it
       from the element type of ci->snwscore */
    if (nwscore == std::numeric_limits::max()) {
      /* In case the SIMD aligner cannot align, perform a new alignment with the linear memory aligner */
      char * tseq = db_getsequence(target);
      int64_t const tseqlen = db_getsequencelen(target);

      if (ci->nwcigar[i]) { xfree(ci->nwcigar[i]); }

      nwcigar = xstrdup(lma.align(ci->query_seq, tseq, ci->query_len, tseqlen));
      lma.alignstats(nwcigar, ci->query_seq, tseq, & nwscore, & nwalignmentlength, & nwmatches, & nwmismatches, & nwgaps);

      ci->nwcigar[i] = nwcigar;
      ci->nwscore[i] = nwscore;
      ci->nwalignmentlength[i] = nwalignmentlength;
      ci->nwmatches[i] = nwmatches;
      ci->nwmismatches[i] = nwmismatches;
      ci->nwgaps[i] = nwgaps;
    }
    else {
      ci->nwscore[i] = ci->snwscore[i];
      ci->nwalignmentlength[i] = ci->snwalignmentlength[i];
      ci->nwmatches[i] = ci->snwmatches[i];
      ci->nwmismatches[i] = ci->snwmismatches[i];
      ci->nwgaps[i] = ci->snwgaps[i];
    }
  }

  /* find the best pair of parents, then compute score for them */
  if (opt_chimeras_denovo) {
    /* long high-quality reads */
    if (find_best_parents_long(ci)) { status = eval_parents_long(ci); }
    else { status = 0; }
  }
  else {
    if (find_best_parents(ci)) { status = eval_parents(ci); }
    else { status = 0; }
  }

  /* output results */
  xpthread_mutex_lock(&mutex_output);

  ++total_count;
  total_abundance += ci->query_size;

  /* status 4: chimera */
  if (status == 4) {
    ++chimera_count;
    chimera_abundance += ci->query_size;
    if (opt_chimeras) {
      fasta_print_general(fp_chimeras, nullptr, ci->query_seq, ci->query_len, ci->query_head, ci->query_head_len, ci->query_size, chimera_count, -1.0, -1, -1, opt_fasta_score ? ( opt_uchime_ref ? "uchime_ref" : "uchime_denovo" ) : nullptr, ci->best_h);
    }
  }

  /* status 3: borderline */
  if (status == 3) {
    ++borderline_count;
    borderline_abundance += ci->query_size;
    if (opt_borderline) {
      fasta_print_general(fp_borderline, nullptr, ci->query_seq, ci->query_len, ci->query_head, ci->query_head_len, ci->query_size, borderline_count, -1.0, -1, -1, opt_fasta_score ? ( opt_uchime_ref ? "uchime_ref" : "uchime_denovo" ) : nullptr, ci->best_h);
    }
  }

  /* status 0-2: non-chimera */
  if (status < 3) {
    ++nonchimera_count;
    nonchimera_abundance += ci->query_size;

    /* output no parents, no chimeras */
    /* status 0 and 1 have not been reported by eval_parents(), so a
       placeholder record is written here; it has one column fewer
       when opt_uchimeout5 is set */
    if ((status < 2) and opt_uchimeout) {
      fprintf(fp_uchimeout, "0.0000\t");
      header_fprint_strip(fp_uchimeout, ci->query_head, ci->query_head_len, opt_xsize, opt_xee, opt_xlength);
      if (opt_uchimeout5) {
        fprintf(fp_uchimeout, "\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN\n");
      }
      else {
        fprintf(fp_uchimeout, "\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN\n");
      }
    }

    if (opt_nonchimeras) {
      fasta_print_general(fp_nonchimeras, nullptr, ci->query_seq, ci->query_len, ci->query_head, ci->query_head_len, ci->query_size, nonchimera_count, -1.0, -1, -1, opt_fasta_score ? ( opt_uchime_ref ? "uchime_ref" : "uchime_denovo" ) : nullptr, ci->best_h);
    }
  }

  if (status < 3) {
    /* uchime_denovo: add non-chimeras to db */
    if (opt_uchime_denovo or opt_uchime2_denovo or opt_uchime3_denovo or opt_chimeras_denovo) {
      dbindex_addsequence(seqno, opt_qmask);
    }
  }

  /* release the alignment strings of this query's candidates */
  for (int i = 0; i < ci->cand_count; i++) {
    if (ci->nwcigar[i]) { xfree(ci->nwcigar[i]); }
  }

  /* NOTE(review): query_fasta_h is read here under mutex_output while
     another thread may be inside fasta_next() under mutex_input —
     confirm this is safe */
  if (opt_uchime_ref) { progress = fasta_get_position(query_fasta_h); }
  else { progress += db_getsequencelen(seqno); }

  progress_update(progress);

  ++seqno;

  xpthread_mutex_unlock(&mutex_output);
}

if (allhits_list) { xfree(allhits_list); }

chimera_thread_exit(ci);

xfree(scorematrix);

return 0;
}


/* Thread entry point: the argument carries the thread index, which
   selects this thread's element of the cia array. */
auto chimera_thread_worker(void * vp) -> void * {
  return (void *) chimera_thread_core(cia + (int64_t) vp);
}


/* Start opt_threads joinable worker threads and wait for all of them
   to finish. */
auto chimera_threads_run() -> void {
  xpthread_attr_init(&attr);
  xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

  /* create worker threads */
  for (int64_t t = 0; t < opt_threads; t++) {
    /* the thread index is passed in the pointer argument itself */
    xpthread_create(pthread + t, & attr, chimera_thread_worker, (void*)t);
  }

  /* finish worker threads */
  for (int t = 0; t < opt_threads; t++) {
    xpthread_join(pthread[t], nullptr);
  }

  xpthread_attr_destroy(&attr);
}


/* Open the named output file into *f, or set *f to nullptr when no
   name was given. Terminates through fatal() if the file cannot be
   opened. */
auto open_chimera_file(FILE ** f, char * name) -> void {
  if (name) {
    *f = fopen_output(name);
    if (not *f) {
      fatal("Unable to open file %s for writing", name);
    }
  }
  else {
    *f = nullptr;
  }
}


/* Close an output file if it was opened; the result of fclose() is
   not checked. */
auto close_chimera_file(FILE * f) -> void {
  if (f) {
    fclose(f);
  }
}


/* Entry point of the chimera detection commands: opens the output
   files, prepares options, database and threads, runs the detection
   and reports the results. */
auto chimera() -> void {
  open_chimera_file(&fp_chimeras, opt_chimeras);
  open_chimera_file(&fp_nonchimeras, opt_nonchimeras);
  open_chimera_file(&fp_borderline, opt_borderline);

  /* the long-read command takes its alignment and tabular file names
     from different options than the uchime commands */
  if (opt_chimeras_denovo) {
    open_chimera_file(&fp_uchimealns, opt_alnout);
    open_chimera_file(&fp_uchimeout, opt_tabbedout);
  }
  else {
    open_chimera_file(&fp_uchimealns, opt_uchimealns);
    open_chimera_file(&fp_uchimeout, opt_uchimeout);
  }

  /* override any options the user might have set */
  opt_maxaccepts = few;
  opt_maxrejects = rejects;
  opt_id = chimera_id;

  if (opt_strand != 1) { fatal("Only --strand plus is
allowed with uchime_ref."); }

/* chimera(), continued */

/* de novo modes: search the input against itself, single-threaded,
   and restrict the abundance ratio between a query and its candidate
   parents through opt_abskew */
if (not opt_uchime_ref) {
  opt_self = 1;
  opt_selfid = 1;
  opt_threads = 1;
  opt_maxsizeratio = 1.0 / opt_abskew;
}

tophits = opt_maxaccepts + opt_maxrejects;

uint64_t progress_total = 0;
chimera_count = 0;
nonchimera_count = 0;
progress = 0;
seqno = 0;

/* prepare threads */
pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t));
cia = (struct chimera_info_s *) xmalloc(opt_threads * sizeof(struct chimera_info_s));

/* init mutexes for input and output */
xpthread_mutex_init(&mutex_input, nullptr);
xpthread_mutex_init(&mutex_output, nullptr);

char * denovo_dbname = nullptr;

/* prepare queries / database */
if (opt_uchime_ref) {
  /* check if the reference database may be an UDB file */
  auto const is_udb = udb_detect_isudb(opt_db);

  if (is_udb) {
    udb_read(opt_db, true, true);
  }
  else {
    db_read(opt_db, 0);
    if (opt_dbmask == MASK_DUST) { dust_all(); }
    else if ((opt_dbmask == MASK_SOFT) and (opt_hardmask)) { hardmask_all(); }
    dbindex_prepare(1, opt_dbmask);
    dbindex_addallsequences(opt_dbmask);
  }

  query_fasta_h = fasta_open(opt_uchime_ref);
  progress_total = fasta_get_size(query_fasta_h);
}
else {
  /* the input file name comes from whichever de novo option is set */
  if (opt_uchime_denovo) { denovo_dbname = opt_uchime_denovo; }
  else if (opt_uchime2_denovo) { denovo_dbname = opt_uchime2_denovo; }
  else if (opt_uchime3_denovo) { denovo_dbname = opt_uchime3_denovo; }
  else if (opt_chimeras_denovo) { denovo_dbname = opt_chimeras_denovo; }
  else { fatal("Internal error"); }

  db_read(denovo_dbname, 0);

  if (opt_qmask == MASK_DUST) { dust_all(); }
  else if ((opt_qmask == MASK_SOFT) and (opt_hardmask)) { hardmask_all(); }

  /* sequences are processed in the order produced by this sort; the
     index starts without sequences and is filled by the worker as
     non-chimeras are found */
  db_sortbyabundance();
  dbindex_prepare(1, opt_qmask);
  progress_total = db_getnucleotidecount();
}

/* log the parameters that apply to the selected command */
if (opt_log) {
  if (opt_uchime_ref or opt_uchime_denovo) {
    fprintf(fp_log, "%8.2f minh\n", opt_minh);
  }
  if (opt_uchime_ref or opt_uchime_denovo or opt_uchime2_denovo or opt_uchime3_denovo) {
    fprintf(fp_log, "%8.2f xn\n", opt_xn);
    fprintf(fp_log, "%8.2f dn\n", opt_dn);
    fprintf(fp_log, "%8.2f xa\n", 1.0);
  }
  if (opt_uchime_ref or opt_uchime_denovo) {
    fprintf(fp_log, "%8.2f mindiv\n", opt_mindiv);
  }
  fprintf(fp_log, "%8.2f id\n", opt_id);
  if (opt_uchime_ref or opt_uchime_denovo or opt_uchime2_denovo or opt_uchime3_denovo) {
    fprintf(fp_log, "%8d maxp\n", 2);
  }
  fprintf(fp_log, "\n");
}

progress_init("Detecting chimeras", progress_total);
chimera_threads_run();
progress_done();

/* summary on stderr; percentages are printed only when the
   respective total is positive, and the long-read command has no
   borderline category */
if (not opt_quiet) {
  if (total_count > 0) {
    if (opt_chimeras_denovo) {
      fprintf(stderr, "Found %d (%.1f%%) chimeras and " "%d (%.1f%%) non-chimeras " "in %u unique sequences.\n", chimera_count, 100.0 * chimera_count / total_count, nonchimera_count, 100.0 * nonchimera_count / total_count, total_count);
    }
    else {
      fprintf(stderr, "Found %d (%.1f%%) chimeras, " "%d (%.1f%%) non-chimeras,\n" "and %d (%.1f%%) borderline sequences " "in %u unique sequences.\n", chimera_count, 100.0 * chimera_count / total_count, nonchimera_count, 100.0 * nonchimera_count / total_count, borderline_count, 100.0 * borderline_count / total_count, total_count);
    }
  }
  else {
    if (opt_chimeras_denovo) {
      fprintf(stderr, "Found %d chimeras and " "%d non-chimeras " "in %u unique sequences.\n", chimera_count, nonchimera_count, total_count);
    }
    else {
      fprintf(stderr, "Found %d chimeras, " "%d non-chimeras,\n" "and %d borderline sequences " "in %u unique sequences.\n", chimera_count, nonchimera_count, borderline_count, total_count);
    }
  }

  if (total_abundance > 0) {
    if (opt_chimeras_denovo) {
      fprintf(stderr, "Taking abundance information into account, " "this corresponds to\n" "%" PRId64 " (%.1f%%) chimeras and " "%" PRId64 " (%.1f%%) non-chimeras " "in %" PRId64 " total sequences.\n", chimera_abundance, 100.0 * chimera_abundance / total_abundance, nonchimera_abundance, 100.0 * nonchimera_abundance / total_abundance, total_abundance);
    }
    else {
      fprintf(stderr, "Taking abundance information into account, " "this corresponds to\n" "%" PRId64 " (%.1f%%) chimeras, " "%" PRId64 " (%.1f%%) non-chimeras,\n" "and %" PRId64 " (%.1f%%) borderline sequences " "in %" PRId64 " total sequences.\n", chimera_abundance, 100.0 * chimera_abundance / total_abundance, nonchimera_abundance, 100.0 * nonchimera_abundance / total_abundance, borderline_abundance, 100.0 * borderline_abundance / total_abundance, total_abundance);
    }
  }
  else {
    if (opt_chimeras_denovo) {
      fprintf(stderr, "Taking abundance information into account, " "this corresponds to\n" "%" PRId64 " chimeras, " "%" PRId64 " non-chimeras " "in %" PRId64 " total sequences.\n", chimera_abundance, nonchimera_abundance, total_abundance);
    }
    else {
      fprintf(stderr, "Taking abundance information into account, " "this corresponds to\n" "%" PRId64 " chimeras, " "%" PRId64 " non-chimeras,\n" "and %" PRId64 " borderline sequences " "in %" PRId64 " total sequences.\n", chimera_abundance, nonchimera_abundance, borderline_abundance, total_abundance);
    }
  }
}

/* one-line result in the log: input name, then chimeras / sequences
   processed (seqno is incremented once per processed query) */
if (opt_log) {
  if (opt_uchime_ref) { fprintf(fp_log, "%s", opt_uchime_ref); }
  else { fprintf(fp_log, "%s", denovo_dbname); }

  if (seqno > 0) {
    fprintf(fp_log, ": %d/%u chimeras (%.1f%%)\n", chimera_count, seqno, 100.0 * chimera_count / seqno);
  }
  else {
    fprintf(fp_log, ": %d/%u chimeras\n", chimera_count, seqno);
  }
}

/* clean up */
if (opt_uchime_ref) { fasta_close(query_fasta_h); }

dbindex_free();
db_free();

xpthread_mutex_destroy(&mutex_output);
xpthread_mutex_destroy(&mutex_input);

xfree(cia);
xfree(pthread);

close_chimera_file(fp_borderline);
close_chimera_file(fp_uchimeout);
close_chimera_file(fp_uchimealns);
close_chimera_file(fp_nonchimeras);
close_chimera_file(fp_chimeras);

show_rusage();
}
vsearch-2.30.0/src/chimera.h000066400000000000000000000050001476012147200155760ustar00rootroot00000000000000/*

  VSEARCH: a versatile open source tool for metagenomics

  Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
  All rights reserved.
  Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
  Department of Informatics, University of Oslo,
  PO Box 1080 Blindern, NO-0316 Oslo, Norway

  This software is dual-licensed and available under a choice
  of one of two licenses, either under the terms of the GNU
  General Public License version 3 or the BSD 2-Clause License.


  GNU General Public License version 3

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program. If not, see <http://www.gnu.org/licenses/>.


  The BSD 2-Clause License

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ constexpr auto maxparents = 20; /* max, could be fewer */ auto chimera() -> void; vsearch-2.30.0/src/city.cc000066400000000000000000000466101476012147200153100ustar00rootroot00000000000000// Copyright (c) 2011 Google, Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // // CityHash, by Geoff Pike and Jyrki Alakuijala // // This file provides CityHash64() and related functions. 
// // It's probably possible to create even faster hash functions by // writing a program that systematically explores some of the space of // possible hash functions, by using SIMD instructions, or by // compromising on hash quality. #include "config.h" #include #include // std::swap #include // int32_t #include // std::memcpy, std::memset #include // std::pair, std::make_pair using namespace std; // refactoring: bad practice static auto UNALIGNED_LOAD64(const char *p) -> uint64 { uint64 result = 0; std::memcpy(&result, p, sizeof(result)); return result; } static auto UNALIGNED_LOAD32(const char *p) -> uint32 { uint32 result = 0; std::memcpy(&result, p, sizeof(result)); return result; } #ifdef _MSC_VER #include #define bswap_32(x) _byteswap_ulong(x) #define bswap_64(x) _byteswap_uint64(x) #elif defined(__APPLE__) // Mac OS X / Darwin features #include #define bswap_32(x) OSSwapInt32(x) #define bswap_64(x) OSSwapInt64(x) #elif defined(__FreeBSD__) #include #define bswap_32(x) bswap32(x) #define bswap_64(x) bswap64(x) #elif defined(__NetBSD__) #include #include #if defined(__BSWAP_RENAME) && !defined(__bswap_32) #define bswap_32(x) bswap32(x) #define bswap_64(x) bswap64(x) #endif #else #include #endif #ifdef WORDS_BIGENDIAN #define uint32_in_expected_order(x) (bswap_32(x)) #define uint64_in_expected_order(x) (bswap_64(x)) #else #define uint32_in_expected_order(x) (x) #define uint64_in_expected_order(x) (x) #endif #if !defined(LIKELY) #if HAVE_BUILTIN_EXPECT #define LIKELY(x) (__builtin_expect(!!(x), 1)) #else #define LIKELY(x) (x) #endif #endif static auto Fetch64(const char *p) -> uint64 { return uint64_in_expected_order(UNALIGNED_LOAD64(p)); } static auto Fetch32(const char *p) -> uint32 { return uint32_in_expected_order(UNALIGNED_LOAD32(p)); } // Some primes between 2^63 and 2^64 for various uses. 
static const uint64 k0 = 0xc3a5c85c97cb3127ULL; static const uint64 k1 = 0xb492b66fbe98f273ULL; static const uint64 k2 = 0x9ae16a3b2f90404fULL; // Magic numbers for 32-bit hashing. Copied from Murmur3. static const uint32_t c1 = 0xcc9e2d51; static const uint32_t c2 = 0x1b873593; // A 32-bit to 32-bit integer hash copied from Murmur3. static auto fmix(uint32 h) -> uint32 { h ^= h >> 16U; h *= 0x85ebca6b; h ^= h >> 13U; h *= 0xc2b2ae35; h ^= h >> 16U; return h; } static auto Rotate32(uint32 val, int shift) -> uint32 { // Avoid shifting by 32: doing so yields an undefined result. return shift == 0 ? val : ((val >> shift) | (val << (32 - shift))); } #undef PERMUTE3 #define PERMUTE3(a, b, c) do { std::swap(a, b); std::swap(a, c); } while (0) static auto Mur(uint32 a, uint32 h) -> uint32 { // Helper from Murmur3 for combining two 32-bit values. a *= c1; a = Rotate32(a, 17); a *= c2; h ^= a; h = Rotate32(h, 19); return (h * 5) + 0xe6546b64; } static auto Hash32Len13to24(const char * s, size_t len) -> uint32 { const uint32 a = Fetch32(s - 4 + (len >> 1U)); const uint32 b = Fetch32(s + 4); const uint32 c = Fetch32(s + len - 8); const uint32 d = Fetch32(s + (len >> 1U)); const uint32 e = Fetch32(s); const uint32 f = Fetch32(s + len - 4); const uint32 h = len; return fmix(Mur(f, Mur(e, Mur(d, Mur(c, Mur(b, Mur(a, h))))))); } static auto Hash32Len0to4(const char * s, size_t len) -> uint32 { uint32 b = 0; uint32 c = 9; for (int i = 0; i < len; i++) { const signed char v = s[i]; b = b * c1 + v; c ^= b; } return fmix(Mur(b, Mur(len, c))); } static auto Hash32Len5to12(const char * s, size_t len) -> uint32 { uint32 a = len; uint32 b = len * 5; uint32 c = 9; uint32 const d = b; a += Fetch32(s); b += Fetch32(s + len - 4); c += Fetch32(s + ((len >> 1U) & 4U)); return fmix(Mur(c, Mur(b, Mur(a, d)))); } auto CityHash32(const char * s, size_t len) -> uint32 { if (len <= 24) { return len <= 12 ? (len <= 4 ? 
Hash32Len0to4(s, len) : Hash32Len5to12(s, len)) : Hash32Len13to24(s, len); } // len > 24 uint32 h = len; uint32 g = c1 * len; uint32 f = g; const uint32 a0 = Rotate32(Fetch32(s + len - 4) * c1, 17) * c2; const uint32 a1 = Rotate32(Fetch32(s + len - 8) * c1, 17) * c2; const uint32 a2 = Rotate32(Fetch32(s + len - 16) * c1, 17) * c2; const uint32 a3 = Rotate32(Fetch32(s + len - 12) * c1, 17) * c2; const uint32 a4 = Rotate32(Fetch32(s + len - 20) * c1, 17) * c2; h ^= a0; h = Rotate32(h, 19); h = h * 5 + 0xe6546b64; h ^= a2; h = Rotate32(h, 19); h = h * 5 + 0xe6546b64; g ^= a1; g = Rotate32(g, 19); g = g * 5 + 0xe6546b64; g ^= a3; g = Rotate32(g, 19); g = g * 5 + 0xe6546b64; f += a4; f = Rotate32(f, 19); f = f * 5 + 0xe6546b64; size_t iters = (len - 1) / 20; do { const uint32 a0 = Rotate32(Fetch32(s) * c1, 17) * c2; const uint32 a1 = Fetch32(s + 4); const uint32 a2 = Rotate32(Fetch32(s + 8) * c1, 17) * c2; const uint32 a3 = Rotate32(Fetch32(s + 12) * c1, 17) * c2; const uint32 a4 = Fetch32(s + 16); h ^= a0; h = Rotate32(h, 18); h = h * 5 + 0xe6546b64; f += a1; f = Rotate32(f, 19); f = f * c1; g += a2; g = Rotate32(g, 18); g = g * 5 + 0xe6546b64; h ^= a3 + a1; h = Rotate32(h, 19); h = h * 5 + 0xe6546b64; g ^= a4; g = bswap_32(g) * 5; h += a4 * 5; h = bswap_32(h); f += a0; PERMUTE3(f, h, g); s += 20; } while (--iters != 0); g = Rotate32(g, 11) * c1; g = Rotate32(g, 17) * c1; f = Rotate32(f, 11) * c1; f = Rotate32(f, 17) * c1; h = Rotate32(h + g, 19); h = h * 5 + 0xe6546b64; h = Rotate32(h, 17) * c1; h = Rotate32(h + f, 19); h = h * 5 + 0xe6546b64; h = Rotate32(h, 17) * c1; return h; } // Bitwise right rotate. Normally this will compile to a single // instruction, especially if the shift is a manifest constant. static auto Rotate(uint64 val, int shift) -> uint64 { // Avoid shifting by 64: doing so yields an undefined result. return shift == 0 ? 
val : ((val >> shift) | (val << (64U - shift))); } static auto ShiftMix(uint64 val) -> uint64 { return val ^ (val >> 47U); } static auto HashLen16(uint64 u, uint64 v) -> uint64 { return Hash128to64(uint128(u, v)); } static auto HashLen16(uint64 u, uint64 v, uint64 mul) -> uint64 { // Murmur-inspired hashing. uint64 a = (u ^ v) * mul; a ^= (a >> 47U); uint64 b = (v ^ a) * mul; b ^= (b >> 47U); b *= mul; return b; } static auto HashLen0to16(const char *s, size_t len) -> uint64 { if (len >= 8) { const uint64 mul = k2 + (len * 2); const uint64 a = Fetch64(s) + k2; const uint64 b = Fetch64(s + len - 8); const uint64 c = (Rotate(b, 37) * mul) + a; const uint64 d = (Rotate(a, 25) + b) * mul; return HashLen16(c, d, mul); } if (len >= 4) { const uint64 mul = k2 + (len * 2); const uint64 a = Fetch32(s); return HashLen16(len + (a << 3U), Fetch32(s + len - 4), mul); } if (len > 0) { const uint8 a = s[0]; const uint8 b = s[len >> 1U]; const uint8 c = s[len - 1]; const uint32 y = static_cast(a) + (static_cast(b) << 8U); const uint32 z = len + (static_cast(c) << 2U); return ShiftMix((y * k2) ^ (z * k0)) * k2; } return k2; } // This probably works well for 16-byte strings as well, but it may be overkill // in that case. static auto HashLen17to32(const char *s, size_t len) -> uint64 { const uint64 mul = k2 + (len * 2); const uint64 a = Fetch64(s) * k1; const uint64 b = Fetch64(s + 8); const uint64 c = Fetch64(s + len - 8) * mul; const uint64 d = Fetch64(s + len - 16) * k2; return HashLen16(Rotate(a + b, 43) + Rotate(c, 30) + d, a + Rotate(b + k2, 18) + c, mul); } // Return a 16-byte hash for 48 bytes. Quick and dirty. // Callers do best to use "random-looking" values for a and b. static auto WeakHashLen32WithSeeds(uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) -> std::pair { a += w; b = Rotate(b + a + z, 21); const uint64 c = a; a += x; a += y; b += Rotate(a, 44); return std::make_pair(a + z, b + c); } // Return a 16-byte hash for s[0] ... s[31], a, and b. 
Quick and dirty. static auto WeakHashLen32WithSeeds(const char *s, uint64 a, uint64 b) -> std::pair { return WeakHashLen32WithSeeds(Fetch64(s), Fetch64(s + 8), Fetch64(s + 16), Fetch64(s + 24), a, b); } // Return an 8-byte hash for 33 to 64 bytes. static auto HashLen33to64(const char *s, size_t len) -> uint64 { const uint64 mul = k2 + (len * 2); uint64 a = Fetch64(s) * k2; uint64 b = Fetch64(s + 8); const uint64 c = Fetch64(s + len - 24); const uint64 d = Fetch64(s + len - 32); const uint64 e = Fetch64(s + 16) * k2; const uint64 f = Fetch64(s + 24) * 9; const uint64 g = Fetch64(s + len - 8); const uint64 h = Fetch64(s + len - 16) * mul; const uint64 u = Rotate(a + g, 43) + ((Rotate(b, 30) + c) * 9); const uint64 v = ((a + g) ^ d) + f + 1; const uint64 w = bswap_64((u + v) * mul) + h; const uint64 x = Rotate(e + f, 42) + c; const uint64 y = (bswap_64((v + w) * mul) + g) * mul; const uint64 z = e + f + c; a = bswap_64(((x + z) * mul) + y) + b; b = ShiftMix(((z + a) * mul) + d + h) * mul; return b + x; } auto CityHash64(const char *s, size_t len) -> uint64 { if (len <= 32) { if (len <= 16) { return HashLen0to16(s, len); } else { return HashLen17to32(s, len); } } else if (len <= 64) { return HashLen33to64(s, len); } // For strings over 64 bytes we hash the end first, and then as we // loop we keep 56 bytes of state: v, w, x, y, and z. uint64 x = Fetch64(s + len - 40); uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56); uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24)); auto v = WeakHashLen32WithSeeds(s + len - 64, len, z); auto w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x); x = x * k1 + Fetch64(s); // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. 
len = (len - 1) & ~static_cast(63); do { x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; x ^= w.second; y += v.first + Fetch64(s + 40); z = Rotate(z + w.first, 33) * k1; v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); std::swap(z, x); s += 64; len -= 64; } while (len != 0); return HashLen16(HashLen16(v.first, w.first) + (ShiftMix(y) * k1) + z, HashLen16(v.second, w.second) + x); } auto CityHash64WithSeed(const char *s, size_t len, uint64 seed) -> uint64 { return CityHash64WithSeeds(s, len, k2, seed); } auto CityHash64WithSeeds(const char *s, size_t len, uint64 seed0, uint64 seed1) -> uint64 { return HashLen16(CityHash64(s, len) - seed0, seed1); } // A subroutine for CityHash128(). Returns a decent 128-bit hash for strings // of any length representable in signed long. Based on City and Murmur. static auto CityMurmur(const char *s, size_t len, uint128 seed) -> uint128 { uint64 a = Uint128Low64(seed); uint64 b = Uint128High64(seed); uint64 c = 0; uint64 d = 0; signed long l = len - 16; if (l <= 0) { // len <= 16 a = ShiftMix(a * k1) * k1; c = b * k1 + HashLen0to16(s, len); d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c)); } else { // len > 16 c = HashLen16(Fetch64(s + len - 8) + k1, a); d = HashLen16(b + len, c + Fetch64(s + len - 16)); a += d; do { a ^= ShiftMix(Fetch64(s) * k1) * k1; a *= k1; b ^= a; c ^= ShiftMix(Fetch64(s + 8) * k1) * k1; c *= k1; d ^= c; s += 16; l -= 16; } while (l > 0); } a = HashLen16(a, c); b = HashLen16(d, b); return uint128(a ^ b, HashLen16(b, a)); } auto CityHash128WithSeed(const char *s, size_t len, uint128 seed) -> uint128 { if (len < 128) { return CityMurmur(s, len, seed); } // We expect len >= 128 to be the common case. Keep 56 bytes of state: // v, w, x, y, and z. 
pair v; pair w; uint64 x = Uint128Low64(seed); uint64 y = Uint128High64(seed); uint64 z = len * k1; v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s); v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8); w.first = Rotate(y + z, 35) * k1 + x; w.second = Rotate(x + Fetch64(s + 88), 53) * k1; // This is the same inner loop as CityHash64(), manually unrolled. do { x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; x ^= w.second; y += v.first + Fetch64(s + 40); z = Rotate(z + w.first, 33) * k1; v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); std::swap(z, x); s += 64; x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; x ^= w.second; y += v.first + Fetch64(s + 40); z = Rotate(z + w.first, 33) * k1; v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); std::swap(z, x); s += 64; len -= 128; } while (LIKELY(len >= 128)); x += Rotate(v.first + z, 49) * k0; y = y * k0 + Rotate(w.second, 37); z = z * k0 + Rotate(w.first, 27); w.first *= 9; v.first *= k0; // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. for (size_t tail_done = 0; tail_done < len; ) { tail_done += 32; y = Rotate(x + y, 42) * k0 + v.second; w.first += Fetch64(s + len - tail_done + 16); x = x * k0 + w.first; z += w.second + Fetch64(s + len - tail_done); w.second += v.first; v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second); v.first *= k0; } // At this point our 56 bytes of state should contain more than // enough information for a strong 128-bit hash. We use two // different 56-byte-to-8-byte hashes to get a 16-byte final result. 
x = HashLen16(x, v.first); y = HashLen16(y + z, w.first); return uint128(HashLen16(x + v.second, w.second) + y, HashLen16(x + w.second, y + v.second)); } auto CityHash128(const char *s, size_t len) -> uint128 { return len >= 16 ? CityHash128WithSeed(s + 16, len - 16, uint128(Fetch64(s), Fetch64(s + 8) + k0)) : CityHash128WithSeed(s, len, uint128(k0, k1)); } #ifdef __SSE4_2__ #include #include // Requires len >= 240. static auto CityHashCrc256Long(const char *s, size_t len, uint32 seed, uint64 *result) -> void { uint64 a = Fetch64(s + 56) + k0; uint64 b = Fetch64(s + 96) + k0; uint64 c = result[0] = HashLen16(b, len); uint64 d = result[1] = Fetch64(s + 120) * k0 + len; uint64 e = Fetch64(s + 184) + seed; uint64 f = 0; uint64 g = 0; uint64 h = c + d; uint64 x = seed; uint64 y = 0; uint64 z = 0; // 240 bytes of input per iter. size_t iters = len / 240; len -= iters * 240; do { #undef CHUNK #define CHUNK(r) \ PERMUTE3(x, z, y); \ b += Fetch64(s); \ c += Fetch64(s + 8); \ d += Fetch64(s + 16); \ e += Fetch64(s + 24); \ f += Fetch64(s + 32); \ a += b; \ h += f; \ b += c; \ f += d; \ g += e; \ e += z; \ g += x; \ z = _mm_crc32_u64(z, b + g); \ y = _mm_crc32_u64(y, e + h); \ x = _mm_crc32_u64(x, f + a); \ e = Rotate(e, r); \ c += e; \ s += 40 CHUNK(0); PERMUTE3(a, h, c); CHUNK(33); PERMUTE3(a, h, f); CHUNK(0); PERMUTE3(b, h, f); CHUNK(42); PERMUTE3(b, h, d); CHUNK(0); PERMUTE3(b, h, e); CHUNK(33); PERMUTE3(a, h, e); } while (--iters > 0); while (len >= 40) { CHUNK(29); e ^= Rotate(a, 20); h += Rotate(b, 30); g ^= Rotate(c, 40); f += Rotate(d, 34); PERMUTE3(c, h, g); len -= 40; } if (len > 0) { s = s + len - 40; CHUNK(33); e ^= Rotate(a, 43); h += Rotate(b, 42); g ^= Rotate(c, 41); f += Rotate(d, 40); } result[0] ^= h; result[1] ^= g; g += h; a = HashLen16(a, g + z); x += y << 32; b += x; c = HashLen16(c, z) + h; d = HashLen16(d, e + result[0]); g += e; h += HashLen16(x, f); e = HashLen16(a, d) + g; z = HashLen16(b, c) + a; y = HashLen16(g, h) + c; result[0] = e + z + y + 
x; a = ShiftMix((a + y) * k0) * k0 + b; result[1] += a + result[0]; a = ShiftMix(a * k0) * k0 + c; result[2] = a + result[1]; a = ShiftMix((a + e) * k0) * k0; result[3] = a + result[2]; } // Requires len < 240. static auto CityHashCrc256Short(const char *s, size_t len, uint64 *result) -> void { char buf[240]; memcpy(buf, s, len); memset(buf + len, 0, 240 - len); CityHashCrc256Long(buf, 240, ~static_cast(len), result); } auto CityHashCrc256(const char *s, size_t len, uint64 *result) -> void { if (LIKELY(len >= 240)) { CityHashCrc256Long(s, len, 0, result); } else { CityHashCrc256Short(s, len, result); } } auto CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed) -> uint128 { if (len <= 900) { return CityHash128WithSeed(s, len, seed); } else { uint64 result[4]; CityHashCrc256(s, len, result); uint64 u = Uint128High64(seed) + result[0]; uint64 v = Uint128Low64(seed) + result[1]; return uint128(HashLen16(u, v + result[2]), HashLen16(Rotate(v, 32), u * k0 + result[3])); } } auto CityHashCrc128(const char *s, size_t len) -> uint128 { if (len <= 900) { return CityHash128(s, len); } else { uint64 result[4]; CityHashCrc256(s, len, result); return uint128(result[2], result[3]); } } #endif vsearch-2.30.0/src/city.h000066400000000000000000000117241476012147200151500ustar00rootroot00000000000000// Copyright (c) 2011 Google, Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // // CityHash, by Geoff Pike and Jyrki Alakuijala // // http://code.google.com/p/cityhash/ // // This file provides a few functions for hashing strings. All of them are // high-quality functions in the sense that they pass standard tests such // as Austin Appleby's SMHasher. They are also fast. // // For 64-bit x86 code, on short strings, we don't know of anything faster than // CityHash64 that is of comparable quality. We believe our nearest competitor // is Murmur3. For 64-bit x86 code, CityHash64 is an excellent choice for hash // tables and most other hashing (excluding cryptography). // // For 64-bit x86 code, on long strings, the picture is more complicated. // On many recent Intel CPUs, such as Nehalem, Westmere, Sandy Bridge, etc., // CityHashCrc128 appears to be faster than all competitors of comparable // quality. CityHash128 is also good but not quite as fast. We believe our // nearest competitor is Bob Jenkins' Spooky. We don't have great data for // other 64-bit CPUs, but for long strings we know that Spooky is slightly // faster than CityHash on some relatively recent AMD x86-64 CPUs, for example. // Note that CityHashCrc128 is declared in citycrc.h. // // For 32-bit x86 code, we don't know of anything faster than CityHash32 that // is of comparable quality. We believe our nearest competitor is Murmur3A. // (On 64-bit CPUs, it is typically faster to use the other CityHash variants.) // // Functions in the CityHash family are not suitable for cryptography. 
// // Please see CityHash's README file for more details on our performance // measurements and so on. // // WARNING: This code has been only lightly tested on big-endian platforms! // It is known to work well on little-endian platforms that have a small penalty // for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs. // It should work on all 32-bit and 64-bit platforms that allow unaligned reads; // bug reports are welcome. // // By the way, for some hash functions, given strings a and b, the hash // of a+b is easily derived from the hashes of a and b. This property // doesn't hold for any hash functions in this file. #ifndef CITY_HASH_H_ #define CITY_HASH_H_ #include // uint64_t #include // std::size_t #include // std::pair using uint8 = uint8_t; using uint32 = uint32_t; using uint64 = uint64_t; using uint128 = std::pair; inline auto Uint128Low64(const uint128& x) -> uint64 { return x.first; } inline auto Uint128High64(const uint128& x) -> uint64 { return x.second; } // Hash function for a byte array. auto CityHash64(const char *s, std::size_t len) -> uint64; // Hash function for a byte array. For convenience, a 64-bit seed is also // hashed into the result. auto CityHash64WithSeed(const char *s, std::size_t len, uint64 seed) -> uint64; // Hash function for a byte array. For convenience, two seeds are also // hashed into the result. auto CityHash64WithSeeds(const char *s, std::size_t len, uint64 seed0, uint64 seed1) -> uint64; // Hash function for a byte array. auto CityHash128(const char *s, std::size_t len) -> uint128; // Hash function for a byte array. For convenience, a 128-bit seed is also // hashed into the result. auto CityHash128WithSeed(const char *s, std::size_t len, uint128 seed) -> uint128; // Hash function for a byte array. Most useful in 32-bit binaries. auto CityHash32(const char *s, std::size_t len) -> uint32; // Hash 128 input bits down to 64 bits of output. // This is intended to be a reasonably good hash function. 
inline auto Hash128to64(const uint128& x) -> uint64 { // Murmur-inspired hashing. static constexpr auto divider = 47U; static constexpr uint64 kMul = 0x9ddfea08eb382d69ULL; uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; a ^= (a >> divider); uint64 b = (Uint128High64(x) ^ a) * kMul; b ^= (b >> divider); b *= kMul; return b; } #endif // CITY_HASH_H_ vsearch-2.30.0/src/citycrc.h000066400000000000000000000036201476012147200156340ustar00rootroot00000000000000// Copyright (c) 2011 Google, Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // // CityHash, by Geoff Pike and Jyrki Alakuijala // // This file declares the subset of the CityHash functions that require // _mm_crc32_u64(). See the CityHash README for details. // // Functions in the CityHash family are not suitable for cryptography. #ifndef CITY_HASH_CRC_H_ #define CITY_HASH_CRC_H_ #include #include // std::size_t // Hash function for a byte array. 
auto CityHashCrc128(const char *s, std::size_t len) -> uint128; // Hash function for a byte array. For convenience, a 128-bit seed is also // hashed into the result. auto CityHashCrc128WithSeed(const char *s, std::size_t len, uint128 seed) -> uint128; // Hash function for a byte array. Sets result[0] ... result[3]. auto CityHashCrc256(const char *s, std::size_t len, uint64 *result) -> void; #endif // CITY_HASH_CRC_H_ vsearch-2.30.0/src/cluster.cc000066400000000000000000001452661476012147200160300ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "align_simd.h" #include "attributes.h" #include "dbindex.h" #include "mask.h" #include "minheap.h" #include "msa.h" #include "otutable.h" #include "unique.h" #include // std::count, std::minmax_element, std::max_element, std::min #include // macros PRIu64 and PRId64 #include // INT_MAX, LONG_MAX #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose #include // std::qsort #include // std::strcpy, std::strlen #include #include #include // std::get #include static int tophits; /* the maximum number of hits to keep */ static int seqcount; /* number of database sequences */ struct clusterinfo_s { int seqno; int clusterno; char * cigar; int strand; }; using clusterinfo_t = struct clusterinfo_s; static clusterinfo_t * clusterinfo = nullptr; static int clusters = 0; static int count_matched = 0; static int count_notmatched = 0; static int64_t * cluster_abundance; static std::FILE * fp_centroids = nullptr; static std::FILE * fp_uc = nullptr; static std::FILE * fp_alnout = 
nullptr; static std::FILE * fp_samout = nullptr; static std::FILE * fp_userout = nullptr; static std::FILE * fp_blast6out = nullptr; static std::FILE * fp_fastapairs = nullptr; static std::FILE * fp_matched = nullptr; static std::FILE * fp_notmatched = nullptr; static std::FILE * fp_otutabout = nullptr; static std::FILE * fp_mothur_shared_out = nullptr; static std::FILE * fp_biomout = nullptr; static std::FILE * fp_qsegout = nullptr; static std::FILE * fp_tsegout = nullptr; static pthread_attr_t attr; static struct searchinfo_s * si_plus; static struct searchinfo_s * si_minus; struct thread_info_s { pthread_t thread; pthread_mutex_t mutex; pthread_cond_t cond; int work; int query_first; int query_count; }; using thread_info_t = struct thread_info_s; static thread_info_t * ti; inline auto compare_byclusterno(const void * a, const void * b) -> int { auto * x = (clusterinfo_t *) a; auto * y = (clusterinfo_t *) b; if (x->clusterno < y->clusterno) { return -1; } else if (x->clusterno > y->clusterno) { return +1; } else if (x->seqno < y->seqno) { return -1; } else if (x->seqno > y->seqno) { return +1; } else { return 0; } } inline auto compare_byclusterabundance(const void * a, const void * b) -> int { auto * x = (clusterinfo_t *) a; auto * y = (clusterinfo_t *) b; if (cluster_abundance[x->clusterno] > cluster_abundance[y->clusterno]) { return -1; } else if (cluster_abundance[x->clusterno] < cluster_abundance[y->clusterno]) { return +1; } else if (x->clusterno < y->clusterno) { return -1; } else if (x->clusterno > y->clusterno) { return +1; } else if (x->seqno < y->seqno) { return -1; } else if (x->seqno > y->seqno) { return +1; } else { return 0; } } inline auto cluster_query_core(struct searchinfo_s * si) -> void { /* the main core function for clustering */ /* get sequence etc */ const int seqno = si->query_no; si->query_head_len = db_getheaderlen(seqno); si->query_head = db_getheader(seqno); si->qsize = db_getabundance(seqno); si->qseqlen = db_getsequencelen(seqno); 
if (si->strand) { reverse_complement(si->qsequence, db_getsequence(seqno), si->qseqlen); } else { strcpy(si->qsequence, db_getsequence(seqno)); } /* perform search */ search_onequery(si, opt_qmask); } inline auto cluster_worker(int64_t t) -> void { /* wrapper for the main threaded core function for clustering */ for (int q = 0; q < ti[t].query_count; q++) { cluster_query_core(si_plus + ti[t].query_first + q); if (opt_strand > 1) { cluster_query_core(si_minus + ti[t].query_first + q); } } } auto threads_worker(void * vp) -> void * { auto t = (int64_t) vp; thread_info_s * tip = ti + t; xpthread_mutex_lock(&tip->mutex); /* loop until signalled to quit */ while (tip->work >= 0) { /* wait for work available */ if (tip->work == 0) { xpthread_cond_wait(&tip->cond, &tip->mutex); } if (tip->work > 0) { cluster_worker(t); tip->work = 0; xpthread_cond_signal(&tip->cond); } } xpthread_mutex_unlock(&tip->mutex); return nullptr; } auto threads_wakeup(int queries) -> void { int const threads = queries > opt_threads ? 
opt_threads : queries; int queries_rest = queries; int threads_rest = threads; int query_next = 0; /* tell the threads that there is work to do */ for (int t = 0; t < threads; t++) { thread_info_t * tip = ti + t; tip->query_first = query_next; tip->query_count = (queries_rest + threads_rest - 1) / threads_rest; queries_rest -= tip->query_count; query_next += tip->query_count; --threads_rest; xpthread_mutex_lock(&tip->mutex); tip->work = 1; xpthread_cond_signal(&tip->cond); xpthread_mutex_unlock(&tip->mutex); } /* wait for theads to finish their work */ for (int t = 0; t < threads; t++) { thread_info_t * tip = ti + t; xpthread_mutex_lock(&tip->mutex); while (tip->work > 0) { xpthread_cond_wait(&tip->cond, &tip->mutex); } xpthread_mutex_unlock(&tip->mutex); } } auto threads_init() -> void { xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); /* allocate memory for thread info */ ti = (thread_info_t *) xmalloc(opt_threads * sizeof(thread_info_t)); /* init and create worker threads */ for (int t = 0; t < opt_threads; t++) { thread_info_t * tip = ti + t; tip->work = 0; xpthread_mutex_init(&tip->mutex, nullptr); xpthread_cond_init(&tip->cond, nullptr); xpthread_create(&tip->thread, &attr, threads_worker, (void *) (int64_t) t); } } auto threads_exit() -> void { /* finish and clean up worker threads */ for (int t = 0; t < opt_threads; t++) { struct thread_info_s * tip = ti + t; /* tell worker to quit */ xpthread_mutex_lock(&tip->mutex); tip->work = -1; xpthread_cond_signal(&tip->cond); xpthread_mutex_unlock(&tip->mutex); /* wait for worker to quit */ xpthread_join(tip->thread, nullptr); xpthread_cond_destroy(&tip->cond); xpthread_mutex_destroy(&tip->mutex); } xfree(ti); xpthread_attr_destroy(&attr); } auto cluster_query_init(struct searchinfo_s * si) -> void { /* initialisation of data for one thread; run once for each thread */ /* thread specific initialiation */ si->qsize = 1; si->nw = nullptr; si->hit_count = 0; /* allocate memory 
for sequence */ si->seq_alloc = db_getlongestsequence() + 1; si->qsequence = (char *) xmalloc(si->seq_alloc); si->kmers = (count_t *) xmalloc((seqcount * sizeof(count_t)) + 32); si->hits = (struct hit *) xmalloc(sizeof(struct hit) * tophits); si->uh = unique_init(); si->m = minheap_init(tophits); si->s = search16_init(opt_match, opt_mismatch, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); } auto cluster_query_exit(struct searchinfo_s * si) -> void { /* clean up after thread execution; called once per thread */ search16_exit(si->s); unique_exit(si->uh); minheap_exit(si->m); if (si->qsequence) { xfree(si->qsequence); } if (si->hits) { xfree(si->hits); } if (si->kmers) { xfree(si->kmers); } } auto relabel_otu(int clusterno, char * sequence, int seqlen) -> char * { char * label = nullptr; if (opt_relabel) { int const size = strlen(opt_relabel) + 21; label = (char *) xmalloc(size); snprintf(label, size, "%s%d", opt_relabel, clusterno + 1); } else if (opt_relabel_self) { int const size = seqlen + 1; label = (char *) xmalloc(size); snprintf(label, size, "%.*s", seqlen, sequence); } else if (opt_relabel_sha1) { label = (char *) xmalloc(len_hex_dig_sha1); get_hex_seq_digest_sha1(label, sequence, seqlen); } else if (opt_relabel_md5) { label = (char *) xmalloc(len_hex_dig_md5); get_hex_seq_digest_md5(label, sequence, seqlen); } return label; } auto cluster_core_results_hit(struct hit * best, int clusterno, char * query_head, int qseqlen, char * qsequence, char * qsequence_rc, int qsize) -> void { ++count_matched; if (opt_otutabout or opt_mothur_shared_out or opt_biomout) { if (opt_relabel or opt_relabel_self or opt_relabel_sha1 or opt_relabel_md5) { char * label = 
relabel_otu(clusterno, db_getsequence(best->target), db_getsequencelen(best->target)); otutable_add(query_head, label, qsize); xfree(label); } else { otutable_add(query_head, db_getheader(best->target), qsize); } } if (fp_uc) { results_show_uc_one(fp_uc, best, query_head, qseqlen, clusterno); } if (fp_alnout) { results_show_alnout(fp_alnout, best, 1, query_head, qsequence, qseqlen); } if (fp_samout) { results_show_samout(fp_samout, best, 1, query_head, qsequence, qsequence_rc); } if (fp_fastapairs) { results_show_fastapairs_one(fp_fastapairs, best, query_head, qsequence, qsequence_rc); } if (fp_qsegout) { results_show_qsegout_one(fp_qsegout, best, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_tsegout) { results_show_tsegout_one(fp_tsegout, best); } if (fp_userout) { results_show_userout_one(fp_userout, best, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out) { results_show_blast6out_one(fp_blast6out, best, query_head, qseqlen); } if (opt_matched) { fasta_print_general(fp_matched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), qsize, count_matched, -1.0, -1, -1, nullptr, 0.0); } } auto cluster_core_results_nohit(int clusterno, char * query_head, int qseqlen, char * qsequence, char * qsequence_rc, int qsize) -> void { ++count_notmatched; if (opt_otutabout or opt_mothur_shared_out or opt_biomout) { if (opt_relabel or opt_relabel_self or opt_relabel_sha1 or opt_relabel_md5) { char * label = relabel_otu(clusterno, qsequence, qseqlen); otutable_add(query_head, label, qsize); xfree(label); } else { otutable_add(query_head, query_head, qsize); } } if (opt_uc) { fprintf(fp_uc, "S\t%d\t%d\t*\t*\t*\t*\t*\t", clusters, qseqlen); header_fprint_strip(fp_uc, query_head, strlen(query_head), opt_xsize, opt_xee, opt_xlength); fprintf(fp_uc, "\t*\n"); } if (opt_output_no_hits) { if (fp_userout) { results_show_userout_one(fp_userout, nullptr, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out) { 
results_show_blast6out_one(fp_blast6out, nullptr, query_head, qseqlen); } } if (opt_notmatched) { fasta_print_general(fp_notmatched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), qsize, count_notmatched, -1.0, -1, -1, nullptr, 0.0); } } auto compare_kmersample(const void * a, const void * b) -> int { unsigned int const x = * (unsigned int *) a; unsigned int const y = * (unsigned int *) b; if (x < y) { return -1; } else if (x > y) { return +1; } else { return 0; } } auto cluster_core_parallel() -> void { /* create threads and set them in stand-by mode */ threads_init(); constexpr static int queries_per_thread = 1; const int max_queries = queries_per_thread * opt_threads; /* allocate memory for the search information for each query; and initialize it */ si_plus = (struct searchinfo_s *) xmalloc(max_queries * sizeof(struct searchinfo_s)); if (opt_strand > 1) { si_minus = (struct searchinfo_s *) xmalloc(max_queries * sizeof(struct searchinfo_s)); } for (int i = 0; i < max_queries; i++) { cluster_query_init(si_plus + i); si_plus[i].strand = 0; if (opt_strand > 1) { cluster_query_init(si_minus + i); si_minus[i].strand = 1; } } std::vector extra_list(max_queries); LinearMemoryAligner lma; int64_t * scorematrix = lma.scorematrix_create(opt_match, opt_mismatch); lma.set_parameters(scorematrix, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); int lastlength = INT_MAX; int seqno = 0; int64_t sum_nucleotides = 0; progress_init("Clustering", db_getnucleotidecount()); while(seqno < seqcount) { /* prepare work for the threads in sia[i] */ /* read query sequences into the search info (si) for each thread */ int queries = 0; for (int i = 0; i < max_queries; i++) { 
if (seqno < seqcount) { int const length = db_getsequencelen(seqno); #if 1 if (opt_cluster_smallmem and (not opt_usersort) and (length > lastlength)) { fatal("Sequences not sorted by length and --usersort not specified."); } #endif lastlength = length; si_plus[i].query_no = seqno; si_plus[i].strand = 0; if (opt_strand > 1) { si_minus[i].query_no = seqno; si_minus[i].strand = 1; } ++queries; ++seqno; } } /* perform work in threads */ threads_wakeup(queries); /* analyse results */ int extra_count = 0; for (int i = 0; i < queries; i++) { struct searchinfo_s * si_p = si_plus + i; struct searchinfo_s * si_m = opt_strand > 1 ? si_minus + i : nullptr; for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = s ? si_m : si_p; int added = 0; if (extra_count) { /* Check if there is a hit with one of the non-matching extra sequences just analysed in this round */ for (int j = 0; j < extra_count; j++) { struct searchinfo_s * sic = si_plus + extra_list[j]; /* find the number of shared unique kmers */ unsigned int const shared = unique_count_shared(si->uh, opt_wordlength, sic->kmersamplecount, sic->kmersample); /* check if min number of shared kmers is satisfied */ if (search_enough_kmers(si, shared)) { unsigned int const length = sic->qseqlen; /* Go through the list of hits and see if the current match is better than any on the list in terms of more shared kmers (or shorter length if equal no of kmers). Determine insertion point (x). 
*/ int x = si->hit_count; while ((x > 0) and ((si->hits[x - 1].count < shared) or ((si->hits[x - 1].count == shared) and (db_getsequencelen(si->hits[x - 1].target) > length)))) { --x; } if (x < opt_maxaccepts + opt_maxrejects - 1) { /* insert into list at position x */ /* trash bottom element if no more space */ if (si->hit_count >= opt_maxaccepts + opt_maxrejects - 1) { if (si->hits[si->hit_count-1].aligned) { xfree(si->hits[si->hit_count - 1].nwalignment); } --si->hit_count; } /* move the rest down */ for (int z = si->hit_count; z > x; z--) { si->hits[z] = si->hits[z - 1]; } /* init new hit */ struct hit * hit = si->hits + x; ++si->hit_count; hit->target = sic->query_no; hit->strand = si->strand; hit->count = shared; hit->accepted = false; hit->rejected = false; hit->aligned = false; hit->weak = false; hit->nwalignment = nullptr; ++added; } } } } /* now go through the hits and determine final status of each */ if (added) { si->rejects = 0; si->accepts = 0; /* set all statuses to undetermined */ for (int t = 0; t < si->hit_count; t++) { si->hits[t].accepted = false; si->hits[t].rejected = false; } for (int t = 0; (si->accepts < opt_maxaccepts) and (si->rejects < opt_maxrejects) and (t < si->hit_count); ++t) { struct hit * hit = si->hits + t; if (not hit->aligned) { /* Test accept/reject criteria before alignment */ unsigned int const target = hit->target; if (search_acceptable_unaligned(si, target)) { /* perform vectorized alignment */ /* but only using 1 sequence ! 
*/ unsigned int nwtarget = target; int64_t nwscore = 0; int64_t nwalignmentlength = 0; int64_t nwmatches = 0; int64_t nwmismatches = 0; int64_t nwgaps = 0; char * nwcigar = nullptr; /* short variants for simd aligner */ CELL snwscore = 0; unsigned short snwalignmentlength = 0; unsigned short snwmatches = 0; unsigned short snwmismatches = 0; unsigned short snwgaps = 0; search16(si->s, 1, & nwtarget, & snwscore, & snwalignmentlength, & snwmatches, & snwmismatches, & snwgaps, & nwcigar); int64_t const tseqlen = db_getsequencelen(target); if (snwscore == std::numeric_limits::max()) { /* In case the SIMD aligner cannot align, perform a new alignment with the linear memory aligner */ char * tseq = db_getsequence(target); if (nwcigar) { xfree(nwcigar); } nwcigar = xstrdup(lma.align(si->qsequence, tseq, si->qseqlen, tseqlen)); lma.alignstats(nwcigar, si->qsequence, tseq, & nwscore, & nwalignmentlength, & nwmatches, & nwmismatches, & nwgaps); } else { nwscore = snwscore; nwalignmentlength = snwalignmentlength; nwmatches = snwmatches; nwmismatches = snwmismatches; nwgaps = snwgaps; } int64_t const nwdiff = nwalignmentlength - nwmatches; int64_t const nwindels = nwdiff - nwmismatches; hit->aligned = true; hit->nwalignment = nwcigar; hit->nwscore = nwscore; hit->nwdiff = nwdiff; hit->nwgaps = nwgaps; hit->nwindels = nwindels; hit->nwalignmentlength = nwalignmentlength; hit->matches = nwmatches; hit->mismatches = nwmismatches; hit->nwid = 100.0 * (nwalignmentlength - hit->nwdiff) / nwalignmentlength; hit->shortest = MIN(si->qseqlen, tseqlen); hit->longest = MAX(si->qseqlen, tseqlen); /* trim alignment and compute numbers excluding terminal gaps */ align_trim(hit); } else { /* rejection without alignment */ hit->rejected = true; ++si->rejects; } } if (not hit->rejected) { /* test accept/reject criteria after alignment */ if (search_acceptable_aligned(si, hit)) { ++si->accepts; } else { ++si->rejects; } } } /* delete all undetermined hits */ int new_hit_count = si->hit_count; for 
(int t = si->hit_count - 1; t >= 0; t--) { struct hit * hit = si->hits + t; if (not hit->accepted and not hit->rejected) { new_hit_count = t; if (hit->aligned) { xfree(hit->nwalignment); } } } si->hit_count = new_hit_count; } } /* find best hit */ struct hit * best = nullptr; if (opt_sizeorder) { best = search_findbest2_bysize(si_p, si_m); } else { best = search_findbest2_byid(si_p, si_m); } int const myseqno = si_p->query_no; if (best) { /* a hit was found, cluster current sequence with hit */ int const target = best->target; /* output intermediate results to uc etc */ cluster_core_results_hit(best, clusterinfo[target].clusterno, si_p->query_head, si_p->qseqlen, si_p->qsequence, best->strand ? si_m->qsequence : nullptr, si_p->qsize); /* update cluster info about this sequence */ clusterinfo[myseqno].seqno = myseqno; clusterinfo[myseqno].clusterno = clusterinfo[target].clusterno; clusterinfo[myseqno].cigar = best->nwalignment; clusterinfo[myseqno].strand = best->strand; best->nwalignment = nullptr; } else { /* no hit found; add it to the list of extra sequences that must be considered by the coming queries in this round */ extra_list[extra_count] = i; ++extra_count; /* update cluster info about this sequence */ clusterinfo[myseqno].seqno = myseqno; clusterinfo[myseqno].clusterno = clusters; clusterinfo[myseqno].cigar = nullptr; clusterinfo[myseqno].strand = 0; /* add current sequence to database */ dbindex_addsequence(myseqno, opt_qmask); /* output intermediate results to uc etc */ cluster_core_results_nohit(clusters, si_p->query_head, si_p->qseqlen, si_p->qsequence, nullptr, si_p->qsize); ++clusters; } /* free alignments */ for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = s ? 
si_m : si_p; for (int j = 0; j < si->hit_count; j++) { if (si->hits[j].aligned) { if (si->hits[j].nwalignment) { xfree(si->hits[j].nwalignment); } } } } sum_nucleotides += si_p->qseqlen; } progress_update(sum_nucleotides); } progress_done(); /* clean up search info */ for (int i = 0; i < max_queries; i++) { cluster_query_exit(si_plus + i); if (opt_strand > 1) { cluster_query_exit(si_minus + i); } } // extra_list no used after that point xfree(si_plus); if (opt_strand > 1) { xfree(si_minus); } /* terminate threads and clean up */ threads_exit(); xfree(scorematrix); } auto cluster_core_serial() -> void { struct searchinfo_s si_p[1]; struct searchinfo_s si_m[1]; cluster_query_init(si_p); if (opt_strand > 1) { cluster_query_init(si_m); } int lastlength = INT_MAX; progress_init("Clustering", seqcount); for (int seqno=0; seqno lastlength)) { fatal("Sequences not sorted by length and --usersort not specified."); } #endif lastlength = length; si_p->query_no = seqno; si_p->strand = 0; cluster_query_core(si_p); if (opt_strand > 1) { si_m->query_no = seqno; si_m->strand = 1; cluster_query_core(si_m); } struct hit * best = nullptr; if (opt_sizeorder) { best = search_findbest2_bysize(si_p, si_m); } else { best = search_findbest2_byid(si_p, si_m); } if (best) { int const target = best->target; cluster_core_results_hit(best, clusterinfo[target].clusterno, si_p->query_head, si_p->qseqlen, si_p->qsequence, best->strand ? 
si_m->qsequence : nullptr, si_p->qsize); clusterinfo[seqno].seqno = seqno; clusterinfo[seqno].clusterno = clusterinfo[target].clusterno; clusterinfo[seqno].cigar = best->nwalignment; clusterinfo[seqno].strand = best->strand; best->nwalignment = nullptr; } else { clusterinfo[seqno].seqno = seqno; clusterinfo[seqno].clusterno = clusters; clusterinfo[seqno].cigar = nullptr; clusterinfo[seqno].strand = 0; dbindex_addsequence(seqno, opt_qmask); cluster_core_results_nohit(clusters, si_p->query_head, si_p->qseqlen, si_p->qsequence, nullptr, si_p->qsize); ++clusters; } /* free alignments */ for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = s ? si_m : si_p; for (int i = 0; i < si->hit_count; i++) { if (si->hits[i].aligned) { if (si->hits[i].nwalignment) { xfree(si->hits[i].nwalignment); } } } } progress_update(seqno); } progress_done(); cluster_query_exit(si_p); if (opt_strand > 1) { cluster_query_exit(si_m); } } auto cluster(char * dbname, char * cmdline, char * progheader) -> void { if (opt_centroids) { fp_centroids = fopen_output(opt_centroids); if (not fp_centroids) { fatal("Unable to open centroids file for writing"); } } if (opt_uc) { fp_uc = fopen_output(opt_uc); if (not fp_uc) { fatal("Unable to open uc file for writing"); } } if (opt_alnout) { fp_alnout = fopen_output(opt_alnout); if (not fp_alnout) { fatal("Unable to open alignment output file for writing"); } fprintf(fp_alnout, "%s\n", cmdline); fprintf(fp_alnout, "%s\n", progheader); } if (opt_samout) { fp_samout = fopen_output(opt_samout); if (not fp_samout) { fatal("Unable to open SAM output file for writing"); } } if (opt_userout) { fp_userout = fopen_output(opt_userout); if (not fp_userout) { fatal("Unable to open user-defined output file for writing"); } } if (opt_blast6out) { fp_blast6out = fopen_output(opt_blast6out); if (not fp_blast6out) { fatal("Unable to open blast6-like output file for writing"); } } if (opt_fastapairs) { fp_fastapairs = fopen_output(opt_fastapairs); if (not 
fp_fastapairs) { fatal("Unable to open fastapairs output file for writing"); } } if (opt_qsegout) { fp_qsegout = fopen_output(opt_qsegout); if (not fp_qsegout) { fatal("Unable to open qsegout output file for writing"); } } if (opt_tsegout) { fp_tsegout = fopen_output(opt_tsegout); if (not fp_tsegout) { fatal("Unable to open tsegout output file for writing"); } } if (opt_matched) { fp_matched = fopen_output(opt_matched); if (not fp_matched) { fatal("Unable to open matched output file for writing"); } } if (opt_notmatched) { fp_notmatched = fopen_output(opt_notmatched); if (not fp_notmatched) { fatal("Unable to open notmatched output file for writing"); } } if (opt_otutabout) { fp_otutabout = fopen_output(opt_otutabout); if (not fp_otutabout) { fatal("Unable to open OTU table (text format) output file for writing"); } } if (opt_mothur_shared_out) { fp_mothur_shared_out = fopen_output(opt_mothur_shared_out); if (not fp_mothur_shared_out) { fatal("Unable to open OTU table (mothur format) output file for writing"); } } if (opt_biomout) { fp_biomout = fopen_output(opt_biomout); if (not fp_biomout) { fatal("Unable to open OTU table (biom 1.0 format) output file for writing"); } } db_read(dbname, 0); otutable_init(); results_show_samheader(fp_samout, cmdline, dbname); if (opt_qmask == MASK_DUST) { dust_all(); } else if ((opt_qmask == MASK_SOFT) and (opt_hardmask)) { hardmask_all(); } show_rusage(); seqcount = db_getsequencecount(); if (opt_cluster_fast) { db_sortbylength(); } else if (opt_cluster_size or opt_cluster_unoise) { db_sortbyabundance(); } dbindex_prepare(1, opt_qmask); /* tophits = the maximum number of hits we need to store */ if ((opt_maxrejects == 0) or (opt_maxrejects > seqcount)) { opt_maxrejects = seqcount; } if ((opt_maxaccepts == 0) or (opt_maxaccepts > seqcount)) { opt_maxaccepts = seqcount; } tophits = opt_maxrejects + opt_maxaccepts + MAXDELAYED; tophits = std::min(tophits, seqcount); std::vector clusterinfo_v(seqcount); clusterinfo = 
clusterinfo_v.data(); if (opt_log) { uint64_t const slots = 1ULL << (static_cast(opt_wordlength) << 1ULL); fprintf(fp_log, "\n"); fprintf(fp_log, " Alphabet nt\n"); fprintf(fp_log, " Word width %" PRId64 "\n", opt_wordlength); fprintf(fp_log, " Word ones %" PRId64 "\n", opt_wordlength); fprintf(fp_log, " Spaced No\n"); fprintf(fp_log, " Hashed No\n"); fprintf(fp_log, " Coded No\n"); fprintf(fp_log, " Stepped No\n"); fprintf(fp_log, " Slots %" PRIu64 " (%.1fk)\n", slots, slots/1000.0); fprintf(fp_log, " DBAccel 100%%\n"); fprintf(fp_log, "\n"); } if (opt_threads == 1) { cluster_core_serial(); } else { cluster_core_parallel(); } /* find size and abundance of each cluster and save stats */ std::vector cluster_abundance_v(clusters); cluster_abundance = cluster_abundance_v.data(); std::vector cluster_size(clusters); for (int i = 0; i < seqcount; i++) { int const seqno = clusterinfo_v[i].seqno; int const clusterno = clusterinfo_v[i].clusterno; cluster_abundance_v[clusterno] += opt_sizein ? db_getabundance(seqno) : 1; ++cluster_size[clusterno]; } auto const minmax_elements = std::minmax_element(cluster_abundance_v.cbegin(), cluster_abundance_v.cend()); auto const abundance_min = cluster_abundance_v.empty() ? 0 : *std::get<0>(minmax_elements); auto const abundance_max = cluster_abundance_v.empty() ? 0 : *std::get<1>(minmax_elements); int const singletons = std::count(cluster_abundance_v.cbegin(), cluster_abundance_v.cend(), int64_t{1}); auto const max_element = std::max_element(cluster_size.cbegin(), cluster_size.cend()); auto const size_max = cluster_size.empty() ? 0 : *max_element; /* Sort sequences in clusters by their abundance or ordinal number */ /* Sequences in same cluster must always come right after each other. */ /* The centroid sequence must be the first in each cluster. 
*/ progress_init("Sorting clusters", clusters); if (opt_clusterout_sort) { qsort(clusterinfo_v.data(), seqcount, sizeof(clusterinfo_t), compare_byclusterabundance); } else { qsort(clusterinfo_v.data(), seqcount, sizeof(clusterinfo_t), compare_byclusterno); } progress_done(); progress_init("Writing clusters", seqcount); /* allocate memory for full file name of the clusters files */ std::FILE * fp_clusters = nullptr; char * fn_clusters = nullptr; int fn_clusters_size = 0; if (opt_clusters) { fn_clusters_size += strlen(opt_clusters) + 25; fn_clusters = (char *) xmalloc(fn_clusters_size); } int lastcluster = -1; int ordinal = 0; for (int i = 0; i < seqcount; i++) { int const seqno = clusterinfo_v[i].seqno; int const clusterno = clusterinfo_v[i].clusterno; if (clusterno != lastcluster) { /* prepare for new cluster */ /* performed with first sequence only in each cluster */ /* the first sequence is always the centroid */ if (opt_centroids) { fasta_print_general(fp_centroids, nullptr, db_getsequence(seqno), db_getsequencelen(seqno), db_getheader(seqno), db_getheaderlen(seqno), cluster_abundance_v[clusterno], clusterno + 1, -1.0, -1, opt_clusterout_id ? 
clusterno : -1, nullptr, 0.0); } if (opt_uc) { fprintf(fp_uc, "C\t%d\t%" PRId64 "\t*\t*\t*\t*\t*\t", clusterno, cluster_abundance_v[clusterno]); header_fprint_strip(fp_uc, db_getheader(seqno), db_getheaderlen(seqno), opt_xsize, opt_xee, opt_xlength); fprintf(fp_uc, "\t*\n"); } if (opt_clusters) { /* close previous (except for first time) and open new file */ if (lastcluster != -1) { fclose(fp_clusters); } ordinal = 0; snprintf(fn_clusters, fn_clusters_size, "%s%d", opt_clusters, clusterno); fp_clusters = fopen_output(fn_clusters); if (not fp_clusters) { fatal("Unable to open clusters file for writing"); } } lastcluster = clusterno; } /* performed for all sequences */ if (opt_clusters) { ++ordinal; fasta_print_db_relabel(fp_clusters, seqno, ordinal); } progress_update(i); } if (lastcluster != -1) { /* performed with the last sequence */ if (opt_clusters) { fclose(fp_clusters); if (fn_clusters) { xfree(fn_clusters); } } } progress_done(); if (clusters < 1) { if (not opt_quiet) { fprintf(stderr, "Clusters: 0\n"); fprintf(stderr, "Singletons: 0\n"); } if (opt_log) { fprintf(fp_log, "Clusters: 0\n"); fprintf(fp_log, "Singletons: 0\n"); } } else { if (not opt_quiet) { fprintf(stderr, "Clusters: %d Size min %" PRId64 ", max %" PRId64 ", avg %.1f\n", clusters, abundance_min, abundance_max, 1.0 * seqcount / clusters); fprintf(stderr, "Singletons: %d, %.1f%% of seqs, %.1f%% of clusters\n", singletons, 100.0 * singletons / seqcount, 100.0 * singletons / clusters); } if (opt_log) { fprintf(fp_log, "Clusters: %d Size min %" PRId64 ", max %" PRId64 ", avg %.1f\n", clusters, abundance_min, abundance_max, 1.0 * seqcount / clusters); fprintf(fp_log, "Singletons: %d, %.1f%% of seqs, %.1f%% of clusters\n", singletons, 100.0 * singletons / seqcount, 100.0 * singletons / clusters); fprintf(fp_log, "\n"); } } if (opt_msaout or opt_consout or opt_profile) { int msa_target_count = 0; std::vector msa_target_list_v(size_max); progress_init("Multiple alignments", seqcount); std::FILE * 
fp_msaout = nullptr; std::FILE * fp_consout = nullptr; std::FILE * fp_profile = nullptr; if (opt_msaout) { fp_msaout = fopen_output(opt_msaout); if (not (fp_msaout)) { fatal("Unable to open msaout file"); } } if (opt_consout) { fp_consout = fopen_output(opt_consout); if (not (fp_consout)) { fatal("Unable to open consout file"); } } if (opt_profile) { fp_profile = fopen_output(opt_profile); if (not (fp_profile)) { fatal("Unable to open profile file"); } } lastcluster = -1; for (int i = 0; i < seqcount; i++) { int const clusterno = clusterinfo_v[i].clusterno; int const seqno = clusterinfo_v[i].seqno; char * cigar = clusterinfo_v[i].cigar; int const strand = clusterinfo_v[i].strand; if (clusterno != lastcluster) { if (lastcluster != -1) { /* compute msa & consensus */ msa(fp_msaout, fp_consout, fp_profile, lastcluster, msa_target_count, msa_target_list_v, cluster_abundance_v[lastcluster]); } /* start new cluster */ msa_target_count = 0; lastcluster = clusterno; } /* add current sequence to the cluster */ msa_target_list_v[msa_target_count].seqno = seqno; msa_target_list_v[msa_target_count].cigar = cigar; msa_target_list_v[msa_target_count].strand = strand; ++msa_target_count; progress_update(i); } if (lastcluster != -1) { /* compute msa & consensus */ msa(fp_msaout, fp_consout, fp_profile, lastcluster, msa_target_count, msa_target_list_v, cluster_abundance_v[lastcluster]); } progress_done(); if (fp_profile) { fclose(fp_profile); } if (fp_msaout) { fclose(fp_msaout); } if (fp_consout) { fclose(fp_consout); } } // cluster_abundance not used below that point // cluster_size not used below that point /* free cigar strings for all aligned sequences */ for (auto & clusterinfo : clusterinfo_v) { if (clusterinfo.cigar != nullptr) { xfree(clusterinfo.cigar); } } // clusterinfo not used after this point if (fp_biomout) { otutable_print_biomout(fp_biomout); fclose(fp_biomout); } if (fp_otutabout) { otutable_print_otutabout(fp_otutabout); fclose(fp_otutabout); } if 
(fp_mothur_shared_out) { otutable_print_mothur_shared_out(fp_mothur_shared_out); fclose(fp_mothur_shared_out); } otutable_done(); if (opt_matched) { fclose(fp_matched); } if (opt_notmatched) { fclose(fp_notmatched); } if (opt_fastapairs) { fclose(fp_fastapairs); } if (opt_qsegout) { fclose(fp_qsegout); } if (opt_tsegout) { fclose(fp_tsegout); } if (fp_blast6out) { fclose(fp_blast6out); } if (fp_userout) { fclose(fp_userout); } if (fp_alnout) { fclose(fp_alnout); } if (fp_samout) { fclose(fp_samout); } if (fp_uc) { fclose(fp_uc); } if (fp_centroids) { fclose(fp_centroids); } dbindex_free(); db_free(); show_rusage(); } auto cluster_fast(char * cmdline, char * progheader) -> void { cluster(opt_cluster_fast, cmdline, progheader); } auto cluster_smallmem(char * cmdline, char * progheader) -> void { cluster(opt_cluster_smallmem, cmdline, progheader); } auto cluster_size(char * cmdline, char * progheader) -> void { cluster(opt_cluster_size, cmdline, progheader); } auto cluster_unoise(char * cmdline, char * progheader) -> void { cluster(opt_cluster_unoise, cmdline, progheader); } vsearch-2.30.0/src/cluster.h000066400000000000000000000052531476012147200156610ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ auto cluster_smallmem(char * cmdline, char * progheader) -> void; auto cluster_fast(char * cmdline, char * progheader) -> void; auto cluster_size(char * cmdline, char * progheader) -> void; auto cluster_unoise(char * cmdline, char * progheader) -> void; vsearch-2.30.0/src/cpu.cc000066400000000000000000000201601476012147200151170ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include // int32_t /* This file contains code dependent on special cpu features. */ /* The file may be compiled several times with different cpu options. */ #ifdef __aarch64__ void increment_counters_from_bitmap(count_t * counters, unsigned char * bitmap, unsigned int totalbits) { const uint8x16_t c1 = { 0x01, 0x01, 0x02, 0x02, 0x04, 0x04, 0x08, 0x08, 0x10, 0x10, 0x20, 0x20, 0x40, 0x40, 0x80, 0x80 }; unsigned short * p = (unsigned short *) (bitmap); int16x8_t * q = (int16x8_t *) (counters); const auto r = (totalbits + 15) / 16; for(auto j = 0U; j < r; j++) { // load and duplicate short uint16x8_t r0 = vdupq_n_u16(*p); ++p; // cast to bytes uint8x16_t r1 = vreinterpretq_u8_u16(r0); // bit test with mask giving 0x00 or 0xff uint8x16_t r2 = vtstq_u8(r1, c1); // transpose to duplicate even bytes uint8x16_t r3 = vtrn1q_u8(r2, r2); // transpose to duplicate odd bytes uint8x16_t r4 = vtrn2q_u8(r2, r2); // cast to signed 0x0000 or 0xffff int16x8_t r5 = vreinterpretq_s16_u8(r3); // cast to signed 0x0000 or 0xffff int16x8_t r6 = vreinterpretq_s16_u8(r4); // subtract signed 0 or -1 (i.e add 0 or 1) with saturation to counter *q = vqsubq_s16(*q, r5); ++q; // subtract signed 0 or 1 (i.e. 
add 0 or 1) with saturation to counter *q = vqsubq_s16(*q, r6); ++q; } } #elif defined __PPC__ void increment_counters_from_bitmap(count_t * counters, unsigned char * bitmap, unsigned int totalbits) { const __vector unsigned char c1 = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; const __vector unsigned char c2 = { 0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f, 0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f }; const __vector unsigned char c3 = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; unsigned short * p = (unsigned short *) (bitmap); __vector signed short * q = (__vector signed short *) (counters); const auto r = (totalbits + 15) / 16; for(auto j = 0U; j < r; j++) { __vector unsigned char r0; memcpy(&r0, p, 2); ++p; __vector unsigned char r1 = vec_perm(r0, r0, c1); __vector unsigned char r2 = vec_or(r1, c2); __vector __bool char r3 = vec_cmpeq(r2, c3); __vector signed short r4 = (__vector signed short) vec_unpackl(r3); __vector signed short r5 = (__vector signed short) vec_unpackh(r3); *q = vec_subs(*q, r4); ++q; *q = vec_subs(*q, r5); ++q; } } #elif __x86_64__ || defined(SIMDE_VERSION) #ifdef __x86_64__ #include #else #define SIMDE_ENABLE_NATIVE_ALIASES #include #endif #if defined(SIMDE_VERSION) void increment_counters_from_bitmap(count_t * counters, unsigned char * bitmap, unsigned int totalbits) #elif defined(SSSE3) void increment_counters_from_bitmap_ssse3(count_t * counters, unsigned char * bitmap, unsigned int totalbits) #else void increment_counters_from_bitmap_sse2(count_t * counters, unsigned char * bitmap, unsigned int totalbits) #endif { /* Increment selected elements in an array of 16 bit counters. The counters to increment are indicated by 1's in the bitmap. We read 16 bytes from the bitmap, but use only two bytes (16 bits). Convert these 16 bits into 16 bytes with either 0x00 or 0xFF. Extend these to 16 words (32 bytes) with either 0x0000 or 0xFFFF. 
Use these values to increment 16 words in an array by subtraction. See article below for some hints: http://stackoverflow.com/questions/21622212/ how-to-perform-the-inverse-of-mm256-movemask-epi8-vpmovmskb Because the efficient PSHUFB instruction is a SSSE3 instruction lacking in many AMD cpus, we provide slightly slower alternative SSE2 code. */ // 0xffffffff -> 1111'1111'1111'1111'1111'1111'1111'1111 (32 bits) static constexpr auto all_ones = static_cast(0xffffffff); // 0x7fbfdfef -> 0111'1111'1011'1111'1101'1111'1110'1111 (32 bits) static constexpr auto mask1 = static_cast(0x7fbfdfef); // 0xf7fbfdfe -> 1111'0111'1111'1011'1111'1101'1111'1110 (32 bits) static constexpr auto mask2 = static_cast(0xf7fbfdfe); #if defined(SSSE3) || defined(SIMDE_VERSION) const auto c1 = _mm_set_epi32(0x01010101, 0x01010101, 0x00000000, 0x00000000); #endif const auto c2 = _mm_set_epi32(mask1, mask2, mask1, mask2); const auto c3 = _mm_set_epi32(all_ones, all_ones, all_ones, all_ones); auto * p = (unsigned short *) (bitmap); auto * q = (__m128i *) (counters); const auto r = (totalbits + 15) / 16; for(auto j = 0U; j < r; j++) { const auto xmm0 = _mm_loadu_si128((__m128i *) p++); #if defined(SSSE3) || defined(SIMDE_VERSION) const auto xmm1 = _mm_shuffle_epi8(xmm0, c1); #else const auto xmm6 = _mm_unpacklo_epi8(xmm0, xmm0); const auto xmm7 = _mm_unpacklo_epi16(xmm6, xmm6); const auto xmm1 = _mm_unpacklo_epi32(xmm7, xmm7); #endif const auto xmm2 = _mm_or_si128(xmm1, c2); const auto xmm3 = _mm_cmpeq_epi8(xmm2, c3); const auto xmm4 = _mm_unpacklo_epi8(xmm3, xmm3); const auto xmm5 = _mm_unpackhi_epi8(xmm3, xmm3); *q = _mm_subs_epi16(*q, xmm4); ++q; *q = _mm_subs_epi16(*q, xmm5); ++q; } } #else #error Unknown architecture #endif vsearch-2.30.0/src/cpu.h000066400000000000000000000060721476012147200147670ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ using count_t = unsigned short; #ifdef __x86_64__ auto increment_counters_from_bitmap_sse2(count_t * counters, unsigned char * bitmap, unsigned int totalbits) -> void; auto increment_counters_from_bitmap_ssse3(count_t * counters, unsigned char * bitmap, unsigned int totalbits) -> void; #else auto increment_counters_from_bitmap(count_t * counters, unsigned char * bitmap, unsigned int totalbits) -> void; #endif vsearch-2.30.0/src/cut.cc000066400000000000000000000421641476012147200151330ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "utils/maps.hpp" #include // std::count, std::for_each, std::equal #include #include // macros PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf #include // std::next #include #include // std::move #include struct statistics { int fragment_no = 0; int fragment_rev_no = 0; int fragment_discarded_no = 0; int fragment_discarded_rev_no = 0; int64_t cut = 0; int64_t uncut = 0; int64_t matches = 0; }; struct a_file { char * name = nullptr; std::FILE * handle = nullptr; }; struct a_strand { a_file forward; a_file reverse; }; struct file_purpose { a_strand cut; a_strand discarded; }; struct restriction_pattern { std::string pattern; std::string coded_pattern; int cut_fwd; int cut_rev; }; namespace { auto cut_a_sequence(fastx_handle input_handle, struct restriction_pattern const & restriction, struct file_purpose const & fastaout, struct statistics & counters, std::vector & rc_buffer) -> void { auto const pattern_length = static_cast(restriction.pattern.size()); char * seq = fasta_get_sequence(input_handle); auto const seq_length = static_cast(fasta_get_sequence_length(input_handle)); // failed refactoring: use transform to create a coded std::string // and find() to search for pattern occurrences, IUPAC chars make it // harder to compare sequences /* get reverse complement */ rc_buffer.clear(); rc_buffer.resize(seq_length + 1); reverse_complement(rc_buffer.data(), seq, seq_length); int64_t local_matches = 0; int frag_start = 0; int frag_length = seq_length; int rc_start = seq_length; int rc_length = 0; for (int i = 0; i < seq_length - pattern_length + 1; ++i) { auto const match = std::equal(restriction.coded_pattern.cbegin(), restriction.coded_pattern.cend(), std::next(seq, i), [](char const & lhs, char const & rhs) -> bool { auto const lhs_unsigned = static_cast(lhs); auto const rhs_unsigned = chrmap_4bit_vector[static_cast(rhs)]; return ((lhs_unsigned & rhs_unsigned) != 0); // see maps.hpp }); if (not match) { 
continue; } ++local_matches; frag_length = i + restriction.cut_fwd - frag_start; rc_length = rc_start - (seq_length - (i + restriction.cut_rev)); rc_start -= rc_length; if ((frag_length > 0) and (fastaout.cut.forward.name != nullptr)) { fasta_print_general(fastaout.cut.forward.handle, nullptr, std::next(seq, frag_start), frag_length, fasta_get_header(input_handle), static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_no, -1.0, -1, -1, nullptr, 0.0); } if ((rc_length > 0) and (fastaout.cut.reverse.name != nullptr)) { fasta_print_general(fastaout.cut.reverse.handle, nullptr, &rc_buffer[rc_start], rc_length, fasta_get_header(input_handle), static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_rev_no, -1.0, -1, -1, nullptr, 0.0); } frag_start += frag_length; } if (local_matches > 0) { ++counters.cut; frag_length = seq_length - frag_start; rc_length = rc_start; rc_start = 0; } if ((local_matches > 0) and (frag_length > 0) and (fastaout.cut.forward.name != nullptr)) { fasta_print_general(fastaout.cut.forward.handle, nullptr, std::next(seq, frag_start), frag_length, fasta_get_header(input_handle), static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_no, -1.0, -1, -1, nullptr, 0.0); } if ((local_matches > 0) and (rc_length > 0) and (fastaout.cut.reverse.name != nullptr)) { fasta_print_general(fastaout.cut.reverse.handle, nullptr, &rc_buffer[rc_start], rc_length, fasta_get_header(input_handle), static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_rev_no, -1.0, -1, -1, nullptr, 0.0); } if (local_matches == 0) { ++counters.uncut; } if ((local_matches == 0) and (fastaout.discarded.forward.name != nullptr)) { fasta_print_general(fastaout.discarded.forward.handle, nullptr, seq, seq_length, fasta_get_header(input_handle), 
static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_discarded_no, -1.0, -1, -1, nullptr, 0.0); } if ((local_matches == 0) and (fastaout.discarded.reverse.name != nullptr)) { fasta_print_general(fastaout.discarded.reverse.handle, nullptr, rc_buffer.data(), seq_length, fasta_get_header(input_handle), static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_discarded_rev_no, -1.0, -1, -1, nullptr, 0.0); } counters.matches += local_matches; } auto ckeck_if_output_is_set(struct Parameters const & parameters) -> void { if ((parameters.opt_fastaout == nullptr) and (parameters.opt_fastaout_discarded == nullptr) and (parameters.opt_fastaout_rev == nullptr) and (parameters.opt_fastaout_discarded_rev == nullptr)) { fatal("No output files specified"); } } auto open_output_files(struct Parameters const & parameters) -> struct file_purpose { struct file_purpose fastaout; fastaout.cut.forward.name = parameters.opt_fastaout; fastaout.discarded.forward.name = parameters.opt_fastaout_discarded; fastaout.cut.reverse.name = parameters.opt_fastaout_rev; fastaout.discarded.reverse.name = parameters.opt_fastaout_discarded_rev; if (fastaout.cut.forward.name != nullptr) { fastaout.cut.forward.handle = fopen_output(fastaout.cut.forward.name); } if (fastaout.discarded.forward.name != nullptr) { fastaout.discarded.forward.handle = fopen_output(fastaout.discarded.forward.name); } if (fastaout.cut.reverse.name != nullptr) { fastaout.cut.reverse.handle = fopen_output(fastaout.cut.reverse.name); } if (fastaout.discarded.reverse.name != nullptr) { fastaout.discarded.reverse.handle = fopen_output(fastaout.discarded.reverse.name); } return fastaout; } auto check_output_files(struct file_purpose const & fastaout) -> void { if (fastaout.cut.forward.name != nullptr) { if (fastaout.cut.forward.handle == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if 
(fastaout.discarded.forward.name != nullptr) { if (fastaout.discarded.forward.handle == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (fastaout.cut.reverse.name != nullptr) { if (fastaout.cut.reverse.handle == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } if (fastaout.discarded.reverse.name != nullptr) { if (fastaout.discarded.reverse.handle == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } } auto check_if_contains_circumflex(std::string const & pattern) -> void { auto const occurrences = std::count(pattern.cbegin(), pattern.cend(), '^'); if (occurrences == 0) { fatal("No forward sequence cut site (^) found in pattern"); } if (occurrences > 1) { fatal("Multiple cut sites not supported"); } } auto check_if_contains_underscore(std::string const & pattern) -> void { auto const occurrences = std::count(pattern.cbegin(), pattern.cend(), '_'); if (occurrences == 0) { fatal("No reverse sequence cut site (_) found in pattern"); } if (occurrences > 1) { fatal("Multiple cut sites not supported"); } } auto locate_forward_restriction_site(std::string pattern) -> int { auto const underscore_position = pattern.find('_'); pattern.erase(underscore_position, 1); return static_cast(pattern.find('^')); } auto locate_reverse_restriction_site(std::string pattern) -> int { auto const circumflex_position = pattern.find('^'); pattern.erase(circumflex_position, 1); return static_cast(pattern.find('_')); } auto remove_restriction_sites(std::string pattern) -> std::string { auto const circumflex_position = pattern.find('^'); pattern.erase(circumflex_position, 1); auto const underscore_position = pattern.find('_'); return pattern.erase(underscore_position, 1); } auto reencode_restriction_pattern(std::string raw_pattern) -> std::string { auto pattern = remove_restriction_sites(std::move(raw_pattern)); auto encode_characters = [](char const & character) -> char { auto const symbol_uchar = static_cast(character); auto 
const coded_symbol_uchar = chrmap_4bit_vector[symbol_uchar]; return static_cast(coded_symbol_uchar); }; std::transform(pattern.cbegin(), pattern.cend(), pattern.begin(), encode_characters); return pattern; } auto check_if_pattern_is_empty(std::string const & pattern) -> void { if (pattern.empty()) { fatal("Empty cut pattern string"); } } auto search_illegal_characters(std::string const & pattern) -> void { auto character_is_illegal = [](char const & character) { auto const unsigned_character = static_cast(character); if (chrmap_4bit_vector[unsigned_character] == 0) { fatal("Illegal character in cut pattern"); } }; std::for_each(pattern.cbegin(), pattern.cend(), character_is_illegal); } auto stats_message(std::FILE * output_stream, struct statistics const & counters) -> void { static_cast(std::fprintf(output_stream, "%" PRId64 " sequence(s) cut %" PRId64 " times, %" PRId64 " sequence(s) never cut.\n", counters.cut, counters.matches, counters.uncut)); } auto output_stats_message(struct Parameters const & parameters, struct statistics const & counters, char const * filename) -> void { if (filename == nullptr) { return; } stats_message(parameters.fp_log, counters); } auto output_stats_message(struct Parameters const & parameters, struct statistics const & counters) -> void { if (parameters.opt_quiet) { return; } stats_message(stderr, counters); } auto close_output_files(struct file_purpose const & fastaout) -> void { for (auto * fp_outputfile : { fastaout.cut.forward.handle, fastaout.discarded.forward.handle, fastaout.cut.reverse.handle, fastaout.discarded.reverse.handle}) { if (fp_outputfile != nullptr) { static_cast(std::fclose(fp_outputfile)); } } } } auto cut(struct Parameters const & parameters) -> void { ckeck_if_output_is_set(parameters); fastx_handle input_handle = fasta_open(parameters.opt_cut); assert(input_handle != nullptr); // verified by fasta_open() auto const fastaout = open_output_files(parameters); check_output_files(fastaout); auto const raw_pattern 
= parameters.opt_cut_pattern; // check for the expected number of restriction sites check_if_contains_circumflex(raw_pattern); check_if_contains_underscore(raw_pattern); // locate restriction sites and trim pattern struct restriction_pattern const restriction = { remove_restriction_sites(raw_pattern), reencode_restriction_pattern(raw_pattern), locate_forward_restriction_site(raw_pattern), locate_reverse_restriction_site(raw_pattern) }; check_if_pattern_is_empty(restriction.pattern); search_illegal_characters(restriction.pattern); auto const filesize = fasta_get_size(input_handle); progress_init("Cutting sequences", filesize); struct statistics counters; std::vector rc_buffer; while (fasta_next(input_handle, false, chrmap_no_change_vector.data())) { cut_a_sequence(input_handle, restriction, fastaout, counters, rc_buffer); progress_update(fasta_get_position(input_handle)); } progress_done(); output_stats_message(parameters, counters); output_stats_message(parameters, counters, parameters.opt_log); close_output_files(fastaout); fasta_close(input_handle); } vsearch-2.30.0/src/cut.h000066400000000000000000000047451476012147200150000ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto cut(struct Parameters const & parameters) -> void; vsearch-2.30.0/src/db.cc000066400000000000000000000343261476012147200147260ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "maps.h" #include // macros PRIu64 and PRId64 #include // LONG_MAX #include // int64_t, uint64_t #include // std::fprintf, std::size_t #include // std::qsort #include // std::memcpy, std::strcmp constexpr uint64_t memchunk = 16777216; // 2^24 static fastx_handle h = nullptr; static bool is_fastq = false; static uint64_t sequences = 0; static uint64_t nucleotides = 0; static uint64_t longest = 0; static uint64_t shortest = 0; static uint64_t longestheader = 0; static uint64_t dataalloc = 0; static uint64_t datalen = 0; static size_t seqindex_alloc = 0; seqinfo_t * seqindex = nullptr; char * datap = nullptr; auto db_setinfo(bool new_is_fastq, uint64_t new_sequences, uint64_t new_nucleotides, uint64_t new_longest, uint64_t new_shortest, uint64_t new_longestheader) -> void { is_fastq = new_is_fastq; sequences = new_sequences; nucleotides = new_nucleotides; longest = new_longest; shortest = new_shortest; longestheader = new_longestheader; } auto db_is_fastq() -> bool { return is_fastq; } auto db_getquality(uint64_t seqno) -> char * { if (is_fastq) { return datap + seqindex[seqno].qual_p; } else { return nullptr; } } auto db_add(bool is_fastq, char * header, char * sequence, char * quality, size_t headerlength, size_t sequencelength, int64_t abundance) -> void { /* Add a sequence to the database. Assumes that the database has been initialized. 
*/ /* grow space for data, if necessary */ size_t const dataalloc_old = dataalloc; size_t needed = datalen + headerlength + 1 + sequencelength + 1; if (is_fastq) { needed += sequencelength + 1; } while (dataalloc < needed) { dataalloc += memchunk; } if (dataalloc > dataalloc_old) { datap = (char *) xrealloc(datap, dataalloc); } /* store the header */ size_t const header_p = datalen; memcpy(datap + header_p, header, headerlength + 1); datalen += headerlength + 1; /* store sequence */ size_t const sequence_p = datalen; memcpy(datap + sequence_p, sequence, sequencelength + 1); datalen += sequencelength + 1; size_t const quality_p = datalen; if (is_fastq) { /* store quality */ memcpy(datap + quality_p, quality, sequencelength + 1); datalen += sequencelength + 1; } /* grow space for index, if necessary */ size_t const seqindex_alloc_old = seqindex_alloc; while ((sequences + 1) * sizeof(seqinfo_t) > seqindex_alloc) { seqindex_alloc += memchunk; } if (seqindex_alloc > seqindex_alloc_old) { seqindex = (seqinfo_t *) xrealloc(seqindex, seqindex_alloc); } /* update index */ seqinfo_t * seqindex_p = seqindex + sequences; seqindex_p->headerlen = headerlength; seqindex_p->seqlen = sequencelength; seqindex_p->header_p = header_p; seqindex_p->seq_p = sequence_p; seqindex_p->qual_p = quality_p; seqindex_p->size = abundance; /* update statistics */ ++sequences; nucleotides += sequencelength; if (sequencelength > longest) { longest = sequencelength; } if (sequencelength < shortest) { shortest = sequencelength; } if (headerlength > longestheader) { longestheader = headerlength; } } auto db_read(const char * filename, int upcase) -> void { h = fastx_open(filename); if (not h) { fatal("Unrecognized file type (not proper FASTA or FASTQ format)"); } is_fastq = fastx_is_fastq(h); int64_t const filesize = fastx_get_size(h); char * prompt = nullptr; if (xsprintf(& prompt, "Reading file %s", filename) == -1) { fatal("Out of memory"); } progress_init(prompt, filesize); longest = 0; shortest = 
LONG_MAX; longestheader = 0; sequences = 0; nucleotides = 0; int64_t discarded_short = 0; int64_t discarded_long = 0; int64_t discarded_unoise = 0; /* allocate space for data */ dataalloc = 0; datap = nullptr; datalen = 0; /* allocate space for index */ seqindex_alloc = 0; seqindex = nullptr; while(fastx_next(h, not opt_notrunclabels, upcase ? chrmap_upcase : chrmap_no_change)) { size_t const sequencelength = fastx_get_sequence_length(h); int64_t const abundance = fastx_get_abundance(h); if (sequencelength < (size_t) opt_minseqlength) { ++discarded_short; } else if (sequencelength > (size_t) opt_maxseqlength) { ++discarded_long; } else if (opt_cluster_unoise && (abundance < opt_minsize)) { ++discarded_unoise; } else { db_add(is_fastq, fastx_get_header(h), fastx_get_sequence(h), is_fastq ? fastx_get_quality(h) : nullptr, fastx_get_header_length(h), sequencelength, abundance); } progress_update(fastx_get_position(h)); } progress_done(); xfree(prompt); fastx_close(h); if (not opt_quiet) { if (sequences > 0) { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs, " "min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", db_getnucleotidecount(), db_getsequencecount(), db_getshortestsequence(), db_getlongestsequence(), db_getnucleotidecount() * 1.0 / db_getsequencecount()); } else { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs\n", db_getnucleotidecount(), db_getsequencecount()); } } if (opt_log) { if (sequences > 0) { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs, " "min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n\n", db_getnucleotidecount(), db_getsequencecount(), db_getshortestsequence(), db_getlongestsequence(), db_getnucleotidecount() * 1.0 / db_getsequencecount()); } else { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs\n\n", db_getnucleotidecount(), db_getsequencecount()); } } /* Warn about discarded sequences */ if (discarded_short) { fprintf(stderr, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", opt_minseqlength, discarded_short, (discarded_short 
== 1 ? "sequence" : "sequences")); if (opt_log) { fprintf(fp_log, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", opt_minseqlength, discarded_short, (discarded_short == 1 ? "sequence" : "sequences")); } } if (discarded_long) { fprintf(stderr, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", opt_maxseqlength, discarded_long, (discarded_long == 1 ? "sequence" : "sequences")); if (opt_log) { fprintf(fp_log, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", opt_maxseqlength, discarded_long, (discarded_long == 1 ? "sequence" : "sequences")); } } if (discarded_unoise) { fprintf(stderr, "minsize %" PRId64 ": %" PRId64 " %s discarded.\n", opt_minsize, discarded_unoise, (discarded_unoise == 1 ? "sequence" : "sequences")); if (opt_log) { fprintf(fp_log, "minsize %" PRId64 ": %" PRId64 " %s discarded.\n", opt_minsize, discarded_unoise, (discarded_unoise == 1 ? "sequence" : "sequences")); } } show_rusage(); } auto db_getsequencecount() -> uint64_t { return sequences; } auto db_getnucleotidecount() -> uint64_t { return nucleotides; } auto db_getlongestheader() -> uint64_t { return longestheader; } auto db_getlongestsequence() -> uint64_t { return longest; } auto db_getshortestsequence() -> uint64_t { return shortest; } auto db_free() -> void { if (datap) { xfree(datap); } if (seqindex) { xfree(seqindex); } } auto compare_bylength(const void * a, const void * b) -> int { auto * x = (seqinfo_t *) a; auto * y = (seqinfo_t *) b; /* longest first, then by abundance, then by label, otherwise keep order */ if (x->seqlen < y->seqlen) { return +1; } else if (x->seqlen > y->seqlen) { return -1; } else { if (x->size < y->size) { return +1; } else if (x->size > y->size) { return -1; } else { int const r = strcmp(datap + x->header_p, datap + y->header_p); if (r != 0) { return r; } else { if (x < y) { return -1; } else if (x > y) { return +1; } else { return 0; } } } } } auto compare_bylength_shortest_first(const void * a, const void * b) -> int { auto * x = 
(seqinfo_t *) a; auto * y = (seqinfo_t *) b; /* shortest first, then by abundance, then by label, otherwise keep order */ if (x->seqlen < y->seqlen) { return -1; } else if (x->seqlen > y->seqlen) { return +1; } else { if (x->size < y->size) { return +1; } else if (x->size > y->size) { return -1; } else { int const r = strcmp(datap + x->header_p, datap + y->header_p); if (r != 0) { return r; } else { if (x < y) { return -1; } else if (x > y) { return +1; } else { return 0; } } } } } inline auto compare_byabundance(const void * a, const void * b) -> int { auto * x = (seqinfo_t *) a; auto * y = (seqinfo_t *) b; /* most abundant first, then by label, otherwise keep order */ if (x->size > y->size) { return -1; } else if (x->size < y->size) { return +1; } else { int const r = strcmp(datap + x->header_p, datap + y->header_p); if (r != 0) { return r; } else { if (x < y) { return -1; } else if (x > y) { return +1; } else { return 0; } } } } auto db_sortbylength() -> void { progress_init("Sorting by length", 100); qsort(seqindex, sequences, sizeof(seqinfo_t), compare_bylength); progress_done(); } auto db_sortbylength_shortest_first() -> void { progress_init("Sorting by length", 100); qsort(seqindex, sequences, sizeof(seqinfo_t), compare_bylength_shortest_first); progress_done(); } auto db_sortbyabundance() -> void { progress_init("Sorting by abundance", 100); qsort(seqindex, sequences, sizeof(seqinfo_t), compare_byabundance); progress_done(); } vsearch-2.30.0/src/db.h000066400000000000000000000100101476012147200145500ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
/* Index record describing one database sequence.
   The *_p members are offsets into the shared buffer datap. */
struct seqinfo_s
{
  std::size_t header_p;   /* offset of the header (see db_getheader) */
  std::size_t seq_p;      /* offset of the sequence (see db_getsequence) */
  std::size_t qual_p;     /* presumably the offset of the quality string;
                             confirm against db_getquality */
  unsigned int headerlen; /* value reported by db_getheaderlen */
  unsigned int seqlen;    /* value reported by db_getsequencelen */
  unsigned int size;      /* abundance, reported by db_getabundance */
};

using seqinfo_t = struct seqinfo_s;

/* shared text buffer and the per-sequence index into it */
extern char * datap;
extern seqinfo_t * seqindex;


/* Header of sequence number seqno; seqno is not range-checked. */
inline auto db_getheader(uint64_t seqno) -> char *
{
  auto const & record = seqindex[seqno];
  return datap + record.header_p;
}

/* Sequence number seqno; seqno is not range-checked. */
inline auto db_getsequence(uint64_t seqno) -> char *
{
  auto const & record = seqindex[seqno];
  return datap + record.seq_p;
}

/* Abundance of sequence number seqno. */
inline auto db_getabundance(uint64_t seqno) -> uint64_t
{
  auto const & record = seqindex[seqno];
  return record.size;
}

/* Length of sequence number seqno. */
inline auto db_getsequencelen(uint64_t seqno) -> uint64_t
{
  auto const & record = seqindex[seqno];
  return record.seqlen;
}

/* Length of the header of sequence number seqno. */
inline auto db_getheaderlen(uint64_t seqno) -> uint64_t
{
  auto const & record = seqindex[seqno];
  return record.headerlen;
}


auto db_read(const char * filename, int upcase) -> void;

auto db_free() -> void;

auto db_getsequencecount() -> uint64_t;

auto db_getnucleotidecount() -> uint64_t;

auto db_getlongestheader() -> uint64_t;

auto db_getlongestsequence() -> uint64_t;

auto db_getshortestsequence() -> uint64_t;

/* Note: the sorting functions below must be called after db_read,
   but before dbindex_prepare */

auto db_sortbylength() -> void;

auto db_sortbylength_shortest_first() -> void;

auto db_sortbyabundance() -> void;

auto db_is_fastq() -> bool;

auto db_getquality(uint64_t seqno) -> char *;

auto db_setinfo(bool new_is_fastq,
                uint64_t new_sequences,
                uint64_t new_nucleotides,
                uint64_t new_longest,
                uint64_t new_shortest,
                uint64_t new_longestheader) -> void;
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "bitmap.h" #include "maps.h" #include // int64_t, uint64_t #include // std::memset static struct bitmap_s * dbhash_bitmap; static uint64_t dbhash_size; static unsigned int dbhash_shift; static uint64_t dbhash_mask; static struct dbhash_bucket_s * dbhash_table; auto dbhash_seqcmp(char * a, char * b, uint64_t n) -> int { char * p = a; char * q = b; if (n <= 0) { return 0; } while ((n-- > 0) and (chrmap_4bit[(int) (*p)] == chrmap_4bit[(int) (*q)])) { if ((n == 0) or (*p == 0) or (*q == 0)) { break; } ++p; ++q; } return chrmap_4bit[(int) (*p)] - chrmap_4bit[(int) (*q)]; } auto dbhash_open(uint64_t maxelements) -> void { /* adjust size of hash table for 2/3 fill rate */ /* and use a multiple of 2 */ dbhash_size = 1; dbhash_shift = 0; while (3 * maxelements > 2 * dbhash_size) { dbhash_size <<= 1U; ++dbhash_shift; } dbhash_mask = dbhash_size - 1; dbhash_table = (struct dbhash_bucket_s *) xmalloc(sizeof(dbhash_bucket_s) * dbhash_size); memset(dbhash_table, 0, sizeof(dbhash_bucket_s) * dbhash_size); dbhash_bitmap = bitmap_init(dbhash_size); bitmap_reset_all(dbhash_bitmap); } auto dbhash_close() -> void { bitmap_free(dbhash_bitmap); dbhash_bitmap = nullptr; xfree(dbhash_table); dbhash_table = nullptr; } auto dbhash_search_first(char * seq, uint64_t seqlen, struct dbhash_search_info_s * info) -> int64_t { uint64_t const hash = hash_cityhash64(seq, seqlen); info->hash = hash; info->seq = seq; info->seqlen = seqlen; uint64_t index = hash & 
dbhash_mask; struct dbhash_bucket_s * bp = dbhash_table + index; while (bitmap_get(dbhash_bitmap, index) and ((bp->hash != hash) or (seqlen != db_getsequencelen(bp->seqno)) or (dbhash_seqcmp(seq, db_getsequence(bp->seqno), seqlen)))) { index = (index + 1) & dbhash_mask; bp = dbhash_table + index; } info->index = index; if (bitmap_get(dbhash_bitmap, index)) { return bp->seqno; } else { return -1; } } auto dbhash_search_next(struct dbhash_search_info_s * info) -> int64_t { uint64_t const hash = info->hash; char * seq = info->seq; uint64_t const seqlen = info->seqlen; uint64_t index = (info->index + 1) & dbhash_mask; struct dbhash_bucket_s * bp = dbhash_table + index; while (bitmap_get(dbhash_bitmap, index) and ((bp->hash != hash) or (seqlen != db_getsequencelen(bp->seqno)) or (dbhash_seqcmp(seq, db_getsequence(bp->seqno), seqlen)))) { index = (index + 1) & dbhash_mask; bp = dbhash_table + index; } info->index = index; if (bitmap_get(dbhash_bitmap, index)) { return bp->seqno; } else { return -1; } } auto dbhash_add(char * seq, uint64_t seqlen, uint64_t seqno) -> void { struct dbhash_search_info_s info; int64_t ret = dbhash_search_first(seq, seqlen, & info); while (ret >= 0) { ret = dbhash_search_next(&info); } bitmap_set(dbhash_bitmap, info.index); struct dbhash_bucket_s * bp = dbhash_table + info.index; bp->hash = info.hash; bp->seqno = seqno; } auto dbhash_add_one(uint64_t seqno) -> void { char * seq = db_getsequence(seqno); uint64_t const seqlen = db_getsequencelen(seqno); char * normalized = (char *) xmalloc(seqlen + 1); string_normalize(normalized, seq, seqlen); dbhash_add(normalized, seqlen, seqno); } auto dbhash_add_all() -> void { progress_init("Hashing database sequences", db_getsequencecount()); char * normalized = (char *) xmalloc(db_getlongestsequence() + 1); for (uint64_t seqno=0; seqno < db_getsequencecount(); seqno++) { char * seq = db_getsequence(seqno); uint64_t const seqlen = db_getsequencelen(seqno); string_normalize(normalized, seq, seqlen); 
dbhash_add(normalized, seqlen, seqno); progress_update(seqno + 1); } xfree(normalized); progress_done(); } vsearch-2.30.0/src/dbhash.h000066400000000000000000000061751476012147200154350ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
/* One slot of the sequence hash table. */
struct dbhash_bucket_s
{
  uint64_t hash;  /* full 64-bit hash of the stored sequence */
  uint64_t seqno; /* database sequence number of the stored sequence */
};

/* State of a lookup, filled in by dbhash_search_first and updated by
   dbhash_search_next. */
struct dbhash_search_info_s
{
  char * seq;      /* sequence being looked up (not copied) */
  uint64_t seqlen; /* its length */
  uint64_t hash;   /* its hash */
  uint64_t index;  /* slot at which the last search step stopped */
};

/* Allocate an empty table sized for maxelements entries. */
auto dbhash_open(uint64_t maxelements) -> void;

/* Free the table. */
auto dbhash_close() -> void;

/* Insert seqno under the key seq (seqlen symbols). */
auto dbhash_add(char * seq, uint64_t seqlen, uint64_t seqno) -> void;

/* Normalize database sequence seqno and insert it. */
auto dbhash_add_one(uint64_t seqno) -> void;

/* Normalize and insert all database sequences. */
auto dbhash_add_all() -> void;

/* Start a lookup; returns the first matching sequence number or -1. */
auto dbhash_search_first(char * seq,
                         uint64_t seqlen,
                         struct dbhash_search_info_s * info) -> int64_t;

/* Continue a lookup; returns the next matching sequence number or -1. */
auto dbhash_search_next(struct dbhash_search_info_s * info) -> int64_t;

/* NOTE(review): no definition of this function appears in dbhash.cc --
   confirm that it is defined elsewhere, or drop the declaration. */
auto dbhash_search_finish(struct dbhash_search_info_s * info) -> void;
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "bitmap.h" #include "dbindex.h" #include "maps.h" #include "unique.h" #include // uint64_t #include // std::FILE, std::fprintf #include // std::memset #include // std::next unsigned int * kmercount; uint64_t * kmerhash; unsigned int * kmerindex; struct bitmap_s * * kmerbitmap; unsigned int * dbindex_map; unsigned int kmerhashsize; uint64_t kmerindexsize; unsigned int dbindex_count; uhandle_s * dbindex_uh; constexpr unsigned int bitmap_threshold = 8; static unsigned int bitmap_mincount; auto dbindex_getbitmap(unsigned int const kmer) -> unsigned char * { auto * a_bitmap_s = *std::next(kmerbitmap, kmer); if (a_bitmap_s != nullptr) { return a_bitmap_s->bitmap; } return nullptr; } auto dbindex_getmatchcount(unsigned int const kmer) -> unsigned int { return *std::next(kmercount, kmer); } auto dbindex_getmatchlist(unsigned int const kmer) -> unsigned int * { return std::next(kmerindex, *std::next(kmerhash, kmer)); } auto dbindex_getmapping(unsigned int const index) -> unsigned int { return *std::next(dbindex_map, index); } auto dbindex_getcount() -> unsigned int { return dbindex_count; } auto fprint_kmer(std::FILE * output_handle, unsigned int const kmer_length, uint64_t const kmer) -> void { for (auto i = 0U; i < kmer_length; ++i) { std::fprintf(output_handle, "%c", sym_nt_2bit[(kmer >> (2 * (kmer_length - i - 1))) & 3]); } } auto dbindex_addsequence(unsigned int seqno, int seqmask) -> void { #if 0 std::printf("Adding seqno %d as index element no %d\n", seqno, dbindex_count); #endif unsigned int uniquecount = 0; unsigned int * uniquelist = nullptr; unique_count(dbindex_uh, opt_wordlength, db_getsequencelen(seqno), db_getsequence(seqno), &uniquecount, &uniquelist, seqmask); dbindex_map[dbindex_count] = seqno; for (unsigned int i = 0; i < uniquecount; i++) { unsigned int const kmer = uniquelist[i]; if (kmerbitmap[kmer]) { kmercount[kmer]++; bitmap_set(kmerbitmap[kmer], dbindex_count); } else { kmerindex[kmerhash[kmer] + 
(kmercount[kmer]++)] = dbindex_count; } } ++dbindex_count; } auto dbindex_addallsequences(int seqmask) -> void { unsigned int const seqcount = db_getsequencecount(); progress_init("Creating k-mer index", seqcount); for (unsigned int seqno = 0; seqno < seqcount ; seqno++) { dbindex_addsequence(seqno, seqmask); progress_update(seqno); } progress_done(); } auto dbindex_prepare(int use_bitmap, int seqmask) -> void { dbindex_uh = unique_init(); unsigned int const seqcount = db_getsequencecount(); kmerhashsize = 1U << (2 * opt_wordlength); /* allocate memory for kmer count array */ kmercount = (unsigned int *) xmalloc(kmerhashsize * sizeof(unsigned int)); std::memset(kmercount, 0, kmerhashsize * sizeof(unsigned int)); /* first scan, just count occurences */ progress_init("Counting k-mers", seqcount); for (unsigned int seqno = 0; seqno < seqcount ; seqno++) { unsigned int uniquecount = 0; unsigned int * uniquelist = nullptr; unique_count(dbindex_uh, opt_wordlength, db_getsequencelen(seqno), db_getsequence(seqno), &uniquecount, &uniquelist, seqmask); for (unsigned int i = 0; i < uniquecount; i++) { kmercount[uniquelist[i]]++; } progress_update(seqno); } progress_done(); #if 0 /* dump kmer counts */ std::FILE * f = fopen_output("kmercounts.txt"); for (unsigned int kmer=0; kmer < kmerhashsize; kmer++) { fprint_kmer(f, 8, kmer); std::fprintf(f, "\t%d\t%d\n", kmer, kmercount[kmer]); } std::fclose(f); #endif /* determine minimum kmer count for bitmap usage */ if (use_bitmap) { bitmap_mincount = seqcount / bitmap_threshold; } else { bitmap_mincount = seqcount + 1; } /* allocate and zero bitmap pointers */ kmerbitmap = (struct bitmap_s **) xmalloc(kmerhashsize * sizeof(struct bitmap_s *)); std::memset(kmerbitmap, 0, kmerhashsize * sizeof(struct bitmap_s *)); /* hash / bitmap setup */ /* convert hash counts to position in index */ kmerhash = (uint64_t *) xmalloc((kmerhashsize + 1) * sizeof(uint64_t)); uint64_t sum = 0; for (unsigned int i = 0; i < kmerhashsize; i++) { kmerhash[i] 
= sum; if (kmercount[i] >= bitmap_mincount) { kmerbitmap[i] = bitmap_init(seqcount + 127); // pad for xmm bitmap_reset_all(kmerbitmap[i]); } else { sum += kmercount[i]; } } kmerindexsize = sum; kmerhash[kmerhashsize] = sum; #if 0 if (! opt_quiet) std::fprintf(stderr, "Unique %ld-mers: %u\n", opt_wordlength, kmerindexsize); #endif /* reset counts */ std::memset(kmercount, 0, kmerhashsize * sizeof(unsigned int)); /* allocate space for actual data */ kmerindex = (unsigned int *) xmalloc(kmerindexsize * sizeof(unsigned int)); /* allocate space for mapping from indexno to seqno */ dbindex_map = (unsigned int *) xmalloc(seqcount * sizeof(unsigned int)); dbindex_count = 0; show_rusage(); } auto dbindex_free() -> void { xfree(kmerhash); xfree(kmerindex); xfree(kmercount); xfree(dbindex_map); for (unsigned int kmer = 0; kmer < kmerhashsize; kmer++) { if (kmerbitmap[kmer] != nullptr) { bitmap_free(kmerbitmap[kmer]); } } xfree(kmerbitmap); unique_exit(dbindex_uh); } vsearch-2.30.0/src/dbindex.h000066400000000000000000000070441476012147200156150ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
struct uhandle_s;

extern unsigned int * kmercount; /* number of matching seqnos for each kmer */
extern uint64_t * kmerhash;      /* index into the list below for each kmer */
extern unsigned int * kmerindex; /* the list of matching seqnos for kmers;
                                    the stored values are index entry
                                    numbers, to be translated with
                                    dbindex_map */
extern struct bitmap_s * * kmerbitmap; /* per kmer: bitmap used instead of a
                                          list, or null */
extern unsigned int * dbindex_map;     /* index entry number -> seqno */
extern unsigned int dbindex_count;     /* number of sequences added so far */
extern unsigned int kmerhashsize;      /* number of entries in the per-kmer
                                          arrays */
extern uint64_t kmerindexsize;         /* number of entries in kmerindex */
extern uhandle_s * dbindex_uh;         /* handle passed to unique_count */

/* Print a 2-bit encoded kmer of the given length as nucleotide symbols. */
auto fprint_kmer(std::FILE * output_handle, unsigned int kmer_length, uint64_t kmer) -> void;

/* Count kmers over the whole database and allocate the index arrays;
   sequences are added afterwards with the two functions below. */
auto dbindex_prepare(int use_bitmap, int seqmask) -> void;

/* Add all database sequences to the index. */
auto dbindex_addallsequences(int seqmask) -> void;

/* Add a single database sequence to the index. */
auto dbindex_addsequence(unsigned int seqno, int seqmask) -> void;

/* Release the memory allocated by dbindex_prepare. */
auto dbindex_free() -> void;

/* Raw bitmap of a kmer, or null if the kmer is stored as a list. */
auto dbindex_getbitmap(unsigned int kmer) -> unsigned char *;

/* Number of index entries recorded for a kmer. */
auto dbindex_getmatchcount(unsigned int kmer) -> unsigned int;

/* Start of the list of index entries of a kmer. */
auto dbindex_getmatchlist(unsigned int kmer) -> unsigned int *;

/* Database sequence number behind an index entry number. */
auto dbindex_getmapping(unsigned int index) -> unsigned int;

/* Number of sequences added to the index so far. */
auto dbindex_getcount() -> unsigned int;
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* One slot of the dereplication hash table. */
struct bucket
{
  uint64_t hash;
  unsigned int seqno_first; /* used as the final tie-breaker when sorting */
  unsigned int seqno_last;
  unsigned int size;        /* abundance; 0 is treated as "slot unused" */
  unsigned int count;
  bool deleted;             /* deleted buckets sort after all others */
  char * header;
  char * seq;
  char * qual;
};


/* qsort comparator for buckets: non-deleted before deleted, then
   highest abundance first, then by label, then by first sequence
   number. Two unused buckets (size 0) compare equal without their
   headers being looked at. */
auto derep_compare_full(void const * void_lhs, void const * void_rhs) -> int
{
  auto const & left = *static_cast<struct bucket const *>(void_lhs);
  auto const & right = *static_cast<struct bucket const *>(void_rhs);

  // refactoring: deleted is always set to false for derep_fulllength
  if (left.deleted != right.deleted)
    {
      return left.deleted ? +1 : -1;
    }

  // same status
  if (left.size != right.size)
    {
      return (left.size < right.size) ? +1 : -1;
    }

  // same abundance; nothing more to compare for unused buckets
  if (left.size == 0)
    {
      return 0;
    }

  auto const label_order = std::strcmp(left.header, right.header);
  if (label_order != 0)
    {
      return label_order;
    }

  // same header (label)
  if (left.seqno_first != right.seqno_first)
    {
      return (left.seqno_first < right.seqno_first) ? -1 : +1;
    }

  // same ordinal value: only possible when a bucket meets itself
  return 0;
}
/* Convert a FASTQ quality symbol into an error probability.
   The symbol is reduced by the offset parameters.opt_fastq_ascii to get
   the quality value Q. Values of Q below 2 all yield 0.75; otherwise
   the result is 10^(-Q/10). */
inline auto convert_quality_symbol_to_probability(int const quality_symbol, struct Parameters const & parameters) -> double
{
  static constexpr auto minimal_quality_value = 2;
  static constexpr auto maximal_probability = 0.75;
  auto const quality_value = quality_symbol - static_cast(parameters.opt_fastq_ascii);
  if (quality_value < minimal_quality_value)
    {
      return maximal_probability;
    }
  static constexpr auto base = 10.0;
  return std::pow(base, -quality_value / base);
}


/* Convert an error probability into a FASTQ quality symbol.
   Q is -10 * log10(probability), truncated towards zero. It is first
   capped at parameters.opt_fastq_qmaxout, then raised to at least
   parameters.opt_fastq_qminout, and finally shifted by the offset
   parameters.opt_fastq_asciiout. */
inline auto convert_probability_to_quality_symbol(double const probability, struct Parameters const & parameters) -> int
{
  static constexpr auto base = 10.0;
  auto quality_value = static_cast(std::trunc(-base * std::log10(probability)));
  quality_value = std::min(quality_value, parameters.opt_fastq_qmaxout);
  quality_value = std::max(quality_value, parameters.opt_fastq_qminout);
  return static_cast(quality_value + parameters.opt_fastq_asciiout);
}
case already handled in fastx_open(), assert(h != nullptr) should always be true } if (not fastx_is_empty(input_handle)) { if (fastx_is_fastq(input_handle)) { if (parameters.opt_fastx_uniques == nullptr) { fatal("FASTQ input is only allowed with the fastx_uniques command"); } } else { if (parameters.opt_fastqout != nullptr) { fatal("Cannot write FASTQ output when input file is not in FASTQ " "format"); } if (parameters.opt_tabbedout != nullptr) { fatal("Cannot write tab separated output file when input file is " "not in FASTQ format"); } } } std::FILE * fp_fastaout = nullptr; std::FILE * fp_fastqout = nullptr; std::FILE * fp_uc = nullptr; std::FILE * fp_tabbedout = nullptr; if (parameters.opt_fastx_uniques != nullptr) { if ((not parameters.opt_uc) and (not parameters.opt_fastaout) and (not parameters.opt_fastqout) and (not parameters.opt_tabbedout)) { fatal("Output file for dereplication with fastx_uniques must be " "specified with --fastaout, --fastqout, --tabbedout, or --uc"); } } else { if ((not parameters.opt_output) and (not parameters.opt_uc)) { fatal("Output file for dereplication must be specified with --output " "or --uc"); } } if (parameters.opt_fastx_uniques != nullptr) { if (parameters.opt_fastaout) { fp_fastaout = fopen_output(parameters.opt_fastaout); if (not fp_fastaout) { fatal("Unable to open FASTA output file for writing"); } } if (parameters.opt_fastqout) { fp_fastqout = fopen_output(parameters.opt_fastqout); if (not fp_fastqout) { fatal("Unable to open FASTQ output file for writing"); } } if (parameters.opt_tabbedout) { fp_tabbedout = fopen_output(parameters.opt_tabbedout); if (not fp_tabbedout) { fatal("Unable to open tab delimited output file for writing"); } } } else { if (parameters.opt_output) { fp_fastaout = fopen_output(parameters.opt_output); if (not fp_fastaout) { fatal("Unable to open FASTA output file for writing"); } } } if (parameters.opt_uc) { fp_uc = fopen_output(parameters.opt_uc); if (not fp_uc) { fatal("Unable to open output 
(uc) file for writing"); } } uint64_t const filesize = fastx_get_size(input_handle); /* allocate initial memory for 1024 clusters with sequences of length 1023 */ uint64_t alloc_clusters = 1024; uint64_t alloc_seqs = 1024; int64_t alloc_seqlen = 1023; uint64_t hashtablesize = 2 * alloc_clusters; uint64_t hash_mask = hashtablesize - 1; auto * hashtable = (struct bucket *) xmalloc(sizeof(struct bucket) * hashtablesize); memset(hashtable, 0, sizeof(struct bucket) * hashtablesize); show_rusage(); constexpr auto terminal = std::numeric_limits::max(); std::vector nextseqtab; std::vector headertab; std::vector match_strand; auto const extra_info = parameters.opt_uc or parameters.opt_tabbedout; if (extra_info) { /* If the uc or tabbedout option is in effect, we need to keep some extra info. Allocate and init memory for this. */ /* Links to other sequences in cluster */ nextseqtab.resize(alloc_seqs, terminal); /* Pointers to the header strings */ headertab.resize(alloc_seqs); /* Matching strand */ match_strand.resize(alloc_seqs); } show_rusage(); std::vector seq_up(alloc_seqlen + 1); std::vector rc_seq_up(alloc_seqlen + 1); std::string prompt = std::string("Dereplicating file ") + input_filename; progress_init(prompt.c_str(), filesize); uint64_t sequencecount = 0; uint64_t nucleotidecount = 0; int64_t shortest = INT64_MAX; int64_t longest = 0; uint64_t discarded_short = 0; uint64_t discarded_long = 0; uint64_t clusters = 0; int64_t sumsize = 0; uint64_t maxsize = 0; double median = 0.0; double average = 0.0; while (fastx_next(input_handle, not parameters.opt_notrunclabels, chrmap_no_change)) { int64_t const seqlen = fastx_get_sequence_length(input_handle); if (seqlen < parameters.opt_minseqlength) { ++discarded_short; continue; } if (seqlen > parameters.opt_maxseqlength) { ++discarded_long; continue; } nucleotidecount += seqlen; longest = std::max(seqlen, longest); shortest = std::min(seqlen, shortest); /* check allocations */ if (seqlen > alloc_seqlen) { alloc_seqlen = 
seqlen; seq_up.resize(alloc_seqlen + 1); rc_seq_up.resize(alloc_seqlen + 1); show_rusage(); } if (extra_info and (sequencecount + 1 > alloc_seqs)) { uint64_t const new_alloc_seqs = 2 * alloc_seqs; nextseqtab.resize(new_alloc_seqs, terminal); headertab.resize(new_alloc_seqs); match_strand.resize(new_alloc_seqs); alloc_seqs = new_alloc_seqs; show_rusage(); } if (clusters + 1 > alloc_clusters) { uint64_t const new_alloc_clusters = 2 * alloc_clusters; rehash(& hashtable, alloc_clusters); alloc_clusters = new_alloc_clusters; hashtablesize = 2 * alloc_clusters; hash_mask = hashtablesize - 1; show_rusage(); } char * seq = fastx_get_sequence(input_handle); char * header = fastx_get_header(input_handle); int64_t const headerlen = fastx_get_header_length(input_handle); char * qual = fastx_get_quality(input_handle); // nullptr if FASTA /* normalize sequence: uppercase and replace U by T */ string_normalize(seq_up.data(), seq, seqlen); /* reverse complement if necessary */ if (parameters.opt_strand) { reverse_complement(rc_seq_up.data(), seq_up.data(), seqlen); } /* Find free bucket or bucket for identical sequence. Make sure sequences are exactly identical in case of any hash collision. With 64-bit hashes, there is about 50% chance of a collision when the number of sequences is about 5e9. 
*/ uint64_t hash_header = 0; if (use_header) { hash_header = HASH(header, headerlen); } else { hash_header = 0; } uint64_t const hash = HASH(seq_up.data(), seqlen) ^ hash_header; uint64_t j = hash & hash_mask; struct bucket * bp = hashtable + j; while ((bp->size) and ((hash != bp->hash) or (seqcmp(seq_up.data(), bp->seq, seqlen)) or (use_header and strcmp(header, bp->header)))) { j = (j + 1) & hash_mask; bp = hashtable + j; } if (parameters.opt_strand and not bp->size) { /* no match on plus strand */ /* check minus strand as well */ uint64_t const rc_hash = HASH(rc_seq_up.data(), seqlen) ^ hash_header; uint64_t k = rc_hash & hash_mask; struct bucket * rc_bp = hashtable + k; while ((rc_bp->size) and ((rc_hash != rc_bp->hash) or (seqcmp(rc_seq_up.data(), rc_bp->seq, seqlen)) or (use_header and strcmp(header, rc_bp->header)))) { k = (k + 1) & hash_mask; rc_bp = hashtable + k; } if (rc_bp->size) { bp = rc_bp; j = k; if (extra_info) { match_strand[sequencecount] = 1; } } } int const abundance = fastx_get_abundance(input_handle); int64_t const ab = parameters.opt_sizein ? abundance : 1; sumsize += ab; if (bp->size) { /* at least one identical sequence already */ if (extra_info) { unsigned int const last = bp->seqno_last; nextseqtab[last] = sequencecount; bp->seqno_last = sequencecount; headertab[sequencecount] = header; } int64_t const s1 = bp->size; int64_t const s2 = ab; int64_t const s3 = s1 + s2; if (parameters.opt_fastqout) { /* update quality scores */ for (int i = 0; i < seqlen; i++) { int const q1 = bp->qual[i]; int const q2 = qual[i]; double const p1 = convert_quality_symbol_to_probability(q1, parameters); double const p2 = convert_quality_symbol_to_probability(q2, parameters); double p3 = 0.0; /* how to compute the new quality score? 
*/ if (parameters.opt_fastq_qout_max) { // fastq_qout_max /* min error prob, highest quality */ p3 = std::min(p1, p2); } else { // fastq_qout_avg /* average, as in USEARCH */ p3 = (p1 * s1 + p2 * s2) / s3; } // fastq_qout_min /* max error prob, lowest quality */ // p3 = MAX(p1, p2); // fastq_qout_first /* keep first */ // p3 = p1; // fastq_qout_last /* keep last */ // p3 = p2; // fastq_qout_ef /* Compute as multiple independent observations Edgar & Flyvbjerg (2015) But what about s1 and s2? */ // p3 = p1 * p2 / 3.0 / (1.0 - p1 - p2 + (4.0 * p1 * p2 / 3.0)); /* always worst quality possible, certain error */ // p3 = 1.0; // always best quality possible, perfect, no errors */ // p3 = 0.0; int const q3 = convert_probability_to_quality_symbol(p3, parameters); bp->qual[i] = q3; } } bp->size = s3; ++bp->count; } else { /* no identical sequences yet */ bp->size = ab; bp->hash = hash; bp->seqno_first = sequencecount; bp->seqno_last = sequencecount; bp->seq = xstrdup(seq); bp->header = xstrdup(header); bp->count = 1; if (qual) { bp->qual = xstrdup(qual); } else { bp->qual = nullptr; } ++clusters; } maxsize = std::max(bp->size, maxsize); ++sequencecount; progress_update(fastx_get_position(input_handle)); } progress_done(); fastx_close(input_handle); show_rusage(); if (not parameters.opt_quiet) { if (sequencecount > 0) { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", nucleotidecount, sequencecount, shortest, longest, nucleotidecount * 1.0 / sequencecount); } else { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs\n", nucleotidecount, sequencecount); } } if (parameters.opt_log) { if (sequencecount > 0) { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", nucleotidecount, sequencecount, shortest, longest, nucleotidecount * 1.0 / sequencecount); } else { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs\n", nucleotidecount, sequencecount); } } if (discarded_short) { 
fprintf(stderr, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", parameters.opt_minseqlength, discarded_short, (discarded_short == 1 ? "sequence" : "sequences")); if (parameters.opt_log) { fprintf(fp_log, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", parameters.opt_minseqlength, discarded_short, (discarded_short == 1 ? "sequence" : "sequences")); } } if (discarded_long) { fprintf(stderr, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", parameters.opt_maxseqlength, discarded_long, (discarded_long == 1 ? "sequence" : "sequences")); if (parameters.opt_log) { fprintf(fp_log, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", parameters.opt_maxseqlength, discarded_long, (discarded_long == 1 ? "sequence" : "sequences")); } } show_rusage(); progress_init("Sorting", 1); qsort(hashtable, hashtablesize, sizeof(struct bucket), derep_compare_full); progress_done(); show_rusage(); if (clusters > 0) { if (clusters % 2) { median = hashtable[(clusters - 1) / 2].size; } else { median = (hashtable[(clusters / 2) - 1].size + hashtable[clusters / 2].size) / 2.0; } } average = 1.0 * sumsize / clusters; if (clusters < 1) { if (not parameters.opt_quiet) { fprintf(stderr, "0 unique sequences\n"); } if (parameters.opt_log) { fprintf(fp_log, "0 unique sequences\n\n"); } } else { if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n", clusters, average, median, maxsize); } if (parameters.opt_log) { fprintf(fp_log, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n\n", clusters, average, median, maxsize); } } /* count selected */ uint64_t selected = 0; for (uint64_t i = 0; i < clusters; ++i) { struct bucket * bp = hashtable + i; int64_t const size = bp->size; if ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)) { ++selected; if (selected == (uint64_t) parameters.opt_topn) { break; } } } show_rusage(); /* write output */ 
if (parameters.opt_output or parameters.opt_fastaout) { progress_init("Writing FASTA output file", clusters); int64_t relabel_count = 0; for (uint64_t i = 0; i < clusters; ++i) { struct bucket * bp = hashtable + i; int64_t const size = bp->size; if ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)) { ++relabel_count; fasta_print_general(fp_fastaout, nullptr, bp->seq, strlen(bp->seq), bp->header, strlen(bp->header), size, relabel_count, -1.0, -1, -1, nullptr, 0.0); if (relabel_count == parameters.opt_topn) { break; } } progress_update(i); } progress_done(); fclose(fp_fastaout); } if (parameters.opt_fastqout) { progress_init("Writing FASTQ output file", clusters); int64_t relabel_count = 0; for (uint64_t i = 0; i < clusters; ++i) { struct bucket * bp = hashtable + i; int64_t const size = bp->size; if ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)) { ++relabel_count; fastq_print_general(fp_fastqout, bp->seq, strlen(bp->seq), bp->header, strlen(bp->header), bp->qual, size, relabel_count, -1.0); if (relabel_count == parameters.opt_topn) { break; } } progress_update(i); } progress_done(); fclose(fp_fastqout); } show_rusage(); if (parameters.opt_uc) { progress_init("Writing uc file, first part", clusters); for (uint64_t i = 0; i < clusters; ++i) { struct bucket * bp = hashtable + i; char * hh = bp->header; int64_t const len = strlen(bp->seq); fprintf(fp_uc, "S\t%" PRId64 "\t%" PRId64 "\t*\t*\t*\t*\t*\t%s\t*\n", i, len, hh); for (unsigned int next = nextseqtab[bp->seqno_first]; next != terminal; next = nextseqtab[next]) { fprintf(fp_uc, "H\t%" PRId64 "\t%" PRId64 "\t%.1f\t%s\t0\t0\t*\t%s\t%s\n", i, len, 100.0, (match_strand[next] ? 
"-" : "+"), headertab[next].c_str(), hh); } progress_update(i); } progress_done(); progress_init("Writing uc file, second part", clusters); for (uint64_t i = 0; i < clusters; ++i) { struct bucket * bp = hashtable + i; fprintf(fp_uc, "C\t%" PRId64 "\t%u\t*\t*\t*\t*\t*\t%s\t*\n", i, bp->size, bp->header); progress_update(i); } fclose(fp_uc); progress_done(); } if (parameters.opt_tabbedout) { progress_init("Writing tab separated file", clusters); for (uint64_t i = 0; i < clusters; ++i) { struct bucket * bp = hashtable + i; char * hh = bp->header; if (parameters.opt_relabel) { fprintf(fp_tabbedout, "%s\t%s%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", hh, parameters.opt_relabel, i + 1, i, (uint64_t) 0, bp->count, hh); } else { fprintf(fp_tabbedout, "%s\t%s\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", hh, hh, i, (uint64_t) 0, bp->count, hh); } uint64_t j = 1; for (unsigned int next = nextseqtab[bp->seqno_first]; next != terminal; next = nextseqtab[next]) { if (parameters.opt_relabel) { fprintf(fp_tabbedout, "%s\t%s%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", headertab[next].c_str(), parameters.opt_relabel, i + 1, i, j, bp->count, hh); } else { fprintf(fp_tabbedout, "%s\t%s\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", headertab[next].c_str(), hh, i, j, bp->count, hh); } ++j; } progress_update(i); } fclose(fp_tabbedout); progress_done(); } show_rusage(); if (selected < clusters) { if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } if (parameters.opt_log) { fprintf(fp_log, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } } show_rusage(); /* Free all seqs and headers */ for (uint64_t i = 0; i < clusters; ++i) { struct bucket * bp = hashtable + i; if (bp->size) { xfree(bp->seq); xfree(bp->header); if (bp->qual) { xfree(bp->qual); } } } 
show_rusage(); xfree(hashtable); show_rusage(); } vsearch-2.30.0/src/derep.h000066400000000000000000000050171476012147200152750ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto derep(struct Parameters const & parameters, char * input_filename, bool use_header) -> void; vsearch-2.30.0/src/derep_prefix.cc000066400000000000000000000352251476012147200170140ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "utils/seqcmp.h" #include // std::max #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::qsort #include // std::FILE, std::fprintf, std::fclose #include // std::strcmp, std::memset #include #include struct bucket { uint64_t hash = 0; unsigned int seqno_first = 0; unsigned int seqno_last = 0; unsigned int size = 0; unsigned int count = 0; bool deleted = false; char * header = nullptr; char * seq = nullptr; char * qual = nullptr; }; auto derep_compare_prefix(const void * a, const void * b) -> int { auto * x = (struct bucket *) a; auto * y = (struct bucket *) b; /* highest abundance first, then by label, otherwise keep order */ if (x->deleted > y->deleted) { return +1; } else if (x->deleted < y->deleted) { return -1; } else { if (x->size < y->size) { return +1; } else if (x->size > y->size) { return -1; } else { int const r = strcmp(db_getheader(x->seqno_first), db_getheader(y->seqno_first)); if (r != 0) { return r; } else { if (x->seqno_first < y->seqno_first) { return -1; } else if (x->seqno_first > y->seqno_first) { return +1; } else { return 0; } } } } } auto derep_prefix(struct Parameters const & parameters) -> void { std::FILE * fp_output = nullptr; std::FILE * fp_uc = nullptr; if (parameters.opt_strand) { fatal("Option '--strand both' not supported with --derep_prefix"); } if (parameters.opt_output) { fp_output = fopen_output(parameters.opt_output); if (not fp_output) { fatal("Unable to open output file for writing"); } } if (parameters.opt_uc) { fp_uc = fopen_output(parameters.opt_uc); if (not fp_uc) { fatal("Unable to open output (uc) file for writing"); } } db_read(parameters.opt_derep_prefix, 0); db_sortbylength_shortest_first(); show_rusage(); int64_t const dbsequencecount = db_getsequencecount(); /* adjust size of hash table for 2/3 fill rate */ int64_t hashtablesize = 1; while (3 * dbsequencecount > 2 * hashtablesize) { hashtablesize <<= 1U; } int const hash_mask = hashtablesize - 1; 
std::vector hashtable(hashtablesize); int64_t clusters = 0; int64_t sumsize = 0; uint64_t maxsize = 0; double median = 0.0; double average = 0.0; /* alloc and init table of links to other sequences in cluster */ constexpr auto terminal = std::numeric_limits::max(); std::vector nextseqtab(dbsequencecount, terminal); std::vector seq_up(db_getlongestsequence() + 1); /* make table of hash values of prefixes */ unsigned int const len_longest = db_getlongestsequence(); unsigned int const len_shortest = db_getshortestsequence(); std::vector prefix_hashes(len_longest + 1); progress_init("Dereplicating", dbsequencecount); for (int64_t i = 0; i < dbsequencecount; i++) { unsigned int const seqlen = db_getsequencelen(i); char * seq = db_getsequence(i); /* normalize sequence: uppercase and replace U by T */ string_normalize(seq_up.data(), seq, seqlen); uint64_t const ab = parameters.opt_sizein ? db_getabundance(i) : 1; sumsize += ab; /* Look for matching identical or prefix sequences. Use a hash function that can quickly be applied iteratively on longer and longer sequences. Hash values are generated for all prefixes and saved. Should start at exact sequence and then try shorter and shorter sequences. No need to check shorter sequences than the shortest in the database. 
Three cases: 1) Exact match: Update count, point to next 2) Prefix match: Mark old, insert new, update count, point to next 3) No match: Insert new entry */ /* compute hashes of all prefixes */ uint64_t fnv1a_hash = 14695981039346656037ULL; prefix_hashes[0] = fnv1a_hash; for (unsigned int j = 0; j < seqlen; j++) { fnv1a_hash ^= seq_up[j]; fnv1a_hash *= 1099511628211ULL; prefix_hashes[j + 1] = fnv1a_hash; } /* first, look for an identical match */ unsigned int prefix_len = seqlen; uint64_t hash = prefix_hashes[prefix_len]; struct bucket * bp = &hashtable[hash & hash_mask]; while ((bp->size) and ((bp->deleted) or (bp->hash != hash) or (prefix_len != db_getsequencelen(bp->seqno_first)) or (seqcmp(seq_up.data(), db_getsequence(bp->seqno_first), prefix_len)))) { ++bp; if (bp >= &hashtable[hashtablesize]) { bp = hashtable.data(); } } /* at this point, bp points either to (1) a free empty hash bucket, or (2) a bucket with an exact match. */ auto const orig_hash = hash; struct bucket * orig_bp = bp; if (bp->size) { /* exact match */ bp->size += ab; auto const last = bp->seqno_last; nextseqtab[last] = i; bp->seqno_last = i; maxsize = std::max(bp->size, maxsize); } else { /* look for prefix match */ while ((not bp->size) and (prefix_len > len_shortest)) { --prefix_len; hash = prefix_hashes[prefix_len]; bp = &hashtable[hash & hash_mask]; while ((bp->size) and ((bp->deleted) or (bp->hash != hash) or (prefix_len != db_getsequencelen(bp->seqno_first)) or (seqcmp(seq_up.data(), db_getsequence(bp->seqno_first), prefix_len)))) { ++bp; if (bp >= &hashtable[hashtablesize]) { bp = hashtable.data(); } } } if (bp->size) { /* prefix match */ /* get necessary info, then delete prefix from hash */ unsigned int const first = bp->seqno_first; unsigned int const last = bp->seqno_last; unsigned int const size = bp->size; bp->deleted = true; /* create new hash entry */ bp = orig_bp; bp->size = size + ab; bp->hash = orig_hash; bp->seqno_first = i; nextseqtab[i] = first; bp->seqno_last = last; 
maxsize = std::max(bp->size, maxsize); } else { /* no match */ orig_bp->size = ab; orig_bp->hash = orig_hash; orig_bp->seqno_first = i; orig_bp->seqno_last = i; maxsize = std::max(ab, maxsize); ++clusters; } } progress_update(i); } progress_done(); show_rusage(); progress_init("Sorting", 1); qsort(hashtable.data(), hashtablesize, sizeof(struct bucket), derep_compare_prefix); progress_done(); if (clusters > 0) { if (clusters % 2) { median = hashtable[(clusters - 1) / 2].size; } else { median = (hashtable[(clusters / 2) - 1].size + hashtable[clusters / 2].size) / 2.0; } } average = 1.0 * sumsize / clusters; if (clusters < 1) { if (not parameters.opt_quiet) { fprintf(stderr, "0 unique sequences\n"); } if (parameters.opt_log) { fprintf(fp_log, "0 unique sequences\n\n"); } } else { if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n", clusters, average, median, maxsize); } if (parameters.opt_log) { fprintf(fp_log, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n\n", clusters, average, median, maxsize); } } show_rusage(); /* count selected */ int64_t selected = 0; for (int64_t i = 0; i < clusters; i++) { struct bucket * bp = &hashtable[i]; int64_t const size = bp->size; if ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)) { ++selected; if (selected == parameters.opt_topn) { break; } } } /* write output */ if (parameters.opt_output) { progress_init("Writing output file", clusters); int64_t relabel_count = 0; for (int64_t i = 0; i < clusters; i++) { struct bucket * bp = &hashtable[i]; int64_t const size = bp->size; if ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)) { ++relabel_count; fasta_print_general(fp_output, nullptr, db_getsequence(bp->seqno_first), db_getsequencelen(bp->seqno_first), db_getheader(bp->seqno_first), db_getheaderlen(bp->seqno_first), size, relabel_count, -1.0, -1, -1, nullptr, 
0.0); if (relabel_count == parameters.opt_topn) { break; } } progress_update(i); } progress_done(); fclose(fp_output); } show_rusage(); if (parameters.opt_uc) { progress_init("Writing uc file, first part", clusters); for (int64_t i = 0; i < clusters; i++) { struct bucket * bp = &hashtable[i]; char * h = db_getheader(bp->seqno_first); int64_t const len = db_getsequencelen(bp->seqno_first); fprintf(fp_uc, "S\t%" PRId64 "\t%" PRId64 "\t*\t*\t*\t*\t*\t%s\t*\n", i, len, h); for (unsigned int next = nextseqtab[bp->seqno_first]; next != terminal; next = nextseqtab[next]) { fprintf(fp_uc, "H\t%" PRId64 "\t%" PRIu64 "\t%.1f\t+\t0\t0\t*\t%s\t%s\n", i, db_getsequencelen(next), 100.0, db_getheader(next), h); } progress_update(i); } progress_done(); show_rusage(); progress_init("Writing uc file, second part", clusters); for (int64_t i = 0; i < clusters; i++) { struct bucket * bp = &hashtable[i]; fprintf(fp_uc, "C\t%" PRId64 "\t%u\t*\t*\t*\t*\t*\t%s\t*\n", i, bp->size, db_getheader(bp->seqno_first)); progress_update(i); } fclose(fp_uc); progress_done(); show_rusage(); } if (selected < clusters) { if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } if (parameters.opt_log) { fprintf(fp_log, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } } db_free(); } vsearch-2.30.0/src/derep_prefix.h000066400000000000000000000047561476012147200166630ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto derep_prefix(struct Parameters const & parameters) -> void; vsearch-2.30.0/src/derep_smallmem.cc000066400000000000000000000444621476012147200173310ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "maps.h" #include // std::min, std::max #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::fprintf, std::fclose #include // std::qsort #include // std::memcpy, std::strcmp #include #include #include #define HASH hash_cityhash128 struct sm_bucket { uint128 hash; uint64_t size; }; static struct sm_bucket * hashtable = nullptr; static uint64_t hashtablesize = 0; auto find_median() -> double { /* find the median size, based on an iterative search starting at e.g. 
1 */ uint64_t cand = 1; /* candidate for the median */ uint64_t below = 0; /* closest value below the candidate */ uint64_t above = 0; /* closest value above the candidate */ uint64_t cand_count = 0; /* number of clusters with same size as cand */ uint64_t below_count = 0; /* number of clusters with smaller size than cand */ uint64_t above_count = 0; /* number of clusters with larger size than cand */ while (true) { cand_count = 0; below_count = 0; above_count = 0; for (uint64_t i = 0; i < hashtablesize; i++) { uint64_t const v = hashtable[i].size; if (v > 0) { if (v > cand) { if ((above_count == 0) or (v < above)) { above = v; } ++above_count; } else if (v < cand) { if ((below_count == 0) or (v > below)) { below = v; } ++below_count; } else { ++cand_count; } } } if (below_count + cand_count + above_count == 0U) { // fix -Wfloat-equal return 0; // unreachable? } if (above_count + cand_count >= below_count) // mid >= below_count { if (above_count <= below_count + cand_count) // mid <= below_count + cand_count { if (above_count == below_count + cand_count) // mid == below_count + cand_count // same as: // (below_count + cand_count + above_count) / 2 == below_count + cand_count // which simplifies into: // above_count == below_count + cand_count { return (cand + above) / 2.0; } else if (above_count + cand_count == below_count) // mid == below_count // same as: // (below_count + cand_count + above_count) / 2 == below_count // which simplifies into: // above_count + cand_count == below_count { return (below + cand) / 2.0; // cannot reach? } else { return cand; } } else { cand = above; } } else { cand = below; // cannot reach? 
} } } inline auto hash2bucket(uint128 hash, uint64_t htsize) -> uint64_t { return Uint128Low64(hash) % htsize; } inline auto next_bucket(uint64_t prev_bucket, uint64_t htsize) -> uint64_t { return (prev_bucket + 1) % htsize; } auto rehash_smallmem() -> void { /* allocate new hash table, 50% larger */ uint64_t const new_hashtablesize = 3 * hashtablesize / 2; auto * new_hashtable = (struct sm_bucket *) xmalloc(sizeof(struct sm_bucket) * new_hashtablesize); /* zero new hash table */ for (uint64_t j = 0; j < new_hashtablesize; j++) { new_hashtable[j].hash.first = 0; new_hashtable[j].hash.second = 0; new_hashtable[j].size = 0; } /* rehash all from old to new */ for (uint64_t i = 0; i < hashtablesize; i++) { struct sm_bucket * old_bp = hashtable + i; if (old_bp->size) { uint64_t k = hash2bucket(old_bp->hash, new_hashtablesize); while (new_hashtable[k].size) { k = next_bucket(k, new_hashtablesize); } struct sm_bucket * new_bp = new_hashtable + k; * new_bp = * old_bp; } } /* free old table */ xfree(hashtable); /* update variables */ hashtable = new_hashtable; hashtablesize = new_hashtablesize; } auto derep_smallmem(struct Parameters const & parameters) -> void { /* dereplicate full length sequences using a small amount of memory output options: --fastaout */ show_rusage(); auto * input_filename = parameters.opt_derep_smallmem; fastx_handle h = fastx_open(input_filename); if (not h) { fatal("Unrecognized input file type (not proper FASTA or FASTQ format)."); } if (h->is_pipe) { fatal("The derep_smallmem command does not support input from a pipe."); } std::FILE * fp_fastaout = nullptr; if (parameters.opt_fastaout) { fp_fastaout = fopen_output(parameters.opt_fastaout); if (not fp_fastaout) { fatal("Unable to open FASTA output file for writing"); } } else { fatal("Output file for dereplication must be specified with --fastaout"); } auto const filesize = fastx_get_size(h); /* allocate initial memory for sequences of length up to 1023 chars */ int64_t alloc_seqlen = 1024; /* 
allocate initial hashtable with 1024 buckets */ hashtablesize = 1024; hashtable = (struct sm_bucket *) xmalloc(sizeof(struct sm_bucket) * hashtablesize); /* zero hash table */ for (uint64_t j = 0; j < hashtablesize; j++) { hashtable[j].hash.first = 0; hashtable[j].hash.second = 0; hashtable[j].size = 0; } show_rusage(); std::vector seq_up(alloc_seqlen + 1); std::vector rc_seq_up(alloc_seqlen + 1); std::string prompt = std::string("Dereplicating file ") + input_filename; progress_init(prompt.c_str(), filesize); uint64_t sequencecount = 0; uint64_t nucleotidecount = 0; int64_t shortest = std::numeric_limits::max(); int64_t longest = 0; uint64_t discarded_short = 0; uint64_t discarded_long = 0; uint64_t clusters = 0; int64_t sumsize = 0; uint64_t maxsize = 0; /* first pass */ while (fastx_next(h, not parameters.opt_notrunclabels, chrmap_no_change)) { int64_t const seqlen = fastx_get_sequence_length(h); if (seqlen < parameters.opt_minseqlength) { ++discarded_short; continue; } if (seqlen > parameters.opt_maxseqlength) { ++discarded_long; continue; } nucleotidecount += seqlen; longest = std::max(seqlen, longest); shortest = std::min(seqlen, shortest); /* check allocations */ if (seqlen > alloc_seqlen) { alloc_seqlen = seqlen; seq_up.resize(alloc_seqlen + 1); rc_seq_up.resize(alloc_seqlen + 1); show_rusage(); } if (100 * (clusters + 1) > 95 * hashtablesize) { // keep hash table fill rate at max 95% */ rehash_smallmem(); show_rusage(); } char * seq = fastx_get_sequence(h); /* normalize sequence: uppercase and replace U by T */ string_normalize(seq_up.data(), seq, seqlen); /* reverse complement if necessary */ if (parameters.opt_strand) { reverse_complement(rc_seq_up.data(), seq_up.data(), seqlen); } /* Find free bucket or bucket for identical sequence. Make sure sequences are exactly identical in case of any hash collision. With 64-bit hashes, there is about 50% chance of a collision when the number of sequences is about 5e9. 
*/ uint128 const hash = HASH(seq_up.data(), seqlen); uint64_t j = hash2bucket(hash, hashtablesize); struct sm_bucket * bp = hashtable + j; while ((bp->size) and (hash != bp->hash)) { j = next_bucket(j, hashtablesize); bp = hashtable + j; } if (parameters.opt_strand and not bp->size) { /* no match on plus strand */ /* check minus strand as well */ uint128 const rc_hash = HASH(rc_seq_up.data(), seqlen); uint64_t k = hash2bucket(rc_hash, hashtablesize); struct sm_bucket * rc_bp = hashtable + k; while ((rc_bp->size) and (rc_hash != rc_bp->hash)) { k = next_bucket(k, hashtablesize); rc_bp = hashtable + k; } if (rc_bp->size) { bp = rc_bp; j = k; } } int const abundance = fastx_get_abundance(h); int64_t const ab = parameters.opt_sizein ? abundance : 1; sumsize += ab; if (bp->size) { /* at least one identical sequence already */ bp->size += ab; } else { /* no identical sequences yet */ bp->size = ab; bp->hash = hash; ++clusters; } maxsize = std::max(bp->size, maxsize); ++sequencecount; progress_update(fastx_get_position(h)); } progress_done(); fastx_close(h); show_rusage(); if (not parameters.opt_quiet) { if (sequencecount > 0) { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", nucleotidecount, sequencecount, shortest, longest, nucleotidecount * 1.0 / sequencecount); } else { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs\n", nucleotidecount, sequencecount); } } if (parameters.opt_log) { if (sequencecount > 0) { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", nucleotidecount, sequencecount, shortest, longest, nucleotidecount * 1.0 / sequencecount); } else { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs\n", nucleotidecount, sequencecount); } } if (discarded_short) { fprintf(stderr, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", parameters.opt_minseqlength, discarded_short, (discarded_short == 1 ? 
"sequence" : "sequences")); if (parameters.opt_log) { fprintf(fp_log, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", parameters.opt_minseqlength, discarded_short, (discarded_short == 1 ? "sequence" : "sequences")); } } if (discarded_long) { fprintf(stderr, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", parameters.opt_maxseqlength, discarded_long, (discarded_long == 1 ? "sequence" : "sequences")); if (parameters.opt_log) { fprintf(fp_log, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", parameters.opt_maxseqlength, discarded_long, (discarded_long == 1 ? "sequence" : "sequences")); } } show_rusage(); if (clusters < 1) { if (not parameters.opt_quiet) { fprintf(stderr, "0 unique sequences\n"); } if (parameters.opt_log) { fprintf(fp_log, "0 unique sequences\n\n"); } } else { const double average = 1.0 * sumsize / clusters; const auto median = find_median(); if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n", clusters, average, median, maxsize); } if (parameters.opt_log) { fprintf(fp_log, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n\n", clusters, average, median, maxsize); } } show_rusage(); /* second pass with output */ fastx_handle h2 = fastx_open(input_filename); if (not h2) { fatal("Cannot open and read from the input file."); } progress_init("Writing FASTA output file", filesize); uint64_t selected = 0; while (fastx_next(h2, not parameters.opt_notrunclabels, chrmap_no_change)) { int64_t const seqlen = fastx_get_sequence_length(h2); if ((seqlen < parameters.opt_minseqlength) or (seqlen > parameters.opt_maxseqlength)) { continue; } char * seq = fastx_get_sequence(h2); /* normalize sequence: uppercase and replace U by T */ string_normalize(seq_up.data(), seq, seqlen); /* reverse complement if necessary */ if (parameters.opt_strand) { reverse_complement(rc_seq_up.data(), seq_up.data(), seqlen); } uint128 const hash = 
HASH(seq_up.data(), seqlen); uint64_t j = hash2bucket(hash, hashtablesize); struct sm_bucket * bp = hashtable + j; while ((bp->size) and (hash != bp->hash)) { j = next_bucket(j, hashtablesize); bp = hashtable + j; } if (parameters.opt_strand and not bp->size) { /* no match on plus strand */ /* check minus strand as well */ uint128 const rc_hash = HASH(rc_seq_up.data(), seqlen); uint64_t k = hash2bucket(rc_hash, hashtablesize); struct sm_bucket * rc_bp = hashtable + k; while ((rc_bp->size) and (rc_hash != rc_bp->hash)) { k = next_bucket(k, hashtablesize); rc_bp = hashtable + k; } if (rc_bp->size) { bp = rc_bp; j = k; } } int64_t const size = bp->size; if (size > 0) { /* print sequence */ char * header = fastx_get_header(h2); int const headerlen = fastx_get_header_length(h2); if ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)) { ++selected; fasta_print_general(fp_fastaout, nullptr, seq, seqlen, header, headerlen, size, selected, -1.0, -1, -1, nullptr, 0.0); } bp->size = -1; } progress_update(fastx_get_position(h2)); } progress_done(); fastx_close(h2); fclose(fp_fastaout); show_rusage(); if (selected < clusters) { if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } if (parameters.opt_log) { fprintf(fp_log, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } } show_rusage(); xfree(hashtable); show_rusage(); } vsearch-2.30.0/src/derep_smallmem.h000066400000000000000000000047601476012147200171700ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Dereplicate the full length sequences of the file named by
   parameters.opt_derep_smallmem and write the selected clusters in
   FASTA format to parameters.opt_fastaout. The input is read twice and
   must not be a pipe. Terminates through fatal() when the input cannot
   be opened or when no FASTA output file is given. */
auto derep_smallmem(struct Parameters const & parameters) -> void;

vsearch-2.30.0/src/dynlibs.cc000066400000000000000000000114161476012147200160000ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1.
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "dynlibs.h" #include // std::FILE #ifdef HAVE_ZLIB_H # ifdef _WIN32 const char gz_libname[] = "zlib1.dll"; HMODULE gz_lib; # else # ifdef __APPLE__ const char gz_libname[] = "libz.dylib"; # else const char gz_libname[] = "libz.so.1"; # endif void * gz_lib; # endif gzFile ZEXPORT (*gzdopen_p) OF((int, const char *)); int ZEXPORT (*gzclose_p) OF((gzFile)); int ZEXPORT (*gzread_p) OF((gzFile, void *, unsigned)); #endif #ifdef HAVE_BZLIB_H # ifdef _WIN32 const char bz2_libname[] = "libbz2.dll"; HMODULE bz2_lib; # else # ifdef __APPLE__ const char bz2_libname[] = "libbz2.dylib"; # else const char bz2_libname[] = "libbz2.so.1"; # endif void * bz2_lib; # endif BZFILE* (*BZ2_bzReadOpen_p)(int*, FILE*, int, int, void*, int); void (*BZ2_bzReadClose_p)(int*, BZFILE*); int (*BZ2_bzRead_p)(int*, BZFILE*, void*, int); #endif void dynlibs_open() { #ifdef HAVE_ZLIB_H #ifdef _WIN32 gz_lib = LoadLibraryA(gz_libname); #else gz_lib = dlopen(gz_libname, RTLD_LAZY); #endif if (gz_lib) { gzdopen_p = (gzFile (*)(int, const char*)) arch_dlsym(gz_lib, "gzdopen"); gzclose_p = (int (*)(gzFile)) arch_dlsym(gz_lib, "gzclose"); gzread_p = (int (*)(gzFile, void*, unsigned)) arch_dlsym(gz_lib, "gzread"); if (not (gzdopen_p && gzclose_p && gzread_p)) { fatal("Invalid compression library (zlib)"); } } #endif #ifdef HAVE_BZLIB_H #ifdef _WIN32 bz2_lib = LoadLibraryA(bz2_libname); #else bz2_lib = dlopen(bz2_libname, RTLD_LAZY); #endif if (bz2_lib) { BZ2_bzReadOpen_p = (BZFILE* (*)(int*, FILE*, int, int, void*, int)) arch_dlsym(bz2_lib, "BZ2_bzReadOpen"); BZ2_bzReadClose_p = (void (*)(int*, BZFILE*)) arch_dlsym(bz2_lib, "BZ2_bzReadClose"); BZ2_bzRead_p = (int (*)(int*, BZFILE*, void*, int)) arch_dlsym(bz2_lib, "BZ2_bzRead"); if (not (BZ2_bzReadOpen_p && BZ2_bzReadClose_p && BZ2_bzRead_p)) { fatal("Invalid compression library (bz2)"); } } #endif } void dynlibs_close() { #ifdef HAVE_ZLIB_H if (gz_lib) { #ifdef _WIN32 FreeLibrary(gz_lib); #else dlclose(gz_lib); #endif } 
gz_lib = nullptr; #endif #ifdef HAVE_BZLIB_H if (bz2_lib) { #ifdef _WIN32 FreeLibrary(bz2_lib); #else dlclose(bz2_lib); #endif } bz2_lib = nullptr; #endif } vsearch-2.30.0/src/dynlibs.h000066400000000000000000000061721476012147200156450ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Handle of the zlib library and pointers to its functions.
   NOTE(review): gzgetc_p, gzrewind_p, gzungetc_p and gzerror_p are
   declared here, but the dynlibs.cc visible next to this header only
   defines and resolves gzdopen_p, gzclose_p and gzread_p -- confirm
   that the other four are not used anywhere. */
#ifdef HAVE_ZLIB_H
#ifdef _WIN32
extern HMODULE gz_lib;
#else
extern void * gz_lib;
#endif
extern gzFile (*gzdopen_p)(int, const char *);
extern int (*gzclose_p)(gzFile);
extern int (*gzread_p)(gzFile, void*, unsigned);
extern int (*gzgetc_p)(gzFile);
extern int (*gzrewind_p)(gzFile);
extern int (*gzungetc_p)(int, gzFile);
extern const char * (*gzerror_p)(gzFile, int*);
#endif

/* Handle of the bzip2 library and pointers to its read functions. */
#ifdef HAVE_BZLIB_H
#ifdef _WIN32
extern HMODULE bz2_lib;
#else
extern void * bz2_lib;
#endif
extern BZFILE* (*BZ2_bzReadOpen_p)(int*, FILE*, int, int, void*, int);
extern void (*BZ2_bzReadClose_p)(int*, BZFILE*);
extern int (*BZ2_bzRead_p)(int*, BZFILE*, void*, int);
#endif

/* Load the available compression libraries and resolve the pointers
   declared above. */
auto dynlibs_open() -> void;

/* Unload the compression libraries and reset their handles. */
auto dynlibs_close() -> void;

vsearch-2.30.0/src/eestats.cc000066400000000000000000000375601476012147200160140ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved.
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "maps.h" #include // std::max, std::min #include // macros PRIu64 and PRId64 #include // std::pow #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose #include // std::exit, EXIT_FAILURE #include // std::memset #include #include inline auto fastq_get_qual_eestats(char q) -> int { int const qual = q - opt_fastq_ascii; if (qual < opt_fastq_qmin) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", qual, opt_fastq_qmin); if (fp_log) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", qual, opt_fastq_qmin); } exit(EXIT_FAILURE); } else if (qual > opt_fastq_qmax) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", qual, opt_fastq_qmax); fprintf(stderr, "By default, quality values range from 0 to 41.\n" "To allow higher quality values, " "please use the option --fastq_qmax %d\n", qual); if (fp_log) { fprintf(fp_log, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", qual, opt_fastq_qmax); fprintf(fp_log, "By default, quality values range from 0 to 41.\n" "To allow higher quality values, " "please use the option --fastq_qmax %d\n", qual); } exit(EXIT_FAILURE); } return qual; } auto q2p(int quality_value) -> double { static constexpr auto base = 10.0; return std::pow(base, -quality_value / base); } auto ee_start(int pos, int resolution) -> int64_t { return pos * (resolution * 
(pos + 1) + 2) / 2; } auto fastq_eestats() -> void { if (not opt_output) { fatal("Output file for fastq_eestats must be specified with --output"); } fastx_handle h = fastq_open(opt_fastq_eestats); uint64_t const filesize = fastq_get_size(h); std::FILE * fp_output = nullptr; if (opt_output) { fp_output = fopen_output(opt_output); if (not fp_output) { fatal("Unable to open output file for writing"); } } progress_init("Reading FASTQ file", filesize); uint64_t seq_count = 0; int64_t len_alloc = 10; const int resolution = 1000; int const max_quality = opt_fastq_qmax - opt_fastq_qmin + 1; int64_t ee_size = ee_start(len_alloc, resolution); std::vector read_length_table(len_alloc); std::vector qual_length_table(len_alloc * (max_quality + 1)); std::vector ee_length_table(ee_size); std::vector sum_ee_length_table(len_alloc); std::vector sum_pe_length_table(len_alloc); int64_t len_min = std::numeric_limits::max(); int64_t len_max = 0; while (fastq_next(h, false, chrmap_upcase)) { ++seq_count; int64_t const len = fastq_get_sequence_length(h); char * q = fastq_get_quality(h); /* update length statistics */ int64_t const new_alloc = len + 1; if (new_alloc > len_alloc) { int64_t const new_ee_size = ee_start(new_alloc, resolution); read_length_table.resize(new_alloc); qual_length_table.resize(new_alloc * (max_quality + 1)); ee_length_table.resize(new_ee_size); sum_ee_length_table.resize(new_alloc); sum_pe_length_table.resize(new_alloc); len_alloc = new_alloc; ee_size = new_ee_size; } len_min = std::min(len, len_min); len_max = std::max(len, len_max); /* update quality statistics */ double ee = 0.0; for (int64_t i = 0; i < len; i++) { ++read_length_table[i]; /* quality score */ auto const qual = std::max(fastq_get_qual_eestats(q[i]), 0); ++qual_length_table[((max_quality + 1) * i) + qual]; /* probability of error (Pe) */ auto const probability_of_error = q2p(qual); sum_pe_length_table[i] += probability_of_error; /* expected number of errors */ ee += probability_of_error; auto const 
e_int = std::min(resolution * (i + 1), (int) (resolution * ee)); ++ee_length_table[ee_start(i, resolution) + e_int]; sum_ee_length_table[i] += ee; } progress_update(fastq_get_position(h)); } progress_done(); fprintf(fp_output, "Pos\tRecs\tPctRecs\t" "Min_Q\tLow_Q\tMed_Q\tMean_Q\tHi_Q\tMax_Q\t" "Min_Pe\tLow_Pe\tMed_Pe\tMean_Pe\tHi_Pe\tMax_Pe\t" "Min_EE\tLow_EE\tMed_EE\tMean_EE\tHi_EE\tMax_EE\n"); for (int64_t i = 0; i < len_max; i++) { int64_t const reads = read_length_table[i]; double const pctrecs = 100.0 * reads / seq_count; /* q */ double min_q = -1.0; double low_q = -1.0; double med_q = -1.0; double hi_q = -1.0; double max_q = -1.0; double qsum = 0; double n = 0; for (int q = 0; q <= max_quality; q++) { double const x = qual_length_table[((max_quality + 1) * i) + q]; if (x > 0) { qsum += q * x; n += x; if (min_q < 0) { min_q = q; } if ((low_q < 0) && (n >= 0.25 * reads)) { low_q = q; } if ((med_q < 0) && (n >= 0.50 * reads)) { med_q = q; } if ((hi_q < 0) && (n >= 0.75 * reads)) { hi_q = q; } max_q = q; } } double const mean_q = 1.0 * qsum / reads; /* pe */ double min_pe = -1.0; double low_pe = -1.0; double med_pe = -1.0; double hi_pe = -1.0; double max_pe = -1.0; double pesum = 0; n = 0; for (int q = max_quality; q >= 0; q--) { double const x = qual_length_table[((max_quality + 1) * i) + q]; if (x > 0) { double const pe = q2p(q); pesum += pe * x; n += x; if (min_pe < 0) { min_pe = pe; } if ((low_pe < 0) && (n >= 0.25 * reads)) { low_pe = pe; } if ((med_pe < 0) && (n >= 0.50 * reads)) { med_pe = pe; } if ((hi_pe < 0) && (n >= 0.75 * reads)) { hi_pe = pe; } max_pe = pe; } } double const mean_pe = 1.0 * pesum / reads; /* expected errors */ double min_ee = -1.0; double low_ee = -1.0; double med_ee = -1.0; double hi_ee = -1.0; double max_ee = -1.0; int64_t const ee_offset = ee_start(i, resolution); int64_t const max_errors = resolution * (i + 1); n = 0; for (int64_t e = 0; e <= max_errors; e++) { int64_t const x = ee_length_table[ee_offset + e]; if (x > 0) { n += x; 
if (min_ee < 0) { min_ee = e; } if ((low_ee < 0) && (n >= 0.25 * reads)) { low_ee = e; } if ((med_ee < 0) && (n >= 0.50 * reads)) { med_ee = e; } if ((hi_ee < 0) && (n >= 0.75 * reads)) { hi_ee = e; } max_ee = e; } } double const mean_ee = sum_ee_length_table[i] / reads; min_ee = (min_ee + 0.5) / resolution; low_ee = (low_ee + 0.5) / resolution; med_ee = (med_ee + 0.5) / resolution; hi_ee = (hi_ee + 0.5) / resolution; max_ee = (max_ee + 0.5) / resolution; fprintf(fp_output, "%" PRId64 "\t%" PRId64 "\t%.1lf" "\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.1lf" "\t%.2lg\t%.2lg\t%.2lg\t%.2lg\t%.2lg\t%.2lg" "\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\n", i + 1, reads, pctrecs, min_q, low_q, med_q, mean_q, hi_q, max_q, min_pe, low_pe, med_pe, mean_pe, hi_pe, max_pe, min_ee, low_ee, med_ee, mean_ee, hi_ee, max_ee); } fclose(fp_output); fastq_close(h); } auto fastq_eestats2() -> void { if (! opt_output) { fatal("Output file for fastq_eestats2 must be specified with --output"); } fastx_handle h = fastq_open(opt_fastq_eestats2); uint64_t const filesize = fastq_get_size(h); std::FILE * fp_output = nullptr; if (opt_output) { fp_output = fopen_output(opt_output); if (! 
fp_output) { fatal("Unable to open output file for writing"); } } progress_init("Reading FASTQ file", filesize); uint64_t seq_count = 0; uint64_t symbols = 0; uint64_t longest = 0; int len_steps = 0; std::vector count_table; while (fastq_next(h, false, chrmap_upcase)) { ++seq_count; uint64_t const len = fastq_get_sequence_length(h); char * q = fastq_get_quality(h); /* update length statistics */ if (len > longest) { longest = len; // opt_length_cutoffs_longest is an int between 1 and INT_MAX int const high = MIN(longest, (uint64_t) (opt_length_cutoffs_longest)); int const new_len_steps = 1 + MAX(0, ((high - opt_length_cutoffs_shortest) / opt_length_cutoffs_increment)); if (new_len_steps > len_steps) { count_table.resize(new_len_steps * opt_ee_cutoffs_count); len_steps = new_len_steps; } } /* update quality statistics */ symbols += len; double ee = 0.0; for (uint64_t i = 0; i < len; i++) { /* quality score */ auto const qual = std::max(fastq_get_qual_eestats(q[i]), 0); auto const pe = q2p(qual); ee += pe; for (int x = 0; x < len_steps; x++) { uint64_t const len_cutoff = opt_length_cutoffs_shortest + (x * opt_length_cutoffs_increment); if (i + 1 == len_cutoff) { for (int y = 0; y < opt_ee_cutoffs_count; y++) { if (ee <= opt_ee_cutoffs_values[y]) { ++count_table[(x * opt_ee_cutoffs_count) + y]; } } } } } progress_update(fastq_get_position(h)); } progress_done(); fprintf(fp_output, "%" PRIu64 " reads", seq_count); if (seq_count > 0) { fprintf(fp_output, ", max len %" PRIu64 ", avg %.1f", longest, 1.0 * symbols / seq_count); } fprintf(fp_output, "\n\n"); fprintf(fp_output, "Length"); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_output, " MaxEE %.2f", opt_ee_cutoffs_values[y]); } fprintf(fp_output, "\n"); fprintf(fp_output, "------"); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_output, " ----------------"); } fprintf(fp_output, "\n"); for (int x = 0; x < len_steps; x++) { int const len_cutoff = opt_length_cutoffs_shortest + (x * 
opt_length_cutoffs_increment); if (len_cutoff > opt_length_cutoffs_longest) { break; } fprintf(fp_output, "%6d", len_cutoff); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_output, " %8" PRIu64 "(%5.1f%%)", count_table[(x * opt_ee_cutoffs_count) + y], 100.0 * count_table[(x * opt_ee_cutoffs_count) + y] / seq_count); } fprintf(fp_output, "\n"); } if (fp_log) { fprintf(fp_log, "%" PRIu64 " reads, max len %" PRIu64 ", avg %.1f\n\n", seq_count, longest, 1.0 * symbols / seq_count); fprintf(fp_log, "Length"); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_log, " MaxEE %.2f", opt_ee_cutoffs_values[y]); } fprintf(fp_log, "\n"); fprintf(fp_log, "------"); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_log, " ----------------"); } fprintf(fp_log, "\n"); for (int x = 0; x < len_steps; x++) { int const len_cutoff = opt_length_cutoffs_shortest + (x * opt_length_cutoffs_increment); if (len_cutoff > opt_length_cutoffs_longest) { break; } fprintf(fp_log, "%6d", len_cutoff); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_log, " %8" PRIu64 "(%5.1f%%)", count_table[(x * opt_ee_cutoffs_count) + y], 100.0 * count_table[(x * opt_ee_cutoffs_count) + y] / seq_count); } fprintf(fp_log, "\n"); } } fclose(fp_output); fastq_close(h); } vsearch-2.30.0/src/eestats.h000066400000000000000000000047521476012147200156530ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ auto fastq_eestats() -> void; auto fastq_eestats2() -> void; vsearch-2.30.0/src/fasta.cc000066400000000000000000000343001476012147200154270ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "attributes.h" #include "maps.h" #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::size_t #include // std::memchr auto fasta_open(const char * filename) -> fastx_handle { fastx_handle h = fastx_open(filename); if (fastx_is_fastq(h) and not h->is_empty) { fatal("FASTA file expected, FASTQ file found (%s)", filename); } return h; } auto fasta_close(fastx_handle h) -> void { fastx_close(h); } auto fasta_filter_sequence(fastx_handle h, unsigned int * char_action, const unsigned char * char_mapping) -> void { /* Strip unwanted characters from the sequence and raise warnings or errors on certain characters. 
*/ char * p = h->sequence_buffer.data; char * q = p; char c = '\0'; char msg[200]; while ((c = *p++)) { char const m = char_action[(unsigned char) c]; switch (m) { case 0: /* stripped */ h->stripped_all++; h->stripped[(unsigned char) c]++; break; case 1: /* legal character */ *q++ = char_mapping[(unsigned char) (c)]; break; case 2: /* fatal character */ if ((c >= 32) && (c < 127)) { snprintf(msg, 200, "Illegal character '%c' in sequence on line %" PRIu64 " of FASTA file", (unsigned char) c, h->lineno); } else { snprintf(msg, 200, "Illegal unprintable ASCII character no %d in sequence on line %" PRIu64 " of FASTA file", (unsigned char) c, h->lineno); } fatal(msg); break; case 3: /* silently stripped chars (whitespace) */ break; case 4: /* newline (silently stripped) */ h->lineno++; break; } } /* add zero after sequence */ *q = 0; h->sequence_buffer.length = q - h->sequence_buffer.data; } auto fasta_next(fastx_handle h, bool truncateatspace, const unsigned char * char_mapping) -> bool { h->lineno_start = h->lineno; h->header_buffer.length = 0; h->header_buffer.data[0] = 0; h->sequence_buffer.length = 0; h->sequence_buffer.data[0] = 0; uint64_t rest = fastx_file_fill_buffer(h); if (rest == 0) { return false; } /* read header */ /* check initial > character */ if (h->file_buffer.data[h->file_buffer.position] != '>') { fprintf(stderr, "Found character %02x\n", (unsigned char)(h->file_buffer.data[h->file_buffer.position])); fatal("Invalid FASTA - header must start with > character"); } h->file_buffer.position++; --rest; char * lf = nullptr; while (lf == nullptr) { /* get more data if buffer empty*/ rest = fastx_file_fill_buffer(h); if (rest == 0) { fatal("Invalid FASTA - header must be terminated with newline"); } /* find LF */ lf = (char *) memchr(h->file_buffer.data + h->file_buffer.position, '\n', rest); /* copy to header buffer */ uint64_t len = rest; if (lf != nullptr) { /* LF found, copy up to and including LF */ len = lf - (h->file_buffer.data + 
h->file_buffer.position) + 1; h->lineno++; } buffer_extend(& h->header_buffer, h->file_buffer.data + h->file_buffer.position, len); h->file_buffer.position += len; rest -= len; } /* read one or more sequence lines */ while (true) { /* get more data, if necessary */ rest = fastx_file_fill_buffer(h); /* end if no more data */ if (rest == 0) { break; } /* end if new sequence starts */ if ((lf != nullptr) && (h->file_buffer.data[h->file_buffer.position] == '>')) { break; } /* find LF */ lf = (char *) memchr(h->file_buffer.data + h->file_buffer.position, '\n', rest); uint64_t len = rest; if (lf != nullptr) { /* LF found, copy up to and including LF */ len = lf - (h->file_buffer.data + h->file_buffer.position) + 1; } buffer_extend(& h->sequence_buffer, h->file_buffer.data + h->file_buffer.position, len); h->file_buffer.position += len; rest -= len; } ++h->seqno; fastx_filter_header(h, truncateatspace); fasta_filter_sequence(h, char_fasta_action, char_mapping); return true; } auto fasta_get_abundance(fastx_handle h) -> int64_t { // return 1 if not present int64_t const size = header_get_size(h->header_buffer.data, h->header_buffer.length); if (size > 0) { return size; } else { return 1; } } auto fasta_get_abundance_and_presence(fastx_handle h) -> int64_t { // return 0 if not present return header_get_size(h->header_buffer.data, h->header_buffer.length); } auto fasta_get_position(fastx_handle h) -> uint64_t { return h->file_position; } auto fasta_get_size(fastx_handle h) -> uint64_t { return h->file_size; } auto fasta_get_lineno(fastx_handle h) -> uint64_t { return h->lineno_start; } auto fasta_get_seqno(fastx_handle h) -> uint64_t { return h->seqno; } auto fasta_get_header_length(fastx_handle h) -> uint64_t { return h->header_buffer.length; } auto fasta_get_sequence_length(fastx_handle h) -> uint64_t { return h->sequence_buffer.length; } auto fasta_get_header(fastx_handle h) -> char * { return h->header_buffer.data; } auto fasta_get_sequence(fastx_handle h) -> char * { 
return h->sequence_buffer.data; } /* fasta output */ auto fasta_print_sequence(std::FILE * fp, char * seq, uint64_t len, int width) -> void { /* The actual length of the sequence may be longer than "len", but only "len" characters are printed. Specify width of lines - zero (or <1) means linearize (all on one line). */ if (width < 1) { fprintf(fp, "%.*s\n", (int) (len), seq); } else { int64_t rest = len; for (uint64_t i = 0; i < len; i += width) { fprintf(fp, "%.*s\n", (int) (MIN(rest, width)), seq + i); rest -= width; } } } auto fasta_print(std::FILE * fp, const char * hdr, char * seq, uint64_t len) -> void { fprintf(fp, ">%s\n", hdr); fasta_print_sequence(fp, seq, len, opt_fasta_width); } inline auto fprint_seq_label(std::FILE * fp, char * seq, int len) -> void { /* normalize first? */ fprintf(fp, "%.*s", len, seq); } auto fasta_print_general(std::FILE * output_handle, const char * prefix, char * seq, int len, char * header, int header_length, unsigned int abundance, int ordinal, double ee, int clustersize, int clusterid, const char * score_name, double score) -> void { fprintf(output_handle, ">"); if (prefix != nullptr) { fprintf(output_handle, "%s", prefix); } if (opt_relabel_self) { fprint_seq_label(output_handle, seq, len); } else if (opt_relabel_sha1) { fprint_seq_digest_sha1(output_handle, seq, len); } else if (opt_relabel_md5) { fprint_seq_digest_md5(output_handle, seq, len); } else if ((opt_relabel != nullptr) && (ordinal > 0)) { fprintf(output_handle, "%s%d", opt_relabel, ordinal); } else { bool const strip_size = opt_xsize || (opt_sizeout && (abundance > 0)); bool const strip_ee = opt_xee || ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0)); bool const strip_length = opt_xlength || opt_lengthout; header_fprint_strip(output_handle, header, header_length, strip_size, strip_ee, strip_length); } if (opt_label_suffix != nullptr) { fprintf(output_handle, "%s", opt_label_suffix); } if (opt_sample != nullptr) { fprintf(output_handle, ";sample=%s", opt_sample); } 
if (clustersize > 0) { fprintf(output_handle, ";seqs=%d", clustersize); } if (clusterid >= 0) { fprintf(output_handle, ";clusterid=%d", clusterid); } if (opt_sizeout && (abundance > 0)) { fprintf(output_handle, ";size=%u", abundance); } if ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0)) { if (ee < 0.000000001) { fprintf(output_handle, ";ee=%.13lf", ee); } else if (ee < 0.00000001) { fprintf(output_handle, ";ee=%.12lf", ee); } else if (ee < 0.0000001) { fprintf(output_handle, ";ee=%.11lf", ee); } else if (ee < 0.000001) { fprintf(output_handle, ";ee=%.10lf", ee); } else if (ee < 0.00001) { fprintf(output_handle, ";ee=%.9lf", ee); } else if (ee < 0.0001) { fprintf(output_handle, ";ee=%.8lf", ee); } else if (ee < 0.001) { fprintf(output_handle, ";ee=%.7lf", ee); } else if (ee < 0.01) { fprintf(output_handle, ";ee=%.6lf", ee); } else if (ee < 0.1) { fprintf(output_handle, ";ee=%.5lf", ee); } else { fprintf(output_handle, ";ee=%.4lf", ee); } } if (opt_lengthout) { fprintf(output_handle, ";length=%d", len); } if (score_name != nullptr) { fprintf(output_handle, ";%s=%.4lf", score_name, score); } if (opt_relabel_keep && (((opt_relabel != nullptr) && (ordinal > 0)) || opt_relabel_sha1 || opt_relabel_md5 || opt_relabel_self)) { fprintf(output_handle, " %s", header); } fprintf(output_handle, "\n"); if (seq != nullptr) { fasta_print_sequence(output_handle, seq, len, opt_fasta_width); } } auto fasta_print_db_relabel(std::FILE * fp, uint64_t seqno, int ordinal) -> void { fasta_print_general(fp, nullptr, db_getsequence(seqno), db_getsequencelen(seqno), db_getheader(seqno), db_getheaderlen(seqno), db_getabundance(seqno), ordinal, -1.0, -1, -1, nullptr, 0.0); } auto fasta_print_db_relabel(std::FILE * fp, uint64_t seqno, std::size_t ordinal) -> void { fasta_print_general(fp, nullptr, db_getsequence(seqno), db_getsequencelen(seqno), db_getheader(seqno), db_getheaderlen(seqno), db_getabundance(seqno), static_cast(ordinal), -1.0, -1, -1, nullptr, 0.0); } auto fasta_print_db(std::FILE * 
fp, uint64_t seqno) -> void { fasta_print_general(fp, nullptr, db_getsequence(seqno), db_getsequencelen(seqno), db_getheader(seqno), db_getheaderlen(seqno), db_getabundance(seqno), 0, -1.0, -1, -1, nullptr, 0.0); } vsearch-2.30.0/src/fasta.h000066400000000000000000000105541476012147200152760ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // std::FILE #include // uint64_t /* fasta input */ auto fasta_open_rest(fastx_handle h) -> void; auto fasta_open(const char * filename) -> fastx_handle; auto fasta_close(fastx_handle h) -> void; auto fasta_next(fastx_handle h, bool truncateatspace, const unsigned char * char_mapping) -> bool; auto fasta_get_position(fastx_handle h) -> uint64_t; auto fasta_get_size(fastx_handle h) -> uint64_t; auto fasta_get_lineno(fastx_handle h) -> uint64_t; auto fasta_get_seqno(fastx_handle h) -> uint64_t; auto fasta_get_header(fastx_handle h) -> char *; auto fasta_get_sequence(fastx_handle h) -> char *; auto fasta_get_header_length(fastx_handle h) -> uint64_t; auto fasta_get_sequence_length(fastx_handle h) -> uint64_t; auto fasta_get_abundance(fastx_handle h) -> int64_t; auto fasta_get_abundance_and_presence(fastx_handle h) -> int64_t; /* fasta output */ auto fasta_print(std::FILE * fp, const char * hdr, char * seq, uint64_t len) -> void; auto fasta_print_general(std::FILE * output_handle, const char * prefix, char * seq, int len, char * header, int header_length, unsigned int abundance, int ordinal, double ee, int clustersize, int clusterid, const char * score_name, double score) -> void; auto fasta_print_db(std::FILE * fp, 
uint64_t seqno) -> void; auto fasta_print_db_relabel(std::FILE * fp, uint64_t seqno, int ordinal) -> void; auto fasta_print_db_relabel(std::FILE * fp, uint64_t seqno, std::size_t ordinal) -> void; vsearch-2.30.0/src/fasta2fastq.cc000066400000000000000000000107461476012147200165600ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/maps.hpp" #include #include // std::FILE, std::size_t, std::fclose #include auto fasta2fastq(struct Parameters const & parameters) -> void { auto const max_ascii_value = static_cast(parameters.opt_fastq_asciiout + parameters.opt_fastq_qmaxout); assert(parameters.opt_fastqout != nullptr); // check performed in auto * fp_input = fasta_open(parameters.opt_fasta2fastq); assert(fp_input != nullptr); // check performed in fasta_open(fastx_open()) auto * fp_fastqout = fopen_output(parameters.opt_fastqout); if (fp_fastqout == nullptr) { fatal("Unable to open FASTQ output file for writing"); } static constexpr auto initial_length = 1024U; std::vector quality(initial_length, max_ascii_value); progress_init("Converting FASTA file to FASTQ", fasta_get_size(fp_input)); auto counter = 0; while (fasta_next(fp_input, false, chrmap_no_change_vector.data())) { /* get sequence length and allocate more mem if necessary */ auto const length = fastq_get_sequence_length(fp_input); if (quality.size() < length + 1) { quality.resize(length + 1, max_ascii_value); } // note: adding '\0' and the end of the quality string is not necessary, // fastq_print_general() uses 'length' for both sequence and quality 
++counter; /* write to fastq file */ fastq_print_general(fp_fastqout, fastq_get_sequence(fp_input), static_cast(length), fasta_get_header(fp_input), static_cast(fasta_get_header_length(fp_input)), quality.data(), static_cast(fastq_get_abundance(fp_input)), counter, -1.0); progress_update(fasta_get_position(fp_input)); } progress_done(); if (fp_fastqout != nullptr) { static_cast(std::fclose(fp_fastqout)); } fasta_close(fp_input); } vsearch-2.30.0/src/fasta2fastq.h000066400000000000000000000047551476012147200164250ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto fasta2fastq(struct Parameters const & parameters) -> void; vsearch-2.30.0/src/fastq.cc000066400000000000000000000433041476012147200154530ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "attributes.h" #include "maps.h" #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::snprintf #include // std::memcmp, std::memchr, std::strlen auto fastq_fatal(uint64_t lineno, const char * msg) -> void { char * string = nullptr; if (xsprintf(&string, "Invalid line %lu in FASTQ file: %s", lineno, msg) == -1) { fatal("Out of memory"); } if (string) { fatal(string); xfree(string); } else { fatal("Out of memory"); } } auto buffer_filter_extend(fastx_handle input_handle, struct fastx_buffer_s * dest_buffer, char * source_buf, uint64_t len, unsigned int * char_action, const unsigned char * char_mapping, bool * ok, char * illegal_char) -> void { buffer_makespace(dest_buffer, len + 1); /* Strip unwanted characters from the string and raise warnings or errors on certain characters. */ auto * p = source_buf; auto * d = dest_buffer->data + dest_buffer->length; auto * q = d; *ok = true; for (auto i = 0ULL; i < len; i++) { auto const c = *p++; char const m = char_action[(unsigned char) (c)]; switch(m) { case 0: /* stripped */ input_handle->stripped_all++; input_handle->stripped[(unsigned char) (c)]++; break; case 1: /* legal character */ *q++ = char_mapping[(unsigned char) (c)]; break; case 2: /* fatal character */ if (*ok) { *illegal_char = c; } *ok = false; break; case 3: /* silently stripped chars (whitespace) */ break; case 4: /* newline (silently stripped) */ break; } } /* add zero after sequence */ *q = 0; dest_buffer->length += q - d; } auto fastq_open(const char * filename) -> fastx_handle { auto * input_handle = fastx_open(filename); if (! 
fastx_is_fastq(input_handle)) { fatal("FASTQ file expected, FASTA file found (%s)", filename); } return input_handle; } auto fastq_close(fastx_handle input_handle) -> void { fastx_close(input_handle); } auto fastq_next(fastx_handle input_handle, bool const truncateatspace, const unsigned char * char_mapping) -> bool { input_handle->header_buffer.length = 0; input_handle->header_buffer.data[0] = 0; input_handle->sequence_buffer.length = 0; input_handle->sequence_buffer.data[0] = 0; input_handle->plusline_buffer.length = 0; input_handle->plusline_buffer.data[0] = 0; input_handle->quality_buffer.length = 0; input_handle->quality_buffer.data[0] = 0; input_handle->lineno_start = input_handle->lineno; char msg[200]; auto ok = true; char illegal_char = '\0'; auto rest = fastx_file_fill_buffer(input_handle); /* check end of file */ if (rest == 0) { return false; } /* read header */ /* check initial @ character */ if (input_handle->file_buffer.data[input_handle->file_buffer.position] != '@') { fastq_fatal(input_handle->lineno, "Header line must start with '@' character"); } input_handle->file_buffer.position++; rest--; char * lf = nullptr; while (lf == nullptr) { /* get more data if buffer empty */ rest = fastx_file_fill_buffer(input_handle); if (rest == 0) { fastq_fatal(input_handle->lineno, "Unexpected end of file"); } /* find LF */ lf = (char *) memchr(input_handle->file_buffer.data + input_handle->file_buffer.position, '\n', rest); /* copy to header buffer */ auto len = rest; if (lf) { /* LF found, copy up to and including LF */ len = lf - (input_handle->file_buffer.data + input_handle->file_buffer.position) + 1; input_handle->lineno++; } buffer_extend(&input_handle->header_buffer, input_handle->file_buffer.data + input_handle->file_buffer.position, len); input_handle->file_buffer.position += len; rest -= len; } /* read sequence line(s) */ lf = nullptr; while (true) { /* get more data, if necessary */ rest = fastx_file_fill_buffer(input_handle); /* cannot end here */ if 
(rest == 0) { fastq_fatal(input_handle->lineno, "Unexpected end of file"); } /* end when new line starting with + is seen */ if (lf && (input_handle->file_buffer.data[input_handle->file_buffer.position] == '+')) { break; } /* find LF */ lf = (char *) memchr(input_handle->file_buffer.data + input_handle->file_buffer.position, '\n', rest); /* copy to sequence buffer */ auto len = rest; if (lf) { /* LF found, copy up to and including LF */ len = lf - (input_handle->file_buffer.data + input_handle->file_buffer.position) + 1; input_handle->lineno++; } buffer_filter_extend(input_handle, &input_handle->sequence_buffer, input_handle->file_buffer.data + input_handle->file_buffer.position, len, char_fq_action_seq, char_mapping, &ok, &illegal_char); input_handle->file_buffer.position += len; rest -= len; if (! ok) { if ((illegal_char >= 32) && (illegal_char < 127)) { snprintf(msg, 200, "Illegal sequence character '%c'", illegal_char); } else { snprintf(msg, 200, "Illegal sequence character (unprintable, no %d)", (unsigned char) illegal_char); } fastq_fatal(input_handle->lineno - (lf ? 
1 : 0), msg); } } /* read + line */ /* skip + character */ input_handle->file_buffer.position++; rest--; lf = nullptr; while (lf == nullptr) { /* get more data if buffer empty */ rest = fastx_file_fill_buffer(input_handle); /* cannot end here */ if (rest == 0) { fastq_fatal(input_handle->lineno, "Unexpected end of file"); } /* find LF */ lf = (char *) memchr(input_handle->file_buffer.data + input_handle->file_buffer.position, '\n', rest); /* copy to plusline buffer */ auto len = rest; if (lf) { /* LF found, copy up to and including LF */ len = lf - (input_handle->file_buffer.data + input_handle->file_buffer.position) + 1; input_handle->lineno++; } buffer_extend(&input_handle->plusline_buffer, input_handle->file_buffer.data + input_handle->file_buffer.position, len); input_handle->file_buffer.position += len; rest -= len; } /* check that the plus line is empty or identical to @ line */ bool plusline_invalid = false; if (input_handle->header_buffer.length == input_handle->plusline_buffer.length) { if (memcmp(input_handle->header_buffer.data, input_handle->plusline_buffer.data, input_handle->header_buffer.length)) { plusline_invalid = true; } } else { if ((input_handle->plusline_buffer.length > 2) || ((input_handle->plusline_buffer.length == 2) && (input_handle->plusline_buffer.data[0] != '\r'))) { plusline_invalid = true; } } if (plusline_invalid) { fastq_fatal(input_handle->lineno - (lf ? 
1 : 0), "'+' line must be empty or identical to header"); } /* read quality line(s) */ lf = nullptr; while (true) { /* get more data, if necessary */ rest = fastx_file_fill_buffer(input_handle); /* end if no more data */ if (rest == 0) { break; } /* end if next entry starts : LF + '@' + correct length */ if (lf && (input_handle->file_buffer.data[input_handle->file_buffer.position] == '@') && (input_handle->quality_buffer.length == input_handle->sequence_buffer.length)) { break; } /* find LF */ lf = (char *) memchr(input_handle->file_buffer.data + input_handle->file_buffer.position, '\n', rest); /* copy to quality buffer */ auto len = rest; if (lf) { /* LF found, copy up to and including LF */ len = lf - (input_handle->file_buffer.data + input_handle->file_buffer.position) + 1; input_handle->lineno++; } buffer_filter_extend(input_handle, &input_handle->quality_buffer, input_handle->file_buffer.data + input_handle->file_buffer.position, len, char_fq_action_qual, chrmap_identity, &ok, &illegal_char); input_handle->file_buffer.position += len; rest -= len; /* break if quality line already too long */ if (input_handle->quality_buffer.length > input_handle->sequence_buffer.length) { break; } if (! ok) { if ((illegal_char >= 32) && (illegal_char < 127)) { snprintf(msg, 200, "Illegal quality character '%c'", illegal_char); } else { snprintf(msg, 200, "Illegal quality character (unprintable, no %d)", (unsigned char) illegal_char); } fastq_fatal(input_handle->lineno - (lf ? 1 : 0), msg); } } if (input_handle->sequence_buffer.length != input_handle->quality_buffer.length) { fastq_fatal(input_handle->lineno - (lf ? 
1 : 0), "Sequence and quality lines must be equally long"); } fastx_filter_header(input_handle, truncateatspace); input_handle->seqno++; return true; } auto fastq_get_quality(fastx_handle input_handle) -> char * { return input_handle->quality_buffer.data; } auto fastq_get_quality_length(fastx_handle input_handle) -> uint64_t { return input_handle->quality_buffer.length; } auto fastq_get_position(fastx_handle input_handle) -> uint64_t { return input_handle->file_position; } auto fastq_get_size(fastx_handle input_handle) -> uint64_t { return input_handle->file_size; } auto fastq_get_lineno(fastx_handle input_handle) -> uint64_t { return input_handle->lineno_start; } auto fastq_get_seqno(fastx_handle input_handle) -> uint64_t { return input_handle->seqno; } auto fastq_get_header_length(fastx_handle input_handle) -> uint64_t { return input_handle->header_buffer.length; } auto fastq_get_sequence_length(fastx_handle input_handle) -> uint64_t { return input_handle->sequence_buffer.length; } auto fastq_get_header(fastx_handle input_handle) -> char * { return input_handle->header_buffer.data; } auto fastq_get_sequence(fastx_handle input_handle) -> char * { return input_handle->sequence_buffer.data; } auto fastq_get_abundance(fastx_handle input_handle) -> int64_t { // return 1 if not present auto const size = header_get_size(input_handle->header_buffer.data, input_handle->header_buffer.length); if (size > 0) { return size; } else { return 1; } } auto fastq_get_abundance_and_presence(fastx_handle input_handle) -> int64_t { // return 0 if not present return header_get_size(input_handle->header_buffer.data, input_handle->header_buffer.length); } inline auto fprint_seq_label(std::FILE * output_handle, char * seq, int len) -> void { /* normalize first? 
*/ std::fprintf(output_handle, "%.*s", len, seq); } auto fastq_print_general(FILE * output_handle, char * seq, int len, char * header, int header_len, char * quality, int abundance, int ordinal, double ee) -> void { std::fprintf(output_handle, "@"); if (opt_relabel_self) { fprint_seq_label(output_handle, seq, len); } else if (opt_relabel_sha1) { fprint_seq_digest_sha1(output_handle, seq, len); } else if (opt_relabel_md5) { fprint_seq_digest_md5(output_handle, seq, len); } else if (opt_relabel && (ordinal > 0)) { std::fprintf(output_handle, "%s%d", opt_relabel, ordinal); } else { auto const xsize = opt_xsize || (opt_sizeout && (abundance > 0)); auto const xee = opt_xee || ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0)); auto const xlength = opt_xlength || opt_lengthout; header_fprint_strip(output_handle, header, header_len, xsize, xee, xlength); } if (opt_label_suffix) { std::fprintf(output_handle, "%s", opt_label_suffix); } if (opt_sample) { std::fprintf(output_handle, ";sample=%s", opt_sample); } if (opt_sizeout && (abundance > 0)) { std::fprintf(output_handle, ";size=%u", abundance); } if ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0)) { if (ee < 0.000000001) { std::fprintf(output_handle, ";ee=%.13lf", ee); } else if (ee < 0.00000001) { std::fprintf(output_handle, ";ee=%.12lf", ee); } else if (ee < 0.0000001) { std::fprintf(output_handle, ";ee=%.11lf", ee); } else if (ee < 0.000001) { std::fprintf(output_handle, ";ee=%.10lf", ee); } else if (ee < 0.00001) { std::fprintf(output_handle, ";ee=%.9lf", ee); } else if (ee < 0.0001) { std::fprintf(output_handle, ";ee=%.8lf", ee); } else if (ee < 0.001) { std::fprintf(output_handle, ";ee=%.7lf", ee); } else if (ee < 0.01) { std::fprintf(output_handle, ";ee=%.6lf", ee); } else if (ee < 0.1) { std::fprintf(output_handle, ";ee=%.5lf", ee); } else { std::fprintf(output_handle, ";ee=%.4lf", ee); } } if (opt_lengthout) { std::fprintf(output_handle, ";length=%d", len); } if (opt_relabel_keep && ((opt_relabel && (ordinal > 0)) 
|| opt_relabel_sha1 || opt_relabel_md5 || opt_relabel_self)) { std::fprintf(output_handle, " %.*s", header_len, header); } std::fprintf(output_handle, "\n%.*s\n+\n%.*s\n", len, seq, len, quality); } auto fastq_print(std::FILE * output_handle, char * header, char * sequence, char * quality) -> void { auto const slen = static_cast(std::strlen(sequence)); auto const hlen = static_cast(std::strlen(header)); fastq_print_general(output_handle, sequence, slen, header, hlen, quality, 0, 0, -1.0); } vsearch-2.30.0/src/fastq.h000066400000000000000000000100071476012147200153070ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // std::FILE #include // uint64_t auto fastq_open_rest(fastx_handle input_handle) -> void; auto fastq_open(const char * filename) -> fastx_handle; auto fastq_close(fastx_handle input_handle) -> void; auto fastq_next(fastx_handle input_handle, bool truncateatspace, const unsigned char * char_mapping) -> bool; auto fastq_get_position(fastx_handle input_handle) -> uint64_t; auto fastq_get_size(fastx_handle input_handle) -> uint64_t; auto fastq_get_lineno(fastx_handle input_handle) -> uint64_t; auto fastq_get_seqno(fastx_handle input_handle) -> uint64_t; auto fastq_get_header(fastx_handle input_handle) -> char *; auto fastq_get_sequence(fastx_handle input_handle) -> char *; auto fastq_get_quality(fastx_handle input_handle) -> char *; auto fastq_get_abundance(fastx_handle input_handle) -> int64_t; auto fastq_get_abundance_and_presence(fastx_handle input_handle) -> int64_t; auto fastq_get_header_length(fastx_handle input_handle) -> uint64_t; auto fastq_get_sequence_length(fastx_handle input_handle) -> uint64_t; auto 
fastq_get_quality_length(fastx_handle input_handle) -> uint64_t; auto fastq_print(std::FILE * output_handle, char * header, char * sequence, char * quality) -> void; auto fastq_print_general(std::FILE * output_handle, char * seq, int len, char * header, int header_len, char * quality, int abundance, int ordinal, double ee) -> void; vsearch-2.30.0/src/fastq_chars.cc000066400000000000000000000273071476012147200166400ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/maps.hpp" #include // std::find_if #include #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose #include // std::distance #include #ifndef NDEBUG #include constexpr long int char_max = std::numeric_limits::max(); #endif constexpr unsigned int n_characters = 256; struct statistics { std::vector sequence_chars; std::vector quality_chars; std::vector tail_chars; std::vector maxrun; uint64_t total_chars = 0; uint64_t seq_count = 0; unsigned char qmin_n = 255; unsigned char qmax_n = 0; char qmin = '\0'; char qmax = '\0'; char fastq_ascii = '\0'; char fastq_qmin = '\0'; char fastq_qmax = '\0'; }; namespace { auto guess_quality_offset(struct statistics & stats) -> void { static constexpr auto lowerbound = ';'; // char 59 (-5 to offset +64) static constexpr auto upperbound = 'K'; // char 75 (+1 after offset +33 normal range) if ((stats.qmin < lowerbound) or (stats.qmax < upperbound)) { stats.fastq_ascii = 
static_cast(default_ascii_offset); // +33, from vsearch.h } else { stats.fastq_ascii = alternative_ascii_offset; // +64, from vsearch.h } stats.fastq_qmax = static_cast(stats.qmax - stats.fastq_ascii); stats.fastq_qmin = static_cast(stats.qmin - stats.fastq_ascii); } auto find_lowest_quality_symbol(struct statistics & stats) -> void { auto lowest = std::find_if(stats.quality_chars.cbegin(), stats.quality_chars.cend(), [](uint64_t const counter) -> bool { return counter != 0; }); if (lowest == stats.quality_chars.cend()) { return; } auto const index = std::distance(stats.quality_chars.cbegin(), lowest); assert(index >= 0); assert(index <= char_max); stats.qmin = static_cast(index); } auto find_highest_quality_symbol(struct statistics & stats) -> void { // note: searching using reverse iterators auto highest = std::find_if(stats.quality_chars.rbegin(), stats.quality_chars.rend(), [](uint64_t const counter) -> bool { return counter != 0; } ); if (highest == stats.quality_chars.rend()) { return; } auto const index = std::distance(highest, stats.quality_chars.rend()) - 1; assert(index >= 0); assert(index <= char_max); stats.qmax = static_cast(index); } auto stats_message(std::FILE * output_stream, struct statistics const & stats) -> void { static constexpr char first_char_in_Illumina_1_5 = 'B'; // 66th char static constexpr char last_char_in_original_Sanger = 'I'; // 73th char assert(stats.sequence_chars['n'] == 0); // sequences are uppercased, no results for lowercase symbols std::fprintf(output_stream, "Read %" PRIu64 " sequences.\n", stats.seq_count); if (stats.seq_count == 0) { return; } std::fprintf(output_stream, "Qmin %d, Qmax %d, Range %d\n", stats.qmin, stats.qmax, stats.qmax - stats.qmin + 1); std::fprintf(output_stream, "Guess: -fastq_qmin %d -fastq_qmax %d -fastq_ascii %d\n", stats.fastq_qmin, stats.fastq_qmax, stats.fastq_ascii); if (stats.fastq_ascii == alternative_ascii_offset) { if (stats.qmin < alternative_ascii_offset) { std::fprintf(output_stream, 
"Guess: Solexa format (phred+64)\n"); } else if (stats.qmin < first_char_in_Illumina_1_5) { std::fprintf(output_stream, "Guess: Illumina 1.3+ format (phred+64)\n"); } else { // Illumina 1.5+ Phred+64, quality values ranging from 3 to 41 (ascii: 67 to 105) // Q2 (ascii 66, 'B') is the Read Segment Quality Control Indicator std::fprintf(output_stream, "Guess: Illumina 1.5+ format (phred+64)\n"); } } else { if (stats.qmax > last_char_in_original_Sanger) { std::fprintf(output_stream, "Guess: Illumina 1.8+ format (phred+33)\n"); } else { // Sanger Phred+33, quality values ranging from 0 to 40 (ascii: 33 to 73) std::fprintf(output_stream, "Guess: Original Sanger format (phred+33)\n"); } } std::fprintf(output_stream, "\n"); std::fprintf(output_stream, "Letter N Freq MaxRun\n"); std::fprintf(output_stream, "------ ---------- ------ ------\n"); double const percentage_factor = 100.0 / static_cast(stats.total_chars); unsigned char index = 0; for (auto const counter: stats.sequence_chars) { if (counter == 0) { ++index ; continue; } std::fprintf(output_stream, " %c %10" PRIu64 " %5.1f%% %6d", index, counter, static_cast(counter) * percentage_factor, stats.maxrun[index]); if (index == 'N') { if (stats.qmin_n < stats.qmax_n) { std::fprintf(output_stream, " Q=%c..%c", stats.qmin_n, stats.qmax_n); } else { std::fprintf(output_stream, " Q=%c", stats.qmin_n); } } std::fprintf(output_stream, "\n"); ++index; } std::fprintf(output_stream, "\n"); std::fprintf(output_stream, "Char ASCII Freq Tails\n"); std::fprintf(output_stream, "---- ----- ------ ----------\n"); for (char i = stats.qmin; i <= stats.qmax; ++i) { if (stats.quality_chars[i] == 0) { continue; } std::fprintf(output_stream, " '%c' %5d %5.1f%% %10" PRIu64 "\n", i, i, static_cast(stats.quality_chars[i]) * percentage_factor, stats.tail_chars[i]); } } auto output_stats_message(struct Parameters const & parameters, struct statistics const & stats, char const * log_filename) -> void { if (log_filename == nullptr) { return; } 
stats_message(parameters.fp_log, stats); } auto output_stats_message(struct Parameters const & parameters, struct statistics const & stats) -> void { if (parameters.opt_quiet) { return; } stats_message(stderr, stats); } } auto fastq_chars(struct Parameters const & parameters) -> void { struct statistics stats; stats.sequence_chars.resize(n_characters); stats.quality_chars.resize(n_characters); stats.tail_chars.resize(n_characters); stats.maxrun.resize(n_characters); auto * fastq_handle = fastq_open(parameters.opt_fastq_chars); auto const filesize = fastq_get_size(fastq_handle); progress_init("Reading FASTQ file", filesize); while (fastq_next(fastq_handle, false, chrmap_upcase_vector.data())) { auto const seq_length = fastq_get_sequence_length(fastq_handle); auto * seq_ptr = fastq_get_sequence(fastq_handle); auto * qual_ptr = fastq_get_quality(fastq_handle); ++stats.seq_count; stats.total_chars += seq_length; auto run_char = -1; auto run = 0; for (auto i = 0ULL ; i < seq_length ; ++i) { auto const seq_symbol = static_cast(*seq_ptr); std::advance(seq_ptr, 1); auto const qual_symbol = static_cast(*qual_ptr); std::advance(qual_ptr, 1); ++stats.sequence_chars[seq_symbol]; ++stats.quality_chars[qual_symbol]; if (seq_symbol == 'N') { stats.qmin_n = std::min(qual_symbol, stats.qmin_n); stats.qmax_n = std::max(qual_symbol, stats.qmax_n); } if (seq_symbol == run_char) { ++run; stats.maxrun[run_char] = std::max(run, stats.maxrun[run_char]); } else { run_char = seq_symbol; run = 0; } } if (seq_length >= static_cast(parameters.opt_fastq_tail)) { qual_ptr = std::next(fastq_get_quality(fastq_handle), static_cast(seq_length - 1)); auto const tail_char = *qual_ptr; std::advance(qual_ptr, -1); auto tail_len = 1; while (*qual_ptr == tail_char) { std::advance(qual_ptr, -1); ++tail_len; if (tail_len >= parameters.opt_fastq_tail) { break; } } if (tail_len >= parameters.opt_fastq_tail) { ++stats.tail_chars[tail_char]; } } progress_update(fastq_get_position(fastq_handle)); } 
progress_done(); fastq_close(fastq_handle); find_lowest_quality_symbol(stats); find_highest_quality_symbol(stats); guess_quality_offset(stats); output_stats_message(parameters, stats); output_stats_message(parameters, stats, parameters.opt_log); } vsearch-2.30.0/src/fastq_chars.h000066400000000000000000000047551476012147200165040ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto fastq_chars(struct Parameters const & parameters) -> void; vsearch-2.30.0/src/fastq_join.cc000066400000000000000000000255561476012147200165030ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "utils/maps.hpp" #include // std::transform #include // macros PRIu64 and PRId64 #include // uint64_t #include // std::FILE, std::fprintf, std::fclose #include struct input_file { char * name = nullptr; fastx_handle handle = nullptr; }; struct input_files { input_file forward; input_file reverse; }; struct output_file { char * name = nullptr; std::FILE * handle = nullptr; }; struct output_files { output_file fasta; output_file fastq; }; auto check_parameters(struct Parameters const & parameters) -> void { if (parameters.opt_reverse == nullptr) { fatal("No reverse reads file specified with --reverse"); } if ((parameters.opt_fastqout == nullptr) and (parameters.opt_fastaout == nullptr)) { fatal("No output files specified"); } if (parameters.opt_join_padgap.length() != parameters.opt_join_padgapq.length()) { fatal("Strings given by --join_padgap and --join_padgapq differ in length"); } } auto open_input_files(struct Parameters const & parameters) -> struct input_files { struct input_files infiles; infiles.forward.name = parameters.opt_fastq_join; infiles.reverse.name = parameters.opt_reverse; if (infiles.forward.name != nullptr) { infiles.forward.handle = fastq_open(infiles.forward.name); } if (infiles.reverse.name != nullptr) { infiles.reverse.handle = fastq_open(infiles.reverse.name); } return infiles; } auto open_output_files(struct Parameters const & parameters) -> struct output_files { struct output_files outfiles; outfiles.fasta.name = parameters.opt_fastaout; outfiles.fastq.name = parameters.opt_fastqout; if (outfiles.fasta.name != nullptr) { outfiles.fasta.handle = fopen_output(outfiles.fasta.name); } if (outfiles.fastq.name != nullptr) { outfiles.fastq.handle = fopen_output(outfiles.fastq.name); } return outfiles; } auto check_output_files(struct output_files const & outfiles) -> void { if (outfiles.fasta.name != nullptr) { if (outfiles.fasta.handle == nullptr) { fatal("Unable to open file for writing (%s)", 
outfiles.fasta.name); } } if (outfiles.fastq.name != nullptr) { if (outfiles.fastq.handle == nullptr) { fatal("Unable to open file for writing (%s)", outfiles.fastq.name); } } } auto close_output_files(struct output_files const & outfiles) -> void { for (auto * fp_outputfile : {outfiles.fasta.handle, outfiles.fastq.handle}) { if (fp_outputfile != nullptr) { static_cast(std::fclose(fp_outputfile)); } } } auto close_input_files(struct input_files const & infiles) -> void { for (auto * fp_inputfile : {infiles.forward.handle, infiles.reverse.handle}) { if (fp_inputfile != nullptr) { fastq_close(fp_inputfile); } } } auto stats_message(std::FILE * output_stream, uint64_t const total) -> void { static_cast(std::fprintf(output_stream, "%" PRIu64 " pairs joined\n", total)); } auto output_stats_message(struct Parameters const & parameters, uint64_t const total, char const * log_filename) -> void { if (log_filename == nullptr) { return; } stats_message(parameters.fp_log, total); } auto output_stats_message(struct Parameters const & parameters, uint64_t const total) -> void { if (parameters.opt_quiet) { return; } stats_message(stderr, total); } auto fastq_join(struct Parameters const & parameters) -> void { /* check parameters */ check_parameters(parameters); /* open and check input and output files */ auto const infiles = open_input_files(parameters); // check_input_files(infiles)? 
already done by the function fastq_open() auto const outfiles = open_output_files(parameters); check_output_files(outfiles); /* main */ auto const filesize = fastq_get_size(infiles.forward.handle); progress_init("Joining reads", filesize); /* do it */ constexpr auto bufferlength = 1024U; auto const padlen = parameters.opt_join_padgap.length(); uint64_t total = 0; std::string final_sequence; final_sequence.reserve(bufferlength + padlen + bufferlength); std::string final_quality; final_quality.reserve(final_sequence.capacity()); std::string reverse_sequence; reverse_sequence.reserve(bufferlength); std::string reverse_quality; reverse_quality.reserve(bufferlength); while (fastq_next(infiles.forward.handle, false, chrmap_no_change_vector.data())) { if (not fastq_next(infiles.reverse.handle, false, chrmap_no_change_vector.data())) { fatal("More forward reads than reverse reads"); } final_sequence.clear(); final_quality.clear(); reverse_sequence.clear(); reverse_quality.clear(); auto const fwd_seq_length = fastq_get_sequence_length(infiles.forward.handle); auto const rev_seq_length = fastq_get_sequence_length(infiles.reverse.handle); auto const needed = fwd_seq_length + padlen + rev_seq_length; /* allocate enough memory */ if (rev_seq_length > reverse_sequence.capacity()) { reverse_sequence.reserve(rev_seq_length); } if (rev_seq_length > reverse_quality.capacity()) { reverse_quality.reserve(rev_seq_length); } if (needed > final_sequence.capacity()) { final_sequence.reserve(needed); } /* reverse read: reverse-complement sequence */ reverse_sequence.assign(fastq_get_sequence(infiles.reverse.handle), rev_seq_length); std::reverse(reverse_sequence.begin(), reverse_sequence.end()); std::transform(reverse_sequence.begin(), reverse_sequence.end(), reverse_sequence.begin(), [](char const & lhs) -> char { auto const unsigned_lhs = static_cast(lhs); auto const complement_lhs = chrmap_complement_vector[unsigned_lhs]; return static_cast(complement_lhs); }); /* reverse read: reverse 
quality */ reverse_quality.assign(fastq_get_quality(infiles.reverse.handle), rev_seq_length); std::reverse(reverse_quality.begin(), reverse_quality.end()); /* join them */ final_sequence = std::string{fastq_get_sequence(infiles.forward.handle), fwd_seq_length} + parameters.opt_join_padgap + reverse_sequence; final_quality = std::string{fastq_get_quality(infiles.forward.handle), fwd_seq_length} + parameters.opt_join_padgapq + reverse_quality; /* write output */ if (parameters.opt_fastqout != nullptr) { fastq_print_general(outfiles.fastq.handle, const_cast(final_sequence.c_str()), static_cast(needed), fastq_get_header(infiles.forward.handle), static_cast(fastq_get_header_length(infiles.forward.handle)), const_cast(final_quality.c_str()), static_cast(fastq_get_abundance(infiles.forward.handle)), static_cast(total + 1), -1.0); } if (parameters.opt_fastaout != nullptr) { fasta_print_general(outfiles.fasta.handle, nullptr, const_cast(final_sequence.c_str()), static_cast(needed), fastq_get_header(infiles.forward.handle), static_cast(fastq_get_header_length(infiles.forward.handle)), static_cast(fasta_get_abundance(infiles.forward.handle)), static_cast(total + 1), -1.0, -1, -1, nullptr, 0); } ++total; progress_update(fastq_get_position(infiles.forward.handle)); } progress_done(); if (fastq_next(infiles.reverse.handle, false, chrmap_no_change_vector.data())) { fatal("More reverse reads than forward reads"); } output_stats_message(parameters, total); output_stats_message(parameters, total, parameters.opt_log); /* clean up */ close_output_files(outfiles); close_input_files(infiles); } vsearch-2.30.0/src/fastq_join.h000066400000000000000000000047541476012147200163420ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
  Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
  Department of Informatics, University of Oslo,
  PO Box 1080 Blindern, NO-0316 Oslo, Norway

  This software is dual-licensed and available under a choice
  of one of two licenses, either under the terms of the GNU
  General Public License version 3 or the BSD 2-Clause License.


  GNU General Public License version 3

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program. If not, see <http://www.gnu.org/licenses/>.


  The BSD 2-Clause License

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  IN NO EVENT SHALL THE
  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  POSSIBILITY OF SUCH DAMAGE.

*/

/* Join paired reads: each forward read is concatenated with the padding
   sequence (parameters.opt_join_padgap) and the reverse-complemented
   reverse read, and the result is written to the FASTQ and/or FASTA
   outputs selected in parameters. Terminates through fatal() when the
   two inputs do not hold the same number of reads. */
auto fastq_join(struct Parameters const & parameters) -> void;
vsearch-2.30.0/src/fastqops.cc000066400000000000000000000501021476012147200161670ustar00rootroot00000000000000
/*

  VSEARCH: a versatile open source tool for metagenomics

  Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
  All rights reserved.

  Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
  Department of Informatics, University of Oslo,
  PO Box 1080 Blindern, NO-0316 Oslo, Norway

  This software is dual-licensed and available under a choice
  of one of two licenses, either under the terms of the GNU
  General Public License version 3 or the BSD 2-Clause License.


  GNU General Public License version 3

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program. If not, see <http://www.gnu.org/licenses/>.


  The BSD 2-Clause License

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:

  1.
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "maps.h" #include #include // macros PRIu64 and PRId64 #include // std::pow #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose #include // std::memset #include #include auto q2p(double quality_value) -> double { static constexpr auto base = 10.0; return std::pow(base, -quality_value / base); } auto fastq_stats() -> void { auto input_handle = fastq_open(opt_fastq_stats); auto const filesize = fastq_get_size(input_handle); progress_init("Reading FASTQ file", filesize); uint64_t seq_count = 0; uint64_t symbols = 0; int64_t read_length_alloc = 512; auto * read_length_table = (uint64_t *) xmalloc(sizeof(uint64_t) * read_length_alloc); memset(read_length_table, 0, sizeof(uint64_t) * read_length_alloc); auto * qual_length_table = (uint64_t *) xmalloc(sizeof(uint64_t) * read_length_alloc * 256); memset(qual_length_table, 0, sizeof(uint64_t) * read_length_alloc * 256); auto * ee_length_table = (uint64_t *) xmalloc(sizeof(uint64_t) * read_length_alloc * 4); memset(ee_length_table, 0, sizeof(uint64_t) * read_length_alloc * 4); auto * q_length_table = (uint64_t *) xmalloc(sizeof(uint64_t) * read_length_alloc * 4); memset(q_length_table, 0, sizeof(uint64_t) * read_length_alloc * 4); auto * sumee_length_table = (double *) xmalloc(sizeof(double) * read_length_alloc); memset(sumee_length_table, 0, sizeof(double) * read_length_alloc); int64_t len_min = std::numeric_limits::max(); int64_t len_max = 0; auto qmin = std::numeric_limits::max(); auto qmax = std::numeric_limits::min(); std::vector quality_chars(256); while (fastq_next(input_handle, false, chrmap_upcase)) { ++seq_count; auto const len = static_cast(fastq_get_sequence_length(input_handle)); auto * q = fastq_get_quality(input_handle); /* update length statistics */ if (len + 1 > read_length_alloc) { read_length_table = (uint64_t *) xrealloc(read_length_table, sizeof(uint64_t) * (len + 1)); memset(read_length_table + read_length_alloc, 0, sizeof(uint64_t) * (len + 1 
- read_length_alloc)); qual_length_table = (uint64_t *) xrealloc(qual_length_table, sizeof(uint64_t) * (len + 1) * 256); memset(qual_length_table + (256 * read_length_alloc), 0, sizeof(uint64_t) * (len + 1 - read_length_alloc) * 256); ee_length_table = (uint64_t *) xrealloc(ee_length_table, sizeof(uint64_t) * (len + 1) * 4); memset(ee_length_table + (4 * read_length_alloc), 0, sizeof(uint64_t) * (len + 1 - read_length_alloc) * 4); q_length_table = (uint64_t *) xrealloc(q_length_table, sizeof(uint64_t) * (len + 1) * 4); memset(q_length_table + (4 * read_length_alloc), 0, sizeof(uint64_t) * (len + 1 - read_length_alloc) * 4); sumee_length_table = (double *) xrealloc(sumee_length_table, sizeof(double) * (len + 1)); memset(sumee_length_table + read_length_alloc, 0, sizeof(double) * (len + 1 - read_length_alloc)); read_length_alloc = len + 1; } ++read_length_table[len]; len_min = std::min(len, len_min); len_max = std::max(len, len_max); /* update quality statistics */ symbols += len; std::array const ee_limits = { 1.0, 0.5, 0.25, 0.1 }; double ee = 0.0; int qmin_this = std::numeric_limits::max(); for (int64_t i = 0; i < len; i++) { int const qc = q[i]; int const qual = qc - opt_fastq_ascii; if ((qual < opt_fastq_qmin) || (qual > opt_fastq_qmax)) { char * msg = nullptr; if (xsprintf(& msg, "FASTQ quality value (%d) out of range (%" PRId64 "-%" PRId64 ").\n" "Please adjust the FASTQ quality base character or range with the\n" "--fastq_ascii, --fastq_qmin or --fastq_qmax options. 
For a complete\n" "diagnosis with suggested values, please run vsearch --fastq_chars file.", qual, opt_fastq_qmin, opt_fastq_qmax) > 0) { fatal(msg); } else { fatal("Out of memory"); } xfree(msg); } ++quality_chars[qc]; qmin = std::min(qc, qmin); qmax = std::max(qc, qmax); ++qual_length_table[(256 * i) + qc]; ee += q2p(qual); sumee_length_table[i] += ee; for (int z = 0; z < 4; z++) { if (ee <= ee_limits[z]) { ++ee_length_table[(4 * i) + z]; } else { break; } } qmin_this = std::min(qual, qmin_this); for (int z = 0; z < 4; z++) { if (qmin_this > 5 * (z + 1)) { ++q_length_table[(4 * i) + z]; } else { break; } } } progress_update(fastq_get_position(input_handle)); } progress_done(); /* compute various distributions */ std::vector length_dist(len_max + 1); std::vector symb_dist(len_max + 1); std::vector rate_dist(len_max + 1); std::vector avgq_dist(len_max + 1); std::vector avgee_dist(len_max + 1); std::vector avgp_dist(len_max + 1); int64_t length_accum = 0; int64_t symb_accum = 0; for (int64_t i = 0; i <= len_max; i++) { length_accum += read_length_table[i]; length_dist[i] = length_accum; symb_accum += seq_count - length_accum; symb_dist[i] = symb_accum; int64_t q = 0; int64_t x = 0; double e_sum = 0.0; for (int c = qmin; c <= qmax; c++) { int const qual = c - opt_fastq_ascii; x += qual_length_table[(256 * i) + c]; q += qual_length_table[(256 * i) + c] * qual; e_sum += qual_length_table[(256 * i) + c] * q2p(qual); } avgq_dist[i] = 1.0 * q / x; avgp_dist[i] = e_sum / x; avgee_dist[i] = sumee_length_table[i] / x; rate_dist[i] = avgee_dist[i] / (i + 1); } if (fp_log) { fprintf(fp_log, "\n"); fprintf(fp_log, "Read length distribution\n"); fprintf(fp_log, " L N Pct AccPct\n"); fprintf(fp_log, "------- ---------- ------- -------\n"); for (int64_t i = len_max; i >= len_min; i--) { if (read_length_table[i] > 0) { fprintf(fp_log, "%2s%5" PRId64 " %10" PRIu64 " %5.1lf%% %5.1lf%%\n", (i == len_max ? 
">=" : " "), i, read_length_table[i], read_length_table[i] * 100.0 / seq_count, 100.0 * (seq_count - (i > 0 ? length_dist[i - 1] : 0)) / seq_count); } } fprintf(fp_log, "\n"); fprintf(fp_log, "Q score distribution\n"); fprintf(fp_log, "ASCII Q Pe N Pct AccPct\n"); fprintf(fp_log, "----- --- ------- ---------- ------- -------\n"); int64_t qual_accum = 0; for (int c = qmax ; c >= qmin ; c--) { if (quality_chars[c] > 0) { qual_accum += quality_chars[c]; fprintf(fp_log, " %c %3" PRId64 " %7.5lf %10" PRIu64 " %6.1lf%% %6.1lf%%\n", c, c - opt_fastq_ascii, q2p(c - opt_fastq_ascii), quality_chars[c], 100.0 * quality_chars[c] / symbols, 100.0 * qual_accum / symbols); } } fprintf(fp_log, "\n"); fprintf(fp_log, " L PctRecs AvgQ P(AvgQ) AvgP AvgEE Rate RatePct\n"); fprintf(fp_log, "----- ------- ---- ------- -------- ----- --------- --------\n"); for (int64_t i = 2; i <= len_max; i++) { double const PctRecs = 100.0 * (seq_count - length_dist[i - 1]) / seq_count; double const AvgQ = avgq_dist[i - 1]; double const AvgP = avgp_dist[i - 1]; double const AvgEE = avgee_dist[i - 1]; double const Rate = rate_dist[i - 1]; fprintf(fp_log, "%5" PRId64 " %6.1lf%% %4.1lf %7.5lf %8.6lf %5.2lf %9.6lf %7.3lf%%\n", i, PctRecs, AvgQ, q2p(AvgQ), AvgP, AvgEE, Rate, 100.0 * Rate); } fprintf(fp_log, "\n"); fprintf(fp_log, " L 1.0000 0.5000 0.2500 0.1000 1.0000 0.5000 0.2500 0.1000\n"); fprintf(fp_log, "----- ------- ------- ------- ------- ------- ------- ------- -------\n"); for (int64_t i = len_max; i >= 1; i--) { int64_t read_count[4]; double read_percentage[4]; for (int z = 0; z < 4; z++) { read_count[z] = ee_length_table[(4 * (i - 1)) + z]; read_percentage[z] = 100.0 * read_count[z] / seq_count; } if (read_count[0] > 0) { fprintf(fp_log, "%5" PRId64 " %7" PRId64 " %7" PRId64 " %7" PRId64 " %7" PRId64 " " "%6.2lf%% %6.2lf%% %6.2lf%% %6.2lf%%\n", i, read_count[0], read_count[1], read_count[2], read_count[3], read_percentage[0], read_percentage[1], read_percentage[2], read_percentage[3]); } } 
fprintf(fp_log, "\n"); fprintf(fp_log, "Truncate at first Q\n"); fprintf(fp_log, " Len Q=5 Q=10 Q=15 Q=20\n"); fprintf(fp_log, "----- ------ ------ ------ ------\n"); for (int64_t i = len_max; i >= MAX(1, len_max / 2); i--) { double read_percentage[4]; for (int z = 0; z < 4; z++) { read_percentage[z] = 100.0 * q_length_table[(4 * (i - 1)) + z] / seq_count; } fprintf(fp_log, "%5" PRId64 " %5.1lf%% %5.1lf%% %5.1lf%% %5.1lf%%\n", i, read_percentage[0], read_percentage[1], read_percentage[2], read_percentage[3]); } fprintf(fp_log, "\n"); fprintf(fp_log, "%10" PRIu64 " Recs (%.1lfM), 0 too long\n", seq_count, seq_count / 1.0e6); if (seq_count > 0) { fprintf(fp_log, "%10.1lf Avg length\n", 1.0 * symbols / seq_count); } fprintf(fp_log, "%9.1lfM Bases\n", symbols / 1.0e6); } xfree(read_length_table); xfree(qual_length_table); xfree(ee_length_table); xfree(q_length_table); xfree(sumee_length_table); fastq_close(input_handle); if (! opt_quiet) { fprintf(stderr, "Read %" PRIu64 " sequences.\n", seq_count); } } auto fastx_revcomp() -> void { uint64_t buffer_alloc = 512; char * seq_buffer = (char*) xmalloc(buffer_alloc); char * qual_buffer = (char*) xmalloc(buffer_alloc); if ((! opt_fastaout) && (! opt_fastqout)) { fatal("No output files specified"); } fastx_handle h = fastx_open(opt_fastx_revcomp); if (! h) { fatal("Unrecognized file type (not proper FASTA or FASTQ format)"); } if (opt_fastqout && ! (h->is_fastq || h->is_empty)) { fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores"); } uint64_t const filesize = fastx_get_size(h); std::FILE * fp_fastaout = nullptr; std::FILE * fp_fastqout = nullptr; if (opt_fastaout) { fp_fastaout = fopen_output(opt_fastaout); if (! fp_fastaout) { fatal("Unable to open FASTA output file for writing"); } } if (opt_fastqout) { fp_fastqout = fopen_output(opt_fastqout); if (! 
fp_fastqout) { fatal("Unable to open FASTQ output file for writing"); } } if (h->is_fastq) { progress_init("Reading FASTQ file", filesize); } else { progress_init("Reading FASTA file", filesize); } int count = 0; while (fastx_next(h, false, chrmap_no_change)) { ++count; /* header */ uint64_t const hlen = fastx_get_header_length(h); char * header = fastx_get_header(h); int64_t const abundance = fastx_get_abundance(h); /* sequence */ uint64_t const length = fastx_get_sequence_length(h); if (length + 1 > buffer_alloc) { buffer_alloc = length + 1; seq_buffer = (char *) xrealloc(seq_buffer, buffer_alloc); qual_buffer = (char *) xrealloc(qual_buffer, buffer_alloc); } char * p = fastx_get_sequence(h); reverse_complement(seq_buffer, p, length); /* quality values */ char * q = fastx_get_quality(h); if (fastx_is_fastq(h)) { /* reverse quality values */ for (uint64_t i = 0; i < length; i++) { qual_buffer[i] = q[length - 1 - i]; } qual_buffer[length] = 0; } if (opt_fastaout) { fasta_print_general(fp_fastaout, nullptr, seq_buffer, length, header, hlen, abundance, count, -1.0, -1, -1, nullptr, 0.0); } if (opt_fastqout) { fastq_print_general(fp_fastqout, seq_buffer, length, header, hlen, qual_buffer, abundance, count, -1.0); } progress_update(fastx_get_position(h)); } progress_done(); if (opt_fastaout) { fclose(fp_fastaout); } if (opt_fastqout) { fclose(fp_fastqout); } fastx_close(h); xfree(seq_buffer); xfree(qual_buffer); } auto fastq_convert() -> void { if (! opt_fastqout) { fatal("No output file specified with --fastqout"); } auto input_handle = fastq_open(opt_fastq_convert); if (! input_handle) { fatal("Unable to open FASTQ file"); } auto const filesize = fastq_get_size(input_handle); std::FILE * fp_fastqout = nullptr; fp_fastqout = fopen_output(opt_fastqout); if (! 
fp_fastqout) { fatal("Unable to open FASTQ output file for writing"); } progress_init("Reading FASTQ file", filesize); auto n_entries = 1; static constexpr double default_expected_error = -1.0; // refactoring: print no ee value? while (fastq_next(input_handle, false, chrmap_no_change)) { /* header */ auto * header = fastq_get_header(input_handle); auto const abundance = fastq_get_abundance(input_handle); /* sequence */ auto const length = fastq_get_sequence_length(input_handle); auto * sequence = fastq_get_sequence(input_handle); /* convert quality values */ auto * quality = fastq_get_quality(input_handle); for (uint64_t i = 0; i < length; i++) { int q = quality[i] - opt_fastq_ascii; if (q < opt_fastq_qmin) { fprintf(stderr, "\nFASTQ quality score (%d) below minimum (%" PRId64 ") in entry no %" PRIu64 " starting on line %" PRIu64 "\n", q, opt_fastq_qmin, fastq_get_seqno(input_handle) + 1, fastq_get_lineno(input_handle)); fatal("FASTQ quality score too low"); } if (q > opt_fastq_qmax) { fprintf(stderr, "\nFASTQ quality score (%d) above maximum (%" PRId64 ") in entry no %" PRIu64 " starting on line %" PRIu64 "\n", q, opt_fastq_qmax, fastq_get_seqno(input_handle) + 1, fastq_get_lineno(input_handle)); fatal("FASTQ quality score too high"); } q = std::max(q, opt_fastq_qminout); q = std::min(q, opt_fastq_qmaxout); q += opt_fastq_asciiout; q = std::max(q, 33); q = std::min(q, 126); quality[i] = q; } quality[length] = 0; int const hlen = fastq_get_header_length(input_handle); fastq_print_general(fp_fastqout, sequence, length, header, hlen, quality, abundance, n_entries, default_expected_error); // refactoring: prefer function overload? 
++n_entries; progress_update(fastq_get_position(input_handle)); } progress_done(); fclose(fp_fastqout); fastq_close(input_handle); } vsearch-2.30.0/src/fastqops.h000066400000000000000000000050051476012147200160330ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  POSSIBILITY OF SUCH DAMAGE.

*/

/* Re-encode the quality values of the FASTQ file given by
   opt_fastq_convert and write the entries to opt_fastqout. */
auto fastq_convert() -> void;

/* Collect length, quality and expected-error statistics from the FASTQ
   file given by opt_fastq_stats and report them to the log file. */
auto fastq_stats() -> void;

/* Write the reverse complement of every entry of the FASTA/FASTQ file
   given by opt_fastx_revcomp to opt_fastaout and/or opt_fastqout. */
auto fastx_revcomp() -> void;
vsearch-2.30.0/src/fastx.cc000066400000000000000000000446101476012147200154630ustar00rootroot00000000000000
/*

  VSEARCH: a versatile open source tool for metagenomics

  Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
  All rights reserved.

  Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
  Department of Informatics, University of Oslo,
  PO Box 1080 Blindern, NO-0316 Oslo, Norway

  This software is dual-licensed and available under a choice
  of one of two licenses, either under the terms of the GNU
  General Public License version 3 or the BSD 2-Clause License.


  GNU General Public License version 3

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program. If not, see <http://www.gnu.org/licenses/>.


  The BSD 2-Clause License

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "dynlibs.h" #include "maps.h" #include // macros PRIu64 and PRId64 #include // LONG_MAX #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::size_t, std::fread, std::fileno #include // std::exit, EXIT_FAILURE #include // std::memcpy, std::memcmp /* file compression and format detector */ /* basic file buffering function for fastq and fastx parsers */ constexpr uint64_t fastx_buffer_alloc = 8192; #ifdef HAVE_BZLIB_H #define BZ_VERBOSE_0 0 #define BZ_VERBOSE_1 1 #define BZ_VERBOSE_2 2 #define BZ_VERBOSE_3 3 #define BZ_VERBOSE_4 4 #define BZ_MORE_MEM 0 /* faster decompression using more memory */ #define BZ_LESS_MEM 1 /* slower decompression but requires less memory */ #endif constexpr int format_plain = 1; constexpr int format_bzip = 2; constexpr int format_gzip = 3; static unsigned char MAGIC_GZIP[] = "\x1f\x8b"; static unsigned char MAGIC_BZIP[] = "BZ"; auto buffer_init(struct fastx_buffer_s * buffer) -> void { buffer->alloc = fastx_buffer_alloc; buffer->data = (char *) xmalloc(buffer->alloc); buffer->data[0] = 0; buffer->length = 0; buffer->position = 0; } auto buffer_free(struct fastx_buffer_s * buffer) -> void { if (buffer->data) { xfree(buffer->data); } buffer->data = nullptr; buffer->alloc = 0; buffer->length = 0; buffer->position = 0; } auto buffer_makespace(struct fastx_buffer_s * buffer, uint64_t x) -> void { /* make sure there is space for x more chars in buffer */ if (buffer->length + x > buffer->alloc) { /* alloc space for x more characters, but round up to nearest block size */ buffer->alloc = ((buffer->length + x + fastx_buffer_alloc - 1) / fastx_buffer_alloc) * fastx_buffer_alloc; buffer->data = (char *) xrealloc(buffer->data, buffer->alloc); } } auto buffer_extend(struct fastx_buffer_s * dest_buffer, char * source_buf, uint64_t len) -> void { buffer_makespace(dest_buffer, len + 1); memcpy(dest_buffer->data + dest_buffer->length, source_buf, len); dest_buffer->length += len; 
dest_buffer->data[dest_buffer->length] = 0; } auto fastx_filter_header(fastx_handle h, bool truncateatspace) -> void { /* filter and truncate header */ char * p = h->header_buffer.data; char * q = p; while (true) { unsigned char const c = *p++; unsigned int const m = char_header_action[c]; switch(m) { case 1: /* legal, printable character */ *q++ = c; break; case 2: /* illegal, fatal */ fprintf(stderr, "\n\n" "Fatal error: Illegal character encountered in FASTA/FASTQ header.\n" "Unprintable ASCII character no %d on or right before line %" PRIu64 ".\n", c, h->lineno); if (fp_log) { fprintf(fp_log, "\n\n" "Fatal error: Illegal character encountered in FASTA/FASTQ header.\n" "Unprintable ASCII character no %d on or right before line %" PRIu64 ".\n", c, h->lineno); } exit(EXIT_FAILURE); case 7: /* Non-ASCII but acceptable */ fprintf(stderr, "\n" "WARNING: Non-ASCII character encountered in FASTA/FASTQ header.\n" "Character no %d (0x%2x) on or right before line %" PRIu64 ".\n", c, c, h->lineno); if (fp_log) { fprintf(fp_log, "\n" "WARNING: Non-ASCII character encountered in FASTA/FASTQ header.\n" "Character no %d (0x%2x) on or right before line %" PRIu64 ".\n", c, c, h->lineno); } *q++ = c; break; case 5: case 6: /* tab or space */ /* conditional end of line */ if (truncateatspace) { goto end_of_line; } *q++ = c; break; case 0: /* null */ case 3: /* cr */ case 4: /* lf */ /* end of line */ goto end_of_line; default: fatal("Internal error"); break; } } end_of_line: /* add a null character at the end */ *q = 0; h->header_buffer.length = q - h->header_buffer.data; } auto fastx_open(const char * filename) -> fastx_handle { auto * h = (fastx_handle) xmalloc(sizeof(struct fastx_s)); h->fp = nullptr; #ifdef HAVE_ZLIB_H h->fp_gz = nullptr; #endif #ifdef HAVE_BZLIB_H h->fp_bz = nullptr; int bzError = 0; #endif h->fp = fopen_input(filename); if (! 
h->fp) { fatal("Unable to open file for reading (%s)", filename); } /* Get mode and size of original (uncompressed) file */ xstat_t fs; if (xfstat(fileno(h->fp), & fs)) { fatal("Unable to get status for input file (%s)", filename); } h->is_pipe = S_ISFIFO(fs.st_mode); if (h->is_pipe) { h->file_size = 0; } else { h->file_size = fs.st_size; } if (opt_gzip_decompress) { h->format = format_gzip; } else if (opt_bzip2_decompress) { h->format = format_bzip; } else if (h->is_pipe) { h->format = format_plain; } else { /* autodetect compression (plain, gzipped or bzipped) */ /* read two characters and compare with magic */ unsigned char magic[2]; h->format = format_plain; size_t const bytes_read = fread(&magic, 1, 2, h->fp); if (bytes_read >= 2) { if (memcmp(magic, MAGIC_GZIP, 2) == 0) { h->format = format_gzip; } else if (memcmp(magic, MAGIC_BZIP, 2) == 0) { h->format = format_bzip; } } else { /* consider it an empty file or a tiny fasta file, uncompressed */ } /* close and reopen to avoid problems with gzip library */ /* rewind was not enough */ fclose(h->fp); h->fp = fopen_input(filename); if (! h->fp) { fatal("Unable to open file for reading (%s)", filename); } } if (h->format == format_gzip) { /* GZIP: Keep original file open, then open as gzipped file as well */ #ifdef HAVE_ZLIB_H if (! gz_lib) { fatal("Files compressed with gzip are not supported"); } h->fp_gz = (*gzdopen_p)(fileno(h->fp), "rb"); if (! h->fp_gz) { // dup? fatal("Unable to open gzip compressed file (%s)", filename); } #else fatal("Files compressed with gzip are not supported"); #endif } if (h->format == format_bzip) { /* BZIP2: Keep original file open, then open as bzipped file as well */ #ifdef HAVE_BZLIB_H if (! bz2_lib) { fatal("Files compressed with bzip2 are not supported"); } h->fp_bz = (*BZ2_bzReadOpen_p)(& bzError, h->fp, BZ_VERBOSE_0, BZ_MORE_MEM, nullptr, 0); if (! 
h->fp_bz) { fatal("Unable to open bzip2 compressed file (%s)", filename); } #else fatal("Files compressed with bzip2 are not supported"); #endif } /* init buffers */ h->file_position = 0; buffer_init(& h->file_buffer); /* start filling up file buffer */ uint64_t const rest = fastx_file_fill_buffer(h); /* examine first char and see if it starts with > or @ */ int filetype = 0; h->is_empty = true; h->is_fastq = false; if (rest > 0) { h->is_empty = false; char * first = h->file_buffer.data; if (*first == '>') { filetype = 1; } else if (*first == '@') { filetype = 2; h->is_fastq = true; } if (filetype == 0) { /* close files if unrecognized file type */ switch(h->format) { case format_plain: break; case format_gzip: #ifdef HAVE_ZLIB_H (*gzclose_p)(h->fp_gz); h->fp_gz = nullptr; break; #endif case format_bzip: #ifdef HAVE_BZLIB_H (*BZ2_bzReadClose_p)(&bzError, h->fp_bz); h->fp_bz = nullptr; break; #endif default: fatal("Internal error"); } fclose(h->fp); h->fp = nullptr; if (rest >= 2) { if (memcmp(first, MAGIC_GZIP, 2) == 0) { fatal("File appears to be gzip compressed. Please use --gzip_decompress"); } if (memcmp(first, MAGIC_BZIP, 2) == 0) { fatal("File appears to be bzip2 compressed. 
Please use --bzip2_decompress"); } } fatal("File type not recognized."); return nullptr; } } /* more initialization */ buffer_init(& h->header_buffer); buffer_init(& h->sequence_buffer); buffer_init(& h->plusline_buffer); buffer_init(& h->quality_buffer); h->stripped_all = 0; for (uint64_t & i : h->stripped) { i = 0; } h->lineno = 1; h->lineno_start = 1; h->seqno = -1; return h; } auto fastx_is_fastq(fastx_handle h) -> bool { return h->is_fastq || h->is_empty; } auto fastx_is_empty(fastx_handle h) -> bool { return h->is_empty; } auto fastx_is_pipe(fastx_handle h) -> bool { return h->is_pipe; } auto fastx_close(fastx_handle h) -> void { /* Warn about stripped chars */ if (h->stripped_all) { fprintf(stderr, "WARNING: %" PRIu64 " invalid characters stripped from %s file:", h->stripped_all, (h->is_fastq ? "FASTQ" : "FASTA")); for (int i = 0; i < 256; i++) { if (h->stripped[i]) { fprintf(stderr, " %c(%" PRIu64 ")", i, h->stripped[i]); } } fprintf(stderr, "\n"); fprintf(stderr, "REMINDER: vsearch does not support amino acid sequences\n"); if (opt_log) { fprintf(fp_log, "WARNING: %" PRIu64 " invalid characters stripped from %s file:", h->stripped_all, (h->is_fastq ? 
"FASTQ" : "FASTA")); for (int i = 0; i < 256; i++) { if (h->stripped[i]) { fprintf(fp_log, " %c(%" PRIu64 ")", i, h->stripped[i]); } } fprintf(fp_log, "\n"); fprintf(fp_log, "REMINDER: vsearch does not support amino acid sequences\n"); } } #ifdef HAVE_BZLIB_H int bz_error = 0; #endif switch(h->format) { case format_plain: break; case format_gzip: #ifdef HAVE_ZLIB_H (*gzclose_p)(h->fp_gz); h->fp_gz = nullptr; break; #endif case format_bzip: #ifdef HAVE_BZLIB_H (*BZ2_bzReadClose_p)(&bz_error, h->fp_bz); h->fp_bz = nullptr; break; #endif default: fatal("Internal error"); } fclose(h->fp); h->fp = nullptr; buffer_free(& h->file_buffer); buffer_free(& h->header_buffer); buffer_free(& h->sequence_buffer); buffer_free(& h->plusline_buffer); buffer_free(& h->quality_buffer); h->file_size = 0; h->file_position = 0; h->lineno = 0; h->seqno = -1; xfree(h); h=nullptr; } auto fastx_file_fill_buffer(fastx_handle h) -> uint64_t { /* read more data if necessary */ uint64_t const rest = h->file_buffer.length - h->file_buffer.position; if (rest > 0) { return rest; } else { uint64_t space = h->file_buffer.alloc - h->file_buffer.length; if (space == 0) { /* back to beginning of buffer */ h->file_buffer.position = 0; h->file_buffer.length = 0; space = h->file_buffer.alloc; } int bytes_read = 0; #ifdef HAVE_BZLIB_H int bzError = 0; #endif switch(h->format) { case format_plain: bytes_read = fread(h->file_buffer.data + h->file_buffer.position, 1, space, h->fp); break; case format_gzip: #ifdef HAVE_ZLIB_H bytes_read = (*gzread_p)(h->fp_gz, h->file_buffer.data + h->file_buffer.position, space); if (bytes_read < 0) { fatal("Unable to read gzip compressed file"); } break; #endif case format_bzip: #ifdef HAVE_BZLIB_H bytes_read = (*BZ2_bzRead_p)(& bzError, h->fp_bz, h->file_buffer.data + h->file_buffer.position, space); if ((bytes_read < 0) || ! 
((bzError == BZ_OK) || (bzError == BZ_STREAM_END) || (bzError == BZ_SEQUENCE_ERROR))) { fatal("Unable to read from bzip2 compressed file"); } break; #endif default: fatal("Internal error"); } if (! h->is_pipe) { #ifdef HAVE_ZLIB_H if (h->format == format_gzip) { /* Circumvent the missing gzoffset function in zlib 1.2.3 and earlier */ int const fd = dup(fileno(h->fp)); h->file_position = xlseek(fd, 0, SEEK_CUR); close(fd); } else #endif { h->file_position = xftello(h->fp); } } h->file_buffer.length += bytes_read; return bytes_read; } } auto fastx_next(fastx_handle h, bool truncateatspace, const unsigned char * char_mapping) -> bool { if (h->is_fastq) { return fastq_next(h, truncateatspace, char_mapping); } else { return fasta_next(h, truncateatspace, char_mapping); } } auto fastx_get_position(fastx_handle h) -> uint64_t { if (h->is_fastq) { return fastq_get_position(h); } else { return fasta_get_position(h); } } auto fastx_get_size(fastx_handle h) -> uint64_t { if (h->is_fastq) { return fastq_get_size(h); } else { return fasta_get_size(h); } } auto fastx_get_lineno(fastx_handle h) -> uint64_t { if (h->is_fastq) { return fastq_get_lineno(h); } else { return fasta_get_lineno(h); } } auto fastx_get_seqno(fastx_handle h) -> uint64_t { if (h->is_fastq) { return fastq_get_seqno(h); } else { return fasta_get_seqno(h); } } auto fastx_get_header(fastx_handle h) -> char * { if (h->is_fastq) { return fastq_get_header(h); } else { return fasta_get_header(h); } } auto fastx_get_sequence(fastx_handle h) -> char * { if (h->is_fastq) { return fastq_get_sequence(h); } else { return fasta_get_sequence(h); } } auto fastx_get_header_length(fastx_handle h) -> uint64_t { if (h->is_fastq) { return fastq_get_header_length(h); } else { return fasta_get_header_length(h); } } auto fastx_get_sequence_length(fastx_handle h) -> uint64_t { if (h->is_fastq) { return fastq_get_sequence_length(h); } else { return fasta_get_sequence_length(h); } } auto fastx_get_quality(fastx_handle h) -> char * { if 
(h->is_fastq) { return fastq_get_quality(h); } else { return nullptr; } } auto fastx_get_abundance(fastx_handle h) -> int64_t { if (h->is_fastq) { return fastq_get_abundance(h); } else { return fasta_get_abundance(h); } } vsearch-2.30.0/src/fastx.h000066400000000000000000000110511476012147200153160ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // std::FILE #include // uint64_t constexpr auto byte_range = 256U; struct fastx_buffer_s { char * data; uint64_t length; uint64_t alloc; uint64_t position; }; auto buffer_init(struct fastx_buffer_s * buffer) -> void; auto buffer_free(struct fastx_buffer_s * buffer) -> void; auto buffer_extend(struct fastx_buffer_s * dest_buffer, char * source_buf, uint64_t len) -> void; auto buffer_makespace(struct fastx_buffer_s * buffer, uint64_t x) -> void; struct fastx_s { bool is_pipe; bool is_fastq; bool is_empty; std::FILE * fp; #ifdef HAVE_ZLIB_H gzFile fp_gz; #endif #ifdef HAVE_BZLIB_H BZFILE * fp_bz; #endif struct fastx_buffer_s file_buffer; struct fastx_buffer_s header_buffer; struct fastx_buffer_s sequence_buffer; struct fastx_buffer_s plusline_buffer; struct fastx_buffer_s quality_buffer; uint64_t file_size; uint64_t file_position; uint64_t lineno; uint64_t lineno_start; int64_t seqno; uint64_t stripped_all; uint64_t stripped[byte_range]; int format; }; using fastx_handle = struct fastx_s *; /* fastx input */ auto fastx_is_fastq(fastx_handle h) -> bool; auto fastx_is_empty(fastx_handle h) -> bool; auto fastx_is_pipe(fastx_handle h) -> bool; auto fastx_filter_header(fastx_handle h, bool truncateatspace) -> void; 
auto fastx_open(const char * filename) -> fastx_handle; auto fastx_close(fastx_handle h) -> void; auto fastx_next(fastx_handle h, bool truncateatspace, const unsigned char * char_mapping) -> bool; auto fastx_get_position(fastx_handle h) -> uint64_t; auto fastx_get_size(fastx_handle h) -> uint64_t; auto fastx_get_lineno(fastx_handle h) -> uint64_t; auto fastx_get_seqno(fastx_handle h) -> uint64_t; auto fastx_get_header(fastx_handle h) -> char *; auto fastx_get_sequence(fastx_handle h) -> char *; auto fastx_get_header_length(fastx_handle h) -> uint64_t; auto fastx_get_sequence_length(fastx_handle h) -> uint64_t; auto fastx_get_quality(fastx_handle h) -> char *; auto fastx_get_abundance(fastx_handle h) -> int64_t; auto fastx_file_fill_buffer(fastx_handle h) -> uint64_t; vsearch-2.30.0/src/filter.cc000066400000000000000000000460331476012147200156240ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "maps.h" #include // macros PRIu64 and PRId64 #include // std::pow #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose #include // std::exit, EXIT_FAILURE #include inline auto fastq_get_qual(char q) -> int { int const qual = q - opt_fastq_ascii; if (qual < opt_fastq_qmin) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", qual, opt_fastq_qmin); if (fp_log) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", qual, opt_fastq_qmin); } exit(EXIT_FAILURE); } else if (qual > opt_fastq_qmax) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", qual, opt_fastq_qmax); fprintf(stderr, "By default, quality values range from 0 to 41.\n" "To allow higher quality values, " "please use the option --fastq_qmax %d\n", qual); if (fp_log) { fprintf(fp_log, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", qual, opt_fastq_qmax); fprintf(fp_log, "By default, quality values range from 0 to 41.\n" "To allow higher quality values, " "please use the option --fastq_qmax %d\n", qual); } exit(EXIT_FAILURE); } return qual; } struct analysis_res { bool discarded = false; bool truncated = false; int start = 0; int length = 0; double ee = -1.0; }; auto analyse(fastx_handle h) -> struct analysis_res { struct analysis_res res; res.length = fastx_get_sequence_length(h); int64_t const old_length = res.length; /* strip left (5') end */ if (opt_fastq_stripleft < res.length) { res.start += opt_fastq_stripleft; res.length -= opt_fastq_stripleft; } else { res.start = res.length; res.length = 0; } /* strip right (3') end */ if (opt_fastq_stripright < res.length) { res.length -= opt_fastq_stripright; } else { res.length = 0; } /* truncate trailing (3') part */ if (opt_fastq_trunclen >= 0) { if (res.length > opt_fastq_trunclen) { res.length = opt_fastq_trunclen; } } /* truncate trailing (3') part, but keep if short 
*/ if (opt_fastq_trunclen_keep >= 0) { if (res.length > opt_fastq_trunclen_keep) { res.length = opt_fastq_trunclen_keep; } } if (h->is_fastq) { /* truncate by quality and expected errors (ee) */ res.ee = 0.0; static constexpr auto base = 10.0; char * q = fastx_get_quality(h) + res.start; for (int64_t i = 0; i < res.length; i++) { int const qual = fastq_get_qual(q[i]); auto const e = std::pow(base, -qual / base); res.ee += e; if ((qual <= opt_fastq_truncqual) || (res.ee > opt_fastq_truncee) || (res.ee > opt_fastq_truncee_rate * (i + 1))) { res.ee -= e; res.length = i; break; } if (qual < opt_fastq_minqual) { res.discarded = true; } } /* filter by expected errors (ee) */ if (res.ee > opt_fastq_maxee) { res.discarded = true; } if ((res.length > 0) && (res.ee / res.length > opt_fastq_maxee_rate)) { res.discarded = true; } } /* filter by length */ if ((opt_fastq_trunclen >= 0) && (res.length < opt_fastq_trunclen)) { res.discarded = true; } if (res.length < opt_fastq_minlen) { res.discarded = true; } if (res.length > opt_fastq_maxlen) { res.discarded = true; } /* filter by n's */ int64_t ncount = 0; char * p = fastx_get_sequence(h) + res.start; for (int64_t i = 0; i < res.length; i++) { int const pc = p[i]; if ((pc == 'N') || (pc == 'n')) { ++ncount; } } if (ncount > opt_fastq_maxns) { res.discarded = true; } /* filter by abundance */ int64_t const abundance = fastx_get_abundance(h); if (abundance < opt_minsize) { res.discarded = true; } if (abundance > opt_maxsize) { res.discarded = true; } res.truncated = res.length < old_length; return res; } auto filter(bool fastq_only, char * filename) -> void { static constexpr auto dbl_max = std::numeric_limits::max(); static constexpr auto long_min = std::numeric_limits::min(); if ((! opt_fastqout) && (! opt_fastaout) && (! opt_fastqout_discarded) && (! opt_fastaout_discarded) && (! opt_fastqout_rev) && (! opt_fastaout_rev) && (! opt_fastqout_discarded_rev) && (! 
opt_fastaout_discarded_rev)) { fatal("No output files specified"); } fastx_handle h1 = nullptr; fastx_handle h2 = nullptr; h1 = fastx_open(filename); if (! h1) { fatal("Unrecognized file type (not proper FASTA or FASTQ format)"); } if (! (h1->is_fastq || h1->is_empty)) { if (fastq_only) { fatal("FASTA input files not allowed with fastq_filter, consider using fastx_filter command instead"); } else if (opt_eeout || (opt_fastq_ascii != 33) || opt_fastq_eeout || (opt_fastq_maxee < dbl_max) || (opt_fastq_maxee_rate < dbl_max) || opt_fastqout || (opt_fastq_qmax < 41) || (opt_fastq_qmin > 0) || (opt_fastq_truncee < dbl_max) || (opt_fastq_truncee_rate < dbl_max) || (opt_fastq_truncqual < long_min) || (opt_fastq_minqual > 0) || opt_fastqout_discarded || opt_fastqout_discarded_rev || opt_fastqout_rev) { fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_minqual, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncee_rate, fastq_truncqual, fastqout_discarded, fastqout_discarded_rev, fastqout_rev"); } } uint64_t const filesize = fastx_get_size(h1); if (opt_reverse) { h2 = fastx_open(opt_reverse); if (! h2) { fatal("Unrecognized file type (not proper FASTA or FASTQ format) for reverse reads"); } if (h1->is_fastq != h2->is_fastq) { fatal("The forward and reverse input sequence must in the same format, either FASTA or FASTQ"); } } FILE * fp_fastaout = nullptr; FILE * fp_fastqout = nullptr; FILE * fp_fastaout_discarded = nullptr; FILE * fp_fastqout_discarded = nullptr; FILE * fp_fastaout_rev = nullptr; FILE * fp_fastqout_rev = nullptr; FILE * fp_fastaout_discarded_rev = nullptr; FILE * fp_fastqout_discarded_rev = nullptr; if (opt_fastaout) { fp_fastaout = fopen_output(opt_fastaout); if (! 
fp_fastaout) { fatal("Unable to open FASTA output file for writing"); } } if (opt_fastqout) { fp_fastqout = fopen_output(opt_fastqout); if (! fp_fastqout) { fatal("Unable to open FASTQ output file for writing"); } } if (opt_fastaout_discarded) { fp_fastaout_discarded = fopen_output(opt_fastaout_discarded); if (! fp_fastaout_discarded) { fatal("Unable to open FASTA output file for writing"); } } if (opt_fastqout_discarded) { fp_fastqout_discarded = fopen_output(opt_fastqout_discarded); if (! fp_fastqout_discarded) { fatal("Unable to open FASTQ output file for writing"); } } if (h2) { if (opt_fastaout_rev) { fp_fastaout_rev = fopen_output(opt_fastaout_rev); if (! fp_fastaout_rev) { fatal("Unable to open FASTA output file for writing"); } } if (opt_fastqout_rev) { fp_fastqout_rev = fopen_output(opt_fastqout_rev); if (! fp_fastqout_rev) { fatal("Unable to open FASTQ output file for writing"); } } if (opt_fastaout_discarded_rev) { fp_fastaout_discarded_rev = fopen_output(opt_fastaout_discarded_rev); if (! fp_fastaout_discarded_rev) { fatal("Unable to open FASTA output file for writing"); } } if (opt_fastqout_discarded_rev) { fp_fastqout_discarded_rev = fopen_output(opt_fastqout_discarded_rev); if (! fp_fastqout_discarded_rev) { fatal("Unable to open FASTQ output file for writing"); } } } progress_init("Reading input file", filesize); int64_t kept = 0; int64_t discarded = 0; int64_t truncated = 0; while (fastx_next(h1, false, chrmap_no_change)) { if (h2 && ! 
fastx_next(h2, false, chrmap_no_change)) { fatal("More forward reads than reverse reads"); } struct analysis_res res1; res1.ee = 0.0; struct analysis_res res2; res1 = analyse(h1); if (h2) { res2 = analyse(h2); } if (res1.discarded || res2.discarded) { /* discard the sequence(s) */ ++discarded; if (opt_fastaout_discarded) { fasta_print_general(fp_fastaout_discarded, nullptr, fastx_get_sequence(h1) + res1.start, res1.length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_abundance(h1), discarded, res1.ee, -1, -1, nullptr, 0.0); } if (opt_fastqout_discarded) { fastq_print_general(fp_fastqout_discarded, fastx_get_sequence(h1) + res1.start, res1.length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_quality(h1) + res1.start, fastx_get_abundance(h1), discarded, res1.ee); } if (h2) { if (opt_fastaout_discarded_rev) { fasta_print_general(fp_fastaout_discarded_rev, nullptr, fastx_get_sequence(h2) + res2.start, res2.length, fastx_get_header(h2), fastx_get_header_length(h2), fastx_get_abundance(h2), discarded, res2.ee, -1, -1, nullptr, 0.0); } if (opt_fastqout_discarded_rev) { fastq_print_general(fp_fastqout_discarded_rev, fastx_get_sequence(h2) + res2.start, res2.length, fastx_get_header(h2), fastx_get_header_length(h2), fastx_get_quality(h2) + res2.start, fastx_get_abundance(h2), discarded, res2.ee); } } } else { /* keep the sequence(s) */ ++kept; if (res1.truncated || res2.truncated) { ++truncated; } if (opt_fastaout) { fasta_print_general(fp_fastaout, nullptr, fastx_get_sequence(h1) + res1.start, res1.length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_abundance(h1), kept, res1.ee, -1, -1, nullptr, 0.0); } if (opt_fastqout) { fastq_print_general(fp_fastqout, fastx_get_sequence(h1) + res1.start, res1.length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_quality(h1) + res1.start, fastx_get_abundance(h1), kept, res1.ee); } if (h2) { if (opt_fastaout_rev) { fasta_print_general(fp_fastaout_rev, nullptr, 
fastx_get_sequence(h2) + res2.start, res2.length, fastx_get_header(h2), fastx_get_header_length(h2), fastx_get_abundance(h2), kept, res2.ee, -1, -1, nullptr, 0.0); } if (opt_fastqout_rev) { fastq_print_general(fp_fastqout_rev, fastx_get_sequence(h2) + res2.start, res2.length, fastx_get_header(h2), fastx_get_header_length(h2), fastx_get_quality(h2) + res2.start, fastx_get_abundance(h2), kept, res2.ee); } } } progress_update(fastx_get_position(h1)); } progress_done(); if (h2 && fastx_next(h2, false, chrmap_no_change)) { fatal("More reverse reads than forward reads"); } if (! opt_quiet) { fprintf(stderr, "%" PRId64 " sequences kept (of which %" PRId64 " truncated), %" PRId64 " sequences discarded.\n", kept, truncated, discarded); } if (opt_log) { fprintf(fp_log, "%" PRId64 " sequences kept (of which %" PRId64 " truncated), %" PRId64 " sequences discarded.\n", kept, truncated, discarded); } if (h2) { if (opt_fastaout_rev) { fclose(fp_fastaout_rev); } if (opt_fastqout_rev) { fclose(fp_fastqout_rev); } if (opt_fastaout_discarded_rev) { fclose(fp_fastaout_discarded_rev); } if (opt_fastqout_discarded_rev) { fclose(fp_fastqout_discarded_rev); } fastx_close(h2); } if (opt_fastaout) { fclose(fp_fastaout); } if (opt_fastqout) { fclose(fp_fastqout); } if (opt_fastaout_discarded) { fclose(fp_fastaout_discarded); } if (opt_fastqout_discarded) { fclose(fp_fastqout_discarded); } fastx_close(h1); } auto fastq_filter() -> void { filter(true, opt_fastq_filter); } auto fastx_filter() -> void { filter(false, opt_fastx_filter); } vsearch-2.30.0/src/filter.h000066400000000000000000000047471476012147200154740ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto fastq_filter() -> void; auto fastx_filter() -> void; vsearch-2.30.0/src/getseq.cc000066400000000000000000000405321476012147200156250ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Implement fastx_getseq, fastx_getseqs and fastx_getsubseq as described here: https://drive5.com/usearch/manual/cmd_fastx_getseqs.html */ #include "vsearch.h" #include "maps.h" #include // std::max, std::min #include // isalnum #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::snprintf, std::fileno, std::fgets #include // std::realloc, std::free #include // std::strlen, std::memset, std::strcpy, std::strstr #include // strdup, strcasecmp static int labels_alloc = 0; static int labels_count = 0; static int labels_longest = 0; static char * * labels_data = nullptr; auto read_labels_file(char * filename) -> void { FILE * fp_labels = fopen_input(filename); if (! 
fp_labels) { fatal("Unable to open labels file (%s)", filename); } xstat_t fs; if (xfstat(fileno(fp_labels), & fs)) { fatal("Unable to get status for labels file (%s)", filename); } bool const is_pipe = S_ISFIFO(fs.st_mode); // linuxism uint64_t file_size = 0; if (! is_pipe) { file_size = fs.st_size; } progress_init("Reading labels", file_size); while (true) { const int buffer_size = 1024; char buffer[buffer_size]; char * ret = fgets(buffer, buffer_size, fp_labels); if (ret) { int len = strlen(buffer); if ((len > 0) && (buffer[len - 1] == '\n')) { buffer[len - 1] = 0; --len; } labels_longest = std::max(len, labels_longest); if (labels_count + 1 > labels_alloc) { labels_alloc += 1024; labels_data = (char **) xrealloc(labels_data, labels_alloc * sizeof (char *)); if (! labels_data) { fatal("Unable to allocate memory for labels"); } } labels_data[labels_count++] = strdup(buffer); } else { break; } } fclose(fp_labels); progress_done(); if (labels_longest >= 1023) { if (! opt_quiet) { fprintf(stderr, "WARNING: Labels longer than 1023 characters are not supported\n"); } if (opt_log) { fprintf(fp_log, "WARNING: Labels longer than 1023 characters are not supported\n"); } } } auto free_labels() -> void { for (int i = 0; i < labels_count; i++) { free(labels_data[i]); } free(labels_data); labels_data = nullptr; } auto test_label_match(fastx_handle h) -> bool { char * header = fastx_get_header(h); int const hlen = fastx_get_header_length(h); char * field_buffer = nullptr; int field_len = 0; if (opt_label_field) { field_len = strlen(opt_label_field); int field_buffer_size = field_len + 2; if (opt_label_word) { field_buffer_size += strlen(opt_label_word); } else { field_buffer_size += labels_longest; } field_buffer = (char *) xmalloc(field_buffer_size); snprintf(field_buffer, field_buffer_size, "%s=", opt_label_field); } if (opt_label) { char * needle = opt_label; int const wlen = strlen(needle); if (opt_label_substr_match) { return xstrcasestr(header, needle); } else { return 
(hlen == wlen) && ! strcasecmp(header, needle); // strcasecmp is a linuxism } } else if (opt_labels) { if (opt_label_substr_match) { for (int i = 0; i < labels_count; i++) { if (xstrcasestr(header, labels_data[i])) { return true; } } } else { for (int i = 0; i < labels_count; i++) { char * needle = labels_data[i]; int const wlen = strlen(needle); if ((hlen == wlen) && ! strcasecmp(header, needle)) // strcasecmp is a linuxism { return true; } } } } else if (opt_label_word) { char * needle = opt_label_word; if (opt_label_field) { strcpy(field_buffer + field_len + 1, needle); needle = field_buffer; } int const wlen = strlen(needle); char * hit = header; while (true) { hit = strstr(hit, needle); if (hit) { if (opt_label_field) { /* check of field */ if (((hit == header) || (*(hit - 1) == ';')) && ((hit + wlen == header + hlen) || (*(hit + wlen) == ';'))) { return true; } } else { /* check of full word */ if (((hit == header) || (! isalnum(*(hit - 1)))) && ((hit + wlen == header + hlen) || (! isalnum(*(hit + wlen))))) { return true; } } ++hit; } else { break; } } } else if (opt_label_words) { for (int i = 0; i < labels_count; i++) { char * needle = labels_data[i]; if (opt_label_field) { strcpy(field_buffer + field_len + 1, needle); needle = field_buffer; } int const wlen = strlen(needle); char * hit = header; while (true) { hit = strstr(hit, needle); if (hit) { if (opt_label_field) { /* check of field */ if (((hit == header) || (*(hit - 1) == ';')) && ((hit + wlen == header + hlen) || (*(hit + wlen) == ';'))) { return true; } } else { /* check of full word */ if (((hit == header) || (! isalnum(*(hit - 1)))) && ((hit + wlen == header + hlen) || (! isalnum(*(hit + wlen))))) { return true; } } ++hit; } else { break; } } } } return false; } auto getseq(char * filename) -> void { if ((! opt_fastqout) && (! opt_fastaout) && (! opt_notmatched) && (! opt_notmatchedfq)) { fatal("No output files specified"); } if (opt_fastx_getseq) { if (! 
opt_label) { fatal("Missing label option"); } } else if (opt_fastx_getsubseq) { if (! opt_label) { fatal("Missing label option"); } if ((opt_subseq_start < 1) || (opt_subseq_end < 1)) { fatal("The argument to options subseq_start and subseq_end must be at least 1"); } if (opt_subseq_start > opt_subseq_end) { fatal("The argument to option subseq_start must be equal or less than to subseq_end"); } } else if (opt_fastx_getseqs) { int label_options = 0; if (opt_label) { ++label_options; } if (opt_labels) { ++label_options; } if (opt_label_word) { ++label_options; } if (opt_label_words) { ++label_options; } if (label_options != 1) { fatal("Specify one label option (label, labels, label_word or label_words)"); } if (opt_labels) { read_labels_file(opt_labels); } if (opt_label_words) { read_labels_file(opt_label_words); } } fastx_handle h1 = nullptr; h1 = fastx_open(filename); if (! h1) { fatal("Unrecognized file type (not proper FASTA or FASTQ format)"); } if ((opt_fastqout || opt_notmatchedfq) && ! (h1->is_fastq || h1->is_empty)) { fatal("Cannot write FASTQ output from FASTA input"); } uint64_t const filesize = fastx_get_size(h1); FILE * fp_fastaout = nullptr; FILE * fp_fastqout = nullptr; FILE * fp_notmatched = nullptr; FILE * fp_notmatchedfq = nullptr; if (opt_fastaout) { fp_fastaout = fopen_output(opt_fastaout); if (! fp_fastaout) { fatal("Unable to open FASTA output file for writing"); } } if (opt_fastqout) { fp_fastqout = fopen_output(opt_fastqout); if (! fp_fastqout) { fatal("Unable to open FASTQ output file for writing"); } } if (opt_notmatched) { fp_notmatched = fopen_output(opt_notmatched); if (! fp_notmatched) { fatal("Unable to open FASTA output file (notmatched) for writing"); } } if (opt_notmatchedfq) { fp_notmatchedfq = fopen_output(opt_notmatchedfq); if (! 
fp_notmatchedfq) { fatal("Unable to open FASTQ output file (notmatchedfq) for writing"); } } progress_init("Extracting sequences", filesize); int64_t kept = 0; int64_t discarded = 0; while (fastx_next(h1, ! opt_notrunclabels, chrmap_no_change)) { bool const match = test_label_match(h1); int64_t start = 1; int64_t end = fastx_get_sequence_length(h1); if (opt_fastx_getsubseq) { start = std::max(opt_subseq_start, start); end = std::min(opt_subseq_end, end); } int64_t const length = end - start + 1; if (match) { /* keep the sequence(s) */ ++kept; if (opt_fastaout) { fasta_print_general(fp_fastaout, nullptr, fastx_get_sequence(h1) + start - 1, length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_abundance(h1), kept, -1.0, -1, -1, nullptr, 0.0); } if (opt_fastqout) { fastq_print_general(fp_fastqout, fastx_get_sequence(h1) + start - 1, length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_quality(h1) + start - 1, fastx_get_abundance(h1), kept, -1.0); } } else { /* discard the sequence */ ++discarded; if (opt_notmatched) { fasta_print_general(fp_notmatched, nullptr, fastx_get_sequence(h1) + start - 1, length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_abundance(h1), discarded, -1.0, -1, -1, nullptr, 0.0); } if (opt_notmatchedfq) { fastq_print_general(fp_notmatchedfq, fastx_get_sequence(h1) + start - 1, length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_quality(h1) + start - 1, fastx_get_abundance(h1), discarded, -1.0); } } progress_update(fastx_get_position(h1)); } progress_done(); if (! 
opt_quiet) { fprintf(stderr, "%" PRId64 " of %" PRId64 " sequences extracted", kept, kept + discarded); if (kept + discarded > 0) { fprintf(stderr, " (%.1lf%%)", 100.0 * kept / (kept + discarded)); } fprintf(stderr, "\n"); } if (opt_log) { fprintf(fp_log, "%" PRId64 " of %" PRId64 " sequences extracted", kept, kept + discarded); if (kept + discarded > 0) { fprintf(fp_log, " (%.1lf%%)", 100.0 * kept / (kept + discarded)); } fprintf(fp_log, "\n"); } if (opt_fastaout) { fclose(fp_fastaout); } if (opt_fastqout) { fclose(fp_fastqout); } if (opt_notmatched) { fclose(fp_notmatched); } if (opt_notmatchedfq) { fclose(fp_notmatchedfq); } fastx_close(h1); if (opt_labels || opt_label_words) { free_labels(); } } auto fastx_getseq() -> void { getseq(opt_fastx_getseq); } auto fastx_getseqs() -> void { getseq(opt_fastx_getseqs); } auto fastx_getsubseq() -> void { getseq(opt_fastx_getsubseq); } vsearch-2.30.0/src/getseq.h000066400000000000000000000050101476012147200154570ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Entry points of the sequence extraction commands. Each one takes no
   arguments and returns nothing; its definition forwards the matching
   global option to the shared getseq() worker. */

/* Runs the worker as getseq(opt_fastx_getseq). */
auto fastx_getseq() -> void;

/* Runs the worker as getseq(opt_fastx_getseqs). */
auto fastx_getseqs() -> void;

/* Runs the worker as getseq(opt_fastx_getsubseq). */
auto fastx_getsubseq() -> void;
vsearch-2.30.0/src/kmerhash.cc000066400000000000000000000171301476012147200161350ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved.
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "maps.h" #include // std::memset #include #define HASH CityHash64 struct kh_bucket_s { unsigned int kmer; unsigned int pos; /* 1-based position, 0 = empty */ }; struct kh_handle_s { struct kh_bucket_s * hash; unsigned int hash_mask; int size; int alloc; int maxpos; }; auto kh_init() -> struct kh_handle_s * { auto * kh = (struct kh_handle_s *) xmalloc(sizeof(struct kh_handle_s)); kh->maxpos = 0; kh->alloc = 256; kh->size = 0; kh->hash_mask = kh->alloc - 1; kh->hash = (struct kh_bucket_s *) xmalloc(kh->alloc * sizeof(struct kh_bucket_s)); return kh; } auto kh_exit(struct kh_handle_s * kh) -> void { if (kh->hash) { xfree(kh->hash); } xfree(kh); } inline auto kh_insert_kmer(struct kh_handle_s * kh, int k, unsigned int kmer, unsigned int pos) -> void { /* find free bucket in hash */ unsigned int j = HASH((char *) &kmer, (k + 3) / 4) & kh->hash_mask; while(kh->hash[j].pos) { j = (j + 1) & kh->hash_mask; } kh->hash[j].kmer = kmer; kh->hash[j].pos = pos; } auto kh_insert_kmers(struct kh_handle_s * kh, int k, char * seq, int len) -> void { int const kmers = 1U << (2U * k); unsigned int const kmer_mask = kmers - 1; /* reallocate hash table if necessary */ if (kh->alloc < 2 * len) { while (kh->alloc < 2 * len) { kh->alloc *= 2; } kh->hash = (struct kh_bucket_s *) xrealloc(kh->hash, kh->alloc * sizeof(struct kh_bucket_s)); } kh->size = 1; while (kh->size < 2 * len) { kh->size *= 2; } kh->hash_mask = kh->size - 1; kh->maxpos = len; 
memset(kh->hash, 0, kh->size * sizeof(struct kh_bucket_s)); unsigned int bad = kmer_mask; unsigned int kmer = 0; char * s = seq; unsigned int * maskmap = chrmap_mask_ambig; for (int pos = 0; pos < len; pos++) { int const c = *s++; bad <<= 2ULL; bad |= maskmap[c]; bad &= kmer_mask; kmer <<= 2ULL; kmer |= chrmap_2bit[c]; kmer &= kmer_mask; if (! bad) { /* 1-based pos of start of kmer */ kh_insert_kmer(kh, k, kmer, pos - k + 1 + 1); } } } auto kh_find_best_diagonal(struct kh_handle_s * kh, int k, char * seq, int len) -> int { std::vector diag_counts(kh->maxpos, 0); int const kmers = 1U << (2U * k); unsigned int const kmer_mask = kmers - 1; unsigned int bad = kmer_mask; unsigned int kmer = 0; char * s = seq + len - 1; unsigned int * maskmap = chrmap_mask_ambig; for (int pos = 0; pos < len; pos++) { int const c = *s--; bad <<= 2ULL; bad |= maskmap[c]; bad &= kmer_mask; kmer <<= 2ULL; kmer |= chrmap_2bit[chrmap_complement[c]]; kmer &= kmer_mask; if (! bad) { /* find matching buckets in hash */ unsigned int j = HASH((char *) &kmer, (k + 3) / 4) & kh->hash_mask; while(kh->hash[j].pos) { if (kh->hash[j].kmer == kmer) { int const fpos = kh->hash[j].pos - 1; int const diag = fpos - (pos - k + 1); if (diag >= 0) { diag_counts[diag]++; } } j = (j + 1) & kh->hash_mask; } } } int best_diag_count = -1; int best_diag = -1; int good_diags = 0; for (int d = 0; d < kh->maxpos - k + 1; d++) { int const diag_len = kh->maxpos - d; int const minmatch = MAX(1, diag_len - k + 1 - (k * MAX(diag_len / 20, 0))); int const c = diag_counts[d]; if (c >= minmatch) { good_diags++; } if (c > best_diag_count) { best_diag_count = c; best_diag = d; } } if (good_diags == 1) { return best_diag; } else { return -1; } } auto kh_find_diagonals(struct kh_handle_s * kh, int k, char * seq, int len, int * diags) -> void { memset(diags, 0, (kh->maxpos+len) * sizeof(int)); int const kmers = 1U << (2U * k); unsigned int const kmer_mask = kmers - 1; unsigned int bad = kmer_mask; unsigned int kmer = 0; char * s = 
seq + len - 1; for (int pos = 0; pos < len; pos++) { int const c = *s--; bad <<= 2ULL; bad |= chrmap_mask_ambig[c]; bad &= kmer_mask; kmer <<= 2ULL; kmer |= chrmap_2bit[chrmap_complement[c]]; kmer &= kmer_mask; if (! bad) { /* find matching buckets in hash */ unsigned int j = HASH((char *) &kmer, (k + 3) / 4) & kh->hash_mask; while(kh->hash[j].pos) { if (kh->hash[j].kmer == kmer) { int const fpos = kh->hash[j].pos - 1; int const diag = len + fpos - (pos - k + 1); if (diag >= 0) { diags[diag]++; } } j = (j + 1) & kh->hash_mask; } } } } vsearch-2.30.0/src/kmerhash.h000066400000000000000000000056051476012147200160030ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Opaque handle of a k-mer hash table; the layout is private to the
   implementation file. */
struct kh_handle_s;

/* Allocate a handle with an initial table of 256 buckets. The buckets are
   only cleared by kh_insert_kmers(), so call that before any lookup. */
auto kh_init() -> struct kh_handle_s *;

/* Free the bucket table and the handle. */
auto kh_exit(struct kh_handle_s * kh) -> void;

/* Discard the current table content and index the k-mers of seq[0..len),
   growing the table to at least 2 * len buckets. K-mers whose window holds a
   symbol flagged by chrmap_mask_ambig are not stored. */
auto kh_insert_kmers(struct kh_handle_s * kh, int k, char * seq, int len) -> void;

/* Read seq back-to-front through chrmap_complement, count shared k-mers per
   diagonal against the indexed sequence, and return the diagonal with the
   highest count when exactly one diagonal reaches its minimum-match
   threshold; return -1 otherwise. */
auto kh_find_best_diagonal(struct kh_handle_s * kh, int k, char * seq, int len) -> int;

/* Same scan as kh_find_best_diagonal(), but write the per-diagonal counts to
   diags. The function zeroes (indexed length + len) ints of diags first, so
   the caller must supply a buffer of at least that size. */
auto kh_find_diagonals(struct kh_handle_s * kh, int k, char * seq, int len, int * diags) -> void;
vsearch-2.30.0/src/linmemalign.cc000066400000000000000000000504141476012147200166310ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved.
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "maps.h" #include // std::max #include // macros PRIu64 and PRId64 #include // int64_t #include // std::FILE, std::printf, std::size_t, std::snprintf, std::sscanf #include /* Compute the optimal global alignment of two sequences in linear space using the divide and conquer method. These functions are based on the following articles: - Hirschberg (1975) Comm ACM 18:341-343 - Myers & Miller (1988) CABIOS 4:11-17 The method has been adapted for the use of different gap penalties for query/target/left/interior/right gaps. 
scorematrix consists of 16x16 int64_t integers Sequences and alignment matrix: A/a/i/query/q/downwards/vertical/top/bottom B/b/j/target/t/rightwards/horizontal/left/right f corresponds to score ending with gap in A/query EE corresponds to score ending with gap in B/target */ LinearMemoryAligner::LinearMemoryAligner() { scorematrix = nullptr; cigar_alloc = 0; cigar_string = nullptr; vector_alloc = 0; HH = nullptr; EE = nullptr; XX = nullptr; YY = nullptr; } LinearMemoryAligner::~LinearMemoryAligner() { if (cigar_string) { xfree(cigar_string); } if (HH) { xfree(HH); } if (EE) { xfree(EE); } if (XX) { xfree(XX); } if (YY) { xfree(YY); } } auto LinearMemoryAligner::scorematrix_create(int64_t match, int64_t mismatch) -> int64_t * { auto * newscorematrix = (int64_t *) xmalloc(16 * 16 * sizeof(int64_t)); for (int i = 0; i < 16; i++) { for (int j = 0; j < 16; j++) { int64_t value = 0; if (opt_n_mismatch && ((i == 15) || (j == 15))) { value = mismatch; } else if (ambiguous_4bit[i] || ambiguous_4bit[j]) { value = 0; } else if (i == j) { value = match; } else { value = mismatch; } newscorematrix[(16 * i) + j] = value; } } return newscorematrix; } auto LinearMemoryAligner::alloc_vectors(size_t x) -> void { if (vector_alloc < x) { vector_alloc = x; if (HH) { xfree(HH); } if (EE) { xfree(EE); } if (XX) { xfree(XX); } if (YY) { xfree(YY); } HH = (int64_t *) xmalloc(vector_alloc * (sizeof(int64_t))); EE = (int64_t *) xmalloc(vector_alloc * (sizeof(int64_t))); XX = (int64_t *) xmalloc(vector_alloc * (sizeof(int64_t))); YY = (int64_t *) xmalloc(vector_alloc * (sizeof(int64_t))); } } auto LinearMemoryAligner::cigar_reset() -> void { if (cigar_alloc < 1) { cigar_alloc = 64; cigar_string = (char *) xrealloc(cigar_string, cigar_alloc); } cigar_string[0] = 0; cigar_length = 0; op = 0; op_run = 0; } auto LinearMemoryAligner::cigar_flush() -> void { if (op_run > 0) { while (true) { /* try writing string until enough memory has been allocated */ int64_t const rest = cigar_alloc - 
cigar_length; int n = 0; if (op_run > 1) { n = snprintf(cigar_string + cigar_length, rest, "%" PRId64 "%c", op_run, op); } else { n = snprintf(cigar_string + cigar_length, rest, "%c", op); } if (n < 0) { fatal("snprintf returned a negative number.\n"); } else if (n >= rest) { cigar_alloc += MAX(n - rest + 1, 64); cigar_string = (char *) xrealloc(cigar_string, cigar_alloc); } else { cigar_length += n; break; } } } } auto LinearMemoryAligner::cigar_add(char _op, int64_t run) -> void { if (op == _op) { op_run += run; } else { cigar_flush(); op = _op; op_run = run; } } auto LinearMemoryAligner::show_matrix() -> void { for (int i = 0; i < 16; i++) { printf("%2d:", i); for (int j = 0; j < 16; j++) { printf(" %2" PRId64, scorematrix[(16 * i) + j]); } printf("\n"); } } auto LinearMemoryAligner::diff(int64_t a_start, int64_t b_start, int64_t a_len, int64_t b_len, bool gap_b_left, /* gap open left of b */ bool gap_b_right, /* gap open right of b */ bool a_left, /* includes left end of a */ bool a_right, /* includes right end of a */ bool b_left, /* includes left end of b */ bool b_right) -> void /* includes right end of b */ { static constexpr auto long_min = std::numeric_limits::min(); if (b_len == 0) { /* B and possibly A is empty */ if (a_len > 0) { // Delete a_len from A // AAA // --- cigar_add('D', a_len); } } else if (a_len == 0) { /* A is empty, B is not */ // Delete b_len from B // --- // BBB cigar_add('I', b_len); } else if (a_len == 1) { /* Convert 1 symbol from A to b_len symbols from B b_len >= 1 */ int64_t MaxScore = 0; int64_t best = 0; int64_t Score = 0; /* First possibility */ // Delete 1 from A, Insert b_len from B // A---- // -BBBB /* gap penalty for gap in B of length 1 */ if (! gap_b_left) { Score -= b_left ? go_t_l : go_t_i; } Score -= b_left ? ge_t_l : ge_t_i; /* gap penalty for gap in A of length b_len */ Score -= a_right ? 
go_q_r + (b_len * ge_q_r) : go_q_i + (b_len * ge_q_i); MaxScore = Score; best = -1; /* Second possibility */ // Insert b_len from B, Delete 1 from A // ----A // BBBB- /* gap penalty for gap in A of length b_len */ Score -= a_left ? go_q_l + (b_len * ge_q_l) : go_q_i + (b_len * ge_q_i); /* gap penalty for gap in B of length 1 */ if (! gap_b_right) { Score -= b_right ? go_t_r : go_t_i; } Score -= b_right ? ge_t_r : ge_t_i; if (Score > MaxScore) { MaxScore = Score; best = b_len; } /* Third possibility */ for (int64_t j = 0; j < b_len; j++) { // Insert zero or more from B, replace 1, insert rest of B // -A-- // BBBB Score = 0; if (j > 0) { Score -= a_left ? go_q_l + (j * ge_q_l) : go_q_i + (j * ge_q_i); } Score += subst_score(a_start, b_start + j); if (j < b_len - 1) { Score -= a_right ? go_q_r + ((b_len - 1 - j) * ge_q_r) : go_q_i + ((b_len - 1 - j) * ge_q_i); } if (Score > MaxScore) { MaxScore = Score; best = j; } } if (best == -1) { cigar_add('D', 1); cigar_add('I', b_len); } else if (best == b_len) { cigar_add('I', b_len); cigar_add('D', 1); } else { if (best > 0) { cigar_add('I', best); } cigar_add('M', 1); if (best < b_len - 1) { cigar_add('I', b_len - 1 - best); } } } else { /* a_len >= 2, b_len >= 1 */ int64_t const I = a_len / 2; // Compute HH & EE in forward phase // Upper part /* initialize HH and EE for values corresponding to empty seq A vs B of j symbols, i.e. a gap of length j in A */ HH[0] = 0; EE[0] = 0; for (int64_t j = 1; j <= b_len; j++) { HH[j] = - (a_left ? go_q_l + (j * ge_q_l) : go_q_i + (j * ge_q_i)); EE[j] = long_min; } /* compute matrix */ for (int64_t i = 1; i <= I; i++) { int64_t p = HH[0]; int64_t h = - (b_left ? (gap_b_left ? 0 : go_t_l) + (i * ge_t_l) : (gap_b_left ? 
0 : go_t_i) + (i * ge_t_i)); HH[0] = h; int64_t f = long_min; for (int64_t j = 1; j <= b_len; j++) { f = MAX(f, h - go_q_i) - ge_q_i; if (b_right && (j == b_len)) { EE[j] = MAX(EE[j], HH[j] - go_t_r) - ge_t_r; } else { EE[j] = MAX(EE[j], HH[j] - go_t_i) - ge_t_i; } h = p + subst_score(a_start + i - 1, b_start + j - 1); h = std::max(f, h); h = std::max(EE[j], h); p = HH[j]; HH[j] = h; } } EE[0] = HH[0]; // Compute XX & YY in reverse phase // Lower part /* initialize XX and YY */ XX[0] = 0; YY[0] = 0; for (int64_t j = 1; j <= b_len; j++) { XX[j] = - (a_right ? go_q_r + (j * ge_q_r) : go_q_i + (j * ge_q_i)); YY[j] = long_min; } /* compute matrix */ for (int64_t i = 1; i <= a_len - I; i++) { int64_t p = XX[0]; int64_t h = - (b_right ? (gap_b_right ? 0 : go_t_r) + (i * ge_t_r) : (gap_b_right ? 0 : go_t_i) + (i * ge_t_i)); XX[0] = h; int64_t f = long_min; for (int64_t j = 1; j <= b_len; j++) { f = MAX(f, h - go_q_i) - ge_q_i; if (b_left && (j==b_len)) { YY[j] = MAX(YY[j], XX[j] - go_t_l) - ge_t_l; } else { YY[j] = MAX(YY[j], XX[j] - go_t_i) - ge_t_i; } h = p + subst_score(a_start + a_len - i, b_start + b_len - j); h = std::max(f, h); h = std::max(YY[j], h); p = XX[j]; XX[j] = h; } } YY[0] = XX[0]; /* find maximum score along division line */ int64_t MaxScore0 = long_min; int64_t best0 = -1; /* solutions with diagonal at break */ for (int64_t j = 0; j <= b_len; j++) { int64_t const Score = HH[j] + XX[b_len - j]; if (Score > MaxScore0) { MaxScore0 = Score; best0 = j; } } int64_t MaxScore1 = long_min; int64_t best1 = -1; /* solutions that end with a gap in b from both ends at break */ for (int64_t j = 0; j <= b_len; j++) { int64_t g = 0; if (b_left && (j == 0)) { g = go_t_l; } else if (b_right && (j == b_len)) { g = go_t_r; } else { g = go_t_i; } int64_t const Score = EE[j] + YY[b_len - j] + g; if (Score > MaxScore1) { MaxScore1 = Score; best1 = j; } } int64_t P = 0; int64_t best = 0; if (MaxScore0 > MaxScore1) { P = 0; best = best0; } else if (MaxScore1 > MaxScore0) { P = 
1; best = best1; } else { if (best0 <= best1) { P = 0; best = best0; } else { P = 1; best = best1; } } /* recursively compute upper left and lower right parts */ if (P == 0) { diff(a_start, b_start, I, best, gap_b_left, false, a_left, false, b_left, b_right && (best == b_len)); diff(a_start + I, b_start + best, a_len - I, b_len - best, false, gap_b_right, false, a_right, b_left && (best == 0), b_right); } else if (P == 1) { diff(a_start, b_start, I - 1, best, gap_b_left, true, a_left, false, b_left, b_right && (best == b_len)); cigar_add('D', 2); diff(a_start + I + 1, b_start + best, a_len - I - 1, b_len - best, true, gap_b_right, false, a_right, b_left && (best == 0), b_right); } } } auto LinearMemoryAligner::set_parameters(int64_t * _scorematrix, int64_t _gap_open_query_left, int64_t _gap_open_target_left, int64_t _gap_open_query_interior, int64_t _gap_open_target_interior, int64_t _gap_open_query_right, int64_t _gap_open_target_right, int64_t _gap_extension_query_left, int64_t _gap_extension_target_left, int64_t _gap_extension_query_interior, int64_t _gap_extension_target_interior, int64_t _gap_extension_query_right, int64_t _gap_extension_target_right) -> void { scorematrix = _scorematrix; /* a = query/q b = t/target */ go_q_l = _gap_open_query_left; go_t_l = _gap_open_target_left; go_q_i = _gap_open_query_interior; go_t_i = _gap_open_target_interior; go_q_r = _gap_open_query_right; go_t_r = _gap_open_target_right; ge_q_l = _gap_extension_query_left; ge_t_l = _gap_extension_target_left; ge_q_i = _gap_extension_query_interior; ge_t_i = _gap_extension_target_interior; ge_q_r = _gap_extension_query_right; ge_t_r = _gap_extension_target_right; q = _gap_open_query_interior; r = _gap_extension_query_interior; } auto LinearMemoryAligner::align(char * _a_seq, char * _b_seq, int64_t a_len, int64_t b_len) -> char * { /* copy parameters */ a_seq = _a_seq; b_seq = _b_seq; /* init cigar operations */ cigar_reset(); /* allocate enough memory for vectors */ 
alloc_vectors(b_len + 1); /* perform alignment */ diff(0, 0, a_len, b_len, false, false, true, true, true, true); /* ensure entire cigar has been written */ cigar_flush(); /* return cigar */ return cigar_string; } auto LinearMemoryAligner::alignstats(char * cigar, char * _a_seq, char * _b_seq, int64_t * _nwscore, int64_t * _nwalignmentlength, int64_t * _nwmatches, int64_t * _nwmismatches, int64_t * _nwgaps) -> void { a_seq = _a_seq; b_seq = _b_seq; int64_t nwscore = 0; int64_t nwalignmentlength = 0; int64_t nwmatches = 0; int64_t nwmismatches = 0; int64_t nwgaps = 0; int64_t a_pos = 0; int64_t b_pos = 0; char * p = cigar; int64_t g = 0; while (*p) { int64_t run = 1; int scanlength = 0; sscanf(p, "%" PRId64 "%n", &run, &scanlength); p += scanlength; switch (*p++) { case 'M': nwalignmentlength += run; for (int64_t k = 0; k < run; k++) { nwscore += subst_score(a_pos, b_pos); if (opt_n_mismatch && ((chrmap_4bit[(int) (a_seq[a_pos])] == 15) || (chrmap_4bit[(int) (b_seq[b_pos])] == 15))) { nwmismatches++; } else if (chrmap_4bit[(int)(a_seq[a_pos])] & chrmap_4bit[(int)(b_seq[b_pos])]) { nwmatches++; } else { nwmismatches++; } a_pos++; b_pos++; } break; case 'I': if ((a_pos == 0) && (b_pos == 0)) { g = go_q_l + run * ge_q_l; } else if (*p == 0) { g = go_q_r + run * ge_q_r; } else { g = go_q_i + run * ge_q_i; } nwscore -= g; nwgaps++; nwalignmentlength += run; b_pos += run; break; case 'D': if ((a_pos == 0) && (b_pos == 0)) { g = go_t_l + run * ge_t_l; } else if (*p == 0) { g = go_t_r + run * ge_t_r; } else { g = go_t_i + run * ge_t_i; } nwscore -= g; nwgaps++; nwalignmentlength += run; a_pos += run; break; } } *_nwscore = nwscore; *_nwalignmentlength = nwalignmentlength; *_nwmatches = nwmatches; *_nwmismatches = nwmismatches; *_nwgaps = nwgaps; } vsearch-2.30.0/src/linmemalign.h000066400000000000000000000126101476012147200164670ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and 
Tomas Flouri All rights reserved. Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "maps.h" #include // std::FILE, std::size_t #include // int64_t class LinearMemoryAligner { char op; int64_t op_run; int64_t cigar_alloc; int64_t cigar_length; char * cigar_string; char * a_seq; char * b_seq; int64_t * scorematrix; int64_t q; int64_t r; /* gap penalties for open/extension query/target left/interior/right */ int64_t go_q_l; int64_t go_t_l; int64_t go_q_i; int64_t go_t_i; int64_t go_q_r; int64_t go_t_r; int64_t ge_q_l; int64_t ge_t_l; int64_t ge_q_i; int64_t ge_t_i; int64_t ge_q_r; int64_t ge_t_r; std::size_t vector_alloc; int64_t * HH; int64_t * EE; int64_t * XX; int64_t * YY; auto cigar_reset() -> void; auto cigar_flush() -> void; auto cigar_add(char _op, int64_t run) -> void; auto subst_score(int64_t lhs_pos, int64_t rhs_pos) -> int64_t { /* return substitution score for replacing symbol at position lhs_pos in a with symbol at position rhs_pos in b */ constexpr auto offset = 16; return scorematrix[(chrmap_4bit[(int) (b_seq[rhs_pos])] * offset) + chrmap_4bit[(int) (a_seq[lhs_pos])]]; } auto diff(int64_t a_start, int64_t b_start, int64_t a_len, int64_t b_len, bool gap_b_left, /* gap open left of b */ bool gap_b_right, /* gap open right of b */ bool a_left, /* includes left end of a */ bool a_right, /* includes right end of a */ bool b_left, /* includes left end of b */ bool b_right) -> void; /* includes right end of b */ auto alloc_vectors(std::size_t x) -> void; auto show_matrix() -> void; public: LinearMemoryAligner(); 
~LinearMemoryAligner(); auto scorematrix_create(int64_t match, int64_t mismatch) -> int64_t *; auto set_parameters(int64_t * _scorematrix, int64_t _gap_open_query_left, int64_t _gap_open_target_left, int64_t _gap_open_query_interior, int64_t _gap_open_target_interior, int64_t _gap_open_query_right, int64_t _gap_open_target_right, int64_t _gap_extension_query_left, int64_t _gap_extension_target_left, int64_t _gap_extension_query_interior, int64_t _gap_extension_target_interior, int64_t _gap_extension_query_right, int64_t _gap_extension_target_right) -> void; auto align(char * _a_seq, char * _b_seq, int64_t a_len, int64_t b_len) -> char *; auto alignstats(char * cigar, char * a_seq, char * b_seq, int64_t * nwscore, int64_t * nwalignmentlength, int64_t * nwmatches, int64_t * nwmismatches, int64_t * nwgaps) -> void; }; vsearch-2.30.0/src/maps.cc000066400000000000000000000562521476012147200153030ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "vsearch.h"

/*
  legal symbols: *abcdefghiklmnpqrstuvxyz (all except j and o), also upper case
  fatal symbols: .-
  fatal: ascii 0-26 except tab (9), newline (10 and 13), vt (11), formfeed (12)
  stripped: !"#$&'()+,/0123456789:;<=>?@JO[\]^_`jo{|}~ and chrs 9-13, 127
  includes both amino acid and nucleotide sequences, adapt to nt only

  NOTE(review): the tables below do not match this summary exactly. They
  mark only the IUPAC nucleotide letters ABCDGHKMNRSTUVWY (both cases) as
  legal, and char_fasta_action treats every control code 0-31 other than
  9-13 as fatal. The summary above appears to predate that.
*/

/* symbolic names for some of the action codes stored in char_header_action
   (see the legend inside that table) */
constexpr auto illegal = 2;
constexpr auto tab = 5;
constexpr auto space = 6;
constexpr auto non_ascii = 7;

/* symbols indexed by 2-bit code (inverse of chrmap_2bit) and by 4-bit code
   (inverse of chrmap_4bit, see the encoding table further down) */
char sym_nt_2bit[] = "ACGT";
char sym_nt_4bit[] = "-ACMGRSVTWYHKDBN";
//                    |    |    |    |
//                    0....5...10...15

/* All 256-entry tables in this file are indexed by the byte value of a
   character. The letter rulers inside the comments name the characters of
   the two table rows that start at '@' (64) and 'P' (80). */

unsigned int char_header_action[256] =
  {
    /*
      FASTA/FASTQ header characters
      0 = null
      1 = legal, printable ascii
      2 = illegal, fatal
      3 = cr
      4 = lf
      5 = tab
      6 = space
      7 = non-ascii, legal, but warn

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    /* 0-31: control characters */
    0, illegal, illegal, illegal, illegal, illegal, illegal, illegal,
    illegal, tab, 4, illegal, illegal, 3, illegal, illegal,
    illegal, illegal, illegal, illegal, illegal, illegal, illegal, illegal,
    illegal, illegal, illegal, illegal, illegal, illegal, illegal, illegal,
    /* 32-127: space, printable ascii, DEL */
    space, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, illegal,
    /* 128-255: non-ascii */
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii,
    non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii, non_ascii
  };

unsigned int char_fasta_action[256] =
  {
    /*
      How to handle input characters for FASTA
      0=stripped, 1=legal, 2=fatal, 3=silently stripped, 4=newline

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 3, 3, 3, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
    0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
    0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };

unsigned int char_fq_action_seq[256] =
  {
    /*
      How to handle input characters for FASTQ:
      All IUPAC characters are valid. CR (^M) silently stripped.
      LF is newline. Rest is fatal
      0=stripped, 1=legal, 2=fatal, 3=silently stripped, 4=newline

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2,
    2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2,
    2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2,
    2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  };

unsigned int char_fq_action_qual[256] =
  {
    /*
      Quality characters, any from 33 to 126 is valid.
      CR (^M) silently stripped. LF is newline. Rest is fatal
      (same action codes as char_fq_action_seq)

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  };

unsigned int chrmap_2bit[256] =
  {
    /*
      Map from ascii to 2-bit nucleotide code

      Aa: 0
      Cc: 1
      Gg: 2
      TtUu: 3
      All others: 0

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };

/*
  New 4 bit ambiguous nucleic acid symbol encoding
  bit 0 = A
  bit 1 = C
  bit 2 = G
  bit 3 = T

  - =      = 0000 =  0
  A = A    = 0001 =  1
  C =  C   = 0010 =  2
  M = AC   = 0011 =  3
  G =   G  = 0100 =  4
  R = A G  = 0101 =  5
  S =  CG  = 0110 =  6
  V = ACG  = 0111 =  7
  T =    T = 1000 =  8
  W = A  T = 1001 =  9
  Y =  C T = 1010 = 10
  H = AC T = 1011 = 11
  K =   GT = 1100 = 12
  D = A GT = 1101 = 13
  B =  CGT = 1110 = 14
  N = ACGT = 1111 = 15
*/

/* indexed by 4-bit code: 1 for every code except the four single
   nucleotides A, C, G and T (note that code 0, '-', is also flagged) */
unsigned int ambiguous_4bit[16] =
  {
    /*
      -  A  C  M  G  R  S  V  T  W  Y  H  K  D  B  N
    */
    1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1
  };

unsigned int chrmap_4bit[256] =
  {
    /*
      Map from ascii to 4-bit nucleotide code

      Aa:  1
      Bb: 14
      Cc:  2
      Dd: 13
      Gg:  4
      Hh: 11
      Kk: 12
      Mm:  3
      Nn: 15
      Rr:  5
      Ss:  6
      Tt:  8
      Uu:  8
      Vv:  7
      Ww:  9
      Yy: 10
      Others: 0

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 0,
    0, 0, 5, 6, 8, 8, 7, 9, 0, 10, 0, 0, 0, 0, 0, 0,
    0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 0,
    0, 0, 5, 6, 8, 8, 7, 9, 0, 10, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };

unsigned int chrmap_mask_lower[256] =
  {
    /*
      Should character be masked and not used for search ?
      Mask everything but A, C, G, T and U.
      All lower case letters are masked (soft masking).

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
  };

unsigned int chrmap_mask_ambig[256] =
  {
    /*
      Should character be masked and not used for search ?
      Mask everything but A, C, G, T and U.
      Lower case letters are NOT masked.

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
  };

const unsigned char chrmap_complement[256] =
  {
    /*
      Map from ascii to ascii, complementary nucleotide
      (case is preserved for nucleotide letters; every character
      without a complement maps to upper case 'N')

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','T','V','G','H','N','N','C','D','N','N','M','N','K','N','N',
    'N','N','Y','S','A','A','B','W','N','R','N','N','N','N','N','N',
    'N','t','v','g','h','N','N','c','d','N','N','m','N','k','n','N',
    'N','N','y','s','a','a','b','w','N','r','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N'
  };

const unsigned char chrmap_normalize[256] =
  {
    /*
      Map from ascii to ascii
      Convert to upper case nucleotide, and replace U by T
      (every other character maps to 'N')

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','A','B','C','D','N','N','G','H','N','N','K','N','M','N','N',
    'N','N','R','S','T','T','V','W','N','Y','N','N','N','N','N','N',
    'N','A','B','C','D','N','N','G','H','N','N','K','N','M','N','N',
    'N','N','R','S','T','T','V','W','N','Y','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N'
  };

const unsigned char chrmap_upcase[256] =
  {
    /*
      Map from ascii to ascii
      Convert to upper case nucleotide
      (all 26 letters are upper-cased; non-letters map to 'N')

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
    'P','Q','R','S','T','U','V','W','X','Y','Z','N','N','N','N','N',
    'N','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
    'P','Q','R','S','T','U','V','W','X','Y','Z','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N'
  };

const unsigned char chrmap_no_change[256] =
  {
    /*
      Map from ascii to ascii - no change
      (letters keep their value and case; non-letters map to 'N')

    @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
    P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
    */

    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
    'P','Q','R','S','T','U','V','W','X','Y','Z','N','N','N','N','N',
    'N','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
    'p','q','r','s','t','u','v','w','x','y','z','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
    'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N'
  };

const unsigned char chrmap_identity[256] =
  {
    /* identity map */

    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
    0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
    0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
  };
vsearch-2.30.0/src/maps.h000066400000000000000000000066731476012147200151470ustar00rootroot00000000000000/*

  VSEARCH: a versatile open source tool for metagenomics

  Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
  All rights reserved.

  Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
  Department of Informatics, University of Oslo,
  PO Box 1080 Blindern, NO-0316 Oslo, Norway

  This software is dual-licensed and available under a choice
  of one of two licenses, either under the terms of the GNU
  General Public License version 3 or the BSD 2-Clause License.
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef MAPS_H
#define MAPS_H

/* Declarations of the character lookup tables defined in maps.cc.
   The byte_capacity-sized tables are indexed by the byte value of a
   character. */

/* number of distinct values of a 2-bit code, a 4-bit code and a byte */
constexpr auto two_bit_capacity = 4U;
constexpr auto four_bit_capacity = 16U;
constexpr auto byte_capacity = 256U;

/* nucleotide symbol for each 2-bit / 4-bit code (+ 1 for the final NUL) */
extern char sym_nt_2bit[two_bit_capacity + 1];
extern char sym_nt_4bit[four_bit_capacity + 1];

/* per 4-bit code: 0 for the single nucleotides A, C, G, T, 1 otherwise */
extern unsigned int ambiguous_4bit[four_bit_capacity];

/* action codes for characters found in FASTA/FASTQ headers */
extern unsigned int char_header_action[byte_capacity];
/* action codes (stripped, legal, fatal, silently stripped, newline) for
   characters in FASTA sequences, FASTQ sequences and FASTQ quality lines */
extern unsigned int char_fasta_action[byte_capacity];
extern unsigned int char_fq_action_seq[byte_capacity];
extern unsigned int char_fq_action_qual[byte_capacity];

/* ascii to 2-bit (A, C, G, T/U) and 4-bit (IUPAC) nucleotide codes */
extern unsigned int chrmap_2bit[byte_capacity];
extern unsigned int chrmap_4bit[byte_capacity];

/* 1 if the character is masked (not used for search): everything but
   A, C, G, T, U; the _lower variant additionally masks lower case */
extern unsigned int chrmap_mask_ambig[byte_capacity];
extern unsigned int chrmap_mask_lower[byte_capacity];

/* ascii to ascii maps: complementary nucleotide, upper case with U
   replaced by T, upper case, unchanged letters, and plain identity */
extern const unsigned char chrmap_complement[byte_capacity];
extern const unsigned char chrmap_normalize[byte_capacity];
extern const unsigned char chrmap_upcase[byte_capacity];
extern const unsigned char chrmap_no_change[byte_capacity];
extern const unsigned char chrmap_identity[byte_capacity];

#endif // MAPS_H
vsearch-2.30.0/src/mask.cc000066400000000000000000000276521476012147200153000ustar00rootroot00000000000000/*

  VSEARCH: a versatile open source tool for metagenomics

  Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
  All rights reserved.

  Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
  Department of Informatics, University of Oslo,
  PO Box 1080 Blindern, NO-0316 Oslo, Norway

  This software is dual-licensed and available under a choice
  of one of two licenses, either under the terms of the GNU
  General Public License version 3 or the BSD 2-Clause License.


  GNU General Public License version 3

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "maps.h" #include "mask.h" #include #include // std::toupper, std::isupper #include // int64_t, uint64_t #include // std::FILE #include // std::strcpy #include // #include constexpr int dust_window = 64; auto wo(int len, const char *s, int *beg, int *end) -> int { static constexpr int dust_word = 3; static constexpr int word_count = 1U << (2U * dust_word); // 64 static constexpr int bitmask = word_count - 1; const int l1 = len - dust_word + 1 - 5; /* smallest possible region is 8 */ if (l1 < 0) { return 0; } int bestv = 0; int besti = 0; int bestj = 0; std::array counts {{}}; std::array words {{}}; int word = 0; for (int j = 0; j < len; j++) { word <<= 2U; word |= chrmap_2bit[(int) (s[j])]; words[j] = word & bitmask; } for (int i = 0; i < l1; i++) { counts.fill(0); // reset counts to zero int sum = 0; for (int j = dust_word - 1; j < len - i; j++) { word = words[i + j]; const int c = counts[word]; if (c) { sum += c; const int v = 10 * sum / j; if (v > bestv) { bestv = v; besti = i; bestj = j; } } counts[word]++; } } *beg = besti; *end = besti + bestj; return bestv; } auto dust(char * seq, int len) -> void { static constexpr int dust_level = 20; static constexpr int half_dust_window = dust_window / 2; int a = 0; int b = 0; /* make a local copy of the original sequence */ char * local_seq = (char*) xmalloc(len + 1); strcpy(local_seq, seq); // refactoring: // std::string local_seq2; // local_seq2.reserve(len + 1); // local_seq2.insert(0, m); // local_seq2.insert(len, 1, '\0'); if (not opt_hardmask) { /* convert sequence to upper case unless hardmask in effect */ for(int i = 0; i < len; i++) { seq[i] = toupper(seq[i]); } seq[len] = 0; } for (int i = 0; i < len; i += half_dust_window) { const int l = (len > i + dust_window) ? 
dust_window : len - i; const int v = wo(l, local_seq + i, &a, &b); if (v > dust_level) { if (opt_hardmask) { for (int j = a + i; j <= b + i; j++) { seq[j] = 'N'; } } else { for (int j = a + i; j <= b + i; j++) { seq[j] = local_seq[j] | 32U; // check_5th_bit (0x20) } } if (b < half_dust_window) { i += half_dust_window - b; } } } xfree(local_seq); } static pthread_t * pthread; static pthread_attr_t attr; static pthread_mutex_t mutex; static int nextseq = 0; static int seqcount = 0; auto dust_all_worker(void * vp) -> void * { (void) vp; // not used, but required for thread creation while (true) { xpthread_mutex_lock(&mutex); const int seqno = nextseq; if (seqno < seqcount) { nextseq++; progress_update(seqno); xpthread_mutex_unlock(&mutex); dust(db_getsequence(seqno), db_getsequencelen(seqno)); } else { xpthread_mutex_unlock(&mutex); break; } } return nullptr; } auto dust_all() -> void { nextseq = 0; seqcount = db_getsequencecount(); progress_init("Masking", seqcount); xpthread_mutex_init(&mutex, nullptr); xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t)); for (int t = 0; t < opt_threads; t++) { xpthread_create(pthread + t, &attr, dust_all_worker, (void *) (int64_t) t); } for (int t = 0; t < opt_threads; t++) { xpthread_join(pthread[t], nullptr); } xfree(pthread); xpthread_attr_destroy(&attr); xpthread_mutex_destroy(&mutex); progress_done(); } auto hardmask(char * seq, int len) -> void { /* convert all lower case letters in seq to N */ static constexpr auto check_5th_bit = 32U; // 0x20 static constexpr auto hardmask_char = 'N'; for (int j = 0; j < len; j++) { if (seq[j] & check_5th_bit) { seq[j] = hardmask_char; } } } auto hardmask_all() -> void { for (uint64_t i = 0; i < db_getsequencecount(); i++) { hardmask(db_getsequence(i), db_getsequencelen(i)); } } auto maskfasta() -> void { if (! 
opt_output) { fatal("Output file for masking must be specified with --output"); } std::FILE * fp_output = fopen_output(opt_output); if (! fp_output) { fatal("Unable to open mask output file for writing"); } db_read(opt_maskfasta, 0); show_rusage(); seqcount = db_getsequencecount(); if (opt_qmask == MASK_DUST) { dust_all(); } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask)) { hardmask_all(); } show_rusage(); progress_init("Writing output", seqcount); for (int i = 0; i < seqcount; i++) { fasta_print_db_relabel(fp_output, i, i + 1); progress_update(i); } progress_done(); show_rusage(); db_free(); fclose(fp_output); } auto fastx_mask() -> void { std::FILE * fp_fastaout = nullptr; std::FILE * fp_fastqout = nullptr; if ((! opt_fastaout) && (! opt_fastqout)) { fatal("Specify output files for masking with --fastaout and/or --fastqout"); } if (opt_fastaout) { fp_fastaout = fopen_output(opt_fastaout); if (! fp_fastaout) { fatal("Unable to open mask output FASTA file for writing"); } } if (opt_fastqout) { fp_fastqout = fopen_output(opt_fastqout); if (! fp_fastqout) { fatal("Unable to open mask output FASTQ file for writing"); } } db_read(opt_fastx_mask, 0); show_rusage(); if (fp_fastqout && ! 
db_is_fastq()) { fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores"); } seqcount = db_getsequencecount(); if (opt_qmask == MASK_DUST) { dust_all(); } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask)) { hardmask_all(); } show_rusage(); int kept = 0; int discarded_less = 0; int discarded_more = 0; progress_init("Writing output", seqcount); for (int i = 0; i < seqcount; i++) { int unmasked = 0; char * seq = db_getsequence(i); const int len = db_getsequencelen(i); if (opt_qmask == MASK_NONE) { unmasked = len; } else if (opt_hardmask) { for (int j = 0; j < len; j++) { if (seq[j] != 'N') { unmasked++; } } } else { for (int j = 0; j < len; j++) { if (isupper(seq[j])) { unmasked++; } } } const double unmasked_pct = 100.0 * unmasked / len; if (unmasked_pct < opt_min_unmasked_pct) { discarded_less++; } else if (unmasked_pct > opt_max_unmasked_pct) { discarded_more++; } else { kept++; if (opt_fastaout) { fasta_print_general(fp_fastaout, nullptr, seq, len, db_getheader(i), db_getheaderlen(i), db_getabundance(i), kept, -1.0, -1, -1, nullptr, 0.0); } if (opt_fastqout) { fastq_print_general(fp_fastqout, seq, len, db_getheader(i), db_getheaderlen(i), db_getquality(i), db_getabundance(i), kept, -1.0); } } progress_update(i); } progress_done(); if (! 
opt_quiet) { if (opt_min_unmasked_pct > 0.0) { fprintf(stderr, "%d sequences with less than %.1lf%% unmasked residues discarded\n", discarded_less, opt_min_unmasked_pct); } if (opt_max_unmasked_pct < 100.0) { fprintf(stderr, "%d sequences with more than %.1lf%% unmasked residues discarded\n", discarded_more, opt_max_unmasked_pct); } fprintf(stderr, "%d sequences kept\n", kept); } if (opt_log) { if (opt_min_unmasked_pct > 0.0) { fprintf(fp_log, "%d sequences with less than %.1lf%% unmasked residues discarded\n", discarded_less, opt_min_unmasked_pct); } if (opt_max_unmasked_pct < 100.0) { fprintf(fp_log, "%d sequences with more than %.1lf%% unmasked residues discarded\n", discarded_more, opt_max_unmasked_pct); } fprintf(fp_log, "%d sequences kept\n", kept); } show_rusage(); db_free(); if (fp_fastaout) { fclose(fp_fastaout); } if (fp_fastqout) { fclose(fp_fastqout); } } vsearch-2.30.0/src/mask.h000066400000000000000000000054571476012147200151410ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // refactoring: enum struct Masking: int { error, none, dust, soft }; constexpr auto MASK_ERROR = -1; constexpr auto MASK_NONE = 0; constexpr auto MASK_DUST = 1; constexpr auto MASK_SOFT = 2; auto maskfasta() -> void; auto fastx_mask() -> void; auto dust(char * seq, int len) -> void; auto hardmask(char * seq, int len) -> void; auto dust_all() -> void; auto hardmask_all() -> void; vsearch-2.30.0/src/md5.c000066400000000000000000000244451476012147200146640ustar00rootroot00000000000000/* * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. * MD5 Message-Digest Algorithm (RFC 1321). 
* * Homepage: * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 * * Author: * Alexander Peslyak, better known as Solar Designer * * This software was written by Alexander Peslyak in 2001. No copyright is * claimed, and the software is hereby placed in the public domain. * In case this attempt to disclaim copyright and place the software in the * public domain is deemed null and void, then the software is * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the * general public under the following terms: * * Redistribution and use in source and binary forms, with or without * modification, are permitted. * * There's ABSOLUTELY NO WARRANTY, express or implied. * * (This is a heavily cut-down "BSD license".) * * This differs from Colin Plumb's older public domain implementation in that * no exactly 32-bit integer data type is required (any 32-bit or wider * unsigned integer data type will do), there's no compile-time endianness * configuration, and the function prototypes match OpenSSL's. No code from * Colin Plumb's implementation has been reused; this comment merely compares * the properties of the two independent implementations. * * The primary goals of this implementation are portability and ease of use. * It is meant to be fast, but not as fast as possible. Some known * optimizations are not included to reduce source code size and avoid * compile-time configuration. */ #ifndef HAVE_OPENSSL #include #include "md5.h" /* * The basic MD5 functions. * * F and G are optimized compared to their RFC 1321 definitions for * architectures that lack an AND-NOT instruction, just like in Colin Plumb's * implementation. */ #define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) #define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) #define H(x, y, z) ((x) ^ (y) ^ (z)) #define I(x, y, z) ((y) ^ ((x) | ~(z))) /* * The MD5 transformation for all four rounds. 
*/ #define STEP(f, a, b, c, d, x, t, s) \ (a) += f((b), (c), (d)) + (x) + (t); \ (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \ (a) += (b); /* * SET reads 4 input bytes in little-endian byte order and stores them * in a properly aligned word in host byte order. * * The check for little-endian architectures that tolerate unaligned * memory accesses is just an optimization. Nothing will break if it * doesn't work. */ #if defined(__i386__) || defined(__x86_64__) || defined(__vax__) #define SET(n) \ (*(MD5_u32plus *)&ptr[(n) * 4]) #define GET(n) \ SET(n) #else #define SET(n) \ (ctx->block[(n)] = \ (MD5_u32plus)ptr[(n) * 4] | \ ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \ ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \ ((MD5_u32plus)ptr[(n) * 4 + 3] << 24)) #define GET(n) \ (ctx->block[(n)]) #endif /* * This processes one or more 64-byte data blocks, but does NOT update * the bit counters. There are no alignment requirements. */ static void *body(MD5_CTX *ctx, void *data, unsigned long size) { unsigned char *ptr; MD5_u32plus a = 0; MD5_u32plus b = 0; MD5_u32plus c = 0; MD5_u32plus d = 0; MD5_u32plus saved_a = 0; MD5_u32plus saved_b = 0; MD5_u32plus saved_c = 0; MD5_u32plus saved_d = 0; ptr = data; a = ctx->a; b = ctx->b; c = ctx->c; d = ctx->d; do { saved_a = a; saved_b = b; saved_c = c; saved_d = d; /* Round 1 */ STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7) STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12) STEP(F, c, d, a, b, SET(2), 0x242070db, 17) STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22) STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7) STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12) STEP(F, c, d, a, b, SET(6), 0xa8304613, 17) STEP(F, b, c, d, a, SET(7), 0xfd469501, 22) STEP(F, a, b, c, d, SET(8), 0x698098d8, 7) STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12) STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17) STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22) STEP(F, a, b, c, d, SET(12), 0x6b901122, 7) STEP(F, d, a, b, c, SET(13), 0xfd987193, 12) STEP(F, c, d, a, b, SET(14), 0xa679438e, 17) 
STEP(F, b, c, d, a, SET(15), 0x49b40821, 22) /* Round 2 */ STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5) STEP(G, d, a, b, c, GET(6), 0xc040b340, 9) STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14) STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20) STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5) STEP(G, d, a, b, c, GET(10), 0x02441453, 9) STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14) STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20) STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5) STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9) STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14) STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20) STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5) STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9) STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14) STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20) /* Round 3 */ STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4) STEP(H, d, a, b, c, GET(8), 0x8771f681, 11) STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16) STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23) STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4) STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11) STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16) STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23) STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4) STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11) STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16) STEP(H, b, c, d, a, GET(6), 0x04881d05, 23) STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4) STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11) STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16) STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23) /* Round 4 */ STEP(I, a, b, c, d, GET(0), 0xf4292244, 6) STEP(I, d, a, b, c, GET(7), 0x432aff97, 10) STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15) STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21) STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6) STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10) STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15) STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21) STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6) STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10) STEP(I, c, d, a, b, GET(6), 0xa3014314, 15) STEP(I, b, c, d, a, 
GET(13), 0x4e0811a1, 21) STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6) STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10) STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15) STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21) a += saved_a; b += saved_b; c += saved_c; d += saved_d; ptr += 64; } while (size -= 64); ctx->a = a; ctx->b = b; ctx->c = c; ctx->d = d; return ptr; } void MD5_Init(MD5_CTX *ctx) { ctx->a = 0x67452301; ctx->b = 0xefcdab89; ctx->c = 0x98badcfe; ctx->d = 0x10325476; ctx->lo = 0; ctx->hi = 0; } void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size) { MD5_u32plus saved_lo = 0; unsigned long used = 0; unsigned long free = 0; saved_lo = ctx->lo; ctx->lo = (saved_lo + size) & 0x1fffffff; if (ctx->lo < saved_lo) { ctx->hi++; } ctx->hi += size >> 29; used = saved_lo & 0x3f; if (used) { free = 64 - used; if (size < free) { memcpy(&ctx->buffer[used], data, size); return; } memcpy(&ctx->buffer[used], data, free); data = (unsigned char *)data + free; size -= free; body(ctx, ctx->buffer, 64); } if (size >= 64) { data = body(ctx, data, size & ~(unsigned long)0x3f); size &= 0x3f; } memcpy(ctx->buffer, data, size); } void MD5_Final(unsigned char *result, MD5_CTX *ctx) { unsigned long used; unsigned long free; used = ctx->lo & 0x3f; ctx->buffer[used++] = 0x80; free = 64 - used; if (free < 8) { memset(&ctx->buffer[used], 0, free); body(ctx, ctx->buffer, 64); used = 0; free = 64; } memset(&ctx->buffer[used], 0, free - 8); ctx->lo <<= 3; ctx->buffer[56] = ctx->lo; ctx->buffer[57] = ctx->lo >> 8; ctx->buffer[58] = ctx->lo >> 16; ctx->buffer[59] = ctx->lo >> 24; ctx->buffer[60] = ctx->hi; ctx->buffer[61] = ctx->hi >> 8; ctx->buffer[62] = ctx->hi >> 16; ctx->buffer[63] = ctx->hi >> 24; body(ctx, ctx->buffer, 64); result[0] = ctx->a; result[1] = ctx->a >> 8; result[2] = ctx->a >> 16; result[3] = ctx->a >> 24; result[4] = ctx->b; result[5] = ctx->b >> 8; result[6] = ctx->b >> 16; result[7] = ctx->b >> 24; result[8] = ctx->c; result[9] = ctx->c >> 8; result[10] = ctx->c >> 16; result[11] 
= ctx->c >> 24; result[12] = ctx->d; result[13] = ctx->d >> 8; result[14] = ctx->d >> 16; result[15] = ctx->d >> 24; memset(ctx, 0, sizeof(*ctx)); } #endif vsearch-2.30.0/src/md5.h000066400000000000000000000027751476012147200146730ustar00rootroot00000000000000/* Slightly modified for vsearch by Torbjorn Rognes, 29 Sep 2015 */ /* * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. * MD5 Message-Digest Algorithm (RFC 1321). * * Homepage: * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 * * Author: * Alexander Peslyak, better known as Solar Designer * * This software was written by Alexander Peslyak in 2001. No copyright is * claimed, and the software is hereby placed in the public domain. * In case this attempt to disclaim copyright and place the software in the * public domain is deemed null and void, then the software is * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the * general public under the following terms: * * Redistribution and use in source and binary forms, with or without * modification, are permitted. * * There's ABSOLUTELY NO WARRANTY, express or implied. * * See md5.c for more information. */ #ifndef __MD5_H #define __MD5_H #ifdef __cplusplus extern "C" { #endif /* Any 32-bit or wider unsigned integer data type will do */ typedef unsigned int MD5_u32plus; typedef struct { MD5_u32plus lo, hi; MD5_u32plus a, b, c, d; unsigned char buffer[64]; MD5_u32plus block[16]; } MD5_CTX; extern void MD5_Init(MD5_CTX *ctx); extern void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size); extern void MD5_Final(unsigned char *result, MD5_CTX *ctx); #ifdef __cplusplus } #endif #endif /* __MD5_H */ vsearch-2.30.0/src/mergepairs.cc000066400000000000000000001236601476012147200164770ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "maps.h" #include #include // macros PRIu64 and PRId64 #include // std::pow, std::sqrt, std::round, std::log10, std::log2 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose #include // std::exit, EXIT_FAILURE #include // std::strcpy, std::strlen #include #include /* chunk constants */ constexpr auto chunk_size = 500; /* read pairs per chunk */ constexpr auto chunk_factor = 2; /* chunks per thread */ /* scores in bits */ static const int k = 5; static int merge_mindiagcount = 4; static double merge_minscore = 16.0; static const double merge_dropmax = 16.0; static const double merge_mismatchmax = -4.0; /* static variables */ static std::FILE * fp_fastqout = nullptr; static std::FILE * fp_fastaout = nullptr; static std::FILE * fp_fastqout_notmerged_fwd = nullptr; static std::FILE * fp_fastqout_notmerged_rev = nullptr; static std::FILE * fp_fastaout_notmerged_fwd = nullptr; static std::FILE * fp_fastaout_notmerged_rev = nullptr; static std::FILE * fp_eetabbedout = nullptr; static fastx_handle fastq_fwd; static fastx_handle fastq_rev; static int64_t merged = 0; static int64_t notmerged = 0; static int64_t total = 0; static double sum_read_length = 0.0; static double sum_squared_fragment_length = 0.0; static double sum_fragment_length = 0.0; static pthread_t * pthread; static pthread_attr_t attr; constexpr auto n_quality_symbols = 128U; static char merge_qual_same[n_quality_symbols][n_quality_symbols]; 
static char merge_qual_diff[n_quality_symbols][n_quality_symbols]; static double match_score[n_quality_symbols][n_quality_symbols]; static double mism_score[n_quality_symbols][n_quality_symbols]; static double q2p[n_quality_symbols]; static double sum_ee_fwd = 0.0; static double sum_ee_rev = 0.0; static double sum_ee_merged = 0.0; static uint64_t sum_errors_fwd = 0.0; static uint64_t sum_errors_rev = 0.0; static uint64_t failed_undefined = 0; static uint64_t failed_minlen = 0; static uint64_t failed_maxlen = 0; static uint64_t failed_maxns = 0; static uint64_t failed_minovlen = 0; static uint64_t failed_maxdiffs = 0; static uint64_t failed_maxdiffpct = 0; static uint64_t failed_staggered = 0; static uint64_t failed_indel = 0; static uint64_t failed_repeat = 0; static uint64_t failed_minmergelen = 0; static uint64_t failed_maxmergelen = 0; static uint64_t failed_maxee = 0; static uint64_t failed_minscore = 0; static uint64_t failed_nokmers = 0; /* reasons for not merging: - undefined - ok - input seq too short (after truncation) - input seq too long - too many Ns in input - overlap too short - too many differences (maxdiffs) - too high percentage of differences (maxdiffpct) - staggered - indels in overlap region - potential repeats in overlap region / multiple overlaps - merged sequence too short - merged sequence too long - expected error too high - alignment score too low, insignificant, potential indel - too few kmers on same diag found */ enum reason_enum { undefined, ok, minlen, maxlen, maxns, minovlen, maxdiffs, maxdiffpct, staggered, indel, repeat, minmergelen, maxmergelen, maxee, minscore, nokmers }; enum state_enum { empty, filled, inprogress, processed }; struct merge_data_s { char * fwd_header; char * rev_header; char * fwd_sequence; char * rev_sequence; char * fwd_quality; char * rev_quality; int64_t header_alloc; int64_t seq_alloc; int64_t fwd_length; int64_t rev_length; int64_t fwd_trunc; int64_t rev_trunc; int64_t pair_no; char * merged_sequence; char 
* merged_quality; int64_t merged_length; int64_t merged_seq_alloc; double ee_merged; double ee_fwd; double ee_rev; int64_t fwd_errors; int64_t rev_errors; int64_t offset; bool merged; reason_enum reason; state_enum state; }; using merge_data_t = struct merge_data_s; struct chunk_s { int size; /* size of merge_data = number of pairs of reads */ state_enum state; /* state of chunk: empty, read, processed */ merge_data_t * merge_data; /* data for merging */ }; using chunk_t = struct chunk_s; static chunk_t * chunks; /* pointer to array of chunks */ static int chunk_count; static int chunk_read_next; static int chunk_process_next; static int chunk_write_next; static bool finished_reading = false; static bool finished_all = false; static int pairs_read = 0; static int pairs_written = 0; static pthread_mutex_t mutex_chunks; static pthread_cond_t cond_chunks; auto fileopenw(char * filename) -> std::FILE * { std::FILE * fp = nullptr; fp = fopen_output(filename); if (! fp) { fatal("Unable to open file for writing (%s)", filename); } return fp; } inline auto get_qual(char q) -> int { int const qual = q - opt_fastq_ascii; if (qual < opt_fastq_qmin) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", qual, opt_fastq_qmin); if (fp_log) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", qual, opt_fastq_qmin); } exit(EXIT_FAILURE); } else if (qual > opt_fastq_qmax) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", qual, opt_fastq_qmax); fprintf(stderr, "By default, quality values range from 0 to 41.\n" "To allow higher quality values, " "please use the option --fastq_qmax %d\n", qual); if (fp_log) { fprintf(fp_log, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", qual, opt_fastq_qmax); fprintf(fp_log, "By default, quality values range from 0 to 41.\n" "To allow higher quality values, " "please use the option --fastq_qmax %d\n", qual); } 
exit(EXIT_FAILURE); } return qual; } inline auto q_to_p(int quality_symbol) -> double { static constexpr int low_quality_threshold = 2; static constexpr double max_probability = 0.75; static constexpr double quality_divider = 10.0; static constexpr double power_base = 10.0; assert(quality_symbol >= 33); assert(quality_symbol <= 126); const auto quality_value = static_cast(quality_symbol - opt_fastq_ascii); // refactor: extract branch to a separate operation if (quality_value < low_quality_threshold) { return max_probability; } // probability = 10^-(quality / 10) return std::pow(power_base, -quality_value / quality_divider); } auto precompute_qual() -> void { /* Precompute tables of scores etc */ for (int x = 33; x <= 126; x++) { double const px = q_to_p(x); q2p[x] = px; for (int y = 33; y <= 126; y++) { double const py = q_to_p(y); double p = 0.0; double q = 0.0; /* Quality score equations from Edgar & Flyvbjerg (2015) */ /* Match */ p = px * py / 3.0 / (1.0 - px - py + 4.0 * px * py / 3.0); q = round(-10.0 * log10(p)); q = MIN(q, opt_fastq_qmaxout); q = MAX(q, opt_fastq_qminout); merge_qual_same[x][y] = opt_fastq_ascii + q; /* Mismatch, x is highest quality */ p = px * (1.0 - py / 3.0) / (px + py - 4.0 * px * py / 3.0); q = round(-10.0 * log10(p)); q = MIN(q, opt_fastq_qmaxout); q = MAX(q, opt_fastq_qminout); merge_qual_diff[x][y] = opt_fastq_ascii + q; /* observed match, p = probability that they truly are identical, given error probabilites of px and py, resp. */ // Given two initially identical aligned bases, and // the error probabilities px and py, // what is the probability of observing a match (or a mismatch)? 
p = 1.0 - px - py + px * py * 4.0 / 3.0; match_score[x][y] = log2(p/0.25); // Use a minimum mismatch penalty mism_score[x][y] = MIN(log2((1.0-p)/0.75), merge_mismatchmax); } } } auto merge_sym(char * sym, char * qual, char fwd_sym, char rev_sym, char fwd_qual, char rev_qual) -> void { if (rev_sym == 'N') { * sym = fwd_sym; * qual = fwd_qual; } else if (fwd_sym == 'N') { * sym = rev_sym; * qual = rev_qual; } else if (fwd_sym == rev_sym) { /* agreement */ * sym = fwd_sym; * qual = merge_qual_same[(unsigned)fwd_qual][(unsigned)rev_qual]; } else { /* disagreement */ if (fwd_qual > rev_qual) { * sym = fwd_sym; * qual = merge_qual_diff[(unsigned)fwd_qual][(unsigned)rev_qual]; } else { * sym = rev_sym; * qual = merge_qual_diff[(unsigned)rev_qual][(unsigned)fwd_qual]; } } } auto keep(merge_data_t * ip) -> void { merged++; sum_fragment_length += ip->merged_length; sum_squared_fragment_length += ip->merged_length * ip->merged_length; sum_ee_merged += ip->ee_merged; sum_ee_fwd += ip->ee_fwd; sum_ee_rev += ip->ee_rev; sum_errors_fwd += ip->fwd_errors; sum_errors_rev += ip->rev_errors; if (opt_fastqout) { fastq_print_general(fp_fastqout, ip->merged_sequence, ip->merged_length, ip->fwd_header, strlen(ip->fwd_header), ip->merged_quality, 0, merged, ip->ee_merged); } if (opt_fastaout) { fasta_print_general(fp_fastaout, nullptr, ip->merged_sequence, ip->merged_length, ip->fwd_header, strlen(ip->fwd_header), 0, merged, ip->ee_merged, -1, -1, nullptr, 0.0); } if (opt_eetabbedout) { fprintf(fp_eetabbedout, "%.2lf\t%.2lf\t%" PRId64 "\t%" PRId64 "\n", ip->ee_fwd, ip->ee_rev, ip->fwd_errors, ip->rev_errors); } } auto discard(merge_data_t * ip) -> void { switch(ip->reason) { case undefined: failed_undefined++; break; case ok: break; case minlen: failed_minlen++; break; case maxlen: failed_maxlen++; break; case maxns: failed_maxns++; break; case minovlen: failed_minovlen++; break; case maxdiffs: failed_maxdiffs++; break; case maxdiffpct: failed_maxdiffpct++; break; case staggered: 
failed_staggered++; break; case indel: failed_indel++; break; case repeat: failed_repeat++; break; case minmergelen: failed_minmergelen++; break; case maxmergelen: failed_maxmergelen++; break; case maxee: failed_maxee++; break; case minscore: failed_minscore++; break; case nokmers: failed_nokmers++; break; } notmerged++; if (opt_fastqout_notmerged_fwd) { fastq_print_general(fp_fastqout_notmerged_fwd, ip->fwd_sequence, ip->fwd_length, ip->fwd_header, strlen(ip->fwd_header), ip->fwd_quality, 0, notmerged, -1.0); } if (opt_fastqout_notmerged_rev) { fastq_print_general(fp_fastqout_notmerged_rev, ip->rev_sequence, ip->rev_length, ip->rev_header, strlen(ip->rev_header), ip->rev_quality, 0, notmerged, -1.0); } if (opt_fastaout_notmerged_fwd) { fasta_print_general(fp_fastaout_notmerged_fwd, nullptr, ip->fwd_sequence, ip->fwd_length, ip->fwd_header, strlen(ip->fwd_header), 0, notmerged, -1.0, -1, -1, nullptr, 0.0); } if (opt_fastaout_notmerged_rev) { fasta_print_general(fp_fastaout_notmerged_rev, nullptr, ip->rev_sequence, ip->rev_length, ip->rev_header, strlen(ip->rev_header), 0, notmerged, -1.0, -1, -1, nullptr, 0.0); } } auto merge(merge_data_t * ip) -> void { /* length of 5' overhang of the forward sequence not merged with the reverse sequence */ int64_t const fwd_5prime_overhang = ip->fwd_trunc > ip->offset ? 
ip->fwd_trunc - ip->offset : 0; ip->ee_merged = 0.0; ip->ee_fwd = 0.0; ip->ee_rev = 0.0; ip->fwd_errors = 0; ip->rev_errors = 0; char sym = '\0'; char qual = '\0'; char fwd_sym = '\0'; char fwd_qual = '\0'; char rev_sym = '\0'; char rev_qual = '\0'; int64_t fwd_pos = 0; int64_t rev_pos = 0; int64_t merged_pos = 0; double ee = 0.0; merged_pos = 0; // 5' overhang in forward sequence fwd_pos = 0; while (fwd_pos < fwd_5prime_overhang) { sym = ip->fwd_sequence[fwd_pos]; qual = ip->fwd_quality[fwd_pos]; ip->merged_sequence[merged_pos] = sym; ip->merged_quality[merged_pos] = qual; ee = q2p[(unsigned)qual]; ip->ee_merged += ee; ip->ee_fwd += ee; fwd_pos++; merged_pos++; } // Merged region int64_t const rev_3prime_overhang = ip->offset > ip->fwd_trunc ? ip->offset - ip->fwd_trunc : 0; rev_pos = ip->rev_trunc - 1 - rev_3prime_overhang; while ((fwd_pos < ip->fwd_trunc) && (rev_pos >= 0)) { fwd_sym = ip->fwd_sequence[fwd_pos]; rev_sym = chrmap_complement[(int) (ip->rev_sequence[rev_pos])]; fwd_qual = ip->fwd_quality[fwd_pos]; rev_qual = ip->rev_quality[rev_pos]; merge_sym(& sym, & qual, fwd_qual < 2 ? 'N' : fwd_sym, rev_qual < 2 ? 
'N' : rev_sym, fwd_qual, rev_qual); if (sym != fwd_sym) { ip->fwd_errors++; } if (sym != rev_sym) { ip->rev_errors++; } ip->merged_sequence[merged_pos] = sym; ip->merged_quality[merged_pos] = qual; ip->ee_merged += q2p[(unsigned) qual]; ip->ee_fwd += q2p[(unsigned) fwd_qual]; ip->ee_rev += q2p[(unsigned) rev_qual]; fwd_pos++; rev_pos--; merged_pos++; } // 5' overhang in reverse sequence while (rev_pos >= 0) { sym = chrmap_complement[(int) (ip->rev_sequence[rev_pos])]; qual = ip->rev_quality[rev_pos]; ip->merged_sequence[merged_pos] = sym; ip->merged_quality[merged_pos] = qual; merged_pos++; ee = q2p[(unsigned) qual]; ip->ee_merged += ee; ip->ee_rev += ee; rev_pos--; } int64_t const mergelen = merged_pos; ip->merged_length = mergelen; ip->merged_sequence[mergelen] = 0; ip->merged_quality[mergelen] = 0; if (ip->ee_merged <= opt_fastq_maxee) { ip->reason = ok; ip->merged = true; } else { ip->reason = maxee; } } auto optimize(merge_data_t * ip, kh_handle_s * kmerhash) -> int64_t { /* ungapped alignment in each diagonal */ int64_t const i1 = 1; int64_t const i2 = ip->fwd_trunc + ip->rev_trunc - 1; double best_score = 0.0; int64_t best_i = 0; int64_t best_diffs = 0; int hits = 0; int kmers = 0; std::vector diags(ip->fwd_trunc + ip->rev_trunc, 0); kh_insert_kmers(kmerhash, k, ip->fwd_sequence, ip->fwd_trunc); kh_find_diagonals(kmerhash, k, ip->rev_sequence, ip->rev_trunc, diags.data()); for (int64_t i = i1; i <= i2; i++) { int const diag = ip->rev_trunc + ip->fwd_trunc - i; int const diagcount = diags[diag]; if (diagcount >= merge_mindiagcount) { kmers = 1; /* for each interesting diagonal */ int64_t const fwd_3prime_overhang = i > ip->rev_trunc ? i - ip->rev_trunc : 0; int64_t const rev_3prime_overhang = i > ip->fwd_trunc ? 
i - ip->fwd_trunc : 0; int64_t const overlap = i - fwd_3prime_overhang - rev_3prime_overhang; int64_t const fwd_pos_start = ip->fwd_trunc - fwd_3prime_overhang - 1; int64_t const rev_pos_start = ip->rev_trunc - rev_3prime_overhang - overlap; int64_t fwd_pos = fwd_pos_start; int64_t rev_pos = rev_pos_start; double score = 0.0; int64_t diffs = 0; double score_high = 0.0; double dropmax = 0.0; for (int64_t j=0; j < overlap; j++) { /* for each pair of bases in the overlap */ char const fwd_sym = ip->fwd_sequence[fwd_pos]; char const rev_sym = chrmap_complement[(int) (ip->rev_sequence[rev_pos])]; unsigned int const fwd_qual = ip->fwd_quality[fwd_pos]; unsigned int const rev_qual = ip->rev_quality[rev_pos]; fwd_pos--; rev_pos++; if (fwd_sym == rev_sym) { score += match_score[fwd_qual][rev_qual]; if (score > score_high) { score_high = score; } } else { score += mism_score[fwd_qual][rev_qual]; diffs++; if (score < score_high - dropmax) { dropmax = score_high - score; } } } if (dropmax >= merge_dropmax) { score = 0.0; } if (score >= merge_minscore) { hits++; } if (score > best_score) { best_score = score; best_i = i; best_diffs = diffs; } } } if (hits > 1) { ip->reason = repeat; return 0; } if ((! 
opt_fastq_allowmergestagger) && (best_i > ip->fwd_trunc)) { ip->reason = staggered; return 0; } if (best_diffs > opt_fastq_maxdiffs) { ip->reason = maxdiffs; return 0; } if ((100.0 * best_diffs / best_i) > opt_fastq_maxdiffpct) { ip->reason = maxdiffpct; return 0; } if (kmers == 0) { ip->reason = nokmers; return 0; } if (best_score < merge_minscore) { ip->reason = minscore; return 0; } if (best_i < opt_fastq_minovlen) { ip->reason = minovlen; return 0; } int const mergelen = ip->fwd_trunc + ip->rev_trunc - best_i; if (mergelen < opt_fastq_minmergelen) { ip->reason = minmergelen; return 0; } if (mergelen > opt_fastq_maxmergelen) { ip->reason = maxmergelen; return 0; } return best_i; } auto process(merge_data_t * ip, struct kh_handle_s * kmerhash) -> void { ip->merged = false; bool skip = false; /* check length */ if ((ip->fwd_length < opt_fastq_minlen) || (ip->rev_length < opt_fastq_minlen)) { ip->reason = minlen; skip = true; } if ((ip->fwd_length > opt_fastq_maxlen) || (ip->rev_length > opt_fastq_maxlen)) { ip->reason = maxlen; skip = true; } /* truncate sequences by quality */ int64_t fwd_trunc = ip->fwd_length; if (! skip) { for (int64_t i = 0; i < ip->fwd_length; i++) { if (get_qual(ip->fwd_quality[i]) <= opt_fastq_truncqual) { fwd_trunc = i; break; } } if (fwd_trunc < opt_fastq_minlen) { ip->reason = minlen; skip = true; } } ip->fwd_trunc = fwd_trunc; int64_t rev_trunc = ip->rev_length; if (! skip) { for (int64_t i = 0; i < ip->rev_length; i++) { if (get_qual(ip->rev_quality[i]) <= opt_fastq_truncqual) { rev_trunc = i; break; } } if (rev_trunc < opt_fastq_minlen) { ip->reason = minlen; skip = true; } } ip->rev_trunc = rev_trunc; /* count n's */ /* replace quality of N's by zero */ if (! skip) { int64_t fwd_ncount = 0; for (int64_t i = 0; i < fwd_trunc; i++) { if (ip->fwd_sequence[i] == 'N') { ip->fwd_quality[i] = opt_fastq_ascii; fwd_ncount++; } } if (fwd_ncount > opt_fastq_maxns) { ip->reason = maxns; skip = true; } } if (! 
skip) { int64_t rev_ncount = 0; for (int64_t i = 0; i < rev_trunc; i++) { if (ip->rev_sequence[i] == 'N') { ip->rev_quality[i] = opt_fastq_ascii; rev_ncount++; } } if (rev_ncount > opt_fastq_maxns) { ip->reason = maxns; skip = true; } } ip->offset = 0; if (! skip) { ip->offset = optimize(ip, kmerhash); } if (ip->offset > 0) { merge(ip); } ip->state = processed; } auto read_pair(merge_data_t * ip) -> bool { if (fastq_next(fastq_fwd, false, chrmap_upcase)) { if (! fastq_next(fastq_rev, false, chrmap_upcase)) { fatal("More forward reads than reverse reads"); } /* allocate more memory if necessary */ int64_t const fwd_header_len = fastq_get_header_length(fastq_fwd); int64_t const rev_header_len = fastq_get_header_length(fastq_rev); int64_t const header_needed = MAX(fwd_header_len, rev_header_len) + 1; if (header_needed > ip->header_alloc) { ip->header_alloc = header_needed; ip->fwd_header = (char *) xrealloc(ip->fwd_header, header_needed); ip->rev_header = (char *) xrealloc(ip->rev_header, header_needed); } ip->fwd_length = fastq_get_sequence_length(fastq_fwd); ip->rev_length = fastq_get_sequence_length(fastq_rev); int64_t const seq_needed = MAX(ip->fwd_length, ip->rev_length) + 1; sum_read_length += ip->fwd_length + ip->rev_length; if (seq_needed > ip->seq_alloc) { ip->seq_alloc = seq_needed; ip->fwd_sequence = (char *) xrealloc(ip->fwd_sequence, seq_needed); ip->rev_sequence = (char *) xrealloc(ip->rev_sequence, seq_needed); ip->fwd_quality = (char *) xrealloc(ip->fwd_quality, seq_needed); ip->rev_quality = (char *) xrealloc(ip->rev_quality, seq_needed); } int64_t const merged_seq_needed = ip->fwd_length + ip->rev_length + 1; if (merged_seq_needed > ip->merged_seq_alloc) { ip->merged_seq_alloc = merged_seq_needed; ip->merged_sequence = (char *) xrealloc(ip->merged_sequence, merged_seq_needed); ip->merged_quality = (char *) xrealloc(ip->merged_quality, merged_seq_needed); } /* make local copies of the seq, header and qual */ strcpy(ip->fwd_header, 
fastq_get_header(fastq_fwd)); strcpy(ip->rev_header, fastq_get_header(fastq_rev)); strcpy(ip->fwd_sequence, fastq_get_sequence(fastq_fwd)); strcpy(ip->rev_sequence, fastq_get_sequence(fastq_rev)); strcpy(ip->fwd_quality, fastq_get_quality(fastq_fwd)); strcpy(ip->rev_quality, fastq_get_quality(fastq_rev)); ip->merged_sequence[0] = 0; ip->merged_quality[0] = 0; ip->merged = false; ip->pair_no = total++; return true; } else { return false; } } auto keep_or_discard(merge_data_t * ip) -> void { if (ip->merged) { keep(ip); } else { discard(ip); } } auto init_merge_data(merge_data_t * ip) -> void { ip->fwd_header = nullptr; ip->rev_header = nullptr; ip->fwd_sequence = nullptr; ip->rev_sequence = nullptr; ip->fwd_quality = nullptr; ip->rev_quality = nullptr; ip->header_alloc = 0; ip->seq_alloc = 0; ip->fwd_length = 0; ip->rev_length = 0; ip->fwd_trunc = 0; ip->rev_trunc = 0; ip->pair_no = 0; ip->reason = undefined; ip->merged_seq_alloc = 0; ip->merged_sequence = nullptr; ip->merged_quality = nullptr; ip->merged_length = 0; } auto free_merge_data(merge_data_t * ip) -> void { if (ip->fwd_header) { xfree(ip->fwd_header); } if (ip->rev_header) { xfree(ip->rev_header); } if (ip->fwd_sequence) { xfree(ip->fwd_sequence); } if (ip->rev_sequence) { xfree(ip->rev_sequence); } if (ip->fwd_quality) { xfree(ip->fwd_quality); } if (ip->rev_quality) { xfree(ip->rev_quality); } if (ip->merged_sequence) { xfree(ip->merged_sequence); } if (ip->merged_quality) { xfree(ip->merged_quality); } } inline auto chunk_perform_read() -> void { while((!finished_reading) && (chunks[chunk_read_next].state == empty)) { xpthread_mutex_unlock(&mutex_chunks); progress_update(fastq_get_position(fastq_fwd)); int r = 0; while ((r < chunk_size) && read_pair(chunks[chunk_read_next].merge_data + r)) { r++; } chunks[chunk_read_next].size = r; xpthread_mutex_lock(&mutex_chunks); pairs_read += r; if (r > 0) { chunks[chunk_read_next].state = filled; chunk_read_next = (chunk_read_next + 1) % chunk_count; } if (r < 
chunk_size) { finished_reading = true; if (pairs_written >= pairs_read) { finished_all = true; } } xpthread_cond_broadcast(&cond_chunks); } } inline auto chunk_perform_write() -> void { while (chunks[chunk_write_next].state == processed) { xpthread_mutex_unlock(&mutex_chunks); for (int i = 0; i < chunks[chunk_write_next].size; i++) { keep_or_discard(chunks[chunk_write_next].merge_data + i); } xpthread_mutex_lock(&mutex_chunks); pairs_written += chunks[chunk_write_next].size; chunks[chunk_write_next].state = empty; if (finished_reading && (pairs_written >= pairs_read)) { finished_all = true; } chunk_write_next = (chunk_write_next + 1) % chunk_count; xpthread_cond_broadcast(&cond_chunks); } } inline auto chunk_perform_process(struct kh_handle_s * kmerhash) -> void { int const chunk_current = chunk_process_next; if (chunks[chunk_current].state == filled) { chunks[chunk_current].state = inprogress; chunk_process_next = (chunk_current + 1) % chunk_count; xpthread_cond_broadcast(&cond_chunks); xpthread_mutex_unlock(&mutex_chunks); for (int i = 0; i < chunks[chunk_current].size; i++) { process(chunks[chunk_current].merge_data + i, kmerhash); } xpthread_mutex_lock(&mutex_chunks); chunks[chunk_current].state = processed; xpthread_cond_broadcast(&cond_chunks); } } auto pair_worker(void * vp) -> void * { /* new */ auto t = (int64_t) vp; struct kh_handle_s * kmerhash = kh_init(); xpthread_mutex_lock(&mutex_chunks); while (! finished_all) { if (opt_threads == 1) { /* One thread does it all */ chunk_perform_read(); chunk_perform_process(kmerhash); chunk_perform_write(); } else if (opt_threads == 2) { if (t == 0) { /* first thread reads and processes */ while (! ( finished_all || (chunks[chunk_process_next].state == filled) || ((! finished_reading) && chunks[chunk_read_next].state == empty))) { xpthread_cond_wait(&cond_chunks, &mutex_chunks); } chunk_perform_read(); chunk_perform_process(kmerhash); } else /* t == 1 */ { /* second thread writes and processes */ while (! 
( finished_all || (chunks[chunk_process_next].state == filled) || (chunks[chunk_write_next].state == processed) ) ) { xpthread_cond_wait(&cond_chunks, &mutex_chunks); } chunk_perform_write(); chunk_perform_process(kmerhash); } } else { if (t == 0) { /* first thread reads and processes */ while (! ( finished_all || ((! finished_reading) && (chunks[chunk_read_next].state == empty)) || (chunks[chunk_process_next].state == filled) ) ) { xpthread_cond_wait(&cond_chunks, &mutex_chunks); } chunk_perform_read(); chunk_perform_process(kmerhash); } else if (t == opt_threads - 1) { /* last thread writes and processes */ while (! ( finished_all || (chunks[chunk_write_next].state == processed) || (chunks[chunk_process_next].state == filled) ) ) { xpthread_cond_wait(&cond_chunks, &mutex_chunks); } chunk_perform_write(); chunk_perform_process(kmerhash); } else { /* the other threads are only processing */ while (! ( finished_all || (chunks[chunk_process_next].state == filled) ) ) { xpthread_cond_wait(&cond_chunks, &mutex_chunks); } chunk_perform_process(kmerhash); } } } xpthread_mutex_unlock(&mutex_chunks); kh_exit(kmerhash); return nullptr; } auto pair_all() -> void { /* prepare chunks */ chunk_count = chunk_factor * opt_threads; chunk_read_next = 0; chunk_process_next = 0; chunk_write_next = 0; chunks = (chunk_t *) xmalloc(chunk_count * sizeof(chunk_t)); for (int i = 0; i < chunk_count; i++) { chunks[i].state = empty; chunks[i].size = 0; chunks[i].merge_data = (merge_data_t *) xmalloc(chunk_size * sizeof(merge_data_t)); for (int64_t j = 0; j < chunk_size; j++) { init_merge_data(chunks[i].merge_data + j); } } xpthread_mutex_init(&mutex_chunks, nullptr); xpthread_cond_init(&cond_chunks, nullptr); /* prepare threads */ xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t)); for (int t = 0; t < opt_threads; t++) { xpthread_create(pthread+t, &attr, pair_worker, (void *) (int64_t) t); } 
/* wait for threads to terminate */ for (int t = 0; t < opt_threads; t++) { xpthread_join(pthread[t], nullptr); } /* free threads */ xfree(pthread); xpthread_attr_destroy(&attr); /* free chunks */ xpthread_cond_destroy(&cond_chunks); xpthread_mutex_destroy(&mutex_chunks); for (int i = 0; i < chunk_count; i++) { for (int j=0; j < chunk_size; j++) { free_merge_data(chunks[i].merge_data + j); } xfree(chunks[i].merge_data); chunks[i].merge_data = nullptr; } xfree(chunks); chunks = nullptr; } auto print_stats(std::FILE * fp) -> void { fprintf(fp, "%10" PRIu64 " Pairs\n", total); fprintf(fp, "%10" PRIu64 " Merged", merged); if (total > 0) { fprintf(fp, " (%.1lf%%)", 100.0 * merged / total); } fprintf(fp, "\n"); fprintf(fp, "%10" PRIu64 " Not merged", notmerged); if (total > 0) { fprintf(fp, " (%.1lf%%)", 100.0 * notmerged / total); } fprintf(fp, "\n"); if (notmerged > 0) { fprintf(fp, "\nPairs that failed merging due to various reasons:\n"); } if (failed_undefined) { fprintf(fp, "%10" PRIu64 " undefined reason\n", failed_undefined); } if (failed_minlen) { fprintf(fp, "%10" PRIu64 " reads too short (after truncation)\n", failed_minlen); } if (failed_maxlen) { fprintf(fp, "%10" PRIu64 " reads too long (after truncation)\n", failed_maxlen); } if (failed_maxns) { fprintf(fp, "%10" PRIu64 " too many N's\n", failed_maxns); } if (failed_nokmers) { fprintf(fp, "%10" PRIu64 " too few kmers found on same diagonal\n", failed_nokmers); } if (failed_repeat) { fprintf(fp, "%10" PRIu64 " multiple potential alignments\n", failed_repeat); } if (failed_maxdiffs) { fprintf(fp, "%10" PRIu64 " too many differences\n", failed_maxdiffs); } if (failed_maxdiffpct) { fprintf(fp, "%10" PRIu64 " too high percentage of differences\n", failed_maxdiffpct); } if (failed_minscore) { fprintf(fp, "%10" PRIu64 " alignment score too low, or score drop too high\n", failed_minscore); } if (failed_minovlen) { fprintf(fp, "%10" PRIu64 " overlap too short\n", failed_minovlen); } if (failed_maxee) { fprintf(fp, 
"%10" PRIu64 " expected error too high\n", failed_maxee); } if (failed_minmergelen) { fprintf(fp, "%10" PRIu64 " merged fragment too short\n", failed_minmergelen); } if (failed_maxmergelen) { fprintf(fp, "%10" PRIu64 " merged fragment too long\n", failed_maxmergelen); } if (failed_staggered) { fprintf(fp, "%10" PRIu64 " staggered read pairs\n", failed_staggered); } if (failed_indel) { fprintf(fp, "%10" PRIu64 " indel errors\n", failed_indel); } fprintf(fp, "\n"); if (total > 0) { fprintf(fp, "Statistics of all reads:\n"); double const mean_read_length = sum_read_length / (2.0 * pairs_read); fprintf(fp, "%10.2f Mean read length\n", mean_read_length); } if (merged > 0) { fprintf(fp, "\n"); fprintf(fp, "Statistics of merged reads:\n"); double const mean = sum_fragment_length / merged; fprintf(fp, "%10.2f Mean fragment length\n", mean); double const stdev = sqrt((sum_squared_fragment_length - 2.0 * mean * sum_fragment_length + mean * mean * merged) / (merged + 0.0)); fprintf(fp, "%10.2f Standard deviation of fragment length\n", stdev); fprintf(fp, "%10.2f Mean expected error in forward sequences\n", sum_ee_fwd / merged); fprintf(fp, "%10.2f Mean expected error in reverse sequences\n", sum_ee_rev / merged); fprintf(fp, "%10.2f Mean expected error in merged sequences\n", sum_ee_merged / merged); fprintf(fp, "%10.2f Mean observed errors in merged region of forward sequences\n", 1.0 * sum_errors_fwd / merged); fprintf(fp, "%10.2f Mean observed errors in merged region of reverse sequences\n", 1.0 * sum_errors_rev / merged); fprintf(fp, "%10.2f Mean observed errors in merged region\n", 1.0 * (sum_errors_fwd + sum_errors_rev) / merged); } } auto fastq_mergepairs() -> void { /* fatal error if specified overlap is too small */ if (opt_fastq_minovlen < 5) { fatal("Overlap specified with --fastq_minovlen must be at least 5"); } /* relax default parameters in case of short overlaps */ if (opt_fastq_minovlen < 9) { merge_mindiagcount = opt_fastq_minovlen - 4; merge_minscore = 1.6 * 
opt_fastq_minovlen; } /* open input files */ fastq_fwd = fastq_open(opt_fastq_mergepairs); fastq_rev = fastq_open(opt_reverse); /* open output files */ if (opt_fastqout) { fp_fastqout = fileopenw(opt_fastqout); } if (opt_fastaout) { fp_fastaout = fileopenw(opt_fastaout); } if (opt_fastqout_notmerged_fwd) { fp_fastqout_notmerged_fwd = fileopenw(opt_fastqout_notmerged_fwd); } if (opt_fastqout_notmerged_rev) { fp_fastqout_notmerged_rev = fileopenw(opt_fastqout_notmerged_rev); } if (opt_fastaout_notmerged_fwd) { fp_fastaout_notmerged_fwd = fileopenw(opt_fastaout_notmerged_fwd); } if (opt_fastaout_notmerged_rev) { fp_fastaout_notmerged_rev = fileopenw(opt_fastaout_notmerged_rev); } if (opt_eetabbedout) { fp_eetabbedout = fileopenw(opt_eetabbedout); } /* precompute merged quality values */ precompute_qual(); /* main */ uint64_t const filesize = fastq_get_size(fastq_fwd); progress_init("Merging reads", filesize); if (! fastq_fwd->is_empty) { pair_all(); } progress_done(); if (fastq_next(fastq_rev, true, chrmap_upcase)) { fatal("More reverse reads than forward reads"); } if (fp_log) { print_stats(fp_log); } else { print_stats(stderr); } /* clean up */ if (opt_eetabbedout) { fclose(fp_eetabbedout); } if (opt_fastaout_notmerged_rev) { fclose(fp_fastaout_notmerged_rev); } if (opt_fastaout_notmerged_fwd) { fclose(fp_fastaout_notmerged_fwd); } if (opt_fastqout_notmerged_rev) { fclose(fp_fastqout_notmerged_rev); } if (opt_fastqout_notmerged_fwd) { fclose(fp_fastqout_notmerged_fwd); } if (opt_fastaout) { fclose(fp_fastaout); } if (opt_fastqout) { fclose(fp_fastqout); } fastq_close(fastq_rev); fastq_rev = nullptr; fastq_close(fastq_fwd); fastq_fwd = nullptr; } vsearch-2.30.0/src/mergepairs.h000066400000000000000000000047161476012147200163410ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto fastq_mergepairs() -> void; vsearch-2.30.0/src/minheap.cc000066400000000000000000000166331476012147200157630ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "minheap.h" #include // printf #include // qsort() /* implement a priority queue with a min heap binary array structure */ /* elements with the lowest count should be at the top (root) */ // refactoring: std::priority_queue (#include ) /* To keep track of the n best potential target sequences, we store them in a min heap. The root element corresponds to the least good target, while the best elements are found at the leaf nodes. This makes it simple to decide whether a new target should be included or not, because it just needs to be compared to the root note. The list will be fully sorted before use when we want to find the best element and then the second best and so on. 
*/ auto elem_smaller(elem_t * lhs, elem_t * rhs) -> int { /* return 1 if lhs is smaller than rhs, 0 if equal or greater */ if (lhs->count < rhs->count) { return 1; } else if (lhs->count > rhs->count) { return 0; } else if (lhs->length > rhs->length) { return 1; } else if (lhs->length < rhs->length) { return 0; } else if (lhs->seqno > rhs->seqno) { return 1; } else { return 0; } } auto minheap_compare(const void * lhs_a, const void * rhs_b) -> int { auto * lhs = (elem_t *) lhs_a; auto * rhs = (elem_t *) rhs_b; /* return -1 if a is smaller than b, +1 if greater, otherwize 0 */ /* first: lower count, larger length, lower seqno */ if (lhs->count < rhs->count) { return -1; } else if (lhs->count > rhs->count) { return +1; } else if (lhs->length > rhs->length) { return -1; } else if (lhs->length < rhs->length) { return +1; } else if (lhs->seqno > rhs->seqno) { return -1; } else if (lhs->seqno < rhs->seqno) { return +1; } else { return 0; } } auto minheap_init(int size) -> minheap_t * { auto * a_minheap = static_cast(xmalloc(sizeof(minheap_t))); a_minheap->alloc = size; a_minheap->array = static_cast(xmalloc(size * sizeof(elem_t))); a_minheap->count = 0; return a_minheap; } auto minheap_exit(minheap_t * a_minheap) -> void { xfree(a_minheap->array); xfree(a_minheap); } auto minheap_replaceroot(minheap_t * a_minheap, elem_t tmp) -> void { /* remove the element at the root, then swap children up to the root and insert tmp at suitable place */ /* start with root */ int parent = 0; int nth_child = (2 * parent) + 1; /* while at least one child */ while (nth_child < a_minheap->count) { /* if two children: swap with the one with smallest value */ if ((nth_child + 1 < a_minheap->count) && (elem_smaller(a_minheap->array + nth_child + 1, a_minheap->array + nth_child) != 0)) { ++nth_child; } /* swap parent and child if child has lower value */ if (elem_smaller(a_minheap->array + nth_child, &tmp) != 0) { a_minheap->array[parent] = a_minheap->array[nth_child]; } else { break; } /* step 
down */ parent = nth_child; nth_child = 2 * parent + 1; } a_minheap->array[parent] = tmp; } auto minheap_add(minheap_t * a_minheap, elem_t * n) -> void { if (a_minheap->count < a_minheap->alloc) { /* space for another item at end; swap upwards */ int index = a_minheap->count++; int pos = (index - 1) / 2; while ((index > 0) && (elem_smaller(n, a_minheap->array + pos) != 0)) { a_minheap->array[index] = a_minheap->array[pos]; index = pos; pos = (index - 1) / 2; } a_minheap->array[index] = *n; } else if (elem_smaller(a_minheap->array, n) != 0) { /* replace the root if new element is larger than root */ minheap_replaceroot(a_minheap, *n); } } auto minheap_pop(minheap_t * a_minheap) -> elem_t { /* return top element and restore order */ static const elem_t zero = {0, 0, 0}; if (a_minheap->count != 0) { elem_t top = a_minheap->array[0]; --a_minheap->count; if (a_minheap->count != 0) { const elem_t tmp = a_minheap->array[a_minheap->count]; minheap_replaceroot(a_minheap, tmp); } return top; } return zero; } auto minheap_sort(minheap_t * a_minheap) -> void { std::qsort(a_minheap->array, a_minheap->count, sizeof(elem_t), minheap_compare); } auto minheap_poplast(minheap_t * a_minheap) -> elem_t { /* return top element and restore order */ static const elem_t zero = {0, 0, 0}; if (a_minheap->count != 0) { return a_minheap->array[--a_minheap->count]; } return zero; } vsearch-2.30.0/src/minheap.h000066400000000000000000000061761476012147200156260ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/


/* One heap element: a score (`count`) for the sequence `seqno`, plus
   that sequence's `length`, which is used as a tie-breaker. */
struct topscore
{
  unsigned int count;
  unsigned int seqno;
  unsigned int length;
};

using elem_t = struct topscore;


/* Array-backed heap: `array` has room for `alloc` elements, of which
   the first `count` are in use. */
struct minheap_s
{
  int alloc;
  int count;
  elem_t * array;
};

using minheap_t = struct minheap_s;


/* True when the heap holds no elements. */
inline auto minheap_isempty(minheap_t * a_minheap) -> bool
{
  return (a_minheap->count == 0);
}


/* Discard all elements; the storage is kept. */
inline auto minheap_empty(minheap_t * a_minheap) -> void
{
  a_minheap->count = 0;
}


auto minheap_poplast(minheap_t * a_minheap) -> elem_t;
auto minheap_sort(minheap_t * a_minheap) -> void;
auto minheap_init(int size) -> minheap_t *;
auto minheap_exit(minheap_t * a_minheap) -> void;
auto minheap_add(minheap_t * a_minheap, elem_t * n) -> void;
auto minheap_pop(minheap_t * a_minheap) -> elem_t;
vsearch-2.30.0/src/msa.cc000066400000000000000000000550051476012147200151160ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "maps.h" #include "msa.h" #include #include // std::max() #include #include // std::toupper #include // macro PRId64 #include // INT_MAX #include // uint64_t #include // std::FILE, std::sscanf, std::fprintf #include // str::strtoll #include // std::memset, std::strlen #include // std::next #include // std::accumulate #include /* Compute multiple sequence alignment (msa), profile, and consensus sequence of clustered sequences */ using prof_type = std::uint64_t; constexpr auto profsize = 6; auto update_profile(char const nucleotide, int const position_in_alignment, prof_type const abundance, std::vector& profile) -> void { static constexpr auto A_counter = 0; static constexpr auto C_counter = 1; static constexpr auto G_counter = 2; static constexpr auto U_counter = 3; // note: T converted to U? static constexpr auto N_counter = 4; static constexpr auto gap_counter = 5; auto const offset = profsize * position_in_alignment; // refactoring: eliminate unused cases? 
No, T and U are merged, same as IUPAC and N switch(std::toupper(nucleotide)) { case 'A': profile[offset + A_counter] += abundance; break; case 'C': profile[offset + C_counter] += abundance; break; case 'G': profile[offset + G_counter] += abundance; break; case 'T': case 'U': profile[offset + U_counter] += abundance; break; case 'R': case 'Y': case 'S': case 'W': case 'K': case 'M': case 'B': case 'D': case 'H': case 'V': case 'N': profile[offset + N_counter] += abundance; break; case '-': profile[offset + gap_counter] += abundance; break; default: break; } } auto update_msa(char const nucleotide, int &position_in_alignment, std::vector& alignment) -> void { alignment[position_in_alignment] = nucleotide; ++position_in_alignment; } auto find_runlength_of_leftmost_operation(char * first_character, char ** first_non_digit) -> long long { // std::strtoll: // - start from the 'first_character' pointed to, // - consume as many characters as possible to form a valid integer, // - advance pointer to the first non-digit character, // - return the valid integer // - if there is no valid integer: pointer is not advanced and function returns zero, static constexpr auto decimal_base = 10; auto runlength = std::strtoll(first_character, first_non_digit, decimal_base); assert(runlength <= INT_MAX); // in the context of cigar strings, runlength is at least 1 if (runlength == 0) { runlength = 1; } return runlength; // is in [1, LLONG_MAX] } auto find_max_insertions_per_position(int const target_count, std::vector const & target_list_v, int const centroid_len) -> std::vector { std::vector max_insertions(centroid_len + 1); for (auto i = 1; i < target_count; ++i) { char * cigar_start = target_list_v[i].cigar; auto const cigar_length = static_cast(std::strlen(cigar_start)); char * cigar_end = std::next(cigar_start, cigar_length); auto * position_in_cigar = cigar_start; auto position_in_centroid = 0LL; while (position_in_cigar < cigar_end) { auto** next_operation = &position_in_cigar; // 
operations: match (M), insertion (I), or deletion (D) auto const runlength = find_runlength_of_leftmost_operation(position_in_cigar, next_operation); auto const operation = **next_operation; position_in_cigar = std::next(position_in_cigar); switch (operation) { case 'M': case 'I': position_in_centroid += runlength; break; case 'D': max_insertions[position_in_centroid] = std::max(static_cast(runlength), max_insertions[position_in_centroid]); break; default: break; } } } return max_insertions; } auto find_total_alignment_length(std::vector const & max_insertions) -> int { auto const centroid_len = static_cast(max_insertions.size() - 1); return std::accumulate(max_insertions.begin(), max_insertions.end(), centroid_len); } auto find_longest_target_on_reverse_strand(int const target_count, std::vector const & target_list_v) -> int64_t { int64_t longest_reversed = 0; for (auto i = 0; i < target_count; ++i) { auto const & target = target_list_v[i]; if (target.strand == 0) { continue; } auto const len = static_cast(db_getsequencelen(target.seqno)); longest_reversed = std::max(len, longest_reversed); } return longest_reversed; } auto allocate_buffer_for_reverse_strand_target(int const target_count, std::vector const & target_list_v, std::vector & rc_buffer_v) -> char * { /* Find longest target sequence on reverse strand and allocate buffer */ auto const longest_reversed = find_longest_target_on_reverse_strand(target_count, target_list_v); if (longest_reversed > 0) { rc_buffer_v.resize(longest_reversed + 1); return rc_buffer_v.data(); } return nullptr; } auto blank_line_before_each_msa(std::FILE * fp_msaout) -> void { if (fp_msaout == nullptr) { return ; } static_cast(std::fprintf(fp_msaout, "\n")); } auto print_header_and_sequence(std::FILE * fp_msaout, char const * header_prefix, int const target_seqno, std::vector & aln_v) -> void { // header_prefix == "*" or "", resulting in ">*header" or ">header" if (fp_msaout == nullptr) { return ; } fasta_print_general(fp_msaout, 
header_prefix, aln_v.data(), static_cast(aln_v.size() - 1), db_getheader(target_seqno), static_cast(db_getheaderlen(target_seqno)), db_getabundance(target_seqno), 0, -1.0, -1, -1, nullptr, 0.0); } auto reverse_complement_target_if_need_be(int const strand, int const target_seqno, char * rc_buffer, char * target_seq) -> char * { if (strand == 0) { return target_seq; } reverse_complement(rc_buffer, target_seq, static_cast(db_getsequencelen(target_seqno))); return rc_buffer; } auto process_and_print_centroid(char *rc_buffer, std::vector const &target_list_v, std::vector const &max_insertions, std::vector &profile, std::vector &aln_v, std::FILE * fp_msaout) -> void { auto const centroid_len = static_cast(max_insertions.size() - 1); auto const & target = target_list_v.front(); auto const target_seqno = target.seqno; auto * const target_seq = reverse_complement_target_if_need_be(target.strand, target_seqno, rc_buffer, db_getsequence(target_seqno)); prof_type const target_abundance = opt_sizein ? 
db_getabundance(target_seqno) : 1; auto position_in_alignment = 0; for (auto i = 0; i < centroid_len; ++i) { for (auto j = 0; j < max_insertions[i]; ++j) { update_profile('-', position_in_alignment, target_abundance, profile); update_msa('-', position_in_alignment, aln_v); } update_profile(*std::next(target_seq, i), position_in_alignment, target_abundance, profile); update_msa(*std::next(target_seq, i), position_in_alignment, aln_v); } // insert for (auto j = 0; j < max_insertions[centroid_len]; ++j) { update_profile('-', position_in_alignment, target_abundance, profile); update_msa('-', position_in_alignment, aln_v); } /* end of sequence string */ aln_v[position_in_alignment] = '\0'; /* print header & sequence */ print_header_and_sequence(fp_msaout, "*", target_seqno, aln_v); } auto insert_gaps_in_alignment_and_profile(bool const is_inserted, int const max_insertions_at_position, int & position_in_alignment, prof_type const target_abundance, std::vector & profile, std::vector & aln_v) -> void { if (is_inserted) { return ; } for (auto i = 0; i < max_insertions_at_position; ++i) { update_profile('-', position_in_alignment, target_abundance, profile); update_msa('-', position_in_alignment, aln_v); } } auto compute_and_print_msa(int const target_count, std::vector const & target_list_v, std::vector const &max_insertions, std::vector &profile, std::vector &aln_v, std::FILE * fp_msaout) -> void { blank_line_before_each_msa(fp_msaout); /* Find longest target sequence on reverse strand and allocate buffer */ std::vector rc_buffer_v; char * rc_buffer = allocate_buffer_for_reverse_strand_target(target_count, target_list_v, rc_buffer_v); // ------------------------------------------------------- deal with centroid process_and_print_centroid(rc_buffer, target_list_v, max_insertions, profile, aln_v, fp_msaout); // --------------------------------- deal with other sequences in the cluster for (auto i = 1; i < target_count; ++i) { auto const & target = target_list_v[i]; auto 
const target_seqno = target.seqno; auto * const target_seq = reverse_complement_target_if_need_be(target.strand, target_seqno, rc_buffer, db_getsequence(target_seqno)); prof_type const target_abundance = opt_sizein ? db_getabundance(target_seqno) : 1; int position_in_alignment = 0; auto is_inserted = false; auto qpos = 0; auto tpos = 0; char * cigar_start = target.cigar; auto const cigar_length = static_cast(std::strlen(cigar_start)); char * cigar_end = std::next(cigar_start, cigar_length); auto * position_in_cigar = cigar_start; while (position_in_cigar < cigar_end) { // Consume digits (if any), return the position of the // first char (M, D, or I), store it, move cursor to the next byte. // Operations: match (M), insertion (I), or deletion (D) auto** next_operation = &position_in_cigar; auto const runlength = find_runlength_of_leftmost_operation(position_in_cigar, next_operation); auto const operation = **next_operation; position_in_cigar = std::next(position_in_cigar); switch (operation) { case 'D': for (auto j = 0; j < runlength; ++j) { update_profile(*std::next(target_seq, tpos), position_in_alignment, target_abundance, profile); update_msa(*std::next(target_seq, tpos), position_in_alignment, aln_v); ++tpos; } for (auto j = runlength; j < max_insertions[qpos]; ++j) { update_profile('-', position_in_alignment, target_abundance, profile); update_msa('-', position_in_alignment, aln_v); } is_inserted = true; break; case 'M': for (auto j = 0; j < runlength; ++j) { insert_gaps_in_alignment_and_profile(is_inserted, max_insertions[qpos], position_in_alignment, target_abundance, profile, aln_v); update_profile(*std::next(target_seq, tpos), position_in_alignment, target_abundance, profile); update_msa(*std::next(target_seq, tpos), position_in_alignment, aln_v); ++tpos; ++qpos; is_inserted = false; } break; case 'I': for (auto j = 0; j < runlength; ++j) { insert_gaps_in_alignment_and_profile(is_inserted, max_insertions[qpos], position_in_alignment, target_abundance, 
profile, aln_v); update_profile('-', position_in_alignment, target_abundance, profile); update_msa('-', position_in_alignment, aln_v); ++qpos; is_inserted = false; } break; default: break; } } insert_gaps_in_alignment_and_profile(is_inserted, max_insertions[qpos], position_in_alignment, target_abundance, profile, aln_v); /* end of sequence string */ aln_v[position_in_alignment] = '\0'; /* print header & sequence */ print_header_and_sequence(fp_msaout, "", target_seqno, aln_v); } } auto compute_and_print_consensus(std::vector const &max_insertions, std::vector &aln_v, std::vector &cons_v, std::vector &profile, std::FILE * fp_msaout) -> void { static constexpr char index_of_N = 15; // 15th char in sym_nt_4bit[] (=> 'N') auto const alignment_length = static_cast(aln_v.size() - 1); int conslen = 0; /* Censor part of the consensus sequence outside the centroid sequence */ auto const left_censored = max_insertions.front(); auto const right_censored = max_insertions.back(); for (auto i = 0; i < left_censored; ++i) { aln_v[i] = '+'; } for (auto i = alignment_length - right_censored; i < alignment_length; ++i) { aln_v[i] = '+'; } for (auto i = left_censored; i < alignment_length - right_censored; ++i) { /* find most common symbol of A, C, G and T */ char best_sym = 0; prof_type best_count = 0; for (auto nucleotide = 0U; nucleotide < 4; ++nucleotide) { auto const count = profile[(profsize * i) + nucleotide]; if (count > best_count) { best_count = count; best_sym = static_cast(1U << nucleotide); // 1, 2, 4, or 8 } } /* if no A, C, G, or T, check if there are any N's */ auto const N_count = profile[(profsize * i) + 4]; if ((best_count == 0) and (N_count > 0)) { best_count = N_count; best_sym = index_of_N; // N } /* compare to the number of gap symbols */ auto const gap_count = profile[(profsize * i) + 5]; if (best_count >= gap_count) { auto const index = static_cast(best_sym); auto const sym = sym_nt_4bit[index]; // A, C, G, T, or N aln_v[i] = sym; cons_v[conslen] = sym; 
++conslen; } else { aln_v[i] = '-'; } } aln_v.back() = '\0'; cons_v[conslen] = '\0'; cons_v.resize(conslen + 1); if (fp_msaout != nullptr) { fasta_print(fp_msaout, "consensus", aln_v.data(), alignment_length); } } auto print_consensus_sequence(std::FILE *fp_consout, std::vector & cons_v, int64_t const totalabundance, int const target_count, int const cluster, int const centroid_seqno) -> void { if (fp_consout == nullptr) { return ; } fasta_print_general(fp_consout, "centroid=", cons_v.data(), static_cast(cons_v.size()), db_getheader(centroid_seqno), static_cast(db_getheaderlen(centroid_seqno)), totalabundance, cluster + 1, -1.0, target_count, opt_clusterout_id ? cluster : -1, nullptr, 0.0); } auto print_alignment_profile(std::FILE *fp_profile, std::vector &aln_v, std::vector const &profile, int64_t const totalabundance, int const target_count, int const cluster, int const centroid_seqno) -> void { if (fp_profile == nullptr) { return ; } // Note: gaps before Ns in profile output // 0 = A, 1 = C, 2 = G, 3 = T, 4 = N, 5 = '-' (gap) static const std::array symbol_indexes = {0, 1, 2, 3, 5, 4}; fasta_print_general(fp_profile, "centroid=", nullptr, 0, db_getheader(centroid_seqno), static_cast(db_getheaderlen(centroid_seqno)), totalabundance, cluster + 1, -1.0, target_count, opt_clusterout_id ? 
cluster : -1, nullptr, 0.0); aln_v.pop_back(); // remove last element ('\0') auto counter = 0; for (auto const nucleotide: aln_v) { static_cast(std::fprintf(fp_profile, "%d\t%c", counter, nucleotide)); // A, C, G and T, then gap '-', then N for (auto const symbol_index : symbol_indexes) { static_cast(std::fprintf(fp_profile, "\t%" PRId64, profile[(profsize * counter) + symbol_index])); } static_cast(std::fprintf(fp_profile, "\n")); ++counter; } static_cast(std::fprintf(fp_profile, "\n")); } auto msa(std::FILE * fp_msaout, std::FILE * fp_consout, std::FILE * fp_profile, int cluster, int const target_count, std::vector const & target_list_v, int64_t totalabundance) -> void { int const centroid_seqno = target_list_v[0].seqno; auto const centroid_length = static_cast(db_getsequencelen(centroid_seqno)); /* find max insertions in front of each position in the centroid sequence */ auto const max_insertions = find_max_insertions_per_position(target_count, target_list_v, centroid_length); auto const alignment_length = find_total_alignment_length(max_insertions); /* allocate memory for profile (for consensus) and aligned seq */ std::vector profile(static_cast(profsize) * alignment_length); // C++20 refactoring: std::vector>(alnlen); std::vector aln_v(alignment_length + 1); std::vector cons_v(alignment_length + 1); /* msaout: multiple sequence alignment ... */ compute_and_print_msa(target_count, target_list_v, max_insertions, profile, aln_v, fp_msaout); /* msaout: ... 
and consensus sequence at the end */ compute_and_print_consensus(max_insertions, aln_v, cons_v, profile, fp_msaout); /* consout: consensus sequence (dedicated input) */ print_consensus_sequence(fp_consout, cons_v, totalabundance, target_count, cluster, centroid_seqno); /* profile: multiple sequence alignment profile (dedicated input) */ print_alignment_profile(fp_profile, aln_v, profile, totalabundance, target_count, cluster, centroid_seqno); } vsearch-2.30.0/src/msa.h000066400000000000000000000054501476012147200147570ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // int64_t #include // std::FILE #include struct msa_target_s { int seqno; char * cigar; int strand; }; auto msa(std::FILE * fp_msaout, std::FILE * fp_consout, std::FILE * fp_profile, int cluster, int target_count, std::vector const & target_list_v, int64_t totalabundance) -> void; vsearch-2.30.0/src/orient.cc000066400000000000000000000350321476012147200156340ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "dbindex.h" #include "maps.h" #include "mask.h" #include "udb.h" #include "unique.h" #include #include // uint64_t #include // std::FILE, std::fprintf, std::size_t, std::fclose auto rc_kmer(unsigned int kmer) -> unsigned int { /* reverse complement a kmer where k = opt_wordlength */ auto fwd = kmer; auto rev = 0U; for (auto i = int64_t{0}; i < opt_wordlength; ++i) { auto const x = (fwd & 3U) ^ 3U; fwd = fwd >> 2U; rev = rev << 2U; rev |= x; } return rev; } auto orient() -> void { fastx_handle query_h = nullptr; // refactoring: use struct, like in subsample std::FILE * fp_fastaout = nullptr; std::FILE * fp_fastqout = nullptr; std::FILE * fp_tabbedout = nullptr; std::FILE * fp_notmatched = nullptr; int queries = 0; int qmatches = 0; int matches_fwd = 0; int matches_rev = 0; int notmatched = 0; /* check arguments */ if (not opt_db) { fatal("Database not specified with --db"); } if (not (opt_fastaout or opt_fastqout or opt_notmatched or opt_tabbedout)) { fatal("Output file not specified with --fastaout, --fastqout, --notmatched or --tabbedout"); } /* prepare reading of queries */ query_h = fastx_open(opt_orient); /* open output files */ if (opt_fastaout) { fp_fastaout = fopen_output(opt_fastaout); if (not fp_fastaout) { fatal("Unable to open fasta output file for writing"); } } if (opt_fastqout) { if (not fastx_is_fastq(query_h)) { fatal("Cannot write FASTQ output with FASTA input"); } fp_fastqout = fopen_output(opt_fastqout); if (not fp_fastqout) { fatal("Unable to open fastq output file for writing"); } } if (opt_notmatched) { fp_notmatched = fopen_output(opt_notmatched); if (not fp_notmatched) { fatal("Unable to open notmatched output file for writing"); } } if (opt_tabbedout) { fp_tabbedout = fopen_output(opt_tabbedout); if (not fp_tabbedout) { fatal("Unable to open tabbedout output file for writing"); } } /* check if it may be an UDB file */ auto const is_udb = udb_detect_isudb(opt_db); if (is_udb) { udb_read(opt_db, true, true); } 
else { db_read(opt_db, 0); } if (not is_udb) { if (opt_dbmask == MASK_DUST) { dust_all(); } else if ((opt_dbmask == MASK_SOFT) and (opt_hardmask)) { hardmask_all(); } } if (not is_udb) { dbindex_prepare(1, opt_dbmask); dbindex_addallsequences(opt_dbmask); } uhandle_s * uh_fwd = unique_init(); std::size_t alloc = 0; char * qseq_rev = nullptr; char * query_qual_rev = nullptr; progress_init("Orienting sequences", fasta_get_size(query_h)); while (fastx_next(query_h, not opt_notrunclabels, chrmap_no_change)) { char * query_head = fastx_get_header(query_h); int const query_head_len = fastx_get_header_length(query_h); char * qseq_fwd = fastx_get_sequence(query_h); int const qseqlen = fastx_get_sequence_length(query_h); int const qsize = fastx_get_abundance(query_h); char * query_qual_fwd = fastx_get_quality(query_h); /* find kmers in query sequence */ unsigned int kmer_count_fwd = 0; unsigned int * kmer_list_fwd = nullptr; unique_count(uh_fwd, opt_wordlength, qseqlen, qseq_fwd, & kmer_count_fwd, & kmer_list_fwd, opt_qmask); /* count kmers matching on each strand */ unsigned int count_fwd = 0; unsigned int count_rev = 0; constexpr auto hits_factor = 8U; for (unsigned int i = 0; i < kmer_count_fwd; i++) { unsigned int const kmer_fwd = kmer_list_fwd[i]; unsigned int const kmer_rev = rc_kmer(kmer_fwd); unsigned int const hits_fwd = dbindex_getmatchcount(kmer_fwd); unsigned int const hits_rev = dbindex_getmatchcount(kmer_rev); /* require 8 times as many matches on one stand than the other */ if (hits_fwd > hits_factor * hits_rev) { ++count_fwd; } else if (hits_rev > hits_factor * hits_fwd) { ++count_rev; } } /* get progress as amount of input file read */ uint64_t const progress = fasta_get_position(query_h); /* update stats */ ++queries; int strand = 2; unsigned int const min_count = 1; unsigned int const min_factor = 4; if ((count_fwd >= min_count) and (count_fwd >= min_factor * count_rev)) { /* fwd */ strand = 0; ++matches_fwd; ++qmatches; if (opt_fastaout) { 
fasta_print_general(fp_fastaout, nullptr, qseq_fwd, qseqlen, query_head, query_head_len, qsize, qmatches, -1.0, -1, -1, nullptr, 0.0); } if (opt_fastqout) { fastq_print_general(fp_fastqout, qseq_fwd, qseqlen, query_head, query_head_len, query_qual_fwd, qsize, qmatches, -1.0); } } else if ((count_rev >= min_count) and (count_rev >= min_factor * count_fwd)) { /* rev */ strand = 1; ++matches_rev; ++qmatches; /* alloc more mem if necessary to keep reverse sequence and qual */ assert(qseqlen > 0); static_assert(sizeof(std::size_t) >= sizeof(int), "size_t is too small"); const std::size_t requirements = qseqlen + 1; // refactoring: unsigned int qseqlen if (requirements > alloc) { alloc = requirements; qseq_rev = (char*) xrealloc(qseq_rev, alloc); if (fastx_is_fastq(query_h)) { query_qual_rev = (char*) xrealloc(query_qual_rev, alloc); } } /* get reverse complementary sequence */ reverse_complement(qseq_rev, qseq_fwd, qseqlen); if (opt_fastaout) { fasta_print_general(fp_fastaout, nullptr, qseq_rev, qseqlen, query_head, query_head_len, qsize, qmatches, -1.0, -1, -1, nullptr, 0.0); } if (opt_fastqout) { /* reverse quality scores */ if (fastx_is_fastq(query_h)) { for (int i = 0; i < qseqlen; i++) { query_qual_rev[i] = query_qual_fwd[qseqlen-1-i]; } query_qual_rev[qseqlen] = 0; } fastq_print_general(fp_fastqout, qseq_rev, qseqlen, query_head, query_head_len, query_qual_rev, qsize, qmatches, -1.0); } } else { /* undecided */ strand = 2; ++notmatched; if (opt_notmatched) { if (fastx_is_fastq(query_h)) { fastq_print_general(fp_notmatched, qseq_fwd, qseqlen, query_head, query_head_len, query_qual_fwd, qsize, notmatched, -1.0); } else { fasta_print_general(fp_notmatched, nullptr, qseq_fwd, qseqlen, query_head, query_head_len, qsize, notmatched, -1.0, -1, -1, nullptr, 0.0); } } } if (opt_tabbedout) { fprintf(fp_tabbedout, "%s\t%c\t%d\t%d\n", query_head, strand == 0 ? '+' : (strand == 1 ? 
'-' : '?'), count_fwd, count_rev); } /* show progress */ progress_update(progress); } progress_done(); /* clean up */ if (qseq_rev) { xfree(qseq_rev); } if (query_qual_rev) { xfree(query_qual_rev); } unique_exit(uh_fwd); dbindex_free(); db_free(); if (opt_tabbedout) { fclose(fp_tabbedout); } if (opt_notmatched) { fclose(fp_notmatched); } if (opt_fastqout) { fclose(fp_fastqout); } if (opt_fastaout) { fclose(fp_fastaout); } fasta_close(query_h); if (not opt_quiet) { fprintf(stderr, "Forward oriented sequences: %d", matches_fwd); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * matches_fwd / queries); } fprintf(stderr, "\n"); fprintf(stderr, "Reverse oriented sequences: %d", matches_rev); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * matches_rev / queries); } fprintf(stderr, "\n"); fprintf(stderr, "All oriented sequences: %d", qmatches); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(stderr, "\n"); fprintf(stderr, "Not oriented sequences: %d", notmatched); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * notmatched / queries); } fprintf(stderr, "\n"); fprintf(stderr, "Total number of sequences: %d\n", queries); } if (opt_log) { fprintf(fp_log, "Forward oriented sequences: %d", matches_fwd); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * matches_fwd / queries); } fprintf(fp_log, "\n"); fprintf(fp_log, "Reverse oriented sequences: %d", matches_rev); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * matches_rev / queries); } fprintf(fp_log, "\n"); fprintf(fp_log, "All oriented sequences: %d", qmatches); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(fp_log, "\n"); fprintf(fp_log, "Not oriented sequences: %d", notmatched); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * notmatched / queries); } fprintf(fp_log, "\n"); fprintf(fp_log, "Total number of sequences: %d\n", queries); } } 
vsearch-2.30.0/src/orient.h000066400000000000000000000047041476012147200155000ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto orient() -> void; vsearch-2.30.0/src/otutable.cc000066400000000000000000000347101476012147200161550ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include // macros PRIu64 and PRId64 #include // std::strftime, std::localtime, std::time, std::time_t, std::tm #include // int64_t, uint64_t #include // std::FILE, std::fprintf #include // std::strncpy, std::strcspn, std::strspn #include #include #include #include // std::pair #include #ifdef HAVE_REGEX_H #include #else #include #endif /* Identify sample and otu identifiers in headers, and count abundance of the samples in different OTUs. 
http://www.drive5.com/usearch/manual/upp_labels_sample.html http://www.drive5.com/usearch/manual/upp_labels_otus.html TODO: - add relabel @ */ #ifndef HAVE_REGEX_H const std::regex regex_sample("(^|;)(sample|barcodelabel)=([^;]*)($|;)", std::regex::extended); const std::regex regex_otu("(^|;)otu=([^;]*)($|;)", std::regex::extended); const std::regex regex_tax("(^|;)tax=([^;]*)($|;)", std::regex::extended); #endif using string_set_t = std::set; using string_pair_t = std::pair; using string_pair_map_t = std::map; using otu_tax_map_t = std::map; using string_no_map_t = std::map; struct otutable_s { #ifdef HAVE_REGEX_H regex_t regex_sample; regex_t regex_otu; regex_t regex_tax; #endif string_set_t otu_set; string_set_t sample_set; string_pair_map_t sample_otu_count; string_pair_map_t otu_sample_count; otu_tax_map_t otu_tax_map; }; static otutable_s * otutable; auto otutable_init() -> void { otutable = new otutable_s; #ifdef HAVE_REGEX_H /* compile regular expression matchers */ if (regcomp(&otutable->regex_sample, "(^|;)(sample|barcodelabel)=([^;]*)($|;)", REG_EXTENDED)) { fatal("Compilation of regular expression for sample annotation failed"); } if (regcomp(&otutable->regex_otu, "(^|;)otu=([^;]*)($|;)", REG_EXTENDED)) { fatal("Compilation of regular expression for otu annotation failed"); } if (regcomp(&otutable->regex_tax, "(^|;)tax=([^;]*)($|;)", REG_EXTENDED)) { fatal("Compilation of regular expression for taxonomy annotation failed"); } #endif } auto otutable_done() -> void { #ifdef HAVE_REGEX_H regfree(&otutable->regex_sample); regfree(&otutable->regex_otu); regfree(&otutable->regex_tax); #endif otutable->otu_set.clear(); otutable->sample_set.clear(); otutable->sample_otu_count.clear(); otutable->otu_sample_count.clear(); delete otutable; } auto otutable_add(char * query_header, char * target_header, int64_t abundance) -> void { /* read sample annotation in query */ int len_sample = 0; char * start_sample = query_header; char * sample_name = nullptr; if 
(query_header) { #ifdef HAVE_REGEX_H regmatch_t pmatch_sample[5]; if (! regexec(&otutable->regex_sample, query_header, 5, pmatch_sample, 0)) { /* match: use the matching sample name */ len_sample = pmatch_sample[3].rm_eo - pmatch_sample[3].rm_so; start_sample += pmatch_sample[3].rm_so; } #else std::cmatch cmatch_sample; if (regex_search(query_header, cmatch_sample, regex_sample)) { len_sample = cmatch_sample.length(3); start_sample += cmatch_sample.position(3); } #endif else { /* no match: use first name in header with A-Za-z0-9_ */ len_sample = strspn(query_header, "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "_" "0123456789"); } sample_name = (char *) xmalloc(len_sample + 1); std::strncpy(sample_name, start_sample, len_sample); sample_name[len_sample] = '\0'; } /* read OTU annotation in target */ int len_otu = 0; char * start_otu = target_header; char * otu_name = nullptr; if (target_header) { #ifdef HAVE_REGEX_H regmatch_t pmatch_otu[4]; if (! regexec(&otutable->regex_otu, target_header, 4, pmatch_otu, 0)) { /* match: use the matching otu name */ len_otu = pmatch_otu[2].rm_eo - pmatch_otu[2].rm_so; start_otu += pmatch_otu[2].rm_so; } #else std::cmatch cmatch_otu; if (regex_search(target_header, cmatch_otu, regex_otu)) { len_otu = cmatch_otu.length(2); start_otu += cmatch_otu.position(2); } #endif else { /* no match: use first name in header up to ; */ len_otu = strcspn(target_header, ";"); } otu_name = (char *) xmalloc(len_otu + 1); std::strncpy(otu_name, start_otu, len_otu); otu_name[len_otu] = 0; /* read tax annotation in target */ #ifdef HAVE_REGEX_H char * start_tax = target_header; regmatch_t pmatch_tax[4]; if (! 
regexec(&otutable->regex_tax, target_header, 4, pmatch_tax, 0)) { /* match: use the matching tax name */ int const len_tax = pmatch_tax[2].rm_eo - pmatch_tax[2].rm_so; start_tax += pmatch_tax[2].rm_so; std::vector tax_name(len_tax + 1); std::strncpy(tax_name.data(), start_tax, len_tax); tax_name[len_tax] = '\0'; otutable->otu_tax_map[otu_name] = tax_name.data(); } #else std::cmatch cmatch_tax; if (regex_search(target_header, cmatch_tax, regex_tax)) { otutable->otu_tax_map[otu_name] = cmatch_tax.str(2); } #endif } /* store data */ if (sample_name) { otutable->sample_set.insert(sample_name); } if (otu_name) { otutable->otu_set.insert(otu_name); } if (sample_name && otu_name && abundance) { otutable->sample_otu_count[string_pair_t(sample_name,otu_name)] += abundance; otutable->otu_sample_count[string_pair_t(otu_name,sample_name)] += abundance; } if (otu_name) { xfree(otu_name); } if (sample_name) { xfree(sample_name); } } auto otutable_print_otutabout(std::FILE * output_handle) -> void { int64_t progress = 0; progress_init("Writing OTU table (classic)", otutable->otu_set.size()); fprintf(output_handle, "#OTU ID"); for (auto const & it_sample : otutable->sample_set) { fprintf(output_handle, "\t%s", it_sample.c_str()); } if (! otutable->otu_tax_map.empty()) { fprintf(output_handle, "\ttaxonomy"); } fprintf(output_handle, "\n"); auto it_map = otutable->otu_sample_count.begin(); for (auto it_otu = otutable->otu_set.begin(); it_otu != otutable->otu_set.end(); ++it_otu) { fprintf(output_handle, "%s", it_otu->c_str()); for (auto it_sample = otutable->sample_set.begin(); it_sample != otutable->sample_set.end(); ++it_sample) { uint64_t a = 0; if ((it_map != otutable->otu_sample_count.end()) && (it_map->first.first == *it_otu) && (it_map->first.second == *it_sample)) { a = it_map->second; ++it_map; } fprintf(output_handle, "\t%" PRIu64, a); } if (! 
otutable->otu_tax_map.empty()) { fprintf(output_handle, "\t"); auto it = otutable->otu_tax_map.find(*it_otu); if (it != otutable->otu_tax_map.end()) { fprintf(output_handle, "%s", it->second.c_str()); } } fprintf(output_handle, "\n"); progress_update(++progress); } progress_done(); } auto otutable_print_mothur_shared_out(std::FILE * output_handle) -> void { int64_t progress = 0; progress_init("Writing OTU table (mothur)", otutable->sample_set.size()); fprintf(output_handle, "label\tGroup\tnumOtus"); int64_t numotus = 0; for (const auto & it_otu : otutable->otu_set) { const char * otu_name = it_otu.c_str(); fprintf(output_handle, "\t%s", otu_name); ++numotus; } fprintf(output_handle, "\n"); auto it_map = otutable->sample_otu_count.begin(); for (auto it_sample = otutable->sample_set.begin(); it_sample != otutable->sample_set.end(); ++it_sample) { fprintf(output_handle, "vsearch\t%s\t%" PRId64, it_sample->c_str(), numotus); for (auto it_otu = otutable->otu_set.begin(); it_otu != otutable->otu_set.end(); ++it_otu) { uint64_t a = 0; if ((it_map != otutable->sample_otu_count.end()) && (it_map->first.first == *it_sample) && (it_map->first.second == *it_otu)) { a = it_map->second; ++it_map; } fprintf(output_handle, "\t%" PRIu64, a); } fprintf(output_handle, "\n"); progress_update(++progress); } progress_done(); } auto otutable_print_biomout(std::FILE * output_handle) -> void { int64_t progress = 0; progress_init("Writing OTU table (biom 1.0)", otutable->otu_sample_count.size()); int64_t const rows = otutable->otu_set.size(); int64_t const columns = otutable->sample_set.size(); static const time_t time_now = time(nullptr); struct tm * tm_now = localtime(& time_now); char date[50]; strftime(date, 50, "%Y-%m-%dT%H:%M:%S", tm_now); fprintf(output_handle, "{\n" "\t\"id\":\"%s\",\n" "\t\"format\": \"Biological Observation Matrix 1.0\",\n" "\t\"format_url\": \"http://biom-format.org/documentation/format_versions/biom-1.0.html\",\n" "\t\"type\": \"OTU table\",\n" 
"\t\"generated_by\": \"%s %s\",\n" "\t\"date\": \"%s\",\n" "\t\"matrix_type\": \"sparse\",\n" "\t\"matrix_element_type\": \"int\",\n" "\t\"shape\": [%" PRId64 ",%" PRId64 "],\n", opt_biomout, PROG_NAME, PROG_VERSION, date, rows, columns); string_no_map_t otu_no_map; uint64_t otu_no = 0; fprintf(output_handle, "\t\"rows\":["); for (auto it_otu = otutable->otu_set.begin(); it_otu != otutable->otu_set.end(); ++it_otu) { if (it_otu != otutable->otu_set.begin()) { fprintf(output_handle, ","); } const char * otu_name = it_otu->c_str(); fprintf(output_handle, "\n\t\t{\"id\":\"%s\", \"metadata\":", otu_name); if (otutable->otu_tax_map.empty()) { fprintf(output_handle, "null"); } else { fprintf(output_handle, R"({"taxonomy":")"); auto it = otutable->otu_tax_map.find(otu_name); if (it != otutable->otu_tax_map.end()) { fprintf(output_handle, "%s", it->second.c_str()); } fprintf(output_handle, "\"}"); } fprintf(output_handle, "}"); otu_no_map[*it_otu] = otu_no++; } fprintf(output_handle, "\n"); fprintf(output_handle, "\t],\n"); string_no_map_t sample_no_map; uint64_t sample_no = 0; fprintf(output_handle, "\t\"columns\":["); for (auto it_sample = otutable->sample_set.begin(); it_sample != otutable->sample_set.end(); ++it_sample) { if (it_sample != otutable->sample_set.begin()) { fprintf(output_handle, ","); } fprintf(output_handle, "\n\t\t{\"id\":\"%s\", \"metadata\":null}", it_sample->c_str()); sample_no_map[*it_sample] = sample_no++; } fprintf(output_handle, "\n\t],\n"); bool first = true; fprintf(output_handle, "\t\"data\": ["); for (auto & it_map : otutable->otu_sample_count) { if (! 
first) { fprintf(output_handle, ","); } otu_no = otu_no_map[it_map.first.first]; sample_no = sample_no_map[it_map.first.second]; fprintf(output_handle, "\n\t\t[%" PRIu64 ",%" PRIu64 ",%" PRIu64 "]", otu_no, sample_no, it_map.second); first = false; progress_update(++progress); } fprintf(output_handle, "\n\t]\n"); fprintf(output_handle, "}\n"); progress_done(); } vsearch-2.30.0/src/otutable.h000066400000000000000000000055171476012147200160220ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // std::FILE #include // int64_t auto otutable_init() -> void; auto otutable_done() -> void; auto otutable_add(char * query_header, char * target_header, int64_t abundance) -> void; auto otutable_print_otutabout(std::FILE * output_handle) -> void; auto otutable_print_mothur_shared_out(std::FILE * output_handle) -> void; auto otutable_print_biomout(std::FILE * output_handle) -> void; vsearch-2.30.0/src/rereplicate.cc000066400000000000000000000121571476012147200166360ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "utils/maps.hpp" #include // macros PRIu64 and PRId64 #include // std::FILE, std::fprintf #include // int64_t auto rereplicate(struct Parameters & parameters) -> void { if (parameters.opt_output == nullptr) { fatal("FASTA output file for rereplicate must be specified with --output"); } auto * fp_output = fopen_output(parameters.opt_output); if (fp_output == nullptr) { fatal("Unable to open FASTA output file for writing"); } opt_xsize = true; parameters.opt_xsize = true; fastx_handle file_handle = fasta_open(parameters.opt_rereplicate); auto const filesize = static_cast(fasta_get_size(file_handle)); progress_init("Rereplicating", filesize); int64_t n_amplicons = 0; int64_t missing = 0; int64_t n_reads = 0; auto const truncateatspace = not parameters.opt_notrunclabels; while (fasta_next(file_handle, truncateatspace, chrmap_no_change_vector.data())) { ++n_amplicons; int64_t abundance = fasta_get_abundance_and_presence(file_handle); if (abundance == 0) { ++missing; abundance = 1; } for(int64_t i = 0; i < abundance; ++i) { ++n_reads; if (parameters.opt_output != nullptr) { fasta_print_general(fp_output, nullptr, fasta_get_sequence(file_handle), static_cast(fasta_get_sequence_length(file_handle)), fasta_get_header(file_handle), static_cast(fasta_get_header_length(file_handle)), 1, static_cast(n_reads), -1.0, -1, -1, nullptr, 0.0); } } progress_update(fasta_get_position(file_handle)); } progress_done(); if (not parameters.opt_quiet) { if (missing != 0) { std::fprintf(stderr, "WARNING: Missing abundance information for some input sequences, assumed 1\n"); } std::fprintf(stderr, "Rereplicated %" PRId64 " reads from %" PRId64 " amplicons\n", n_reads, n_amplicons); } if (parameters.opt_log != nullptr) { if (missing != 0) { std::fprintf(stderr, "WARNING: Missing abundance information for some input sequences, assumed 1\n"); } std::fprintf(fp_log, "Rereplicated %" PRId64 " reads from %" PRId64 " amplicons\n", n_reads, n_amplicons); } 
fasta_close(file_handle); if (fp_output != nullptr) { static_cast(std::fclose(fp_output)); } } vsearch-2.30.0/src/rereplicate.h000066400000000000000000000047471476012147200165060ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto rereplicate(struct Parameters & parameters) -> void; vsearch-2.30.0/src/results.cc000066400000000000000000000740531476012147200160430ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "attributes.h" #include "maps.h" #include "showalign.h" #include "tax.h" #include "userfields.h" #include // std::max #include #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::snprintf, std::sscanf #include // std::strlen, std::strncmp auto results_show_fastapairs_one(std::FILE * output_handle, struct hit * hits, char * query_head, char * qsequence, char * qsequence_rc) -> void { /* http://www.drive5.com/usearch/manual/fastapairs.html */ if (hits == nullptr) { return; } auto * qrow = align_getrow(hits->strand ? 
qsequence_rc : qsequence, hits->nwalignment, hits->nwalignmentlength, 0); fasta_print_general(output_handle, nullptr, qrow + hits->trim_q_left + hits->trim_t_left, hits->internal_alignmentlength, query_head, strlen(query_head), 0, 0, -1.0, -1, -1, nullptr, 0.0); xfree(qrow); auto * trow = align_getrow(db_getsequence(hits->target), hits->nwalignment, hits->nwalignmentlength, 1); fasta_print_general(output_handle, nullptr, trow + hits->trim_q_left + hits->trim_t_left, hits->internal_alignmentlength, db_getheader(hits->target), db_getheaderlen(hits->target), 0, 0, -1.0, -1, -1, nullptr, 0.0); xfree(trow); fprintf(output_handle, "\n"); } auto results_show_qsegout_one(std::FILE * output_handle, struct hit * hits, char * query_head, char * qsequence, int64_t qseqlen, char * qsequence_rc) -> void { if (hits == nullptr) { return; } char * qseg = (hits->strand ? qsequence_rc : qsequence) + hits->trim_q_left; int const qseglen = qseqlen - hits->trim_q_left - hits->trim_q_right; fasta_print_general(output_handle, nullptr, qseg, qseglen, query_head, strlen(query_head), 0, 0, -1.0, -1, -1, nullptr, 0.0); } auto results_show_tsegout_one(std::FILE * output_handle, struct hit * hits) -> void { if (hits == nullptr) { return; } auto * tseg = db_getsequence(hits->target) + hits->trim_t_left; int const tseglen = db_getsequencelen(hits->target) - hits->trim_t_left - hits->trim_t_right; fasta_print_general(output_handle, nullptr, tseg, tseglen, db_getheader(hits->target), db_getheaderlen(hits->target), 0, 0, -1.0, -1, -1, nullptr, 0.0); } auto results_show_blast6out_one(std::FILE * output_handle, struct hit * hits, char * query_head, int64_t qseqlen) -> void { /* http://www.drive5.com/usearch/manual/blast6out.html query label target label percent identity alignment length number of mismatches number of gap opens 1-based position of start in query 1-based position of end in query 1-based position of start in target 1-based position of end in target E-value bit score Note that USEARCH 
shows 13 fields when there is no hit, but only 12 when there is a hit. Fixed in VSEARCH. */ if (hits == nullptr) { fprintf(output_handle, "%s\t*\t0.0\t0\t0\t0\t0\t0\t0\t0\t-1\t0\n", query_head); return; } // if 'hp->strand' then 'minus strand' else 'plus strand' const int qstart = hits->strand ? qseqlen : 1; const int qend = hits->strand ? 1 : qseqlen; fprintf(output_handle, "%s\t%s\t%.1f\t%d\t%d\t%d\t%d\t%d\t%d\t%" PRIu64 "\t%d\t%d\n", query_head, db_getheader(hits->target), hits->id, hits->internal_alignmentlength, hits->mismatches, hits->internal_gaps, qstart, qend, 1, db_getsequencelen(hits->target), -1, 0); } auto results_show_uc_one(std::FILE * output_handle, struct hit * hits, char * query_head, int64_t qseqlen, int clusterno) -> void { /* http://www.drive5.com/usearch/manual/ucout.html Columns: H/N cluster no (0-based) (target sequence no) sequence length (query) percent identity strand: + or - 0 0 compressed alignment, e.g. 9I92M14D, or "=" if perfect alignment query label target label */ if (hits != nullptr) { auto perfect = false; if (opt_cluster_fast) { /* cluster_fast */ /* use = for identical sequences ignoring terminal gaps */ perfect = (hits->matches == hits->internal_alignmentlength); } else { /* cluster_size, cluster_smallmem, cluster_unoise */ /* usearch_global, search_exact, allpairs_global */ /* use = for strictly identical sequences */ perfect = (hits->matches == hits->nwalignmentlength); } fprintf(output_handle, "H\t%d\t%" PRId64 "\t%.1f\t%c\t0\t0\t%s\t", clusterno, qseqlen, hits->id, hits->strand ? '-' : '+', perfect ? 
"=" : hits->nwalignment); header_fprint_strip(output_handle, query_head, strlen(query_head), opt_xsize, opt_xee, opt_xlength); fprintf(output_handle, "\t"); header_fprint_strip(output_handle, db_getheader(hits->target), db_getheaderlen(hits->target), opt_xsize, opt_xee, opt_xlength); fprintf(output_handle, "\n"); } else { fprintf(output_handle, "N\t*\t*\t*\t.\t*\t*\t*\t%s\t*\n", query_head); } } auto results_show_userout_one(std::FILE * output_handle, struct hit * hits, char * query_head, char * qsequence, int64_t qseqlen, char * qsequence_rc) -> void { /* http://drive5.com/usearch/manual/userout.html qlo, qhi, tlo, thi and raw are given more meaningful values here */ for (auto c = 0; c < userfields_requested_count; c++) { if (c != 0) { fprintf(output_handle, "\t"); } auto const field = userfields_requested[c]; char * tsequence = nullptr; int64_t tseqlen = 0; char * t_head = nullptr; if (hits != nullptr) { tsequence = db_getsequence(hits->target); tseqlen = db_getsequencelen(hits->target); t_head = db_getheader(hits->target); } char * qrow = nullptr; char * trow = nullptr; switch (field) { case 0: /* query */ fprintf(output_handle, "%s", query_head); break; case 1: /* target */ fprintf(output_handle, "%s", hits ? t_head : "*"); break; case 2: /* evalue */ fprintf(output_handle, "-1"); break; case 3: /* id */ fprintf(output_handle, "%.1f", hits ? hits->id : 0.0); break; case 4: /* pctpv */ fprintf(output_handle, "%.1f", (hits and (hits->internal_alignmentlength > 0)) ? 100.0 * hits->matches / hits->internal_alignmentlength : 0.0); break; case 5: /* pctgaps */ fprintf(output_handle, "%.1f", (hits and (hits->internal_alignmentlength > 0)) ? 100.0 * hits->internal_indels / hits->internal_alignmentlength : 0.0); break; case 6: /* pairs */ fprintf(output_handle, "%d", hits ? hits->matches + hits->mismatches : 0); break; case 7: /* gaps */ fprintf(output_handle, "%d", hits ? hits->internal_indels : 0); break; case 8: /* qlo */ fprintf(output_handle, "%" PRId64, hits ? 
/*
  Write one line of user-defined, tab-separated fields for a query and
  (optionally) one of its hits.

  The fields to print, and their order, are taken from
  userfields_requested[0 .. userfields_requested_count - 1]; each entry
  is a numeric field code handled by the switch below (the field name
  is given in the comment of each case). A field code without a case
  prints nothing, but its tab separator is still written.

  When 'hits' is a null pointer (no hit), numeric fields are printed
  as 0 or 0.0, the target label as "*", and alignment-related fields
  (aln, caln, qstrand, tstrand, qrow, trow) are left empty.

  qsequence_rc is used instead of qsequence for minus-strand hits
  when building the query row (qrow).
*/
auto results_show_userout_one(std::FILE * output_handle, struct hit * hits, char * query_head, char * qsequence, int64_t qseqlen, char * qsequence_rc) -> void
{
  /*
    http://drive5.com/usearch/manual/userout.html
    qlo, qhi, tlo, thi and raw are given more meaningful values here
  */

  for (auto c = 0; c < userfields_requested_count; c++)
    {
      /* fields are separated (not terminated) by tabs */
      if (c != 0)
        {
          fprintf(output_handle, "\t");
        }

      auto const field = userfields_requested[c];

      /* target data, only available when there is a hit */
      char * tsequence = nullptr;
      int64_t tseqlen = 0;
      char * t_head = nullptr;

      if (hits != nullptr)
        {
          tsequence = db_getsequence(hits->target);
          tseqlen = db_getsequencelen(hits->target);
          t_head = db_getheader(hits->target);
        }

      /* alignment rows, allocated and released in cases 26 and 27 */
      char * qrow = nullptr;
      char * trow = nullptr;

      switch (field)
        {
        case 0: /* query */
          fprintf(output_handle, "%s", query_head);
          break;
        case 1: /* target */
          fprintf(output_handle, "%s", hits ? t_head : "*");
          break;
        case 2: /* evalue */
          fprintf(output_handle, "-1");
          break;
        case 3: /* id */
          fprintf(output_handle, "%.1f", hits ? hits->id : 0.0);
          break;
        case 4: /* pctpv */
          /* guarded against an empty internal alignment */
          fprintf(output_handle, "%.1f", (hits and (hits->internal_alignmentlength > 0)) ? 100.0 * hits->matches / hits->internal_alignmentlength : 0.0);
          break;
        case 5: /* pctgaps */
          /* guarded against an empty internal alignment */
          fprintf(output_handle, "%.1f", (hits and (hits->internal_alignmentlength > 0)) ? 100.0 * hits->internal_indels / hits->internal_alignmentlength : 0.0);
          break;
        case 6: /* pairs */
          fprintf(output_handle, "%d", hits ? hits->matches + hits->mismatches : 0);
          break;
        case 7: /* gaps */
          fprintf(output_handle, "%d", hits ? hits->internal_indels : 0);
          break;
        case 8: /* qlo */
          /* whole query; start and end are swapped on the minus strand */
          fprintf(output_handle, "%" PRId64, hits ? (hits->strand ? qseqlen : 1) : 0);
          break;
        case 9: /* qhi */
          fprintf(output_handle, "%" PRId64, hits ? (hits->strand ? 1 : qseqlen) : 0);
          break;
        case 10: /* tlo */
          fprintf(output_handle, "%d", hits ? 1 : 0);
          break;
        case 11: /* thi */
          /* tseqlen is still 0 when there is no hit */
          fprintf(output_handle, "%" PRId64, tseqlen);
          break;
        case 12: /* pv */
          fprintf(output_handle, "%d", hits ? hits->matches : 0);
          break;
        case 13: /* ql */
          fprintf(output_handle, "%" PRId64, qseqlen);
          break;
        case 14: /* tl */
          fprintf(output_handle, "%" PRId64, hits ? tseqlen : 0);
          break;
        case 15: /* qs */
          fprintf(output_handle, "%" PRId64, qseqlen);
          break;
        case 16: /* ts */
          fprintf(output_handle, "%" PRId64, hits ? tseqlen : 0);
          break;
        case 17: /* alnlen */
          fprintf(output_handle, "%d", hits ? hits->internal_alignmentlength : 0);
          break;
        case 18: /* opens */
          fprintf(output_handle, "%d", hits ? hits->internal_gaps : 0);
          break;
        case 19: /* exts */
          fprintf(output_handle, "%d", hits ? hits->internal_indels - hits->internal_gaps : 0);
          break;
        case 20: /* raw */
          fprintf(output_handle, "%d", hits ? hits->nwscore : 0);
          break;
        case 21: /* bits */
          fprintf(output_handle, "%d", 0);
          break;
        case 22: /* aln */
          if (hits)
            {
              align_fprint_uncompressed_alignment(output_handle, hits->nwalignment);
            }
          break;
        case 23: /* caln */
          if (hits)
            {
              fprintf(output_handle, "%s", hits->nwalignment);
            }
          break;
        case 24: /* qstrand */
          if (hits)
            {
              fprintf(output_handle, "%c", hits->strand ? '-' : '+');
            }
          break;
        case 25: /* tstrand */
          if (hits)
            {
              fprintf(output_handle, "%c", '+');
            }
          break;
        case 26: /* qrow */
          if (hits)
            {
              /* print only the internal part of the aligned query row */
              qrow = align_getrow(hits->strand ? qsequence_rc : qsequence,
                                  hits->nwalignment,
                                  hits->nwalignmentlength,
                                  0);
              fprintf(output_handle, "%.*s",
                      hits->internal_alignmentlength,
                      qrow + hits->trim_q_left + hits->trim_t_left);
              xfree(qrow);
            }
          break;
        case 27: /* trow */
          if (hits)
            {
              /* print only the internal part of the aligned target row */
              trow = align_getrow(tsequence,
                                  hits->nwalignment,
                                  hits->nwalignmentlength,
                                  1);
              fprintf(output_handle, "%.*s",
                      hits->internal_alignmentlength,
                      trow + hits->trim_q_left + hits->trim_t_left);
              xfree(trow);
            }
          break;
        case 28: /* qframe */
          fprintf(output_handle, "+0");
          break;
        case 29: /* tframe */
          fprintf(output_handle, "+0");
          break;
        case 30: /* mism */
          fprintf(output_handle, "%d", hits ? hits->mismatches : 0);
          break;
        case 31: /* ids */
          fprintf(output_handle, "%d", hits ? hits->matches : 0);
          break;
        case 32: /* qcov */
          /* NOTE(review): no guard against qseqlen == 0 here - confirm that callers never pass an empty query */
          fprintf(output_handle, "%.1f", hits ? 100.0 * (hits->matches + hits->mismatches) / qseqlen : 0.0);
          break;
        case 33: /* tcov */
          /* NOTE(review): no guard against tseqlen == 0 here - confirm that targets are never empty */
          fprintf(output_handle, "%.1f", hits ? 100.0 * (hits->matches + hits->mismatches) / tseqlen : 0.0);
          break;
        case 34: /* id0 */
          fprintf(output_handle, "%.1f", hits ? hits->id0 : 0.0);
          break;
        case 35: /* id1 */
          fprintf(output_handle, "%.1f", hits ? hits->id1 : 0.0);
          break;
        case 36: /* id2 */
          fprintf(output_handle, "%.1f", hits ? hits->id2 : 0.0);
          break;
        case 37: /* id3 */
          fprintf(output_handle, "%.1f", hits ? hits->id3 : 0.0);
          break;
        case 38: /* id4 */
          fprintf(output_handle, "%.1f", hits ? hits->id4 : 0.0);
          break;

          /* new internal alignment coordinates */

        case 39: /* qilo */
          fprintf(output_handle, "%d", hits ? hits->trim_q_left + 1 : 0);
          break;
        case 40: /* qihi */
          fprintf(output_handle, "%" PRId64, hits ? qseqlen - hits->trim_q_right : 0);
          break;
        case 41: /* tilo */
          fprintf(output_handle, "%d", hits ? hits->trim_t_left + 1 : 0);
          break;
        case 42: /* tihi */
          fprintf(output_handle, "%" PRId64, hits ? tseqlen - hits->trim_t_right : 0);
          break;
        }
    }
  fprintf(output_handle, "\n");
}
tseqlen - hits->trim_t_right : 0); break; } } fprintf(output_handle, "\n"); } auto results_show_lcaout(std::FILE * output_handle, struct hit * hits, int hitcount, char * query_head) -> void { /* Output last common ancestor (LCA) of the hits, in a similar way to the Sintax command */ /* Use a modified Boyer-Moore majority voting algorithm at each taxonomic level to find the most common name at each level */ fprintf(output_handle, "%s\t", query_head); std::array votes {{}}; std::array cand; cand.fill(-1); std::array, tax_levels> cand_level_start {{}}; std::array, tax_levels> cand_level_len {{}}; std::array level_match {{}}; auto const top_hit_id = hits[0].id; auto tophitcount = 0; for (auto t = 0; t < hitcount; t++) { struct hit * hp = hits + t; if (opt_top_hits_only and (hp->id < top_hit_id)) { break; } ++tophitcount; int const seqno = hp->target; int new_level_start[tax_levels]; int new_level_len[tax_levels]; tax_split(seqno, new_level_start, new_level_len); for (auto k = 0; k < tax_levels; k++) { if (votes[k] == 0) { cand[k] = seqno; votes[k] = 1; for (auto j = 0; j < tax_levels; j++) { cand_level_start[k][j] = new_level_start[j]; cand_level_len[k][j] = new_level_len[j]; } } else { auto match = true; for (auto j = 0; j <= k; j++) { if ((new_level_len[j] != cand_level_len[k][j]) or (strncmp(db_getheader(cand[k]) + cand_level_start[k][j], db_getheader(seqno) + new_level_start[j], new_level_len[j]) != 0)) { match = false; break; } } if (match) { ++votes[k]; } else { --votes[k]; } } } } /* count actual matches to the candidate at each level */ for (auto t = 0; t < tophitcount; t++) { auto const seqno = hits[t].target; int new_level_start[tax_levels]; int new_level_len[tax_levels]; tax_split(seqno, new_level_start, new_level_len); for (auto k = 0; k < tax_levels; k++) { auto match = true; for (auto j = 0; j <= k; j++) { if ((new_level_len[j] != cand_level_len[k][j]) or (strncmp(db_getheader(cand[k]) + cand_level_start[k][j], db_getheader(seqno) + new_level_start[j], 
new_level_len[j]) != 0)) { match = false; break; } } if (match) { ++level_match[k]; } } } /* output results */ if (tophitcount > 0) { auto comma = false; for (auto j = 0; j < tax_levels; j++) { if (1.0 * level_match[j] / tophitcount < opt_lca_cutoff) { break; } if (cand_level_len[j][j] > 0) { fprintf(output_handle, "%s%c:%.*s", (comma ? "," : ""), tax_letters[j], cand_level_len[j][j], db_getheader(cand[j]) + cand_level_start[j][j]); comma = true; } } } fprintf(output_handle, "\n"); } auto results_show_alnout(std::FILE * output_handle, struct hit * hits, int hitcount, char * query_head, char * qsequence, int64_t qseqlen) -> void { /* http://drive5.com/usearch/manual/alnout.html */ if (hitcount > 0) { fprintf(output_handle, "\n"); fprintf(output_handle,"Query >%s\n", query_head); fprintf(output_handle," %%Id TLen Target\n"); auto const top_hit_id = hits[0].id; for (auto t = 0; t < hitcount; t++) { auto * hp = hits + t; if (opt_top_hits_only and (hp->id < top_hit_id)) { break; } fprintf(output_handle,"%3.0f%% %6" PRIu64 " %s\n", hp->id, db_getsequencelen(hp->target), db_getheader(hp->target)); } for (auto t = 0; t < hitcount; t++) { auto * hp = hits + t; if (opt_top_hits_only and (hp->id < top_hit_id)) { break; } fprintf(output_handle,"\n"); auto * dseq = db_getsequence(hp->target); int64_t const dseqlen = db_getsequencelen(hp->target); auto const qlenlen = snprintf(nullptr, 0, "%" PRId64, qseqlen); auto const tlenlen = snprintf(nullptr, 0, "%" PRId64, dseqlen); auto const numwidth = std::max(qlenlen, tlenlen); fprintf(output_handle," Query %*" PRId64 "nt >%s\n", numwidth, qseqlen, query_head); fprintf(output_handle,"Target %*" PRId64 "nt >%s\n", numwidth, dseqlen, db_getheader(hp->target)); int const rowlen = opt_rowlen == 0 ? 
qseqlen + dseqlen : opt_rowlen; align_show(output_handle, qsequence, qseqlen, hp->trim_q_left, "Qry", dseq, dseqlen, hp->trim_t_left, "Tgt", hp->nwalignment + hp->trim_aln_left, strlen(hp->nwalignment) - hp->trim_aln_left - hp->trim_aln_right, numwidth, 3, rowlen, hp->strand); fprintf(output_handle, "\n%d cols, %d ids (%3.1f%%), %d gaps (%3.1f%%)\n", hp->internal_alignmentlength, hp->matches, hp->id, hp->internal_indels, hp->internal_alignmentlength > 0 ? 100.0 * hp->internal_indels / hp->internal_alignmentlength : 0.0); #if 0 fprintf(output_handle, "%d kmers, %d score, %d gap opens. %s %s %d %d %d %d %d\n", hp->count, hp->nwscore, hp->nwgaps, hp->accepted ? "accepted" : "not accepted", hp->nwalignment, hp->nwalignmentlength, hp->trim_q_left, hp->trim_q_right, hp->trim_t_left, hp->trim_t_right ); #endif } } else if (opt_output_no_hits) { fprintf(output_handle, "\n"); fprintf(output_handle,"Query >%s\n", query_head); fprintf(output_handle,"No hits\n"); } } auto inline nucleotide_equal(char lhs, char rhs) -> bool { return chrmap_4bit[(int) lhs] == chrmap_4bit[(int) rhs]; } auto build_sam_strings(char * alignment, char * queryseq, char * targetseq, xstring * cigar, xstring * md) -> void { /* convert cigar to sam format: add "1" to operations without run length flip direction of indels in cigar string build MD-string with substitutions */ cigar->empty(); md->empty(); auto * p = alignment; auto * e = p + strlen(p); auto qpos = 0; auto tpos = 0; auto matched = 0; auto flag = false; /* 1: MD string ends with a number */ while (p < e) { auto run = 1; auto scanned = 0; sscanf(p, "%d%n", &run, &scanned); p += scanned; auto const op = *p; ++p; switch (op) { case 'M': cigar->add_d(run); cigar->add_c('M'); for (auto i = 0; i < run; i++) { if (nucleotide_equal(queryseq[qpos], targetseq[tpos])) { ++matched; } else { if (not flag) { md->add_d(matched); matched = 0; flag = true; } md->add_c(targetseq[tpos]); flag = false; } ++qpos; ++tpos; } break; case 'D': cigar->add_d(run); 
/*
  Write the SAM header section, only when both opt_samout and
  opt_samheader are set:
    - one @HD line,
    - one @SQ line per database sequence, with its label (SN), its
      length (LN), the MD5 digest of the sequence in hexadecimal (M5)
      and the database file name as a file: URI (UR),
    - one @PG line with the program name, version and command line.

  Parameters:
    output_handle  stream to write to
    cmdline        command line, written as-is in the @PG line
    dbname         database file name, written as-is in each @SQ line
*/
auto results_show_samheader(std::FILE * output_handle, char * cmdline, char * dbname) -> void
{
  if (opt_samout and opt_samheader)
    {
      fprintf(output_handle, "@HD\tVN:1.0\tSO:unsorted\tGO:query\n");
      /* buffer receiving the hexadecimal digest of each sequence, */
      /* printed below with %s */
      /* NOTE(review): presumably sized for the digest plus a terminating null - confirm against get_hex_seq_digest_md5 */
      std::array md5hex;
      for (uint64_t i = 0; i < db_getsequencecount(); i++)
        {
          get_hex_seq_digest_md5(md5hex.data(),
                                 db_getsequence(i),
                                 db_getsequencelen(i));
          fprintf(output_handle, "@SQ\tSN:%s\tLN:%" PRIu64 "\tM5:%s\tUR:file:%s\n",
                  db_getheader(i),
                  db_getsequencelen(i),
                  md5hex.data(),
                  dbname);
        }
      fprintf(output_handle, "@PG\tID:%s\tVN:%s\tCL:%s\n",
              PROG_NAME,
              PROG_VERSION,
              cmdline);
    }
}
number of gap opens (excluding terminal gaps) XG:i:? number of gap extensions (excluding terminal gaps) NM:i:? edit distance (sum of XM and XG) MD:Z:? variant string YT:Z:UU string representing alignment type */ if (hitcount > 0) { auto const top_hit_id = hits[0].id; for (auto t = 0; t < hitcount; t++) { auto * hp = hits + t; if (opt_top_hits_only and (hp->id < top_hit_id)) { break; } /* */ xstring cigar; xstring md; build_sam_strings(hp->nwalignment, hp->strand ? qsequence_rc : qsequence, db_getsequence(hp->target), &cigar, &md); fprintf(output_handle, "%s\t%u\t%s\t%" PRIu64 "\t%u\t%s\t%s\t%" PRIu64 "\t%" PRIu64 "\t%s\t%s\t" "AS:i:%.0f\tXN:i:%d\tXM:i:%d\tXO:i:%d\t" "XG:i:%d\tNM:i:%d\tMD:Z:%s\tYT:Z:%s\n", query_head, (0x10 * hp->strand) | (t > 0 ? 0x100 : 0), db_getheader(hp->target), (uint64_t) 1, 255, cigar.get_string(), "*", (uint64_t) 0, (uint64_t) 0, hp->strand ? qsequence_rc : qsequence, "*", hp->id, 0, hp->mismatches, hp->internal_gaps, hp->internal_indels, hp->mismatches + hp->internal_indels, md.get_string(), "UU"); } } else if (opt_output_no_hits) { fprintf(output_handle, "%s\t%u\t%s\t%" PRIu64 "\t%u\t%s\t%s\t%" PRIu64 "\t%" PRIu64 "\t%s\t%s\n", query_head, 0x04, "*", (uint64_t) 0, 255, "*", "*", (uint64_t) 0, (uint64_t) 0, qsequence, "*"); } } vsearch-2.30.0/src/results.h000066400000000000000000000114471476012147200157030ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include // std::FILE #include // int64_t auto results_show_alnout(std::FILE * output_handle, struct hit * hits, int hitcount, char * query_head, char * qsequence, int64_t qseqlen) -> void; auto results_show_lcaout(std::FILE * output_handle, struct hit * hits, int hitcount, char * query_head) -> void; auto results_show_blast6out_one(std::FILE * output_handle, struct hit * hits, char * query_head, int64_t qseqlen) -> void; auto results_show_uc_one(std::FILE * output_handle, struct hit * hits, char * query_head, int64_t qseqlen, int clusterno) -> void; auto results_show_userout_one(std::FILE * output_handle, struct hit * hits, char * query_head, char * qsequence, int64_t qseqlen, char * qsequence_rc) -> void; auto results_show_fastapairs_one(std::FILE * output_handle, struct hit * hits, char * query_head, char * qsequence, char * qsequence_rc) -> void; auto results_show_qsegout_one(std::FILE * output_handle, struct hit * hits, char * query_head, char * qsequence, int64_t qseqlen, char * qsequence_rc) -> void; auto results_show_tsegout_one(std::FILE * output_handle, struct hit * hits) -> void; auto results_show_samheader(std::FILE * output_handle, char * cmdline, char * dbname) -> void; auto results_show_samout(std::FILE * output_handle, struct hit * hits, int hitcount, char * query_head, char * qsequence, char * qsequence_rc) -> void; vsearch-2.30.0/src/search.cc000066400000000000000000000642651476012147200156130ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "align_simd.h" #include "dbindex.h" #include "maps.h" #include "mask.h" #include "minheap.h" #include "otutable.h" #include "udb.h" #include "unique.h" #include // std::min #include // macros PRIu64 and PRId64 #include // uint64_t, int64_t #include // std::FILE, std::fprintf, std::fclose, std::size_t #include // std::strlen, std::memset, std::strcpy #include static struct searchinfo_s * si_plus; static struct searchinfo_s * si_minus; static pthread_t * pthread; /* global constants/data, no need for synchronization */ static int tophits; /* the maximum number of hits to keep */ static int seqcount; /* number of database sequences */ static pthread_attr_t attr; static fastx_handle query_fastx_h; /* global data protected by mutex */ static pthread_mutex_t mutex_input; static pthread_mutex_t mutex_output; static int qmatches; static uint64 qmatches_abundance; static int queries; static uint64 queries_abundance; static uint64 * dbmatched; static FILE * fp_samout = nullptr; static FILE * fp_alnout = nullptr; static FILE * fp_userout = nullptr; static FILE * fp_blast6out = nullptr; static FILE * fp_uc = nullptr; static FILE * fp_fastapairs = nullptr; static FILE * fp_matched = nullptr; static FILE * fp_notmatched = nullptr; static FILE * fp_dbmatched = nullptr; static FILE * fp_dbnotmatched = nullptr; static FILE * fp_otutabout = nullptr; static FILE * fp_mothur_shared_out = nullptr; static FILE * fp_biomout = nullptr; static FILE * fp_lcaout = nullptr; static FILE * fp_qsegout = nullptr; static FILE * fp_tsegout = nullptr; static int count_matched = 0; static int count_notmatched = 0; auto search_output_results(int hit_count, struct hit * hits, char * query_head, int qseqlen, char * qsequence, char * qsequence_rc, int qsize) -> void { xpthread_mutex_lock(&mutex_output); /* show results */ auto const toreport = std::min(opt_maxhits, hit_count); if (fp_alnout) { results_show_alnout(fp_alnout, hits, toreport, query_head, qsequence, qseqlen); 
} if (fp_lcaout) { results_show_lcaout(fp_lcaout, hits, toreport, query_head); } if (fp_samout) { results_show_samout(fp_samout, hits, toreport, query_head, qsequence, qsequence_rc); } if (toreport) { double const top_hit_id = hits[0].id; if (opt_otutabout || opt_mothur_shared_out || opt_biomout) { otutable_add(query_head, db_getheader(hits[0].target), qsize); } for (int t = 0; t < toreport; t++) { struct hit * hp = hits + t; if (opt_top_hits_only && (hp->id < top_hit_id)) { break; } if (fp_fastapairs) { results_show_fastapairs_one(fp_fastapairs, hp, query_head, qsequence, qsequence_rc); } if (fp_qsegout) { results_show_qsegout_one(fp_qsegout, hp, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_tsegout) { results_show_tsegout_one(fp_tsegout, hp); } if (fp_uc) { if ((t==0) || opt_uc_allhits) { results_show_uc_one(fp_uc, hp, query_head, qseqlen, hp->target); } } if (fp_userout) { results_show_userout_one(fp_userout, hp, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out) { results_show_blast6out_one(fp_blast6out, hp, query_head, qseqlen); } } } else { if (opt_otutabout || opt_mothur_shared_out || opt_biomout) { otutable_add(query_head, nullptr, qsize); } if (fp_uc) { results_show_uc_one(fp_uc, nullptr, query_head, qseqlen, 0); } if (opt_output_no_hits) { if (fp_userout) { results_show_userout_one(fp_userout, nullptr, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out) { results_show_blast6out_one(fp_blast6out, nullptr, query_head, qseqlen); } } } if (hit_count) { count_matched++; if (opt_matched) { fasta_print_general(fp_matched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), qsize, count_matched, -1.0, -1, -1, nullptr, 0.0); } } else { count_notmatched++; if (opt_notmatched) { fasta_print_general(fp_notmatched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), qsize, count_notmatched, -1.0, -1, -1, nullptr, 0.0); } } /* update matching db sequences */ for (int i = 0; i < hit_count; i++) { if 
(hits[i].accepted || hits[i].weak) { dbmatched[hits[i].target] += opt_sizein ? qsize : 1; } } xpthread_mutex_unlock(&mutex_output); } auto search_query(int64_t t) -> int { for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = s ? si_minus + t : si_plus + t; /* mask query */ if (opt_qmask == MASK_DUST) { dust(si->qsequence, si->qseqlen); } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask)) { hardmask(si->qsequence, si->qseqlen); } /* perform search */ search_onequery(si, opt_qmask); } struct hit * hits = nullptr; int hit_count = 0; search_joinhits(si_plus + t, opt_strand > 1 ? si_minus + t : nullptr, & hits, & hit_count); search_output_results(hit_count, hits, si_plus[t].query_head, si_plus[t].qseqlen, si_plus[t].qsequence, opt_strand > 1 ? si_minus[t].qsequence : nullptr, si_plus[t].qsize); /* free memory for alignment strings */ for (int i = 0; i < hit_count; i++) { if (hits[i].aligned) { xfree(hits[i].nwalignment); } } xfree(hits); return hit_count; } auto search_thread_run(int64_t t) -> void { while (true) { xpthread_mutex_lock(&mutex_input); if (fastx_next(query_fastx_h, ! opt_notrunclabels, chrmap_no_change)) { char * qhead = fastx_get_header(query_fastx_h); int const query_head_len = fastx_get_header_length(query_fastx_h); char * qseq = fastx_get_sequence(query_fastx_h); int const qseqlen = fastx_get_sequence_length(query_fastx_h); int const query_no = fastx_get_seqno(query_fastx_h); int const qsize = fastx_get_abundance(query_fastx_h); for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = s ? 
si_minus + t : si_plus + t; si->query_head_len = query_head_len; si->qseqlen = qseqlen; si->query_no = query_no; si->qsize = qsize; si->strand = s; /* allocate more memory for header and sequence, if necessary */ if (si->query_head_len + 1 > si->query_head_alloc) { si->query_head_alloc = si->query_head_len + 2001; si->query_head = (char *) xrealloc(si->query_head, (size_t) (si->query_head_alloc)); } if (si->qseqlen + 1 > si->seq_alloc) { si->seq_alloc = si->qseqlen + 2001; si->qsequence = (char *) xrealloc(si->qsequence, (size_t) (si->seq_alloc)); } } /* plus strand: copy header and sequence */ strcpy(si_plus[t].query_head, qhead); strcpy(si_plus[t].qsequence, qseq); /* get progress as amount of input file read */ uint64_t const progress = fastx_get_position(query_fastx_h); /* let other threads read input */ xpthread_mutex_unlock(&mutex_input); /* minus strand: copy header and reverse complementary sequence */ if (opt_strand > 1) { strcpy(si_minus[t].query_head, si_plus[t].query_head); reverse_complement(si_minus[t].qsequence, si_plus[t].qsequence, si_plus[t].qseqlen); } int const match = search_query(t); /* lock mutex for update of global data and output */ xpthread_mutex_lock(&mutex_output); /* update stats */ queries++; queries_abundance += qsize; if (match) { qmatches++; qmatches_abundance += qsize; } /* show progress */ progress_update(progress); xpthread_mutex_unlock(&mutex_output); } else { xpthread_mutex_unlock(&mutex_input); break; } } } auto search_thread_init(struct searchinfo_s * si) -> void { /* thread specific initialiation */ si->uh = unique_init(); si->kmers = (count_t *) xmalloc((seqcount * sizeof(count_t)) + 32); si->m = minheap_init(tophits); si->hits = (struct hit *) xmalloc (sizeof(struct hit) * (tophits) * opt_strand); si->qsize = 1; si->query_head_alloc = 0; si->query_head = nullptr; si->seq_alloc = 0; si->qsequence = nullptr; si->s = search16_init(opt_match, opt_mismatch, opt_gap_open_query_left, opt_gap_open_target_left, 
opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); } auto search_thread_exit(struct searchinfo_s * si) -> void { /* thread specific clean up */ search16_exit(si->s); unique_exit(si->uh); xfree(si->hits); minheap_exit(si->m); xfree(si->kmers); if (si->query_head) { xfree(si->query_head); } if (si->qsequence) { xfree(si->qsequence); } } auto search_thread_worker(void * vp) -> void * { auto t = (int64_t) vp; search_thread_run(t); return nullptr; } auto search_thread_worker_run() -> void { /* initialize threads, start them, join them and return */ xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); /* init and create worker threads, put them into stand-by mode */ for (int t = 0; t < opt_threads; t++) { search_thread_init(si_plus + t); if (si_minus) { search_thread_init(si_minus + t); } xpthread_create(pthread + t, &attr, search_thread_worker, (void *) (int64_t) t); } /* finish and clean up worker threads */ for (int t = 0; t < opt_threads; t++) { xpthread_join(pthread[t], nullptr); search_thread_exit(si_plus + t); if (si_minus) { search_thread_exit(si_minus + t); } } xpthread_attr_destroy(&attr); } auto search_prep(char * cmdline, char * progheader) -> void { /* open output files */ if (opt_alnout) { fp_alnout = fopen_output(opt_alnout); if (! fp_alnout) { fatal("Unable to open alignment output file for writing"); } fprintf(fp_alnout, "%s\n", cmdline); fprintf(fp_alnout, "%s\n", progheader); } if (opt_lcaout) { fp_lcaout = fopen_output(opt_lcaout); if (! fp_lcaout) { fatal("Unable to open lca output file for writing"); } } if (opt_samout) { fp_samout = fopen_output(opt_samout); if (! 
fp_samout) { fatal("Unable to open SAM output file for writing"); } } if (opt_userout) { fp_userout = fopen_output(opt_userout); if (! fp_userout) { fatal("Unable to open user-defined output file for writing"); } } if (opt_blast6out) { fp_blast6out = fopen_output(opt_blast6out); if (! fp_blast6out) { fatal("Unable to open blast6-like output file for writing"); } } if (opt_uc) { fp_uc = fopen_output(opt_uc); if (! fp_uc) { fatal("Unable to open uc output file for writing"); } } if (opt_fastapairs) { fp_fastapairs = fopen_output(opt_fastapairs); if (! fp_fastapairs) { fatal("Unable to open fastapairs output file for writing"); } } if (opt_qsegout) { fp_qsegout = fopen_output(opt_qsegout); if (! fp_qsegout) { fatal("Unable to open qsegout output file for writing"); } } if (opt_tsegout) { fp_tsegout = fopen_output(opt_tsegout); if (! fp_tsegout) { fatal("Unable to open tsegout output file for writing"); } } if (opt_matched) { fp_matched = fopen_output(opt_matched); if (! fp_matched) { fatal("Unable to open matched output file for writing"); } } if (opt_notmatched) { fp_notmatched = fopen_output(opt_notmatched); if (! fp_notmatched) { fatal("Unable to open notmatched output file for writing"); } } if (opt_otutabout) { fp_otutabout = fopen_output(opt_otutabout); if (! fp_otutabout) { fatal("Unable to open OTU table (text format) output file for writing"); } } if (opt_mothur_shared_out) { fp_mothur_shared_out = fopen_output(opt_mothur_shared_out); if (! fp_mothur_shared_out) { fatal("Unable to open OTU table (mothur format) output file for writing"); } } if (opt_biomout) { fp_biomout = fopen_output(opt_biomout); if (! 
fp_biomout) { fatal("Unable to open OTU table (biom 1.0 format) output file for writing"); } } /* check if it may be an UDB file */ bool const is_udb = udb_detect_isudb(opt_db); if (is_udb) { udb_read(opt_db, true, true); results_show_samheader(fp_samout, cmdline, opt_db); show_rusage(); seqcount = db_getsequencecount(); } else { db_read(opt_db, 0); results_show_samheader(fp_samout, cmdline, opt_db); if (opt_dbmask == MASK_DUST) { dust_all(); } else if ((opt_dbmask == MASK_SOFT) && (opt_hardmask)) { hardmask_all(); } show_rusage(); seqcount = db_getsequencecount(); dbindex_prepare(1, opt_dbmask); dbindex_addallsequences(opt_dbmask); } /* tophits = the maximum number of hits we need to store */ if ((opt_maxrejects == 0) || (opt_maxrejects > seqcount)) { opt_maxrejects = seqcount; } if ((opt_maxaccepts == 0) || (opt_maxaccepts > seqcount)) { opt_maxaccepts = seqcount; } tophits = opt_maxrejects + opt_maxaccepts + MAXDELAYED; tophits = std::min(tophits, seqcount); } auto search_done() -> void { /* clean up, global */ dbindex_free(); db_free(); if (opt_lcaout) { fclose(fp_lcaout); } if (opt_matched) { fclose(fp_matched); } if (opt_notmatched) { fclose(fp_notmatched); } if (opt_fastapairs) { fclose(fp_fastapairs); } if (opt_qsegout) { fclose(fp_qsegout); } if (opt_tsegout) { fclose(fp_tsegout); } if (fp_uc) { fclose(fp_uc); } if (fp_blast6out) { fclose(fp_blast6out); } if (fp_userout) { fclose(fp_userout); } if (fp_alnout) { fclose(fp_alnout); } if (fp_samout) { fclose(fp_samout); } show_rusage(); } auto usearch_global(char * cmdline, char * progheader) -> void { search_prep(cmdline, progheader); if (opt_dbmatched) { fp_dbmatched = fopen_output(opt_dbmatched); if (! fp_dbmatched) { fatal("Unable to open dbmatched output file for writing"); } } if (opt_dbnotmatched) { fp_dbnotmatched = fopen_output(opt_dbnotmatched); if (! 
fp_dbnotmatched) { fatal("Unable to open dbnotmatched output file for writing"); } } dbmatched = (uint64*) xmalloc(seqcount * sizeof(uint64*)); memset(dbmatched, 0, seqcount * sizeof(uint64*)); otutable_init(); /* prepare reading of queries */ qmatches = 0; qmatches_abundance = 0; queries = 0; queries_abundance = 0; query_fastx_h = fastx_open(opt_usearch_global); /* allocate memory for thread info */ si_plus = (struct searchinfo_s *) xmalloc(opt_threads * sizeof(struct searchinfo_s)); if (opt_strand > 1) { si_minus = (struct searchinfo_s *) xmalloc(opt_threads * sizeof(struct searchinfo_s)); } else { si_minus = nullptr; } pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t)); /* init mutexes for input and output */ xpthread_mutex_init(&mutex_input, nullptr); xpthread_mutex_init(&mutex_output, nullptr); progress_init("Searching", fastx_get_size(query_fastx_h)); search_thread_worker_run(); progress_done(); xpthread_mutex_destroy(&mutex_output); xpthread_mutex_destroy(&mutex_input); xfree(pthread); xfree(si_plus); if (si_minus) { xfree(si_minus); } fastx_close(query_fastx_h); if (! 
opt_quiet) { fprintf(stderr, "Matching unique query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(stderr, "\n"); if (opt_sizein) { fprintf(stderr, "Matching total query sequences: %" PRIu64 " of %" PRIu64, qmatches_abundance, queries_abundance); if (queries_abundance > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches_abundance / queries_abundance); } fprintf(stderr, "\n"); } } if (opt_log) { fprintf(fp_log, "Matching unique query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(fp_log, "\n"); if (opt_sizein) { fprintf(fp_log, "Matching total query sequences: %" PRIu64 " of %" PRIu64, qmatches_abundance, queries_abundance); if (queries_abundance > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches_abundance / queries_abundance); } fprintf(fp_log, "\n"); } } // Add OTUs with no matches to OTU table if (opt_otutabout || opt_mothur_shared_out || opt_biomout) { for (int64_t i = 0; i < seqcount; i++) { if (! 
dbmatched[i]) { otutable_add(nullptr, db_getheader(i), 0); } } } if (opt_biomout) { otutable_print_biomout(fp_biomout); fclose(fp_biomout); } if (opt_otutabout) { otutable_print_otutabout(fp_otutabout); fclose(fp_otutabout); } if (opt_mothur_shared_out) { otutable_print_mothur_shared_out(fp_mothur_shared_out); fclose(fp_mothur_shared_out); } otutable_done(); int count_dbmatched = 0; int count_dbnotmatched = 0; if (opt_dbmatched || opt_dbnotmatched) { for (int64_t i = 0; i < seqcount; i++) { if (dbmatched[i]) { count_dbmatched++; if (opt_dbmatched) { fasta_print_general(fp_dbmatched, nullptr, db_getsequence(i), db_getsequencelen(i), db_getheader(i), db_getheaderlen(i), dbmatched[i], count_dbmatched, -1.0, -1, -1, nullptr, 0.0); } } else { count_dbnotmatched++; if (opt_dbnotmatched) { fasta_print_general(fp_dbnotmatched, nullptr, db_getsequence(i), db_getsequencelen(i), db_getheader(i), db_getheaderlen(i), db_getabundance(i), count_dbnotmatched, -1.0, -1, -1, nullptr, 0.0); } } } } xfree(dbmatched); if (opt_dbmatched) { fclose(fp_dbmatched); } if (opt_dbnotmatched) { fclose(fp_dbnotmatched); } search_done(); } vsearch-2.30.0/src/search.h000066400000000000000000000047551476012147200154530ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto usearch_global(char * cmdline, char * progheader) -> void; vsearch-2.30.0/src/search_exact.cc000066400000000000000000000636051476012147200167740ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "maps.h" #include "mask.h" #include "otutable.h" #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::size_t #include // std::strlen, std::memset, std::strcpy #include #include static struct searchinfo_s * si_plus = nullptr; static struct searchinfo_s * si_minus = nullptr; static pthread_t * pthread = nullptr; /* global constants/data, no need for synchronization */ static int tophits; /* the maximum number of hits to keep */ static int seqcount; /* number of database sequences */ static pthread_attr_t attr; static fastx_handle query_fastx_h; /* global data protected by mutex */ static pthread_mutex_t mutex_input; static pthread_mutex_t mutex_output; static int qmatches; static uint64 qmatches_abundance; static int queries; static uint64 queries_abundance; static uint64 * dbmatched; static FILE * fp_samout = nullptr; static FILE * fp_alnout = nullptr; static FILE * fp_userout = nullptr; static FILE * fp_blast6out = nullptr; static FILE * fp_uc = nullptr; static FILE * fp_fastapairs = nullptr; static FILE * fp_matched = nullptr; static FILE * fp_notmatched = nullptr; static FILE * fp_dbmatched = nullptr; static FILE * fp_dbnotmatched = nullptr; static FILE * fp_otutabout = nullptr; static FILE * fp_mothur_shared_out = nullptr; static FILE * fp_biomout = nullptr; static FILE * fp_qsegout = nullptr; static FILE * fp_tsegout = nullptr; static int count_matched = 
0; static int count_notmatched = 0; auto add_hit(struct searchinfo_s * si, uint64_t seqno) -> void { if (search_acceptable_unaligned(si, seqno)) { struct hit * hp = si->hits + si->hit_count; si->hit_count++; hp->target = seqno; hp->strand = si->strand; hp->count = 0; hp->nwscore = si->qseqlen * opt_match; hp->nwdiff = 0; hp->nwgaps = 0; hp->nwindels = 0; hp->nwalignmentlength = si->qseqlen; hp->nwid = 100.0; hp->matches = si->qseqlen; hp->mismatches = 0; int const ret = xsprintf(&hp->nwalignment, "%dM", si->qseqlen); if ((ret == -1) || (! hp->nwalignment)) { fatal("Out of memory"); } hp->internal_alignmentlength = si->qseqlen; hp->internal_gaps = 0; hp->internal_indels = 0; hp->trim_q_left = 0; hp->trim_q_right = 0; hp->trim_t_left = 0; hp->trim_t_right = 0; hp->trim_aln_left = 0; hp->trim_aln_right = 0; hp->id = 100.0; hp->id0 = 100.0; hp->id1 = 100.0; hp->id2 = 100.0; hp->id3 = 100.0; hp->id4 = 100.0; hp->shortest = si->qseqlen; hp->longest = si->qseqlen; hp->aligned = true; hp->accepted = false; hp->rejected = false; hp->weak = false; (void) search_acceptable_aligned(si, hp); } } auto search_exact_onequery(struct searchinfo_s * si) -> void { dbhash_search_info_s info; char * seq = si->qsequence; uint64_t const seqlen = si->qseqlen; std::vector normalized(seqlen + 1); string_normalize(normalized.data(), seq, seqlen); si->hit_count = 0; int64_t ret = dbhash_search_first(normalized.data(), seqlen, & info); while (ret >= 0) { add_hit(si, ret); ret = dbhash_search_next(&info); } } auto search_exact_output_results(int hit_count, struct hit * hits, char * query_head, int qseqlen, char * qsequence, char * qsequence_rc, int qsize) -> void { xpthread_mutex_lock(&mutex_output); /* show results */ int64_t const toreport = MIN(opt_maxhits, hit_count); if (fp_alnout) { results_show_alnout(fp_alnout, hits, toreport, query_head, qsequence, qseqlen); } if (fp_samout) { results_show_samout(fp_samout, hits, toreport, query_head, qsequence, qsequence_rc); } if (toreport) { double 
const top_hit_id = hits[0].id; if (opt_otutabout || opt_mothur_shared_out || opt_biomout) { otutable_add(query_head, db_getheader(hits[0].target), qsize); } for (int t = 0; t < toreport; t++) { struct hit * hp = hits + t; if (opt_top_hits_only && (hp->id < top_hit_id)) { break; } if (fp_fastapairs) { results_show_fastapairs_one(fp_fastapairs, hp, query_head, qsequence, qsequence_rc); } if (fp_qsegout) { results_show_qsegout_one(fp_qsegout, hp, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_tsegout) { results_show_tsegout_one(fp_tsegout, hp); } if (fp_uc) { if ((t == 0) || opt_uc_allhits) { results_show_uc_one(fp_uc, hp, query_head, qseqlen, hp->target); } } if (fp_userout) { results_show_userout_one(fp_userout, hp, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out) { results_show_blast6out_one(fp_blast6out, hp, query_head, qseqlen); } } } else { if (opt_otutabout || opt_mothur_shared_out || opt_biomout) { otutable_add(query_head, nullptr, qsize); } if (fp_uc) { results_show_uc_one(fp_uc, nullptr, query_head, qseqlen, 0); } if (opt_output_no_hits) { if (fp_userout) { results_show_userout_one(fp_userout, nullptr, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out) { results_show_blast6out_one(fp_blast6out, nullptr, query_head, qseqlen); } } } if (hit_count) { ++count_matched; if (opt_matched) { fasta_print_general(fp_matched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), qsize, count_matched, -1.0, -1, -1, nullptr, 0.0); } } else { ++count_notmatched; if (opt_notmatched) { fasta_print_general(fp_notmatched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), qsize, count_notmatched, -1.0, -1, -1, nullptr, 0.0); } } /* update matching db sequences */ for (int i=0; i < hit_count; i++) { if (hits[i].accepted) { dbmatched[hits[i].target] += opt_sizein ? 
qsize : 1; } } xpthread_mutex_unlock(&mutex_output); } auto search_exact_query(int64_t t) -> int { for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = s ? si_minus + t : si_plus + t; /* mask query */ if (opt_qmask == MASK_DUST) { dust(si->qsequence, si->qseqlen); } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask)) { hardmask(si->qsequence, si->qseqlen); } /* perform search */ search_exact_onequery(si); } struct hit * hits = nullptr; int hit_count = 0; search_joinhits(si_plus + t, opt_strand > 1 ? si_minus + t : nullptr, & hits, & hit_count); search_exact_output_results(hit_count, hits, si_plus[t].query_head, si_plus[t].qseqlen, si_plus[t].qsequence, opt_strand > 1 ? si_minus[t].qsequence : nullptr, si_plus[t].qsize); /* free memory for alignment strings */ for (int i = 0; i < hit_count; i++) { if (hits[i].aligned) { xfree(hits[i].nwalignment); } } xfree(hits); return hit_count; } auto search_exact_thread_run(int64_t t) -> void { while (true) { xpthread_mutex_lock(&mutex_input); if (fastx_next(query_fastx_h, ! opt_notrunclabels, chrmap_no_change)) { char * qhead = fastx_get_header(query_fastx_h); int const query_head_len = fastx_get_header_length(query_fastx_h); char * qseq = fastx_get_sequence(query_fastx_h); int const qseqlen = fastx_get_sequence_length(query_fastx_h); int const query_no = fastx_get_seqno(query_fastx_h); int const qsize = fastx_get_abundance(query_fastx_h); for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = s ? 
si_minus + t : si_plus + t; si->query_head_len = query_head_len; si->qseqlen = qseqlen; si->query_no = query_no; si->qsize = qsize; si->strand = s; /* allocate more memory for header and sequence, if necessary */ if (si->query_head_len + 1 > si->query_head_alloc) { si->query_head_alloc = si->query_head_len + 2001; si->query_head = (char *) xrealloc(si->query_head, (size_t) (si->query_head_alloc)); } if (si->qseqlen + 1 > si->seq_alloc) { si->seq_alloc = si->qseqlen + 2001; si->qsequence = (char *) xrealloc(si->qsequence, (size_t) (si->seq_alloc)); } } /* plus strand: copy header and sequence */ strcpy(si_plus[t].query_head, qhead); strcpy(si_plus[t].qsequence, qseq); /* get progress as amount of input file read */ uint64_t const progress = fastx_get_position(query_fastx_h); /* let other threads read input */ xpthread_mutex_unlock(&mutex_input); /* minus strand: copy header and reverse complementary sequence */ if (opt_strand > 1) { strcpy(si_minus[t].query_head, si_plus[t].query_head); reverse_complement(si_minus[t].qsequence, si_plus[t].qsequence, si_plus[t].qseqlen); } int const match = search_exact_query(t); /* lock mutex for update of global data and output */ xpthread_mutex_lock(&mutex_output); /* update stats */ queries++; queries_abundance += qsize; if (match) { qmatches++; qmatches_abundance += qsize; } /* show progress */ progress_update(progress); xpthread_mutex_unlock(&mutex_output); } else { xpthread_mutex_unlock(&mutex_input); break; } } } auto search_exact_thread_init(struct searchinfo_s * si) -> void { /* thread specific initialiation */ si->uh = nullptr; si->kmers = nullptr; si->m = nullptr; si->hits_v.resize(tophits * opt_strand); si->hits = si->hits_v.data(); si->qsize = 1; si->query_head_alloc = 0; si->query_head = nullptr; si->seq_alloc = 0; si->qsequence = nullptr; si->nw = nullptr; si->s = nullptr; } auto search_exact_thread_exit(struct searchinfo_s * si) -> void { /* thread specific clean up */ if (si->query_head) { xfree(si->query_head); } 
if (si->qsequence) { xfree(si->qsequence); } } auto search_exact_thread_worker(void * vp) -> void * { auto t = (int64_t) vp; search_exact_thread_run(t); return nullptr; } auto search_exact_thread_worker_run() -> void { /* initialize threads, start them, join them and return */ xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); /* init and create worker threads, put them into stand-by mode */ for (int t = 0; t < opt_threads; t++) { search_exact_thread_init(si_plus + t); if (si_minus) { search_exact_thread_init(si_minus + t); } xpthread_create(pthread + t, &attr, search_exact_thread_worker, (void *) (int64_t) t); } /* finish and clean up worker threads */ for (int t = 0; t < opt_threads; t++) { xpthread_join(pthread[t], nullptr); search_exact_thread_exit(si_plus + t); if (si_minus) { search_exact_thread_exit(si_minus + t); } } xpthread_attr_destroy(&attr); } auto search_exact_prep(char * cmdline, char * progheader) -> void { /* open output files */ if (opt_alnout) { fp_alnout = fopen_output(opt_alnout); if (! fp_alnout) { fatal("Unable to open alignment output file for writing"); } fprintf(fp_alnout, "%s\n", cmdline); fprintf(fp_alnout, "%s\n", progheader); } if (opt_samout) { fp_samout = fopen_output(opt_samout); if (! fp_samout) { fatal("Unable to open SAM output file for writing"); } } if (opt_userout) { fp_userout = fopen_output(opt_userout); if (! fp_userout) { fatal("Unable to open user-defined output file for writing"); } } if (opt_blast6out) { fp_blast6out = fopen_output(opt_blast6out); if (! fp_blast6out) { fatal("Unable to open blast6-like output file for writing"); } } if (opt_uc) { fp_uc = fopen_output(opt_uc); if (! fp_uc) { fatal("Unable to open uc output file for writing"); } } if (opt_fastapairs) { fp_fastapairs = fopen_output(opt_fastapairs); if (! fp_fastapairs) { fatal("Unable to open fastapairs output file for writing"); } } if (opt_qsegout) { fp_qsegout = fopen_output(opt_qsegout); if (! 
fp_qsegout) { fatal("Unable to open qsegout output file for writing"); } } if (opt_tsegout) { fp_tsegout = fopen_output(opt_tsegout); if (! fp_tsegout) { fatal("Unable to open tsegout output file for writing"); } } if (opt_matched) { fp_matched = fopen_output(opt_matched); if (! fp_matched) { fatal("Unable to open matched output file for writing"); } } if (opt_notmatched) { fp_notmatched = fopen_output(opt_notmatched); if (! fp_notmatched) { fatal("Unable to open notmatched output file for writing"); } } if (opt_dbmatched) { fp_dbmatched = fopen_output(opt_dbmatched); if (! fp_dbmatched) { fatal("Unable to open dbmatched output file for writing"); } } if (opt_dbnotmatched) { fp_dbnotmatched = fopen_output(opt_dbnotmatched); if (! fp_dbnotmatched) { fatal("Unable to open dbnotmatched output file for writing"); } } if (opt_otutabout) { fp_otutabout = fopen_output(opt_otutabout); if (! fp_otutabout) { fatal("Unable to open OTU table (text format) output file for writing"); } } if (opt_mothur_shared_out) { fp_mothur_shared_out = fopen_output(opt_mothur_shared_out); if (! fp_mothur_shared_out) { fatal("Unable to open OTU table (mothur format) output file for writing"); } } if (opt_biomout) { fp_biomout = fopen_output(opt_biomout); if (! 
fp_biomout) { fatal("Unable to open OTU table (biom 1.0 format) output file for writing"); } } db_read(opt_db, 0); results_show_samheader(fp_samout, cmdline, opt_db); if (opt_dbmask == MASK_DUST) { dust_all(); } else if ((opt_dbmask == MASK_SOFT) && (opt_hardmask)) { hardmask_all(); } show_rusage(); seqcount = db_getsequencecount(); /* tophits = the maximum number of hits we need to store */ tophits = seqcount; dbmatched = (uint64*) xmalloc(seqcount * sizeof(uint64*)); memset(dbmatched, 0, seqcount * sizeof(uint64*)); dbhash_open(seqcount); dbhash_add_all(); } auto search_exact_done() -> void { /* clean up, global */ dbhash_close(); db_free(); xfree(dbmatched); if (opt_dbmatched) { fclose(fp_dbmatched); } if (opt_dbnotmatched) { fclose(fp_dbnotmatched); } if (opt_matched) { fclose(fp_matched); } if (opt_notmatched) { fclose(fp_notmatched); } if (opt_fastapairs) { fclose(fp_fastapairs); } if (opt_qsegout) { fclose(fp_qsegout); } if (opt_tsegout) { fclose(fp_tsegout); } if (fp_uc) { fclose(fp_uc); } if (fp_blast6out) { fclose(fp_blast6out); } if (fp_userout) { fclose(fp_userout); } if (fp_alnout) { fclose(fp_alnout); } if (fp_samout) { fclose(fp_samout); } show_rusage(); } auto search_exact(char * cmdline, char * progheader) -> void { opt_id = 1.0; search_exact_prep(cmdline, progheader); otutable_init(); /* prepare reading of queries */ qmatches = 0; qmatches_abundance = 0; queries = 0; queries_abundance = 0; query_fastx_h = fastx_open(opt_search_exact); /* allocate memory for thread info */ std::vector si_plus_v(opt_threads); si_plus = si_plus_v.data(); if (opt_strand > 1) { std::vector si_minus_v(opt_threads); si_minus = si_minus_v.data(); } std::vector pthread_v(opt_threads); pthread = pthread_v.data(); /* init mutexes for input and output */ xpthread_mutex_init(&mutex_input, nullptr); xpthread_mutex_init(&mutex_output, nullptr); progress_init("Searching", fastx_get_size(query_fastx_h)); search_exact_thread_worker_run(); progress_done(); 
xpthread_mutex_destroy(&mutex_output); xpthread_mutex_destroy(&mutex_input); // si_plus not used below that point // si_minus not used below that point fastx_close(query_fastx_h); if (! opt_quiet) { fprintf(stderr, "Matching unique query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(stderr, "\n"); if (opt_sizein) { fprintf(stderr, "Matching total query sequences: %" PRIu64 " of %" PRIu64, qmatches_abundance, queries_abundance); if (queries_abundance > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches_abundance / queries_abundance); } fprintf(stderr, "\n"); } } if (opt_log) { fprintf(fp_log, "Matching unique query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(fp_log, "\n"); if (opt_sizein) { fprintf(fp_log, "Matching total query sequences: %" PRIu64 " of %" PRIu64, qmatches_abundance, queries_abundance); if (queries_abundance > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches_abundance / queries_abundance); } fprintf(fp_log, "\n"); } } // Add OTUs with no matches to OTU table if (opt_otutabout || opt_mothur_shared_out || opt_biomout) { for (int64_t i = 0; i < seqcount; i++) { if (! 
dbmatched[i]) { otutable_add(nullptr, db_getheader(i), 0); } } } if (fp_biomout) { otutable_print_biomout(fp_biomout); fclose(fp_biomout); } if (fp_otutabout) { otutable_print_otutabout(fp_otutabout); fclose(fp_otutabout); } if (fp_mothur_shared_out) { otutable_print_mothur_shared_out(fp_mothur_shared_out); fclose(fp_mothur_shared_out); } otutable_done(); int count_dbmatched = 0; int count_dbnotmatched = 0; if (opt_dbmatched || opt_dbnotmatched) { for (int64_t i = 0; i < seqcount; i++) { if (dbmatched[i]) { ++count_dbmatched; if (opt_dbmatched) { fasta_print_general(fp_dbmatched, nullptr, db_getsequence(i), db_getsequencelen(i), db_getheader(i), db_getheaderlen(i), dbmatched[i], count_dbmatched, -1.0, -1, -1, nullptr, 0.0); } } else { ++count_dbnotmatched; if (opt_dbnotmatched) { fasta_print_general(fp_dbnotmatched, nullptr, db_getsequence(i), db_getsequencelen(i), db_getheader(i), db_getheaderlen(i), 0, count_dbnotmatched, -1.0, -1, -1, nullptr, 0.0); } } } } search_exact_done(); } vsearch-2.30.0/src/search_exact.h000066400000000000000000000047531476012147200166350ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Entry point of the exact-match search command. `cmdline` and `progheader` are the command line and program header strings that the implementation writes at the top of the alignment output file when one is requested. */ auto search_exact(char * cmdline, char * progheader) -> void; vsearch-2.30.0/src/searchcore.cc000066400000000000000000000602421476012147200164530ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "align_simd.h" #include "dbindex.h" #include "maps.h" #include "minheap.h" #include "otutable.h" #include "unique.h" #include // macros PRIu64 and PRId64 #include // std::pow #include // int64_t, uint64_t #include // std::sscanf #include // std::qsort #include // std::strlen, std::memset, std::strcmp #include /* per thread data */ inline auto hit_compare_byid_typed(struct hit * x, struct hit * y) -> int { /* Order: accepted, then rejected (weak) high id, then low id early target, then late target */ if (x->rejected < y->rejected) { return -1; } else if (x->rejected > y->rejected) { return +1; } else if (x->aligned > y->aligned) { return -1; } else if (x->aligned < y->aligned) { return +1; } else if (x->aligned == 0) { return 0; } else if (x->id > y->id) { return -1; } else if (x->id < y->id) { return +1; } else if (x->target < y->target) { return -1; } else if (x->target > y->target) { return +1; } else { return 0; } } inline auto hit_compare_bysize_typed(struct hit * x, struct hit * y) -> int { // high abundance, then low abundance // high id, then low id // early target, then late target if (x->rejected < y->rejected) { return -1; } else if (x->rejected > y->rejected) { return +1; } else if (x->rejected == 1) { return 0; } else if (x->aligned > y->aligned) { return -1; } else if (x->aligned < y->aligned) { return +1; } else if (x->aligned == 0) { return 0; } else if (db_getabundance(x->target) > db_getabundance(y->target)) { 
return -1; } else if (db_getabundance(x->target) < db_getabundance(y->target)) { return +1; } else if (x->id > y->id) { return -1; } else if (x->id < y->id) { return +1; } else if (x->target < y->target) { return -1; } else if (x->target > y->target) { return +1; } else { return 0; } } auto hit_compare_byid(const void * a, const void * b) -> int { return hit_compare_byid_typed((struct hit *) a, (struct hit *) b); } auto hit_compare_bysize(const void * a, const void * b) -> int { return hit_compare_bysize_typed((struct hit *) a, (struct hit *) b); } auto search_enough_kmers(struct searchinfo_s * si, unsigned int count) -> bool { return (count >= opt_minwordmatches) or (count >= si->kmersamplecount); } auto search_topscores(struct searchinfo_s * si) -> void { /* Count the kmer hits in each database sequence and make a sorted list of a given number (th) of the database sequences with the highest number of matching kmers. These are stored in the min heap array. */ /* count kmer hits in the database sequences */ const int indexed_count = dbindex_getcount(); /* zero counts */ memset(si->kmers, 0, indexed_count * sizeof(count_t)); minheap_empty(si->m); for(unsigned int i = 0; i < si->kmersamplecount; i++) { unsigned int const kmer = si->kmersample[i]; unsigned char * bitmap = dbindex_getbitmap(kmer); if (bitmap) { #ifdef __x86_64__ if (ssse3_present) { increment_counters_from_bitmap_ssse3(si->kmers, bitmap, indexed_count); } else { increment_counters_from_bitmap_sse2(si->kmers, bitmap, indexed_count); } #else increment_counters_from_bitmap(si->kmers, bitmap, indexed_count); #endif } else { unsigned int * list = dbindex_getmatchlist(kmer); unsigned int const count = dbindex_getmatchcount(kmer); for(unsigned int j = 0; j < count; j++) { si->kmers[list[j]]++; } } } const int minmatches = MIN(opt_minwordmatches, si->kmersamplecount); for(int i = 0; i < indexed_count; i++) { count_t const count = si->kmers[i]; if (count >= minmatches) { unsigned int const seqno = 
dbindex_getmapping(i); unsigned int const length = db_getsequencelen(seqno); elem_t novel; novel.count = count; novel.seqno = seqno; novel.length = length; minheap_add(si->m, & novel); } } minheap_sort(si->m); } auto seqncmp(char * a, char * b, uint64_t n) -> int { for(unsigned int i = 0; i < n; i++) { const int x = chrmap_4bit[(int)(a[i])]; const int y = chrmap_4bit[(int)(b[i])]; if (x < y) { return -1; } else if (x > y) { return +1; } } return 0; } auto align_trim(struct hit * hit) -> void { /* trim alignment and fill in info */ /* assumes that the hit has been aligned */ /* info for semi-global alignment (without gaps at ends) */ hit->trim_aln_left = 0; hit->trim_q_left = 0; hit->trim_t_left = 0; hit->trim_aln_right = 0; hit->trim_q_right = 0; hit->trim_t_right = 0; /* left trim alignment */ char * p = hit->nwalignment; char op = '\0'; int64_t run = 0; if (*p) { run = 1; int scanlength = 0; sscanf(p, "%" PRId64 "%n", &run, &scanlength); op = *(p+scanlength); if (op != 'M') { hit->trim_aln_left = 1 + scanlength; if (op == 'D') { hit->trim_q_left = run; } else { hit->trim_t_left = run; } } } /* right trim alignment */ char * e = hit->nwalignment + strlen(hit->nwalignment); if (e > hit->nwalignment) { p = e - 1; op = *p; if (op != 'M') { while ((p > hit->nwalignment) && (*(p-1) <= '9')) { --p; } run = 1; sscanf(p, "%" PRId64, &run); hit->trim_aln_right = e - p; if (op == 'D') { hit->trim_q_right = run; } else { hit->trim_t_right = run; } } } if (hit->trim_q_left >= hit->nwalignmentlength) { hit->trim_q_right = 0; } if (hit->trim_t_left >= hit->nwalignmentlength) { hit->trim_t_right = 0; } hit->internal_alignmentlength = hit->nwalignmentlength - hit->trim_q_left - hit->trim_t_left - hit->trim_q_right - hit->trim_t_right; hit->internal_indels = hit->nwindels - hit->trim_q_left - hit->trim_t_left - hit->trim_q_right - hit->trim_t_right; hit->internal_gaps = hit->nwgaps - ((hit->trim_q_left + hit->trim_t_left) > 0 ? 
1 : 0) - ((hit->trim_q_right + hit->trim_t_right) > 0 ? 1 : 0); /* CD-HIT */ hit->id0 = hit->shortest > 0 ? 100.0 * hit->matches / hit->shortest : 0.0; /* all diffs */ hit->id1 = hit->nwalignmentlength > 0 ? 100.0 * hit->matches / hit->nwalignmentlength : 0.0; /* internal diffs */ hit->id2 = hit->internal_alignmentlength > 0 ? 100.0 * hit->matches / hit->internal_alignmentlength : 0.0; /* Marine Biology Lab */ hit->id3 = MAX(0.0, 100.0 * (1.0 - (1.0 * (hit->mismatches + hit->nwgaps) / hit->longest))); /* BLAST */ hit->id4 = hit->nwalignmentlength > 0 ? 100.0 * hit->matches / hit->nwalignmentlength : 0.0; switch (opt_iddef) { case 0: hit->id = hit->id0; break; case 1: hit->id = hit->id1; break; case 2: hit->id = hit->id2; break; case 3: hit->id = hit->id3; break; case 4: hit->id = hit->id4; break; } } auto search_acceptable_unaligned(struct searchinfo_s * si, int target) -> bool { /* consider whether a hit satisfies accept criteria before alignment */ char * qseq = si->qsequence; char * dlabel = db_getheader(target); char * dseq = db_getsequence(target); const int64_t dseqlen = db_getsequencelen(target); const int64_t tsize = db_getabundance(target); if ( /* maxqsize */ (si->qsize <= opt_maxqsize) && /* mintsize */ (tsize >= opt_mintsize) && /* minsizeratio */ (si->qsize >= opt_minsizeratio * tsize) && /* maxsizeratio */ (si->qsize <= opt_maxsizeratio * tsize) && /* minqt */ (si->qseqlen >= opt_minqt * dseqlen) && /* maxqt */ (si->qseqlen <= opt_maxqt * dseqlen) && /* minsl */ (si->qseqlen < dseqlen ? si->qseqlen >= opt_minsl * dseqlen : dseqlen >= opt_minsl * si->qseqlen) && /* maxsl */ (si->qseqlen < dseqlen ? 
si->qseqlen <= opt_maxsl * dseqlen : dseqlen <= opt_maxsl * si->qseqlen) && /* idprefix */ ((si->qseqlen >= opt_idprefix) && (dseqlen >= opt_idprefix) && (not seqncmp(qseq, dseq, opt_idprefix))) && /* idsuffix */ ((si->qseqlen >= opt_idsuffix) && (dseqlen >= opt_idsuffix) && (not seqncmp(qseq+si->qseqlen-opt_idsuffix, dseq+dseqlen-opt_idsuffix, opt_idsuffix))) && /* self */ ((not opt_self) or (strcmp(si->query_head, dlabel))) && /* selfid */ ((not opt_selfid) or (si->qseqlen != dseqlen) or (seqncmp(qseq, dseq, si->qseqlen))) ) { /* needs further consideration */ return true; } else { /* reject */ return false; } } auto search_acceptable_aligned(struct searchinfo_s * si, struct hit * hit) -> bool { if (/* weak_id */ (hit->id >= 100.0 * opt_weak_id) && /* maxsubs */ (hit->mismatches <= opt_maxsubs) && /* maxgaps */ (hit->internal_gaps <= opt_maxgaps) && /* mincols */ (hit->internal_alignmentlength >= opt_mincols) && /* leftjust */ ((not opt_leftjust) or (hit->trim_q_left + hit->trim_t_left == 0)) && /* rightjust */ ((not opt_rightjust) or (hit->trim_q_right + hit->trim_t_right == 0)) && /* query_cov */ (hit->matches + hit->mismatches >= opt_query_cov * si->qseqlen) && /* target_cov */ (hit->matches + hit->mismatches >= opt_target_cov * db_getsequencelen(hit->target)) && /* maxid */ (hit->id <= 100.0 * opt_maxid) && /* mid */ (100.0 * hit->matches / (hit->matches + hit->mismatches) >= opt_mid) && /* maxdiffs */ (hit->mismatches + hit->internal_indels <= opt_maxdiffs)) { if (opt_cluster_unoise) { const auto mismatches = hit->mismatches; const double skew = 1.0 * si->qsize / db_getabundance(hit->target); const double beta = 1.0 / pow(2, (1.0 * opt_unoise_alpha * mismatches) + 1); if (skew <= beta or mismatches == 0) { /* accepted */ hit->accepted = true; hit->weak = false; return true; } else { /* rejected, but weak hit */ hit->rejected = true; hit->weak = true; return false; } } else { if (hit->id >= 100.0 * opt_id) { /* accepted */ hit->accepted = true; hit->weak = 
false; return true; } else { /* rejected, but weak hit */ hit->rejected = true; hit->weak = true; return false; } } } else { /* rejected */ hit->rejected = true; hit->weak = false; return false; } } auto align_delayed(struct searchinfo_s * si) -> void { /* compute global alignment */ unsigned int target_list[MAXDELAYED]; CELL nwscore_list[MAXDELAYED]; unsigned short nwalignmentlength_list[MAXDELAYED]; unsigned short nwmatches_list[MAXDELAYED]; unsigned short nwmismatches_list[MAXDELAYED]; unsigned short nwgaps_list[MAXDELAYED]; char * nwcigar_list[MAXDELAYED]; int target_count = 0; for(int x = si->finalized; x < si->hit_count; x++) { struct hit * hit = si->hits + x; if (not hit->rejected) { target_list[target_count++] = hit->target; } } if (target_count) { search16(si->s, target_count, target_list, nwscore_list, nwalignmentlength_list, nwmatches_list, nwmismatches_list, nwgaps_list, nwcigar_list); } int i = 0; for(int x = si->finalized; x < si->hit_count; x++) { /* maxrejects or maxaccepts reached - ignore remaining hits */ if ((si->rejects < opt_maxrejects) && (si->accepts < opt_maxaccepts)) { struct hit * hit = si->hits + x; if (hit->rejected) { si->rejects++; } else { int64_t const target = hit->target; int64_t nwscore = nwscore_list[i]; char * nwcigar = nullptr; int64_t nwalignmentlength = 0; int64_t nwmatches = 0; int64_t nwmismatches = 0; int64_t nwgaps = 0; int64_t const dseqlen = db_getsequencelen(target); if (nwscore == std::numeric_limits::max()) { /* In case the SIMD aligner cannot align, perform a new alignment with the linear memory aligner */ char * dseq = db_getsequence(target); if (nwcigar_list[i]) { xfree(nwcigar_list[i]); } nwcigar = xstrdup(si->lma->align(si->qsequence, dseq, si->qseqlen, dseqlen)); si->lma->alignstats(nwcigar, si->qsequence, dseq, & nwscore, & nwalignmentlength, & nwmatches, & nwmismatches, & nwgaps); } else { nwalignmentlength = nwalignmentlength_list[i]; nwmatches = nwmatches_list[i]; nwmismatches = nwmismatches_list[i]; 
nwgaps = nwgaps_list[i]; nwcigar = nwcigar_list[i]; } hit->aligned = true; hit->shortest = MIN(si->qseqlen, dseqlen); hit->longest = MAX(si->qseqlen, dseqlen); hit->nwalignment = nwcigar; hit->nwscore = nwscore; hit->nwdiff = nwalignmentlength - nwmatches; hit->nwgaps = nwgaps; hit->nwindels = nwalignmentlength - nwmatches - nwmismatches; hit->nwalignmentlength = nwalignmentlength; hit->nwid = 100.0 * (nwalignmentlength - hit->nwdiff) / nwalignmentlength; hit->matches = nwalignmentlength - hit->nwdiff; hit->mismatches = hit->nwdiff - hit->nwindels; /* trim alignment and compute numbers excluding terminal gaps */ align_trim(hit); /* test accept/reject criteria after alignment */ if (search_acceptable_aligned(si, hit)) { si->accepts++; } else { si->rejects++; } ++i; } } } /* free ignored alignments */ while (i < target_count) { xfree(nwcigar_list[i++]); } si->finalized = si->hit_count; } auto search_onequery(struct searchinfo_s * si, int seqmask) -> void { si->hit_count = 0; search16_qprep(si->s, si->qsequence, si->qseqlen); si->lma = new LinearMemoryAligner; int64_t * scorematrix = si->lma->scorematrix_create(opt_match, opt_mismatch); si->lma->set_parameters(scorematrix, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); /* extract unique kmer samples from query*/ unique_count(si->uh, opt_wordlength, si->qseqlen, si->qsequence, &si->kmersamplecount, &si->kmersample, seqmask); /* find database sequences with the most kmer hits */ search_topscores(si); /* analyse targets with the highest number of kmer hits */ si->accepts = 0; si->rejects = 0; si->finalized = 0; int delayed = 0; while ((si->finalized + delayed < opt_maxaccepts + opt_maxrejects - 1) && (si->rejects < 
opt_maxrejects) && (si->accepts < opt_maxaccepts) && (not minheap_isempty(si->m))) { elem_t const e = minheap_poplast(si->m); struct hit * hit = si->hits + si->hit_count; hit->target = e.seqno; hit->count = e.count; hit->strand = si->strand; hit->rejected = false; hit->accepted = false; hit->aligned = false; hit->weak = false; hit->nwalignment = nullptr; /* Test some accept/reject criteria before alignment */ if (search_acceptable_unaligned(si, e.seqno)) { ++delayed; } else { hit->rejected = true; } si->hit_count++; if (delayed == MAXDELAYED) { align_delayed(si); delayed = 0; } } if (delayed > 0) { align_delayed(si); } delete si->lma; xfree(scorematrix); } auto search_findbest2_byid(struct searchinfo_s * si_p, struct searchinfo_s * si_m) -> struct hit * { struct hit * best = nullptr; for(int i = 0; i < si_p->hit_count; i++) { if ((not best) or (hit_compare_byid_typed(si_p->hits + i, best) < 0)) { best = si_p->hits + i; } } if (opt_strand>1) { for(int i = 0; i < si_m->hit_count; i++) { if ((not best) or (hit_compare_byid_typed(si_m->hits + i, best) < 0)) { best = si_m->hits + i; } } } if (best and not best->accepted) { best = nullptr; } return best; } auto search_findbest2_bysize(struct searchinfo_s * si_p, struct searchinfo_s * si_m) -> struct hit * { struct hit * best = nullptr; for(int i = 0; i < si_p->hit_count; i++) { if ((not best) or (hit_compare_bysize_typed(si_p->hits + i, best) < 0)) { best = si_p->hits + i; } } if (opt_strand>1) { for(int i = 0; i < si_m->hit_count; i++) { if ((not best) or (hit_compare_bysize_typed(si_m->hits + i, best) < 0)) { best = si_m->hits + i; } } } if (best and not best->accepted) { best = nullptr; } return best; } auto search_joinhits(struct searchinfo_s * si_p, struct searchinfo_s * si_m, struct hit * * hitsp, int * hit_count) -> void { /* join and sort accepted and weak hits from both strands */ /* free the remaining alignments */ /* first, just count the number of hits to keep */ int a = 0; for (int s = 0; s < opt_strand; 
s++) { struct searchinfo_s * si = s ? si_m : si_p; for(int i=0; ihit_count; i++) { struct hit * h = si->hits + i; if (h->accepted || h->weak) { ++a; } } } /* allocate new array of hits */ auto * hits = (struct hit *) xmalloc(a * sizeof(struct hit)); /* copy over the hits to be kept */ a = 0; for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = s ? si_m : si_p; for(int i = 0; ihit_count; i++) { struct hit * h = si->hits + i; if (h->accepted || h->weak) { hits[a++] = *h; } else if (h->aligned) { xfree(h->nwalignment); } } } /* last, sort the hits */ qsort(hits, a, sizeof(struct hit), hit_compare_byid); *hitsp = hits; *hit_count = a; } vsearch-2.30.0/src/searchcore.h000066400000000000000000000153421476012147200163160ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
struct uhandle_s;
class LinearMemoryAligner;  /* only used through a pointer in this header */

/* the number of alignments that can be delayed */
constexpr auto MAXDELAYED = 8U;

/* Default minimum number of word matches for word lengths 3-15;
   the table is indexed by word length, unused entries are -1 */
constexpr std::array<int, 16> minwordmatches_defaults =
  {{ -1, -1, -1, 18, 17, 16, 15, 14, 12, 11, 10, 9, 8, 7, 5, 3 }};

/* one candidate target for a query, with its alignment statistics */
struct hit
{
  int target;
  int strand;

  /* candidate info */
  unsigned int count;     /* number of unique kmers shared with query */

  bool accepted;          /* is it accepted? */
  bool rejected;          /* is it rejected? */
  bool aligned;           /* has this hit been aligned */
  bool weak;              /* weak hits are aligned with id > weak_id */

  /* info about global alignment, including terminal gaps */
  int nwscore;            /* alignment score */
  int nwdiff;             /* indels and mismatches in global alignment */
  int nwgaps;             /* gaps in global alignment */
  int nwindels;           /* indels in global alignment */
  int nwalignmentlength;  /* length of global alignment */
  double nwid;            /* percent identity of global alignment */
  char * nwalignment;     /* alignment string (cigar) of global alignment */
  int matches;
  int mismatches;

  /* info about alignment excluding terminal gaps */
  int internal_alignmentlength;
  int internal_gaps;
  int internal_indels;
  int trim_q_left;
  int trim_q_right;
  int trim_t_left;
  int trim_t_right;
  int trim_aln_left;
  int trim_aln_right;

  /* more info */
  double id;              /* identity used for ranking */
  double id0;
  double id1;
  double id2;
  double id3;
  double id4;

  int shortest;           /* length of shortest of query and target */
  int longest;            /* length of longest of query and target */
};

/* type of kmer hit counter element
   remember possibility of overflow */
using count_t = unsigned short;

/* complete state of the search for one query on one strand */
struct searchinfo_s
{
  int query_no = 0;                  /* query number, zero-based */
  int strand = 0;                    /* strand of query being analysed */
  int qsize = 0;                     /* query abundance */
  int query_head_len = 0;            /* query header length */
  int query_head_alloc = 0;          /* bytes allocated for the header */
  char * query_head = nullptr;       /* query header */
  int qseqlen = 0;                   /* query length */
  int seq_alloc = 0;                 /* bytes allocated for the query sequence */
  char * qsequence = nullptr;        /* query sequence */
  unsigned int kmersamplecount = 0;  /* number of kmer samples from query */
  unsigned int * kmersample = nullptr; /* list of kmers sampled from query */
  count_t * kmers = nullptr;         /* list of kmer counts for each db seq */
  std::vector<struct hit> hits_v;    /* vector of hits */
  struct hit * hits = nullptr;       /* list of hits */
  int hit_count = 0;                 /* number of hits in the above list */
  struct uhandle_s * uh = nullptr;   /* unique kmer finder instance */
  struct s16info_s * s = nullptr;    /* SIMD aligner instance */
  struct nwinfo_s * nw = nullptr;    /* NW aligner instance */
  LinearMemoryAligner * lma = nullptr; /* Linear memory aligner instance pointer */
  int accepts = 0;                   /* number of accepts */
  int rejects = 0;                   /* number of rejects */
  struct minheap_s * m = nullptr;    /* min heap with the top kmer db seqs */
  int finalized = 0;
};

auto search_topscores(struct searchinfo_s * si) -> void;

auto search_onequery(struct searchinfo_s * si, int seqmask) -> void;

auto search_findbest2_byid(struct searchinfo_s * si_p,
                           struct searchinfo_s * si_m) -> struct hit *;

auto search_findbest2_bysize(struct searchinfo_s * si_p,
                             struct searchinfo_s * si_m) -> struct hit *;

auto search_acceptable_unaligned(struct searchinfo_s * si,
                                 int target) -> bool;

auto search_acceptable_aligned(struct searchinfo_s * si,
                               struct hit * hit) -> bool;

auto align_trim(struct hit * hit) -> void;

auto search_joinhits(struct searchinfo_s * si_p,
                     struct searchinfo_s * si_m,
                     struct hit * * hits,
                     int * hit_count) -> void;

auto search_enough_kmers(struct searchinfo_s * si,
                         unsigned int count) -> bool;
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include #include // std::tolower, std::toupper #include // UINT_MAX #include // uint64_t, uint32_t, uint16_t, uint8_t #include // std::fprintf, std::FILE, std:fclose, std::fread #include // std::strlen #include constexpr uint32_t sff_magic = 0x2e736666; struct sff_header_s { uint32_t magic_number; /* .sff */ uint32_t version; uint64_t index_offset; uint32_t index_length; uint32_t number_of_reads; uint16_t header_length; uint16_t key_length; uint16_t flows_per_read; uint8_t flowgram_format_code; } sff_header; struct sff_read_header_s { uint16_t read_header_length; uint16_t name_length; uint32_t number_of_bases; uint16_t clip_qual_left; uint16_t clip_qual_right; uint16_t clip_adapter_left; uint16_t clip_adapter_right; } read_header; auto fskip(FILE * fp, uint64_t length) -> uint64_t { /* read given amount of data from a stream and ignore it */ /* used instead of seeking in order to work with pipes */ static constexpr auto blocksize = uint64_t{4096}; std::array buffer; uint64_t skipped = 0; uint64_t rest = length; while (rest > 0) { uint64_t const want = (rest > blocksize) ? blocksize : rest; uint64_t const got = fread(buffer.data(), 1, want, fp); skipped += got; rest -= got; if (got < want) { break; } } return skipped; } auto sff_convert() -> void { if (! opt_fastqout) { fatal("No output file for sff_convert specified with --fastqout."); } FILE * fp_fastqout = fopen_output(opt_fastqout); if (! fp_fastqout) { fatal("Unable to open FASTQ output file for writing."); } FILE * fp_sff = fopen_input(opt_sff_convert); if (! fp_sff) { fatal("Unable to open SFF input file for reading."); } /* read and check header */ uint64_t filepos = 0; if (fread(&sff_header, 1, 31, fp_sff) < 31) { fatal("Unable to read from SFF file. 
File may be truncated."); } filepos += 31; sff_header.magic_number = bswap_32(sff_header.magic_number); sff_header.version = bswap_32(sff_header.version); sff_header.index_offset = bswap_64(sff_header.index_offset); sff_header.index_length = bswap_32(sff_header.index_length); sff_header.number_of_reads = bswap_32(sff_header.number_of_reads); sff_header.header_length = bswap_16(sff_header.header_length); sff_header.key_length = bswap_16(sff_header.key_length); sff_header.flows_per_read = bswap_16(sff_header.flows_per_read); if (sff_header.magic_number != sff_magic) { fatal("Invalid SFF file. Incorrect magic number. Must be 0x2e736666 (.sff)."); } if (sff_header.version != 1) { fatal("Invalid SFF file. Incorrect version. Must be 1."); } if (sff_header.flowgram_format_code != 1) { fatal("Invalid SFF file. Incorrect flowgram format code. Must be 1."); } if (sff_header.header_length != 8 * ((31 + sff_header.flows_per_read + sff_header.key_length + 7) / 8)) { fatal("Invalid SFF file. Incorrect header length."); } if (sff_header.key_length != 4) { fatal("Invalid SFF file. Incorrect key length. Must be 4."); } if ((sff_header.index_length > 0) && (sff_header.index_length < 8)) { fatal("Invalid SFF file. Incorrect index size. Must be at least 8."); } /* read and check flow chars, key and padding */ if (fskip(fp_sff, sff_header.flows_per_read) < sff_header.flows_per_read) { fatal("Invalid SFF file. Unable to read flow characters. File may be truncated."); } filepos += sff_header.flows_per_read; char * key_sequence = (char *) xmalloc(sff_header.key_length + 1); if (fread(key_sequence, 1, sff_header.key_length, fp_sff) < sff_header.key_length) { fatal("Invalid SFF file. Unable to read key sequence. 
File may be truncated."); } key_sequence[sff_header.key_length] = 0; filepos += sff_header.key_length; uint32_t const padding_length = sff_header.header_length - sff_header.flows_per_read - sff_header.key_length - 31; if (fskip(fp_sff, padding_length) < padding_length) { fatal("Invalid SFF file. Unable to read padding. File may be truncated."); } filepos += padding_length; double totallength = 0.0; uint32_t minimum = UINT_MAX; uint32_t maximum = 0; bool index_done = (sff_header.index_offset == 0) || (sff_header.index_length == 0); bool index_odd = false; char index_kind[9]; uint32_t index_padding = 0; if ((sff_header.index_length & 7U) > 0) { index_padding = 8 - (sff_header.index_length & 7U); } if (! opt_quiet) { fprintf(stderr, "Number of reads: %d\n", sff_header.number_of_reads); fprintf(stderr, "Flows per read: %d\n", sff_header.flows_per_read); fprintf(stderr, "Key sequence: %s\n", key_sequence); } if (opt_log) { fprintf(fp_log, "Number of reads: %d\n", sff_header.number_of_reads); fprintf(fp_log, "Flows per read: %d\n", sff_header.flows_per_read); fprintf(fp_log, "Key sequence: %s\n", key_sequence); } progress_init("Converting SFF: ", sff_header.number_of_reads); for (uint32_t read_no = 0; read_no < sff_header.number_of_reads; read_no++) { /* check if the index block is here */ if (! index_done) { if (filepos == sff_header.index_offset) { if (fread(index_kind, 1, 8, fp_sff) < 8) { fatal("Invalid SFF file. Unable to read index header. File may be truncated."); } filepos += 8; index_kind[8] = 0; uint64 const index_size = sff_header.index_length - 8 + index_padding; if (fskip(fp_sff, index_size) != index_size) { fatal("Invalid SFF file. Unable to read entire index. File may be truncated."); } filepos += index_size; index_done = true; index_odd = true; } } /* read and check each read header */ if (fread(&read_header, 1, 16, fp_sff) < 16) { fatal("Invalid SFF file. Unable to read read header. 
File may be truncated."); } filepos += 16; read_header.read_header_length = bswap_16(read_header.read_header_length); read_header.name_length = bswap_16(read_header.name_length); read_header.number_of_bases = bswap_32(read_header.number_of_bases); read_header.clip_qual_left = bswap_16(read_header.clip_qual_left); read_header.clip_qual_right = bswap_16(read_header.clip_qual_right); read_header.clip_adapter_left = bswap_16(read_header.clip_adapter_left); read_header.clip_adapter_right = bswap_16(read_header.clip_adapter_right); if (read_header.read_header_length != 8 * ((16 + read_header.name_length + 7) / 8)) { fatal("Invalid SFF file. Incorrect read header length."); } if (read_header.clip_qual_left > read_header.number_of_bases) { fatal("Invalid SFF file. Incorrect clip_qual_left value."); } if (read_header.clip_adapter_left > read_header.number_of_bases) { fatal("Invalid SFF file. Incorrect clip_adapter_left value."); } if (read_header.clip_qual_right > read_header.number_of_bases) { fatal("Invalid SFF file. Incorrect clip_qual_right value."); } if (read_header.clip_adapter_right > read_header.number_of_bases) { fatal("Invalid SFF file. Incorrect clip_adapter_right value."); } std::vector read_name(read_header.name_length + 1); if (fread(read_name.data(), 1, read_header.name_length, fp_sff) < read_header.name_length) { fatal("Invalid SFF file. Unable to read read name. File may be truncated."); } filepos += read_header.name_length; read_name[read_header.name_length] = 0; uint32_t const read_header_padding_length = read_header.read_header_length - read_header.name_length - 16; if (fskip(fp_sff, read_header_padding_length) < read_header_padding_length) { fatal("Invalid SFF file. Unable to read read header padding. File may be truncated."); } filepos += read_header_padding_length; /* read and check the flowgram and sequence */ if (fskip(fp_sff, 2 * sff_header.flows_per_read) < sff_header.flows_per_read) { fatal("Invalid SFF file. Unable to read flowgram values. 
File may be truncated."); } filepos += 2 * sff_header.flows_per_read; if (fskip(fp_sff, read_header.number_of_bases) < read_header.number_of_bases) { fatal("Invalid SFF file. Unable to read flow indices. File may be truncated."); } filepos += read_header.number_of_bases; char * bases = (char *) xmalloc(read_header.number_of_bases + 1); if (fread(bases, 1, read_header.number_of_bases, fp_sff) < read_header.number_of_bases) { fatal("Invalid SFF file. Unable to read read length. File may be truncated."); } bases[read_header.number_of_bases] = 0; filepos += read_header.number_of_bases; char * qual = (char *) xmalloc(read_header.number_of_bases + 1); if (fread(qual, 1, read_header.number_of_bases, fp_sff) < read_header.number_of_bases) { fatal("Invalid SFF file. Unable to read quality scores. File may be truncated."); } filepos += read_header.number_of_bases; /* convert quality scores to ascii characters */ for (uint32_t base_no = 0; base_no < read_header.number_of_bases; base_no++) { int q = qual[base_no]; if (q < opt_fastq_qminout) { q = opt_fastq_qminout; } if (q > opt_fastq_qmaxout) { q = opt_fastq_qmaxout; } qual[base_no] = opt_fastq_asciiout + q; } qual[read_header.number_of_bases] = 0; uint32_t const read_data_length = ((2 * sff_header.flows_per_read) + (3 * read_header.number_of_bases)); uint32_t const read_data_padded_length = 8 * ((read_data_length + 7) / 8); uint32_t const read_data_padding_length = read_data_padded_length - read_data_length; if (fskip(fp_sff, read_data_padding_length) < read_data_padding_length) { fatal("Invalid SFF file. Unable to read read data padding. File may be truncated."); } filepos += read_data_padding_length; uint32_t clip_start = 0; clip_start = MAX(1, MAX(read_header.clip_qual_left, read_header.clip_adapter_left)) - 1; uint32_t clip_end = read_header.number_of_bases; clip_end = MIN((read_header.clip_qual_right == 0 ? read_header.number_of_bases : read_header.clip_qual_right), (read_header.clip_adapter_right == 0 ? 
read_header.number_of_bases : read_header.clip_adapter_right)); /* make the clipped bases lowercase and the rest uppercase */ for (uint32_t i = 0; i < read_header.number_of_bases; i++) { if ((i < clip_start) || (i >= clip_end)) { bases[i] = tolower(bases[i]); } else { bases[i] = toupper(bases[i]); } } if (opt_sff_clip) { bases[clip_end] = 0; qual[clip_end] = 0; } else { clip_start = 0; clip_end = read_header.number_of_bases; } uint32_t const length = clip_end - clip_start; fastq_print_general(fp_fastqout, bases + clip_start, length, read_name.data(), strlen(read_name.data()), qual + clip_start, 1, read_no + 1, -1.0); xfree(bases); xfree(qual); totallength += length; if (length < minimum) { minimum = length; } if (length > maximum) { maximum = length; } progress_update(read_no + 1); } progress_done(); /* check if the index block is here */ if (! index_done) { if (filepos == sff_header.index_offset) { if (fread(index_kind, 1, 8, fp_sff) < 8) { fatal("Invalid SFF file. Unable to read index header. File may be truncated."); } filepos += 8; index_kind[8] = 0; uint64 const index_size = sff_header.index_length - 8; if (fskip(fp_sff, index_size) != index_size) { fatal("Invalid SFF file. Unable to read entire index. File may be truncated."); } filepos += index_size; index_done = true; /* try to skip padding, if any */ if (index_padding > 0) { uint64_t const got = fskip(fp_sff, index_padding); if ((got < index_padding) && (got != 0)) { fprintf(stderr, "WARNING: Additional data at end of SFF file ignored\n"); } } } } if (! 
index_done) { fprintf(stderr, "WARNING: SFF index missing\n"); if (opt_log) { fprintf(fp_log, "WARNING: SFF index missing\n"); } } if (index_odd) { fprintf(stderr, "WARNING: Index at unusual position in file\n"); if (opt_log) { fprintf(fp_log, "WARNING: Index at unusual position in file\n"); } } /* ignore the rest of file */ /* try reading just another byte */ if (fskip(fp_sff, 1) > 0) { fprintf(stderr, "WARNING: Additional data at end of SFF file ignored\n"); if (opt_log) { fprintf(fp_log, "WARNING: Additional data at end of SFF file ignored\n"); } } fclose(fp_sff); fclose(fp_fastqout); double const average = totallength / sff_header.number_of_reads; if (! opt_quiet) { if (sff_header.index_length > 0) { fprintf(stderr, "Index type: %s\n", index_kind); } fprintf(stderr, "\nSFF file read successfully.\n"); if (sff_header.number_of_reads > 0) { fprintf(stderr, "Sequence length: minimum %d, average %.1f, maximum %d\n", minimum, average, maximum); } } if (opt_log) { if (sff_header.index_length > 0) { fprintf(fp_log, "Index type: %s\n", index_kind); } fprintf(fp_log, "\nSFF file read successfully.\n"); if (sff_header.number_of_reads > 0) { fprintf(fp_log, "Sequence length: minimum %d, average %.1f, maximum %d\n", minimum, average, maximum); } } xfree(key_sequence); } vsearch-2.30.0/src/sff_convert.h000066400000000000000000000047111476012147200165140ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ auto sff_convert() -> void; vsearch-2.30.0/src/sha1.c000066400000000000000000000301101476012147200150150ustar00rootroot00000000000000/* refactoring: sha1.h headers are available in gcc and clang alternatively, there are C++ implementations available. OpenSSH's version is the fastest */ /* Slightly modified for vsearch by Torbjorn Rognes */ /* SHA-1 in C By Steve Reid 100% Public Domain ----------------- Modified 7/98 By James H. Brown Still 100% Public Domain Corrected a problem which generated improper hash values on 16 bit machines Routine SHA1Update changed from void SHA1Update(SHA1_CTX* context, unsigned char* data, unsigned int len) to void SHA1Update(SHA1_CTX* context, unsigned char* data, unsigned long len) The 'len' parameter was declared an int which works fine on 32 bit machines. However, on 16 bit machines an int is too small for the shifts being done against it. This caused the hash function to generate incorrect values if len was greater than 8191 (8K - 1) due to the 'len << 3' on line 3 of SHA1Update(). Since the file IO in main() reads 16K at a time, any file 8K or larger would be guaranteed to generate the wrong hash (e.g. Test Vector #3, a million "a"s). I also changed the declaration of variables i & j in SHA1Update to unsigned long from unsigned int for the same reason. These changes should make no difference to any 32 bit implementations since an int and a long are the same size in those environments. -- I also corrected a few compiler warnings generated by Borland C. 1. Added #include for exit() prototype 2. Removed unused variable 'j' in SHA1Final 3. Changed exit(0) to return(0) at end of main. 
ALL changes I made can be located by searching for comments containing 'JHB' ----------------- Modified 8/98 By Steve Reid Still 100% public domain 1- Removed #include and used return() instead of exit() 2- Fixed overwriting of finalcount in SHA1Final() (discovered by Chris Hall) 3- Changed email address from steve@edmweb.com to sreid@sea-to-sky.net ----------------- Modified 4/01 By Saul Kravitz Still 100% PD Modified to run on Compaq Alpha hardware. ----------------- Modified 07/2002 By Ralph Giles Still 100% public domain modified for use with stdint types, autoconf code cleanup, removed attribution comments switched SHA1Final() argument order for consistency use SHA1_ prefix for public api move public api to sha1.h */ /* Test Vectors (from FIPS PUB 180-1) "abc" A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 A million repetitions of "a" 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F */ /* #define SHA1HANDSOFF */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include "sha1.h" void SHA1_Transform(uint32_t state[5], const uint8_t buffer[64]); #define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) /* blk0() and blk() perform the initial expand. */ /* I got the idea of expanding during the round function from SSLeay */ /* FIXME: can we do this in an endian-proof way? 
*/ #ifdef WORDS_BIGENDIAN #define blk0(i) block->l[i] #else #define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ |(rol(block->l[i],8)&0x00FF00FF)) #endif #define blk(i) (block->l[(i)&15] = rol(block->l[((i)+13)&15]^block->l[((i)+8)&15] \ ^block->l[((i)+2)&15]^block->l[(i)&15],1)) /* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ #define R0(v,w,x,y,z,i) z+=(((w)&((x)^(y)))^(y))+blk0(i)+0x5A827999+rol(v,5);(w)=rol(w,30); #define R1(v,w,x,y,z,i) z+=(((w)&((x)^(y)))^(y))+blk(i)+0x5A827999+rol(v,5);(w)=rol(w,30); #define R2(v,w,x,y,z,i) z+=((w)^(x)^(y))+blk(i)+0x6ED9EBA1+rol(v,5);(w)=rol(w,30); #define R3(v,w,x,y,z,i) z+=((((w)|(x))&(y))|((w)&(x)))+blk(i)+0x8F1BBCDC+rol(v,5);(w)=rol(w,30); #define R4(v,w,x,y,z,i) z+=((w)^(x)^(y))+blk(i)+0xCA62C1D6+rol(v,5);(w)=rol(w,30); #ifdef VERBOSE /* SAK */ void SHAPrintContext(SHA1_CTX *context, char *msg){ printf("%s (%d,%d) %x %x %x %x %x\n", msg, context->count[0], context->count[1], context->state[0], context->state[1], context->state[2], context->state[3], context->state[4]); } #endif /* VERBOSE */ /* Hash a single 512-bit block. This is the core of the algorithm. */ void SHA1_Transform(uint32_t state[5], const uint8_t buffer[64]) { uint32_t a = 0; uint32_t b = 0; uint32_t c = 0; uint32_t d = 0; uint32_t e = 0; typedef union { uint8_t c[64]; uint32_t l[16]; } CHAR64LONG16; CHAR64LONG16 * block; #ifdef SHA1HANDSOFF static uint8_t workspace[64]; block = (CHAR64LONG16 *) workspace; memcpy(block, buffer, 64); #else block = (CHAR64LONG16*)buffer; #endif /* Copy context->state[] to working vars */ a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4]; /* 4 rounds of 20 operations each. Loop unrolled. 
*/ R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11); R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15); R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19); R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23); R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27); R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31); R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35); R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39); R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43); R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47); R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51); R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55); R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59); R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63); R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67); R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); /* Add the working vars back into context.state[] */ state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; /* Wipe variables */ a = b = c = d = e = 0; } /* SHA1Init - Initialize new context */ void SHA1_Init(SHA1_CTX* context) { /* SHA1 initialization constants */ context->state[0] = 0x67452301; context->state[1] = 0xEFCDAB89; context->state[2] = 0x98BADCFE; context->state[3] = 0x10325476; context->state[4] = 0xC3D2E1F0; context->count[0] = context->count[1] = 0; } /* Run your data through this. 
*/ void SHA1_Update(SHA1_CTX* context, const uint8_t* data, const size_t len) { size_t i = 0; size_t j = 0; #ifdef VERBOSE SHAPrintContext(context, "before"); #endif j = (context->count[0] >> 3) & 63; context->count[0] += len << 3; if (context->count[0] < (len << 3)) { context->count[1]++; } context->count[1] += (len >> 29); if ((j + len) > 63) { memcpy(&context->buffer[j], data, (i = 64-j)); SHA1_Transform(context->state, context->buffer); for ( ; i + 63 < len; i += 64) { SHA1_Transform(context->state, data + i); } j = 0; } else { i = 0; } memcpy(&context->buffer[j], &data[i], len - i); #ifdef VERBOSE SHAPrintContext(context, "after "); #endif } /* Add padding and return the message digest. */ void SHA1_Final(SHA1_CTX* context, uint8_t digest[SHA1_DIGEST_SIZE]) { uint32_t i = 0; uint8_t finalcount[8]; uint8_t padding_buffer[64]; for (i = 0; i < 64; i++) { padding_buffer[i] = 0; } for (i = 0; i < 8; i++) { finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 0 : 1)] >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */ } padding_buffer[0] = 0x80; SHA1_Update(context, padding_buffer, 1); padding_buffer[0] = 0x00; while ((context->count[0] & 504) != 448) { SHA1_Update(context, padding_buffer, 1); } SHA1_Update(context, finalcount, 8); /* Should cause a SHA1_Transform() */ for (i = 0; i < SHA1_DIGEST_SIZE; i++) { digest[i] = (uint8_t) ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255); } /* Wipe variables */ i = 0; memset(context->buffer, 0, 64); memset(context->state, 0, 20); memset(context->count, 0, 8); memset(finalcount, 0, 8); /* SWR */ #ifdef SHA1HANDSOFF /* make SHA1Transform overwrite its own static vars */ SHA1_Transform(context->state, context->buffer); #endif } /*************************************************************/ #if 0 int main(int argc, char** argv) { int i, j; SHA1_CTX context; unsigned char digest[SHA1_DIGEST_SIZE], buffer[16384]; FILE* file; if (argc > 2) { puts("Public domain SHA-1 implementation - by Steve Reid "); puts("Modified 
for 16 bit environments 7/98 - by James H. Brown "); /* JHB */ puts("Produces the SHA-1 hash of a file, or stdin if no file is specified."); return(0); } if (argc < 2) { file = stdin; } else { if (!(file = fopen(argv[1], "rb"))) { fputs("Unable to open file.", stderr); return(-1); } } SHA1_Init(&context); while (!feof(file)) { /* note: what if ferror(file) */ i = fread(buffer, 1, 16384, file); SHA1_Update(&context, buffer, i); } SHA1_Final(&context, digest); fclose(file); for (i = 0; i < SHA1_DIGEST_SIZE/4; i++) { for (j = 0; j < 4; j++) { printf("%02X", digest[i*4+j]); } putchar(' '); } putchar('\n'); return(0); /* JHB */ } #endif /* self test */ #ifdef TEST static char *test_data[] = { "abc", "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", "A million repetitions of 'a'"}; static char *test_results[] = { "A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D", "84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1", "34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F"}; void digest_to_hex(const uint8_t digest[SHA1_DIGEST_SIZE], char *output) { int i,j; char *c = output; for (i = 0; i < SHA1_DIGEST_SIZE/4; i++) { for (j = 0; j < 4; j++) { sprintf(c,"%02X", digest[i*4+j]); c += 2; } sprintf(c, " "); c += 1; } *(c - 1) = '\0'; } int main(int argc, char** argv) { int k; SHA1_CTX context; uint8_t digest[20]; char output[80]; fprintf(stdout, "verifying SHA-1 implementation... 
"); for (k = 0; k < 2; k++){ SHA1_Init(&context); SHA1_Update(&context, (uint8_t*)test_data[k], strlen(test_data[k])); SHA1_Final(&context, digest); digest_to_hex(digest, output); if (strcmp(output, test_results[k])) { fprintf(stdout, "FAIL\n"); fprintf(stderr,"* hash of \"%s\" incorrect:\n", test_data[k]); fprintf(stderr,"\t%s returned\n", output); fprintf(stderr,"\t%s is correct\n", test_results[k]); return (1); } } /* million 'a' vector we feed separately */ SHA1_Init(&context); for (k = 0; k < 1000000; k++) SHA1_Update(&context, (uint8_t*)"a", 1); SHA1_Final(&context, digest); digest_to_hex(digest, output); if (strcmp(output, test_results[2])) { fprintf(stdout, "FAIL\n"); fprintf(stderr,"* hash of \"%s\" incorrect:\n", test_data[2]); fprintf(stderr,"\t%s returned\n", output); fprintf(stderr,"\t%s is correct\n", test_results[2]); return (1); } /* success */ fprintf(stdout, "ok\n"); return(0); } #endif /* TEST */ vsearch-2.30.0/src/sha1.h000066400000000000000000000010361476012147200150270ustar00rootroot00000000000000/* public api for steve reid's public domain SHA-1 implementation */ /* this file is in the public domain */ #ifndef __SHA1_H #define __SHA1_H #ifdef __cplusplus extern "C" { #endif typedef struct { uint32_t state[5]; uint32_t count[2]; uint8_t buffer[64]; } SHA1_CTX; #define SHA1_DIGEST_SIZE 20 void SHA1_Init(SHA1_CTX* context); void SHA1_Update(SHA1_CTX* context, const uint8_t* data, size_t len); void SHA1_Final(SHA1_CTX* context, uint8_t digest[SHA1_DIGEST_SIZE]); #ifdef __cplusplus } #endif #endif /* __SHA1_H */ vsearch-2.30.0/src/showalign.cc000066400000000000000000000206731476012147200163340ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "maps.h" #include // macros PRIu64 and PRId64 #include // int64_t #include // FILE #include // std::strncpy static int64_t line_pos; static char * q_seq; static char * d_seq; static int64_t q_start; static int64_t d_start; static int64_t q_pos; static int64_t d_pos; static int64_t q_strand; static int64_t alignlen; static char * q_line; static char * a_line; static char * d_line; static std::FILE * out; constexpr int poswidth_default {3}; static int poswidth = poswidth_default; constexpr int headwidth_default {5}; static int headwidth = headwidth_default; static const char * q_name; static const char * d_name; static int64_t q_len; static int64_t d_len; inline auto putop(char c, int64_t len) -> void { const int64_t delta = q_strand != 0 ? -1 : +1; int64_t count = len; while (count != 0) { if (line_pos == 0) { q_start = q_pos; d_start = d_pos; } char qs = '\0'; char ds = '\0'; unsigned int qs4 = 0; unsigned int ds4 = 0; switch(c) { case 'M': qs = q_strand != 0 ? 
chrmap_complement[static_cast(q_seq[q_pos])] : q_seq[q_pos]; ds = d_seq[d_pos]; q_pos += delta; d_pos += 1; q_line[line_pos] = qs; qs4 = chrmap_4bit[static_cast(qs)]; ds4 = chrmap_4bit[static_cast(ds)]; if (opt_n_mismatch && ((qs4 == 15) || (ds4 == 15))) { a_line[line_pos] = ' '; } else if ((qs4 == ds4) and (ambiguous_4bit[qs4] == 0U)) { a_line[line_pos] = '|'; } else if ((qs4 & ds4) != 0U) { a_line[line_pos] = '+'; } else { a_line[line_pos] = ' '; } d_line[line_pos] = ds; ++line_pos; break; case 'D': qs = q_strand != 0 ? chrmap_complement[static_cast(q_seq[q_pos])] : q_seq[q_pos]; q_pos += delta; q_line[line_pos] = qs; a_line[line_pos] = ' '; d_line[line_pos] = '-'; ++line_pos; break; case 'I': ds = d_seq[d_pos]; d_pos += 1; q_line[line_pos] = '-'; a_line[line_pos] = ' '; d_line[line_pos] = ds; ++line_pos; break; } if ((line_pos == alignlen) or ((c == 0) and (line_pos > 0))) { q_line[line_pos] = 0; a_line[line_pos] = 0; d_line[line_pos] = 0; const int64_t q1 = q_start + 1 > q_len ? q_len : q_start + 1; const int64_t q2 = q_strand != 0 ? q_pos + 2 : q_pos; const int64_t d1 = d_start + 1 > d_len ? d_len : d_start + 1; const int64_t d2 = d_pos; fprintf(out, "\n"); fprintf(out, "%*s %*" PRId64 " %c %s %" PRId64 "\n", headwidth, q_name, poswidth, q1, q_strand != 0 ? 
'-' : '+', q_line, q2);

          fprintf(out, "%*s %*s %s\n", headwidth, "", poswidth, "", a_line);

          fprintf(out, "%*s %*" PRId64 " %c %s %" PRId64 "\n", headwidth, d_name, poswidth, d1, '+', d_line, d2);

          line_pos = 0;
        }
      --count;
    }
}


/*
  Print a pairwise alignment to output_handle.

  seq1/seq1len/seq1off/seq1name describe the query, seq2/... the target;
  the arguments are copied into the file-level q_* / d_* variables that
  putop() works on. cigar holds cigarlen characters of operations, each
  optionally preceded by a decimal repeat count. numwidth, namewidth and
  alignwidth become the position width, header width and number of
  alignment columns per output line. A non-zero strand makes the query
  position start at seq1len - 1 - seq1off instead of seq1off.

  Not reentrant: all state lives in file-level statics.
*/
auto align_show(std::FILE * output_handle, char * seq1, int64_t seq1len, int64_t seq1off, const char * seq1name, char * seq2, int64_t seq2len, int64_t seq2off, const char * seq2name, char * cigar, int64_t cigarlen, int numwidth, int namewidth, int alignwidth, int strand) -> void
{
  out = output_handle;

  q_seq = seq1;
  q_len = seq1len;
  q_name = seq1name;
  q_strand = strand;

  d_seq = seq2;
  d_len = seq2len;
  d_name = seq2name;

  char * p = cigar;
  char * e = p + cigarlen;

  poswidth = numwidth;
  headwidth = namewidth;
  alignlen = alignwidth;

  /* one extra byte per line buffer for the terminator written by putop() */
  q_line = (char *) xmalloc(alignwidth + 1);
  a_line = (char *) xmalloc(alignwidth + 1);
  d_line = (char *) xmalloc(alignwidth + 1);

  q_pos = strand != 0 ? seq1len - 1 - seq1off : seq1off;
  d_pos = seq2off;

  line_pos = 0;

  while (p < e)
    {
      int64_t len = 0;
      int n = 0;
      /* no leading number: treat the operation as a single column */
      if (sscanf(p, "%" PRId64 "%n", & len, & n) == 0)
        {
          n = 0;
          len = 1;
        }
      p += n;
      const char op = *p++;
      putop(op, len);
    }

  /* operation 0 makes putop() print a partially filled last line */
  putop(0, 1);

  xfree(q_line);
  xfree(a_line);
  xfree(d_line);
}


/*
  Build one row of an alignment as a newly allocated, zero-terminated
  string of alignlen + 1 bytes.

  For each cigar operation, len characters are copied from seq when the
  operation is 'M', or 'D' with origin == 0, or 'I' with origin == 1;
  for any other combination len '-' characters are written instead.
  cigar must be zero-terminated.

  NOTE(review): no bounds check against alignlen is made here, and the
  caller presumably releases the result with xfree() -- confirm.
*/
auto align_getrow(char * seq, char * cigar, int alignlen, int origin) -> char *
{
  char * row = (char *) xmalloc(alignlen + 1);
  char * r = row;
  char * p = cigar;
  char * s = seq;

  while (*p != 0)
    {
      int64_t len = 0;
      int n = 0;
      /* no leading number: the operation applies to one column */
      if (sscanf(p, "%" PRId64 "%n", & len, & n) == 0)
        {
          n = 0;
          len = 1;
        }
      p += n;
      const char op = *p++;

      if ((op == 'M') or ((op == 'D') and (origin == 0)) or ((op == 'I') and (origin == 1)))
        {
          strncpy(r, s, len);
          r += len;
          s += len;
        }
      else
        {
          /* insert len gap symbols */
          for (int64_t i = 0; i < len; i++)
            {
              *r++ = '-';
            }
        }
    }

  *r = 0;
  return row;
}


auto align_fprint_uncompressed_alignment(std::FILE * output_handle, char * cigar) -> void
{
  char * p = cigar;
  while (*p != 0)
    {
      if (*p > '9')
        {
          fprintf(output_handle, "%c", *p++);
        }
      else
        {
          int n = 0;
          char c = 0;
          int x = 0;
          if (sscanf(p, 
"%d%c%n", &n, &c, &x) == 2) { for (int i = 0; i < n; i++) { fprintf(output_handle, "%c", c); } p += x; } else { fatal("bad alignment string"); } } } } vsearch-2.30.0/src/showalign.h000066400000000000000000000062201476012147200161660ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // int64_t #include // FILE auto align_getrow(char * seq, char * cigar, int alignlen, int origin) -> char *; auto align_fprint_uncompressed_alignment(std::FILE * output_handle, char * cigar) -> void; auto align_show(std::FILE * output_handle, char * seq1, int64_t seq1len, int64_t seq1off, const char * seq1name, char * seq2, int64_t seq2len, int64_t seq2off, const char * seq2name, char * cigar, int64_t cigarlen, int numwidth, int namewidth, int alignwidth, int strand) -> void; vsearch-2.30.0/src/shuffle.cc000066400000000000000000000114131476012147200157650ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include // std::min, std::shuffle #include // std::FILE, std::size_t #include // std::iota #include #include namespace { // anonymous namespace to avoid linker error (multiple definitions // of function with identical names and parameters) auto create_deck() -> std::vector { auto const dbsequencecount = db_getsequencecount(); std::vector deck(dbsequencecount); std::iota(deck.begin(), deck.end(), 0); return deck; } } auto generate_seed(long int const user_seed) -> unsigned int { if (user_seed != 0) { return static_cast(user_seed); } std::random_device number_generator; return number_generator(); } auto shuffle_deck(std::vector & deck, long int const user_seed) -> void { static constexpr auto one_hundred_percent = 100ULL; progress_init("Shuffling", one_hundred_percent); auto const seed = generate_seed(user_seed); std::mt19937_64 uniform_generator(seed); std::shuffle(deck.begin(), deck.end(), uniform_generator); progress_done(); } auto truncate_deck(std::vector & deck, long int const n_first_sequences) -> void { if (deck.size() > static_cast(n_first_sequences)) deck.resize(n_first_sequences); } auto output_shuffled_fasta(std::vector const & deck, std::FILE * output_file) -> void { progress_init("Writing output", deck.size()); auto counter = std::size_t{0}; for (auto const sequence_id: deck) { fasta_print_db_relabel(output_file, sequence_id, counter + 1); progress_update(counter); ++counter; } progress_done(); } auto shuffle(struct Parameters const & parameters) -> void { // pre-conditions if (parameters.opt_output == nullptr) { fatal("Output file for shuffling must be specified with --output"); } auto * fp_output = fopen_output(parameters.opt_output); if (fp_output == nullptr) { fatal("Unable to open shuffle output file for writing"); } db_read(parameters.opt_shuffle, 0); show_rusage(); auto deck = create_deck(); shuffle_deck(deck, parameters.opt_randseed); show_rusage(); truncate_deck(deck, parameters.opt_topn); output_shuffled_fasta(deck, 
fp_output); show_rusage(); db_free(); if (fp_output != nullptr) { static_cast(std::fclose(fp_output)); } } vsearch-2.30.0/src/shuffle.h000066400000000000000000000047511476012147200156360ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto shuffle(struct Parameters const & parameters) -> void; vsearch-2.30.0/src/sintax.cc000066400000000000000000000502731476012147200156460ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Implements the Sintax algorithm as described in Robert Edgar's preprint: Robert Edgar (2016) SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences BioRxiv, 074161 doi: https://doi.org/10.1101/074161 Further details: https://www.drive5.com/usearch/manual/cmd_sintax.html Note that due to the lack of details in the description, this implementation in vsearch is surely somewhat different from the one in usearch. 
*/ #include "vsearch.h" #include "bitmap.h" #include "dbindex.h" #include "maps.h" #include "mask.h" #include "minheap.h" #include "tax.h" #include "udb.h" #include "unique.h" #include // std::min, std::max #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::size_t #include // std::memset, std::strncmp, std::strcpy #include static struct searchinfo_s * si_plus; static struct searchinfo_s * si_minus; static pthread_t * pthread; /* global constants/data, no need for synchronization */ static int tophits; /* the maximum number of hits to keep */ static int seqcount; /* number of database sequences */ static pthread_attr_t attr; static fastx_handle query_fastx_h; const int subset_size = 32; const int bootstrap_count = 100; /* global data protected by mutex */ static pthread_mutex_t mutex_input; static pthread_mutex_t mutex_output; static FILE * fp_tabbedout; static int queries = 0; static int classified = 0; auto sintax_analyse(char * query_head, int strand, int * all_seqno, int count) -> void { int level_matchcount[tax_levels]; int level_best[tax_levels]; char * cand_level_name_start[bootstrap_count][tax_levels]; int cand_level_name_len[bootstrap_count][tax_levels]; /* Check number of successful bootstraps, must be at least half */ bool const enough = count >= (bootstrap_count + 1) / 2; if (enough) { /* Find the most common name at each taxonomic rank, but with the same names at higher ranks. 
*/ for (int i = 0; i < count ; i++) { /* Split headers of all candidates by taxonomy ranks */ int const seqno = all_seqno[i]; int new_level_name_start[tax_levels]; int new_level_name_len[tax_levels]; tax_split(seqno, new_level_name_start, new_level_name_len); for (int k = 0; k < tax_levels; k++) { cand_level_name_start[i][k] = db_getheader(seqno) + new_level_name_start[k]; cand_level_name_len[i][k] = new_level_name_len[k]; } } bool cand_included[bootstrap_count]; for (int i = 0; i < count; i++) cand_included[i] = true; /* Count matching names among candidates */ for (int k = 0; k < tax_levels; k++) { level_best[k] = -1; level_matchcount[k] = 0; int cand_match[bootstrap_count]; int cand_matchcount[bootstrap_count]; for (int i = 0; i < count ; i++) { cand_match[i] = -1; cand_matchcount[i] = 0; } for (int i = 0; i < count ; i++) if (cand_included[i]) for (int j = 0; j <= i ; j++) if (cand_included[j]) { /* check match at current level */ if ((cand_level_name_len[i][k] == cand_level_name_len[j][k]) && (strncmp(cand_level_name_start[i][k], cand_level_name_start[j][k], cand_level_name_len[i][k]) == 0)) { cand_match[i] = j; cand_matchcount[j]++; break; /* stop at first match */ } } for (int i = 0; i < count ; i++) if (cand_matchcount[i] > level_matchcount[k]) { level_best[k] = i; level_matchcount[k] = cand_matchcount[i]; } for (int i = 0; i < count; i++) if (cand_match[i] != level_best[k]) cand_included[i] = false; } } /* write to tabbedout file */ xpthread_mutex_lock(&mutex_output); fprintf(fp_tabbedout, "%s\t", query_head); queries++; if (enough) { classified++; bool comma = false; for (int j = 0; j < tax_levels; j++) { int const best = level_best[j]; if (cand_level_name_len[best][j] > 0) { fprintf(fp_tabbedout, "%s%c:%.*s(%.2f)", (comma ? "," : ""), tax_letters[j], cand_level_name_len[best][j], cand_level_name_start[best][j], 1.0 * level_matchcount[j] / count); comma = true; } } fprintf(fp_tabbedout, "\t%c", strand ? 
'-' : '+'); if (opt_sintax_cutoff > 0.0) { fprintf(fp_tabbedout, "\t"); bool comma = false; for (int j = 0; j < tax_levels; j++) { int const best = level_best[j]; if ((cand_level_name_len[best][j] > 0) && (1.0 * level_matchcount[j] / count >= opt_sintax_cutoff)) { fprintf(fp_tabbedout, "%s%c:%.*s", (comma ? "," : ""), tax_letters[j], cand_level_name_len[best][j], cand_level_name_start[best][j]); comma = true; } } } } else { if (opt_sintax_cutoff > 0.0) { fprintf(fp_tabbedout, "\t\t"); } else { fprintf(fp_tabbedout, "\t"); } } fprintf(fp_tabbedout, "\n"); xpthread_mutex_unlock(&mutex_output); } auto sintax_search_topscores(struct searchinfo_s * si) -> void { /* Count the number of kmer hits in each database sequence and select the database sequence with the highest number of matching kmers. If several sequences have equally many kmer matches, choose one of them according to the following rules: By default, choose the shortest. If two are equally short, choose the one that comes first in the database. If the sintax_random option is in effect, ties will instead be chosen randomly. 
*/ /* count kmer hits in the database sequences */ const int indexed_count = dbindex_getcount(); /* zero counts */ memset(si->kmers, 0, indexed_count * sizeof(count_t)); for (unsigned int i = 0; i < si->kmersamplecount; i++) { unsigned int const kmer = si->kmersample[i]; unsigned char * bitmap = dbindex_getbitmap(kmer); if (bitmap) { #ifdef __x86_64__ if (ssse3_present) { increment_counters_from_bitmap_ssse3(si->kmers, bitmap, indexed_count); } else { increment_counters_from_bitmap_sse2(si->kmers, bitmap, indexed_count); } #else increment_counters_from_bitmap(si->kmers, bitmap, indexed_count); #endif } else { unsigned int * list = dbindex_getmatchlist(kmer); unsigned int const count = dbindex_getmatchcount(kmer); for (unsigned int j = 0; j < count; j++) { si->kmers[list[j]]++; } } } unsigned int tophits = 0; elem_t best; best.count = 0; best.seqno = 0; best.length = 0; for (int i = 0; i < indexed_count; i++) { count_t const count = si->kmers[i]; unsigned int const seqno = dbindex_getmapping(i); unsigned int const length = db_getsequencelen(best.seqno); if (count > best.count) { best.count = count; best.seqno = seqno; best.length = length; tophits = 1; } else if (count == best.count) { if (opt_sintax_random) { tophits++; if (random_int(tophits) == 0) { best.seqno = seqno; best.length = length; } } else { if (length < best.length) { best.seqno = seqno; best.length = length; } else if (length == best.length) { best.seqno = std::min(seqno, best.seqno); } } } } minheap_empty(si->m); if (best.count > 1) { minheap_add(si->m, &best); } } auto sintax_query(int64_t t) -> void { int all_seqno[2][bootstrap_count]; int boot_count[2] = {0, 0}; unsigned int best_count[2] = {0, 0}; int const qseqlen = si_plus[t].qseqlen; char * query_head = si_plus[t].query_head; auto * b = bitmap_init(qseqlen); for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = s ? 
si_minus + t : si_plus + t; /* perform search */ unsigned int kmersamplecount = 0; unsigned int * kmersample = nullptr; /* find unique kmers */ unique_count(si->uh, opt_wordlength, si->qseqlen, si->qsequence, & kmersamplecount, & kmersample, MASK_NONE); /* perform 100 bootstraps */ if (kmersamplecount >= subset_size) { for (int i = 0; i < bootstrap_count ; i++) { /* subsample 32 kmers */ unsigned int kmersample_subset[subset_size]; int subsamples = 0; bitmap_reset_all(b); for (int j = 0; j < subset_size ; j++) { int64_t const x = random_int(kmersamplecount); if (! bitmap_get(b, x)) { kmersample_subset[subsamples++] = kmersample[x]; bitmap_set(b, x); } } si->kmersamplecount = subsamples; si->kmersample = kmersample_subset; sintax_search_topscores(si); if (! minheap_isempty(si->m)) { elem_t const e = minheap_poplast(si->m); all_seqno[s][boot_count[s]++] = e.seqno; best_count[s] = std::max(e.count, best_count[s]); } } } } int best_strand = 0; if (opt_strand == 1) { best_strand = 0; } else { if (best_count[0] > best_count[1]) { best_strand = 0; } else if (best_count[1] > best_count[0]) { best_strand = 1; } else { if (boot_count[0] >= boot_count[1]) { best_strand = 0; } else { best_strand = 1; } } } sintax_analyse(query_head, best_strand, all_seqno[best_strand], boot_count[best_strand]); bitmap_free(b); } auto sintax_thread_run(int64_t t) -> void { while (true) { xpthread_mutex_lock(&mutex_input); if (fastx_next(query_fastx_h, ! opt_notrunclabels, chrmap_no_change)) { char * qhead = fastx_get_header(query_fastx_h); int const query_head_len = fastx_get_header_length(query_fastx_h); char * qseq = fastx_get_sequence(query_fastx_h); int const qseqlen = fastx_get_sequence_length(query_fastx_h); int const query_no = fastx_get_seqno(query_fastx_h); int const qsize = fastx_get_abundance(query_fastx_h); for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = s ? 
si_minus + t : si_plus + t; si->query_head_len = query_head_len; si->qseqlen = qseqlen; si->query_no = query_no; si->qsize = qsize; si->strand = s; /* allocate more memory for header and sequence, if necessary */ if (si->query_head_len + 1 > si->query_head_alloc) { si->query_head_alloc = si->query_head_len + 2001; si->query_head = (char *) xrealloc(si->query_head, (size_t)(si->query_head_alloc)); } if (si->qseqlen + 1 > si->seq_alloc) { si->seq_alloc = si->qseqlen + 2001; si->qsequence = (char *) xrealloc(si->qsequence, (size_t)(si->seq_alloc)); } } /* plus strand: copy header and sequence */ strcpy(si_plus[t].query_head, qhead); strcpy(si_plus[t].qsequence, qseq); /* get progress as amount of input file read */ uint64_t const progress = fastx_get_position(query_fastx_h); /* let other threads read input */ xpthread_mutex_unlock(&mutex_input); /* minus strand: copy header and reverse complementary sequence */ if (opt_strand > 1) { strcpy(si_minus[t].query_head, si_plus[t].query_head); reverse_complement(si_minus[t].qsequence, si_plus[t].qsequence, si_plus[t].qseqlen); } sintax_query(t); /* lock mutex for update of global data and output */ xpthread_mutex_lock(&mutex_output); /* show progress */ progress_update(progress); xpthread_mutex_unlock(&mutex_output); } else { xpthread_mutex_unlock(&mutex_input); break; } } } auto sintax_thread_init(struct searchinfo_s * si) -> void { /* thread specific initialiation */ si->uh = unique_init(); si->kmers = (count_t *) xmalloc((seqcount * sizeof(count_t)) + 32); si->m = minheap_init(tophits); si->hits = nullptr; si->qsize = 1; si->query_head_alloc = 0; si->query_head = nullptr; si->seq_alloc = 0; si->qsequence = nullptr; si->nw = nullptr; si->s = nullptr; } auto sintax_thread_exit(struct searchinfo_s * si) -> void { /* thread specific clean up */ unique_exit(si->uh); minheap_exit(si->m); xfree(si->kmers); if (si->query_head) { xfree(si->query_head); } if (si->qsequence) { xfree(si->qsequence); } } auto sintax_thread_worker(void 
* vp) -> void * { auto t = (int64_t) vp; sintax_thread_run(t); return nullptr; } auto sintax_thread_worker_run() -> void { /* initialize threads, start them, join them and return */ xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); /* init and create worker threads, put them into stand-by mode */ for (int t = 0; t < opt_threads; t++) { sintax_thread_init(si_plus + t); if (si_minus) { sintax_thread_init(si_minus + t); } xpthread_create(pthread + t, &attr, sintax_thread_worker, (void *) (int64_t)t); } /* finish and clean up worker threads */ for (int t = 0; t < opt_threads; t++) { xpthread_join(pthread[t], nullptr); sintax_thread_exit(si_plus + t); if (si_minus) { sintax_thread_exit(si_minus + t); } } xpthread_attr_destroy(&attr); } auto sintax() -> void { /* tophits = the maximum number of hits we need to store */ tophits = 1; /* open output files */ if (! opt_db) { fatal("No database file specified with --db"); } if (opt_tabbedout) { fp_tabbedout = fopen_output(opt_tabbedout); if (! fp_tabbedout) { fatal("Unable to open tabbedout output file for writing"); } } else { fatal("No output file specified with --tabbedout"); } /* check if db may be an UDB file */ bool const is_udb = udb_detect_isudb(opt_db); if (is_udb) { udb_read(opt_db, true, true); } else { db_read(opt_db, 0); } seqcount = db_getsequencecount(); if (! 
is_udb) { dbindex_prepare(1, opt_dbmask); dbindex_addallsequences(opt_dbmask); } /* prepare reading of queries */ query_fastx_h = fastx_open(opt_sintax); /* allocate memory for thread info */ si_plus = (struct searchinfo_s *) xmalloc(opt_threads * sizeof(struct searchinfo_s)); if (opt_strand > 1) { si_minus = (struct searchinfo_s *) xmalloc(opt_threads * sizeof(struct searchinfo_s)); } else { si_minus = nullptr; } pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t)); /* init mutexes for input and output */ xpthread_mutex_init(&mutex_input, nullptr); xpthread_mutex_init(&mutex_output, nullptr); /* run */ progress_init("Classifying sequences", fastx_get_size(query_fastx_h)); sintax_thread_worker_run(); progress_done(); if (! opt_quiet) { fprintf(stderr, "Classified %d of %d sequences", classified, queries); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * classified / queries); } fprintf(stderr, "\n"); } if (opt_log) { fprintf(fp_log, "Classified %d of %d sequences", classified, queries); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * classified / queries); } fprintf(fp_log, "\n"); } /* clean up */ xpthread_mutex_destroy(&mutex_output); xpthread_mutex_destroy(&mutex_input); xfree(pthread); xfree(si_plus); if (si_minus) { xfree(si_minus); } fastx_close(query_fastx_h); fclose(fp_tabbedout); dbindex_free(); db_free(); } vsearch-2.30.0/src/sintax.h000066400000000000000000000047041476012147200155060ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/
/* Entry point of the sintax command. Takes no arguments and returns
   nothing; its inputs and outputs are not visible from this
   declaration. */
auto sintax() -> void;
vsearch-2.30.0/src/sortbylength.cc000066400000000000000000000161651476012147200170660ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include // std::sort, std::min #include #include // std::FILE, std::fprintf, std::size_t #include // std::ldiv #include // std::strcmp #include #ifndef NDEBUG #include #endif struct sortinfo_length_s { unsigned int length = 0; unsigned int size = 0; unsigned int seqno = 0; }; namespace { // anonymous namespace to avoid linker error (multiple definitions // of function with identical names and parameters) auto create_deck() -> std::vector { auto const dbsequencecount = db_getsequencecount(); assert(dbsequencecount < std::numeric_limits::max()); std::vector deck(dbsequencecount); progress_init("Getting lengths", deck.size()); auto counter = std::size_t{0}; for (auto & sequence: deck) { sequence.seqno = counter; sequence.length = db_getsequencelen(counter); sequence.size = db_getabundance(counter); progress_update(counter); ++counter; } progress_done(); return deck; } } auto sort_deck(std::vector & deck) -> void { auto compare_sequences = [](struct sortinfo_length_s const & lhs, struct sortinfo_length_s const & rhs) -> bool { // longest first... if (lhs.length < rhs.length) { return false; } if (lhs.length > rhs.length) { return true; } // ... then ties are sorted by decreasing abundance values... 
if (lhs.size < rhs.size) { return false; } if (lhs.size > rhs.size) { return true; } // ...then ties are sorted by sequence labels (alpha-numerical ordering), // preserve input order auto const result = std::strcmp(db_getheader(lhs.seqno), db_getheader(rhs.seqno)); return result < 0; }; static constexpr auto one_hundred_percent = 100ULL; progress_init("Sorting", one_hundred_percent); std::stable_sort(deck.begin(), deck.end(), compare_sequences); progress_done(); } // refactoring C++17 [[nodiscard]] auto find_median_length(std::vector const &deck) -> double { // function returns a round value or a value with a remainder of 0.5 static constexpr double half = 0.5; if (deck.empty()) { return 0.0; } // refactoring C++11: use const& std::vector.size() auto const midarray = std::ldiv(static_cast(deck.size()), 2L); // odd number of valid amplicons if (deck.size() % 2 != 0) { return deck[midarray.quot].length * 1.0; // a round value } // even number of valid amplicons // (average of two ints is either round or has a remainder of .5) // avoid risk of silent overflow for large abundance values: // a >= b ; (a + b) / 2 == b + (a - b) / 2 return deck[midarray.quot].length + ((deck[midarray.quot - 1].length - deck[midarray.quot].length) * half); } auto output_median_length(std::vector const & deck, struct Parameters const & parameters) -> void { // Banker's rounding (round half to even) auto const median = find_median_length(deck); if (not parameters.opt_quiet) { std::fprintf(stderr, "Median length: %.0f\n", median); } if (parameters.opt_log != nullptr) { std::fprintf(fp_log, "Median length: %.0f\n", median); } } auto truncate_deck(std::vector &deck, long int const n_first_sequences) -> void { if (deck.size() > static_cast(n_first_sequences)) deck.resize(n_first_sequences); } // refactoring: extract as a template auto output_sorted_fasta(std::vector const & deck, std::FILE * output_file) -> void { progress_init("Writing output", deck.size()); auto counter = std::size_t{0}; for 
(auto const & sequence: deck) { fasta_print_db_relabel(output_file, sequence.seqno, counter + 1); progress_update(counter); ++counter; } progress_done(); } auto sortbylength(struct Parameters const & parameters) -> void { if (parameters.opt_output == nullptr) { fatal("FASTA output file for sortbylength must be specified with --output"); } auto * fp_output = fopen_output(parameters.opt_output); if (fp_output == nullptr) { fatal("Unable to open sortbylength output file for writing"); } db_read(parameters.opt_sortbylength, 0); show_rusage(); auto deck = create_deck(); show_rusage(); sort_deck(deck); output_median_length(deck, parameters); show_rusage(); truncate_deck(deck, parameters.opt_topn); output_sorted_fasta(deck, fp_output); show_rusage(); db_free(); if (fp_output != nullptr) { static_cast(std::fclose(fp_output)); } } vsearch-2.30.0/src/sortbylength.h000066400000000000000000000047561476012147200167330ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. 
If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Entry point of the sortbylength command. The run-time options are
   received through `parameters`, which is only read (const
   reference). Nothing is returned. */
auto sortbylength(struct Parameters const & parameters) -> void;
vsearch-2.30.0/src/sortbysize.cc000066400000000000000000000217631476012147200165570ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include // std::min, std::sort #include #include // int64_t #include // std::FILE, std::fprintf, std::size_t #include // std::ldiv #include // std::strcmp #include #ifndef NDEBUG #include #endif struct sortinfo_size_s { unsigned int size = 0; unsigned int seqno = 0; }; namespace { // anonymous namespace to avoid linker error (multiple definitions // of function with identical names and parameters) auto create_deck(struct Parameters const & parameters) -> std::vector { auto const dbsequencecount = db_getsequencecount(); assert(dbsequencecount < std::numeric_limits::max()); std::vector deck(dbsequencecount); progress_init("Getting sizes", deck.size()); auto counter = std::size_t{0}; for (auto seqno = 0U; seqno < dbsequencecount; ++seqno) { auto const size = static_cast(db_getabundance(seqno)); if ((size < parameters.opt_minsize) or (size > parameters.opt_maxsize)) { continue; } deck[counter].seqno = seqno; deck[counter].size = static_cast(size); progress_update(seqno); ++counter; } progress_done(); deck.resize(counter); return deck; } } auto sort_deck(std::vector & deck) -> void { auto compare_sequences = [](struct sortinfo_size_s const & lhs, struct sortinfo_size_s const & rhs) -> bool { // highest abundance first... 
if (lhs.size < rhs.size) { return false; } if (lhs.size > rhs.size) { return true; } // ...then ties are sorted by sequence labels (alpha-numerical ordering), // preserve input order auto const result = std::strcmp(db_getheader(lhs.seqno), db_getheader(rhs.seqno)); return result < 0; }; static constexpr auto one_hundred_percent = 100ULL; progress_init("Sorting", one_hundred_percent); std::stable_sort(deck.begin(), deck.end(), compare_sequences); progress_done(); } // refactoring C++17 [[nodiscard]] auto find_median_abundance(std::vector const & deck) -> double { // function returns a round value or a value with a remainder of 0.5 static constexpr double half = 0.5; if (deck.empty()) { return 0.0; } // refactoring C++11: use const& std::vector.size() auto const midarray = std::ldiv(static_cast(deck.size()), 2L); // odd number of valid amplicons if (deck.size() % 2 != 0) { return deck[midarray.quot].size * 1.0; // a round value } // even number of valid amplicons // (average of two ints is either round or has a remainder of .5) // avoid risk of silent overflow for large abundance values: // a >= b ; (a + b) / 2 == b + (a - b) / 2 return deck[midarray.quot].size + ((deck[midarray.quot - 1].size - deck[midarray.quot].size) * half); } auto output_median_abundance(std::vector const & deck, struct Parameters const & parameters) -> void { // Banker's rounding (round half to even) auto const median = find_median_abundance(deck); if (not parameters.opt_quiet) { static_cast(fprintf(stderr, "Median abundance: %.0f\n", median)); } if (parameters.opt_log != nullptr) { static_cast(fprintf(fp_log, "Median abundance: %.0f\n", median)); } } // auto trim_deck(std::vector & deck) // -> std::vector { // // assume deck is sorted by decreasing abundance // // - opt_minsize = 0 by default // // - opt_maxsize = LONG_MAX by default // // - size is unsigned int // auto begin = std::upper_bound(deck.begin(), deck.end(), opt_maxsize, // [](int64_t maxsize, struct sortinfo_size_s & seq) -> bool 
{ // return seq.size > maxsize; // }); // auto end = std::lower_bound(deck.begin(), deck.end(), opt_minsize, // [](int64_t minsize, struct sortinfo_size_s & seq) -> bool { // return seq.size <= minsize; // }); // return std::vector{begin, end}; // } auto truncate_deck(std::vector & deck, long int const n_first_sequences) -> void { if (deck.size() > static_cast(n_first_sequences)) deck.resize(n_first_sequences); } // refactoring: extract as a template auto output_sorted_fasta(std::vector const & deck, std::FILE * output_file) -> void { progress_init("Writing output", deck.size()); auto counter = std::size_t{0}; for (auto const & sequence: deck) { fasta_print_db_relabel(output_file, sequence.seqno, counter + 1); progress_update(counter); ++counter; } progress_done(); } // refactoring: trim misize and maxsize with a free function // https://stackoverflow.com/questions/26719144/how-to-erase-a-value-efficiently-from-a-sorted-vector // auto erase_high_abundances(std::vector & vec, int value) -> void // { // auto lb = std::lower_bound(std::begin(vec), std::end(vec), value); // if (lb != std::end(vec) and *lb == value) { // auto ub = std::upper_bound(lb, std::end(vec), value); // vec.erase(lb, ub); // } // } // refactoring: // - create vector (no branch) // - stable_sort vector (by increasing size, then label) // - find lower_bound(comp(opt_minsize)), // - deck.resize() // - find upper_bound(comp(opt_maxsize)), // - std::vector subdeck = {deck.begin() + upper_bound, deck.end()}; // view? // - opt_minsize = 0 by default // - opt_maxsize = LONG_MAX by default // - top_n = LONG_MAX by default // - mediane, etc... 
// - std::min(subdeck.size(), topn); auto sortbysize(struct Parameters const & parameters) -> void { if (parameters.opt_output == nullptr) { fatal("FASTA output file for sortbysize must be specified with --output"); } auto * fp_output = fopen_output(parameters.opt_output); if (fp_output == nullptr) { fatal("Unable to open sortbysize output file for writing"); } db_read(parameters.opt_sortbysize, 0); show_rusage(); auto deck = create_deck(parameters); show_rusage(); sort_deck(deck); output_median_abundance(deck, parameters); show_rusage(); truncate_deck(deck, parameters.opt_topn); output_sorted_fasta(deck, fp_output); show_rusage(); // refactoring: why three calls to show_rusage()? db_free(); if (fp_output != nullptr) { static_cast(std::fclose(fp_output)); } } vsearch-2.30.0/src/sortbysize.h000066400000000000000000000047541476012147200164220ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Entry point of the sortbysize command. The run-time options are
   received through `parameters`, which is only read (const
   reference). Nothing is returned. */
auto sortbysize(struct Parameters const & parameters) -> void;
vsearch-2.30.0/src/subsample.cc000066400000000000000000000334541476012147200163350ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include // std::count_if #include #include // macros PRIu64 and PRId64 #include // std::floor #include // int64_t #include // std::FILE, std::fprintf, std::fclose #include // std::minus #include // std::fill #include #ifndef NDEBUG // all contiguous integers from 0 to 2^53 can be represented in the // mantissa of a double constexpr uint64_t contiguous_mantissa = 9007199254740992; // 9 x 10^15 reads #endif struct a_file { char * name = nullptr; std::FILE * handle = nullptr; }; struct file_purposes { a_file kept; a_file lost; }; struct file_types { file_purposes fasta; file_purposes fastq; }; // refactoring: // - accept sample_size = 0 and sample_pct = 0.0? // - fastaout should be empty, all reads should be in fastaout_discarded // refactoring: // - store parameters in a struct, pass struct to reduce list of arguments // - use uint64_t for abundance values? // - create discarded vector only if needed // // discrete_distribution // #include // #include // #include // int main() // { // const auto nreads = 500ULL; // target number of reads // std::random_device r; // std::default_random_engine generator(r()); // std::vector v1 = {1000, 100, 10, 1}; // std::discrete_distribution distribution(v1.begin(), v1.end()); // std::vector v2(v1.size()); // for (auto i=0ULL; i void { if (ouput_files.fasta.kept.name != nullptr) { ouput_files.fasta.kept.handle = fopen_output(ouput_files.fasta.kept.name); } if (ouput_files.fasta.lost.name != nullptr) { ouput_files.fasta.lost.handle = fopen_output(ouput_files.fasta.lost.name); } if (ouput_files.fastq.kept.name != nullptr) { ouput_files.fastq.kept.handle = fopen_output(ouput_files.fastq.kept.name); } if (ouput_files.fastq.lost.name != nullptr) { ouput_files.fastq.lost.handle = fopen_output(ouput_files.fastq.lost.name); } } auto abort_if_fastq_out_of_fasta(struct file_types const & ouput_files) -> void { auto const output_is_fastq = (ouput_files.fastq.kept.handle != nullptr or ouput_files.fastq.lost.handle != 
nullptr); auto const input_is_fasta = not db_is_fastq(); if (input_is_fasta and output_is_fastq) { fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores"); } } auto check_output_files(struct file_types const & ouput_files) -> void { if (ouput_files.fasta.kept.name != nullptr) { if (ouput_files.fasta.kept.handle == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (ouput_files.fasta.lost.name != nullptr) { if (ouput_files.fasta.lost.handle == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (ouput_files.fastq.kept.name != nullptr) { if (ouput_files.fastq.kept.handle == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } if (ouput_files.fastq.lost.name != nullptr) { if (ouput_files.fastq.lost.handle == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } } namespace { // anonymous namespace to avoid linker error (multiple definitions // of function with identical names and parameters) auto create_deck(bool const sizein_requested) -> std::vector { auto const dbsequencecount = db_getsequencecount(); std::vector deck(dbsequencecount, 1); if (sizein_requested) { auto counter = std::size_t{0}; for (auto & abundance : deck) { abundance = db_getabundance(counter); ++counter; } } return deck; } } auto write_original_stats(std::vector const & deck, uint64_t const mass_total, struct Parameters const & parameters) -> void { if (not parameters.opt_quiet) { std::fprintf(stderr, "Got %" PRIu64 " reads from %d amplicons\n", mass_total, static_cast(deck.size())); } if (parameters.opt_log != nullptr) { std::fprintf(parameters.fp_log, "Got %" PRIu64 " reads from %d amplicons\n", mass_total, static_cast(deck.size())); } } auto number_of_reads_to_sample(struct Parameters const & parameters, uint64_t const mass_total) -> uint64_t { assert(mass_total <= contiguous_mantissa); if (parameters.opt_sample_size != 0) { return static_cast(parameters.opt_sample_size); } return 
static_cast(std::floor(static_cast(mass_total) * parameters.opt_sample_pct / 100.0)); } auto write_subsampling_stats(std::vector const &deck, uint64_t const n_reads, struct Parameters const & parameters) -> void { int const samples = std::count_if(deck.begin(), deck.end(), [](int abundance) -> bool { return abundance != 0; }); if (not parameters.opt_quiet) { std::fprintf(stderr, "Subsampled %" PRIu64 " reads from %d amplicons\n", n_reads, samples); } if (parameters.opt_log != nullptr) { std::fprintf(parameters.fp_log, "Subsampled %" PRIu64 " reads from %d amplicons\n", n_reads, samples); } } auto random_subsampling(std::vector & deck, uint64_t const mass_total, uint64_t const n_reads, bool const sizein_requested) -> void { auto n_reads_left = n_reads; auto amplicon_number = 0; uint64_t n_read_being_checked = 0; uint64_t accumulated_mass = 0; auto amplicon_mass = sizein_requested ? db_getabundance(0) : 1; // refactoring C++17: std::sample() progress_init("Subsampling", mass_total); while (n_reads_left > 0) { auto const random = random_ulong(mass_total - n_read_being_checked); if (random < n_reads_left) { /* selected read r from amplicon a */ ++deck[amplicon_number]; --n_reads_left; } ++n_read_being_checked; ++accumulated_mass; if (accumulated_mass >= amplicon_mass) { /* next amplicon */ ++amplicon_number; amplicon_mass = sizein_requested ? 
db_getabundance(amplicon_number) : 1; accumulated_mass = 0; } progress_update(n_read_being_checked); } progress_done(); } /* NOTE(review): template arguments (std::vector, std::minus, static_cast) look stripped in this copy of the file -- confirm against the upstream source before compiling. */ /* Element-wise difference of two decks: difference_deck[i] = original_deck[i] - subsampled_deck[i]. The result has original_deck.size() entries and subsampled_deck is read at each of those positions, so it must be at least as long as original_deck. */ auto substract_two_decks(std::vector const & original_deck, std::vector const & subsampled_deck) -> std::vector { std::vector difference_deck(original_deck.size()); std::transform(original_deck.cbegin(), original_deck.cend(), subsampled_deck.cbegin(), difference_deck.begin(), std::minus()); return difference_deck; } /* Write one fasta record per non-zero entry of deck. Returns at once when fasta_file.name is null. counter is the position in deck and is used as the database sequence number; the deck value and the running count amplicons_printed are passed on to fasta_print_general. Entries equal to zero are skipped without a progress update. */ auto writing_fasta_output(std::vector const & deck, struct a_file const & fasta_file) -> void { if (fasta_file.name == nullptr) { return; } int amplicons_printed = 0; progress_init("Writing fasta output", deck.size()); auto counter = 0U; for (auto const abundance_value : deck) { int64_t const new_abundance = abundance_value; if (new_abundance == 0) { ++counter; continue; } ++amplicons_printed; fasta_print_general(fasta_file.handle, nullptr, db_getsequence(counter), static_cast(db_getsequencelen(counter)), db_getheader(counter), static_cast(db_getheaderlen(counter)), new_abundance, amplicons_printed, -1.0, -1, -1, nullptr, 0.0); progress_update(counter); ++counter; } progress_done(); } /* Fastq counterpart of writing_fasta_output: same traversal of deck, but each kept entry is written with fastq_print_general and also passes db_getquality(counter). Returns at once when fastq_file.name is null. */ auto writing_fastq_output(std::vector const & deck, struct a_file const & fastq_file) -> void { if (fastq_file.name == nullptr) { return; } int amplicons_printed = 0; progress_init("Writing fastq output", deck.size()); auto counter = 0U; for (auto const abundance_value : deck) { int64_t const new_abundance = abundance_value; if (new_abundance == 0) { ++counter; continue; } ++amplicons_printed; fastq_print_general(fastq_file.handle, db_getsequence(counter), static_cast(db_getsequencelen(counter)), db_getheader(counter), static_cast(db_getheaderlen(counter)), db_getquality(counter), static_cast(new_abundance), amplicons_printed, -1.0); progress_update(counter); ++counter; } progress_done(); } /* Close each of the four output handles (fasta/fastq, kept/lost) that is not null. The value returned by std::fclose is discarded. */ auto close_output_files(struct file_types const & ouput_files) -> void { for (auto * fp_outputfile : { ouput_files.fasta.kept.handle, ouput_files.fastq.kept.handle, 
ouput_files.fasta.lost.handle, ouput_files.fastq.lost.handle}) { if (fp_outputfile != nullptr) { static_cast(std::fclose(fp_outputfile)); } } } /* Entry point of the subsampling command. Steps, in order: set the four output file names from parameters, open and check them, load the input with db_read, build the deck of original abundances and its total mass, zero a copy of the deck, draw n_reads reads into it with random_subsampling, write the kept reads, write the discarded reads (original minus subsampled) when a discarded output handle is open, report statistics, then free the database and close the files. Calls fatal() when n_reads exceeds mass_total. */ auto subsample(struct Parameters const & parameters) -> void { struct file_types ouput_files; ouput_files.fasta.kept.name = parameters.opt_fastaout; ouput_files.fasta.lost.name = parameters.opt_fastaout_discarded; ouput_files.fastq.kept.name = parameters.opt_fastqout; ouput_files.fastq.lost.name = parameters.opt_fastqout_discarded; open_output_files(ouput_files); check_output_files(ouput_files); db_read(parameters.opt_fastx_subsample, 0); show_rusage(); abort_if_fastq_out_of_fasta(ouput_files); // subsampling auto const original_abundances = create_deck(parameters.opt_sizein); auto const mass_total = std::accumulate(original_abundances.cbegin(), original_abundances.cend(), uint64_t{0}); auto subsampled_abundances = original_abundances; std::fill(subsampled_abundances.begin(), subsampled_abundances.end(), 0); // temporary fix: reset vector to zero write_original_stats(original_abundances, mass_total, parameters); // refactoring: move up? 
/* the requested number of reads may not exceed the total mass of the input */ auto const n_reads = number_of_reads_to_sample(parameters, mass_total); if (n_reads > mass_total) { fatal("Cannot subsample more reads than in the original sample"); } random_subsampling(subsampled_abundances, mass_total, n_reads, parameters.opt_sizein); // refactoring: pass & original, copy, subsample, return new (const) vector // write output files writing_fasta_output(subsampled_abundances, ouput_files.fasta.kept); writing_fastq_output(subsampled_abundances, ouput_files.fastq.kept); auto const discarded_output_is_requested = (ouput_files.fasta.lost.handle != nullptr) or (ouput_files.fastq.lost.handle != nullptr); if (discarded_output_is_requested) { auto const discarded_abundances = substract_two_decks(original_abundances, subsampled_abundances); writing_fasta_output(discarded_abundances, ouput_files.fasta.lost); writing_fastq_output(discarded_abundances, ouput_files.fastq.lost); } write_subsampling_stats(subsampled_abundances, n_reads, parameters); // clean up db_free(); close_output_files(ouput_files); } vsearch-2.30.0/src/subsample.h000066400000000000000000000047531476012147200161770ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto subsample(struct Parameters const & parameters) -> void; vsearch-2.30.0/src/tax.cc000066400000000000000000000121351476012147200151270ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include // std::tolower #include // std::strlen, std::strstr, std::strchr const char * tax_letters = "dkpcofgst"; auto tax_parse(const char * header, int header_length, int * tax_start, int * tax_end) -> bool { /* Identify the first occurence of the pattern (^|;)tax=([^;]*)(;|$) */ if (! header) { return false; } const char * attribute = "tax="; int const hlen = header_length; int const alen = strlen(attribute); int i = 0; while (i < hlen - alen) { char * r = (char *) strstr(header + i, attribute); /* no match */ if (r == nullptr) { break; } i = r - header; /* check for ';' in front */ if ((i > 0) && (header[i - 1] != ';')) { i += alen + 1; continue; } * tax_start = i; /* find end (semicolon or end of header) */ const char * s = strchr(header + i + alen, ';'); if (s == nullptr) { * tax_end = hlen; } else { * tax_end = s - header; } return true; } return false; } auto tax_split(int seqno, int * level_start, int * level_len) -> void { /* Parse taxonomy string into the following 9 parts d domain k kingdom p phylum c class o order f family g genus s species t strain */ for (int i = 0; i < tax_levels; i++) { level_start[i] = 0; level_len[i] = 0; } int tax_start = 0; int tax_end = 0; char * h = db_getheader(seqno); int const hlen = db_getheaderlen(seqno); if (tax_parse(h, hlen, & tax_start, & tax_end)) { int t = tax_start + 4; while (t < tax_end) { /* Is the next char a recogized tax level letter? 
*/ const char * r = strchr(tax_letters, tolower(h[t])); if (r) { int const level = r - tax_letters; /* Is there a colon after it? */ if (h[t + 1] == ':') { level_start[level] = t + 2; char * z = strchr(h + t + 2, ','); if (z) { level_len[level] = z - h - t - 2; } else { level_len[level] = tax_end - t - 2; } } } /* skip past next comma */ char * x = strchr(h + t, ','); if (x) { t = x - h + 1; } else { t = tax_end; } } } } vsearch-2.30.0/src/tax.h000066400000000000000000000050261476012147200147720ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ extern const char * tax_letters; auto tax_split(int seqno, int * level_start, int * level_len) -> void; vsearch-2.30.0/src/udb.cc000066400000000000000000000660201476012147200151070ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "attributes.h" #include "bitmap.h" #include "dbindex.h" #include "mask.h" #include "unique.h" #include // macros PRIu64 and PRId64 #include // UINT_MAX #include #include // uint64_t #include // std::FILE, std::fprintf, std::size_t #include // std::qsort #include // std::memset, std::memmove #include constexpr auto blocksize = 4096U * 4096U; static unsigned int udb_dbaccel = 0; struct wordfreq { unsigned int kmer; unsigned int count; }; using wordfreq_t = struct wordfreq; auto wc_compare(const void * a, const void * b) -> int { auto * x = (wordfreq_t *) a; auto * y = (wordfreq_t *) b; if (x->count < y->count) { return -1; } else if (x->count > y->count) { return +1; } else { if (x->kmer < y->kmer) { return +1; } else if (x->kmer > y->kmer) { return -1; } else { return 0; } } } auto largeread(int fd, void * buf, uint64_t nbyte, uint64_t offset) -> uint64_t { /* call pread multiple times and update progress */ uint64_t progress = offset; for (uint64_t i = 0; i < nbyte; i += blocksize) { uint64_t const res = xlseek(fd, offset + i, SEEK_SET); if (res != offset + i) { fatal("Unable to seek in UDB file or invalid UDB file"); } uint64 const rem = MIN(blocksize, nbyte - i); uint64_t const bytesread = read(fd, ((char *) buf) + i, rem); if (bytesread != rem) { fatal("Unable to read from UDB file or invalid UDB file"); } progress += rem; progress_update(progress); } return nbyte; } auto largewrite(int fd, void * buf, uint64_t nbyte, uint64_t offset) -> uint64_t { /* call write multiple times and update progress */ uint64_t progress = offset; for (uint64_t i = 0; i < nbyte; i += blocksize) { uint64_t const res = xlseek(fd, offset + i, SEEK_SET); if (res != offset + i) { fatal("Unable to seek in UDB file or invalid UDB file"); } uint64 const rem = MIN(blocksize, nbyte - i); uint64_t const byteswritten = write(fd, ((char *) buf) + i, rem); if (byteswritten != rem) { fatal("Unable to write to UDB file"); } progress += rem; 
progress_update(progress); } return nbyte; } auto udb_detect_isudb(const char * filename) -> bool { /* Detect whether the given filename seems to refer to an UDB file. It must be an uncompressed regular file, not a pipe. */ constexpr static uint32_t udb_file_signature {0x55444246}; constexpr static uint64_t expected_n_bytes {sizeof(uint32_t)}; xstat_t fs; if (xstat(filename, & fs)) { fatal("Unable to get status for input file (%s)", filename); } bool const is_pipe = S_ISFIFO(fs.st_mode); if (is_pipe) { return false; } int fd = 0; fd = xopen_read(filename); if (! fd) { fatal("Unable to open input file for reading (%s)", filename); } unsigned int magic = 0; uint64_t const bytesread = read(fd, & magic, expected_n_bytes); close(fd); if ((bytesread == expected_n_bytes) && (magic == udb_file_signature)) { return true; } return false; } auto udb_info() -> void { /* Read UDB header and show basic info */ unsigned int buffer[50]; int fd_udbinfo = 0; fd_udbinfo = xopen_read(opt_udbinfo); if (! fd_udbinfo) { fatal("Unable to open UDB file for reading"); } uint64_t const bytesread = read(fd_udbinfo, buffer, 4 * 50); if (bytesread != 4 * 50) { fatal("Unable to read from UDB file or invalid UDB file"); } if ((buffer[0] != 0x55444246) || (buffer[2] != 32) || (buffer[4] < 3) || (buffer[4] > 15) || (buffer[13] == 0) || (buffer[17] != 0x0000746e) || (buffer[49] != 0x55444266)) { fatal("Invalid UDB file"); } if (! 
opt_quiet) { fprintf(stderr, " Seqs %u\n", buffer[13]); fprintf(stderr, " SeqIx bits %u\n", buffer[2]); fprintf(stderr, " Alpha nt (4)\n"); fprintf(stderr, " Word width %u\n", buffer[4]); fprintf(stderr, " Slots %u\n", buffer[11]); fprintf(stderr, " Dict size %u (%.1fk)\n", (1U << (2 * buffer[4])), (1U << (2 * buffer[4])) * 1.0 / 1000.0); fprintf(stderr, " DBstep %u\n", buffer[5]); fprintf(stderr, " DBAccel %u%%\n", buffer[6]); } if (opt_log) { fprintf(fp_log, " Seqs %u\n", buffer[13]); fprintf(fp_log, " SeqIx bits %u\n", buffer[2]); fprintf(fp_log, " Alpha nt (4)\n"); fprintf(fp_log, " Word width %u\n", buffer[4]); fprintf(fp_log, " Slots %u\n", buffer[11]); fprintf(fp_log, " Dict size %u (%.1fk)\n", (1U << (2 * buffer[4])), (1U << (2 * buffer[4])) * 1.0 / 1000.0); fprintf(fp_log, " DBstep %u\n", buffer[5]); fprintf(fp_log, " DBAccel %u%%\n", buffer[6]); } close(fd_udbinfo); } auto udb_read(const char * filename, bool create_bitmaps, bool parse_abundances) -> void { /* read UDB as indexed database */ unsigned int seqcount = 0; unsigned int udb_wordlength = 0; uint64 nucleotides = 0; xstat_t fs; if (xstat(filename, & fs)) { fatal("Unable to get status for input file (%s)", filename); } bool const is_pipe = S_ISFIFO(fs.st_mode); if (is_pipe) { fatal("Cannot read UDB file from a pipe"); } /* get file size */ uint64_t const filesize = fs.st_size; /* open UDB file */ int fd_udb = 0; fd_udb = xopen_read(filename); if (! 
fd_udb) { fatal("Unable to open UDB file for reading"); } char * prompt = nullptr; if (xsprintf(& prompt, "Reading UDB file %s", filename) == -1) { fatal("Out of memory"); } progress_init(prompt, filesize); /* header */ unsigned int buffer[50]; uint64_t pos = 0; pos += largeread(fd_udb, buffer, 4 * 50, pos); if ((buffer[0] != 0x55444246) || (buffer[2] != 32) || (buffer[4] < 3) || (buffer[4] > 15) || (buffer[13] == 0) || (buffer[17] != 0x0000746e) || (buffer[49] != 0x55444266)) { fatal("Invalid UDB file"); } udb_wordlength = buffer[4]; seqcount = buffer[13]; udb_dbaccel = buffer[6]; if (udb_wordlength != opt_wordlength) { fprintf(stderr, "\nWARNING: Wordlength adjusted to %u as indicated in UDB file\n", udb_wordlength); opt_wordlength = udb_wordlength; } /* word match counts */ kmerhashsize = 1U << (2 * udb_wordlength); kmercount = (unsigned int *) xmalloc(kmerhashsize * sizeof(unsigned int)); kmerhash = (uint64_t *) xmalloc(kmerhashsize * sizeof(uint64_t)); kmerbitmap = (struct bitmap_s * *) xmalloc(kmerhashsize * sizeof(struct bitmap_s **)); memset(kmerbitmap, 0, kmerhashsize * sizeof(struct bitmap_s **)); pos += largeread(fd_udb, kmercount, 4 * kmerhashsize, pos); kmerindexsize = 0; for (uint64_t i = 0; i < kmerhashsize; i++) { kmerhash[i] = kmerindexsize; kmerindexsize += kmercount[i]; } /* signature */ pos += largeread(fd_udb, buffer, 4, pos); if (buffer[0] != 0x55444233) { fatal("Invalid UDB file"); } /* sequence numbers for word matches */ kmerindex = (unsigned int *) xmalloc(kmerindexsize * 4); pos += largeread(fd_udb, kmerindex, 4 * kmerindexsize, pos); /* new header */ pos += largeread(fd_udb, buffer, 4 * 8, pos); if ((buffer[0] != 0x55444234) || (buffer[1] != 0x005e0db3) || (buffer[2] != seqcount) || (buffer[7] != 0x005e0db4)) { fatal("Invalid UDB file"); } nucleotides = (((uint64_t) buffer[4]) << 32U) | buffer[3]; uint64_t const udb_headerchars = (((uint64_t) buffer[6]) << 32U) | buffer[5]; /* header index */ seqindex = (seqinfo_t *) xmalloc(seqcount * 
sizeof(seqinfo_t)); std::vector header_index(seqcount + 1); pos += largeread(fd_udb, header_index.data(), 4 * seqcount, pos); header_index[seqcount] = udb_headerchars; unsigned last = 0; for (unsigned int i = 0; i < seqcount; i++) { unsigned int const x = header_index[i]; if ((x < last) || (x >= udb_headerchars)) { fatal("Invalid UDB file"); } seqindex[i].header_p = x; seqindex[i].headerlen = header_index[i + 1] - x - 1; seqindex[i].size = 1; last = x; } /* headers */ datap = (char *) xmalloc(udb_headerchars + nucleotides + seqcount); pos += largeread(fd_udb, datap, udb_headerchars, pos); uint64_t longestheader = 0; for (unsigned int i = 0; i < seqcount; i++) { if (seqindex[i].headerlen > longestheader) { longestheader = seqindex[i].headerlen; } } /* sequence lengths */ std::vector sequence_lengths(seqcount); pos += largeread(fd_udb, sequence_lengths.data(), 4 * seqcount, pos); uint64_t sum = 0; unsigned int shortest = UINT_MAX; unsigned int longest = 0; for (unsigned int i = 0; i < seqcount; i++) { unsigned int const x = sequence_lengths[i]; seqindex[i].seq_p = udb_headerchars + sum; seqindex[i].seqlen = x; seqindex[i].qual_p = 0; if (x < shortest) { shortest = x; } if (x > longest) { longest = x; } sum += x; if (sum > nucleotides) { fatal("Invalid UDB file"); } } if (sum != nucleotides) { fatal("Invalid UDB file"); } /* sequences */ pos += largeread(fd_udb, datap + udb_headerchars, nucleotides, pos); if (pos != filesize) { fatal("Incorrect UDB file size"); } /* close UDB file */ close(fd_udb); progress_done(); xfree(prompt); /* move sequences and insert zero at end of each sequence */ progress_init("Reorganizing data in memory", seqcount); for (unsigned int i = seqcount-1; i > 0; i--) { size_t const old_p = seqindex[i].seq_p; size_t const new_p = seqindex[i].seq_p + i; size_t const len = seqindex[i].seqlen; memmove(datap + new_p, datap + old_p, len); *(datap + new_p + len) = 0; seqindex[i].seq_p = new_p; progress_update(seqcount - i); } *(datap + 
seqindex[0].seq_p + seqindex[0].seqlen) = 0; progress_done(); /* Create bitmaps for the most frequent words */ if (create_bitmaps) { progress_init("Creating bitmaps", kmerhashsize); unsigned int const bitmap_mincount = seqcount / 8; for (unsigned int i = 0; i < kmerhashsize; i++) { if (kmercount[i] >= bitmap_mincount) { kmerbitmap[i] = bitmap_init(seqcount+127); // pad for xmm bitmap_reset_all(kmerbitmap[i]); for (unsigned j = 0; j < kmercount[i]; j++) { bitmap_set(kmerbitmap[i], kmerindex[kmerhash[i]+j]); } } progress_update(i+1); } progress_done(); } /* get abundances and longest header */ if (parse_abundances) { progress_init("Parsing abundances", seqcount); for (unsigned int i = 0; i < seqcount; i++) { int64_t const size = header_get_size(datap + seqindex[i].header_p, seqindex[i].headerlen); if (size > 0) { seqindex[i].size = size; } else { seqindex[i].size = 1; } progress_update(i+1); } progress_done(); } /* set database info */ dbindex_uh = unique_init(); db_setinfo(false, seqcount, nucleotides, longest, shortest, longestheader); /* make mapping from indexno to seqno */ dbindex_map = (unsigned int *) xmalloc(seqcount * sizeof(unsigned int)); dbindex_count = seqcount; for (unsigned int i = 0; i < seqcount; i++) { dbindex_map[i] = i; } /* done */ /* some stats */ if (! 
opt_quiet) { if (seqcount > 0) { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", db_getnucleotidecount(), db_getsequencecount(), db_getshortestsequence(), db_getlongestsequence(), db_getnucleotidecount() * 1.0 / db_getsequencecount()); } else { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs\n", db_getnucleotidecount(), db_getsequencecount()); } } if (opt_log) { if (seqcount > 0) { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n\n", db_getnucleotidecount(), db_getsequencecount(), db_getshortestsequence(), db_getlongestsequence(), db_getnucleotidecount() * 1.0 / db_getsequencecount()); } else { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs\n\n", db_getnucleotidecount(), db_getsequencecount()); } } } auto udb_fasta() -> void { if (! opt_output) { fatal("FASTA output file must be specified with --output"); } /* open FASTA file for writing */ FILE * fp_output = fopen_output(opt_output); if (! 
fp_output) { fatal("Unable to open FASTA output file for writing"); } /* read UDB file */ udb_read(opt_udb2fasta, false, false); /* dump fasta */ unsigned int const seqcount = db_getsequencecount(); progress_init("Writing FASTA file", seqcount); for (std::size_t i = 0; i < seqcount; i++) { fasta_print_db_relabel(fp_output, i, i+1); progress_update(i+1); } progress_done(); fclose(fp_output); dbindex_free(); db_free(); } auto udb_stats() -> void { /* show word statistics for an UDB file */ /* read UDB file */ udb_read(opt_udbstats, false, false); /* analyze word counts */ std::vector freqtable(kmerhashsize); for (unsigned int i = 0; i < kmerhashsize; i++) { freqtable[i].kmer = i; freqtable[i].count = kmercount[i]; } qsort(freqtable.data(), kmerhashsize, sizeof(wordfreq_t), wc_compare); unsigned int const wcmax = freqtable[kmerhashsize-1].count; unsigned int const wcmedian = ( freqtable[(kmerhashsize / 2) - 1].count + freqtable[kmerhashsize / 2].count ) / 2; unsigned int const seqcount = db_getsequencecount(); uint64_t const nt = db_getnucleotidecount(); /* show stats */ if (opt_log) { fprintf(fp_log, " Alphabet nt\n"); fprintf(fp_log, " Word width %" PRIu64 "\n", opt_wordlength); fprintf(fp_log, " Word ones %" PRIu64 "\n", opt_wordlength); fprintf(fp_log, " Spaced No\n"); fprintf(fp_log, " Hashed No\n"); fprintf(fp_log, " Coded No\n"); fprintf(fp_log, " Stepped No\n"); fprintf(fp_log, " Slots %u (%.1fk)\n", kmerhashsize, 1.0 * kmerhashsize / 1000.0); fprintf(fp_log, " DBAccel %u%%\n", udb_dbaccel); fprintf(fp_log, "\n"); fprintf(fp_log, "%10" PRIu64 " DB size (%.1fk)\n", nt, 1.0 * nt / 1000.0); fprintf(fp_log, "%10" PRIu64 " Words\n", kmerindexsize); fprintf(fp_log, "%10u Median size\n", wcmedian); fprintf(fp_log, "%10.1f Mean size\n", 1.0 * kmerindexsize / kmerhashsize); fprintf(fp_log, "\n"); fprintf(fp_log, " iWord sWord Cap Size Row\n"); fprintf(fp_log, "---------- ------------ ---------- ---------- ---\n"); for (unsigned int i = 0; i < kmerhashsize; i++) { 
fprintf(fp_log, "%10u ", freqtable[kmerhashsize - 1 - i].kmer); fprintf(fp_log, "%.*s", MAX(12 - (int)(opt_wordlength), 0), " "); fprint_kmer(fp_log, opt_wordlength, freqtable[kmerhashsize - 1 - i].kmer); fprintf(fp_log, " %10u %10u", 0, freqtable[kmerhashsize - 1 - i].count); fprintf(fp_log, " "); for (unsigned j = 0; j < freqtable[kmerhashsize - 1 - i].count; j++) { fprintf(fp_log, " %u", kmerindex[kmerhash[freqtable[kmerhashsize - 1 - i].kmer] + j]); if (j == 7) { break; } } if (freqtable[kmerhashsize-1-i].count > 8) { fprintf(fp_log, "..."); } fprintf(fp_log, "\n"); if (i == 10) { break; } } fprintf(fp_log, "\n\n"); fprintf(fp_log, "Word width %" PRIu64 "\n", opt_wordlength); fprintf(fp_log, "Slots %u\n", kmerhashsize); fprintf(fp_log, "Words %" PRIu64 "\n", kmerindexsize); fprintf(fp_log, "Max size %u (", wcmax); fprint_kmer(fp_log, opt_wordlength, freqtable[kmerhashsize - 1].kmer); fprintf(fp_log, ")\n\n"); fprintf(fp_log, " Size lo Size hi Total size Nr. Words Pct TotPct\n"); fprintf(fp_log, "---------- ---------- ---------- ---------- ------ ------\n"); unsigned int size_lo = 0; unsigned int size_hi = 0; unsigned int x = 0; double totpct = 0.0; while (size_lo < seqcount) { int count = 0; int size = 0; while ((x < kmerhashsize) && (freqtable[x].count <= size_hi)) { count++; size += freqtable[x].count; x++; } double const pct = 100.0 * count / kmerhashsize; totpct += pct; if (size_lo < size_hi) { fprintf(fp_log, "%10u", size_lo); } else { fprintf(fp_log, " "); } fprintf(fp_log, " %10u", size_hi); if (size >= 10000) { fprintf(fp_log, " %9.1fk", size * 0.001); } else { fprintf(fp_log, " %10.1f", size * 1.0); } if (count >= 10000) { fprintf(fp_log, " %9.1fk", count * 0.001); } else { fprintf(fp_log, " %10.1f", count * 1.0); } fprintf(fp_log, " %5.1f%% %5.1f%%", pct, totpct); static constexpr double divider = 3.0; const auto dots = std::lround(pct / divider); if (dots > 0) { fprintf(fp_log, " "); } for (auto i = 0L; i < dots ; i++) { fprintf(fp_log, "*"); } 
fprintf(fp_log, "\n"); size_lo = size_hi + 1; if (size_hi > 0) { size_hi *= 2; } else { size_hi = 1; } if (size_hi > seqcount) { size_hi = seqcount; } } fprintf(fp_log, "---------- ---------- ---------- ----------\n"); fprintf(fp_log, " "); if (kmerindexsize >= 10000) { fprintf(fp_log, " %9.1fk", kmerindexsize * 0.001); } else { fprintf(fp_log, " %10.1f", kmerindexsize * 1.0); } if (kmerhashsize >= 10000) { fprintf(fp_log, " %9.1fk", kmerhashsize * 0.001); } else { fprintf(fp_log, " %10.1f", kmerhashsize * 1.0); } fprintf(fp_log, "\n\n"); fprintf(fp_log, "%10" PRIu64 " Upper\n", nt); fprintf(fp_log, "%10u Lower (%.1f%%)\n", 0, 0.0); fprintf(fp_log, "%10" PRIu64 " Total\n", nt); fprintf(fp_log, "%10" PRIu64 " Indexed words\n", kmerindexsize); } dbindex_free(); db_free(); } auto udb_make() -> void { if (! opt_output) { fatal("UDB output file must be specified with --output"); } int fd_output = 0; fd_output = xopen_write(opt_output); if (! fd_output) { fatal("Unable to open output file for writing"); } db_read(opt_makeudb_usearch, 1); if (opt_dbmask == MASK_DUST) { dust_all(); } else if ((opt_dbmask == MASK_SOFT) && (opt_hardmask)) { hardmask_all(); } dbindex_prepare(1, opt_dbmask); dbindex_addallsequences(opt_dbmask); unsigned int const seqcount = db_getsequencecount(); uint64_t const ntcount = db_getnucleotidecount(); uint64_t header_characters = 0; for (unsigned int i = 0; i < seqcount; i++) { header_characters += db_getheaderlen(i) + 1; } uint64_t const kmerhashsize = 1U << (2 * static_cast(opt_wordlength)); /* count word matches */ uint64_t wordmatches = 0; for (unsigned int i = 0; i < kmerhashsize; i++) { wordmatches += kmercount[i]; } uint64_t pos = 0; uint64_t const progress_all = (4 * 50) + (4 * kmerhashsize) + (4 * 1) + (4 * wordmatches) + (4 * 8) + (4 * seqcount) + header_characters + (4 * seqcount) + ntcount; progress_init("Writing UDB file", progress_all); uint64_t const buffersize = MAX(50, seqcount); std::vector buffer(buffersize); /* Header */ 
/* only the header words that carry a value are assigned below */ buffer[0] = 0x55444246; /* FBDU UDBF */ buffer[2] = 32; /* bits */ buffer[4] = opt_wordlength; /* default 8 */ buffer[5] = 1; /* dbstep */ buffer[6] = 100; /* dbaccelpct % */ buffer[11] = 0; /* slots */ buffer[13] = seqcount; /* number of sequences */ buffer[17] = 0x0000746e; /* alphabet: "nt" */ buffer[49] = 0x55444266; /* fBDU UDBf */ pos += largewrite(fd_output, buffer.data(), 50 * 4, 0); /* write 4^wordlength uint32's with word match counts */ pos += largewrite(fd_output, kmercount, 4 * kmerhashsize, pos); /* 3BDU */ buffer[0] = 0x55444233; /* 3BDU UDB3 */ pos += largewrite(fd_output, buffer.data(), 1 * 4, pos); /* lists of sequence no's with matches for all words */ /* a word held as a bitmap is expanded into an explicit list of the sequence numbers whose bit is set; any other word with a non-zero count is written straight from its slice of kmerindex */ for (unsigned int i = 0; i < kmerhashsize; i++) { if (kmerbitmap[i]) { memset(buffer.data(), 0, 4 * kmercount[i]); unsigned int elements = 0; for (unsigned int j = 0; j < seqcount; j++) { if (bitmap_get(kmerbitmap[i], j)) { buffer[elements++] = j; } } pos += largewrite(fd_output, buffer.data(), 4 * elements, pos); } else { if (kmercount[i] > 0) { pos += largewrite(fd_output, kmerindex + kmerhash[i], 4 * kmercount[i], pos); } } } /* New header */ buffer[0] = 0x55444234; /* 4BDU UDB4 */ /* 0x005e0db3 */ buffer[1] = 0x005e0db3; /* number of sequences, uint32 */ buffer[2] = seqcount; /* total number of nucleotides, uint64 */ /* 64-bit totals are stored as two 32-bit words, low word first */ buffer[3] = (unsigned int) (ntcount & 0xffffffff); buffer[4] = (unsigned int) (ntcount >> 32U); /* total number of header characters, incl zero-terminator, uint64 */ buffer[5] = (unsigned int) (header_characters & 0xffffffff); buffer[6] = (unsigned int) (header_characters >> 32U); /* 0x005e0db4 */ buffer[7] = 0x005e0db4; pos += largewrite(fd_output, buffer.data(), 4 * 8, pos); /* indices to headers (uint32) */ /* buffer[i] is the running sum of the lengths (terminator included) of all headers before header i */ unsigned int sum = 0; for (unsigned int i = 0; i < seqcount; i++) { buffer[i] = sum; sum += db_getheaderlen(i) + 1; } pos += largewrite(fd_output, buffer.data(), 4 * seqcount, pos); /* headers (ascii, zero terminated, not padded) */ for (unsigned int i = 0; i < seqcount; i++) 
{ unsigned int const len = db_getheaderlen(i); /* len + 1: one byte more than the header length, for the zero terminator */ pos += largewrite(fd_output, db_getheader(i), len + 1, pos); } /* sequence lengths (uint32) */ for (unsigned int i = 0; i < seqcount; i++) { buffer[i] = db_getsequencelen(i); } pos += largewrite(fd_output, buffer.data(), 4 * seqcount, pos); /* sequences (ascii, no term, no pad) */ for (unsigned int i = 0; i < seqcount; i++) { unsigned int const len = db_getsequencelen(i); pos += largewrite(fd_output, db_getsequence(i), len, pos); } /* a failing close is treated as fatal */ if (close(fd_output) != 0) { fatal("Unable to close UDB file"); } progress_done(); dbindex_free(); db_free(); } vsearch-2.30.0/src/udb.h000066400000000000000000000052771476012147200147600ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto udb_detect_isudb(const char * filename) -> bool; auto udb_read(const char * filename, bool create_bitmaps, bool parse_abundances) -> void; auto udb_fasta() -> void; auto udb_info() -> void; auto udb_make() -> void; auto udb_stats() -> void; vsearch-2.30.0/src/unique.cc000066400000000000000000000242011476012147200156360ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "maps.h" #include "mask.h" #include // std::min #include // uint64_t #include // std::memset /* Find the unique kmers or words in a given sequence. Unique is now defined as all different words occuring at least once. Earlier it was defined as those words occuring exactly once, but that caused a problem when searching for sequences with many repeats. */ #define HASH CityHash64 struct bucket_s { unsigned int kmer; unsigned int count; }; struct uhandle_s { struct bucket_s * hash; unsigned int * list; unsigned int hash_mask; int size; int alloc; uint64_t bitmap_size; uint64_t * bitmap; }; auto unique_init() -> struct uhandle_s * { auto * unique_handle = (struct uhandle_s *) xmalloc(sizeof(struct uhandle_s)); unique_handle->alloc = 2048; unique_handle->size = 0; unique_handle->hash_mask = unique_handle->alloc - 1; unique_handle->hash = (struct bucket_s *) xmalloc(sizeof(struct bucket_s) * unique_handle->alloc); unique_handle->list = (unsigned int *) xmalloc(sizeof(unsigned int) * unique_handle->alloc); unique_handle->bitmap_size = 0; unique_handle->bitmap = nullptr; return unique_handle; } auto unique_exit(struct uhandle_s * unique_handle) -> void { if (unique_handle->bitmap) { xfree(unique_handle->bitmap); } if (unique_handle->hash) { xfree(unique_handle->hash); } if (unique_handle->list) { xfree(unique_handle->list); } xfree(unique_handle); } auto unique_compare(const void * a, const void * b) -> int { auto * x = (unsigned int *) a; auto * y = (unsigned int *) b; if (x < y) { return -1; } else if (x > y) { return +1; } else { return 0; } } auto unique_count_bitmap(struct uhandle_s * unique_handle, int wordlength, int seqlen, char * seq, unsigned int * listlen, unsigned int * * list, int seqmask) -> void { /* if necessary, reallocate list of unique kmers */ if (unique_handle->alloc < seqlen) { while (unique_handle->alloc < seqlen) { unique_handle->alloc *= 2; } unique_handle->list = (unsigned int *) xrealloc(unique_handle->list, 
sizeof(unsigned int) * unique_handle->alloc); } uint64_t const size = 1ULL << (wordlength << 1ULL); /* reallocate bitmap arrays if necessary */ if (unique_handle->bitmap_size < size) { unique_handle->bitmap = (uint64_t *) xrealloc(unique_handle->bitmap, size >> 3ULL); unique_handle->bitmap_size = size; } memset(unique_handle->bitmap, 0, size >> 3ULL); uint64_t bad = 0; uint64_t kmer = 0; uint64_t const mask = size - 1ULL; char * s = seq; char * e1 = s + wordlength - 1; char * e2 = s + seqlen; e1 = std::min(e2, e1); unsigned int * maskmap = (seqmask != MASK_NONE) ? chrmap_mask_lower : chrmap_mask_ambig; while (s < e1) { bad <<= 2ULL; bad |= maskmap[(int) (*s)]; kmer <<= 2ULL; kmer |= chrmap_2bit[(int) (*s++)]; } int unique = 0; while (s < e2) { bad <<= 2ULL; bad |= maskmap[(int) (*s)]; bad &= mask; kmer <<= 2ULL; kmer |= chrmap_2bit[(int) (*s++)]; kmer &= mask; if (! bad) { uint64_t const x = kmer >> 6ULL; uint64_t const y = 1ULL << (kmer & 63ULL); if (! (unique_handle->bitmap[x] & y)) { /* not seen before */ unique_handle->list[unique++] = kmer; unique_handle->bitmap[x] |= y; } } } *listlen = unique; *list = unique_handle->list; } auto unique_count_hash(struct uhandle_s * unique_handle, int wordlength, int seqlen, char * seq, unsigned int * listlen, unsigned int * * list, int seqmask) -> void { /* if necessary, reallocate hash table and list of unique kmers */ if (unique_handle->alloc < 2 * seqlen) { while (unique_handle->alloc < 2 * seqlen) { unique_handle->alloc *= 2; } unique_handle->hash = (struct bucket_s *) xrealloc(unique_handle->hash, sizeof(struct bucket_s) * unique_handle->alloc); unique_handle->list = (unsigned int *) xrealloc(unique_handle->list, sizeof(unsigned int) * unique_handle->alloc); } /* hashtable variant */ unique_handle->size = 1; while (unique_handle->size < 2 * seqlen) { unique_handle->size *= 2; } unique_handle->hash_mask = unique_handle->size - 1; memset(unique_handle->hash, 0, sizeof(struct bucket_s) * unique_handle->size); uint64_t bad 
= 0; uint64_t j = 0; unsigned int kmer = 0; unsigned int const mask = (1ULL << (2ULL * wordlength)) - 1ULL; char * s = seq; char * e1 = s + wordlength - 1; char * e2 = s + seqlen; e1 = std::min(e2, e1); unsigned int * maskmap = (seqmask != MASK_NONE) ? chrmap_mask_lower : chrmap_mask_ambig; while (s < e1) { bad <<= 2ULL; bad |= maskmap[(int) (*s)]; kmer <<= 2ULL; kmer |= chrmap_2bit[(int) (*s++)]; } uint64_t unique = 0; while (s < e2) { bad <<= 2ULL; bad |= maskmap[(int) (*s)]; bad &= mask; kmer <<= 2ULL; kmer |= chrmap_2bit[(int) (*s++)]; kmer &= mask; if (! bad) { /* find free appropriate bucket in hash */ j = HASH((char *) &kmer, (wordlength + 3) / 4) & unique_handle->hash_mask; while((unique_handle->hash[j].count) && (unique_handle->hash[j].kmer != kmer)) { j = (j + 1) & unique_handle->hash_mask; } if (! (unique_handle->hash[j].count)) { /* not seen before */ unique_handle->list[unique++] = kmer; unique_handle->hash[j].kmer = kmer; unique_handle->hash[j].count = 1; } } } *listlen = unique; *list = unique_handle->list; } auto unique_count(struct uhandle_s * unique_handle, int wordlength, int seqlen, char * seq, unsigned int * listlen, unsigned int * * list, int seqmask) -> void { if (wordlength < 10) { unique_count_bitmap(unique_handle, wordlength, seqlen, seq, listlen, list, seqmask); } else { unique_count_hash(unique_handle, wordlength, seqlen, seq, listlen, list, seqmask); } } auto unique_count_shared(struct uhandle_s * unique_handle, int wordlength, int listlen, unsigned int * list) -> int { /* counts how many of the kmers in list are present in the (already computed) hash or bitmap */ auto count = 0; if (wordlength < 10) { for (auto i = 0; i < listlen; i++) { auto const kmer = list[i]; uint64_t const x = kmer >> 6ULL; uint64_t const y = 1ULL << (kmer & 63ULL); if (unique_handle->bitmap[x] & y) { ++count; } } } else { for (auto i = 0; i < listlen; i++) { auto kmer = list[i]; uint64_t j = HASH((char *) &kmer, (wordlength + 3) / 4) & unique_handle->hash_mask; 
while ((unique_handle->hash[j].count) && (unique_handle->hash[j].kmer != kmer)) { j = (j + 1) & unique_handle->hash_mask; } if (unique_handle->hash[j].count) { ++count; } } } return count; } vsearch-2.30.0/src/unique.h000066400000000000000000000060071476012147200155040ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ struct bucket_s; struct uhandle_s; auto unique_init() -> struct uhandle_s *; auto unique_exit(struct uhandle_s * unique_handle) -> void; auto unique_count(struct uhandle_s * unique_handle, int wordlength, int seqlen, char * seq, unsigned int * listlen, unsigned int * * list, int seqmask) -> void; auto unique_count_shared(struct uhandle_s * unique_handle, int wordlength, int listlen, unsigned int * list) -> int; vsearch-2.30.0/src/userfields.cc000066400000000000000000000110411476012147200164730ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include // uint64_t #include // std::strcmp, std::strchr, std::strlen static const char * userfields_names[] = { "query", // 0 "target", // 1 "evalue", // 2 "id", // 3 "pctpv", "pctgaps", "pairs", "gaps", "qlo", "qhi", "tlo", "thi", "pv", "ql", "tl", "qs", "ts", "alnlen", "opens", "exts", "raw", "bits", "aln", "caln", "qstrand", "tstrand", "qrow", "trow", "qframe", "tframe", "mism", "ids", "qcov", "tcov", // 33 "id0", "id1", "id2", "id3", "id4", // 38 "qilo", // 39 "qihi", "tilo", "tihi", // 42 nullptr }; int * userfields_requested = nullptr; int userfields_requested_count = 0; auto parse_userfields_arg(char * arg) -> int { // Parses the userfields option argument, e.g. query+target+id+alnlen+mism // and returns 1 if it is ok or 0 if not. char * p = arg; char * e = p + strlen(p); // pointer to end of string // refactoring: // auto const userfields_requested_count = std::count(v.cbegin(), v.cend(), '+'); userfields_requested_count = 1; while (p < e) { if (*p++ == '+') { ++userfields_requested_count; } } userfields_requested = (int *) xmalloc(sizeof(int) * (uint64_t) userfields_requested_count); p = arg; char * q = nullptr; int fields = 0; while (true) { q = strchr(p, '+'); if (not q) { q = e; } auto n = (uint64_t) (q - p); char ** u = (char **) userfields_names; while (*u) { if ((strncmp(p, *u, n) == 0) and (strlen(*u) == n)) { break; } ++u; } if (not *u) { // reached end of list -> unrecognized field return 0; // bad argument } int const i = (int) (((const char **) u) - userfields_names); userfields_requested[fields++] = i; p = q; if (p == e) { // reached end of argument return 1; } ++p; } } vsearch-2.30.0/src/userfields.h000066400000000000000000000050461476012147200163450ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ extern int * userfields_requested; extern int userfields_requested_count; auto parse_userfields_arg(char * arg) -> int; vsearch-2.30.0/src/util.cc000066400000000000000000000321271476012147200153130ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "city.h" #include "md5.h" #include "utils/maps.hpp" #include #include // macros PRIu64 and PRId64 #include // ULONG_MAX, RAND_MAX #include // va_list #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::size_t, std::vsnprintf, std::fopen #include // std::exit, EXIT_FAILURE #include // std::strlen, std::strcmp, std::strcpy, std::strchr #include // timeval, gettimeofday #include // std::next #include #include constexpr auto one_hundred_percent = 100UL; constexpr auto nighty_nine_percent = 99UL; static const char * progress_prompt; static uint64_t progress_next; static uint64_t progress_size; static uint64_t progress_pct; static bool progress_show; auto progress_init(const char * prompt, uint64_t size) -> void { progress_show = (isatty(fileno(stderr)) != 0) and (not opt_quiet) and (not opt_no_progress); progress_prompt = prompt; progress_size = size; progress_pct = 0; progress_next = ((progress_pct + 1) * progress_size + nighty_nine_percent) / one_hundred_percent; if (opt_quiet) { return; } std::fprintf(stderr, "%s", prompt); if (not progress_show) { return; } std::fprintf(stderr, " %d%%", 0); } auto progress_update(uint64_t progress) -> void { if ((progress < progress_next) or not progress_show) { return; } if (progress_size == 0) { std::fprintf(stderr, " \r%s 0%%", progress_prompt); return; } progress_pct = one_hundred_percent * progress / progress_size; std::fprintf(stderr, " \r%s %" PRIu64 "%%", progress_prompt, progress_pct); progress_next = ((progress_pct + 1) * progress_size + nighty_nine_percent) / one_hundred_percent; } auto progress_done() -> void { if (opt_quiet) { return; } if (progress_show) { std::fprintf(stderr, " \r%s", progress_prompt); } std::fprintf(stderr, " %ld%%\n", one_hundred_percent); } __attribute__((noreturn)) auto fatal(const char * msg) -> void { std::fprintf(stderr, "\n\n"); std::fprintf(stderr, "Fatal error: %s\n", msg); if (fp_log != nullptr) { std::fprintf(fp_log, 
"\n\n"); std::fprintf(fp_log, "Fatal error: %s\n", msg); } std::exit(EXIT_FAILURE); } __attribute__((noreturn)) auto fatal(const char * format, const char * message) -> void { std::fprintf(stderr, "\n\nFatal error: "); std::fprintf(stderr, format, message); std::fprintf(stderr, "\n"); if (opt_log != nullptr) { std::fprintf(fp_log, "\n\nFatal error: "); std::fprintf(fp_log, format, message); std::fprintf(fp_log, "\n"); } std::exit(EXIT_FAILURE); } auto xstrdup(char const * src) -> char * { auto const len = std::strlen(src); auto * dest = (char *) xmalloc(len + 1); return std::strcpy(dest, src); } auto xstrchrnul(char * str, int target) -> char * { // find the first occurrence to static_cast(target) auto * first_occurrence = std::strchr(str, target); if (first_occurrence != nullptr) { return first_occurrence; } return std::next(str, static_cast(std::strlen(str))); } auto xsprintf(char * * ret, const char * format, ...) -> int { std::va_list args; va_start(args, format); auto len = std::vsnprintf(nullptr, 0, format, args); va_end(args); if (len < 0) { fatal("Error with vsnprintf in xsprintf"); } auto * buffer = (char *) xmalloc(len + 1); va_start(args, format); len = std::vsnprintf(buffer, len + 1, format, args); va_end(args); *ret = buffer; return len; } auto hash_cityhash64(char * sequence, uint64_t length) -> uint64_t { return CityHash64((const char *) sequence, length); } auto hash_cityhash128(char * sequence, uint64_t length) -> uint128 { return CityHash128((const char *) sequence, length); } auto show_rusage() -> void { #ifdef SHOW_RUSAGE static constexpr auto a_megabyte = 1024.0 * 1024.0; double user_time = 0.0; double system_time = 0.0; arch_get_user_system_time(&user_time, &system_time); double const megabytes = arch_get_memused() / a_megabyte; std::fprintf(stderr, "Time: %.3fs (user) %.3fs (sys) Memory: %.0lfMB\n", user_time, system_time, megabytes); if (opt_log) std::fprintf(fp_log, "Time: %.3fs (user) %.3fs (sys) Memory: %.0lfMB\n", user_time, system_time, 
megabytes); #endif } auto reverse_complement(char * rc_seq, char * seq, int64_t len) -> void { /* Write the reverse complementary sequence to rc_seq. The memory for rc_seq must be long enough for the rc_seq of the sequence (identical to the length of seq + 1). */ for (auto i = 0LL; i < len; ++i) { auto const unsigned_char = static_cast(*std::next(seq, len - 1 - i)); auto const complement_char = static_cast(chrmap_complement_vector[unsigned_char]); *std::next(rc_seq, i) = complement_char; } *std::next(rc_seq, len) = '\0'; } auto random_init() -> void { arch_srandom(); } auto random_int(int64_t upper_limit) -> int64_t { /* Generate a random integer in the range 0 to n-1, inclusive. n must be > 0 The random() function returns a random number in the range 0 to 2147483647 (=2^31-1=RAND_MAX), inclusive. We should avoid some of the upper generated numbers to avoid modulo bias. */ assert(upper_limit != 0); int64_t const random_max = RAND_MAX; int64_t const limit = random_max - ((random_max + 1) % upper_limit); auto random_value = static_cast(arch_random()); while (random_value > limit) { random_value = static_cast(arch_random()); } return random_value % upper_limit; } auto random_ulong(uint64_t upper_limit) -> uint64_t { /* Generate a random integer in the range 0 to n-1, inclusive, n must be > 0 */ assert(upper_limit != 0U); static constexpr auto shift_16_bits = 16U; static constexpr auto shift_32_bits = 32U; static constexpr auto shift_48_bits = 48U; auto const random_max = std::numeric_limits::max(); auto const limit = random_max - ((random_max - upper_limit + 1) % upper_limit); auto random_value = ((arch_random() << shift_48_bits) ^ (arch_random() << shift_32_bits) ^ (arch_random() << shift_16_bits) ^ (arch_random())); while (random_value > limit) { random_value = ((arch_random() << shift_48_bits) ^ (arch_random() << shift_32_bits) ^ (arch_random() << shift_16_bits) ^ (arch_random())); } return random_value % upper_limit; } auto string_normalize(char * normalized, char 
* raw_seq, unsigned int len) -> void { /* convert string to upper case and replace U by T */ for (auto i = 0U; i < len; ++i) { auto const unsigned_char = static_cast(*raw_seq); auto const normalized_char = chrmap_normalize_vector[unsigned_char]; *normalized = static_cast(normalized_char); std::advance(normalized, 1); std::advance(raw_seq, 1); } *normalized = '\0'; } auto fprint_hex(std::FILE * output_handle, unsigned char * data, int len) -> void { for (auto i = 0; i < len; ++i) { std::fprintf(output_handle, "%02x", *std::next(data, i)); } } auto SHA1(const unsigned char * data, unsigned long len, unsigned char * digest) -> void { if (digest == nullptr) { fatal("Error in computing SHA1 digest"); } SHA1_CTX a_context; SHA1_Init(&a_context); SHA1_Update(&a_context, data, len); SHA1_Final(&a_context, digest); } auto MD5(void * data, unsigned long len, unsigned char * digest) -> void { if (digest == nullptr) { fatal("Error in computing MD5 digest"); } MD5_CTX a_context; MD5_Init(&a_context); MD5_Update(&a_context, data, len); MD5_Final(digest, &a_context); } static constexpr auto drop_lower_nibble = 4U; static constexpr auto mask_upper_nibble = 15U; static const std::vector hexdigits = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; auto get_hex_seq_digest_sha1(char * hex, char * seq, int seqlen) -> void { /* Save hexadecimal representation of the SHA1 hash of the sequence. The string array digest must be large enough (len_hex_dig_sha1). First normalize string by uppercasing it and replacing U's with T's. 
*/ std::vector normalized(seqlen + 1); string_normalize(normalized.data(), seq, seqlen); std::vector digest(sha1_digest_length); SHA1((const unsigned char *) normalized.data(), static_cast(seqlen), digest.data()); for (auto const & element: digest) { *hex = hexdigits[element >> drop_lower_nibble]; std::advance(hex, 1); *hex = hexdigits[element & mask_upper_nibble]; std::advance(hex, 1); } *hex = '\0'; } auto get_hex_seq_digest_md5(char * hex, char * seq, int seqlen) -> void { /* Save hexadecimal representation of the MD5 hash of the sequence. The string array digest must be large enough (len_hex_dig_md5). First normalize string by uppercasing it and replacing U's with T's. */ std::vector normalized(seqlen + 1); string_normalize(normalized.data(), seq, seqlen); std::vector digest(md5_digest_length); MD5(normalized.data(), static_cast(seqlen), digest.data()); for (auto const & element: digest) { *hex = hexdigits[element >> drop_lower_nibble]; std::advance(hex, 1); *hex = hexdigits[element & mask_upper_nibble]; std::advance(hex, 1); } *hex = '\0'; } auto fprint_seq_digest_sha1(std::FILE * output_handle, char * seq, int seqlen) -> void { std::vector hex_digest(len_hex_dig_sha1); get_hex_seq_digest_sha1(hex_digest.data(), seq, seqlen); std::fprintf(output_handle, "%s", hex_digest.data()); } auto fprint_seq_digest_md5(std::FILE * output_handle, char * seq, int seqlen) -> void { std::vector hex_digest(len_hex_dig_md5); get_hex_seq_digest_md5(hex_digest.data(), seq, seqlen); std::fprintf(output_handle, "%s", hex_digest.data()); } auto fopen_input(const char * filename) -> std::FILE * { /* open the input stream given by filename, but use stdin if name is - */ if (std::strcmp(filename, "-") == 0) { auto const file_descriptor = dup(STDIN_FILENO); if (file_descriptor < 0) { return nullptr; } return fdopen(file_descriptor, "rb"); } return std::fopen(filename, "rb"); } auto fopen_output(const char * filename) -> std::FILE * { /* open the output stream given by filename, but use 
stdout if name is - */ if (std::strcmp(filename, "-") == 0) { auto const file_descriptor = dup(STDOUT_FILENO); if (file_descriptor < 0) { return nullptr; } return fdopen(file_descriptor, "w"); } return std::fopen(filename, "w"); } vsearch-2.30.0/src/util.h000066400000000000000000000154371476012147200151620ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // uint64_t #include // std::FILE, std::size_t #ifndef MIN #define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif #ifndef MAX #define MAX(a,b) ((a) > (b) ? (a) : (b)) #endif constexpr auto md5_digest_length = 16; constexpr auto sha1_digest_length = 20; constexpr auto len_hex_dig_md5 = (2 * md5_digest_length) + 1; constexpr int len_hex_dig_sha1 = (2 * sha1_digest_length) + 1; auto fatal(const char * msg) -> void; auto fatal(const char * format, const char * message) -> void; auto xstrdup(const char * src) -> char *; auto xstrchrnul(char * str, int target) -> char *; auto xsprintf(char * * ret, const char * format, ...) 
-> int; auto hash_cityhash64(char * sequence, uint64_t length) -> uint64_t; auto hash_cityhash128(char * sequence, uint64_t length) -> uint128; auto show_rusage() -> void; auto progress_init(const char * prompt, uint64_t size) -> void; auto progress_update(uint64_t progress) -> void; auto progress_done() -> void; auto random_init() -> void; auto random_int(int64_t upper_limit) -> int64_t; auto random_ulong(uint64_t upper_limit) -> uint64_t; auto string_normalize(char * normalized, char * raw_seq, unsigned int len) -> void; auto reverse_complement(char * rc_seq, char * seq, int64_t len) -> void; auto fprint_hex(std::FILE * output_handle, unsigned char * data, int len) -> void; auto get_hex_seq_digest_sha1(char * hex, char * seq, int seqlen) -> void; auto get_hex_seq_digest_md5(char * hex, char * seq, int seqlen) -> void; auto fprint_seq_digest_sha1(std::FILE * output_handle, char * seq, int seqlen) -> void; auto fprint_seq_digest_md5(std::FILE * output_handle, char * seq, int seqlen) -> void; auto fopen_input(const char * filename) -> std::FILE *; auto fopen_output(const char * filename) -> std::FILE *; inline auto xpthread_attr_init(pthread_attr_t * attr) -> void { if (pthread_attr_init(attr) != 0) { fatal("Unable to init thread attributes"); } } inline auto xpthread_attr_destroy(pthread_attr_t * attr) -> void { if (pthread_attr_destroy(attr) != 0) { fatal("Unable to destroy thread attributes"); } } inline auto xpthread_attr_setdetachstate(pthread_attr_t * attr, int detachstate) -> void { if (pthread_attr_setdetachstate(attr, detachstate) != 0) { fatal("Unable to set thread attributes detach state"); } } inline auto xpthread_create(pthread_t * thread, const pthread_attr_t * attr, void *(*start_routine)(void *), void * arg) -> void { if (pthread_create(thread, attr, start_routine, arg) != 0) { fatal("Unable to create thread"); } } inline auto xpthread_join(pthread_t thread, void ** value_ptr) -> void { if (pthread_join(thread, value_ptr) != 0) { fatal("Unable to 
join thread"); } } inline auto xpthread_mutex_init(pthread_mutex_t * mutex, const pthread_mutexattr_t * attr) -> void { if (pthread_mutex_init(mutex, attr) != 0) { fatal("Unable to init mutex"); } } inline auto xpthread_mutex_destroy(pthread_mutex_t * mutex) -> void { if (pthread_mutex_destroy(mutex) != 0) { fatal("Unable to destroy mutex"); } } inline auto xpthread_mutex_lock(pthread_mutex_t * mutex) -> void { if (pthread_mutex_lock(mutex) != 0) { fatal("Unable to lock mutex"); } } inline auto xpthread_mutex_unlock(pthread_mutex_t * mutex) -> void { if (pthread_mutex_unlock(mutex) != 0) { fatal("Unable to unlock mutex"); } } inline auto xpthread_cond_init(pthread_cond_t * cond, const pthread_condattr_t * attr) -> void { if (pthread_cond_init(cond, attr) != 0) { fatal("Unable to init condition variable"); } } inline auto xpthread_cond_destroy(pthread_cond_t * cond) -> void { if (pthread_cond_destroy(cond) != 0) { fatal("Unable to destroy condition variable"); } } inline auto xpthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) -> void { if (pthread_cond_wait(cond, mutex) != 0) { fatal("Unable to wait on condition variable"); } } inline auto xpthread_cond_signal(pthread_cond_t * cond) -> void { if (pthread_cond_signal(cond) != 0) { fatal("Unable to signal condition variable"); } } inline auto xpthread_cond_broadcast(pthread_cond_t * cond) -> void { if (pthread_cond_broadcast(cond) != 0) { fatal("Unable to broadcast condition variable"); } } vsearch-2.30.0/src/utils/000077500000000000000000000000001476012147200151625ustar00rootroot00000000000000vsearch-2.30.0/src/utils/maps.cpp000066400000000000000000000236311476012147200166330ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "maps.hpp" #include const std::vector chrmap_no_change_vector = { /* Map from ascii to ascii - no change @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'N', 'N', 'N', 'N', 'N', 'N', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; const std::vector 
chrmap_4bit_vector = { /* Map from ascii to 4-bit nucleotide code Aa: 1 0001 Bb: 14 1110 ex: 'B' & 'A' == 0000 while 'B' & anyother != 0000 Cc: 2 0010 Dd: 13 1101 Gg: 4 0100 Hh: 11 1011 Kk: 12 1100 Mm: 3 0011 Nn: 15 1111 ex: 'N' & any != 0000 Rr: 5 0101 Ss: 6 0110 Tt: 8 1000 Uu: 8 1000 Vv: 7 0111 Ww: 9 1001 Yy: 10 1010 ex: 'Y' & 'C' or 'T' == 0000 Others: 0 @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 0, 0, 0, 5, 6, 8, 8, 7, 9, 0, 10, 0, 0, 0, 0, 0, 0, 0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 0, 0, 0, 5, 6, 8, 8, 7, 9, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; const std::vector chrmap_complement_vector = { /* Map from ascii to ascii, complementary nucleotide @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','T','V','G','H','N','N','C','D','N','N','M','N','K','N','N', 'N','N','Y','S','A','A','B','W','N','R','N','N','N','N','N','N', 'N','t','v','g','h','N','N','c','d','N','N','m','N','k','n','N', 'N','N','y','s','a','a','b','w','N','r','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 
'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N' }; const std::vector chrmap_normalize_vector = { /* Map from ascii to ascii Convert to upper case nucleotide, and replace U by T @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','A','B','C','D','N','N','G','H','N','N','K','N','M','N','N', 'N','N','R','S','T','T','V','W','N','Y','N','N','N','N','N','N', 'N','A','B','C','D','N','N','G','H','N','N','K','N','M','N','N', 'N','N','R','S','T','T','V','W','N','Y','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N' }; const std::vector chrmap_upcase_vector = { /* Map from ascii to ascii Convert to upper case nucleotide @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 
'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O', 'P','Q','R','S','T','U','V','W','X','Y','Z','N','N','N','N','N', 'N','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O', 'P','Q','R','S','T','U','V','W','X','Y','Z','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N' }; vsearch-2.30.0/src/utils/maps.hpp000066400000000000000000000054721476012147200166430ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef MAPS_HPP #define MAPS_HPP #include extern const std::vector chrmap_no_change_vector; extern const std::vector chrmap_4bit_vector; extern const std::vector chrmap_complement_vector; extern const std::vector chrmap_normalize_vector; extern const std::vector chrmap_upcase_vector; #endif // MAPS_HPP vsearch-2.30.0/src/utils/seqcmp.cc000066400000000000000000000057311476012147200167670ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "maps.hpp" #include auto seqcmp(char * lhs, char * rhs, int length) -> int { if (length <= 0) { return 0; } while ((length-- > 0) and (chrmap_4bit_vector[static_cast(*lhs)] == chrmap_4bit_vector[static_cast(*rhs)])) { if ((length == 0) or (*lhs == '\0') or (*rhs == '\0')) { break; } std::advance(lhs, 1); std::advance(rhs, 1); } return static_cast(chrmap_4bit_vector[static_cast(*lhs)] - chrmap_4bit_vector[static_cast(*rhs)]); } vsearch-2.30.0/src/utils/seqcmp.h000066400000000000000000000047451476012147200166350ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto seqcmp(char * lhs, char * rhs, int length) -> int; vsearch-2.30.0/src/vsearch.cc000066400000000000000000005533671476012147200160070ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "allpairs.h" #include "chimera.h" #include "cluster.h" #include "cut.h" #include "derep.h" #include "derep_prefix.h" #include "derep_smallmem.h" #include "dynlibs.h" #include "eestats.h" #include "fasta2fastq.h" #include "fastq_chars.h" #include "fastq_join.h" #include "fastqops.h" #include "filter.h" #include "getseq.h" #include "mask.h" #include "mergepairs.h" #include "orient.h" #include "rereplicate.h" #include "search.h" #include "search_exact.h" #include "sff_convert.h" #include "shuffle.h" #include "sintax.h" #include "sortbylength.h" #include "sortbysize.h" #include "subsample.h" #include "udb.h" #include "userfields.h" #include // macros PRIu64 and PRId64 #include // std::floor #include // std::strftime, std::localtime, std::time, std::time_t, std::tm, std::difftime #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::size_t, std::sscanf, std::fclose, std::snprintf, std::printf, std::strcat #include // std::exit, EXIT_FAILURE #include // std::strlen, std::memset #include // getopt_long_only, optarg, optind, opterr, struct // option (no_argument, required_argument) #include #include // strcasecmp #include /* options */ bool opt_bzip2_decompress = false; bool opt_clusterout_id = false; bool opt_clusterout_sort = false; bool opt_eeout; bool opt_fasta_score; bool opt_fastq_allowmergestagger; bool opt_fastq_eeout; bool opt_fastq_nostagger; bool opt_gzip_decompress; bool opt_label_substr_match; bool 
opt_lengthout; bool opt_n_mismatch; bool opt_no_progress; bool opt_quiet; bool opt_relabel_keep; bool opt_relabel_md5; bool opt_relabel_self; bool opt_relabel_sha1; bool opt_samheader; bool opt_sff_clip; bool opt_sintax_random; bool opt_sizein; bool opt_sizeorder; bool opt_sizeout; bool opt_xee; bool opt_xlength; bool opt_xsize; char * opt_allpairs_global; char * opt_alnout; char * opt_biomout; char * opt_blast6out; char * opt_borderline; char * opt_centroids; char * opt_chimeras; char * opt_chimeras_alnout; char * opt_chimeras_denovo; char * opt_cluster_fast; char * opt_cluster_size; char * opt_cluster_smallmem; char * opt_cluster_unoise; char * opt_clusters; char * opt_consout; char * opt_db; char * opt_dbmatched; char * opt_dbnotmatched; char * opt_eetabbedout; char * opt_fastaout; char * opt_fastaout_discarded; char * opt_fastaout_discarded_rev; char * opt_fastaout_notmerged_fwd; char * opt_fastaout_notmerged_rev; char * opt_fastaout_rev; char * opt_fastapairs; char * opt_fastq_convert; char * opt_fastq_eestats2; char * opt_fastq_eestats; char * opt_fastq_filter; char * opt_fastq_mergepairs; char * opt_fastq_stats; char * opt_fastqout; char * opt_fastqout_discarded; char * opt_fastqout_discarded_rev; char * opt_fastqout_notmerged_fwd; char * opt_fastqout_notmerged_rev; char * opt_fastqout_rev; char * opt_fastx_filter; char * opt_fastx_getseq; char * opt_fastx_getseqs; char * opt_fastx_getsubseq; char * opt_fastx_mask; char * opt_fastx_revcomp; char * opt_label; char * opt_label_field; char * opt_label_suffix; char * opt_label_word; char * opt_label_words; char * opt_labels; char * opt_lcaout; char * opt_log; char * opt_makeudb_usearch; char * opt_maskfasta; char * opt_matched; char * opt_mothur_shared_out; char * opt_msaout; char * opt_nonchimeras; char * opt_notmatched; char * opt_notmatchedfq; char * opt_orient; char * opt_otutabout; char * opt_output; char * opt_pattern; char * opt_profile; char * opt_qsegout; char * opt_relabel; char * opt_reverse; char * 
opt_samout; char * opt_sample; char * opt_search_exact; char * opt_sff_convert; char * opt_sintax; char * opt_tabbedout; char * opt_tsegout; char * opt_uc; char * opt_uchime2_denovo; char * opt_uchime3_denovo; char * opt_uchime_denovo; char * opt_uchime_ref; char * opt_uchimealns; char * opt_uchimeout; char * opt_udb2fasta; char * opt_udbinfo; char * opt_udbstats; char * opt_usearch_global; char * opt_userout; double * opt_ee_cutoffs_values; double opt_abskew; double opt_chimeras_diff_pct; double opt_dn; double opt_fastq_maxdiffpct; double opt_fastq_maxee; double opt_fastq_maxee_rate; double opt_fastq_truncee; double opt_fastq_truncee_rate; double opt_id; double opt_lca_cutoff; double opt_max_unmasked_pct; double opt_maxid; double opt_maxqt; double opt_maxsizeratio; double opt_maxsl; double opt_mid; double opt_min_unmasked_pct; double opt_mindiv; double opt_minh; double opt_minqt; double opt_minsizeratio; double opt_minsl; double opt_query_cov; double opt_sample_pct; double opt_sintax_cutoff; double opt_target_cov; double opt_unoise_alpha; double opt_weak_id; double opt_xn; int opt_acceptall; int opt_alignwidth; int opt_chimeras_length_min; int opt_chimeras_parents_max; int opt_chimeras_parts; int opt_cons_truncate; int opt_ee_cutoffs_count; int opt_gap_extension_query_interior; int opt_gap_extension_query_left; int opt_gap_extension_query_right; int opt_gap_extension_target_interior; int opt_gap_extension_target_left; int opt_gap_extension_target_right; int opt_gap_open_query_interior; int opt_gap_open_query_left; int opt_gap_open_query_right; int opt_gap_open_target_interior; int opt_gap_open_target_left; int opt_gap_open_target_right; int opt_length_cutoffs_increment; int opt_length_cutoffs_longest; int opt_length_cutoffs_shortest; int opt_mindiffs; int opt_slots; int opt_uchimeout5; int opt_usersort; int64_t opt_dbmask; int64_t opt_fasta_width; int64_t opt_fastq_ascii; int64_t opt_fastq_asciiout; int64_t opt_fastq_maxdiffs; int64_t opt_fastq_maxlen; int64_t 
opt_fastq_maxmergelen; int64_t opt_fastq_maxns; int64_t opt_fastq_minlen; int64_t opt_fastq_minmergelen; int64_t opt_fastq_minovlen; int64_t opt_fastq_minqual; int64_t opt_fastq_qmax; int64_t opt_fastq_qmaxout; int64_t opt_fastq_qmin; int64_t opt_fastq_qminout; int64_t opt_fastq_stripleft; int64_t opt_fastq_stripright; int64_t opt_fastq_trunclen; int64_t opt_fastq_trunclen_keep; int64_t opt_fastq_truncqual; int64_t opt_fulldp; int64_t opt_hardmask; int64_t opt_iddef; int64_t opt_idprefix; int64_t opt_idsuffix; int64_t opt_leftjust; int64_t opt_match; int64_t opt_maxaccepts; int64_t opt_maxdiffs; int64_t opt_maxgaps; int64_t opt_maxhits; int64_t opt_maxqsize; int64_t opt_maxrejects; int64_t opt_maxseqlength; int64_t opt_maxsize; int64_t opt_maxsubs; int64_t opt_maxuniquesize; int64_t opt_mincols; int64_t opt_minseqlength; int64_t opt_minsize; int64_t opt_mintsize; int64_t opt_minuniquesize; int64_t opt_minwordmatches; int64_t opt_mismatch; int64_t opt_notrunclabels; int64_t opt_output_no_hits; int64_t opt_qmask; int64_t opt_randseed; int64_t opt_rightjust; int64_t opt_rowlen; int64_t opt_sample_size; int64_t opt_self; int64_t opt_selfid; int64_t opt_strand; int64_t opt_subseq_end; int64_t opt_subseq_start; int64_t opt_threads; int64_t opt_top_hits_only; int64_t opt_topn; int64_t opt_uc_allhits; int64_t opt_wordlength; /* Other variables */ /* cpu features available */ int64_t altivec_present = 0; int64_t neon_present = 0; int64_t mmx_present = 0; int64_t sse_present = 0; int64_t sse2_present = 0; int64_t sse3_present = 0; int64_t ssse3_present = 0; int64_t sse41_present = 0; int64_t sse42_present = 0; int64_t popcnt_present = 0; int64_t avx_present = 0; int64_t avx2_present = 0; static char progheader[80]; // static constexpr auto max_line_length = std::size_t{80}; static char * cmdline; static time_t time_start; static time_t time_finish; std::FILE * fp_log = nullptr; char * STDIN_NAME = (char *) "/dev/stdin"; char * STDOUT_NAME = (char *) "/dev/stdout"; #ifdef 
__x86_64__ #define cpuid(f1, f2, a, b, c, d) \ __asm__ __volatile__ ("cpuid" \ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ : "a" (f1), "c" (f2)); #endif auto cpu_features_detect() -> void { #ifdef __aarch64__ #ifdef __ARM_NEON /* may check /proc/cpuinfo for asimd or neon */ neon_present = 1; #else #error ARM Neon not present #endif #elif __PPC__ altivec_present = 1; #elif __x86_64__ unsigned int a = 0; unsigned int b = 0; unsigned int c = 0; unsigned int d = 0; cpuid(0, 0, a, b, c, d); unsigned int const maxlevel = a & 0xff; if (maxlevel >= 1) { cpuid(1, 0, a, b, c, d); mmx_present = (d >> 23U) & 1U; sse_present = (d >> 25U) & 1U; sse2_present = (d >> 26U) & 1U; sse3_present = (c >> 0U) & 1U; ssse3_present = (c >> 9U) & 1U; sse41_present = (c >> 19U) & 1U; sse42_present = (c >> 20U) & 1U; popcnt_present = (c >> 23U) & 1U; avx_present = (c >> 28U) & 1U; if (maxlevel >= 7) { cpuid(7, 0, a, b, c, d); avx2_present = (b >> 5U) & 1U; } } #else // simde #endif } auto cpu_features_show() -> void { fprintf(stderr, "CPU features:"); if (neon_present) { fprintf(stderr, " neon"); } if (altivec_present) { fprintf(stderr, " altivec"); } if (mmx_present) { fprintf(stderr, " mmx"); } if (sse_present) { fprintf(stderr, " sse"); } if (sse2_present) { fprintf(stderr, " sse2"); } if (sse3_present) { fprintf(stderr, " sse3"); } if (ssse3_present) { fprintf(stderr, " ssse3"); } if (sse41_present) { fprintf(stderr, " sse4.1"); } if (sse42_present) { fprintf(stderr, " sse4.2"); } if (popcnt_present) { fprintf(stderr, " popcnt"); } if (avx_present) { fprintf(stderr, " avx"); } if (avx2_present) { fprintf(stderr, " avx2"); } fprintf(stderr, "\n"); } auto args_get_ee_cutoffs(char * arg) -> void { /* get comma-separated list of floating point numbers */ /* save in ee_cutoffs_count and ee_cutoffs_values */ int commas = 0; for (size_t i = 0; i < strlen(arg); i++) { if (arg[i] == ',') { commas++; } } opt_ee_cutoffs_count = 0; opt_ee_cutoffs_values = (double *) xrealloc(opt_ee_cutoffs_values, 
(commas + 1) * sizeof(double)); char * s = arg; while (true) { double val = 0; int skip = 0; if ((sscanf(s, "%lf%n", &val, &skip) != 1) or (val <= 0.0)) { fatal("Invalid arguments to ee_cutoffs"); } opt_ee_cutoffs_values[opt_ee_cutoffs_count++] = val; s += skip; if (*s == ',') { s++; } else if (*s == 0) { break; } else { fatal("Invalid arguments to ee_cutoffs"); } } } auto args_get_length_cutoffs(char * arg) -> void { /* get comma-separated list of 3 integers: */ /* smallest, largest and increment. */ /* second value may be * indicating no limit */ /* save in length_cutoffs_{smallest,largest,increment} */ int skip = 0; if (sscanf(arg, "%d,%d,%d%n", &opt_length_cutoffs_shortest, &opt_length_cutoffs_longest, &opt_length_cutoffs_increment, & skip) == 3) { if ((size_t) skip < strlen(arg)) { fatal("Invalid arguments to length_cutoffs"); } } else if (sscanf(arg, "%d,*,%d%n", &opt_length_cutoffs_shortest, &opt_length_cutoffs_increment, &skip) == 2) { if ((size_t) skip < strlen(arg)) { fatal("Invalid arguments to length_cutoffs"); } opt_length_cutoffs_longest = std::numeric_limits::max(); } else { fatal("Invalid arguments to length_cutoffs"); } if ((opt_length_cutoffs_shortest < 1) or (opt_length_cutoffs_shortest > opt_length_cutoffs_longest) or (opt_length_cutoffs_increment < 1)) { fatal("Invalid arguments to length_cutoffs"); } } auto args_get_gap_penalty_string(char * arg, int is_open) -> void { /* See http://www.drive5.com/usearch/manual/aln_params.html --gapopen *E/10I/1E/2L/3RQ/4RT/1IQ --gapext *E/10I/1E/2L/3RQ/4RT/1IQ integer or * followed by I, E, L, R, Q or T characters separated by / * means infinitely high (disallow) E=end I=interior L=left R=right Q=query T=target E cannot be combined with L or R We do not support floating point values. Therefore, all default score and penalties are multiplied by 2. 
*/ char * p = arg; while (*p) { int skip = 0; int pen = 0; if (sscanf(p, "%d%n", &pen, &skip) == 1) { p += skip; } else if (*p == '*') { pen = 1000; p++; } else { fatal("Invalid gap penalty argument (%s)", p); } char * q = p; int set_E = 0; int set_I = 0; int set_L = 0; int set_R = 0; int set_Q = 0; int set_T = 0; while ((*p) and (*p != '/')) { switch(*p) { case 'E': set_E = 1; break; case 'I': set_I = 1; break; case 'L': set_L = 1; break; case 'R': set_R = 1; break; case 'Q': set_Q = 1; break; case 'T': set_T = 1; break; default: fatal("Invalid char '%.1s' in gap penalty string", p); break; } p++; } if (*p == '/') { p++; } if (set_E and (set_L or set_R)) { fatal("Invalid gap penalty string (E and L or R) '%s'", q); } if (set_E) { set_L = 1; set_R = 1; } /* if neither L, I, R nor E is specified, it applies to all */ if ((not set_L) and (not set_I) and (not set_R)) { set_L = 1; set_I = 1; set_R = 1; } /* if neither Q nor T is specified, it applies to both */ if ((not set_Q) and (not set_T)) { set_Q = 1; set_T = 1; } if (is_open) { if (set_Q) { if (set_L) { opt_gap_open_query_left = pen; } if (set_I) { opt_gap_open_query_interior = pen; } if (set_R) { opt_gap_open_query_right = pen; } } if (set_T) { if (set_L) { opt_gap_open_target_left = pen; } if (set_I) { opt_gap_open_target_interior = pen; } if (set_R) { opt_gap_open_target_right = pen; } } } else { if (set_Q) { if (set_L) { opt_gap_extension_query_left = pen; } if (set_I) { opt_gap_extension_query_interior = pen; } if (set_R) { opt_gap_extension_query_right = pen; } } if (set_T) { if (set_L) { opt_gap_extension_target_left = pen; } if (set_I) { opt_gap_extension_target_interior = pen; } if (set_R) { opt_gap_extension_target_right = pen; } } } } } auto args_getlong(char * arg) -> int64_t { int len = 0; int64_t temp = 0; const int ret = sscanf(arg, "%" PRId64 "%n", &temp, &len); if ((ret == 0) or (((unsigned int) (len)) < strlen(arg))) { fatal("Illegal option argument"); } return temp; } auto args_getdouble(char * 
arg) -> double { int len = 0; double temp = 0; const int ret = sscanf(arg, "%lf%n", &temp, &len); if ((ret == 0) or (((unsigned int)(len)) < strlen(arg))) { fatal("Illegal option argument"); } return temp; } auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void { /* Set defaults */ static constexpr auto dbl_max = std::numeric_limits::max(); static constexpr auto int_max = std::numeric_limits::max(); static constexpr auto long_min = std::numeric_limits::min(); parameters.progname = argv[0]; opt_abskew = 0.0; opt_acceptall = 0; opt_alignwidth = 80; opt_allpairs_global = nullptr; opt_alnout = nullptr; opt_biomout = nullptr; opt_blast6out = nullptr; opt_borderline = nullptr; opt_centroids = nullptr; opt_chimeras = nullptr; opt_chimeras_denovo = nullptr; opt_chimeras_diff_pct = 0.0; opt_chimeras_length_min = 10; opt_chimeras_parents_max = 3; opt_chimeras_parts = 0; opt_cluster_fast = nullptr; opt_cluster_size = nullptr; opt_cluster_smallmem = nullptr; opt_cluster_unoise = nullptr; opt_clusters = nullptr; opt_cons_truncate = 0; opt_consout = nullptr; opt_db = nullptr; opt_dbmask = MASK_DUST; opt_dbmatched = nullptr; opt_dbnotmatched = nullptr; opt_dn = 1.4; opt_ee_cutoffs_count = 3; opt_ee_cutoffs_values = (double *) xmalloc(opt_ee_cutoffs_count * sizeof(double)); opt_ee_cutoffs_values[0] = 0.5; opt_ee_cutoffs_values[1] = 1.0; opt_ee_cutoffs_values[2] = 2.0; opt_eeout = false; opt_eetabbedout = nullptr; opt_fasta_score = false; opt_fasta_width = 80; opt_fastaout = nullptr; opt_fastaout_discarded = nullptr; opt_fastaout_discarded_rev = nullptr; opt_fastaout_notmerged_fwd = nullptr; opt_fastaout_notmerged_rev = nullptr; opt_fastaout_rev = nullptr; opt_fastapairs = nullptr; opt_fastq_allowmergestagger = false; opt_fastq_ascii = 33; opt_fastq_asciiout = 33; opt_fastq_convert = nullptr; opt_fastq_eeout = false; opt_fastq_eestats = nullptr; opt_fastq_eestats2 = nullptr; opt_fastq_filter = nullptr; opt_fastq_maxdiffpct = 100.0; opt_fastq_maxdiffs = 10; 
opt_fastq_maxee = dbl_max; opt_fastq_maxee_rate = dbl_max; opt_fastq_maxlen = int64_max; opt_fastq_maxmergelen = 1000000; opt_fastq_maxns = int64_max; opt_fastq_mergepairs = nullptr; opt_fastq_minlen = 1; opt_fastq_minmergelen = 0; opt_fastq_minovlen = 10; opt_fastq_minqual = 0; opt_fastq_nostagger = true; opt_fastq_qmax = 41; opt_fastq_qmaxout = 41; opt_fastq_qmin = 0; opt_fastq_qminout = 0; opt_fastq_stats = nullptr; opt_fastq_stripleft = 0; opt_fastq_stripright = 0; opt_fastq_truncee = dbl_max; opt_fastq_truncee_rate = dbl_max; opt_fastq_trunclen = -1; opt_fastq_trunclen_keep = -1; opt_fastq_truncqual = long_min; opt_fastqout = nullptr; opt_fastqout_discarded = nullptr; opt_fastqout_discarded_rev = nullptr; opt_fastqout_notmerged_fwd = nullptr; opt_fastqout_notmerged_rev = nullptr; opt_fastqout_rev = nullptr; opt_fastx_filter = nullptr; opt_fastx_getseq = nullptr; opt_fastx_getseqs = nullptr; opt_fastx_getsubseq = nullptr; opt_fastx_mask = nullptr; opt_fastx_revcomp = nullptr; opt_fulldp = 0; opt_gap_extension_query_interior=2; opt_gap_extension_query_left=1; opt_gap_extension_query_right=1; opt_gap_extension_target_interior=2; opt_gap_extension_target_left=1; opt_gap_extension_target_right=1; opt_gap_open_query_interior=20; opt_gap_open_query_left=2; opt_gap_open_query_right=2; opt_gap_open_target_interior=20; opt_gap_open_target_left=2; opt_gap_open_target_right=2; opt_gzip_decompress = false; opt_hardmask = 0; opt_id = -1.0; opt_iddef = 2; opt_idprefix = 0; opt_idsuffix = 0; opt_label = nullptr; opt_label_field = nullptr; opt_label_substr_match = false; opt_label_suffix = nullptr; opt_label_word = nullptr; opt_label_words = nullptr; opt_labels = nullptr; opt_lca_cutoff = 1.0; opt_lcaout = nullptr; opt_leftjust = 0; opt_length_cutoffs_increment = 50; opt_length_cutoffs_longest = int_max; opt_length_cutoffs_shortest = 50; opt_lengthout = false; opt_log = nullptr; opt_makeudb_usearch = nullptr; opt_maskfasta = nullptr; opt_match = 2; opt_matched = nullptr; 
opt_max_unmasked_pct = 100.0; opt_maxaccepts = 1; opt_maxdiffs = int_max; opt_maxgaps = int_max; opt_maxhits = 0; opt_maxid = 1.0; opt_maxqsize = int_max; opt_maxqt = dbl_max; opt_maxrejects = -1; opt_maxseqlength = default_maxseqlength; opt_maxsize = int64_max; opt_maxsizeratio = dbl_max; opt_maxsl = dbl_max; opt_maxsubs = int_max; opt_maxuniquesize = int64_max; opt_mid = 0.0; opt_min_unmasked_pct = 0.0; opt_mincols = 0; opt_mindiffs = 3; opt_mindiv = 0.8; opt_minh = 0.28; opt_minqt = 0.0; opt_minseqlength = -1; opt_minsize = 0; opt_minsizeratio = 0.0; opt_minsl = 0.0; opt_mintsize = 0; opt_minuniquesize = 1; opt_minwordmatches = -1; opt_mismatch = -4; opt_mothur_shared_out = nullptr; opt_msaout = nullptr; opt_n_mismatch = false; opt_no_progress = false; opt_nonchimeras = nullptr; opt_notmatched = nullptr; opt_notmatched = nullptr; opt_notrunclabels = 0; opt_orient = nullptr; opt_otutabout = nullptr; opt_output = nullptr; opt_output_no_hits = 0; opt_pattern = nullptr; opt_profile = nullptr; opt_qmask = MASK_DUST; opt_qsegout = nullptr; opt_query_cov = 0.0; opt_quiet = false; opt_randseed = 0; opt_relabel = nullptr; opt_relabel_keep = false; opt_relabel_md5 = false; opt_relabel_self = false; opt_relabel_sha1 = false; opt_reverse = nullptr; opt_rightjust = 0; opt_rowlen = 64; opt_samheader = false; opt_samout = nullptr; opt_sample = nullptr; opt_sample_pct = 0; opt_sample_size = 0; opt_search_exact = nullptr; opt_self = 0; opt_selfid = 0; opt_sff_clip = false; opt_sff_convert = nullptr; opt_sintax = nullptr; opt_sintax_cutoff = 0.0; opt_sintax_random = false; opt_sizein = false; opt_sizeorder = false; opt_sizeout = false; opt_slots = 0; opt_strand = 1; opt_subseq_end = int64_max; opt_subseq_start = 1; opt_tabbedout = nullptr; opt_target_cov = 0.0; opt_threads = 0; opt_top_hits_only = 0; opt_topn = int64_max; opt_tsegout = nullptr; opt_uc = nullptr; opt_uc_allhits = 0; opt_uchime2_denovo = nullptr; opt_uchime3_denovo = nullptr; opt_uchime_denovo = nullptr; 
opt_uchime_ref = nullptr; opt_uchimealns = nullptr; opt_uchimeout = nullptr; opt_uchimeout5 = 0; opt_udb2fasta = nullptr; opt_udbinfo = nullptr; opt_udbstats = nullptr; opt_unoise_alpha = 2.0; opt_usearch_global = nullptr; opt_userout = nullptr; opt_usersort = 0; opt_weak_id = 10.0; opt_wordlength = 0; opt_xee = false; opt_xlength = false; opt_xn = 8.0; opt_xsize = false; opterr = 1; enum { option_abskew, option_acceptall, option_alignwidth, option_allpairs_global, option_alnout, option_band, option_biomout, option_blast6out, option_borderline, option_bzip2_decompress, option_centroids, option_chimeras, option_chimeras_denovo, option_chimeras_diff_pct, option_chimeras_length_min, option_chimeras_parents_max, option_chimeras_parts, option_cluster_fast, option_cluster_size, option_cluster_smallmem, option_cluster_unoise, option_clusterout_id, option_clusterout_sort, option_clusters, option_cons_truncate, option_consout, option_cut, option_cut_pattern, option_db, option_dbmask, option_dbmatched, option_dbnotmatched, option_derep_fulllength, option_derep_id, option_derep_prefix, option_derep_smallmem, option_dn, option_ee_cutoffs, option_eeout, option_eetabbedout, option_fasta2fastq, option_fasta_score, option_fasta_width, option_fastaout, option_fastaout_discarded, option_fastaout_discarded_rev, option_fastaout_notmerged_fwd, option_fastaout_notmerged_rev, option_fastaout_rev, option_fastapairs, option_fastq_allowmergestagger, option_fastq_ascii, option_fastq_asciiout, option_fastq_chars, option_fastq_convert, option_fastq_eeout, option_fastq_eestats, option_fastq_eestats2, option_fastq_filter, option_fastq_join, option_fastq_maxdiffpct, option_fastq_maxdiffs, option_fastq_maxee, option_fastq_maxee_rate, option_fastq_maxlen, option_fastq_maxmergelen, option_fastq_maxns, option_fastq_mergepairs, option_fastq_minlen, option_fastq_minmergelen, option_fastq_minovlen, option_fastq_minqual, option_fastq_nostagger, option_fastq_qmax, option_fastq_qmaxout, option_fastq_qmin, 
option_fastq_qminout, option_fastq_qout_max, option_fastq_stats, option_fastq_stripleft, option_fastq_stripright, option_fastq_tail, option_fastq_truncee, option_fastq_truncee_rate, option_fastq_trunclen, option_fastq_trunclen_keep, option_fastq_truncqual, option_fastqout, option_fastqout_discarded, option_fastqout_discarded_rev, option_fastqout_notmerged_fwd, option_fastqout_notmerged_rev, option_fastqout_rev, option_fastx_filter, option_fastx_getseq, option_fastx_getseqs, option_fastx_getsubseq, option_fastx_mask, option_fastx_revcomp, option_fastx_subsample, option_fastx_uniques, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_h, option_hardmask, option_help, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_join_padgap, option_join_padgapq, option_label, option_label_field, option_label_substr_match, option_label_suffix, option_label_word, option_label_words, option_labels, option_lca_cutoff, option_lcaout, option_leftjust, option_length_cutoffs, option_lengthout, option_log, option_makeudb_usearch, option_maskfasta, option_match, option_matched, option_max_unmasked_pct, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsize, option_maxsizeratio, option_maxsl, option_maxsubs, option_maxuniquesize, option_mid, option_min_unmasked_pct, option_mincols, option_mindiffs, option_mindiv, option_minh, option_minhsp, option_minqt, option_minseqlength, option_minsize, option_minsizeratio, option_minsl, option_mintsize, option_minuniquesize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_msaout, option_n_mismatch, option_no_progress, option_nonchimeras, option_notmatched, option_notmatchedfq, option_notrunclabels, option_orient, option_otutabout, option_output, option_output_no_hits, option_pattern, option_profile, option_qmask, option_qsegout, option_query_cov, option_quiet, 
option_randseed, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rereplicate, option_reverse, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_sample_pct, option_sample_size, option_search_exact, option_self, option_selfid, option_sff_clip, option_sff_convert, option_shuffle, option_sintax, option_sintax_cutoff, option_sintax_random, option_sizein, option_sizeorder, option_sizeout, option_slots, option_sortbylength, option_sortbysize, option_strand, option_subseq_end, option_subseq_start, option_tabbedout, option_target_cov, option_threads, option_top_hits_only, option_topn, option_tsegout, option_uc, option_uc_allhits, option_uchime2_denovo, option_uchime3_denovo, option_uchime_denovo, option_uchime_ref, option_uchimealns, option_uchimeout, option_uchimeout5, option_udb2fasta, option_udbinfo, option_udbstats, option_unoise_alpha, option_usearch_global, option_userfields, option_userout, option_usersort, option_v, option_version, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xn, option_xsize }; static struct option long_options[] = { {"abskew", required_argument, nullptr, 0 }, {"acceptall", no_argument, nullptr, 0 }, {"alignwidth", required_argument, nullptr, 0 }, {"allpairs_global", required_argument, nullptr, 0 }, {"alnout", required_argument, nullptr, 0 }, {"band", required_argument, nullptr, 0 }, {"biomout", required_argument, nullptr, 0 }, {"blast6out", required_argument, nullptr, 0 }, {"borderline", required_argument, nullptr, 0 }, {"bzip2_decompress", no_argument, nullptr, 0 }, {"centroids", required_argument, nullptr, 0 }, {"chimeras", required_argument, nullptr, 0 }, {"chimeras_denovo", required_argument, nullptr, 0 }, {"chimeras_diff_pct", required_argument, nullptr, 0 }, {"chimeras_length_min", required_argument, nullptr, 0 }, {"chimeras_parents_max", required_argument, nullptr, 0 }, {"chimeras_parts", 
required_argument, nullptr, 0 }, {"cluster_fast", required_argument, nullptr, 0 }, {"cluster_size", required_argument, nullptr, 0 }, {"cluster_smallmem", required_argument, nullptr, 0 }, {"cluster_unoise", required_argument, nullptr, 0 }, {"clusterout_id", no_argument, nullptr, 0 }, {"clusterout_sort", no_argument, nullptr, 0 }, {"clusters", required_argument, nullptr, 0 }, {"cons_truncate", no_argument, nullptr, 0 }, {"consout", required_argument, nullptr, 0 }, {"cut", required_argument, nullptr, 0 }, {"cut_pattern", required_argument, nullptr, 0 }, {"db", required_argument, nullptr, 0 }, {"dbmask", required_argument, nullptr, 0 }, {"dbmatched", required_argument, nullptr, 0 }, {"dbnotmatched", required_argument, nullptr, 0 }, {"derep_fulllength", required_argument, nullptr, 0 }, {"derep_id", required_argument, nullptr, 0 }, {"derep_prefix", required_argument, nullptr, 0 }, {"derep_smallmem", required_argument, nullptr, 0 }, {"dn", required_argument, nullptr, 0 }, {"ee_cutoffs", required_argument, nullptr, 0 }, {"eeout", no_argument, nullptr, 0 }, {"eetabbedout", required_argument, nullptr, 0 }, {"fasta2fastq", required_argument, nullptr, 0 }, {"fasta_score", no_argument, nullptr, 0 }, {"fasta_width", required_argument, nullptr, 0 }, {"fastaout", required_argument, nullptr, 0 }, {"fastaout_discarded", required_argument, nullptr, 0 }, {"fastaout_discarded_rev",required_argument, nullptr, 0 }, {"fastaout_notmerged_fwd",required_argument, nullptr, 0 }, {"fastaout_notmerged_rev",required_argument, nullptr, 0 }, {"fastaout_rev", required_argument, nullptr, 0 }, {"fastapairs", required_argument, nullptr, 0 }, {"fastq_allowmergestagger", no_argument, nullptr, 0 }, {"fastq_ascii", required_argument, nullptr, 0 }, {"fastq_asciiout", required_argument, nullptr, 0 }, {"fastq_chars", required_argument, nullptr, 0 }, {"fastq_convert", required_argument, nullptr, 0 }, {"fastq_eeout", no_argument, nullptr, 0 }, {"fastq_eestats", required_argument, nullptr, 0 }, 
{"fastq_eestats2", required_argument, nullptr, 0 }, {"fastq_filter", required_argument, nullptr, 0 }, {"fastq_join", required_argument, nullptr, 0 }, {"fastq_maxdiffpct", required_argument, nullptr, 0 }, {"fastq_maxdiffs", required_argument, nullptr, 0 }, {"fastq_maxee", required_argument, nullptr, 0 }, {"fastq_maxee_rate", required_argument, nullptr, 0 }, {"fastq_maxlen", required_argument, nullptr, 0 }, {"fastq_maxmergelen", required_argument, nullptr, 0 }, {"fastq_maxns", required_argument, nullptr, 0 }, {"fastq_mergepairs", required_argument, nullptr, 0 }, {"fastq_minlen", required_argument, nullptr, 0 }, {"fastq_minmergelen", required_argument, nullptr, 0 }, {"fastq_minovlen", required_argument, nullptr, 0 }, {"fastq_minqual", required_argument, nullptr, 0 }, {"fastq_nostagger", no_argument, nullptr, 0 }, {"fastq_qmax", required_argument, nullptr, 0 }, {"fastq_qmaxout", required_argument, nullptr, 0 }, {"fastq_qmin", required_argument, nullptr, 0 }, {"fastq_qminout", required_argument, nullptr, 0 }, {"fastq_qout_max", no_argument, nullptr, 0 }, {"fastq_stats", required_argument, nullptr, 0 }, {"fastq_stripleft", required_argument, nullptr, 0 }, {"fastq_stripright", required_argument, nullptr, 0 }, {"fastq_tail", required_argument, nullptr, 0 }, {"fastq_truncee", required_argument, nullptr, 0 }, {"fastq_truncee_rate", required_argument, nullptr, 0 }, {"fastq_trunclen", required_argument, nullptr, 0 }, {"fastq_trunclen_keep", required_argument, nullptr, 0 }, {"fastq_truncqual", required_argument, nullptr, 0 }, {"fastqout", required_argument, nullptr, 0 }, {"fastqout_discarded", required_argument, nullptr, 0 }, {"fastqout_discarded_rev",required_argument, nullptr, 0 }, {"fastqout_notmerged_fwd",required_argument, nullptr, 0 }, {"fastqout_notmerged_rev",required_argument, nullptr, 0 }, {"fastqout_rev", required_argument, nullptr, 0 }, {"fastx_filter", required_argument, nullptr, 0 }, {"fastx_getseq", required_argument, nullptr, 0 }, {"fastx_getseqs", 
required_argument, nullptr, 0 }, {"fastx_getsubseq", required_argument, nullptr, 0 }, {"fastx_mask", required_argument, nullptr, 0 }, {"fastx_revcomp", required_argument, nullptr, 0 }, {"fastx_subsample", required_argument, nullptr, 0 }, {"fastx_uniques", required_argument, nullptr, 0 }, {"fulldp", no_argument, nullptr, 0 }, {"gapext", required_argument, nullptr, 0 }, {"gapopen", required_argument, nullptr, 0 }, {"gzip_decompress", no_argument, nullptr, 0 }, {"h", no_argument, nullptr, 0 }, {"hardmask", no_argument, nullptr, 0 }, {"help", no_argument, nullptr, 0 }, {"hspw", required_argument, nullptr, 0 }, {"id", required_argument, nullptr, 0 }, {"iddef", required_argument, nullptr, 0 }, {"idprefix", required_argument, nullptr, 0 }, {"idsuffix", required_argument, nullptr, 0 }, {"join_padgap", required_argument, nullptr, 0 }, {"join_padgapq", required_argument, nullptr, 0 }, {"label", required_argument, nullptr, 0 }, {"label_field", required_argument, nullptr, 0 }, {"label_substr_match", no_argument, nullptr, 0 }, {"label_suffix", required_argument, nullptr, 0 }, {"label_word", required_argument, nullptr, 0 }, {"label_words", required_argument, nullptr, 0 }, {"labels", required_argument, nullptr, 0 }, {"lca_cutoff", required_argument, nullptr, 0 }, {"lcaout", required_argument, nullptr, 0 }, {"leftjust", no_argument, nullptr, 0 }, {"length_cutoffs", required_argument, nullptr, 0 }, {"lengthout", no_argument, nullptr, 0 }, {"log", required_argument, nullptr, 0 }, {"makeudb_usearch", required_argument, nullptr, 0 }, {"maskfasta", required_argument, nullptr, 0 }, {"match", required_argument, nullptr, 0 }, {"matched", required_argument, nullptr, 0 }, {"max_unmasked_pct", required_argument, nullptr, 0 }, {"maxaccepts", required_argument, nullptr, 0 }, {"maxdiffs", required_argument, nullptr, 0 }, {"maxgaps", required_argument, nullptr, 0 }, {"maxhits", required_argument, nullptr, 0 }, {"maxid", required_argument, nullptr, 0 }, {"maxqsize", required_argument, nullptr, 0 
}, {"maxqt", required_argument, nullptr, 0 }, {"maxrejects", required_argument, nullptr, 0 }, {"maxseqlength", required_argument, nullptr, 0 }, {"maxsize", required_argument, nullptr, 0 }, {"maxsizeratio", required_argument, nullptr, 0 }, {"maxsl", required_argument, nullptr, 0 }, {"maxsubs", required_argument, nullptr, 0 }, {"maxuniquesize", required_argument, nullptr, 0 }, {"mid", required_argument, nullptr, 0 }, {"min_unmasked_pct", required_argument, nullptr, 0 }, {"mincols", required_argument, nullptr, 0 }, {"mindiffs", required_argument, nullptr, 0 }, {"mindiv", required_argument, nullptr, 0 }, {"minh", required_argument, nullptr, 0 }, {"minhsp", required_argument, nullptr, 0 }, {"minqt", required_argument, nullptr, 0 }, {"minseqlength", required_argument, nullptr, 0 }, {"minsize", required_argument, nullptr, 0 }, {"minsizeratio", required_argument, nullptr, 0 }, {"minsl", required_argument, nullptr, 0 }, {"mintsize", required_argument, nullptr, 0 }, {"minuniquesize", required_argument, nullptr, 0 }, {"minwordmatches", required_argument, nullptr, 0 }, {"mismatch", required_argument, nullptr, 0 }, {"mothur_shared_out", required_argument, nullptr, 0 }, {"msaout", required_argument, nullptr, 0 }, {"n_mismatch", no_argument, nullptr, 0 }, {"no_progress", no_argument, nullptr, 0 }, {"nonchimeras", required_argument, nullptr, 0 }, {"notmatched", required_argument, nullptr, 0 }, {"notmatchedfq", required_argument, nullptr, 0 }, {"notrunclabels", no_argument, nullptr, 0 }, {"orient", required_argument, nullptr, 0 }, {"otutabout", required_argument, nullptr, 0 }, {"output", required_argument, nullptr, 0 }, {"output_no_hits", no_argument, nullptr, 0 }, {"pattern", required_argument, nullptr, 0 }, {"profile", required_argument, nullptr, 0 }, {"qmask", required_argument, nullptr, 0 }, {"qsegout", required_argument, nullptr, 0 }, {"query_cov", required_argument, nullptr, 0 }, {"quiet", no_argument, nullptr, 0 }, {"randseed", required_argument, nullptr, 0 }, {"relabel", 
required_argument, nullptr, 0 }, {"relabel_keep", no_argument, nullptr, 0 }, {"relabel_md5", no_argument, nullptr, 0 }, {"relabel_self", no_argument, nullptr, 0 }, {"relabel_sha1", no_argument, nullptr, 0 }, {"rereplicate", required_argument, nullptr, 0 }, {"reverse", required_argument, nullptr, 0 }, {"rightjust", no_argument, nullptr, 0 }, {"rowlen", required_argument, nullptr, 0 }, {"samheader", no_argument, nullptr, 0 }, {"samout", required_argument, nullptr, 0 }, {"sample", required_argument, nullptr, 0 }, {"sample_pct", required_argument, nullptr, 0 }, {"sample_size", required_argument, nullptr, 0 }, {"search_exact", required_argument, nullptr, 0 }, {"self", no_argument, nullptr, 0 }, {"selfid", no_argument, nullptr, 0 }, {"sff_clip", no_argument, nullptr, 0 }, {"sff_convert", required_argument, nullptr, 0 }, {"shuffle", required_argument, nullptr, 0 }, {"sintax", required_argument, nullptr, 0 }, {"sintax_cutoff", required_argument, nullptr, 0 }, {"sintax_random", no_argument, nullptr, 0 }, {"sizein", no_argument, nullptr, 0 }, {"sizeorder", no_argument, nullptr, 0 }, {"sizeout", no_argument, nullptr, 0 }, {"slots", required_argument, nullptr, 0 }, {"sortbylength", required_argument, nullptr, 0 }, {"sortbysize", required_argument, nullptr, 0 }, {"strand", required_argument, nullptr, 0 }, {"subseq_end", required_argument, nullptr, 0 }, {"subseq_start", required_argument, nullptr, 0 }, {"tabbedout", required_argument, nullptr, 0 }, {"target_cov", required_argument, nullptr, 0 }, {"threads", required_argument, nullptr, 0 }, {"top_hits_only", no_argument, nullptr, 0 }, {"topn", required_argument, nullptr, 0 }, {"tsegout", required_argument, nullptr, 0 }, {"uc", required_argument, nullptr, 0 }, {"uc_allhits", no_argument, nullptr, 0 }, {"uchime2_denovo", required_argument, nullptr, 0 }, {"uchime3_denovo", required_argument, nullptr, 0 }, {"uchime_denovo", required_argument, nullptr, 0 }, {"uchime_ref", required_argument, nullptr, 0 }, {"uchimealns", 
required_argument, nullptr, 0 }, {"uchimeout", required_argument, nullptr, 0 }, {"uchimeout5", no_argument, nullptr, 0 }, {"udb2fasta", required_argument, nullptr, 0 }, {"udbinfo", required_argument, nullptr, 0 }, {"udbstats", required_argument, nullptr, 0 }, {"unoise_alpha", required_argument, nullptr, 0 }, {"usearch_global", required_argument, nullptr, 0 }, {"userfields", required_argument, nullptr, 0 }, {"userout", required_argument, nullptr, 0 }, {"usersort", no_argument, nullptr, 0 }, {"v", no_argument, nullptr, 0 }, {"version", no_argument, nullptr, 0 }, {"weak_id", required_argument, nullptr, 0 }, {"wordlength", required_argument, nullptr, 0 }, {"xdrop_nw", required_argument, nullptr, 0 }, {"xee", no_argument, nullptr, 0 }, {"xlength", no_argument, nullptr, 0 }, {"xn", required_argument, nullptr, 0 }, {"xsize", no_argument, nullptr, 0 }, { nullptr, 0, nullptr, 0 } }; constexpr int options_count = (sizeof(long_options) / sizeof(struct option)) - 1; std::vector options_selected(options_count); int options_index = 0; int c = 0; while ((c = getopt_long_only(argc, argv, "", long_options, &options_index)) == 0) { if (options_index < options_count) { options_selected[options_index] = true; } switch(options_index) { case option_help: parameters.opt_help = true; break; case option_version: parameters.opt_version = true; break; case option_alnout: opt_alnout = optarg; break; case option_usearch_global: opt_usearch_global = optarg; break; case option_db: opt_db = optarg; break; case option_id: opt_id = args_getdouble(optarg); break; case option_maxaccepts: opt_maxaccepts = args_getlong(optarg); break; case option_maxrejects: opt_maxrejects = args_getlong(optarg); break; case option_wordlength: opt_wordlength = args_getlong(optarg); break; case option_match: opt_match = args_getlong(optarg); break; case option_mismatch: opt_mismatch = args_getlong(optarg); break; case option_fulldp: opt_fulldp = 1; fprintf(stderr, "WARNING: Option --fulldp is ignored\n"); break; case 
option_strand: if (strcasecmp(optarg, "plus") == 0) { opt_strand = 1; parameters.opt_strand = false; } else if (strcasecmp(optarg, "both") == 0) { opt_strand = 2; parameters.opt_strand = true; } else { fatal("The argument to --strand must be plus or both"); } break; case option_threads: opt_threads = static_cast(args_getdouble(optarg)); parameters.opt_threads = static_cast(args_getdouble(optarg)); break; case option_gapopen: args_get_gap_penalty_string(optarg, 1); break; case option_gapext: args_get_gap_penalty_string(optarg, 0); break; case option_rowlen: opt_rowlen = args_getlong(optarg); break; case option_userfields: if (not parse_userfields_arg(optarg)) { fatal("Unrecognized userfield argument"); } break; case option_userout: opt_userout = optarg; break; case option_self: opt_self = 1; break; case option_blast6out: opt_blast6out = optarg; break; case option_uc: opt_uc = optarg; parameters.opt_uc = optarg; break; case option_weak_id: opt_weak_id = args_getdouble(optarg); break; case option_uc_allhits: opt_uc_allhits = 1; break; case option_notrunclabels: opt_notrunclabels = 1; parameters.opt_notrunclabels = true; break; case option_sortbysize: parameters.opt_sortbysize = optarg; break; case option_output: opt_output = optarg; parameters.opt_output = optarg; break; case option_minsize: opt_minsize = args_getlong(optarg); parameters.opt_minsize = args_getlong(optarg); if (parameters.opt_minsize <= 0) { fatal("The argument to --minsize must be at least 1"); } break; case option_maxsize: opt_maxsize = args_getlong(optarg); parameters.opt_maxsize = args_getlong(optarg); break; case option_relabel: opt_relabel = optarg; parameters.opt_relabel = optarg; break; case option_sizeout: opt_sizeout = true; break; case option_derep_fulllength: parameters.opt_derep_fulllength = optarg; break; case option_minseqlength: opt_minseqlength = args_getlong(optarg); parameters.opt_minseqlength = args_getlong(optarg); if (parameters.opt_minseqlength < 0) { fatal("The argument to 
--minseqlength must not be negative"); } break; case option_minuniquesize: opt_minuniquesize = args_getlong(optarg); parameters.opt_minuniquesize = args_getlong(optarg); break; case option_topn: opt_topn = args_getlong(optarg); parameters.opt_topn = args_getlong(optarg); if (parameters.opt_topn == 0) { fatal("The argument to --topn must be greater than zero"); } break; case option_maxseqlength: opt_maxseqlength = args_getlong(optarg); parameters.opt_maxseqlength = args_getlong(optarg); break; case option_sizein: opt_sizein = true; parameters.opt_sizein = true; break; case option_sortbylength: parameters.opt_sortbylength = optarg; break; case option_matched: opt_matched = optarg; break; case option_notmatched: opt_notmatched = optarg; break; case option_dbmatched: opt_dbmatched = optarg; break; case option_dbnotmatched: opt_dbnotmatched = optarg; break; case option_fastapairs: opt_fastapairs = optarg; break; case option_output_no_hits: opt_output_no_hits = 1; break; case option_maxhits: opt_maxhits = args_getlong(optarg); break; case option_top_hits_only: opt_top_hits_only = 1; break; case option_fasta_width: opt_fasta_width = args_getlong(optarg); break; case option_query_cov: opt_query_cov = args_getdouble(optarg); break; case option_target_cov: opt_target_cov = args_getdouble(optarg); break; case option_idprefix: opt_idprefix = args_getlong(optarg); break; case option_idsuffix: opt_idsuffix = args_getlong(optarg); break; case option_minqt: opt_minqt = args_getdouble(optarg); break; case option_maxqt: opt_maxqt = args_getdouble(optarg); break; case option_minsl: opt_minsl = args_getdouble(optarg); break; case option_maxsl: opt_maxsl = args_getdouble(optarg); break; case option_leftjust: opt_leftjust = 1; break; case option_rightjust: opt_rightjust = 1; break; case option_selfid: opt_selfid = 1; break; case option_maxid: opt_maxid = args_getdouble(optarg); break; case option_minsizeratio: opt_minsizeratio = args_getdouble(optarg); break; case option_maxsizeratio: 
opt_maxsizeratio = args_getdouble(optarg); break; case option_maxdiffs: opt_maxdiffs = args_getlong(optarg); break; case option_maxsubs: opt_maxsubs = args_getlong(optarg); break; case option_maxgaps: opt_maxgaps = args_getlong(optarg); break; case option_mincols: opt_mincols = args_getlong(optarg); break; case option_maxqsize: opt_maxqsize = args_getlong(optarg); break; case option_mintsize: opt_mintsize = args_getlong(optarg); break; case option_mid: opt_mid = args_getdouble(optarg); break; case option_shuffle: parameters.opt_shuffle = optarg; break; case option_randseed: opt_randseed = args_getlong(optarg); parameters.opt_randseed = args_getlong(optarg); break; case option_maskfasta: opt_maskfasta = optarg; break; case option_hardmask: opt_hardmask = 1; break; case option_qmask: if (strcasecmp(optarg, "none") == 0) { opt_qmask = MASK_NONE; } else if (strcasecmp(optarg, "dust") == 0) { opt_qmask = MASK_DUST; } else if (strcasecmp(optarg, "soft") == 0) { opt_qmask = MASK_SOFT; } else { opt_qmask = MASK_ERROR; } break; case option_dbmask: if (strcasecmp(optarg, "none") == 0) { opt_dbmask = MASK_NONE; } else if (strcasecmp(optarg, "dust") == 0) { opt_dbmask = MASK_DUST; } else if (strcasecmp(optarg, "soft") == 0) { opt_dbmask = MASK_SOFT; } else { opt_dbmask = MASK_ERROR; } break; case option_cluster_smallmem: opt_cluster_smallmem = optarg; break; case option_cluster_fast: opt_cluster_fast = optarg; break; case option_centroids: opt_centroids = optarg; break; case option_clusters: opt_clusters = optarg; break; case option_consout: opt_consout = optarg; break; case option_cons_truncate: fprintf(stderr, "WARNING: Option --cons_truncate is ignored\n"); opt_cons_truncate = 1; break; case option_msaout: opt_msaout = optarg; break; case option_usersort: opt_usersort = 1; break; case option_xn: opt_xn = args_getdouble(optarg); break; case option_iddef: opt_iddef = args_getlong(optarg); break; case option_slots: fprintf(stderr, "WARNING: Option --slots is ignored\n"); 
opt_slots = args_getlong(optarg); break; case option_pattern: fprintf(stderr, "WARNING: Option --pattern is ignored\n"); opt_pattern = optarg; break; case option_maxuniquesize: opt_maxuniquesize = args_getlong(optarg); parameters.opt_maxuniquesize = args_getlong(optarg); break; case option_abskew: opt_abskew = args_getdouble(optarg); break; case option_chimeras: opt_chimeras = optarg; break; case option_dn: opt_dn = args_getdouble(optarg); break; case option_mindiffs: opt_mindiffs = args_getlong(optarg); break; case option_mindiv: opt_mindiv = args_getdouble(optarg); break; case option_minh: opt_minh = args_getdouble(optarg); break; case option_nonchimeras: opt_nonchimeras = optarg; break; case option_uchime_denovo: opt_uchime_denovo = optarg; break; case option_uchime_ref: opt_uchime_ref = optarg; break; case option_uchimealns: opt_uchimealns = optarg; break; case option_uchimeout: opt_uchimeout = optarg; break; case option_uchimeout5: opt_uchimeout5 = 1; break; case option_alignwidth: opt_alignwidth = args_getlong(optarg); break; case option_allpairs_global: opt_allpairs_global = optarg; break; case option_acceptall: opt_acceptall = 1; break; case option_cluster_size: opt_cluster_size = optarg; break; case option_samout: opt_samout = optarg; break; case option_log: opt_log = optarg; parameters.opt_log = optarg; break; case option_quiet: opt_quiet = true; parameters.opt_quiet = true; break; case option_fastx_subsample: parameters.opt_fastx_subsample = optarg; break; case option_sample_pct: opt_sample_pct = args_getdouble(optarg); parameters.opt_sample_pct = args_getdouble(optarg); break; case option_fastq_chars: parameters.opt_fastq_chars = optarg; break; case option_profile: opt_profile = optarg; break; case option_sample_size: opt_sample_size = args_getlong(optarg); parameters.opt_sample_size = args_getlong(optarg); break; case option_fastaout: opt_fastaout = optarg; parameters.opt_fastaout = optarg; break; case option_xsize: opt_xsize = true; 
parameters.opt_xsize = true; break; case option_clusterout_id: opt_clusterout_id = true; break; case option_clusterout_sort: opt_clusterout_sort = true; break; case option_borderline: opt_borderline = optarg; break; case option_relabel_sha1: opt_relabel_sha1 = true; break; case option_relabel_md5: opt_relabel_md5 = true; break; case option_derep_prefix: parameters.opt_derep_prefix = optarg; break; case option_fastq_filter: opt_fastq_filter = optarg; break; case option_fastqout: opt_fastqout = optarg; parameters.opt_fastqout = optarg; break; case option_fastaout_discarded: opt_fastaout_discarded = optarg; parameters.opt_fastaout_discarded = optarg; break; case option_fastqout_discarded: opt_fastqout_discarded = optarg; parameters.opt_fastqout_discarded = optarg; break; case option_fastq_truncqual: opt_fastq_truncqual = args_getlong(optarg); break; case option_fastq_maxee: opt_fastq_maxee = args_getdouble(optarg); break; case option_fastq_trunclen: opt_fastq_trunclen = args_getlong(optarg); break; case option_fastq_minlen: opt_fastq_minlen = args_getlong(optarg); break; case option_fastq_stripleft: opt_fastq_stripleft = args_getlong(optarg); break; case option_fastq_maxee_rate: opt_fastq_maxee_rate = args_getdouble(optarg); break; case option_fastq_maxns: opt_fastq_maxns = args_getlong(optarg); break; case option_eeout: opt_eeout = true; break; case option_fastq_ascii: opt_fastq_ascii = args_getlong(optarg); parameters.opt_fastq_ascii = args_getlong(optarg); break; case option_fastq_qmin: opt_fastq_qmin = args_getlong(optarg); break; case option_fastq_qmax: opt_fastq_qmax = args_getlong(optarg); break; case option_fastq_qmaxout: opt_fastq_qmaxout = args_getlong(optarg); parameters.opt_fastq_qmaxout = args_getlong(optarg); break; case option_fastq_stats: opt_fastq_stats = optarg; break; case option_fastq_tail: parameters.opt_fastq_tail = args_getlong(optarg); break; case option_fastx_revcomp: opt_fastx_revcomp = optarg; break; case option_label_suffix: 
opt_label_suffix = optarg; break; case option_h: parameters.opt_help = true; break; case option_samheader: opt_samheader = true; break; case option_sizeorder: opt_sizeorder = true; break; case option_minwordmatches: opt_minwordmatches = args_getlong(optarg); if (opt_minwordmatches < 0) { fatal("The argument to --minwordmatches must not be negative"); } break; case option_v: parameters.opt_version = true; break; case option_relabel_keep: opt_relabel_keep = true; break; case option_search_exact: opt_search_exact = optarg; break; case option_fastx_mask: opt_fastx_mask = optarg; break; case option_min_unmasked_pct: opt_min_unmasked_pct = args_getdouble(optarg); break; case option_max_unmasked_pct: opt_max_unmasked_pct = args_getdouble(optarg); break; case option_fastq_convert: opt_fastq_convert = optarg; break; case option_fastq_asciiout: opt_fastq_asciiout = args_getlong(optarg); parameters.opt_fastq_asciiout = args_getlong(optarg); break; case option_fastq_qminout: opt_fastq_qminout = args_getlong(optarg); parameters.opt_fastq_qminout = args_getlong(optarg); break; case option_fastq_mergepairs: opt_fastq_mergepairs = optarg; break; case option_fastq_eeout: opt_fastq_eeout = true; break; case option_fastqout_notmerged_fwd: opt_fastqout_notmerged_fwd = optarg; break; case option_fastqout_notmerged_rev: opt_fastqout_notmerged_rev = optarg; break; case option_fastq_minovlen: opt_fastq_minovlen = args_getlong(optarg); break; case option_fastq_minmergelen: opt_fastq_minmergelen = args_getlong(optarg); break; case option_fastq_maxmergelen: opt_fastq_maxmergelen = args_getlong(optarg); break; case option_fastq_nostagger: opt_fastq_nostagger = optarg; break; case option_fastq_allowmergestagger: opt_fastq_allowmergestagger = true; break; case option_fastq_maxdiffs: opt_fastq_maxdiffs = args_getlong(optarg); break; case option_fastaout_notmerged_fwd: opt_fastaout_notmerged_fwd = optarg; break; case option_fastaout_notmerged_rev: opt_fastaout_notmerged_rev = optarg; break; case 
option_reverse: opt_reverse = optarg; parameters.opt_reverse = optarg; break; case option_eetabbedout: opt_eetabbedout = optarg; break; case option_fasta_score: opt_fasta_score = true; break; case option_fastq_eestats: opt_fastq_eestats = optarg; break; case option_rereplicate: parameters.opt_rereplicate = optarg; break; case option_xdrop_nw: /* xdrop_nw ignored */ fprintf(stderr, "WARNING: Option --xdrop_nw is ignored\n"); break; case option_minhsp: /* minhsp ignored */ fprintf(stderr, "WARNING: Option --minhsp is ignored\n"); break; case option_band: /* band ignored */ fprintf(stderr, "WARNING: Option --band is ignored\n"); break; case option_hspw: /* hspw ignored */ fprintf(stderr, "WARNING: Option --hspw is ignored\n"); break; case option_gzip_decompress: opt_gzip_decompress = true; break; case option_bzip2_decompress: opt_bzip2_decompress = true; break; case option_fastq_maxlen: opt_fastq_maxlen = args_getlong(optarg); break; case option_fastq_truncee: opt_fastq_truncee = args_getdouble(optarg); break; case option_fastx_filter: opt_fastx_filter = optarg; break; case option_otutabout: opt_otutabout = optarg; break; case option_mothur_shared_out: opt_mothur_shared_out = optarg; break; case option_biomout: opt_biomout = optarg; break; case option_fastq_trunclen_keep: opt_fastq_trunclen_keep = args_getlong(optarg); break; case option_fastq_stripright: opt_fastq_stripright = args_getlong(optarg); break; case option_no_progress: opt_no_progress = true; break; case option_fastq_eestats2: opt_fastq_eestats2 = optarg; break; case option_ee_cutoffs: args_get_ee_cutoffs(optarg); break; case option_length_cutoffs: args_get_length_cutoffs(optarg); break; case option_makeudb_usearch: opt_makeudb_usearch = optarg; break; case option_udb2fasta: opt_udb2fasta = optarg; break; case option_udbinfo: opt_udbinfo = optarg; break; case option_udbstats: opt_udbstats = optarg; break; case option_cluster_unoise: opt_cluster_unoise = optarg; break; case option_unoise_alpha: 
opt_unoise_alpha = args_getdouble(optarg); break; case option_uchime2_denovo: opt_uchime2_denovo = optarg; break; case option_uchime3_denovo: opt_uchime3_denovo = optarg; break; case option_sintax: opt_sintax = optarg; break; case option_sintax_cutoff: opt_sintax_cutoff = args_getdouble(optarg); break; case option_tabbedout: opt_tabbedout = optarg; parameters.opt_tabbedout = optarg; break; case option_fastq_maxdiffpct: opt_fastq_maxdiffpct = args_getdouble(optarg); break; case option_fastq_join: parameters.opt_fastq_join = optarg; break; case option_join_padgap: parameters.opt_join_padgap = optarg; break; case option_join_padgapq: parameters.opt_join_padgapq = optarg; parameters.opt_join_padgapq_set_by_user = true; break; case option_sff_convert: opt_sff_convert = optarg; break; case option_sff_clip: opt_sff_clip = true; break; case option_fastaout_rev: opt_fastaout_rev = optarg; parameters.opt_fastaout_rev = optarg; break; case option_fastaout_discarded_rev: opt_fastaout_discarded_rev = optarg; parameters.opt_fastaout_discarded_rev = optarg; break; case option_fastqout_rev: opt_fastqout_rev = optarg; parameters.opt_fastqout_rev = optarg; break; case option_fastqout_discarded_rev: opt_fastqout_discarded_rev = optarg; parameters.opt_fastqout_discarded_rev = optarg; break; case option_xee: opt_xee = true; break; case option_fastx_getseq: opt_fastx_getseq = optarg; break; case option_fastx_getseqs: opt_fastx_getseqs = optarg; break; case option_fastx_getsubseq: opt_fastx_getsubseq = optarg; break; case option_label_substr_match: opt_label_substr_match = true; break; case option_label: opt_label = optarg; break; case option_subseq_start: opt_subseq_start = args_getlong(optarg); break; case option_subseq_end: opt_subseq_end = args_getlong(optarg); break; case option_notmatchedfq: opt_notmatchedfq = optarg; break; case option_label_field: opt_label_field = optarg; break; case option_label_word: opt_label_word = optarg; break; case option_label_words: opt_label_words = 
optarg; break; case option_labels: opt_labels = optarg; break; case option_cut: parameters.opt_cut = optarg; break; case option_cut_pattern: parameters.opt_cut_pattern = optarg; break; case option_relabel_self: opt_relabel_self = true; break; case option_derep_id: parameters.opt_derep_id = optarg; break; case option_orient: opt_orient = optarg; break; case option_fasta2fastq: parameters.opt_fasta2fastq = optarg; break; case option_lcaout: opt_lcaout = optarg; break; case option_lca_cutoff: opt_lca_cutoff = args_getdouble(optarg); break; case option_fastx_uniques: parameters.opt_fastx_uniques = optarg; break; case option_fastq_qout_max: parameters.opt_fastq_qout_max = true; break; case option_sample: opt_sample = optarg; break; case option_qsegout: opt_qsegout = optarg; break; case option_tsegout: opt_tsegout = optarg; break; case option_derep_smallmem: parameters.opt_derep_smallmem = optarg; break; case option_lengthout: opt_lengthout = true; break; case option_xlength: opt_xlength = true; break; case option_chimeras_denovo: opt_chimeras_denovo = optarg; break; case option_chimeras_length_min: opt_chimeras_length_min = args_getlong(optarg); break; case option_chimeras_parts: opt_chimeras_parts = args_getlong(optarg); break; case option_chimeras_parents_max: opt_chimeras_parents_max = args_getlong(optarg); break; case option_chimeras_diff_pct: opt_chimeras_diff_pct = args_getdouble(optarg); break; case option_sintax_random: opt_sintax_random = true; break; case option_n_mismatch: opt_n_mismatch = true; break; case option_fastq_minqual: opt_fastq_minqual = args_getlong(optarg); break; case option_fastq_truncee_rate: opt_fastq_truncee_rate = args_getdouble(optarg); break; default: fatal("Internal error in option parsing"); } } /* Terminate if ambiguous or illegal options have been detected */ if (c != -1) { exit(EXIT_FAILURE); } /* Terminate after reporting any extra non-option arguments */ if (optind < argc) { fatal("Unrecognized string on command line (%s)", 
argv[optind]); } /* Below is a list of all command names, in alphabetical order. */ int const command_options[] = { option_allpairs_global, option_chimeras_denovo, option_cluster_fast, option_cluster_size, option_cluster_smallmem, option_cluster_unoise, option_cut, option_derep_fulllength, option_derep_id, option_derep_prefix, option_derep_smallmem, option_fasta2fastq, option_fastq_chars, option_fastq_convert, option_fastq_eestats, option_fastq_eestats2, option_fastq_filter, option_fastq_join, option_fastq_mergepairs, option_fastq_stats, option_fastx_filter, option_fastx_getseq, option_fastx_getseqs, option_fastx_getsubseq, option_fastx_mask, option_fastx_revcomp, option_fastx_subsample, option_fastx_uniques, option_h, option_help, option_makeudb_usearch, option_maskfasta, option_orient, option_rereplicate, option_search_exact, option_sff_convert, option_shuffle, option_sintax, option_sortbylength, option_sortbysize, option_uchime2_denovo, option_uchime3_denovo, option_uchime_denovo, option_uchime_ref, option_udb2fasta, option_udbinfo, option_udbstats, option_usearch_global, option_v, option_version }; const int commands_count = sizeof(command_options) / sizeof(int); /* Below is a list of all the options that are valid for each command. The first line is the command and the lines below are the valid options. 
*/ const int valid_options[][99] = { { option_allpairs_global, option_acceptall, option_alnout, option_band, option_blast6out, option_bzip2_decompress, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_label_suffix, option_leftjust, option_lengthout, option_log, option_match, option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_output_no_hits, option_pattern, option_qmask, option_qsegout, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeout, option_slots, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_userfields, option_userout, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_chimeras_denovo, option_abskew, option_alignwidth, option_alnout, option_chimeras, option_chimeras_diff_pct, option_chimeras_length_min, option_chimeras_parents_max, option_chimeras_parts, option_fasta_width, option_gapext, option_gapopen, option_hardmask, option_label_suffix, option_log, option_match, option_maxseqlength, option_minseqlength, option_mismatch, option_no_progress, option_nonchimeras, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, 
option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_tabbedout, option_threads, option_xee, option_xn, option_xsize, -1 }, { option_cluster_fast, option_alnout, option_band, option_biomout, option_blast6out, option_bzip2_decompress, option_centroids, option_clusterout_id, option_clusterout_sort, option_clusters, option_cons_truncate, option_consout, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_label_suffix, option_leftjust, option_lengthout, option_log, option_match, option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_msaout, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_pattern, option_profile, option_qmask, option_qsegout, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeorder, option_sizeout, option_slots, option_strand, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_userfields, option_userout, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_cluster_size, option_alnout, option_band, option_biomout, option_blast6out, option_bzip2_decompress, option_centroids, option_clusterout_id, option_clusterout_sort, 
option_clusters, option_cons_truncate, option_consout, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_label_suffix, option_leftjust, option_lengthout, option_log, option_match, option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_msaout, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_pattern, option_profile, option_qmask, option_qsegout, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeorder, option_sizeout, option_slots, option_strand, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_userfields, option_userout, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_cluster_smallmem, option_alnout, option_band, option_biomout, option_blast6out, option_bzip2_decompress, option_centroids, option_clusterout_id, option_clusterout_sort, option_clusters, option_cons_truncate, option_consout, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_label_suffix, option_leftjust, option_lengthout, option_log, option_match, 
option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_msaout, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_pattern, option_profile, option_qmask, option_qsegout, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeorder, option_sizeout, option_slots, option_strand, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_userfields, option_userout, option_usersort, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_cluster_unoise, option_alnout, option_band, option_biomout, option_blast6out, option_bzip2_decompress, option_centroids, option_clusterout_id, option_clusterout_sort, option_clusters, option_cons_truncate, option_consout, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_label_suffix, option_leftjust, option_lengthout, option_log, option_match, option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, 
option_minsize, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_msaout, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_qsegout, option_pattern, option_profile, option_qmask, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeorder, option_sizeout, option_slots, option_strand, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_unoise_alpha, option_userfields, option_userout, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_cut, option_bzip2_decompress, option_cut_pattern, option_fasta_width, option_fastaout, option_fastaout_discarded, option_fastaout_discarded_rev, option_fastaout_rev, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_xee, option_xlength, option_xsize, -1 }, { option_derep_fulllength, option_bzip2_decompress, option_fasta_width, option_gzip_decompress, option_lengthout, option_log, option_maxseqlength, option_maxuniquesize, option_minseqlength, option_minuniquesize, option_no_progress, option_notrunclabels, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_strand, option_threads, option_topn, option_uc, option_xee, option_xlength, option_xsize, -1 }, { option_derep_id, option_bzip2_decompress, option_fasta_width, option_gzip_decompress, 
option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_maxuniquesize, option_minseqlength, option_minuniquesize, option_no_progress, option_notrunclabels, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_strand, option_threads, option_topn, option_uc, option_xee, option_xlength, option_xsize, -1 }, { option_derep_prefix, option_bzip2_decompress, option_fasta_width, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_maxuniquesize, option_minseqlength, option_minuniquesize, option_no_progress, option_notrunclabels, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_strand, option_threads, option_topn, option_uc, option_xee, option_xlength, option_xsize, -1 }, { option_derep_smallmem, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_maxuniquesize, option_minseqlength, option_minuniquesize, option_no_progress, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_strand, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fasta2fastq, option_bzip2_decompress, option_fastq_asciiout, option_fastq_qmaxout, option_fastqout, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, 
option_xsize, -1 }, { option_fastq_chars, option_bzip2_decompress, option_fastq_tail, option_gzip_decompress, option_log, option_no_progress, option_quiet, option_threads, -1 }, { option_fastq_convert, option_bzip2_decompress, option_fastq_ascii, option_fastq_asciiout, option_fastq_qmax, option_fastq_qmaxout, option_fastq_qmin, option_fastq_qminout, option_fastqout, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastq_eestats, option_bzip2_decompress, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_log, option_no_progress, option_output, option_quiet, option_threads, -1 }, { option_fastq_eestats2, option_bzip2_decompress, option_ee_cutoffs, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_length_cutoffs, option_log, option_no_progress, option_output, option_quiet, option_threads, -1 }, { option_fastq_filter, option_bzip2_decompress, option_eeout, option_fasta_width, option_fastaout, option_fastaout_discarded, option_fastaout_discarded_rev, option_fastaout_rev, option_fastq_ascii, option_fastq_eeout, option_fastq_maxee, option_fastq_maxee_rate, option_fastq_maxlen, option_fastq_maxns, option_fastq_minlen, option_fastq_minqual, option_fastq_qmax, option_fastq_qmin, option_fastq_stripleft, option_fastq_stripright, option_fastq_truncee, option_fastq_truncee_rate, option_fastq_trunclen, option_fastq_trunclen_keep, option_fastq_truncqual, option_fastqout, option_fastqout_discarded, option_fastqout_discarded_rev, option_fastqout_rev, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxsize, option_minsize, option_no_progress, option_quiet, option_relabel, option_relabel_keep, 
option_relabel_md5, option_relabel_self, option_relabel_sha1, option_reverse, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastq_join, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_join_padgap, option_join_padgapq, option_label_suffix, option_lengthout, option_log, option_no_progress, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_reverse, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastq_mergepairs, option_bzip2_decompress, option_eeout, option_eetabbedout, option_fasta_width, option_fastaout, option_fastaout_notmerged_fwd, option_fastaout_notmerged_rev, option_fastq_allowmergestagger, option_fastq_ascii, option_fastq_eeout, option_fastq_maxdiffpct, option_fastq_maxdiffs, option_fastq_maxee, option_fastq_maxlen, option_fastq_maxmergelen, option_fastq_maxns, option_fastq_minlen, option_fastq_minmergelen, option_fastq_minovlen, option_fastq_nostagger, option_fastq_qmax, option_fastq_qmaxout, option_fastq_qmin, option_fastq_qminout, option_fastq_truncqual, option_fastqout, option_fastqout_notmerged_fwd, option_fastqout_notmerged_rev, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_reverse, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastq_stats, option_bzip2_decompress, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_log, option_no_progress, option_quiet, option_threads, -1 }, { option_fastx_filter, option_bzip2_decompress, option_eeout, option_fasta_width, 
option_fastaout, option_fastaout_discarded, option_fastaout_discarded_rev, option_fastaout_rev, option_fastq_ascii, option_fastq_eeout, option_fastq_maxee, option_fastq_maxee_rate, option_fastq_maxlen, option_fastq_maxns, option_fastq_minlen, option_fastq_minqual, option_fastq_qmax, option_fastq_qmin, option_fastq_stripleft, option_fastq_stripright, option_fastq_truncee, option_fastq_truncee_rate, option_fastq_trunclen, option_fastq_trunclen_keep, option_fastq_truncqual, option_fastqout, option_fastqout_discarded, option_fastqout_discarded_rev, option_fastqout_rev, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxsize, option_minsize, option_no_progress, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_reverse, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_getseq, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_label, option_label_substr_match, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notmatched, option_notmatchedfq, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_getseqs, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_label, option_label_field, option_label_substr_match, option_label_suffix, option_label_word, option_label_words, option_labels, option_lengthout, option_log, option_no_progress, option_notmatched, option_notmatchedfq, option_notrunclabels, option_quiet, 
option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_getsubseq, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_label, option_label_substr_match, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notmatched, option_notmatchedfq, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_subseq_end, option_subseq_start, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_mask, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_hardmask, option_label_suffix, option_lengthout, option_log, option_max_unmasked_pct, option_min_unmasked_pct, option_no_progress, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_revcomp, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_subsample, option_bzip2_decompress, option_fasta_width, 
option_fastaout, option_fastaout_discarded, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_fastqout_discarded, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notrunclabels, option_quiet, option_randseed, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sample_pct, option_sample_size, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_uniques, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_asciiout, option_fastq_qmax, option_fastq_qmaxout, option_fastq_qmin, option_fastq_qminout, option_fastq_qout_max, option_fastqout, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_maxuniquesize, option_minseqlength, option_minuniquesize, option_no_progress, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_strand, option_tabbedout, option_threads, option_topn, option_uc, option_xee, option_xlength, option_xsize, -1 }, { option_h, option_log, option_quiet, option_threads, -1 }, { option_help, option_log, option_quiet, option_threads, -1 }, { option_makeudb_usearch, option_bzip2_decompress, option_dbmask, option_gzip_decompress, option_hardmask, option_log, option_maxseqlength, option_minseqlength, option_no_progress, option_notrunclabels, option_output, option_quiet, option_threads, option_wordlength, -1 }, { option_maskfasta, option_bzip2_decompress, option_fasta_width, option_gzip_decompress, option_hardmask, option_label_suffix, option_lengthout, option_log, option_max_unmasked_pct, option_maxseqlength, option_min_unmasked_pct, option_minseqlength, option_no_progress, option_notrunclabels, option_output, option_qmask, 
option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_orient, option_bzip2_decompress, option_db, option_dbmask, option_fasta_width, option_fastaout, option_fastqout, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notmatched, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_tabbedout, option_threads, option_wordlength, option_xee, option_xlength, option_xsize, -1 }, { option_rereplicate, option_bzip2_decompress, option_fasta_width, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notrunclabels, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_search_exact, option_alnout, option_biomout, option_blast6out, option_bzip2_decompress, option_db, option_dbmask, option_dbmatched, option_dbnotmatched, option_fasta_width, option_fastapairs, option_gzip_decompress, option_hardmask, option_label_suffix, option_lca_cutoff, option_lcaout, option_lengthout, option_log, option_match, option_matched, option_maxhits, option_maxqsize, option_maxqt, option_maxseqlength, option_maxsizeratio, option_maxsl, option_mincols, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_mismatch, option_mothur_shared_out, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_qmask, option_qsegout, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, 
option_relabel_self, option_relabel_sha1, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_sizein, option_sizeout, option_strand, option_threads, option_top_hits_only, option_tsegout, option_uc, option_uc_allhits, option_userfields, option_userout, option_xee, option_xlength, option_xsize, -1 }, { option_sff_convert, option_fastq_asciiout, option_fastq_qmaxout, option_fastq_qminout, option_fastqout, option_label_suffix, option_lengthout, option_log, option_no_progress, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sff_clip, option_sizeout, option_threads, -1 }, { option_shuffle, option_bzip2_decompress, option_fasta_width, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_minseqlength, option_no_progress, option_notrunclabels, option_output, option_quiet, option_randseed, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_topn, option_xee, option_xlength, option_xsize, -1 }, { option_sintax, option_bzip2_decompress, option_db, option_dbmask, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_label_suffix, option_log, option_maxseqlength, option_minseqlength, option_no_progress, option_notrunclabels, option_quiet, option_randseed, option_sintax_cutoff, option_sintax_random, option_strand, option_tabbedout, option_threads, option_wordlength, -1 }, { option_sortbylength, option_bzip2_decompress, option_fasta_width, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_minseqlength, option_no_progress, option_notrunclabels, option_output, option_quiet, option_relabel, 
option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_topn, option_xee, option_xlength, option_xsize, -1 }, { option_sortbysize, option_bzip2_decompress, option_fasta_width, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_maxsize, option_minseqlength, option_minsize, option_no_progress, option_notrunclabels, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_topn, option_xee, option_xlength, option_xsize, -1 }, { option_uchime2_denovo, option_abskew, option_alignwidth, option_borderline, option_chimeras, option_dn, option_fasta_score, option_fasta_width, option_gapext, option_gapopen, option_hardmask, option_label_suffix, option_lengthout, option_log, option_match, option_maxseqlength, option_mindiffs, option_mindiv, option_minh, option_minseqlength, option_mismatch, option_no_progress, option_nonchimeras, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_uchimealns, option_uchimeout, option_uchimeout5, option_xee, option_xlength, option_xn, option_xsize, -1 }, { option_uchime3_denovo, option_abskew, option_alignwidth, option_borderline, option_chimeras, option_dn, option_fasta_score, option_fasta_width, option_gapext, option_gapopen, option_hardmask, option_label_suffix, option_lengthout, option_log, option_match, option_maxseqlength, option_mindiffs, option_mindiv, option_minh, option_minseqlength, option_mismatch, option_no_progress, option_nonchimeras, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, 
option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_uchimealns, option_uchimeout, option_uchimeout5, option_xee, option_xlength, option_xn, option_xsize, -1 }, { option_uchime_denovo, option_abskew, option_alignwidth, option_borderline, option_chimeras, option_dn, option_fasta_score, option_fasta_width, option_gapext, option_gapopen, option_hardmask, option_label_suffix, option_lengthout, option_log, option_match, option_maxseqlength, option_mindiffs, option_mindiv, option_minh, option_minseqlength, option_mismatch, option_no_progress, option_nonchimeras, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_uchimealns, option_uchimeout, option_uchimeout5, option_xee, option_xlength, option_xn, option_xsize, -1 }, { option_uchime_ref, option_abskew, option_alignwidth, option_borderline, option_chimeras, option_db, option_dbmask, option_dn, option_fasta_score, option_fasta_width, option_gapext, option_gapopen, option_hardmask, option_label_suffix, option_lengthout, option_log, option_match, option_maxseqlength, option_mindiffs, option_mindiv, option_minh, option_minseqlength, option_mismatch, option_no_progress, option_nonchimeras, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_self, option_selfid, option_sizein, option_sizeout, option_strand, option_threads, option_uchimealns, option_uchimeout, option_uchimeout5, option_xee, option_xlength, option_xn, option_xsize, -1 }, { option_udb2fasta, option_fasta_width, option_label_suffix, option_lengthout, option_log, option_no_progress, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, 
option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_udbinfo, option_log, option_quiet, option_threads, -1 }, { option_udbstats, option_log, option_no_progress, option_quiet, option_threads, -1 }, { option_usearch_global, option_alnout, option_band, option_biomout, option_blast6out, option_bzip2_decompress, option_db, option_dbmask, option_dbmatched, option_dbnotmatched, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_label_suffix, option_lca_cutoff, option_lcaout, option_leftjust, option_lengthout, option_log, option_match, option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_pattern, option_qmask, option_qsegout, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeout, option_slots, option_strand, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_uc_allhits, option_userfields, option_userout, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_v, option_log, option_quiet, option_threads, -1 }, { option_version, option_log, option_quiet, option_threads, -1 } 
}; /* check that only one commmand is specified */ int commands = 0; int k = -1; for (int i = 0; i < commands_count; i++) { if (options_selected[command_options[i]]) { commands++; k = i; } } if (commands > 1) { fatal("More than one command specified"); } /* check that only valid options are specified */ int invalid_options = 0; if (commands == 0) { /* check if any options are specified */ bool any_options = false; for (bool const i: options_selected) { if (i) { any_options = true; } } if (any_options) { fprintf(stderr, "WARNING: Options given, but no valid command specified.\n"); } } else { for (int i = 0; i < options_count; i++) { if (options_selected[i]) { int j = 0; bool ok = false; while (valid_options[k][j] >= 0) { if (valid_options[k][j] == i) { ok = true; break; } j++; } if (not ok) { invalid_options++; if (invalid_options == 1) { fprintf(stderr, "Fatal error: Invalid options to command %s\n", long_options[command_options[k]].name); fprintf(stderr, "Invalid option(s):"); } fprintf(stderr, " --%s", long_options[i].name); } } } if (invalid_options > 0) { fprintf(stderr, "\nThe valid options for the %s command are:", long_options[command_options[k]].name); int count = 0; for(int j = 1; valid_options[k][j] >= 0; j++) { fprintf(stderr, " --%s", long_options[valid_options[k][j]].name); count++; } if (not count) { fprintf(stderr, " (none)"); } fprintf(stderr, "\n"); exit(EXIT_FAILURE); } } /* multi-threaded commands */ if ((opt_threads < 0) or (opt_threads > 1024)) { fatal("The argument to --threads must be in the range 0 (default) to 1024"); } if (opt_allpairs_global or opt_cluster_fast or opt_cluster_size or opt_cluster_smallmem or opt_cluster_unoise or opt_fastq_mergepairs or opt_fastx_mask or opt_maskfasta or opt_search_exact or opt_sintax or opt_uchime_ref or opt_usearch_global) { if (parameters.opt_threads == 0) { opt_threads = arch_get_cores(); parameters.opt_threads = arch_get_cores(); } } else { if (parameters.opt_threads > 1) { fprintf(stderr, "WARNING: 
The %s command does not support multithreading.\nOnly 1 thread used.\n", long_options[command_options[k]].name); } opt_threads = 1; parameters.opt_threads = 1; } if (opt_sintax and parameters.opt_randseed and (parameters.opt_threads > 1)) { fprintf(stderr, "WARNING: Using the --sintax command with the --randseed option may not work as intended with multiple threads. Use a single thread (--threads 1) to ensure reproducible results.\n"); } if (opt_cluster_unoise) { opt_weak_id = 0.90; } else if (opt_weak_id > opt_id) { opt_weak_id = opt_id; } if (opt_maxrejects == -1) { if (opt_cluster_fast) { opt_maxrejects = 8; } else { opt_maxrejects = 32; } } if (opt_maxaccepts < 0) { fatal("The argument to --maxaccepts must not be negative"); } if (opt_maxrejects < 0) { fatal("The argument to --maxrejects must not be negative"); } if (opt_wordlength == 0) { /* set default word length */ if (opt_orient) { opt_wordlength = 12; } else { opt_wordlength = 8; } } if ((opt_wordlength < 3) or (opt_wordlength > 15)) { fatal("The argument to --wordlength must be in the range 3 to 15"); } if ((opt_iddef < 0) or (opt_iddef > 4)) { fatal("The argument to --iddef must in the range 0 to 4"); } #if 0 if (opt_match <= 0) fatal("The argument to --match must be positive"); if (opt_mismatch >= 0) fatal("The argument to --mismatch must be negative"); #endif if (opt_alignwidth < 0) { fatal("The argument to --alignwidth must not be negative"); } if (opt_rowlen < 0) { fatal("The argument to --rowlen must not be negative"); } if (opt_qmask == MASK_ERROR) { fatal("The argument to --qmask must be none, dust or soft"); } if (opt_dbmask == MASK_ERROR) { fatal("The argument to --dbmask must be none, dust or soft"); } if ((opt_sample_pct < 0.0) or (opt_sample_pct > 100.0)) { fatal("The argument to --sample_pct must be in the range 0.0 to 100.0"); } if (opt_sample_size < 0) { fatal("The argument to --sample_size must not be negative"); } if (((parameters.opt_relabel ? 
1 : 0) + opt_relabel_md5 + opt_relabel_self + opt_relabel_sha1) > 1) { fatal("Specify only one of --relabel, --relabel_self, --relabel_sha1, or --relabel_md5"); } if (parameters.opt_fastq_tail < 1) { fatal("The argument to --fastq_tail must be positive"); } if ((opt_min_unmasked_pct < 0.0) and (opt_min_unmasked_pct > 100.0)) { fatal("The argument to --min_unmasked_pct must be between 0.0 and 100.0"); } if ((opt_max_unmasked_pct < 0.0) and (opt_max_unmasked_pct > 100.0)) { fatal("The argument to --max_unmasked_pct must be between 0.0 and 100.0"); } if (opt_min_unmasked_pct > opt_max_unmasked_pct) { fatal("The argument to --min_unmasked_pct cannot be larger than --max_unmasked_pct"); } if ((parameters.opt_fastq_ascii != 33) and (parameters.opt_fastq_ascii != 64)) { fatal("The argument to --fastq_ascii must be 33 or 64"); } if (opt_fastq_qmin > opt_fastq_qmax) { fatal("The argument to --fastq_qmin cannot be greater than --fastq_qmax"); } if (parameters.opt_fastq_ascii + opt_fastq_qmin < 33) { fatal("Sum of arguments to --fastq_ascii and --fastq_qmin must be no less than 33"); } if (parameters.opt_fastq_ascii + opt_fastq_qmax > 126) { fatal("Sum of arguments to --fastq_ascii and --fastq_qmax must be no more than 126"); } if (parameters.opt_fastq_qminout > parameters.opt_fastq_qmaxout) { fatal("The argument to --fastq_qminout cannot be larger than --fastq_qmaxout"); } if ((parameters.opt_fastq_asciiout != 33) and (parameters.opt_fastq_asciiout != 64)) { fatal("The argument to --fastq_asciiout must be 33 or 64"); } if (parameters.opt_fastq_asciiout + parameters.opt_fastq_qminout < 33) { fatal("Sum of arguments to --fastq_asciiout and --fastq_qminout must be no less than 33"); } if (parameters.opt_fastq_asciiout + parameters.opt_fastq_qmaxout > 126) { fatal("Sum of arguments to --fastq_asciiout and --fastq_qmaxout must be no more than 126"); } if (opt_gzip_decompress and opt_bzip2_decompress) { fatal("Specify either --gzip_decompress or --bzip2_decompress, not both"); } 
if ((opt_sintax_cutoff < 0.0) or (opt_sintax_cutoff > 1.0)) { fatal("The argument to sintax_cutoff must be in the range 0.0 to 1.0"); } if ((opt_lca_cutoff <= 0.5) or (opt_lca_cutoff > 1.0)) { fatal("The argument to lca_cutoff must be larger than 0.5, but not larger than 1.0"); } if (parameters.opt_minuniquesize < 1) { fatal("The argument to minuniquesize must be at least 1"); } if (parameters.opt_maxuniquesize < 1) { fatal("The argument to maxuniquesize must be at least 1"); } if (parameters.opt_maxsize < 1) { fatal("The argument to maxsize must be at least 1"); } if (opt_maxhits < 0) { fatal("The argument to maxhits cannot be negative"); } if (opt_chimeras_length_min < 1) { fatal("The argument to chimeras_length_min must be at least 1"); } if ((opt_chimeras_parents_max < 2) or (opt_chimeras_parents_max > maxparents)) { char maxparents_string[25]; snprintf(maxparents_string, 25, "%d", maxparents); fatal("The argument to chimeras_parents_max must be in the range 2 to %s.\n", maxparents_string); } if ((opt_chimeras_diff_pct < 0.0) or (opt_chimeras_diff_pct > 50.0)) { fatal("The argument to chimeras_diff_pct must be in the range 0.0 to 50.0"); } if (options_selected[option_chimeras_parts] and ((opt_chimeras_parts < 2) or (opt_chimeras_parts > 100))) { fatal("The argument to chimeras_parts must be in the range 2 to 100"); } if (opt_chimeras_denovo) { if (not options_selected[option_alignwidth]) { opt_alignwidth = 60; } } /* TODO: check valid range of gap penalties */ /* adapt/adjust parameters */ #if 1 /* Adjust gap open penalty according to convention. The specified gap open penalties include the penalty for a single nucleotide gap: gap penalty = gap open penalty + (gap length - 1) * gap extension penalty The rest of the code assumes the first nucleotide gap penalty is not included in the gap opening penalty. 
*/ opt_gap_open_query_left -= opt_gap_extension_query_left; opt_gap_open_target_left -= opt_gap_extension_target_left; opt_gap_open_query_interior -= opt_gap_extension_query_interior; opt_gap_open_target_interior -= opt_gap_extension_target_interior; opt_gap_open_query_right -= opt_gap_extension_query_right; opt_gap_open_target_right -= opt_gap_extension_target_right; #endif /* set defaults parameters, if not specified */ if (opt_maxhits == 0) { opt_maxhits = int64_max; } if (opt_minwordmatches < 0) { opt_minwordmatches = minwordmatches_defaults[opt_wordlength]; } /* set default opt_minsize depending on command */ if (parameters.opt_minsize == 0) { if (opt_cluster_unoise) { opt_minsize = 8; parameters.opt_minsize = 8; } else { opt_minsize = 1; parameters.opt_minsize = 1; } } /* set default opt_abskew depending on command */ if (not options_selected[option_abskew]) { if (opt_chimeras_denovo) { opt_abskew = 1.0; } else if (opt_uchime3_denovo) { opt_abskew = 16.0; } else { opt_abskew = 2.0; } } /* set default opt_minseqlength depending on command */ if (parameters.opt_minseqlength < 0) { if (opt_cluster_fast or opt_cluster_size or opt_cluster_smallmem or opt_cluster_unoise or parameters.opt_derep_fulllength or parameters.opt_derep_id or parameters.opt_derep_prefix or opt_makeudb_usearch or opt_sintax or opt_usearch_global) { opt_minseqlength = 32; parameters.opt_minseqlength = 32; } else { opt_minseqlength = 1; parameters.opt_minseqlength = 1; } } if (opt_sintax) { opt_notrunclabels = 1; parameters.opt_notrunclabels = true; } } /* print the VSEARCH citation (authors and year, title, journal reference with DOI and URL) followed by an empty line to stdout; takes no arguments and returns nothing */ auto show_publication() -> void { fprintf(stdout, "Rognes T, Flouri T, Nichols B, Quince C, Mahe F (2016)\n" "VSEARCH: a versatile open source tool for metagenomics\n" "PeerJ 4:e2584 doi: 10.7717/peerj.2584 https://doi.org/10.7717/peerj.2584\n" "\n"); } /* version command: returns immediately without output when parameters.opt_quiet is set; otherwise prints the citation via show_publication(), then reports on stdout, first for gzip (HAVE_ZLIB_H, gz_lib) and then for bzip2 (HAVE_BZLIB_H, bz2_lib), whether support was compiled in and, if so, whether the library handle is set */ auto cmd_version(struct Parameters const & parameters) -> void { if (parameters.opt_quiet) { return ; } show_publication(); #ifdef HAVE_ZLIB_H printf("Compiled with support for gzip-compressed
files,"); if (gz_lib) { printf(" and the library is loaded.\n"); /* look up zlibVersion and zlibCompileFlags in the loaded library with arch_dlsym, call them through function pointers and print both results; NOTE(review): the pointers returned by arch_dlsym are called without a null check - confirm these lookups cannot fail */ char * (*zlibVersion_p)(); zlibVersion_p = (char * (*)()) arch_dlsym(gz_lib, "zlibVersion"); char * gz_version = (*zlibVersion_p)(); uLong (*zlibCompileFlags_p)(); zlibCompileFlags_p = (uLong (*)()) arch_dlsym(gz_lib, "zlibCompileFlags"); uLong const flags = (*zlibCompileFlags_p)(); printf("zlib version %s, compile flags %lx", gz_version, flags); /* append " (ZLIB_WINAPI)" to the same output line when the 1024 bit of the compile flags is set */ static constexpr auto check_10th_bit = 1024U; // 0x0400 if (flags & check_10th_bit) { printf(" (ZLIB_WINAPI)"); } printf("\n"); } else { printf(" but the library was not found.\n"); } #else printf("Compiled without support for gzip-compressed files.\n"); #endif #ifdef HAVE_BZLIB_H printf("Compiled with support for bzip2-compressed files,"); if (bz2_lib) { printf(" and the library is loaded.\n"); } else { printf(" but the library was not found.\n"); } #else printf("Compiled without support for bzip2-compressed files.\n"); #endif } auto cmd_help(struct Parameters const & parameters) -> void { if (parameters.opt_quiet) { return ; } show_publication(); /* 0 1 2 3 4 5 6 7 */ /* 01234567890123456789012345678901234567890123456789012345678901234567890123456789 */ fprintf(stdout, "Usage: %s [OPTIONS]\n", parameters.progname); fprintf(stdout, "\n" "For further details, please consult the manual by entering: man vsearch\n" "\n" "General options\n" " --bzip2_decompress decompress input with bzip2 (required if pipe)\n" " --fasta_width INT width of FASTA seq lines, 0 for no wrap (80)\n" " --gzip_decompress decompress input with gzip (required if pipe)\n" " --help | -h display help information\n" " --log FILENAME write messages, timing and memory info to file\n" " --maxseqlength INT maximum sequence length (50000)\n" " --minseqlength INT min seq length (clust/derep/search: 32, other:1)\n" " --no_progress do not show progress indicator\n" " --notrunclabels do not truncate labels at first space\n" " --quiet output just warnings and fatal errors to stderr\n" " --threads INT
number of threads to use, zero for all cores (0)\n" " --version | -v display version information\n" "\n" "Chimera detection with new algorithm\n" " --chimeras_denovo FILENAME detect chimeras de novo in long exact sequences\n" " Parameters\n" " --abskew REAL minimum abundance ratio (1.0)\n" " --chimeras_diff_pct mismatch %% allowed in each chimeric region (0.0)\n" " --chimeras_length_min minimum length of each chimeric region (10)\n" " --chimeras_parents_max maximum number of parent sequences (3)\n" " --chimeras_parts number of parts to divide sequences (length/100)\n" " --sizein propagate abundance annotation from input\n" " Output\n" " --alignwidth INT width of alignments in alignment output file (60)\n" " --alnout FILENAME output chimera alignments to file\n" " --chimeras FILENAME output chimeric sequences to file\n" " --nonchimeras FILENAME output non-chimeric sequences to file\n" " --relabel STRING relabel nonchimeras with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeout include abundance information when relabelling\n" " --tabbedout FILENAME output chimera info to tab-separated file\n" " --xsize strip abundance information in output\n" "\n" "Chimera detection with UCHIME algorithms\n" " --uchime_denovo FILENAME detect chimeras de novo\n" " --uchime2_denovo FILENAME detect chimeras de novo in denoised amplicons\n" " --uchime3_denovo FILENAME detect chimeras de novo in denoised amplicons\n" " --uchime_ref FILENAME detect chimeras using a reference database\n" " Data\n" " --db FILENAME reference database for --uchime_ref\n" " Parameters\n" " --abskew REAL minimum abundance ratio (2.0, 16.0 for uchime3)\n" " --dn REAL 'no' vote pseudo-count (1.4)\n" " --mindiffs INT minimum number of differences in segment (3) *\n" " 
--mindiv REAL minimum divergence from closest parent (0.8) *\n" " --minh REAL minimum score (0.28) * ignored in uchime2/3\n" " --sizein propagate abundance annotation from input\n" " --self exclude identical labels for --uchime_ref\n" " --selfid exclude identical sequences for --uchime_ref\n" " --xn REAL 'no' vote weight (8.0)\n" " Output\n" " --alignwidth INT width of alignment in uchimealn output (80)\n" " --borderline FILENAME output borderline chimeric sequences to file\n" " --chimeras FILENAME output chimeric sequences to file\n" " --fasta_score include chimera score in FASTA output\n" " --nonchimeras FILENAME output non-chimeric sequences to file\n" " --relabel STRING relabel nonchimeras with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeout include abundance information when relabelling\n" " --uchimealns FILENAME output chimera alignments to file\n" " --uchimeout FILENAME output to chimera info to tab-separated file\n" " --uchimeout5 make output compatible with uchime version 5\n" " --xsize strip abundance information in output\n" "\n" "Clustering\n" " --cluster_fast FILENAME cluster sequences after sorting by length\n" " --cluster_size FILENAME cluster sequences after sorting by abundance\n" " --cluster_smallmem FILENAME cluster already sorted sequences (see -usersort)\n" " --cluster_unoise FILENAME denoise Illumina amplicon reads\n" " Parameters (most searching options also apply)\n" " --cons_truncate do not ignore terminal gaps in MSA for consensus\n" " --id REAL reject if identity lower, accepted values: 0-1.0\n" " --iddef INT id definition, 0-4=CD-HIT,all,int,MBL,BLAST (2)\n" " --qmask none|dust|soft mask seqs with dust, soft or no method (dust)\n" " --sizein propagate abundance annotation from input\n" " 
--strand plus|both cluster using plus or both strands (plus)\n" " --usersort indicate sequences not pre-sorted by length\n" " --minsize INT minimum abundance (unoise only) (8)\n" " --unoise_alpha REAL alpha parameter (unoise only) (2.0)\n" " Output\n" " --biomout FILENAME filename for OTU table output in biom 1.0 format\n" " --centroids FILENAME output centroid sequences to FASTA file\n" " --clusterout_id add cluster id info to consout and profile files\n" " --clusterout_sort order msaout, consout, profile by decr abundance\n" " --clusters STRING output each cluster to a separate FASTA file\n" " --consout FILENAME output cluster consensus sequences to FASTA file\n" " --mothur_shared_out FN filename for OTU table output in mothur format\n" " --msaout FILENAME output multiple seq. alignments to FASTA file\n" " --otutabout FILENAME filename for OTU table output in classic format\n" " --profile FILENAME output sequence profile of each cluster to file\n" " --relabel STRING relabel centroids with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeorder sort accepted centroids by abundance, AGC\n" " --sizeout write cluster abundances to centroid file\n" " --uc FILENAME specify filename for UCLUST-like output\n" " --xsize strip abundance information in output\n" "\n" "Convert SFF to FASTQ\n" " --sff_convert FILENAME convert given SFF file to FASTQ format\n" " Parameters\n" " --sff_clip clip ends of sequences as indicated in file (no)\n" " --fastq_asciiout INT FASTQ output quality score ASCII base char (33)\n" " --fastq_qmaxout INT maximum base quality value for FASTQ output (41)\n" " --fastq_qminout INT minimum base quality value for FASTQ output (0)\n" " Output\n" " --fastqout FILENAME output converted sequences to given FASTQ 
file\n" "\n" "Dereplication and rereplication\n" " --derep_fulllength FILENAME dereplicate sequences in the given FASTA file\n" " --derep_id FILENAME dereplicate using both identifiers and sequences\n" " --derep_prefix FILENAME dereplicate sequences in file based on prefixes\n" " --derep_smallmem FILENAME dereplicate sequences in file using less memory\n" " --fastx_uniques FILENAME dereplicate sequences in the FASTA/FASTQ file\n" " --rereplicate FILENAME rereplicate sequences in the given FASTA file\n" " Parameters\n" " --maxuniquesize INT maximum abundance for output from dereplication\n" " --minuniquesize INT minimum abundance for output from dereplication\n" " --sizein propagate abundance annotation from input\n" " --strand plus|both dereplicate plus or both strands (plus)\n" " Output\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmaxout INT maximum base quality value for FASTQ output (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --fastq_qminout INT minimum base quality value for FASTQ output (0)\n" " --fastaout FILENAME output FASTA file (for fastx_uniques)\n" " --fastqout FILENAME output FASTQ file (for fastx_uniques)\n" " --output FILENAME output FASTA file (not for fastx_uniques)\n" " --relabel STRING relabel with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeout write abundance annotation to output\n" " --tabbedout FILENAME write cluster info to tsv file for fastx_uniques\n" " --topn INT output only n most abundant sequences after derep\n" " --uc FILENAME filename for UCLUST-like dereplication output\n" " --xsize strip abundance information in derep output\n" "\n" "FASTA to 
FASTQ conversion\n" " --fasta2fastq FILENAME convert from FASTA to FASTQ, fake quality scores\n" " Parameters\n" " --fastq_asciiout INT FASTQ output quality score ASCII base char (33)\n" " --fastq_qmaxout INT fake quality score for FASTQ output (41)\n" " Output\n" " --fastqout FILENAME FASTQ output filename for converted sequences\n" "\n" "FASTQ format conversion\n" " --fastq_convert FILENAME convert between FASTQ file formats\n" " Parameters\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_asciiout INT FASTQ output quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmaxout INT maximum base quality value for FASTQ output (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --fastq_qminout INT minimum base quality value for FASTQ output (0)\n" " Output\n" " --fastqout FILENAME FASTQ output filename for converted sequences\n" "\n" "FASTQ format detection and quality analysis\n" " --fastq_chars FILENAME analyse FASTQ file for version and quality range\n" " Parameters\n" " --fastq_tail INT min length of tails to count for fastq_chars (4)\n" "\n" "FASTQ quality statistics\n" " --fastq_stats FILENAME report statistics on FASTQ file\n" " --fastq_eestats FILENAME quality score and expected error statistics\n" " --fastq_eestats2 FILENAME expected error and length cutoff statistics\n" " Parameters\n" " --ee_cutoffs REAL,... 
fastq_eestats2 expected error cutoffs (0.5,1,2)\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --length_cutoffs INT,INT,INT fastq_eestats2 length (min,max,incr) (50,*,50)\n" " Output\n" " --log FILENAME output file for fastq_stats statistics\n" " --output FILENAME output file for fastq_eestats(2) statistics\n" "\n" "Masking (new)\n" " --fastx_mask FILENAME mask sequences in the given FASTA or FASTQ file\n" " Parameters\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --hardmask mask by replacing with N instead of lower case\n" " --max_unmasked_pct max unmasked %% of sequences to keep (100.0)\n" " --min_unmasked_pct min unmasked %% of sequences to keep (0.0)\n" " --qmask none|dust|soft mask seqs with dust, soft or no method (dust)\n" " Output\n" " --fastaout FILENAME output to specified FASTA file\n" " --fastqout FILENAME output to specified FASTQ file\n" "\n" "Masking (old)\n" " --maskfasta FILENAME mask sequences in the given FASTA file\n" " Parameters\n" " --hardmask mask by replacing with N instead of lower case\n" " --qmask none|dust|soft mask seqs with dust, soft or no method (dust)\n" " Output\n" " --output FILENAME output to specified FASTA file\n" "\n" "Orient sequences in forward or reverse direction\n" " --orient FILENAME orient sequences in given FASTA/FASTQ file\n" " Data\n" " --db FILENAME database of sequences in correct orientation\n" " --dbmask none|dust|soft mask db seqs with dust, soft or no method (dust)\n" " --qmask none|dust|soft mask query with dust, soft or no method (dust)\n" " --wordlength INT length of words used for matching 3-15 (12)\n" " Output\n" " --fastaout FILENAME FASTA output filename for oriented 
sequences\n" " --fastqout FILENAME FASTQ output filenamr for oriented sequences\n" " --notmatched FILENAME output filename for undetermined sequences\n" " --tabbedout FILENAME output filename for result information\n" "\n" "Paired-end reads joining\n" " --fastq_join FILENAME join paired-end reads into one sequence with gap\n" " Data\n" " --reverse FILENAME specify FASTQ file with reverse reads\n" " --join_padgap STRING sequence string used for padding (NNNNNNNN)\n" " --join_padgapq STRING quality string used for padding (IIIIIIII)\n" " Output\n" " --fastaout FILENAME FASTA output filename for joined sequences\n" " --fastqout FILENAME FASTQ output filename for joined sequences\n" "\n" "Paired-end reads merging\n" " --fastq_mergepairs FILENAME merge paired-end reads into one sequence\n" " Data\n" " --reverse FILENAME specify FASTQ file with reverse reads\n" " Parameters\n" " --fastq_allowmergestagger allow merging of staggered reads\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_maxdiffpct REAL maximum percentage diff. 
bases in overlap (100.0)\n" " --fastq_maxdiffs INT maximum number of different bases in overlap (10)\n" " --fastq_maxee REAL maximum expected error value for merged sequence\n" " --fastq_maxmergelen maximum length of entire merged sequence\n" " --fastq_maxns INT maximum number of N's\n" " --fastq_minlen INT minimum input read length after truncation (1)\n" " --fastq_minmergelen minimum length of entire merged sequence\n" " --fastq_minovlen minimum length of overlap between reads (10)\n" " --fastq_nostagger disallow merging of staggered reads (default)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmaxout INT maximum base quality value for FASTQ output (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --fastq_qminout INT minimum base quality value for FASTQ output (0)\n" " --fastq_truncqual INT base quality value for truncation\n" " Output\n" " --eetabbedout FILENAME output error statistics to specified file\n" " --fastaout FILENAME FASTA output filename for merged sequences\n" " --fastaout_notmerged_fwd FN FASTA filename for non-merged forward sequences\n" " --fastaout_notmerged_rev FN FASTA filename for non-merged reverse sequences\n" " --fastq_eeout include expected errors (ee) in FASTQ output\n" " --fastqout FILENAME FASTQ output filename for merged sequences\n" " --fastqout_notmerged_fwd FN FASTQ filename for non-merged forward sequences\n" " --fastqout_notmerged_rev FN FASTQ filename for non-merged reverse sequences\n" " --label_suffix STRING suffix to append to label of merged sequences\n" " --xee remove expected errors (ee) info from output\n" "\n" "Pairwise alignment\n" " --allpairs_global FILENAME perform global alignment of all sequence pairs\n" " Output (most searching options also apply)\n" " --alnout FILENAME filename for human-readable alignment output\n" " --acceptall output all pairwise alignments\n" "\n" "Restriction site cutting\n" " --cut FILENAME filename of FASTA formatted input 
sequences\n" " Parameters\n" " --cut_pattern STRING pattern to match with ^ and _ at cut sites\n" " Output\n" " --fastaout FILENAME FASTA filename for fragments on forward strand\n" " --fastaout_rev FILENAME FASTA filename for fragments on reverse strand\n" " --fastaout_discarded FN FASTA filename for non-matching sequences\n" " --fastaout_discarded_rev FN FASTA filename for non-matching, reverse compl.\n" "\n" "Reverse complementation\n" " --fastx_revcomp FILENAME reverse-complement seqs in FASTA or FASTQ file\n" " Parameters\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " Output\n" " --fastaout FILENAME FASTA output filename\n" " --fastqout FILENAME FASTQ output filename\n" " --label_suffix STRING label to append to identifier in the output\n" "\n" "Searching\n" " --search_exact FILENAME filename of queries for exact match search\n" " --usearch_global FILENAME filename of queries for global alignment search\n" " Data\n" " --db FILENAME name of UDB or FASTA database for search\n" " Parameters\n" " --dbmask none|dust|soft mask db with dust, soft or no method (dust)\n" " --fulldp full dynamic programming alignment (always on)\n" " --gapext STRING penalties for gap extension (2I/1E)\n" " --gapopen STRING penalties for gap opening (20I/2E)\n" " --hardmask mask by replacing with N instead of lower case\n" " --id REAL reject if identity lower\n" " --iddef INT id definition, 0-4=CD-HIT,all,int,MBL,BLAST (2)\n" " --idprefix INT reject if first n nucleotides do not match\n" " --idsuffix INT reject if last n nucleotides do not match\n" " --lca_cutoff REAL fraction of matching hits required for LCA (1.0)\n" " --leftjust reject if terminal gaps at alignment left end\n" " --match INT score for match (2)\n" " --maxaccepts INT number of hits to accept and show per strand (1)\n" " --maxdiffs INT reject if more 
substitutions or indels\n" " --maxgaps INT reject if more indels\n" " --maxhits INT maximum number of hits to show (unlimited)\n" " --maxid REAL reject if identity higher\n" " --maxqsize INT reject if query abundance larger\n" " --maxqt REAL reject if query/target length ratio higher\n" " --maxrejects INT number of non-matching hits to consider (32)\n" " --maxsizeratio REAL reject if query/target abundance ratio higher\n" " --maxsl REAL reject if shorter/longer length ratio higher\n" " --maxsubs INT reject if more substitutions\n" " --mid REAL reject if percent identity lower, ignoring gaps\n" " --mincols INT reject if alignment length shorter\n" " --minqt REAL reject if query/target length ratio lower\n" " --minsizeratio REAL reject if query/target abundance ratio lower\n" " --minsl REAL reject if shorter/longer length ratio lower\n" " --mintsize INT reject if target abundance lower\n" " --minwordmatches INT minimum number of word matches required (12)\n" " --mismatch INT score for mismatch (-4)\n" " --n_mismatch consider aligning with N's as mismatches\n" " --pattern STRING option is ignored\n" " --qmask none|dust|soft mask query with dust, soft or no method (dust)\n" " --query_cov REAL reject if fraction of query seq. aligned lower\n" " --rightjust reject if terminal gaps at alignment right end\n" " --sizein propagate abundance annotation from input\n" " --self reject if labels identical\n" " --selfid reject if sequences identical\n" " --slots INT option is ignored\n" " --strand plus|both search plus or both strands (plus)\n" " --target_cov REAL reject if fraction of target seq. 
aligned lower\n" " --weak_id REAL include aligned hits with >= id; continue search\n" " --wordlength INT length of words for database index 3-15 (8)\n" " Output\n" " --alnout FILENAME filename for human-readable alignment output\n" " --biomout FILENAME filename for OTU table output in biom 1.0 format\n" " --blast6out FILENAME filename for blast-like tab-separated output\n" " --dbmatched FILENAME FASTA file for matching database sequences\n" " --dbnotmatched FILENAME FASTA file for non-matching database sequences\n" " --fastapairs FILENAME FASTA file with pairs of query and target\n" " --lcaout FILENAME output LCA of matching sequences to file\n" " --matched FILENAME FASTA file for matching query sequences\n" " --mothur_shared_out FN filename for OTU table output in mothur format\n" " --notmatched FILENAME FASTA file for non-matching query sequences\n" " --otutabout FILENAME filename for OTU table output in classic format\n" " --output_no_hits output non-matching queries to output files\n" " --rowlen INT width of alignment lines in alnout output (64)\n" " --samheader include a header in the SAM output file\n" " --samout FILENAME filename for SAM format output\n" " --sizeout write abundance annotation to dbmatched file\n" " --top_hits_only output only hits with identity equal to the best\n" " --uc FILENAME filename for UCLUST-like output\n" " --uc_allhits show all, not just top hit with uc output\n" " --userfields STRING fields to output in userout file\n" " --userout FILENAME filename for user-defined tab-separated output\n" "\n" "Shuffling and sorting\n" " --shuffle FILENAME shuffle order of sequences in FASTA file randomly\n" " --sortbylength FILENAME sort sequences by length in given FASTA file\n" " --sortbysize FILENAME abundance sort sequences in given FASTA file\n" " Parameters\n" " --maxsize INT maximum abundance for sortbysize\n" " --minsize INT minimum abundance for sortbysize\n" " --randseed INT seed for PRNG, zero to use random data source (0)\n" " 
--sizein propagate abundance annotation from input\n" " Output\n" " --output FILENAME output to specified FASTA file\n" " --relabel STRING relabel sequences with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeout include abundance information when relabelling\n" " --topn INT output just first n sequences\n" " --xsize strip abundance information in output\n" "\n" "Subsampling\n" " --fastx_subsample FILENAME subsample sequences from given FASTA/FASTQ file\n" " Parameters\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --randseed INT seed for PRNG, zero to use random data source (0)\n" " --sample_pct REAL sampling percentage between 0.0 and 100.0\n" " --sample_size INT sampling size\n" " --sizein consider abundance info from input, do not ignore\n" " Output\n" " --fastaout FILENAME output subsampled sequences to FASTA file\n" " --fastaout_discarded FILE output non-subsampled sequences to FASTA file\n" " --fastqout FILENAME output subsampled sequences to FASTQ file\n" " --fastqout_discarded output non-subsampled sequences to FASTQ file\n" " --relabel STRING relabel sequences with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeout update abundance information in output\n" " --xsize strip abundance information in output\n" "\n" "Taxonomic classification\n" " --sintax FILENAME classify sequences in given FASTA/FASTQ 
file\n" " Parameters\n" " --db FILENAME taxonomic reference db in given FASTA or UDB file\n" " --sintax_cutoff REAL confidence value cutoff level (0.0)\n" " --sintax_random use random sequence, not shortest, if equal match\n" " Output\n" " --tabbedout FILENAME write results to given tab-delimited file\n" "\n" "Trimming and filtering\n" " --fastx_filter FILENAME trim and filter sequences in FASTA/FASTQ file\n" " --fastq_filter FILENAME trim and filter sequences in FASTQ file\n" " --reverse FILENAME FASTQ file with other end of paired-end reads\n" " Parameters\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_maxee REAL discard if expected error value is higher\n" " --fastq_maxee_rate REAL discard if expected error rate is higher\n" " --fastq_maxlen INT discard if length of sequence is longer\n" " --fastq_maxns INT discard if number of N's is higher\n" " --fastq_minlen INT discard if length of sequence is shorter\n" " --fastq_minqual INT discard if any base quality value lower (0)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --fastq_stripleft INT delete given number of bases from the 5' end\n" " --fastq_stripright INT delete given number of bases from the 3' end\n" " --fastq_truncee REAL truncate to given maximum expected error\n" " --fastq_truncee_rate REAL truncate to given maximum expected error rate\n" " --fastq_trunclen INT truncate to given length (discard if shorter)\n" " --fastq_trunclen_keep INT truncate to given length (keep if shorter)\n" " --fastq_truncqual INT truncate to given minimum base quality\n" " --maxsize INT discard if abundance of sequence is above\n" " --minsize INT discard if abundance of sequence is below\n" " Output\n" " --eeout include expected errors in output\n" " --fastaout FN FASTA filename for passed sequences\n" " --fastaout_discarded FN FASTA filename for discarded sequences\n" " 
--fastaout_discarded_rev FN FASTA filename for discarded reverse sequences\n" " --fastaout_rev FN FASTA filename for passed reverse sequences\n" " --fastqout FN FASTQ filename for passed sequences\n" " --fastqout_discarded FN FASTQ filename for discarded sequences\n" " --fastqout_discarded_rev FN FASTQ filename for discarded reverse sequences\n" " --fastqout_rev FN FASTQ filename for passed reverse sequences\n" " --relabel STRING relabel filtered sequences with given prefix\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel filtered sequences with md5 digest\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel filtered sequences with sha1 digest\n" " --sizeout include abundance information when relabelling\n" " --xee remove expected errors (ee) info from output\n" " --xsize strip abundance information in output\n" "\n" "UDB files\n" " --makeudb_usearch FILENAME make UDB file from given FASTA file\n" " --udb2fasta FILENAME output FASTA file from given UDB file\n" " --udbinfo FILENAME show information about UDB file\n" " --udbstats FILENAME report statistics about indexed words in UDB file\n" " Parameters\n" " --dbmask none|dust|soft mask db with dust, soft or no method (dust)\n" " --hardmask mask by replacing with N instead of lower case\n" " --wordlength INT length of words for database index 3-15 (8)\n" " Output\n" " --output FILENAME UDB or FASTA output file\n" ); }


/*
  Validate the options of the --allpairs_global command, then run it.

  At least one of the alignment-related output options must be set,
  and unless --acceptall is given the --id value must lie in the
  closed interval [0.0, 1.0]. Violations are reported through fatal().
*/
auto cmd_allpairs_global() -> void
{
  bool const output_requested =
    opt_alnout or opt_userout or opt_uc or opt_blast6out or
    opt_matched or opt_notmatched or opt_samout or opt_fastapairs;

  if (not output_requested) {
    fatal("No output files specified");
  }

  bool const id_is_fraction = (opt_id >= 0.0) and (opt_id <= 1.0);

  if ((not opt_acceptall) and (not id_is_fraction)) {
    fatal("Specify either --acceptall or --id with an identity from 0.0 to 1.0");
  }

  allpairs_global(cmdline, progheader);
}


/*
  Validate the options of the --usearch_global command, then run it.

  Requires at least one search output option, a database given with
  --db, and an --id value that is neither below 0.0 nor above 1.0.
  Violations are reported through fatal(), in that order.
*/
auto cmd_usearch_global() -> void
{
  bool const output_requested =
    opt_alnout or opt_userout or opt_uc or opt_blast6out or
    opt_matched or opt_notmatched or opt_dbmatched or opt_dbnotmatched or
    opt_samout or opt_otutabout or opt_biomout or opt_mothur_shared_out or
    opt_fastapairs or opt_lcaout;

  if (not output_requested) {
    fatal("No output files specified");
  }

  bool const database_given = opt_db;
  if (not database_given) {
    fatal("Database filename not specified with --db");
  }

  bool const id_too_low = opt_id < 0.0;
  bool const id_too_high = opt_id > 1.0;
  if (id_too_low or id_too_high) {
    fatal("Identity between 0.0 and 1.0 must be specified with --id");
  }

  usearch_global(cmdline, progheader);
}


/*
  Validate the options of the --search_exact command, then run it.

  Same output and database requirements as --usearch_global, but no
  identity threshold is checked.
*/
auto cmd_search_exact() -> void
{
  bool const output_requested =
    opt_alnout or opt_userout or opt_uc or opt_blast6out or
    opt_matched or opt_notmatched or opt_dbmatched or opt_dbnotmatched or
    opt_samout or opt_otutabout or opt_biomout or opt_mothur_shared_out or
    opt_fastapairs or opt_lcaout;

  if (not output_requested) {
    fatal("No output files specified");
  }

  bool const database_given = opt_db;
  if (not database_given) {
    fatal("Database filename not specified with --db");
  }

  search_exact(cmdline, progheader);
}


/*
  Validate the options of the --fastx_subsample command, then run it.

  A FASTA and/or FASTQ output file is required, and exactly one of
  --sample_pct and --sample_size must be greater than zero (giving
  both, or neither, is rejected).
*/
auto cmd_subsample(struct Parameters const & parameters) -> void
{
  bool const output_requested = opt_fastaout or opt_fastqout;
  if (not output_requested) {
    fatal("Specify output files for subsampling with --fastaout and/or --fastqout");
  }

  bool const percentage_given = opt_sample_pct > 0;
  bool const size_given = opt_sample_size > 0;
  if (percentage_given == size_given) {
    fatal("Specify either --sample_pct or --sample_size");
  }

  subsample(parameters);
}


/*
  Fallback when no command option was given: unless
  parameters.opt_quiet is set, print a short usage hint and a list of
  example commands to stderr.
*/
auto cmd_none(struct Parameters const & parameters) -> void { if (parameters.opt_quiet) { return ; } fprintf(stderr, "For more help, please enter: %s --help\n" "For further details, please consult the manual by entering: man vsearch\n" "\n" "Selected command examples:\n" "\n" "vsearch --allpairs_global FILENAME --id 0.5 --alnout FILENAME\n" "vsearch --cluster_size FILENAME --id 0.97 --centroids FILENAME\n" "vsearch --cut FILENAME --cut_pattern G^AATT_C --fastaout FILENAME\n" "vsearch --fastq_chars FILENAME\n" "vsearch --fastq_convert FILENAME 
--fastqout FILENAME --fastq_ascii 64\n" "vsearch --fastq_eestats FILENAME --output FILENAME\n" "vsearch --fastq_eestats2 FILENAME --output FILENAME\n" "vsearch --fastq_mergepairs FILENAME --reverse FILENAME --fastqout FILENAME\n" "vsearch --fastq_stats FILENAME --log FILENAME\n" "vsearch --fastx_filter FILENAME --fastaout FILENAME --fastq_trunclen 100\n" "vsearch --fastx_getseq FILENAME --label LABEL --fastaout FILENAME\n" "vsearch --fastx_mask FILENAME --fastaout FILENAME\n" "vsearch --fastx_revcomp FILENAME --fastqout FILENAME\n" "vsearch --fastx_subsample FILENAME --fastaout FILENAME --sample_pct 1\n" "vsearch --fastx_uniques FILENAME --fastaout FILENAME\n" "vsearch --makeudb_usearch FILENAME --output FILENAME\n" "vsearch --search_exact FILENAME --db FILENAME --alnout FILENAME\n" "vsearch --sff_convert FILENAME --output FILENAME --sff_clip\n" "vsearch --shuffle FILENAME --output FILENAME\n" "vsearch --sintax FILENAME --db FILENAME --tabbedout FILENAME\n" "vsearch --sortbylength FILENAME --output FILENAME\n" "vsearch --sortbysize FILENAME --output FILENAME\n" "vsearch --uchime_denovo FILENAME --nonchimeras FILENAME\n" "vsearch --uchime_ref FILENAME --db FILENAME --nonchimeras FILENAME\n" "vsearch --usearch_global FILENAME --db FILENAME --id 0.97 --alnout FILENAME\n" "\n" "Other commands: cluster_fast, cluster_smallmem, cluster_unoise, cut,\n" " derep_id, derep_fulllength, derep_prefix, derep_smallmem,\n" " fasta2fastq, fastq_filter, fastq_join, fastx_getseqs,\n" " fastx_getsubseq, maskfasta, orient, rereplicate, uchime2_denovo,\n" " uchime3_denovo, udb2fasta, udbinfo, udbstats, version\n" "\n", parameters.progname); }


/*
  Validate the options shared by the clustering commands, then run the
  one that was requested.

  At least one clustering output option is required. The --id range
  check is skipped for --cluster_unoise. The commands are tested in the
  order cluster_fast, cluster_smallmem, cluster_size, cluster_unoise and
  only the first one that is set is executed.
*/
auto cmd_cluster() -> void
{
  bool const output_requested =
    opt_alnout or opt_userout or opt_uc or opt_blast6out or
    opt_matched or opt_notmatched or opt_centroids or opt_clusters or
    opt_consout or opt_msaout or opt_samout or opt_profile or
    opt_otutabout or opt_biomout or opt_mothur_shared_out;

  if (not output_requested) {
    fatal("No output files specified");
  }

  bool const id_out_of_range = (opt_id < 0.0) or (opt_id > 1.0);
  if ((not opt_cluster_unoise) and id_out_of_range) {
    fatal("Identity between 0.0 and 1.0 must be specified with --id");
  }

  if (opt_cluster_fast) {
    cluster_fast(cmdline, progheader);
    return;
  }
  if (opt_cluster_smallmem) {
    cluster_smallmem(cmdline, progheader);
    return;
  }
  if (opt_cluster_size) {
    cluster_size(cmdline, progheader);
    return;
  }
  if (opt_cluster_unoise) {
    cluster_unoise(cmdline, progheader);
  }
}


/*
  Validate the options shared by the chimera detection commands, then
  call chimera().

  Checks, in order: an output option is set; --db is present when
  --uchime_ref is used; --abskew >= 1.0; --xn > 1; --dn > 0. The
  --mindiffs, --mindiv and --minh checks are applied only when neither
  --uchime2_denovo nor --uchime3_denovo was requested.
*/
auto cmd_chimera() -> void
{
  bool const output_requested =
    opt_chimeras or opt_nonchimeras or opt_uchimeout or opt_uchimealns;
  if (not output_requested) {
    fatal("No output files specified");
  }

  bool const reference_mode = opt_uchime_ref;
  bool const database_given = opt_db;
  if (reference_mode and (not database_given)) {
    fatal("Database filename not specified with --db");
  }

  if (opt_abskew < 1.0) {
    fatal("Argument to --abskew must be >= 1.0");
  }

  if (opt_xn <= 1.0) {
    fatal("Argument to --xn must be > 1");
  }

  if (opt_dn <= 0.0) {
    fatal("Argument to --dn must be > 0");
  }

  bool const uchime2_or_uchime3 = opt_uchime2_denovo or opt_uchime3_denovo;
  if (not uchime2_or_uchime3) {
    if (opt_mindiffs <= 0) {
      fatal("Argument to --mindiffs must be > 0");
    }

    if (opt_mindiv <= 0.0) {
      fatal("Argument to --mindiv must be > 0");
    }

    if (opt_minh <= 0.0) {
      fatal("Argument to --minh must be > 0");
    }
  }

  chimera();
}


/*
  Validate the options of the --fastq_mergepairs command, then run it.

  Requires a reverse reads file (--reverse), at least one output
  option, and a --fastq_maxdiffs value that is not below zero.
*/
auto cmd_fastq_mergepairs() -> void
{
  bool const reverse_given = opt_reverse;
  if (not reverse_given) {
    fatal("No reverse reads file specified with --reverse");
  }

  bool const output_requested =
    opt_fastqout or opt_fastaout or
    opt_fastqout_notmerged_fwd or opt_fastqout_notmerged_rev or
    opt_fastaout_notmerged_fwd or opt_fastaout_notmerged_rev or
    opt_eetabbedout;
  if (not output_requested) {
    fatal("No output files specified");
  }

  if (opt_fastq_maxdiffs < 0) {
    fatal("Argument to --fastq_maxdiffs must be positive");
  }

  fastq_mergepairs();
}


/*
  Format the program banner (name, version, architecture, total memory
  in GB and number of cores) into progheader; snprintf limits the
  result to max_line_length (80) bytes including the terminator.
*/
auto fillheader() -> void { static constexpr auto max_line_length = std::size_t{80}; constexpr static double one_gigabyte {1024 * 1024 * 1024}; snprintf(progheader, max_line_length, "%s v%s_%s, %.1fGB RAM, %ld cores", PROG_NAME, PROG_VERSION, PROG_ARCH, arch_get_memtotal() / one_gigabyte, 
arch_get_cores()); } auto getentirecommandline(int argc, char** argv) -> void { int len = 0; for (int i = 0; i < argc; i++) { len += strlen(argv[i]); } cmdline = (char *) xmalloc(len + argc); cmdline[0] = 0; for (int i = 0; i < argc; i++) { if (i > 0) { strcat(cmdline, " "); } strcat(cmdline, argv[i]); } } auto show_header() -> void { if (opt_quiet) { return ; } fprintf(stderr, "%s\n", progheader); fprintf(stderr, "https://github.com/torognes/vsearch\n"); fprintf(stderr, "\n"); } auto main(int argc, char** argv) -> int { fillheader(); struct Parameters parameters; getentirecommandline(argc, argv); cpu_features_detect(); args_init(argc, argv, parameters); if (parameters.opt_log != nullptr) { fp_log = fopen_output(opt_log); parameters.fp_log = fp_log; if (not fp_log) { fatal("Unable to open log file for writing"); } fprintf(fp_log, "%s\n", progheader); fprintf(fp_log, "%s\n", cmdline); char time_string[26]; time_start = time(nullptr); struct tm * tm_start = localtime(& time_start); strftime(time_string, 26, "%Y-%m-%dT%H:%M:%S", tm_start); fprintf(fp_log, "Started %s\n", time_string); } random_init(); show_header(); dynlibs_open(); #ifdef __x86_64__ if (not sse2_present) { fatal("Sorry, this program requires a cpu with SSE2."); } #endif if (parameters.opt_help) { cmd_help(parameters); } else if (opt_allpairs_global) { cmd_allpairs_global(); } else if (opt_usearch_global) { cmd_usearch_global(); } else if (parameters.opt_sortbysize) { sortbysize(parameters); } else if (parameters.opt_sortbylength) { sortbylength(parameters); } else if (parameters.opt_derep_fulllength) { derep(parameters, parameters.opt_derep_fulllength, false); } else if (parameters.opt_derep_prefix) { derep_prefix(parameters); } else if (parameters.opt_derep_smallmem) { derep_smallmem(parameters); } else if (parameters.opt_derep_id) { derep(parameters, parameters.opt_derep_id, true); } else if (parameters.opt_shuffle) { shuffle(parameters); } else if (parameters.opt_fastx_subsample) { 
/* Continuation of the command dispatch in main(): the branches below are part of one if / else-if chain, so only the first command whose option is set is executed, and cmd_none() runs when no command option is set. For --fastq_join, the quality padding string is replaced by alternative_quality_padding when the user did not set it and the FASTQ ASCII offset differs from default_ascii_offset. After the dispatch, if a log file is in use, the finishing time is taken (the statement continues on the following line). */
cmd_subsample(parameters); } else if (opt_maskfasta) { maskfasta(); } else if (opt_cluster_smallmem or opt_cluster_fast or opt_cluster_size or opt_cluster_unoise) { cmd_cluster(); } else if (opt_uchime_denovo or opt_uchime_ref or opt_uchime2_denovo or opt_uchime3_denovo or opt_chimeras_denovo) { cmd_chimera(); } else if (parameters.opt_fastq_chars) { fastq_chars(parameters); } else if (opt_fastq_stats) { fastq_stats(); } else if (opt_fastq_filter) { fastq_filter(); } else if (opt_fastx_filter) { fastx_filter(); } else if (opt_fastx_revcomp) { fastx_revcomp(); } else if (opt_search_exact) { cmd_search_exact(); } else if (opt_fastx_mask) { fastx_mask(); } else if (opt_fastq_convert) { fastq_convert(); } else if (opt_fastq_mergepairs) { cmd_fastq_mergepairs(); } else if (opt_fastq_eestats) { fastq_eestats(); } else if (opt_fastq_eestats2) { fastq_eestats2(); } else if (parameters.opt_fastq_join) { if ((not parameters.opt_join_padgapq_set_by_user) and (parameters.opt_fastq_ascii != default_ascii_offset)) { parameters.opt_join_padgapq = alternative_quality_padding; } fastq_join(parameters); } else if (parameters.opt_rereplicate) { rereplicate(parameters); } else if (parameters.opt_version) { cmd_version(parameters); } else if (opt_makeudb_usearch) { udb_make(); } else if (opt_udb2fasta) { udb_fasta(); } else if (opt_udbinfo) { udb_info(); } else if (opt_udbstats) { udb_stats(); } else if (opt_sintax) { sintax(); } else if (opt_sff_convert) { sff_convert(); } else if (opt_fastx_getseq) { fastx_getseq(); } else if (opt_fastx_getseqs) { fastx_getseqs(); } else if (opt_fastx_getsubseq) { fastx_getsubseq(); } else if (parameters.opt_cut) { cut(parameters); } else if (opt_orient) { orient(); } else if (parameters.opt_fasta2fastq) { fasta2fastq(parameters); } else if (parameters.opt_fastx_uniques) { derep(parameters, parameters.opt_fastx_uniques, false); } else { cmd_none(parameters); } if (parameters.opt_log) { time_finish = time(nullptr); struct tm * tm_finish = localtime(& 
time_finish); char time_string[26]; strftime(time_string, 26, "%Y-%m-%dT%H:%M:%S", tm_finish); fprintf(fp_log, "\n"); fprintf(fp_log, "Finished %s", time_string); double const time_diff = difftime(time_finish, time_start); fprintf(fp_log, "\n"); fprintf(fp_log, "Elapsed time %02.0lf:%02.0lf\n", floor(time_diff / 60.0), floor(time_diff - (60.0 * floor(time_diff / 60.0)))); double const maxmem = arch_get_memused() / 1048576.0; if (maxmem < 1024.0) { fprintf(fp_log, "Max memory %.1lfMB\n", maxmem); } else { fprintf(fp_log, "Max memory %.1lfGB\n", maxmem / 1024.0); } fclose(fp_log); } if (opt_ee_cutoffs_values) { xfree(opt_ee_cutoffs_values); } opt_ee_cutoffs_values = nullptr; xfree(cmdline); dynlibs_close(); } vsearch-2.30.0/src/vsearch.h000066400000000000000000000362551476012147200156410ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #define _GNU_SOURCE 1 #define __STDC_CONSTANT_MACROS 1 #define __STDC_FORMAT_MACROS 1 #define __STDC_LIMIT_MACROS 1 #define __restrict #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* include appropriate regex library */ #ifdef HAVE_REGEX_H #include #else #include #endif #include #include #include #include #include #include #include #define PROG_NAME PACKAGE #define PROG_VERSION PACKAGE_VERSION #ifdef __x86_64__ #define PROG_CPU "x86_64" #include #elif __PPC__ #ifdef __LITTLE_ENDIAN__ #define PROG_CPU "ppc64le" #include #undef bool #else #error Big endian ppc64 CPUs not supported #endif #elif __aarch64__ #define PROG_CPU "aarch64" #include #else #define PROG_CPU "simde" #define SIMDE_ENABLE_NATIVE_ALIASES #include #endif #ifdef _WIN32 #define PROG_OS "win" #include #include #include #define bswap_16(x) _byteswap_ushort(x) #define bswap_32(x) _byteswap_ulong(x) #define bswap_64(x) _byteswap_uint64(x) #elif __APPLE__ #define PROG_OS "macos" #include #include #include #define bswap_16(x) OSSwapInt16(x) #define bswap_32(x) OSSwapInt32(x) #define bswap_64(x) OSSwapInt64(x) #elif __linux__ #define PROG_OS "linux" #include #include #include #elif __FreeBSD__ #define PROG_OS "freebsd" #include #include #include #define bswap_16(x) bswap16(x) #define bswap_32(x) bswap32(x) #define bswap_64(x) bswap64(x) #elif __NetBSD__ #define PROG_OS "netbsd" #include #include #include #define bswap_16(x) bswap16(x) #define bswap_32(x) bswap32(x) #define bswap_64(x) bswap64(x) /* Alters behavior, but NetBSD 7 does not have getopt_long_only() */ #define getopt_long_only getopt_long #else #define PROG_OS "unknown" #include #include #include #endif #define PROG_ARCH PROG_OS "_" PROG_CPU #ifdef HAVE_DLFCN_H #include #endif #ifdef HAVE_ZLIB_H #include #endif #ifdef HAVE_BZLIB_H #include #endif #include "city.h" #include "sha1.h" #include 
"arch.h" #include "util.h" #include "xstring.h" #include "db.h" #include "linmemalign.h" #include "searchcore.h" #include "results.h" #include "cpu.h" #include "fastx.h" #include "fasta.h" #include "fastq.h" #include "dbhash.h" #include "kmerhash.h" /* options */ extern bool opt_bzip2_decompress; extern bool opt_clusterout_id; extern bool opt_clusterout_sort; extern bool opt_eeout; extern bool opt_fasta_score; extern bool opt_fastq_allowmergestagger; extern bool opt_fastq_eeout; extern bool opt_fastq_nostagger; extern bool opt_gzip_decompress; extern bool opt_label_substr_match; extern bool opt_lengthout; extern bool opt_n_mismatch; extern bool opt_no_progress; extern bool opt_quiet; extern bool opt_relabel_keep; extern bool opt_relabel_md5; extern bool opt_relabel_self; extern bool opt_relabel_sha1; extern bool opt_samheader; extern bool opt_sff_clip; extern bool opt_sintax_random; extern bool opt_sizein; extern bool opt_sizeorder; extern bool opt_sizeout; extern bool opt_xee; extern bool opt_xlength; extern bool opt_xsize; extern char * opt_allpairs_global; extern char * opt_alnout; extern char * opt_biomout; extern char * opt_blast6out; extern char * opt_borderline; extern char * opt_centroids; extern char * opt_chimeras; extern char * opt_chimeras_denovo; extern char * opt_cluster_fast; extern char * opt_cluster_size; extern char * opt_cluster_smallmem; extern char * opt_cluster_unoise; extern char * opt_clusters; extern char * opt_consout; extern char * opt_db; extern char * opt_dbmatched; extern char * opt_dbnotmatched; extern char * opt_eetabbedout; extern char * opt_fastaout; extern char * opt_fastaout_discarded; extern char * opt_fastaout_discarded_rev; extern char * opt_fastaout_notmerged_fwd; extern char * opt_fastaout_notmerged_rev; extern char * opt_fastaout_rev; extern char * opt_fastapairs; extern char * opt_fastq_convert; extern char * opt_fastq_eestats2; extern char * opt_fastq_eestats; extern char * opt_fastq_filter; extern char * 
opt_fastq_mergepairs; extern char * opt_fastq_stats; extern char * opt_fastqout; extern char * opt_fastqout_discarded; extern char * opt_fastqout_discarded_rev; extern char * opt_fastqout_rev; extern char * opt_fastqout_notmerged_fwd; extern char * opt_fastqout_notmerged_rev; extern char * opt_fastx_filter; extern char * opt_fastx_getseq; extern char * opt_fastx_getseqs; extern char * opt_fastx_getsubseq; extern char * opt_fastx_mask; extern char * opt_fastx_revcomp; extern char * opt_label; extern char * opt_label_suffix; extern char * opt_labels; extern char * opt_label_word; extern char * opt_label_words; extern char * opt_label_field; extern char * opt_lcaout; extern char * opt_log; extern char * opt_makeudb_usearch; extern char * opt_maskfasta; extern char * opt_matched; extern char * opt_mothur_shared_out; extern char * opt_msaout; extern char * opt_nonchimeras; extern char * opt_notmatched; extern char * opt_notmatchedfq; extern char * opt_orient; extern char * opt_otutabout; extern char * opt_output; extern char * opt_pattern; extern char * opt_profile; extern char * opt_qsegout; extern char * opt_relabel; extern char * opt_reverse; extern char * opt_samout; extern char * opt_sample; extern char * opt_search_exact; extern char * opt_sff_convert; extern char * opt_sintax; extern char * opt_tabbedout; extern char * opt_tsegout; extern char * opt_uc; extern char * opt_uchime2_denovo; extern char * opt_uchime3_denovo; extern char * opt_uchime_denovo; extern char * opt_uchime_ref; extern char * opt_uchimealns; extern char * opt_uchimeout; extern char * opt_udb2fasta; extern char * opt_udbinfo; extern char * opt_udbstats; extern char * opt_usearch_global; extern char * opt_userout; extern double * opt_ee_cutoffs_values; extern double opt_abskew; extern double opt_chimeras_diff_pct; extern double opt_dn; extern double opt_fastq_maxdiffpct; extern double opt_fastq_maxee; extern double opt_fastq_maxee_rate; extern double opt_fastq_truncee; extern double 
opt_fastq_truncee_rate; extern double opt_id; extern double opt_lca_cutoff; extern double opt_max_unmasked_pct; extern double opt_maxid; extern double opt_maxqt; extern double opt_maxsizeratio; extern double opt_maxsl; extern double opt_mid; extern double opt_min_unmasked_pct; extern double opt_mindiv; extern double opt_minh; extern double opt_minqt; extern double opt_minsizeratio; extern double opt_minsl; extern double opt_query_cov; extern double opt_sample_pct; extern double opt_sintax_cutoff; extern double opt_target_cov; extern double opt_unoise_alpha; extern double opt_weak_id; extern double opt_xn; extern int opt_acceptall; extern int opt_alignwidth; extern int opt_chimeras_length_min; extern int opt_chimeras_parents_max; extern int opt_chimeras_parts; extern int opt_cons_truncate; extern int opt_ee_cutoffs_count; extern int opt_gap_extension_query_interior; extern int opt_gap_extension_query_left; extern int opt_gap_extension_query_right; extern int opt_gap_extension_target_interior; extern int opt_gap_extension_target_left; extern int opt_gap_extension_target_right; extern int opt_gap_open_query_interior; extern int opt_gap_open_query_left; extern int opt_gap_open_query_right; extern int opt_gap_open_target_interior; extern int opt_gap_open_target_left; extern int opt_gap_open_target_right; extern int opt_length_cutoffs_increment; extern int opt_length_cutoffs_longest; extern int opt_length_cutoffs_shortest; extern int opt_mindiffs; extern int opt_slots; extern int opt_uchimeout5; extern int opt_usersort; extern int64_t opt_dbmask; extern int64_t opt_fasta_width; extern int64_t opt_fastq_ascii; extern int64_t opt_fastq_asciiout; extern int64_t opt_fastq_maxdiffs; extern int64_t opt_fastq_maxlen; extern int64_t opt_fastq_maxmergelen; extern int64_t opt_fastq_maxns; extern int64_t opt_fastq_minlen; extern int64_t opt_fastq_minmergelen; extern int64_t opt_fastq_minovlen; extern int64_t opt_fastq_minqual; extern int64_t opt_fastq_qmax; extern int64_t 
opt_fastq_qmaxout; extern int64_t opt_fastq_qmin; extern int64_t opt_fastq_qminout; extern int64_t opt_fastq_stripleft; extern int64_t opt_fastq_stripright; extern int64_t opt_fastq_trunclen; extern int64_t opt_fastq_trunclen_keep; extern int64_t opt_fastq_truncqual; extern int64_t opt_fulldp; extern int64_t opt_hardmask; extern int64_t opt_iddef; extern int64_t opt_idprefix; extern int64_t opt_idsuffix; extern int64_t opt_leftjust; extern int64_t opt_match; extern int64_t opt_maxaccepts; extern int64_t opt_maxdiffs; extern int64_t opt_maxgaps; extern int64_t opt_maxhits; extern int64_t opt_maxqsize; extern int64_t opt_maxrejects; extern int64_t opt_maxseqlength; extern int64_t opt_maxsize; extern int64_t opt_maxsubs; extern int64_t opt_maxuniquesize; extern int64_t opt_mincols; extern int64_t opt_minseqlength; extern int64_t opt_minsize; extern int64_t opt_mintsize; extern int64_t opt_minuniquesize; extern int64_t opt_minwordmatches; extern int64_t opt_mismatch; extern int64_t opt_notrunclabels; extern int64_t opt_output_no_hits; extern int64_t opt_qmask; extern int64_t opt_randseed; extern int64_t opt_rightjust; extern int64_t opt_rowlen; extern int64_t opt_sample_size; extern int64_t opt_self; extern int64_t opt_selfid; extern int64_t opt_strand; extern int64_t opt_subseq_start; extern int64_t opt_subseq_end; extern int64_t opt_threads; extern int64_t opt_top_hits_only; extern int64_t opt_topn; extern int64_t opt_uc_allhits; extern int64_t opt_wordlength; extern int64_t altivec_present; extern int64_t mmx_present; extern int64_t sse_present; extern int64_t sse2_present; extern int64_t sse3_present; extern int64_t ssse3_present; extern int64_t sse41_present; extern int64_t sse42_present; extern int64_t popcnt_present; extern int64_t avx_present; extern int64_t avx2_present; extern std::FILE * fp_log; constexpr auto tax_levels = 9; constexpr int64_t default_maxseqlength = 50000; constexpr int64_t default_ascii_offset = 33; constexpr char alternative_ascii_offset = 
64;  // completes the `constexpr char alternative_ascii_offset =` declaration begun on the previous line
constexpr int64_t default_max_quality = 41;
// NOTE(review): the template argument of std::numeric_limits appears to have
// been lost in extraction; given the constant's name and its use as an
// int64_t default below, presumably std::numeric_limits<int64_t> -- confirm
// against the upstream header.
constexpr auto int64_max = std::numeric_limits::max();
std::string const default_quality_padding = "IIIIIIII"; // Q40 with an offset of 33
std::string const alternative_quality_padding = "hhhhhhhh"; // Q40 with an offset of 64
std::string const default_sequence_padding = "NNNNNNNN";


// Parameters: plain aggregate holding option values and a little run state
// (progname, fp_log), every member carrying an in-class default.
//
// Many members share their name with one of the `extern` option globals
// declared earlier in this header (e.g. opt_fastaout, opt_log, opt_threads).
// Two of them differ in type from their global namesake: opt_notrunclabels
// and opt_strand are int64_t globals but bool members here.
// NOTE(review): presumably this struct is meant to be passed around in place
// of those globals -- confirm with the callers.
struct Parameters {
  // C-string options and std::string options; all pointers default to nullptr.
  char * opt_cut = nullptr;
  std::string opt_cut_pattern;
  char * opt_derep_fulllength = nullptr;
  char * opt_derep_id = nullptr;
  char * opt_derep_prefix = nullptr;
  char * opt_derep_smallmem = nullptr;
  char * opt_fasta2fastq = nullptr;
  char * opt_fastaout = nullptr;
  char * opt_fastaout_rev = nullptr;
  char * opt_fastaout_discarded = nullptr;
  char * opt_fastaout_discarded_rev = nullptr;
  char * opt_fastq_chars = nullptr;
  char * opt_fastq_join = nullptr;
  char * opt_fastqout = nullptr;
  char * opt_fastqout_rev = nullptr;
  char * opt_fastqout_discarded = nullptr;
  char * opt_fastqout_discarded_rev = nullptr;
  char * opt_fastx_subsample = nullptr;
  char * opt_fastx_uniques = nullptr;
  // Padding defaults: eight 'N' for the sequence, eight 'I' for the quality.
  std::string opt_join_padgap = default_sequence_padding;
  std::string opt_join_padgapq = default_quality_padding;
  char * opt_log = nullptr;
  char * opt_output = nullptr;
  char * opt_relabel = nullptr;
  char * opt_rereplicate = nullptr;
  char * opt_reverse = nullptr;
  char * opt_shuffle = nullptr;
  char * opt_sortbylength = nullptr;
  char * opt_sortbysize = nullptr;
  char * opt_tabbedout = nullptr;
  char * opt_uc = nullptr;
  char * progname = nullptr;
  std::FILE * fp_log = nullptr;

  // Numeric options.
  double opt_sample_pct = 0;
  int64_t opt_fastq_ascii = default_ascii_offset;
  int64_t opt_fastq_asciiout = default_ascii_offset;
  int64_t opt_fastq_qmaxout = default_max_quality;
  int64_t opt_fastq_qminout = 0;
  int64_t opt_fastq_tail = 4;
  int64_t opt_maxseqlength = default_maxseqlength;
  // The three "max"-style limits below default to the int64_max constant.
  int64_t opt_maxsize = int64_max;
  int64_t opt_maxuniquesize = int64_max;
  // NOTE(review): -1 is presumably a "not set by the user" sentinel -- confirm.
  int64_t opt_minseqlength = -1;
  int64_t opt_minsize = 0;
  int64_t opt_minuniquesize = 1;
  int64_t opt_randseed = 0;
  int64_t opt_sample_size = 0;
  int64_t opt_threads = 0;
  int64_t opt_topn = int64_max;

  // Boolean flags; all default to false.
  bool opt_fastq_qout_max = false;
  bool opt_help = false;
  bool opt_join_padgapq_set_by_user = false;
  bool opt_notrunclabels = false;
  bool opt_quiet = false;
  bool opt_sizein = false;
  bool opt_strand = false;
  bool opt_version = false;
  bool opt_xsize = false;
};
vsearch-2.30.0/src/xstring.h000066400000000000000000000101341476012147200156700ustar00rootroot00000000000000/*

  VSEARCH: a versatile open source tool for metagenomics

  Copyright (C) 2014-2024, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
  All rights reserved.

  Contact: Torbjorn Rognes ,
  Department of Informatics, University of Oslo,
  PO Box 1080 Blindern, NO-0316 Oslo, Norway

  This software is dual-licensed and available under a choice
  of one of two licenses, either under the terms of the GNU
  General Public License version 3 or the BSD 2-Clause License.


  GNU General Public License version 3

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see .


  The BSD 2-Clause License

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include // std::size_t, std::snprintf #include // std::strlen, std::strcpy static std::array empty_string = {""}; class xstring { char * string; std::size_t length; std::size_t alloc; public: xstring() { length = 0; alloc = 0; string = nullptr; } ~xstring() { if (alloc > 0) { xfree(string); } alloc = 0; string = nullptr; length = 0; } auto empty() -> void { length = 0; } auto get_string() -> char * { if (length > 0) { return string; } return empty_string.data(); } auto get_length() const -> std::size_t { return length; } auto add_c(char a_char) -> void { const std::size_t needed = 1; if (length + needed + 1 > alloc) { alloc = length + needed + 1; string = (char *) xrealloc(string, alloc); } string[length] = a_char; length += 1; string[length] = 0; } auto add_d(int a_number) -> void { auto const needed = snprintf(nullptr, 0, "%d", a_number); if (needed < 0) { fatal("snprintf failed"); } if (length + needed + 1 > alloc) { alloc = length + needed + 1; string = (char *) xrealloc(string, alloc); } std::snprintf(string + length, needed + 1, "%d", a_number); length += needed; } auto add_s(char * a_string) -> void { auto const needed = std::strlen(a_string); if (length + needed + 1 > alloc) { alloc = length + needed + 
1; string = (char *) xrealloc(string, alloc); } std::strcpy(string + length, a_string); length += needed; } };