Source code for bioconvert.bam2fastq

###########################################################################
# Bioconvert is a project to facilitate the interconversion               #
# of life science data from one format to another.                        #
#                                                                         #
# Copyright © 2018-2022  Institut Pasteur, Paris and CNRS.                #
#                                                                         #
# bioconvert is free software: you can redistribute it and/or modify      #
# it under the terms of the GNU General Public License as published by    #
# the Free Software Foundation, either version 3 of the License, or       #
# (at your option) any later version.                                     #
#                                                                         #
# bioconvert is distributed in the hope that it will be useful,           #
# but WITHOUT ANY WARRANTY; without even the implied warranty of          #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           #
# GNU General Public License for more details.                            #
#                                                                         #
# You should have received a copy of the GNU General Public License       #
# along with this program (COPYING file).                                 #
# If not, see <http://www.gnu.org/licenses/>.                             #
#                                                                         #
# Repository: https://github.com/bioconvert/bioconvert                    #
# Documentation: http://bioconvert.readthedocs.io                         #
###########################################################################
"""Convert :term:`BAM` format to :term:`FASTQ` foarmat"""
import os
import subprocess
from collections import Counter

from bioconvert import ConvBase
from bioconvert.core.decorators import requires
from bioconvert.core.utils import get_extension


[docs]class BAM2FASTQ(ConvBase):
    """Convert sorted :term:`BAM` file into :term:`FASTQ` file

    Methods available are based on samtools [SAMTOOLS]_ or bedtools [BEDTOOLS]_.

    .. warning:: Using the bedtools method, the R1 and R2 reads must be next to
        each other so that the reads are sorted similarly

    .. warning:: there is no guarantee that the R1/R2 output file are sorted
        similarly in paired-end case due to supp and second reads

    """

    #: Default value
    _default_method = "samtools"

    def __init__(self, infile, outfile):
        """.. rubric:: constructor

        :param str infile:
        :param str outfile:

        """
        super(BAM2FASTQ, self).__init__(infile, outfile)

    """@requires("bamtools")
    def __method_bamtools(self, *args, **kwargs):

        # this method contains supplementary reads and we don't know
        # what to do with them for now. So, this method is
        # commented. Indeed final R1 and R2 files will not be paired.

        cmd = "bamtools convert -format fastq -in {0} -out {1}".format(
            self.infile, self.outfile
        )
        self.execute(cmd)
    """

[docs]    @requires("bedtools")
    def _method_bedtools(self, *args, **kwargs):
        """Do the conversion :term:`BAM` -> :term:`Fastq` using bedtools

        `bedtools documentation <https://bedtools.readthedocs.io/en/latest/>`_"""
        outbasename = os.path.splitext(self.outfile)[0]

        cmd = "bedtools bamtofastq -i {} -fq {}".format(self.infile, self.outfile)
        self.execute(cmd)

        output_ext = get_extension(self.outfile, remove_compression=True)

        # Due to the IO, paired reads are not always consecutive.
        # So, checking the first and second reads for paired data does not
        # work. Instead, we check the first 10 reads and check whether we have
        # at least one paired data.

        data = []
        with open(self.outfile, "r") as fin:
            for i in range(40000):
                x = fin.readline()
                if len(x) == 0:
                    break
                data.append(x)

        isPaired = 2 in Counter([x for i, x in enumerate(data) if i % 4 == 0]).values()
        if isPaired:
            cmd = "bedtools bamtofastq -i {} -fq {}_1.{} -fq2 {}_2.{}".format(
                self.infile, outbasename, output_ext, outbasename, output_ext
            )
            self.execute(cmd)

            # Compress the output if required. We do not use compressor
            # since we may have two outputs.
            comp_ext = get_extension(self.outfile, remove_compression=False)
            if comp_ext in [".gz", ".dsrc", "bz2"]:
                from bioconvert.core.utils import compressor

                compressor("{}_1.{}".format(outbasename, output_ext), comp_ext)
                compressor("{}_2.{}".format(outbasename, output_ext), comp_ext)

[docs]    @requires("samtools")
    def _method_samtools(self, *args, **kwargs):
        """Do the conversion :term:`BAM` -> :term:`FASTQ` using samtools

        `SAMtools documentation <http://www.htslib.org/doc/samtools.html>`_"""
        # Test if input bam file is paired
        p = subprocess.Popen(
            "samtools view -c -f 1 {}".format(self.infile).split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        isPaired = p.communicate()[0].strip()

        # Collect the extension
        ext = os.path.splitext(self.outfile)[1]

        # FIXME: this compression code may be factorised ?

        output_ext = get_extension(self.outfile, remove_compression=True)

        # If the output file extension is compress extension
        if ext in [".gz", ".bz2", ".dsrc"]:
            outbasename = os.path.splitext(self.outfile)[0].split(".", 1)[0]

            if ext == ".gz":
                compresscmd = "gzip -f"
            elif ext == ".bz2":
                compresscmd = "pbzip2 -f"
            else:
                compresscmd = "dsrc c"

            # When the input file is not paired and the output file needs to be compressed
            if isPaired == "0":
                cmd = "samtools fastq {} > {}.{}".format(self.infile, outbasename, output_ext)
                self.execute(cmd)
                if ext == ".dsrc":
                    cmd = "{} {}.{} {}.{}.dsrc".format(compresscmd, outbasename, output_ext, outbasename, output_ext)

                else:
                    cmd = "{} {}.{}".format(compresscmd, outbasename, output_ext)
                self.execute(cmd)
            # When the input file is paired and the output file needs to be compressed
            else:

                cmd = "samtools fastq -1 {}_1.{} -2 {}_2.{} -n {} ".format(
                    outbasename, output_ext, outbasename, output_ext, self.infile
                )
                self.execute(cmd)
                if ext == ".dsrc":
                    cmd = "{} {}_1.{} {}_1.{}.dsrc".format(
                        compresscmd, outbasename, output_ext, outbasename, output_ext
                    )
                    self.execute(cmd)
                    cmd = "{} {}_2.{} {}_2.{}.dsrc".format(
                        compresscmd, outbasename, output_ext, outbasename, output_ext
                    )
                    self.execute(cmd)
                else:
                    cmd = "{} {}_1.{}".format(compresscmd, outbasename, output_ext)
                    self.execute(cmd)
                    cmd = "{} {}_2.{}".format(compresscmd, outbasename, output_ext)
                    self.execute(cmd)
        else:
            outbasename = os.path.splitext(self.outfile)[0]

            # When the input file is not paired
            if isPaired == "0":
                cmd = "samtools fastq {} > {}".format(self.infile, self.outfile)
                self.execute(cmd)
            # When the input file is paired
            else:
                # os.remove(self.outfile)
                cmd = "samtools fastq -1 {}_1.{} -2 {}_2.{} -n {} ".format(
                    outbasename, output_ext, outbasename, output_ext, self.infile
                )
                self.execute(cmd)