Source code for bioconvert.bam2fasta

###########################################################################
# Bioconvert is a project to facilitate the interconversion               #
# of life science data from one format to another.                        #
#                                                                         #
# Copyright © 2018-2022  Institut Pasteur, Paris and CNRS.                #
#                                                                         #
# bioconvert is free software: you can redistribute it and/or modify      #
# it under the terms of the GNU General Public License as published by    #
# the Free Software Foundation, either version 3 of the License, or       #
# (at your option) any later version.                                     #
#                                                                         #
# bioconvert is distributed in the hope that it will be useful,           #
# but WITHOUT ANY WARRANTY; without even the implied warranty of          #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           #
# GNU General Public License for more details.                            #
#                                                                         #
# You should have received a copy of the GNU General Public License       #
# along with this program (COPYING file).                                 #
# If not, see <http://www.gnu.org/licenses/>.                             #
#                                                                         #
# Repository: https://github.com/bioconvert/bioconvert                    #
# Documentation: http://bioconvert.readthedocs.io                         #
###########################################################################
"""Convert :term:`BAM` format to :term:`FASTA` format"""
import os
import subprocess

from bioconvert import ConvBase
from bioconvert.core.decorators import requires
from bioconvert.core.utils import get_extension


[docs]class BAM2FASTA(ConvBase): """Convert sorted :term:`BAM` file into :term:`FASTA` file Methods available are based on samtools [SAMTOOLS]_ or bedtools [BEDTOOLS]_. .. warning:: Using the bedtools method, the R1 and R2 reads must be next to each other so that the reads are sorted similarly .. warning:: there is no guarantee that the R1/R2 output file are sorted similarly in paired-end case due to supp and second reads """ #: Default value _default_method = "samtools" def __init__(self, infile, outfile): """.. rubric:: constructor :param str infile: BAM file :param str outfile: FASTA file """ super(BAM2FASTA, self).__init__(infile, outfile) """@requires("bamtools") def __method_bamtools(self, *args, **kwargs): # fastq are split on several lines (80 characters) # this method contains supplementary reads and we don't know # what to do with them for now. So, this method is # commented. Indeed final R1 and R2 files will not be paired. cmd = "bamtools stats -in '%s' | sed '12!d' | awk '{print $3}' " % (self.infile) ps = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,universal_newlines=True) isPaired = ps.communicate()[0].strip() # Collect the extension ext = os.path.splitext(self.outfile)[1] cmd = "bamtools convert -format fasta -in {} -out {}".format( self.infile, self.outfile) self.execute(cmd) """
[docs] @requires("samtools") def _method_samtools(self, *args, **kwargs): """ do the conversion :term:`BAM` -> :term:`FASTA` using samtools. .. note:: fasta are on one line `SAMtools documentation <http://www.htslib.org/doc/samtools.html>`_""" # Test if input bam file is paired p = subprocess.Popen( "samtools view -c -f 1 {}".format(self.infile).split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, ) isPaired = p.communicate()[0].strip() # Collect the extension ext = os.path.splitext(self.outfile)[1] # FIXME: this compression code may be factorised ? output_ext = get_extension(self.outfile, remove_compression=True) # If the output file extension is compress extension if ext in [".gz", ".bz2"]: outbasename = os.path.splitext(self.outfile)[0].split(".", 1)[0] if ext == ".gz": compresscmd = "gzip -f" elif ext == ".bz2": compresscmd = "pbzip2 -f" # When the input file is not paired and the output file needs to be compressed if isPaired == "0": cmd = "samtools fasta {} > {}.{}".format(self.infile, outbasename, output_ext) self.execute(cmd) cmd = "{} {}.{}".format(compresscmd, outbasename, output_ext) self.execute(cmd) # When the input file is paired and the output file needs to be compressed else: cmd = "samtools fasta -1 {}_1.{} -2 {}_2.{} -n {} ".format( outbasename, output_ext, outbasename, output_ext, self.infile ) self.execute(cmd) cmd = "{} {}_1.{}".format(compresscmd, outbasename, output_ext) self.execute(cmd) cmd = "{} {}_2.{}".format(compresscmd, outbasename, output_ext) self.execute(cmd) else: outbasename = os.path.splitext(self.outfile)[0] # When the input file is not paired if isPaired == "0": cmd = "samtools fasta {} > {}".format(self.infile, self.outfile) self.execute(cmd) # When the input file is paired else: cmd = "samtools fasta -1 {}_1.{} -2 {}_2.{} -n {} ".format( outbasename, output_ext, outbasename, output_ext, self.infile ) self.execute(cmd)