# Source code for bioconvert.core.converter

###########################################################################
# Bioconvert is a project to facilitate the interconversion               #
# of life science data from one format to another.                        #
#                                                                         #
# Copyright © 2018-2022 Institut Pasteur, Paris and CNRS.                 #
#                                                                         #
# bioconvert is free software: you can redistribute it and/or modify      #
# it under the terms of the GNU General Public License as published by    #
# the Free Software Foundation, either version 3 of the License, or       #
# (at your option) any later version.                                     #
#                                                                         #
# bioconvert is distributed in the hope that it will be useful,           #
# but WITHOUT ANY WARRANTY; without even the implied warranty of          #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           #
# GNU General Public License for more details.                            #
#                                                                         #
# You should have received a copy of the GNU General Public License       #
# along with this program (COPYING file).                                 #
# If not, see <http://www.gnu.org/licenses/>.                             #
#                                                                         #
# Repository: https://github.com/bioconvert/bioconvert                    #
# Documentation: http://bioconvert.readthedocs.io                         #
###########################################################################
""".. rubric:: Standalone application dedicated to conversion"""
import os
import sys

import colorlog
from bioconvert.core.base import ConvMeta
from bioconvert.core.registry import Registry

_log = colorlog.getLogger(__name__)


from bioconvert.core.base import make_chain
from bioconvert.core.utils import get_extension as getext
from bioconvert.core.utils import get_format_from_extension

__all__ = ["Bioconvert"]

class Bioconvert(object):
    """Universal converter used by the standalone application.

    ::

        from bioconvert import Bioconvert
        c = Bioconvert("test.fastq", "test.fasta", threads=4, force=True)

    The instance is callable: calling it forwards the arguments to the
    underlying converter stored in :attr:`converter`.
    """

    def __init__(self, infile, outfile, force=False, threads=None, extra=None):
        """.. rubric:: constructor

        :param infile: The path of the input file (a string), or a list of
            input paths.
        :param outfile: The path of the output file (a string), or a list of
            output paths.
        :param bool force: overwrite output file if it exists already
            otherwise raises an error
        :param threads: if not None, assigned to the ``threads`` attribute
            of the converter instance.
        :param extra: if truthy, stored in the ``_extra_arguments`` attribute
            of the converter instance.
        :raises ValueError: if an output file exists and *force* is False.
        :raises IOError: if an output file ends with ``.dsrc`` but not with
            ``.fastq.dsrc``.
        :raises Exception: if neither a direct converter nor a chain of
            converters is found for the requested formats.
        """
        # A single path is accepted as well as a list of paths.
        if isinstance(outfile, str):
            outfile = [outfile]
        if isinstance(infile, str):
            infile = [infile]

        # some checking on the output files (existence, special case of dsrc)
        self._check_output_files(outfile, force)

        n_in = len(infile)
        n_out = len(outfile)

        # example: fastq.gz to fasta.bz2
        # Here, we want to decompress, convert, compress.
        # so we need the extension without .gz or .bz2
        self.inext = [getext(filename, remove_compression=True) for filename in infile]
        self.outext = [getext(filename, remove_compression=True) for filename in outfile]

        # special case one to one for compression/decompression
        # Case 2, fastq.gz to fastq.bz2
        # data is not changed, just the type of compression, so we want
        # to keep the original extensions, here inext and outext will contain
        # .gz and .bz2
        # if 1 to 1 and same extension, we overwrite self.inext and self.outext
        if n_in == n_out == 1 and self.inext == self.outext:
            _log.info("decompression/compression mode")
            self.inext = [getext(infile[0])]
            self.outext = [getext(outfile[0])]

        self.mapper = Registry()

        # If the name of a registered converter appears on the command line,
        # the formats are derived from that name; otherwise they are derived
        # from the file extensions. The lookup is done once, and the first
        # match in command-line order is used so that the choice is
        # deterministic.
        converter_names = set(self.mapper.get_converters_names())
        requested = [arg for arg in sys.argv if arg in converter_names]
        if requested:
            in_fmt, out_fmt = ConvMeta.split_converter_to_format(requested[0])
        else:
            # get format from extensions
            in_fmt = [get_format_from_extension(x) for x in self.inext]
            out_fmt = [get_format_from_extension(x) for x in self.outext]

        # NOTE: the format names are stored exactly as obtained above. The
        # previous code also built lower-cased lists, but overwrote them
        # immediately with these tuples, so no case conversion ever took
        # effect; that dead code has been removed.
        self.in_fmt = tuple(in_fmt)
        self.out_fmt = tuple(out_fmt)

        _log.info("Input: {}".format(self.in_fmt))
        _log.info("Output: {}".format(self.out_fmt))

        try:
            class_converter = self.mapper[(self.in_fmt, self.out_fmt)]
        except KeyError:
            # No direct converter is registered for this pair of formats.
            # Try to find a path of converters instead.
            conv_path = self.mapper.conversion_path(self.in_fmt, self.out_fmt)
            _log.debug("path: {}".format(conv_path))
            if not conv_path:
                msg = "Requested input format ('{}') to output format ('{}') is not available in bioconvert".format(
                    self.in_fmt,
                    self.out_fmt,
                )
                _log.critical(msg)
                _log.critical("Use --formats to know the available formats and --help for examples")
                raise Exception(msg)
            _log.info("Direct conversion not implemented. " "Chaining converters.")
            # make_chain is imported from bioconvert/core/base.py
            class_converter = make_chain([(pair, self.mapper[pair]) for pair in conv_path])
        else:
            # The name attribute is only set when a direct converter exists.
            self.name = class_converter.__name__

        # FIXME: hack for the compression/decompression decorators
        # A one-element list is passed to the converter as a plain value.
        if n_in == 1:
            infile = infile[0]
        if n_out == 1:
            outfile = outfile[0]

        self.converter = class_converter(infile, outfile)

        # If threads is provided, we update the threads attribute
        if threads is not None:
            self.converter.threads = threads
        if extra:
            self.converter._extra_arguments = extra

        _log.info("Using {} class (with {} threads if needed)".format(self.converter.name, self.converter.threads))

        # For the benchmarking only
        self.converter.others = {}

    @staticmethod
    def _check_output_files(outfile, force):
        """Validate the output paths before any conversion is set up.

        :param list outfile: the output paths to check.
        :param bool force: if exactly ``False``, an existing output file is
            an error; any other value only triggers a warning.
        :raises ValueError: if an output file exists and *force* is False.
        :raises IOError: if an output file ends with ``.dsrc`` but not with
            ``.fastq.dsrc``.
        """
        for filename in outfile:
            if os.path.exists(filename):
                msg = "output file {} exists already.".format(filename)
                # Identity check kept on purpose: only the literal False
                # forbids overwriting, as in the previous implementation.
                if force is False:
                    _log.critical("output file exists. If you are using bioconvert, use --force ")
                    raise ValueError(msg)
                _log.warning(msg + " --force used so will be over written")

            # Only fastq files can be compressed with dsrc
            if filename.endswith(".dsrc") and not filename.endswith(".fastq.dsrc"):
                msg = (
                    "When compressing with .dsrc extension, "
                    + "only files ending with .fastq extension are "
                    + "accepted. This is due to the way dsrc executable "
                    + "is implemented."
                )
                _log.critical(msg)
                # Attach the explanation to the exception so that callers
                # that catch it (or read a traceback) see the reason.
                raise IOError(msg)

    def __call__(self, *args, **kwargs):
        """Run the conversion.

        All positional and keyword arguments are forwarded to the converter
        instance created in the constructor. Nothing is returned.
        """
        self.converter(*args, **kwargs)