Source code for bioconvert.fasta2genbank

###########################################################################
# Bioconvert is a project to facilitate the interconversion               #
# of life science data from one format to another.                        #
#                                                                         #
# Copyright © 2018-2022  Institut Pasteur, Paris and CNRS.                #
#                                                                         #
# bioconvert is free software: you can redistribute it and/or modify      #
# it under the terms of the GNU General Public License as published by    #
# the Free Software Foundation, either version 3 of the License, or       #
# (at your option) any later version.                                     #
#                                                                         #
# bioconvert is distributed in the hope that it will be useful,           #
# but WITHOUT ANY WARRANTY; without even the implied warranty of          #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           #
# GNU General Public License for more details.                            #
#                                                                         #
# You should have received a copy of the GNU General Public License       #
# along with this program (COPYING file).                                 #
# If not, see <http://www.gnu.org/licenses/>.                             #
#                                                                         #
# Repository: https://github.com/bioconvert/bioconvert                    #
# Documentation: http://bioconvert.readthedocs.io                         #
###########################################################################
"""Convert :term:`FASTA` to :term:`GENBANK` format"""
import datetime
from math import floor, log

from bioconvert import ConvBase
from bioconvert.core.decorators import compressor, requires, requires_nothing
from bioconvert.io.fasta import Fasta

__all__ = ["FASTA2GENBANK"]


class FASTA2GENBANK(ConvBase):
    """Convert :term:`FASTA` file to :term:`GENBANK` file

    Methods available are based on squizz [SQUIZZ]_ or biopython
    [BIOPYTHON]_ or Bioconvert pure implementation (default).

    """

    # squizz works as well but keeps lower cases while
    # biopython uses upper cases

    #: Default value
    _default_method = "bioconvert"

    def __init__(self, infile, outfile, *args, **kargs):
        """.. rubric:: constructor

        :param str infile: input FASTA file
        :param str outfile: output GENBANK filename

        Any extra positional/keyword arguments are forwarded unchanged to
        the parent constructor.
        """
        super(FASTA2GENBANK, self).__init__(infile, outfile, *args, **kargs)

    @requires("squizz")
    @compressor
    def _method_squizz(self, *args, **kwargs):
        """Convert by running the external ``squizz`` command.

        Header is less informative than the one obtained with biopython.

        The command line redirects squizz's standard output to
        ``self.outfile`` and is handed to ``self.execute``.
        """
        # Local import: keeps this method self-contained without touching
        # the module-level import block.
        import shlex

        # Quote both paths so that file names containing spaces or shell
        # metacharacters are passed through as a single argument; ordinary
        # file names are left untouched by shlex.quote.
        cmd = "squizz -f fasta -c genbank {} > {} ".format(
            shlex.quote(str(self.infile)), shlex.quote(str(self.outfile))
        )
        self.execute(cmd)

    @requires(python_library="biopython")
    @compressor
    def _method_biopython(self, *args, **kwargs):
        """For this method we use the biopython package Bio.SeqIO.

        `Bio.SeqIO Documentation <https://biopython.org/docs/1.76/api/Bio.SeqIO.html>`_"""
        from Bio import SeqIO

        SeqIO.convert(self.infile, "fasta", self.outfile, "genbank", "DNA")

    # --- Pure python methods ---

    @requires_nothing
    def _method_bioconvert(self, *args, **kwargs):
        """Pure Python conversion (no external tool or library needed).

        For every record yielded by the FASTA reader, one record is written
        to ``self.outfile`` made of:

        * a ``LOCUS`` line holding the sequence id, its length and the
          current date,
        * a ``DEFINITION`` line holding the FASTA comment,
        * an ``ORIGIN`` section with 60 residues per line, split into
          groups of 10 and prefixed by the 1-based position of the first
          residue of the line, right-aligned on 9 columns,
        * the ``//`` record terminator.

        An empty sequence produces a record with a length of 0 and no
        sequence lines.
        """
        reader = Fasta(self.infile)
        # A single timestamp for the whole run, so every record of one
        # conversion carries the same date.
        now = datetime.datetime.now()

        with open(self.outfile, "w") as writer:
            for sequence in reader.read():
                residues = sequence["value"]
                seq_size = len(residues)

                # Count digits on the decimal string rather than with
                # floor(log(n, 10)) + 1: the latter is off by one for some
                # exact powers of ten (math.log(1000, 10) evaluates to
                # 2.9999999999999996) and raises ValueError when n == 0.
                num_digit = len(str(seq_size))

                # Sequence header. Id and length share a 28-column field,
                # with at least one separating space when the id is long.
                # NOTE(review): the date is written as unpadded numeric
                # day-month-year; GenBank records usually use DD-MON-YYYY
                # (e.g. 21-JUN-1999) -- confirm before changing the output.
                padding = " " * max(1, 28 - len(sequence["id"]) - num_digit)
                writer.write(
                    "LOCUS {}{}{} bp DNA XXX {}-{}-{}\n".format(
                        sequence["id"],
                        padding,
                        seq_size,
                        now.day,
                        now.month,
                        now.year,
                    )
                )
                writer.write("DEFINITION {}\n".format(sequence["comment"]))
                writer.write("ORIGIN \n")

                # Print sequence: 60 residues per line, in groups of 10.
                for line_start in range(0, seq_size, 60):
                    line_end = min(line_start + 60, seq_size)
                    # Slicing clamps at the end of the sequence, so the
                    # last group of the last line is simply shorter.
                    groups = [
                        residues[begin : begin + 10]
                        for begin in range(line_start, line_end, 10)
                    ]
                    # Line header is the 1-based index of the first residue,
                    # right-aligned on 9 columns (wider numbers are written
                    # in full, never truncated).
                    writer.write(
                        "{:>9} {}\n".format(line_start + 1, " ".join(groups))
                    )

                writer.write("//\n")