# Source code for bioconvert.fasta2genbank

###########################################################################
# Bioconvert is a project to facilitate the interconversion               #
# of life science data from one format to another.                        #
#                                                                         #
# Copyright © 2018-2022  Institut Pasteur, Paris and CNRS.                #
#                                                                         #
# bioconvert is free software: you can redistribute it and/or modify      #
# it under the terms of the GNU General Public License as published by    #
# the Free Software Foundation, either version 3 of the License, or       #
# (at your option) any later version.                                     #
#                                                                         #
# bioconvert is distributed in the hope that it will be useful,           #
# but WITHOUT ANY WARRANTY; without even the implied warranty of          #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           #
# GNU General Public License for more details.                            #
#                                                                         #
# You should have received a copy of the GNU General Public License       #
# along with this program (COPYING file).                                 #
# If not, see <http://www.gnu.org/licenses/>.                             #
#                                                                         #
# Repository: https://github.com/bioconvert/bioconvert                    #
# Documentation: http://bioconvert.readthedocs.io                         #
###########################################################################
"""Convert :term:`FASTA` to :term:`GENBANK` format"""
import datetime
from math import floor, log

from bioconvert import ConvBase
from bioconvert.core.decorators import compressor, requires, requires_nothing
from bioconvert.io.fasta import Fasta

__all__ = ["FASTA2GENBANK"]


class FASTA2GENBANK(ConvBase):
    """Convert :term:`FASTA` file to :term:`GENBANK` file

    Methods available are based on squizz [SQUIZZ]_ or biopython [BIOPYTHON]_ or
    Bioconvert pure implementation (default).

    """

    # squizz works as well but keeps lower-case letters, whereas
    # biopython writes upper-case letters.
    #: Default value
    _default_method = "bioconvert"

    def __init__(self, infile, outfile, *args, **kargs):
        """.. rubric:: constructor

        :param str infile: input FASTA file
        :param str outfile: output GENBANK filename

        Any extra positional and keyword arguments are forwarded unchanged
        to the parent class constructor.
        """
        super(FASTA2GENBANK, self).__init__(infile, outfile, *args, **kargs)

    @requires("squizz")
    @compressor
    def _method_squizz(self, *args, **kwargs):
        """Convert by running the external ``squizz`` command.

        Header is less informative than the one obtained with biopython.

        The command reads ``self.infile`` and redirects squizz's standard
        output into ``self.outfile``. ``*args`` and ``**kwargs`` are accepted
        but not used.
        """
        # Local import keeps this block self-contained.
        import shlex

        # The command line relies on shell redirection (">"), so both paths
        # are quoted to survive spaces and shell metacharacters.
        cmd = "squizz -f fasta -c genbank  {} > {} ".format(
            shlex.quote(str(self.infile)), shlex.quote(str(self.outfile))
        )
        self.execute(cmd)

    @requires(python_library="biopython")
    @compressor
    def _method_biopython(self, *args, **kwargs):
        """For this method we use the biopython package Bio.SeqIO.

        `Bio.SeqIO Documentation <https://biopython.org/docs/1.76/api/Bio.SeqIO.html>`_

        ``*args`` and ``**kwargs`` are accepted but not used.
        """
        from Bio import SeqIO

        # NOTE(review): the fifth positional argument ("DNA") is presumably the
        # molecule type expected by recent Biopython releases; older releases
        # used that slot for an alphabet object -- confirm against the
        # Biopython version the project pins.
        SeqIO.convert(self.infile, "fasta", self.outfile, "genbank", "DNA")

    # --- Pure python methods ---

    @requires_nothing
    def _method_bioconvert(self, *args, **kwargs):
        """Internal method: pure-Python FASTA to GENBANK writer.

        For every record yielded by ``Fasta(self.infile).read()`` this writes
        to ``self.outfile``:

        * a ``LOCUS`` line with the record id, the sequence length in bp and
          the current date,
        * a ``DEFINITION`` line holding the record comment,
        * an ``ORIGIN`` section with 60 residues per line, split into groups
          of 10 and prefixed by the 1-based position of the first residue,
          right-aligned on 9 columns,
        * the ``//`` record terminator.

        Records with an empty sequence produce a header and terminator with
        no sequence lines. ``*args`` and ``**kwargs`` are accepted but not
        used.
        """
        reader = Fasta(self.infile)

        # The date does not depend on the record: compute it once.
        # NOTE(review): it is written as non-padded numeric day-month-year,
        # which differs from the usual GenBank DD-MMM-YYYY form; kept as is
        # to leave the output unchanged for existing consumers.
        now = datetime.datetime.now()

        with open(self.outfile, "w") as writer:
            for sequence in reader.read():
                residues = sequence["value"]
                seq_size = len(residues)
                # Count digits on the decimal text rather than with
                # floor(log(n, 10)) + 1: the logarithm fails on 0 (empty
                # sequence) and can round down for exact powers of ten
                # (e.g. 1000), which shifted the LOCUS padding by one column.
                num_digit = len(str(seq_size))

                # Sequence header
                writer.write(
                    "LOCUS       {}{}{} bp DNA              XXX {}-{}-{}\n".format(
                        sequence["id"],
                        # Always keep at least one space between id and size.
                        " " * (max(1, 28 - len(sequence["id"]) - num_digit)),
                        seq_size,
                        now.day,
                        now.month,
                        now.year,
                    )
                )
                writer.write("DEFINITION  {}\n".format(sequence["comment"]))
                writer.write("ORIGIN      \n")

                # Sequence body: 60 residues per line in groups of 10.
                for seq_idx in range(0, seq_size, 60):
                    line = residues[seq_idx : seq_idx + 60]
                    groups = [line[i : i + 10] for i in range(0, len(line), 10)]
                    # Line header is the 1-based index of the first residue.
                    writer.write(
                        "{:>9} {}\n".format(seq_idx + 1, " ".join(groups))
                    )
                writer.write("//\n")