# Source code for bioconvert.scf2fasta

###########################################################################
# Bioconvert is a project to facilitate the interconversion               #
# of life science data from one format to another.                        #
#                                                                         #
# Copyright © 2018-2022  Institut Pasteur, Paris and CNRS.                #
#                                                                         #
# bioconvert is free software: you can redistribute it and/or modify      #
# it under the terms of the GNU General Public License as published by    #
# the Free Software Foundation, either version 3 of the License, or       #
# (at your option) any later version.                                     #
#                                                                         #
# bioconvert is distributed in the hope that it will be useful,           #
# but WITHOUT ANY WARRANTY; without even the implied warranty of          #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           #
# GNU General Public License for more details.                            #
#                                                                         #
# You should have received a copy of the GNU General Public License       #
# along with this program (COPYING file).                                 #
# If not, see <http://www.gnu.org/licenses/>.                             #
#                                                                         #
# Repository: https://github.com/bioconvert/bioconvert                    #
# Documentation: http://bioconvert.readthedocs.io                         #
###########################################################################

"""Convert :term:`SCF` file to :term:`FASTA` file"""

import colorlog

from bioconvert import ConvBase
from bioconvert.core.decorators import compressor, requires_nothing
from bioconvert.io import scf

# Module-level logger, named after this module.
_log = colorlog.getLogger(__name__)

# Public names exported by this module.
__all__ = ["SCF2FASTA"]


class SCF2FASTA(ConvBase):
    """
    Converts a binary SCF/ABI file to Fasta format.

    The FASTA header is built from the comments stored in the input file and
    the record body is the sequence; quality values are not written.

    :param str infile: input SCF/ABI file
    :param str outfile: output name file
    """

    #: Default value
    _default_method = "python"

    @requires_nothing
    @compressor
    def _method_python(self, *args, **kwargs):
        """Write the sequence read from ``self.infile`` as one FASTA record.

        The header line is ``>`` followed by the comments returned by
        :func:`scf.read_scf`, with newlines replaced by ``-`` and spaces
        replaced by ``_``. The sequence follows on the next line.

        ``*args`` and ``**kwargs`` are accepted but not used by this method.
        """
        # The reader also returns qualities; they are not used in the output.
        sequence, _qualities, comments = scf.read_scf(self.infile)

        # Build the header first so a failure here cannot leave a truncated
        # output file behind.
        header = comments.replace("\n", "-").replace(" ", "_")

        # Writing output file
        with open(self.outfile, "w") as output_file:
            output_file.write(">" + header + "\n")
            output_file.write(sequence + "\n")


"""
http://staden.sourceforge.net/manual/formats_unix_2.html
http://doc.bioperl.org/bioperl-live/Bio/SeqIO/scf.html#POD6
https://docs.python.org/2/library/struct.html
http://www.perlmonks.org/?node_id=224666

SCF file organisation (more or less)

Length in bytes                        Data
---------------------------------------------------------------------------
128                                    header
Number of samples * sample size        Samples for A trace
Number of samples * sample size        Samples for C trace
Number of samples * sample size        Samples for G trace
Number of samples * sample size        Samples for T trace
Number of bases * 4                    Offset into peak index for each base
Number of bases                        Accuracy estimate of each base being 'A'
Number of bases                        Accuracy estimate of each base being 'C'
Number of bases                        Accuracy estimate of each base being 'G'
Number of bases                        Accuracy estimate of each base being 'T'
Number of bases                        The called bases
Number of bases * 3                    Reserved for future use
Comments size                          Comments
Private data size                      Private data
"""