Source code for bioconvert.sam2paf

###########################################################################
# Bioconvert is a project to facilitate the interconversion               #
# of life science data from one format to another.                        #
#                                                                         #
# Copyright © 2018-2022  Institut Pasteur, Paris and CNRS.                #
#                                                                         #
# bioconvert is free software: you can redistribute it and/or modify      #
# it under the terms of the GNU General Public License as published by    #
# the Free Software Foundation, either version 3 of the License, or       #
# (at your option) any later version.                                     #
#                                                                         #
# bioconvert is distributed in the hope that it will be useful,           #
# but WITHOUT ANY WARRANTY; without even the implied warranty of          #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           #
# GNU General Public License for more details.                            #
#                                                                         #
# You should have received a copy of the GNU General Public License       #
# along with this program (COPYING file).                                 #
# If not, see <http://www.gnu.org/licenses/>.                             #
#                                                                         #
# Repository: https://github.com/bioconvert/bioconvert                    #
# Documentation: http://bioconvert.readthedocs.io                         #
###########################################################################

""""Convert :term:`CRAM` to :term:`BAM` format"""
import re

import colorlog

from bioconvert import ConvBase
from bioconvert.core.decorators import requires_nothing

logger = colorlog.getLogger(__name__)


[docs]class SAM2PAF(ConvBase):
    """Convert :term:`SAM` file to :term:`PAF` file

    The :term:`SAM` and :term:`PAF` formats are described in the :ref:`formats`
    section.

    Description:

    The header of the SAM file (lines starting with @) are dropped. However, the
    length of the target is retrieved from the @SQ line that must be present.


    Consider this SAM file with two alignements only. One is aligned on the
    target (first) while the other is not (indicated by the ``*`` characters)::

        @SQ	SN:ENA|K01711|K01711.1	LN:15894
        @PG	ID:minimap2	PN:minimap2	VN:2.5-r572	CL:minimap2 -a measles.fa Hm2_GTGAAA_L005_R1_001.fastq.gz
        HISEQ:426:C5T65ACXX:5:2302:1943:2127	0	ENA|K01711|K01711.1	448	60	101M	*	00	CTTACCTTCGCATCAAGAGGTACCAACATGGAGGATGAGGCGGACCAATACTTTTCACATGATGATCCAATTAGTAGTGATCAATCCAGGTTCGGATGGTT	BCCFFFFFHHHHHIIJJJJJJIIJJJJJJJJFHIHIJJJIJIIIIGHFFFFFFEEEEEEEDDDDDFDDDDDDDDD>CDDEDEEDDDDDDCCDDDDDDDDCD	NM:i:0	ms:i:202	AS:i:202	nn:i:0	tp:A:P	cm:i:14	s1:i:94	s2:i:0
        HISEQ:426:C5T65ACXX:5:2302:4953:2090	4	*	0	0	*	*	0	0	AGATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAAATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAAAAAAACAACCAAAAAGAGACGAACAA	CCCFFDDFAFFBHJHGGGIHIJBGGHIIJJJJJJJHGEIJGIFIIIHCBGHIJIIIIIJJHHHHEF@D@=;=,0)0&5&))+(((+((((&+(((()&&)(


    The equivalent PAF file is ::

        HISEQ:426:C5T65ACXX:5:2302:1943:2127	101	0	101	+	ENA|K01711|K01711.1	15894	447	548	101	101	60	NM:i:0	ms:i:202	AS:i:202	nn:i:0	tp:A:P	cm:i:14	s1:i:94	s2:i:0	cg:Z:101M

    In brief, the sequences are dropped. The final file is therefore smaller.
    Extra fields (starting from NM:i:0) can be dropped or kept using the
    keep_extra_field argument. Alignement with ``*`` characters are dropped.
    The first line (@SQ) is used to retrieve the length of the contigs that is
    stored in the PAF file (column 6).


    The 12 compulsary PAF fields are:

    ====== ======== ===========================================
    Col     Type    Description
    ====== ======== ===========================================
    1      string   Query sequence name
    2       int     Query sequence length
    3       int     Query start (0-based)
    4       int     Query end (0-based)
    5       char    Relative strand: "+" or "-"
    6      string   Target sequence name
    7       int     Target sequence length
    8       int     Target start on original strand (0-based)
    9       int     Target end on original strand (0-based)
    10      int     Number of residue matches
    11      int     Alignment block length
    12      int     Mapping quality (0-255; 255 for missing)
    ====== ======== ===========================================


    For developesr:

    Get the measles data from Sequana library (2 paired fastq files)::

        minimap2 measles.fa R1.fastq > approx-mapping.paf

    You can ask minimap2 to generate CIGAR at the cg tag of PAF with:

        minimap2 -c measles.fa R1.fastq > alignment.paf

    or to output alignments in the SAM format::

        minimap2 -a measles.fa R1.fastq > alignment.sam


    The SAM lines must contains 11 positional element and the NM:i and nn:i
    fields (see example above).

    """

    #: Default value
    _default_method = "python"

    def __init__(self, infile, outfile, *args, **kargs):
        """.. rubric:: constructor

        :param str infile: input SAM file
        :param str outfile: output PAF filename

        :reference: This function is a direct translation of
            https://github.com/lh3/miniasm/blob/master/misc/sam2paf.js (Dec.
            2017).

        """
        super(SAM2PAF, self).__init__(infile, outfile, *args, **kargs)

[docs]    @requires_nothing
    def _method_python(self, *args, **kwargs):
        """Internal method"""
        pattern = r"(\d+)([MIDSHNX=])"

        extra_fields = kwargs.get("extra_fields", "SAM")
        # TODO: what is this ?
        pri_only = kwargs.get("pri_only", True)

        skipped = 0
        reference_lengths = {}
        with open(self.infile, "r") as fin:
            with open(self.outfile, "w") as fout:
                for lineno, line in enumerate(fin.readlines()):

                    if line.startswith("@"):
                        if line.startswith("@SQ"):
                            match = re.findall(r"\tSN:(\S+)", line)
                            name = match[0] if len(match) else "unknown_reference"

                            match = re.findall(r"\tLN:(\d+)", line)
                            if len(match) == 1:
                                reference_lengths[name] = int(match[0])
                            else:
                                raise ValueError(
                                    "Could not parse SQ line to extract the length " "(LN: field missing maybe ?)"
                                )
                        continue

                    t = line.split()
                    flag = int(t[1])

                    if t[9] != "*" and t[10] != "*" and len(t[9]) != len(t[10]):
                        raise ValueError(
                            "ERROR at line "
                            + str(lineno)
                            + ":inconsistent SEQ and QUAL lengths - "
                            + str(len(t[9]))
                            + " != "
                            + str(len(t[10]))
                        )

                    if t[2] == "*" or (flag & 4):
                        skipped += 1
                        continue

                    # if flag is 256 and pri_only, we skip the alignment
                    if pri_only and (flag & 0x100):
                        continue

                    # Get the reference length for this alignment
                    if t[2] in reference_lengths:
                        tlen = reference_lengths[t[2]]
                    else:
                        raise KeyError("can't find the length of contig {}".format(t[2]))

                    # The reference is known but the length is not
                    if tlen == -1:
                        raise ValueError(
                            "ERROR at line " + str(lineno) + ": can't find the length of contig " + str(t[2])
                        )

                    # TODO explain what are the nn and NM tags
                    match = re.findall(r"\tnn:i:(\d+)", line)
                    if match:
                        nn = int(match[0])
                    else:
                        nn = 0

                    match = re.findall(r"\tNM:i:(\d+)", line)
                    if match:
                        NM = int(match[0])
                        have_NM = True
                    else:
                        NM = 0
                        have_NM = False
                    NM += nn

                    # See sequana.cigar for more information
                    clip = [0, 0]
                    I = [0, 0]  # Insertion
                    D = [0, 0]  # Deletion
                    M, N = 0, 0  # Matches
                    ql, tl, mm = (
                        0,
                        0,
                        0,
                    )
                    ext_cigar = False
                    n_cigar = 0

                    Zacc = ""
                    for count, letter in re.findall(pattern, t[5]):
                        l = int(count)
                        if letter == "M":
                            M += l
                            ql += l
                            tl += l
                            ext_cigar = False
                            Zacc += "{}M".format(count)
                        elif letter == "I":
                            I[0] += 1
                            I[1] += l
                            ql += l
                            Zacc += "{}I".format(count)
                        elif letter == "D":
                            D[0] += 1
                            D[1] += l
                            tl += l
                            Zacc += "{}D".format(count)
                        elif letter == "N":
                            N += l
                            tl += l
                        elif letter == "S":
                            clip[0 if M == 0 else 1] = l
                            ql += l
                        elif letter == "H":
                            clip[0 if M == 0 else 1] = l
                        elif letter == "=":
                            M += l
                            ql += l
                            tl += l
                            ext_cigar = True
                        elif letter == "X":
                            M += l
                            ql += l
                            tl += l
                            mm += l
                            ext_cigar = True
                        n_cigar += 1

                    prefix_msg = "at line {}: ".format(lineno) + "{}"

                    if n_cigar > 65535:
                        logger.warning(prefix_msg.format(str(n_cigar) + " CIGAR operations"))

                    if tl + int(t[3]) - 1 > tlen:
                        logger.warning(prefix_msg.format("alignment end " "position larger than ref length; skipped"))
                        continue

                    if t[9] != "*" and len(t[9]) != ql:
                        logger.warning(
                            prefix_msg.format(
                                " SEQ length inconsistent with CIGAR("
                                + str(len(t[9]))
                                + " != "
                                + str(ql)
                                + "); skipped"
                            )
                        )
                        continue

                    if have_NM is False or ext_cigar:
                        NM = I[1] + D[1] + mm

                    if NM < I[1] + D[1] + mm:
                        logger.warning(
                            prefix_msg.format(
                                " NM is less than the total number of gaps ("
                                + str(NM)
                                + " < "
                                + str(I[1] + D[1] + mm)
                                + ")"
                            )
                        )
                        NM = I[1] + D[1] + mm

                    # extra information to store in the PAF after the 12
                    # extra field from original code. The insert and
                    # deletions (io/in and do/di) and mm is the number of other
                    # substitutions ? NM -I[1] -D[1]
                    extra = [
                        "mm:i:" + str(NM - I[1] - D[1]),
                        "io:i:" + str(I[0]),
                        "in:i:" + str(I[1]),
                        "do:i:" + str(D[0]),
                        "dn:i:" + str(D[1]),
                    ]

                    match = M - (NM - I[1] - D[1])
                    blen = M + I[1] + D[1]
                    qlen = M + I[1] + clip[0] + clip[1]

                    # What does flag 16 means ?
                    if flag & 16:
                        qs = clip[1]
                        qe = qlen - clip[0]
                    else:
                        qs = clip[0]
                        qe = qlen - clip[1]

                    ts = int(t[3]) - 1
                    te = ts + M + D[1] + N

                    # WARNING: difference with sam2paf.js : we add and substract nn
                    # from match and blen to agree with the output SAM file
                    # generated by minimap2

                    # The 12 compulsary fields to have a valid PAF format
                    a = [
                        t[0],
                        qlen,
                        qs,
                        qe,
                        "-" if flag & 16 else "+",
                        t[2],
                        tlen,
                        ts,
                        te,
                        match + nn,
                        blen - nn,
                        t[4],
                    ]
                    # cast to string and save in file
                    a = [str(x) for x in a]

                    # What extra fields do we want to add ?
                    # original fields found in the SAM file ?
                    if extra_fields == "SAM" and len(t) > 11:
                        fout.write("\t".join(a) + "\t" + "\t".join(t[11:]) + "\tcg:Z:{}\n".format(Zacc))
                    elif extra_fields == "summary":
                        fout.write("\t".join(a) + "\t" + "\t".join(extra) + "\n")
                    elif extra_fields is None:
                        fout.write("\t".join(a) + "\n")

        self.skipped = skipped

    # @requires(external_binaries=["k8", "paftools"])
    # def _method_paftools(self, *args, **kwargs):
    #    cmd = "paftools sam2paf {} > {}".format(self.infile, self.outfile)
    #    self.execute(cmd)