###########################################################################
# Bioconvert is a project to facilitate the interconversion #
# of life science data from one format to another. #
# #
# Copyright © 2018-2022 Institut Pasteur, Paris and CNRS. #
# #
# bioconvert is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# bioconvert is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program (COPYING file). #
# If not, see <http://www.gnu.org/licenses/>. #
# #
# Repository: https://github.com/bioconvert/bioconvert #
# Documentation: http://bioconvert.readthedocs.io #
###########################################################################
"""Convert :term:`SRA` format to :term:`FASTA` format"""
import os
import shutil
import subprocess
import tempfile
from bioconvert import ConvBase
from bioconvert.core.decorators import requires
class SRA2FASTQ(ConvBase):
    """Download FASTQ from SRA archive

    ::

        bioconvert sra2fastq ERR043367

    This may take some time since the files are downloaded from SRA website.

    """

    #: Default value
    _default_method = "fastq_dump"

    # If test: will take only the first 10 reads from the sra file
    def __init__(self, infile, outfile, test=False):
        """.. rubric:: constructor

        :param str infile: path to a local SRA file; if no such file exists,
            its basename (without extension) is used as an SRA identifier
        :param str outfile: name of the output FASTQ file; a trailing
            ``.gz`` extension requests gzip-compressed output
        :param bool test: if True, only the first 10 reads are retrieved

        https://edwards.flinders.edu.au/fastq-dump/

        library used: sra-toolkit
        """
        super().__init__(infile, outfile)
        self.test = test

    @requires("fastq-dump")
    def _method_fastq_dump(self, *args, **kwargs):
        """Uses Sratoolkit (fastq-dump) to convert a sra file to fastq

        The reads are first dumped into a temporary directory and then moved
        to their final location. For paired data, two files are created,
        named after the output file without its extension(s) and suffixed
        with ``_1.fastq`` and ``_2.fastq`` (plus ``.gz`` if requested). For
        single-end data, the result is written to the output file itself.

        `Fastq-dump documentation <https://edwards.flinders.edu.au/fastq-dump/>`_"""
        # Local import: shlex is not part of the module-level imports.
        import shlex

        inname = os.path.split(os.path.splitext(self.infile)[0])[1]
        outbasename, ext = os.path.splitext(self.outfile)

        compresscmd = ""
        gzext = ""
        if ext == ".gz":
            compresscmd = "--gzip"
            gzext = ".gz"
            # Strip the remaining extension as well (e.g. ".fastq" of ".fastq.gz")
            outbasename = os.path.splitext(outbasename)[0]

        infile = self.infile
        # If the file does not exist locally, we take the basename
        # it should correspond to a SRA ID
        if not os.path.isfile(infile):
            infile = inname

        # If in test mode, we retrieve only 10 reads from sra
        testcmd = "-X 10" if self.test else ""

        tmpdir = tempfile.mkdtemp()
        try:
            # Quote the paths so that names containing spaces or shell
            # metacharacters are passed to fastq-dump as single arguments.
            quoted_tmpdir = shlex.quote(tmpdir)
            quoted_infile = shlex.quote(infile)

            if self.isPairedSRA(infile):
                cmd = "fastq-dump {} {} --split-files -O {} {}".format(
                    testcmd, compresscmd, quoted_tmpdir, quoted_infile
                )
                self.execute(cmd)
                # One file per mate: <inname>_1.fastq and <inname>_2.fastq
                for mate in ("1", "2"):
                    produced = os.path.join(
                        tmpdir, "{}_{}.fastq{}".format(inname, mate, gzext)
                    )
                    target = "{}_{}.fastq{}".format(outbasename, mate, gzext)
                    shutil.move(produced, target)
            else:
                cmd = "fastq-dump {} {} -O {} {}".format(
                    testcmd, compresscmd, quoted_tmpdir, quoted_infile
                )
                self.execute(cmd)
                produced = os.path.join(tmpdir, "{}.fastq{}".format(inname, gzext))
                shutil.move(produced, self.outfile)
        finally:
            # Always remove the temporary directory, even if a step failed;
            # ignore cleanup errors so they do not mask the original one.
            shutil.rmtree(tmpdir, ignore_errors=True)

    def isPairedSRA(self, filename):
        """Tell whether an SRA entry contains paired reads

        Runs ``fastq-dump -X 1 -Z --split-spot`` on *filename* and counts the
        lines written to standard output: 4 lines are interpreted as a single
        read, 8 lines as a pair of reads.

        :param str filename: local SRA file or SRA identifier
        :return: True if 8 lines were produced, False if 4 lines were produced
        :raises Exception: if fastq-dump exits with a non-zero status or
            produces any other number of lines
        """
        try:
            contents = subprocess.check_output(
                ["fastq-dump", "-X", "1", "-Z", "--split-spot", filename]
            )
        except subprocess.CalledProcessError as err:
            raise Exception(
                "Error running fastq-dump on {}".format(filename)
            ) from err

        nlines = contents.count(b"\n")
        if nlines == 4:
            return False
        if nlines == 8:
            return True
        raise Exception(
            "Unexpected output from fastq-dump on {}".format(filename)
        )