Source code for aspring.step_01_preprocess
import os
import sys
import glob
import subprocess # library to execute bash command line in python script
import argparse
from aspring import __version__
[docs]def parse_args(args):
parser = get_arg_parser()
return parser.parse_args(args)
[docs]def get_arg_parser():
parser = argparse.ArgumentParser(
description=
'STEP 1 : Reformat s-exons fasta files to a2m'
)
parser.add_argument('--gene',
dest='geneName',
type=str,
required=True,
help='name of gene')
parser.add_argument('--dataPATH',
type=str,
required=True,
help='path to dir containing Thoraxe outputs')
parser.add_argument(
'--path_hhsuite_scripts',
type=str,
required=True,
help='path to the folder containing the scripts of hhsuite')
parser.add_argument(
'--len',
type=int,
required=False,
help=
'dont create profile for msa in which sequences are of length < X aa (def=5)',
default=5)
parser.add_argument(
"--version",
action="version",
version=f"aspring {__version__}",
)
return parser
[docs]def preprocess_msas(gene, path_data, path_hhsuite, msa_len):
msa_folder = os.path.join(path_data, gene, 'thoraxe', 'msa')
files = glob.glob(f"{msa_folder}/*")
msas = [msa for msa in files if '.fasta' in msa]
a2ms = [file for file in files if '.a2m' in file]
if len(a2ms) > 0:
#not == len(msas) because cases where we won't create profiles (len(msa)<5aa)
print('all MSAs are already converted')
exit()
for msa in msas:
f = open(msa, 'r')
beginning = [next(f).rstrip('\n') for x in range(2)]
#get first sequence of msa (['>specie1', 'seq'])
f.close()
first_seq = beginning[1]
if len(first_seq) < msa_len:
#don't make profile for msa where len(seq) < 5 aa or user defined value
continue
else:
s_exon_name = msa.split('/')[-1].rstrip('.fasta')
a2m_file = os.path.join(path_data, gene, 'thoraxe', 'msa',
f"{s_exon_name}.a2m")
bashCommand = f"{os.path.join(path_hhsuite, 'reformat.pl')} {msa} {a2m_file}"
subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)
[docs]def run():
"""Entry point for the application script"""
args = parse_args(sys.argv[1:])
gene = args.geneName
path_data = args.dataPATH
path_hhsuite = args.path_hhsuite_scripts
msa_len = args.len
preprocess_msas(gene, path_data, path_hhsuite, msa_len)
if __name__ == "__main__":
run()