Source code for aspring.step_02_hmm_maker

import os
import sys
import subprocess  # library to execute bash command line in python script
import glob
import argparse

from aspring import __version__


[docs]def parse_args(args): parser = get_arg_parser() return parser.parse_args(args)
[docs]def get_arg_parser(): parser = argparse.ArgumentParser( description= 'STEP 2 : Generates a Hidden Markov Model (HMM) profile for each s-exon.' ) parser.add_argument('--gene', dest='geneName', type=str, required=True, help='name of gene') parser.add_argument( '--id', type=float, required=False, help='[0,100] maximum pairwise sequence identity (%%) (def=100)', default=100) parser.add_argument('--path_data', type=str, required=True, help='path to dir containing Thoraxe outputs') #parser.add_argument('--M', type=float, required=False, help='[0,100] columns with fewer than X%% gaps are match states (def=50)', default=50) #parser.add_argument('--qsc', type=float, required=False, help='[0,100] minimum score per column with query (def=0)', default=0) #parser.add_argument('--len', type=int, required=False, help='dont create profile for msa in which sequences are of length < X aa (def=5)', default=5) parser.add_argument( "--version", action="version", version=f"aspring {__version__}", ) return parser
[docs]def hmm_maker(gene, msa_id_threshold, path_data): """Create HMM profiles for all s-exons of a chosen gene""" msa_folder = f'{path_data}/{gene}/thoraxe/msa' files = glob.glob(f"{msa_folder}/*") profiles = [profile for profile in files if '.hhm' in profile] a2ms = [msa for msa in files if '.a2m' in msa] if len(profiles) > 0: #not == len(msas) because cases where we won't create profiles (len(msa)<5aa) print('all HMM profiles are already created') exit() # Creating HMM profiles for gene for msa in a2ms: path = '/'.join(msa.split('/')[:-1]) s_exon_name = msa.split( '/')[-1][:-4] #-6 because it removes .a2m suffix s_exon_id = s_exon_name[11:] hmm_out = path + '/' + f'{s_exon_name}.hhm' if os.path.isfile(hmm_out): print('hmm_out is already created') continue bashCommand = f"hhmake -i {msa} -o {hmm_out} -v 0 -name {s_exon_id} -id {msa_id_threshold} -M a2m" subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)
[docs]def run(): args = parse_args(sys.argv[1:]) gene = args.geneName # name of gene msa_id_threshold = args.id # maximum pairwise sequence identity #msa_gap_threshold = args.M # columns with fewer than X\% gaps are match states #msa_col_score = args.qsc # columns with fewer than X\% gaps are match states path_data = args.path_data #msa_len = args.len hmm_maker(gene, msa_id_threshold, path_data)
if __name__ == '__main__': run()