Source code for aspring.step_03_hmm_aligner

import os
import sys
import subprocess  # library to execute bash command line in python script
import glob
import itertools
import argparse
import datetime

from aspring import __version__


[docs]def parse_args(args): parser = get_arg_parser() return parser.parse_args(args)
[docs]def get_arg_parser(): parser = argparse.ArgumentParser( description= 'STEP 3 : HMM-HMM alignment of all the s-exons combinations.' ) parser.add_argument('--gene', dest='geneName', type=str, required=True, help='name of gene') parser.add_argument( '--id', type=float, required=False, help= '[0,100] maximum pairwise sequence identity (%%) applied to query MSA, template MSA, and result MSA (def=100)', default=100) parser.add_argument('--path_data', type=str, required=True, help='path to dir containing Thoraxe outputs') parser.add_argument( '--norealign', type=int, required=True, help= 'bool, 1 if norealign else 0, do NOT realign displayed hits with Maximum Accuracy algorithm (MAC) (def=0)', default=0) parser.add_argument( '--glo_loc', type=int, required=True, help= 'bool, 1 if global else 0, use global/local alignment mode for searching/ranking (def=local)', default=0) parser.add_argument( '--mact', type=float, required=True, help= '[0,1[ posterior prob threshold for MAC realignment controlling greediness at alignment ends: 0:global >0.1:local (default=0.35)', default=0.35) parser.add_argument( "--version", action="version", version=f"aspring {__version__}", ) return parser
[docs]def hmm_aligner(gene, msa_id_threshold, path_data, re_align, glo_loc, mact): """ HMM-HMM profile alignment of all s-exons combinations for a chosen gene """ print('start', datetime.datetime.now().time()) dico_gloloc = {0: '-local', 1: '-global'} dico_align = {0: '', 1: '-norealign'} msa_folder = f'{path_data}/{gene}/thoraxe/msa' profiles = glob.glob(f"{msa_folder}/*.hhm") #profiles=[profile for profile in files if '.hhm' in profile] if len(profiles) == 0: print('You need to generate the HMM profiles !!') exit() aln_folder = f'{path_data}/DupRaw' #where the alignments will be located gene_aln_folder = aln_folder + '/' + gene os.makedirs( gene_aln_folder, exist_ok=True ) #make gene_dir in DupRaw, if folder already exist don't do anything for (hmm_a, hmm_b) in itertools.combinations(profiles, 2): name_a = hmm_a.split('/')[-1].rstrip('.hhm') name_b = hmm_b.split('/')[-1].rstrip('.hhm') id_a, id_b = name_a[11:], name_b[11:] #name is msa_s_exon_X hhr_out = gene_aln_folder + '/' + id_a + '.' + id_b + '.hhr' bashCommand = f'hhalign -i {hmm_a} -t {hmm_b} -o {hhr_out} -v 0 -id {msa_id_threshold} {dico_align[re_align]} {dico_gloloc[glo_loc]} -mact {mact}' if not os.path.exists(hhr_out): subprocess.run(bashCommand.split(), stdout=subprocess.PIPE) print(gene, 'end', datetime.datetime.now().time())
[docs]def run(): args = parse_args(sys.argv[1:]) gene = args.geneName msa_id_threshold = args.id path_data = args.path_data re_align = args.norealign glo_loc = args.glo_loc mact = args.mact hmm_aligner(gene, msa_id_threshold, path_data, re_align, glo_loc, mact)
if __name__ == '__main__': run()