Source code for aspring.main

import os
import subprocess  # library to execute bash command line in python script
import argparse
import sys
import pandas as pd

from aspring import __version__

[docs]def parse_args(args):
    parser = get_arg_parser()
    return parser.parse_args(args)

[docs]def get_arg_parser():
    parser = argparse.ArgumentParser(
        description=
        'From Thoraxe outputs for a single query gene to its Alternative Splicing Repetitive Units'
    )
    parser.add_argument('--gene',
                        dest='geneName',
                        type=str,
                        required=True,
                        help='name of queried gene')
    parser.add_argument('--path_data',
                        type=str,
                        required=False,
                        help='path to dir containing ThorAxe outputs (default: %(default)s)',
                        default=os.getcwd())
    parser.add_argument(
        '--path_hhsuite_scripts',
        type=str,
        required=False,
        help='path to the folder containing the scripts of hhsuite, by default it uses the value of the environment variable HHSUITE_SCRIPTS if it exists',
        default=os.environ.get('HHSUITE_SCRIPTS', ''))  # default value is empty string
    parser.add_argument(
        '--len',
        type=int,
        required=False,
        help=
        "don't create profiles for msas in which sequences are of length < len aa (default: %(default)s)",
        default=5)
    parser.add_argument(
        '--id',
        type=float,
        required=False,
        help='[0.0,100.0] maximum pairwise sequence identity (%%) (default: %(default)s)',
        default=100.0)
    parser.add_argument(
        '--norealign',
        type=int,
        required=False,
        help=
        'bool, 1 if norealign else 0, do NOT realign displayed hits with Maximum Accuracy algorithm (MAC) (default: %(default)s)',
        default=1)
    parser.add_argument(
        '--glo_loc',
        type=int,
        required=False,
        help=
        'bool, 1 if global else 0, use global/local alignment mode for searching/ranking (default: %(default)s)',
        default=1)
    parser.add_argument(
        '--mact',
        type=float,
        required=False,
        help=
        '[0.0,1.0] posterior prob threshold for MAC realignment controlling greediness at alignment ends: 0:global >0.1:local (default: %(default)s)',
        default=0.0)
    parser.add_argument(
        '--id_pair',
        type=float,
        required=False,
        help=
        '[0.0,100.0] Identity percentage threshold between first sequence in msa of s-exon for each s-exon in a pair (default: %(default)s)',
        default=50.0
    )
    parser.add_argument(
        '--idCons_pair',
        type=float,
        required=False,
        help=
        '[0.0,100.0] Identity percentage threshold between consensus sequence of msa of s-exon for each s-exon in a pair (should be equal to id_pair) (default: %(default)s)',
        default=50.0
    )
    parser.add_argument(
        '--pval',
        type=float,
        required=False,
        help='[0.0,1.0] p-value threshold for HMM-HMM alignment of a s-exons pair  (default: %(default)s)',
        default=0.001)
    parser.add_argument(
        '--nbSpe',
        type=int,
        required=False,
        help='[1,10] minimum number of species in msa for s-exons in the pair  (default: %(default)s)',
        default=2)
    parser.add_argument(
        '--cov',
        type=float,
        required=False,
        help=
        '[0.0,1.0] Threshold for coverage of s-exon A and B in alignment of A and B (default: %(default)s)',
        default=0.8
    )
    parser.add_argument('--version',
                        action='version',
                        version=f'aspring {__version__}')
    return parser


[docs]def check_if_executable_exists(executable):
    try:
        subprocess.run([executable, '--help'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except FileNotFoundError:
        raise Exception(f'{executable} is not in the PATH. Please install it and try again.')


[docs]def run_pipeline(gene, path_data, path_hhsuite_scripts, msa_len,
                 msa_id_threshold, re_align, glo_loc, mact, id_pair,
                 idCons_pair, pval, nbSpe, cov):

    check_if_executable_exists('hhmake')
    check_if_executable_exists('hhalign')

    print("START STEP 1 : pre-processing : convert .fasta to .a2m")
    bashCommand = f"step_01_preprocess --gene {gene} --dataPATH {path_data} --path_hhsuite_scripts {path_hhsuite_scripts} --len {msa_len}"
    subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)
    print("END STEP 1")

    print("START STEP 2 : pre-processing : create HMM profiles")
    bashCommand = f"step_02_hmm_maker --gene {gene} --path_data {path_data} --id {msa_id_threshold}"
    subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)
    print("END STEP 2")

    print("START STEP 3 : all-to-all pairwise HMM-HMM profile alignments")
    bashCommand = f"step_03_hmm_aligner --gene {gene} --path_data {path_data} --id {msa_id_threshold} --norealign {re_align} --glo_loc {glo_loc} --mact {mact}"
    subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)
    print("END STEP 3")

    print("START STEP 4 : parse alignments and create the corresponding table")
    bashCommand = f"step_04_gettable --gene {gene} --path_data {path_data}"
    subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)
    print("END STEP 4")

    print("START STEP 5 : post-processing : filter the table")
    bashCommand = f"step_05_filter --gene {gene} --dataPATH {path_data} --id_pair {id_pair} --idCons_pair {idCons_pair} --pval {pval} --nbSpe {nbSpe} --cov {cov}"
    subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)
    print("END STEP 5")

    df = pd.read_csv(
        os.path.join(path_data, 'data', gene, gene + '_duplication_pairs_formated.csv'))
    if df.shape[0] == 0:
        print('There are no similar pairs of s-exons')
        sys.exit()

    print(
        "START STEP 6 : post-processing : determine the type of similar s-exons in the context of AS"
    )
    bashCommand = f"step_06_stats --gene {gene} --path_data {path_data}"
    subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)
    print("END STEP 6")

    print("START STEP 7 : post-processing : reformat table")
    bashCommand = f"step_07_reformat --gene {gene} --dataPATH {path_data}"
    subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)
    print("END STEP 7")

    print("START STEP 8 : create ASRUs and instances tables for query gene")
    bashCommand = f"step_08_ASRUs --gene {gene} --dataPATH {path_data}"
    subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)
    print("END STEP 8")

    print("START STEP 9 : remove temporary files")
    bashCommand = f"step_09_clean --gene {gene} --path_data {path_data}"
    subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)

    print("START FINAL STEP : map s-exons and ASRUs to the AlphaFold protein structures")
    bashCommand = f"step_10_struct --gene {gene} --dataPATH {path_data}"
    subprocess.run(bashCommand.split(), stdout=subprocess.PIPE)
    print("END")


[docs]def run():
    args = parse_args(sys.argv[1:])

    gene = args.geneName
    path_data = args.path_data
    path_hhsuite_scripts = args.path_hhsuite_scripts
    if not path_hhsuite_scripts:
        raise Exception('You need to specify the path to the hhsuite scripts using the --path_hhsuite_scripts argument or by setting the HHSUITE_SCRIPTS environment variable.')
    msa_len = args.len
    msa_id_threshold = args.id  # maximum pairwise sequence identity
    re_align = args.norealign
    glo_loc = args.glo_loc
    mact = args.mact
    id_pair = args.id_pair
    idCons_pair = args.idCons_pair
    pval = args.pval
    nbSpe = args.nbSpe
    cov = args.cov

    run_pipeline(gene, path_data, path_hhsuite_scripts, msa_len,
                 msa_id_threshold, re_align, glo_loc, mact, id_pair,
                 idCons_pair, pval, nbSpe, cov)


if __name__ == '__main__':
    run()
Source code for aspring.main

aspring

Navigation

Related Topics