Source code for aspring.step_05_filter

import argparse
import pandas as pd
import sys

from aspring import __version__


[docs]def parse_args(args): parser = get_arg_parser() return parser.parse_args(args)
[docs]def get_arg_parser(): parser = argparse.ArgumentParser( description='Filter the table to keep gene duplication pairs based on identity, coverage, p-value and number of species in the MSAs', ) parser.add_argument('--gene', dest='geneName', type=str, required=True, help='name of gene') parser.add_argument('--dataPATH', type=str, required=True, help='path to dir containing Thoraxe outputs') parser.add_argument( '--id_pair', type=float, required=True, help= 'Identity percentage threshold between first sequence in msa of s-exon for each s-exon in a pair' ) parser.add_argument( '--idCons_pair', type=float, required=True, help= 'Identity percentage threshold between consensus sequence of msa of s-exon for each s-exon in a pair' ) parser.add_argument( '--pval', type=float, required=True, help='p-value threshold for HMM-HMM alignment of a s-exons pair') parser.add_argument( '--nbSpe', type=float, required=True, help='minimum number of species in msa for s-exons in the pair') parser.add_argument( '--cov', type=float, required=True, help='Threshold for coverage of s-exon A and B in alignment of A and B' ) parser.add_argument( "--version", action="version", version=f"aspring {__version__}", ) return parser
[docs]def filter(gene, path_data, id_pair, idCons_pair, pval, nbSpe, cov): intra_gene = pd.read_csv(path_data + f'/data/{gene}/{gene}_duplication_pairs.csv') #filtrer la table intra_gene_filtered = intra_gene.loc[ (intra_gene['Identities'] >= id_pair) & (intra_gene['IdCons'] >= idCons_pair) & (intra_gene['P-value'] <= pval) & (intra_gene['NoSpecies_Q'] >= nbSpe) & (intra_gene['NoSpecies_T'] >= nbSpe)] #!!!keep pairs where the alignement cover at least 80% of one of the s-exons ind_table = [] for ind in intra_gene_filtered.index: row = intra_gene.iloc[ind] colsQ = int(row[7].split('-')[1]) - int(row[7].split('-')[0]) + 1 coverageQ = colsQ / int(row[9]) colsT = int(row[8].split('-')[1]) - int(row[8].split('-')[0]) + 1 coverageT = colsT / int(row[10]) if max(coverageQ, coverageT) >= 0.8: ind_table.append(ind) intra_gene = pd.DataFrame([intra_gene.iloc[ind] for ind in ind_table], columns=list(intra_gene.columns)) #reformattage pour appliquer script d'Elodie intra_gene['Cols'] = [ int(i.split('-')[1]) - int(i.split('-')[0]) + 1 for i in intra_gene['Cols_Q'] ] intra_gene['Coverage_A'] = [ 100 * (int(row[7].split('-')[1]) - int(row[7].split('-')[0]) + 1) / int(row[9]) for row in intra_gene.to_numpy() ] intra_gene['Coverage_B'] = [ 100 * (int(row[8].split('-')[1]) - int(row[8].split('-')[0]) + 1) / int(row[10]) for row in intra_gene.to_numpy() ] intra_gene = intra_gene.drop(columns=[ 'Cols_Q', 'Cols_T', 'IdCons', 'NoSpecies_Q', 'NoSpecies_T', 'Score' ]) intra_gene = intra_gene.drop(columns=["Identities", "Similarity"]) intra_gene = intra_gene.rename( columns={ "S_exon_Q": "S_exon_A", "S_exon_T": "S_exon_B", "E-value": "E_value", "P-value": "P_value", 'Length_Q': 'Length_A', 'Length_T': 'Length_B', 'Identities': "Identity" }) intra_gene = intra_gene[[ 'S_exon_A', 'S_exon_B', 'Prob', 'E_value', 'P_value', 'Cols', 'Gene', 'Length_A', 'Length_B', 'Coverage_A', 'Coverage_B' ]] intra_gene.to_csv( f'{path_data}/data/{gene}/{gene}_duplication_pairs_formated.csv', index=0) fwrite = open(f"{path_data}/data/{gene}/{gene}_canonical_path.txt", 'w') f = open(path_data + "/" + gene + '/thoraxe/path_table.csv', 'r') tmp = [next(f) for x in range(2)] fwrite.write(gene + ' ' + tmp[1].split(',')[4] + '\n') fwrite.close()
[docs]def run(): args = parse_args(sys.argv[1:]) gene = args.geneName path_data = args.dataPATH id_pair = args.id_pair idCons_pair = args.idCons_pair pval = args.pval nbSpe = args.nbSpe cov = args.cov filter(gene, path_data, id_pair, idCons_pair, pval, nbSpe, cov)
if __name__ == "__main__": run()