PhyloCSF.zip_coservation_lncRNA_phyloCSF分析_phylocsf值分析

共1个文件

py：1个

版权申诉

lncrna

44 浏览量 2022-09-23 18:38:02 上传评论收藏 2KB ZIP 举报

资源详情

资源评论

资源推荐

收起资源包目录

PhyloCSF.zip （1个子文件）

PhyloCSF.py 5KB

#!/usr/bin/python #_*_coding:utf-8_*_ from __future__ import division import argparse from Bio import SeqIO import os import os.path import re import sys import shutil from collections import defaultdict import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import threading import multiprocessing PHAST_HOME = "/home/soft/phast" MAF_DATABASE_HOME = "/home/soft/phastConsDatabase" PhyloCSF_HOME = "/home/soft/PhyloCSF" def twodimdict(thedict, key_a, key_b, value): if key_a in thedict: thedict[key_a][key_b] = value else: thedict[key_a] = {} thedict[key_a][key_b] = value def get_maf(phast_home, maf_database_home, chrID, start, end, transID, odir): chr_maf_file = maf_database_home + "/chr" + str(chrID) + ".maf" trans_maf_file = odir + "/" + transID + ".maf" CMD = phast_home + "/bin/maf_parse --start " + start + " --end " + end + " " + chr_maf_file + " > " + trans_maf_file print(CMD) os.system(CMD) return trans_maf_file def parse_trans_maf_file(trans_maf_file, species_file): f = open(species_file, "r") species = [] for line in f: line = line.strip() spec = re.findall("\w+", line)[0] species.append(spec) new_trans_fas_file = os.path.splitext(trans_maf_file)[0] f = open(trans_maf_file, "r") flag = 0 flag_len = dict() seq = dict() for line in f: lines = line.strip().split() if len(lines) == 0 or re.findall("##", line): continue elif re.findall("a", lines[0]): flag += 1 elif re.findall("s", lines[0]): flag_len[flag] = len(lines[6]) sp = lines[1].split(".")[0] twodimdict(seq, flag, sp, lines[6]) f.close() new_seq = dict() for key1 in range(1, flag + 1): for key2 in species: if key2 not in seq[key1]: twodimdict(new_seq, key2, key1, "-"*flag_len[key1]) else: twodimdict(new_seq, key2, key1, seq[key1][key2]) new_new_seq = dict() for i in species: for j in range(1, flag + 1): if i not in new_new_seq: new_new_seq[i] = new_seq[i][j] else: new_new_seq[i] += new_seq[i][j] f1 = open(new_trans_fas_file, "w") for x in species: f1.writelines(">" + x + "\n" + new_new_seq[x] + "\n") f1.close() return new_trans_fas_file def get_cons_score(config_file, phylocsf_home, new_trans_fas_file): base_dir = os.path.split(new_trans_fas_file)[0] cons_file = base_dir + "/conservation_score.txt" CMD = phylocsf_home + "/PhyloCSF --removeRefGaps --orf=StopStop --frames=6 --aa " + config_file + " " + new_trans_fas_file + " >> " + cons_file print(CMD) os.system(CMD) def run_job(phast_home, maf_database_home, phylocsf_home, chrID, start, end, transID, odir): species_file = maf_database_home + "/hg38.20way.nh" config_file = maf_database_home + "/hg38.20way" trans_maf_file = get_maf(phast_home, maf_database_home, chrID, start, end, transID, odir) new_trans_fas_file = parse_trans_maf_file(trans_maf_file, species_file) get_cons_score(config_file, phylocsf_home, new_trans_fas_file) os.remove(trans_maf_file) if os.path.exists(trans_maf_file) else "" os.remove(new_trans_fas_file) if os.path.exists(new_trans_fas_file) else "" def run(bed_file, phast_home, maf_database_home, phylocsf_home, outdir): f = open(bed_file, "r") for line in f: lines = line.strip().split() chrID = lines[0] start = lines[1] end = lines[2] transID = lines[3] run_job(phast_home, maf_database_home, phylocsf_home, chrID, start, end, transID, outdir) f.close() def run_t(bed_file, phast_home, maf_database_home, phylocsf_home, outdir): f = open(bed_file, "r") flag = 0 threads = [] for line in f: flag += 1 lines = line.strip().split() chrID = lines[0] start = lines[1] end = lines[2] transID = lines[3] t = threading.Thread(target=run_job,args=(phast_home, maf_database_home, phylocsf_home, chrID, start, end, transID, outdir,)) threads.append(t) if flag % 40 == 0: for i in range(flag - 40, flag): threads[i].start() for i in range(flag - 40, flag): threads[i].join() f.close() def run_p(bed_file, phast_home, maf_database_home, phylocsf_home, outdir): f = open(bed_file, "r") pool = multiprocessing.Pool(processes = 40) for line in f: lines = line.strip().split() chrID = lines[0] start = lines[1] end = lines[2] transID = lines[3] pool.apply_async(run_job, (phast_home, maf_database_home, phylocsf_home, chrID, start, end, transID, outdir,)) pool.close() pool.join() f.close() print("All done!") bed_file = "/home/zfc/selected_potential_lincRNA.gtf.bed" outdir = "/home/zfc/testagain" #run_p(bed_file, PHAST_HOME, MAF_DATABASE_HOME, PhyloCSF_HOME, outdir) def run_s(bed_file, phast_home, maf_database_home, phylocsf_home, outdir): f = open(bed_file, "r") pool = multiprocessing.Pool(processes = 40) flag = 0 for line in f: flag += 1 lines = line.strip().split() chrID = lines[0] start = lines[1] end = lines[2] transID = lines[3] if flag > 360: pool.apply_async(run_job, (phast_home, maf_database_home, phylocsf_home, chrID, start, end, transID, outdir,)) pool.close() pool.join() f.close() print("All done!") bed_file = "/home/zfc/selected_potential_lincRNA.gtf.bed" outdir = "/home/zfc/testagain" run_s(bed_file, PHAST_HOME, MAF_DATABASE_HOME, PhyloCSF_HOME, outdir)