#!/usr/bin/python
#_*_coding:utf-8_*_
from __future__ import division
import argparse
from Bio import SeqIO
import os
import os.path
import re
import sys
import shutil
from collections import defaultdict
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import threading
import multiprocessing
PHAST_HOME = "/home/soft/phast"
MAF_DATABASE_HOME = "/home/soft/phastConsDatabase"
PhyloCSF_HOME = "/home/soft/PhyloCSF"
def twodimdict(thedict, key_a, key_b, value):
if key_a in thedict:
thedict[key_a][key_b] = value
else:
thedict[key_a] = {}
thedict[key_a][key_b] = value
def get_maf(phast_home, maf_database_home, chrID, start, end, transID, odir):
chr_maf_file = maf_database_home + "/chr" + str(chrID) + ".maf"
trans_maf_file = odir + "/" + transID + ".maf"
CMD = phast_home + "/bin/maf_parse --start " + start + " --end " + end + " " + chr_maf_file + " > " + trans_maf_file
print(CMD)
os.system(CMD)
return trans_maf_file
def parse_trans_maf_file(trans_maf_file, species_file):
f = open(species_file, "r")
species = []
for line in f:
line = line.strip()
spec = re.findall("\w+", line)[0]
species.append(spec)
new_trans_fas_file = os.path.splitext(trans_maf_file)[0]
f = open(trans_maf_file, "r")
flag = 0
flag_len = dict()
seq = dict()
for line in f:
lines = line.strip().split()
if len(lines) == 0 or re.findall("##", line):
continue
elif re.findall("a", lines[0]):
flag += 1
elif re.findall("s", lines[0]):
flag_len[flag] = len(lines[6])
sp = lines[1].split(".")[0]
twodimdict(seq, flag, sp, lines[6])
f.close()
new_seq = dict()
for key1 in range(1, flag + 1):
for key2 in species:
if key2 not in seq[key1]:
twodimdict(new_seq, key2, key1, "-"*flag_len[key1])
else:
twodimdict(new_seq, key2, key1, seq[key1][key2])
new_new_seq = dict()
for i in species:
for j in range(1, flag + 1):
if i not in new_new_seq:
new_new_seq[i] = new_seq[i][j]
else:
new_new_seq[i] += new_seq[i][j]
f1 = open(new_trans_fas_file, "w")
for x in species:
f1.writelines(">" + x + "\n" + new_new_seq[x] + "\n")
f1.close()
return new_trans_fas_file
def get_cons_score(config_file, phylocsf_home, new_trans_fas_file):
base_dir = os.path.split(new_trans_fas_file)[0]
cons_file = base_dir + "/conservation_score.txt"
CMD = phylocsf_home + "/PhyloCSF --removeRefGaps --orf=StopStop --frames=6 --aa " + config_file + " " + new_trans_fas_file + " >> " + cons_file
print(CMD)
os.system(CMD)
def run_job(phast_home, maf_database_home, phylocsf_home, chrID, start, end, transID, odir):
species_file = maf_database_home + "/hg38.20way.nh"
config_file = maf_database_home + "/hg38.20way"
trans_maf_file = get_maf(phast_home, maf_database_home, chrID, start, end, transID, odir)
new_trans_fas_file = parse_trans_maf_file(trans_maf_file, species_file)
get_cons_score(config_file, phylocsf_home, new_trans_fas_file)
os.remove(trans_maf_file) if os.path.exists(trans_maf_file) else ""
os.remove(new_trans_fas_file) if os.path.exists(new_trans_fas_file) else ""
def run(bed_file, phast_home, maf_database_home, phylocsf_home, outdir):
f = open(bed_file, "r")
for line in f:
lines = line.strip().split()
chrID = lines[0]
start = lines[1]
end = lines[2]
transID = lines[3]
run_job(phast_home, maf_database_home, phylocsf_home, chrID, start, end, transID, outdir)
f.close()
def run_t(bed_file, phast_home, maf_database_home, phylocsf_home, outdir):
f = open(bed_file, "r")
flag = 0
threads = []
for line in f:
flag += 1
lines = line.strip().split()
chrID = lines[0]
start = lines[1]
end = lines[2]
transID = lines[3]
t = threading.Thread(target=run_job,args=(phast_home, maf_database_home, phylocsf_home, chrID, start, end, transID, outdir,))
threads.append(t)
if flag % 40 == 0:
for i in range(flag - 40, flag):
threads[i].start()
for i in range(flag - 40, flag):
threads[i].join()
f.close()
def run_p(bed_file, phast_home, maf_database_home, phylocsf_home, outdir):
f = open(bed_file, "r")
pool = multiprocessing.Pool(processes = 40)
for line in f:
lines = line.strip().split()
chrID = lines[0]
start = lines[1]
end = lines[2]
transID = lines[3]
pool.apply_async(run_job, (phast_home, maf_database_home, phylocsf_home, chrID, start, end, transID, outdir,))
pool.close()
pool.join()
f.close()
print("All done!")
bed_file = "/home/zfc/selected_potential_lincRNA.gtf.bed"
outdir = "/home/zfc/testagain"
#run_p(bed_file, PHAST_HOME, MAF_DATABASE_HOME, PhyloCSF_HOME, outdir)
def run_s(bed_file, phast_home, maf_database_home, phylocsf_home, outdir):
f = open(bed_file, "r")
pool = multiprocessing.Pool(processes = 40)
flag = 0
for line in f:
flag += 1
lines = line.strip().split()
chrID = lines[0]
start = lines[1]
end = lines[2]
transID = lines[3]
if flag > 360:
pool.apply_async(run_job, (phast_home, maf_database_home, phylocsf_home, chrID, start, end, transID, outdir,))
pool.close()
pool.join()
f.close()
print("All done!")
bed_file = "/home/zfc/selected_potential_lincRNA.gtf.bed"
outdir = "/home/zfc/testagain"
run_s(bed_file, PHAST_HOME, MAF_DATABASE_HOME, PhyloCSF_HOME, outdir)
PhyloCSF.zip_coservation_lncRNA_phyloCSF分析_phylocsf值分析_rna保守性分析
版权申诉
44 浏览量
2022-09-23
18:38:02
上传
评论
收藏 2KB ZIP 举报
御道御小黑
- 粉丝: 58
- 资源: 1万+
最新资源
- # 微信小程序-健康菜谱 基于微信小程序的一个查找检索菜谱的应用 ### 效果 !动态图(./res/gif/demo
- zabbix-get命令包资源
- 毕业设计,基于PyQt5实现的可视化界面的Python车牌自动识别系统源码
- 26-朴素贝叶斯分类.rar
- 没有安Matlab 也可以 生成FIR抽头系数工具.py
- python烟花代码.rar
- 实验目的: 1.构建基于verilog语言的组合逻辑电路和时序逻辑电路; 2.掌握verilog语言的电路设计技巧 3.完成如
- 扩展卡尔曼滤波matlab仿真
- 3_base.apk.1
- 躺赢者PRO飞控常见典型问题合集(续一)无名小哥 余义 20240501待修
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
评论0