#!/usr/bin/env python
# Copyright 2011(c) The Ontario Institute for Cancer Research. All rights reserved.
#
# This program and the accompanying materials are made available under the
# terms of the GNU Public License v3.0.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import division
from itertools import count, ifilter, imap
from collections import namedtuple
from functools import partial
import re
import os, sys
import subprocess
import csv
import pysam
from bx.intervals.intersection import Intersecter, Interval
from database import *
import alignment
Counter = namedtuple('Counter', ['xeno_mapped', 'ref_mapped', 'ref_unmapped', 'maps_gene'])
Meta = namedtuple('Meta', ['sample', 'alignment'])
LookupGroup = namedtuple('LookupGroup', ['xeno', 'ref'])
Lookup = namedtuple('Lookup', ['file', 'column'])
db = None
logger = None
meta = None
mapq = None
intersecters = {}
genomes = {}
counter = Counter(count(), count(), count(), count())
regex = re.compile("gi\|(.+?)($|\|)|ref\|(.+?)(\.|$|\|)")
temp = None
def check_align(args):
finder = {"name": args.align, "sample": args.sample, "projectLabel": args.project}
aln = db.alignment.find_one(finder)
if not aln:
logger.error("Alignment {0} not found in {1}/{2}.".format(args.align, args.project, args.sample))
sys.exit(1)
return aln
def get_meta(args):
"""Gather the meta data for the alignment from the database"""
aln = check_align(args)
sample = db.sample.find_one({"_id": aln['sampleId']})
if not sample:
logger.error("Alignment {0} not found in database. --project and --sample need to be passed to auto-create".format(args.align))
exit(1)
logger.debug('Subtraction for alignment: {0}'.format(args.align))
return Meta(sample, aln)
def update_isref(readId):
''' '''
db.mapped.update({'readId': readId, 'alignmentId':meta.alignment['_id']},
{'$set': {'isRef': 1}}, False, False, False, True)
def extract_unmapped(align, fastq):
"""Output unmapped alignments to Fastq file"""
# here align.qname should be extracted in its original form i.e. with /1 and /2 if they exist
fastq.write("@{0:>s}\n{1:>s}\n+\n{2:>s}\n".format(str(align.qname), str(align.query), str(align.qqual)))
# new
def tuple_to_CIG(readcig):
#cig_ops=['M','I','D','N','S','H','P','=','X']
cig_ops=['M','I','D','N','S','H','P','M','X']
cig_string=str()
for tple in readcig:
cig_string = cig_string + str(tple[1]) + str(cig_ops[tple[0]])
return cig_string
# new
def extract_mapped_sam(align, genome_sam, sam_file):
"""Output mapped xeno reads to a sam file"""
sam_file.write("{0:>s}\t{1:>s}\t{2:>s}\t{3:>s}\t{4:>s}\t{5:>s}\t{6:>s}\t{7:>s}\t{8:>s}\t{9:>s}\t{10:>s}\t{11:>s}\n".format(str(align.qname.split("/")[0]),str(align.flag),genome_sam,str(align.pos + 1),str(align.mapq),str(tuple_to_CIG(align.cigar)),'*','0','0',str(align.query),'*','AS:i:' + str(align.opt('AS'))))
# new
def extract_hg_readIds(align,hgids):
"""Output mapped hg readId to a file"""
hgids.write("{0:>s}\n".format(align.qname.split("/")[0]))
def get_readscan_data(readscan_file):
row_number = 0
genome_identifier = re.compile("gi:(\d+)")
with open(readscan_file, 'rU') as f:
readscan_reader = csv.DictReader(f, delimiter='\t', quotechar='|')
for row in readscan_reader:
row['row'] = row_number
row['index'] = int(row['index'])
row['score'] = float(row['score'])
row_number = row_number + 1
if row.has_key(None):
del row[None]
match = genome_identifier.match(row['id'])
if match:
row['genome'] = int(match.group(1))
yield row
# new
def get_only_xeno_reads(pathogen, human, args):
"""Obtain reads in sam format that only map to xeno"""
#print args.xeno.rsplit('/',1)[0]
logger.info("Temporary directory: {0}".format(temp))
logger.info("Sorting {0}".format(temp + pathogen))
pathogen_input_file = temp + pathogen
pathogen_sorted_file = temp + pathogen + '.sorted'
human_input_file = temp + human
human_sorted_file = temp + human + '.sorted'
if subprocess.call(['sort', pathogen_input_file, '-t\t', '-d', '--key=1,1', '-T', temp, '-o', pathogen_sorted_file]) != 0:
logger.error("Error sorting {0}".format(pathogen_input_file))
sys.exit(1)
logger.info("Sorting {0}".format(temp + human))
if subprocess.call(['sort', human_input_file, '-t\t', '-d', '--key=1,1', '-T', temp, '-o', human_sorted_file]) != 0:
logger.error("Error sorting {0}".format(human_input_file))
sys.exit(1)
os.remove(pathogen_input_file)
os.remove(human_input_file)
xeno_file = os.path.abspath(args.xeno)
xeno_directory = os.path.dirname(xeno_file)
gra = __file__.rsplit('/',1)[0] + '/gra.sh'
logger.info("Output directory: {0}".format(xeno_directory))
if subprocess.call([gra, pathogen_sorted_file, human_sorted_file, xeno_directory]) != 0:
logger.error("Failed to calculate genome relative abundance successfully")
sys.exit(1)
readscan_file = xeno_directory + '/pathogen.gra.txt'
result = list(get_readscan_data(readscan_file))
selector = {"_id" : meta.alignment['_id']}
updater = {"$set" : {"gra" : result}}
db.alignment.update(selector, updater)
def maps_gene(mapped):
"""Determine if the mapped alignment falls within a gene."""
global intersecters
try:
intersecter = intersecters[mapped['genome']]
except KeyError:
genes = db.feature.find({'genome': mapped['genome'], 'type': 'gene'})
intersecter = Intersecter()
# Interval end is exclusive, need to +1 to line up with actual position
[intersecter.add_interval(Interval(gene['start'], gene['end'] + 1, gene['uid']))
for gene in genes]
intersecters[mapped['genome']] = intersecter
return intersecter.find(mapped['refStart'], mapped['refEnd'])
def build_mapped(align, genome, reference):
'''Generates dict for mapped alignments'''
# Since align.qqual is in Sanger format.
scores = [ord(m)-33 for m in align.qqual]
#if align.is_proper_pair:
#align_length = align.isize
#ref_end = align.pos + align.isize + 1
#else:
# If the cigar data is missing [(), ()] give it a length of 1
# align_length = align.alen
# ref_end = align.aend or align.pos + align_length + 1
if align.alen is None:
align_length = align.pos + align.qlen + 1
else:
align_length = align.alen
if align.aend is None:
ref_end = align.pos + align.qlen + 1
else:
ref_end = align.aend
try:
MD = align.opt('MD')
mismatch = len(re.findall("\D", MD))
except KeyError:
MD = None
try: mismatch = int(align.rlen)
except TypeError:
logger.debug(align)
logger.error('Aligned read with null length')
exit()
try:
AS = int(align.opt('AS'))
except KeyError:
AS = None
try:
PG = align.opt('PG')
except KeyError:
PG = None
mapped = {
# take only the read identifier exclude /1 and /2
"readId": align.qname.split("/")[0]
, "refStrand": -1 if align.is_reverse else 1
, "refStart": int(align.pos) + 1 # pysam is 0-based index
, "refEnd": int(ref_end)
, "alignLength": int(align_length)
, "readLength": int(align.rlen) # Total Length of the read
, "mapq": int(align.mapq)
, "minQual": min(scores)
, "avgQual": sum(scores) / len(scores)
, "qqual": align.qqual
, "miscalls": align.qqual.count('.')
, "mismatch": mismatch
, "pairEnd": 1 if align.is_proper_pair else 0
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
资源分类:Python库 所属语言:Python 资源全名:capsid-1.6.1.tar.gz 资源来源:官方 安装方法:https://lanzao.blog.csdn.net/article/details/101784059
资源推荐
资源详情
资源评论
收起资源包目录
capsid-1.6.1.tar.gz (20个子文件)
capsid-1.6.1
PKG-INFO 5KB
README.rst 1KB
bin
capsid 17KB
capsid
configure.py 6KB
sam2bam.py 2KB
taxonomy.py 7KB
alignment.py 3KB
gbloader.py 8KB
colorize.py 4KB
statistics.py 17KB
project.py 2KB
qfilter.py 8KB
sample.py 3KB
database.py 1KB
subtraction.py 18KB
__init__.py 2KB
fasta.py 2KB
intersect.py 2KB
NEWS.rst 2KB
setup.py 2KB
共 20 条
- 1
资源评论
挣扎的蓝藻
- 粉丝: 14w+
- 资源: 15万+
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功