Python库|capsid-1.6.1.tar.gz资源-CSDN文库

版权申诉

162 浏览量 2022-04-07 01:10:41 上传评论收藏 27KB GZ 举报

共20个文件

py：16个

rst：2个

capsid：1个

资源推荐

资源详情

资源评论

收起资源包目录

capsid-1.6.1.tar.gz （20个子文件）

capsid-1.6.1

PKG-INFO 5KB

README.rst 1KB

bin

capsid 17KB

capsid

configure.py 6KB

sam2bam.py 2KB

taxonomy.py 7KB

alignment.py 3KB

gbloader.py 8KB

colorize.py 4KB

statistics.py 17KB

project.py 2KB

qfilter.py 8KB

sample.py 3KB

database.py 1KB

subtraction.py 18KB

__init__.py 2KB

fasta.py 2KB

intersect.py 2KB

NEWS.rst 2KB

setup.py 2KB

#!/usr/bin/env python # Copyright 2011(c) The Ontario Institute for Cancer Research. All rights reserved. # # This program and the accompanying materials are made available under the # terms of the GNU Public License v3.0. # # You should have received a copy of the GNU General Public License along with # this program. If not, see <http://www.gnu.org/licenses/>. from __future__ import division from itertools import count, ifilter, imap from collections import namedtuple from functools import partial import re import os, sys import subprocess import csv import pysam from bx.intervals.intersection import Intersecter, Interval from database import * import alignment Counter = namedtuple('Counter', ['xeno_mapped', 'ref_mapped', 'ref_unmapped', 'maps_gene']) Meta = namedtuple('Meta', ['sample', 'alignment']) LookupGroup = namedtuple('LookupGroup', ['xeno', 'ref']) Lookup = namedtuple('Lookup', ['file', 'column']) db = None logger = None meta = None mapq = None intersecters = {} genomes = {} counter = Counter(count(), count(), count(), count()) regex = re.compile("gi\|(.+?)($|\|)|ref\|(.+?)(\.|$|\|)") temp = None def check_align(args): finder = {"name": args.align, "sample": args.sample, "projectLabel": args.project} aln = db.alignment.find_one(finder) if not aln: logger.error("Alignment {0} not found in {1}/{2}.".format(args.align, args.project, args.sample)) sys.exit(1) return aln def get_meta(args): """Gather the meta data for the alignment from the database""" aln = check_align(args) sample = db.sample.find_one({"_id": aln['sampleId']}) if not sample: logger.error("Alignment {0} not found in database. --project and --sample need to be passed to auto-create".format(args.align)) exit(1) logger.debug('Subtraction for alignment: {0}'.format(args.align)) return Meta(sample, aln) def update_isref(readId): ''' ''' db.mapped.update({'readId': readId, 'alignmentId':meta.alignment['_id']}, {'$set': {'isRef': 1}}, False, False, False, True) def extract_unmapped(align, fastq): """Output unmapped alignments to Fastq file""" # here align.qname should be extracted in its original form i.e. with /1 and /2 if they exist fastq.write("@{0:>s}\n{1:>s}\n+\n{2:>s}\n".format(str(align.qname), str(align.query), str(align.qqual))) # new def tuple_to_CIG(readcig): #cig_ops=['M','I','D','N','S','H','P','=','X'] cig_ops=['M','I','D','N','S','H','P','M','X'] cig_string=str() for tple in readcig: cig_string = cig_string + str(tple[1]) + str(cig_ops[tple[0]]) return cig_string # new def extract_mapped_sam(align, genome_sam, sam_file): """Output mapped xeno reads to a sam file""" sam_file.write("{0:>s}\t{1:>s}\t{2:>s}\t{3:>s}\t{4:>s}\t{5:>s}\t{6:>s}\t{7:>s}\t{8:>s}\t{9:>s}\t{10:>s}\t{11:>s}\n".format(str(align.qname.split("/")[0]),str(align.flag),genome_sam,str(align.pos + 1),str(align.mapq),str(tuple_to_CIG(align.cigar)),'*','0','0',str(align.query),'*','AS:i:' + str(align.opt('AS')))) # new def extract_hg_readIds(align,hgids): """Output mapped hg readId to a file""" hgids.write("{0:>s}\n".format(align.qname.split("/")[0])) def get_readscan_data(readscan_file): row_number = 0 genome_identifier = re.compile("gi:(\d+)") with open(readscan_file, 'rU') as f: readscan_reader = csv.DictReader(f, delimiter='\t', quotechar='|') for row in readscan_reader: row['row'] = row_number row['index'] = int(row['index']) row['score'] = float(row['score']) row_number = row_number + 1 if row.has_key(None): del row[None] match = genome_identifier.match(row['id']) if match: row['genome'] = int(match.group(1)) yield row # new def get_only_xeno_reads(pathogen, human, args): """Obtain reads in sam format that only map to xeno""" #print args.xeno.rsplit('/',1)[0] logger.info("Temporary directory: {0}".format(temp)) logger.info("Sorting {0}".format(temp + pathogen)) pathogen_input_file = temp + pathogen pathogen_sorted_file = temp + pathogen + '.sorted' human_input_file = temp + human human_sorted_file = temp + human + '.sorted' if subprocess.call(['sort', pathogen_input_file, '-t\t', '-d', '--key=1,1', '-T', temp, '-o', pathogen_sorted_file]) != 0: logger.error("Error sorting {0}".format(pathogen_input_file)) sys.exit(1) logger.info("Sorting {0}".format(temp + human)) if subprocess.call(['sort', human_input_file, '-t\t', '-d', '--key=1,1', '-T', temp, '-o', human_sorted_file]) != 0: logger.error("Error sorting {0}".format(human_input_file)) sys.exit(1) os.remove(pathogen_input_file) os.remove(human_input_file) xeno_file = os.path.abspath(args.xeno) xeno_directory = os.path.dirname(xeno_file) gra = __file__.rsplit('/',1)[0] + '/gra.sh' logger.info("Output directory: {0}".format(xeno_directory)) if subprocess.call([gra, pathogen_sorted_file, human_sorted_file, xeno_directory]) != 0: logger.error("Failed to calculate genome relative abundance successfully") sys.exit(1) readscan_file = xeno_directory + '/pathogen.gra.txt' result = list(get_readscan_data(readscan_file)) selector = {"_id" : meta.alignment['_id']} updater = {"$set" : {"gra" : result}} db.alignment.update(selector, updater) def maps_gene(mapped): """Determine if the mapped alignment falls within a gene.""" global intersecters try: intersecter = intersecters[mapped['genome']] except KeyError: genes = db.feature.find({'genome': mapped['genome'], 'type': 'gene'}) intersecter = Intersecter() # Interval end is exclusive, need to +1 to line up with actual position [intersecter.add_interval(Interval(gene['start'], gene['end'] + 1, gene['uid'])) for gene in genes] intersecters[mapped['genome']] = intersecter return intersecter.find(mapped['refStart'], mapped['refEnd']) def build_mapped(align, genome, reference): '''Generates dict for mapped alignments''' # Since align.qqual is in Sanger format. scores = [ord(m)-33 for m in align.qqual] #if align.is_proper_pair: #align_length = align.isize #ref_end = align.pos + align.isize + 1 #else: # If the cigar data is missing [(), ()] give it a length of 1 # align_length = align.alen # ref_end = align.aend or align.pos + align_length + 1 if align.alen is None: align_length = align.pos + align.qlen + 1 else: align_length = align.alen if align.aend is None: ref_end = align.pos + align.qlen + 1 else: ref_end = align.aend try: MD = align.opt('MD') mismatch = len(re.findall("\D", MD)) except KeyError: MD = None try: mismatch = int(align.rlen) except TypeError: logger.debug(align) logger.error('Aligned read with null length') exit() try: AS = int(align.opt('AS')) except KeyError: AS = None try: PG = align.opt('PG') except KeyError: PG = None mapped = { # take only the read identifier exclude /1 and /2 "readId": align.qname.split("/")[0] , "refStrand": -1 if align.is_reverse else 1 , "refStart": int(align.pos) + 1 # pysam is 0-based index , "refEnd": int(ref_end) , "alignLength": int(align_length) , "readLength": int(align.rlen) # Total Length of the read , "mapq": int(align.mapq) , "minQual": min(scores) , "avgQual": sum(scores) / len(scores) , "qqual": align.qqual , "miscalls": align.qqual.count('.') , "mismatch": mismatch , "pairEnd": 1 if align.is_proper_pair else 0

评论收藏

内容反馈

版权申诉