PyPI官网下载|ctbBio-0.16.tar.gz资源-CSDN文库

版权申诉

Python库

121 浏览量 2022-01-10 06:25:42 上传评论收藏 72KB GZ 举报

共67个文件

py：65个

sh：1个

pkg-info：1个

资源推荐

资源详情

资源评论

收起资源包目录

ctbBio-0.16.tar.gz （67个子文件）

ctbBio-0.16

setup.py 2KB

ctbBio27

id2tax.py 6KB

kegginfo.py 2KB

__init__.py 0B

rax.py 9KB

ssu_tree.py 21KB

strip_masked.py 2KB

stockholm2fa.py 889B

ssufromHMM.py 12KB

fasta2tch.py 539B

numblast.py 2KB

nr_fasta.py 2KB

search.py 9KB

fasta.py 2KB

ctbBio

16SfromHMM.py 12KB

fastq_merge.py 865B

shuffle_genome.py 3KB

23SfromHMM.py 12KB

strip_align.py 2KB

fastq2fasta.py 663B

numblast-pident.py 2KB

transform.py 7KB

__init__.py 0B

compare_aligned.py 8KB

rax.py 10KB

subset_sam.py 2KB

rc.py 1KB

rp16_retreive.sh 447B

strip_masked.py 2KB

rec_best_blast.py 935B

stockholm2fa.py 957B

genome_variation.py 25KB

lookup-word.py 812B

fastq_split.py 812B

rRNA_insertions.py 22KB

fasta_stats.py 160B

fasta_region.py 1KB

calculate_coverage.py 3KB

concat_align.py 1KB

crossmap.py 5KB

fix_fasta.py 894B

sixframe.py 1KB

fasta_length.py 1KB

orthologer.py 9KB

stats.py 912B

genome_abundance.py 7KB

orthologer_summary.py 3KB

numblast.py 2KB

lookup.py 683B

genome_coverage.py 3KB

strip_align_inserts.py 712B

nr_fasta.py 2KB

parallel.py 720B

name2fasta.py 1KB

rp16.py 9KB

sam2fastq.py 3KB

stockholm2oneline.py 1KB

rRNA_copies.py 5KB

mapped.py 9KB

cluster_ani.py 11KB

filter_fastq_sam.py 3KB

neto.py 21KB

unmapped.py 1KB

search.py 9KB

fasta.py 2KB

n50.py 2KB

PKG-INFO 402B

#!/usr/bin/env python3 """ script for quantifying genommic variation from metagenomic sequencing read mapping """ # python import os import sys import random import argparse from itertools import cycle # biopython from Bio.Seq import Seq from Bio import SeqIO from Bio.Alphabet import IUPAC # ctb from ctbBio.fasta import iterate_fasta as parse_fasta def parse_sam(sam, qual): """ parse sam file and check mapping quality """ for line in sam: if line.startswith('@'): continue line = line.strip().split() if int(line[4]) == 0 or int(line[4]) < qual: continue yield line def save_refs(sam, fastas, genomes, s2b): """ genomes = {} # genomes[genome][contig][sample] = {'bp_stats':[]} """ if s2b is False: s2b = {} for fasta in fastas: # if no reference sequence supplied, get length from sam file if fasta is False: for line in sam: if line.startswith('@PG'): break if line.startswith('@SQ') is False: continue line = line.strip().split() contig, length = line[1].split(':', 1)[1], int(line[2].split(':', 1)[1]) if contig not in s2b: genome = 'n/a' s2b[contig] = genome else: genome = s2b[contig] if genome not in genomes: genomes[genome] = {} if contig not in genomes[genome]: genomes[genome][contig] = {} bp_stats = [] for i in range(0, length): bp_stats.append({'A':0, 'T':0, 'G':0, 'C':0, 'N':0, \ 'In':[], 'Del':[], 'ref':'N'}) genomes[genome][contig][sam.name] = {'bp_stats':bp_stats} return genomes, s2b # save reference sequences, if available genome = fasta.rsplit('.', 1)[0] if genome not in genomes: genomes[genome] = {} bp_stats = [] for seq in parse_fasta(fasta): contig = seq[0].split('>')[1].split()[0] s2b[contig] = genome if contig not in genomes[genome]: genomes[genome][contig] = {} bp_stats = [] for base in seq[1]: bp_stats.append({'A':0, 'T':0, 'G':0, 'C':0, 'N':0, \ 'In':[], 'Del':[], 'ref':base.upper()}) genomes[genome][contig][sam.name] = {'bp_stats':bp_stats} return genomes, s2b def parse_cigar(cigar): """ parse cigar string into list of operations e.g.: 28M1I29M2I6M1I46M -> [['28', 'M'], ['1', 'I'], ['29', 'M'], ['2', 'I'], ['6', 'M'], ['1', 'I'], ['46', 'M']] """ cigar = cigar.replace('M', 'M ').replace('I', 'I ').replace('D', 'D ').split() cigar = [c.replace('M', ' M').replace('I', ' I').replace('D', ' D').split() for c in cigar] return [(int(c[0]), c[1]) for c in cigar] def get_bp_stats(sam, genomes, s2b, qual): """ save base pair frequency statistics for each base from sam file genomes = {} # genomes[genome][contig][sample] = {'bp_stats':{}} """ for read in parse_sam(sam, qual): ref = read[2] # scaffold that read mapped to if ref not in s2b: continue genome = s2b[ref] # genome that scaffold belongs to refs = genomes[genome][ref][sam.name]['bp_stats'] ref_start = int(read[3]) - 1 # position of start of alignment on reference ref_pos = int(read[3]) - 1 # for keeping track of reference region sequence = list(read[9]) # read sequence cigar = parse_cigar(read[5]) # parsed cigar string cigar_start = 0 # for keeping track of start of cigar regions bases = [] # bases to compare with reference for cigar_pos, status in cigar: if status == 'D': # deletion compared to reference refs[ref_pos - 1]['Del'].append(cigar_pos) for b in range(0, cigar_pos): bases.append(False) ref_pos += cigar_pos else: cigar_stop = cigar_start + cigar_pos if status == 'M': # aligned to reference for b in sequence[cigar_start:cigar_stop]: # bases for cigar region bases.append(b) ref_pos += cigar_pos elif status == 'I': # insertion compared to reference refs[ref_pos - 1]['In'].append(cigar_pos) else: print('# unrecognized cigar character: %s' % (status), file=sys.stderr) exit() cigar_start += cigar_pos # add base to frequency at each position for base, position in zip(bases, list(range(ref_start, ref_start + len(bases)))): if base is False: continue try: refs[position][base.upper()] += 1 except IndexError: continue return genomes def rc_stats(stats): """ reverse completement stats """ rc_nucs = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'N':'N'} rcs = [] for pos in reversed(stats): rc = {} rc['reference frequencey'] = pos['reference frequency'] rc['consensus frequencey'] = pos['consensus frequency'] rc['In'] = pos['In'] rc['Del'] = pos['Del'] rc['ref'] = rc_nucs[pos['ref']] rc['consensus'] = (rc_nucs[pos['consensus'][0]], pos['consensus'][1]) for base, stat in list(pos.items()): if base in rc_nucs: rc[rc_nucs[base]] = stat rcs.append(rc) return rcs def parse_codons(ref, start, end, strand): """ parse codon nucleotide positions in range start -> end, wrt strand """ codon = [] c = cycle([1, 2, 3]) ref = ref[start - 1:end] if strand == -1: ref = rc_stats(ref) for pos in ref: n = next(c) codon.append(pos) if n == 3: yield codon codon = [] def calc_coverage(ref, start, end, length, nucs): """ calculate coverage for positions in range start -> end """ ref = ref[start - 1:end] bases = 0 for pos in ref: for base, count in list(pos.items()): if base in nucs: bases += count return float(bases)/float(length) def parse_gbk(gbks): """ parse gbk file """ for gbk in gbks: for record in SeqIO.parse(open(gbk), 'genbank'): for feature in record.features: if feature.type == 'gene': try: locus = feature.qualifiers['locus_tag'][0] except: continue if feature.type == 'CDS': try: locus = feature.qualifiers['locus_tag'][0] except: pass start = int(feature.location.start) + int(feature.qualifiers['codon_start'][0]) end, strand = int(feature.location.end), feature.location.strand if strand is None: strand = 1 else: strand = -1 contig = record.id # contig = record.id.rsplit('.', 1)[0] yield contig, [locus, \ [start, end, strand], \ feature.qualifiers] def parse_fasta_annotations(fastas, annot_tables, trans_table): """ parse gene call information from Prodigal fasta output """ if annot_tables is not False: annots = {} for table in annot_tables: for cds in open(table): ID, start, end, strand = cds.strip().split() annots[ID] = [sta

评论收藏

内容反馈

版权申诉