#!/usr/bin/env python3
"""
script for quantifying genommic variation from
metagenomic sequencing read mapping
"""
# python
import os
import sys
import random
import argparse
from itertools import cycle
# biopython
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Alphabet import IUPAC
# ctb
from ctbBio.fasta import iterate_fasta as parse_fasta
def parse_sam(sam, qual):
"""
parse sam file and check mapping quality
"""
for line in sam:
if line.startswith('@'):
continue
line = line.strip().split()
if int(line[4]) == 0 or int(line[4]) < qual:
continue
yield line
def save_refs(sam, fastas, genomes, s2b):
"""
genomes = {} # genomes[genome][contig][sample] = {'bp_stats':[]}
"""
if s2b is False:
s2b = {}
for fasta in fastas:
# if no reference sequence supplied, get length from sam file
if fasta is False:
for line in sam:
if line.startswith('@PG'):
break
if line.startswith('@SQ') is False:
continue
line = line.strip().split()
contig, length = line[1].split(':', 1)[1], int(line[2].split(':', 1)[1])
if contig not in s2b:
genome = 'n/a'
s2b[contig] = genome
else:
genome = s2b[contig]
if genome not in genomes:
genomes[genome] = {}
if contig not in genomes[genome]:
genomes[genome][contig] = {}
bp_stats = []
for i in range(0, length):
bp_stats.append({'A':0, 'T':0, 'G':0, 'C':0, 'N':0, \
'In':[], 'Del':[], 'ref':'N'})
genomes[genome][contig][sam.name] = {'bp_stats':bp_stats}
return genomes, s2b
# save reference sequences, if available
genome = fasta.rsplit('.', 1)[0]
if genome not in genomes:
genomes[genome] = {}
bp_stats = []
for seq in parse_fasta(fasta):
contig = seq[0].split('>')[1].split()[0]
s2b[contig] = genome
if contig not in genomes[genome]:
genomes[genome][contig] = {}
bp_stats = []
for base in seq[1]:
bp_stats.append({'A':0, 'T':0, 'G':0, 'C':0, 'N':0, \
'In':[], 'Del':[], 'ref':base.upper()})
genomes[genome][contig][sam.name] = {'bp_stats':bp_stats}
return genomes, s2b
def parse_cigar(cigar):
"""
parse cigar string into list of operations
e.g.: 28M1I29M2I6M1I46M ->
[['28', 'M'], ['1', 'I'], ['29', 'M'], ['2', 'I'], ['6', 'M'], ['1', 'I'], ['46', 'M']]
"""
cigar = cigar.replace('M', 'M ').replace('I', 'I ').replace('D', 'D ').split()
cigar = [c.replace('M', ' M').replace('I', ' I').replace('D', ' D').split() for c in cigar]
return [(int(c[0]), c[1]) for c in cigar]
def get_bp_stats(sam, genomes, s2b, qual):
"""
save base pair frequency statistics for each base from sam file
genomes = {} # genomes[genome][contig][sample] = {'bp_stats':{}}
"""
for read in parse_sam(sam, qual):
ref = read[2] # scaffold that read mapped to
if ref not in s2b:
continue
genome = s2b[ref] # genome that scaffold belongs to
refs = genomes[genome][ref][sam.name]['bp_stats']
ref_start = int(read[3]) - 1 # position of start of alignment on reference
ref_pos = int(read[3]) - 1 # for keeping track of reference region
sequence = list(read[9]) # read sequence
cigar = parse_cigar(read[5]) # parsed cigar string
cigar_start = 0 # for keeping track of start of cigar regions
bases = [] # bases to compare with reference
for cigar_pos, status in cigar:
if status == 'D': # deletion compared to reference
refs[ref_pos - 1]['Del'].append(cigar_pos)
for b in range(0, cigar_pos):
bases.append(False)
ref_pos += cigar_pos
else:
cigar_stop = cigar_start + cigar_pos
if status == 'M': # aligned to reference
for b in sequence[cigar_start:cigar_stop]: # bases for cigar region
bases.append(b)
ref_pos += cigar_pos
elif status == 'I': # insertion compared to reference
refs[ref_pos - 1]['In'].append(cigar_pos)
else:
print('# unrecognized cigar character: %s' % (status), file=sys.stderr)
exit()
cigar_start += cigar_pos
# add base to frequency at each position
for base, position in zip(bases, list(range(ref_start, ref_start + len(bases)))):
if base is False:
continue
try:
refs[position][base.upper()] += 1
except IndexError:
continue
return genomes
def rc_stats(stats):
"""
reverse completement stats
"""
rc_nucs = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'N':'N'}
rcs = []
for pos in reversed(stats):
rc = {}
rc['reference frequencey'] = pos['reference frequency']
rc['consensus frequencey'] = pos['consensus frequency']
rc['In'] = pos['In']
rc['Del'] = pos['Del']
rc['ref'] = rc_nucs[pos['ref']]
rc['consensus'] = (rc_nucs[pos['consensus'][0]], pos['consensus'][1])
for base, stat in list(pos.items()):
if base in rc_nucs:
rc[rc_nucs[base]] = stat
rcs.append(rc)
return rcs
def parse_codons(ref, start, end, strand):
"""
parse codon nucleotide positions in range start -> end, wrt strand
"""
codon = []
c = cycle([1, 2, 3])
ref = ref[start - 1:end]
if strand == -1:
ref = rc_stats(ref)
for pos in ref:
n = next(c)
codon.append(pos)
if n == 3:
yield codon
codon = []
def calc_coverage(ref, start, end, length, nucs):
"""
calculate coverage for positions in range start -> end
"""
ref = ref[start - 1:end]
bases = 0
for pos in ref:
for base, count in list(pos.items()):
if base in nucs:
bases += count
return float(bases)/float(length)
def parse_gbk(gbks):
"""
parse gbk file
"""
for gbk in gbks:
for record in SeqIO.parse(open(gbk), 'genbank'):
for feature in record.features:
if feature.type == 'gene':
try:
locus = feature.qualifiers['locus_tag'][0]
except:
continue
if feature.type == 'CDS':
try:
locus = feature.qualifiers['locus_tag'][0]
except:
pass
start = int(feature.location.start) + int(feature.qualifiers['codon_start'][0])
end, strand = int(feature.location.end), feature.location.strand
if strand is None:
strand = 1
else:
strand = -1
contig = record.id
# contig = record.id.rsplit('.', 1)[0]
yield contig, [locus, \
[start, end, strand], \
feature.qualifiers]
def parse_fasta_annotations(fastas, annot_tables, trans_table):
"""
parse gene call information from Prodigal fasta output
"""
if annot_tables is not False:
annots = {}
for table in annot_tables:
for cds in open(table):
ID, start, end, strand = cds.strip().split()
annots[ID] = [sta
没有合适的资源?快使用搜索试试~ 我知道了~
PyPI 官网下载 | ctbBio-0.16.tar.gz
1.该资源内容由用户上传,如若侵权请联系客服进行举报
2.虚拟产品一经售出概不退款(资源遇到问题,请及时私信上传者)
2.虚拟产品一经售出概不退款(资源遇到问题,请及时私信上传者)
版权申诉
0 下载量 121 浏览量
2022-01-10
06:25:42
上传
评论
收藏 72KB GZ 举报
温馨提示
共67个文件
py:65个
sh:1个
pkg-info:1个
资源来自pypi官网。 资源全名:ctbBio-0.16.tar.gz
资源推荐
资源详情
资源评论
收起资源包目录
ctbBio-0.16.tar.gz (67个子文件)
ctbBio-0.16
setup.py 2KB
ctbBio27
id2tax.py 6KB
kegginfo.py 2KB
__init__.py 0B
rax.py 9KB
ssu_tree.py 21KB
strip_masked.py 2KB
stockholm2fa.py 889B
ssufromHMM.py 12KB
fasta2tch.py 539B
numblast.py 2KB
nr_fasta.py 2KB
search.py 9KB
fasta.py 2KB
ctbBio
16SfromHMM.py 12KB
fastq_merge.py 865B
shuffle_genome.py 3KB
23SfromHMM.py 12KB
strip_align.py 2KB
fastq2fasta.py 663B
numblast-pident.py 2KB
transform.py 7KB
__init__.py 0B
compare_aligned.py 8KB
rax.py 10KB
subset_sam.py 2KB
rc.py 1KB
rp16_retreive.sh 447B
strip_masked.py 2KB
rec_best_blast.py 935B
stockholm2fa.py 957B
genome_variation.py 25KB
lookup-word.py 812B
fastq_split.py 812B
rRNA_insertions.py 22KB
fasta_stats.py 160B
fasta_region.py 1KB
calculate_coverage.py 3KB
concat_align.py 1KB
crossmap.py 5KB
fix_fasta.py 894B
sixframe.py 1KB
fasta_length.py 1KB
orthologer.py 9KB
stats.py 912B
genome_abundance.py 7KB
orthologer_summary.py 3KB
numblast.py 2KB
lookup.py 683B
genome_coverage.py 3KB
strip_align_inserts.py 712B
nr_fasta.py 2KB
parallel.py 720B
name2fasta.py 1KB
rp16.py 9KB
sam2fastq.py 3KB
stockholm2oneline.py 1KB
rRNA_copies.py 5KB
mapped.py 9KB
cluster_ani.py 11KB
filter_fastq_sam.py 3KB
neto.py 21KB
unmapped.py 1KB
search.py 9KB
fasta.py 2KB
n50.py 2KB
PKG-INFO 402B
共 67 条
- 1
资源评论
挣扎的蓝藻
- 粉丝: 13w+
- 资源: 15万+
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功