from functools import cached_property
import logging
import re
from typing import Iterable, NamedTuple
from urllib.parse import quote
from pysam import TabixFile, VariantRecord # pylint: disable=E0611
# Configure the root logger as soon as this module is imported: INFO level,
# each message prefixed with timestamp, logger name and level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# Named logger for this tool; "Annotate CNV" is what %(name)s renders as above.
logger = logging.getLogger("Annotate CNV")
class Region(NamedTuple):
    """A labelled interval of a transcript (one exon or one intron)."""

    start: int  # interval start, copied from the exon start/end columns
    end: int  # interval end, copied from the exon start/end columns
    name: str  # label such as "exon3" or "intron2"


class Transcript:
    """One transcript built from the tab-split fields of an annotation row.

    Columns read: 1 transcript id, 2 chromosome, 3 strand, 8 exon count,
    9 exon starts, 10 exon ends (both comma separated, trailing comma
    tolerated), 12 gene name.  Columns 4-7 were labelled tx_start, tx_end,
    cds_start and cds_end by the original author and are not stored.
    """

    def __init__(self, row: list[str]) -> None:
        """Parse ``row``; raises ``IndexError``/``ValueError`` on short or non-numeric rows."""
        self.transcript = row[1]
        self.chrom = row[2]
        self.strand = row[3]
        self.exon_count = int(row[8])
        self.exon_starts = [int(value) for value in row[9].strip(",").split(",")]
        self.exon_ends = [int(value) for value in row[10].strip(",").split(",")]
        self.gene = row[12]
        # NOTE(review): this compares row[4] with row[5], which the author's
        # column notes call tx_start/tx_end.  If the flag is meant to reflect
        # coding status, row[6]/row[7] (cds_start/cds_end) may have been
        # intended - confirm against the data file before changing it.
        self.is_mrna = "Y" if row[4] == row[5] else "N"

    @cached_property
    def gene_pk(self) -> str:
        """Key identifying the gene as ``"<chrom>:<gene>"``."""
        return f"{self.chrom}:{self.gene}"

    @cached_property
    def regions(self) -> list[Region]:
        """Exons and the introns between them, numbered along the transcript.

        On the "+" strand numbering follows the order of the exon columns;
        for any other strand value it runs in the opposite direction, so the
        last listed exon becomes ``exon1``.  The intron between listed exons
        ``i - 1`` and ``i`` is emitted right after exon ``i``.
        """
        forward = self.strand == "+"
        total = self.exon_count
        result: list[Region] = []
        for idx in range(total):
            exon_no = idx + 1 if forward else total - idx
            result.append(Region(start=self.exon_starts[idx], end=self.exon_ends[idx], name=f"exon{exon_no}"))
            if idx > 0:
                intron_no = idx if forward else total - idx
                result.append(
                    Region(start=self.exon_ends[idx - 1], end=self.exon_starts[idx], name=f"intron{intron_no}")
                )
        return result

    def calc_detail(self, start: int, end: int) -> str:
        """Describe which part of the transcript the interval ``start``-``end`` touches.

        A region counts as touched when ``region.start < end and
        region.end >= start``.  Touched regions are ordered by start
        (descending on the "-" strand) and the result is
        ``"<gene>:<transcript>:<span>:<is_mrna>"`` where ``<span>`` is

        * the single region name when exactly one region is touched,
        * ``"<first>_<last>"`` when several are touched,
        * ``"."`` when none is touched.
        """
        hits = sorted(
            (region for region in self.regions if region.start < end and region.end >= start),
            key=lambda region: region.start,
            reverse=self.strand == "-",
        )
        if not hits:
            # Nothing overlaps: emit a placeholder rather than failing on hits[0].
            span = "."
        elif len(hits) == 1:
            span = hits[0].name
        else:
            # The span runs from the first to the LAST touched region; using
            # hits[1] would report e.g. "exon1_intron1" for an exon1-exon5 event.
            span = f"{hits[0].name}_{hits[-1].name}"
        return f"{self.gene}:{self.transcript}:{span}:{self.is_mrna}"
class CNV:
    """Wrap a variant record and write annotation values into ``record.info``.

    Every ``anno_*`` method queries a tabix-indexed file for the record's
    ``chrom``/``start``/``stop`` and stores what it finds under one or more
    INFO keys.  pysam types are referenced as string annotations, so they are
    only resolved by type checkers.
    """

    # Leading "chr" in any letter case.  Anchored on purpose: contig names that
    # merely contain those letters (e.g. "HSCHR1_CTG1") must stay intact.
    _CHR_PREFIX = re.compile(r"^[Cc][Hh][Rr]")

    def __init__(self, record: "VariantRecord"):  # pylint: disable=W0231
        """Keep a reference to ``record``; annotations are written to it in place."""
        self.record = record

    def validate_overlap(self, start: int, end: int, overlap: float) -> bool:
        """Return whether ``start``-``end`` overlaps the record enough.

        The shared length must be at least ``overlap`` times the length of
        the shorter of the two intervals.
        """
        shared = min(self.record.stop, end) - max(self.record.start, start)
        shorter = min(self.record.stop - self.record.start, end - start)
        return shared >= shorter * overlap

    @property
    def pk(self) -> str:
        """Key of the record as ``"<chrom>:<start>-<stop>"``."""
        return f"{self.record.chrom}:{self.record.start}-{self.record.stop}"

    def query_tabix(self, tbx: "TabixFile", chrom_with_chr: bool = False) -> Iterable[str]:
        """Fetch the lines of ``tbx`` that cover the record's interval.

        The record's contig name is normalised first: a leading "chr" is
        removed, then re-added when ``chrom_with_chr`` is true.  Any error
        raised by ``tbx.fetch`` is treated as "no data" and an empty list is
        returned, so a contig missing from one resource does not abort the run.
        """
        bare = self._CHR_PREFIX.sub("", self.record.chrom)
        contig = f"chr{bare}" if chrom_with_chr else bare
        try:
            return tbx.fetch(contig, self.record.start, self.record.stop)
        except Exception as err:  # pylint: disable=W0718
            # Best effort by design, but leave a trace for troubleshooting.
            # This is the same named logger the module creates at import time.
            logging.getLogger("Annotate CNV").debug(
                "tabix fetch failed for %s:%s-%s: %s", contig, self.record.start, self.record.stop, err
            )
            return []

    def anno_gene(self, tbx: "TabixFile", gene_ids: dict):
        """Set INFO ``GENE``, ``GENE_ID`` and ``DETAIL`` from overlapping transcripts.

        Transcripts are grouped by ``Transcript.gene_pk`` in first-seen order.
        ``gene_ids`` maps that key to a gene id ("." when absent).  The
        per-transcript details of one gene are joined with "|".  Nothing is
        written when no transcript is returned.
        """
        per_gene: dict[str, dict] = {}
        for line in self.query_tabix(tbx, True):
            transcript = Transcript(line.strip().split("\t"))
            detail = transcript.calc_detail(self.record.start, self.record.stop)
            entry = per_gene.setdefault(
                transcript.gene_pk,
                {"gene": transcript.gene, "gene_id": gene_ids.get(transcript.gene_pk, "."), "details": []},
            )
            entry["details"].append(detail)
        if per_gene:
            entries = list(per_gene.values())
            self.record.info["GENE"] = tuple(entry["gene"] for entry in entries)
            self.record.info["GENE_ID"] = tuple(entry["gene_id"] for entry in entries)
            self.record.info["DETAIL"] = tuple("|".join(entry["details"]) for entry in entries)

    def anno_cytoband(self, tbx: "TabixFile"):
        """Set INFO ``LOCATION`` to the contig (without "chr") plus the band name(s).

        Column 3 of the first and of the last returned row is used; when they
        differ both are appended back to back.  With no rows only the contig
        name is stored.
        """
        rows = [line.rstrip().split("\t") for line in self.query_tabix(tbx, True)]
        band = ""
        if rows:
            first, last = rows[0][3], rows[-1][3]
            band = first if first == last else f"{first}{last}"
        self.record.info["LOCATION"] = self._CHR_PREFIX.sub("", f"{self.record.chrom}{band}")

    def _anno_frequency(
        self, tbx: "TabixFile", overlap: float, info_key: str, an_col: int, gain_col: int, loss_col: int
    ) -> None:
        """Shared body of ``anno_dgv`` and ``anno_decipher``.

        Column 0 is read as the accession, 2 and 3 as start and end; the
        remaining column indices are supplied by the caller.  Rows with an
        empty gain or loss count, or without sufficient overlap, are skipped.
        Frequencies are count / ``an`` (0 when ``an`` is 0).
        """
        hits = []
        for line in self.query_tabix(tbx):
            row = line.rstrip().split("\t")
            accession, start, end, an = row[0], int(row[2]), int(row[3]), int(row[an_col])
            gain_ac, loss_ac = row[gain_col], row[loss_col]
            if not (gain_ac and loss_ac):
                continue
            gain_count, loss_count = int(gain_ac), int(loss_ac)
            if not self.validate_overlap(start, end, overlap):
                continue
            gain_af = gain_count / an if an else 0
            loss_af = loss_count / an if an else 0
            hits.append(f"{accession}:{self.record.chrom}:{start}:{end}:{gain_af}:{loss_af}")
        if hits:
            self.record.info[info_key] = "|".join(hits)

    def anno_dgv(self, tbx: "TabixFile", overlap: float):
        """Set INFO ``DGV`` (columns: 14 sample count, 15 gain count, 16 loss count)."""
        self._anno_frequency(tbx, overlap, "DGV", an_col=14, gain_col=15, loss_col=16)

    def anno_decipher(self, tbx: "TabixFile", overlap: float):
        """Set INFO ``DECIPHER`` (columns: 4 sample count, 7 gain count, 14 loss count)."""
        self._anno_frequency(tbx, overlap, "DECIPHER", an_col=4, gain_col=7, loss_col=14)

    def anno_clinvar(self, tbx: "TabixFile"):
        """Set INFO ``CLINVAR`` from GRCh38 rows whose type starts with "copy_number".

        Each entry is ``<variant id>:<chrom>:<start>:<end>:<review status>:
        <significance>`` with the significance percent-encoded; entries are
        joined with "|".
        """
        hits = []
        for line in self.query_tabix(tbx):
            row = line.rstrip().split("\t")
            variant_type, clnsig, assembly = row[1], row[6], row[16]
            start, end = int(row[19]), int(row[20])
            revstat, variant_id = row[24], row[30]
            if assembly == "GRCh38" and variant_type.startswith("copy_number"):
                # NOTE(review): only the significance is percent-encoded; the
                # review status is written verbatim - confirm that is intended.
                hits.append(f"{variant_id}:{self.record.chrom}:{start}:{end}:{revstat}:{quote(clnsig)}")
        if hits:
            self.record.info["CLINVAR"] = "|".join(hits)

    def anno_clingen(self, tbx: "TabixFile"):
        """Set INFO ``CLINGEN`` as ``<chrom>:<start>:<end>:<col 3>:<col 4>`` entries joined by "|".

        Columns 3 and 4 are named ``hi_score`` and ``ts_score`` by the
        original code; every returned row is reported, with no overlap filter.
        """
        hits = []
        for line in self.query_tabix(tbx):
            row = line.rstrip().split("\t")
            start, end, hi_score, ts_score = int(row[1]), int(row[2]), row[3], row[4]
            hits.append(f"{self.record.chrom}:{start}:{end}:{hi_score}:{ts_score}")
        if hits:
            self.record.info["CLINGEN"] = "|".join(hits)

    def anno_local(self, tbx: "TabixFile", overlap: float):
        """Set INFO ``LOCAL`` from a local frequency table.

        Columns 1-7 are read as start, end, gain count, loss count, sample
        count, gain frequency and loss frequency and copied through as text.
        Rows with an empty gain or loss count, or without sufficient overlap,
        are skipped.
        """
        hits = []
        for line in self.query_tabix(tbx):
            row = line.rstrip().split("\t")
            start, end = int(row[1]), int(row[2])
            gain_ac, loss_ac, an, gain_af, loss_af = row[3], row[4], row[5], row[6], row[7]
            if gain_ac and loss_ac and self.validate_overlap(start, end, overlap):
                hits.append(f"{self.record.chrom}:{start}:{end}:{gain_ac}:{loss_ac}:{an}:{gain_af}:{loss_af}")
        if hits:
            self.record.info["LOCAL"] = "|".join(hits)
没有合适的资源?快使用搜索试试~ 我知道了~
CNV注释软件,基于Python 3.11(源码)
共10个文件
py:5个
sh:1个
gitignore:1个
需积分: 5 0 下载量 132 浏览量
2024-10-09
14:42:03
上传
评论
收藏 7KB ZIP 举报
温馨提示
python CNV注释软件,基于Python 3.11(源码)
资源推荐
资源详情
资源评论
收起资源包目录
openanno-cnv-master.zip (10个子文件)
openanno-cnv-master
setup.py 38B
CHANGELOG.md 2KB
openanno_cnv
__init__.py 81B
anno.py 4KB
__main__.py 1KB
cnv.py 7KB
.gitignore 51B
setup.cfg 364B
release.sh 459B
Jenkinsfile 555B
共 10 条
- 1
资源评论
LeonDL168
- 粉丝: 2563
- 资源: 639
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
- Untitled7.ipynb
- C#ASP.NET酒店管理系统源码 宾馆管理系统源码数据库 SQL2008源码类型 WebForm
- 【安卓毕业设计】基于安卓的奶牛管理源码(完整前后端+mysql+说明文档).zip
- 【安卓毕业设计】Android app作业源码(完整前后端+mysql+说明文档).zip
- Scrapy基础(讲解详细、包括框架流程代码实战,最佳学习资料).zip
- FPGA实现IIC通信quartus工程,纯verilog,可进行移植
- C#ASP.NET外贸订单管理系统源码 汽配订单管理系统源码数据库 SQL2008源码类型 WebForm
- 基于双流Faster R-CNN网络的图像篡改检测项目源码+训练好的模型+文档说明.zip
- 买的USB转485串口的驱动程序,使用的是美国TI芯片+WCH340芯片
- 二次平台培训视频,人事管理
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功