import pandas as pd
import numpy as np
import gc
import re
import sys
import time
import jieba
import os.path
import os
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.models import Word2Vec
# Load the raw tables from CSV files in the current working directory.
df_molecule = pd.read_csv('df_molecule.csv')
df_protein_test_data = pd.read_csv('df_protein_test.csv')
df_affinity_test_toBePredicted = pd.read_csv('df_affinity_test_toBePredicted.csv')
df_protein_train_data = pd.read_csv('df_protein_train.csv')
df_affinity_train = pd.read_csv('df_affinity_train.csv')

# Keep the training labels: take the 'Ki' column out of the affinity table so
# that only the ID columns remain in df_affinity_train.
train_label = df_affinity_train.pop('Ki')
# Extract basic count / ratio features from the protein sequences
def fun_D_N_rate(x):
    """Return the fraction of characters in *x* that are 'D' or 'N'.

    Args:
        x: Sequence string.

    Returns:
        float: (count of 'D' + count of 'N') / len(x), or 0.0 for an empty
        string (the ratio helpers in this file also return 0 rather than
        raising when the denominator is zero).
    """
    if not x:
        return 0.0
    m = x.count('D') + x.count('N')
    # float() is applied before dividing so the result is always a true ratio.
    return float(m) / len(x)
def fun_ALL_rate(x):
    """Return occurrences of the substring 'ALL' in *x* divided by len(x).

    Args:
        x: Sequence string.

    Returns:
        float: number of non-overlapping 'ALL' substrings / len(x), or 0.0
        for an empty string.
    """
    # NOTE(review): this counts the literal three-letter substring 'ALL', not
    # "all residues" -- confirm that is the intended feature.
    if not x:
        return 0.0
    m = x.count('ALL')
    # float() is applied before dividing so the result is always a true ratio.
    return float(m) / len(x)
def fun_water_rate(x):
    """Return the fraction of characters in *x* that are V, I, L, F, W, Y or M.

    Args:
        x: Sequence string.

    Returns:
        float: combined count of those seven letters / len(x), or 0.0 for an
        empty string.
    """
    if not x:
        return 0.0
    m = sum(x.count(residue) for residue in 'VILFWYM')
    # float() is applied before dividing so the result is always a true ratio.
    return float(m) / len(x)
def fun_f_w_y_h_rate(x):
    """Return the fraction of characters in *x* that are H, F, W or Y.

    Args:
        x: Sequence string.

    Returns:
        float: combined count of those four letters / len(x), or 0.0 for an
        empty string.
    """
    if not x:
        return 0.0
    m = sum(x.count(residue) for residue in 'HFWY')
    # float() is applied before dividing so the result is always a true ratio.
    return float(m) / len(x)
def fun_v_i_l_m_rate(x):
    """Return the fraction of characters in *x* that are V, I, L or M.

    Args:
        x: Sequence string.

    Returns:
        float: combined count of those four letters / len(x), or 0.0 for an
        empty string.
    """
    if not x:
        return 0.0
    m = sum(x.count(residue) for residue in 'VILM')
    # float() is applied before dividing so the result is always a true ratio.
    return float(m) / len(x)
def fun_lovewater_rate(x):
    """Return the fraction of characters in *x* that are S, T, H, N, Q, E, D, K or R.

    Args:
        x: Sequence string.

    Returns:
        float: combined count of those nine letters / len(x), or 0.0 for an
        empty string.
    """
    if not x:
        return 0.0
    m = sum(x.count(residue) for residue in 'STHNQEDKR')
    # float() is applied before dividing so the result is always a true ratio.
    return float(m) / len(x)
def fun_krh_rate(x):
    """Return the fraction of characters in *x* that are K, R or H.

    Args:
        x: Sequence string.

    Returns:
        float: combined count of those three letters / len(x), or 0.0 for an
        empty string.
    """
    if not x:
        return 0.0
    m = sum(x.count(residue) for residue in 'KRH')
    # float() is applied before dividing so the result is always a true ratio.
    return float(m) / len(x)
def fun_de_rate(x):
    """Return the fraction of characters in *x* that are D or E.

    Args:
        x: Sequence string.

    Returns:
        float: (count of 'D' + count of 'E') / len(x), or 0.0 for an empty
        string.
    """
    if not x:
        return 0.0
    m = x.count('D') + x.count('E')
    # float() is applied before dividing so the result is always a true ratio.
    return float(m) / len(x)
def fun_PGAS_rate(x):
    """Return the fraction of characters in *x* that are P, G, A or S.

    Args:
        x: Sequence string.

    Returns:
        float: combined count of those four letters / len(x), or 0.0 for an
        empty string.
    """
    if not x:
        return 0.0
    m = sum(x.count(residue) for residue in 'PGAS')
    # float() is applied before dividing so the result is always a true ratio.
    return float(m) / len(x)
def fun_zhengfulizi_rate(x):
    """Return the ratio of K/R/H characters to D/E characters in *x*.

    Args:
        x: Sequence string.

    Returns:
        float: (count of K + R + H) / (count of D + E), or 0.0 when *x*
        contains no 'D' or 'E' (the ratio is undefined there).
    """
    m2 = sum(x.count(residue) for residue in 'KRH')
    m1 = x.count('D') + x.count('E')
    # Explicit guard instead of a bare ``except`` so unrelated errors are not
    # silently turned into a 0 feature value.
    if m1 == 0:
        return 0.0
    return float(m2) / m1
def water_likeorhate_rate(x):
    """Return the ratio of S/T/H/N/Q/E/D/K/R characters to V/I/L/F/W/Y/M characters.

    Args:
        x: Sequence string.

    Returns:
        float: count of the first letter group / count of the second group,
        or 0.0 when *x* contains none of V, I, L, F, W, Y, M (the ratio is
        undefined there).
    """
    m1 = sum(x.count(residue) for residue in 'STHNQEDKR')
    m2 = sum(x.count(residue) for residue in 'VILFWYM')
    # Explicit guard instead of a bare ``except`` so unrelated errors are not
    # silently turned into a 0 feature value.
    if m2 == 0:
        return 0.0
    return float(m1) / m2
def protein_sequence_num_feat(df):
    """Add per-letter count columns and a length column to *df*.

    For each of the 20 letters A, R, N, D, C, E, Q, G, H, I, L, K, M, F, P,
    S, T, W, Y, V a column ``<letter>_num`` is added holding how often that
    letter occurs in the row's ``Sequence`` string, followed by ``len_se``
    with the sequence length.

    Args:
        df: DataFrame with a ``Sequence`` column of strings.

    Returns:
        The same DataFrame object, modified in place.
    """
    sequences = df['Sequence']
    # The letter order fixes the order of the generated columns.
    for residue in 'ARNDCEQGHILKMFPSTWYV':
        # Bind ``residue`` as a default so each lambda counts its own letter.
        df[residue + '_num'] = sequences.map(
            lambda seq, residue=residue: seq.count(residue))
    df['len_se'] = sequences.map(len)
    # The ratio helpers defined in this file (fun_D_N_rate, fun_water_rate,
    # ..., water_likeorhate_rate) are currently not added as columns here.
    return df
# Build the protein count features for the training and test proteins.
# protein_sequence_num_feat adds its columns to the frame it receives and
# returns that same frame, so the Sequence column is still present here.
df_protein_train_feat = protein_sequence_num_feat(df_protein_train_data)
df_protein_test_feat = protein_sequence_num_feat(df_protein_test_data)

# Expand the Fingerprint string into one integer column per sampled character.
# Every third character is read (offsets 0, 3, 6, ...), which presumably skips
# a two-character separator between single-digit values -- TODO confirm
# against df_molecule.csv. Column names carry the character offset
# (Fingerprint_0, Fingerprint_3, ...).
# The width is taken from the first row; all rows are assumed to have the
# same length.
fingerprint_width = len(df_molecule['Fingerprint'].iloc[0])
for offset in range(0, fingerprint_width, 3):
    df_molecule['Fingerprint' + '_' + str(offset)] = df_molecule.Fingerprint.map(
        lambda bits, offset=offset: int(bits[offset]))
del df_molecule['Fingerprint']

# Training table: affinity pairs + protein features + molecule features.
df_protein_train_feat = pd.merge(df_affinity_train, df_protein_train_feat,
                                 on='Protein_ID', how='left')
df_protein_train_feat = pd.merge(df_protein_train_feat, df_molecule,
                                 on='Molecule_ID', how='left')
# Fill missing numeric values with the column mean. numeric_only=True is
# required because the frame still contains the string Sequence column, which
# has no mean (recent pandas raises instead of skipping it).
df_protein_train_feat = df_protein_train_feat.fillna(
    df_protein_train_feat.mean(numeric_only=True))

# Test table: same construction for the pairs to be predicted.
df_protein_test_feat = pd.merge(df_affinity_test_toBePredicted, df_protein_test_feat,
                                on='Protein_ID', how='left')
df_protein_test_feat = pd.merge(df_protein_test_feat, df_molecule,
                                on='Molecule_ID', how='left')
df_protein_test_feat = df_protein_test_feat.fillna(
    df_protein_test_feat.mean(numeric_only=True))
# Protein embedding features: each protein is represented by the mean
# Word2Vec vector of the 3-character words in its sequence.
n = 128
vec_columns = ["vec_{0}".format(i) for i in range(0, n)]

# Cut every sequence into consecutive, non-overlapping 3-character words; a
# trailing remainder shorter than 3 characters is dropped by the regex.
# test sequences
texts = [re.findall(r'.{3}', document)
         for document in df_protein_test_data['Sequence']]
# train sequences
texts2 = [re.findall(r'.{3}', document)
          for document in df_protein_train_data['Sequence']]

# One model is trained on train + test sequences together so that both sets
# of protein vectors live in the same embedding space; vectors from two
# separately trained models would not be comparable.
# gensim 4 renamed Word2Vec's ``size`` argument to ``vector_size``; pick the
# keyword the installed version understands.
_dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
model = Word2Vec(texts2 + texts, window=4, min_count=1, negative=3,
                 sg=1, sample=0.001, hs=1, workers=4, **{_dim_kwarg: n})


def _mean_word_vectors(tokenised_sequences, protein_ids):
    """Return one row per protein: the mean vector of its words plus Protein_ID.

    min_count=1 keeps every word of the training corpus in the vocabulary, so
    each lookup succeeds. A sequence that yields no words (shorter than 3
    characters) gets NaN in every vec_* column.
    """
    rows = []
    for words in tokenised_sequences:
        if words:
            rows.append(np.mean([model.wv[word] for word in words], axis=0))
        else:
            rows.append(np.full(n, np.nan))
    frame = pd.DataFrame(rows, columns=vec_columns)
    # Rows are in the same order as the protein table, so IDs attach by position.
    frame['Protein_ID'] = list(protein_ids)
    return frame


vectors = _mean_word_vectors(texts, df_protein_test_data['Protein_ID'])
vectors2 = _mean_word_vectors(texts2, df_protein_train_data['Protein_ID'])
print('vector_get_finish')

df_protein_train_feat = pd.merge(df_protein_train_feat, vectors2,
                                 on='Protein_ID', how='left')
df_protein_test_feat = pd.merge(df_protein_test_feat, vectors,
                                on='Protein_ID', how='left')
# Remove the identifier and raw-sequence columns from both feature tables,
# in place, so that only the remaining feature columns are left.
for feature_frame in (df_protein_test_feat, df_protein_train_feat):
    feature_frame.drop(columns=['Protein_ID', 'Molecule_ID', 'Sequence'],
                       inplace=True)
print('feature_finish')
#----------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Hold out 20% of the training rows for validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    df_protein_train_feat, train_label, test_size=0.2, random_state=100)

print ('start running ....')
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)

# NOTE(review): 'learning_rate' and 'eta' are two names for the same xgboost
# parameter and are given different values here; decide which one is meant.
# NOTE(review): 'n_estimators' belongs to the scikit-learn wrapper; xgb.train
# takes the number of rounds from num_round below -- confirm it can be dropped.
param = {'learning_rate' : 0.1,
         'n_estimators': 1000,
         'max_depth': 4,
         'min_child_weight': 6,
         'gamma': 0,
         'subsample': 0.8,
         'colsample_bytree': 0.8,
         'eta': 0.05,
         'silent': 1,
         }
num_round = 150
plst = list(param.items())
plst += [('eval_metric', 'rmse')]

# xgboost uses the LAST entry of the evaluation list for early stopping, so the
# validation set must come last; with the training set last, early stopping
# would track training RMSE instead of held-out RMSE.
evallist = [(dtrain, 'train'), (dval, 'eval')]
bst = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=10)

# NOTE(review): depending on the xgboost version, predict() may use all trained
# rounds rather than the best early-stopped iteration -- verify for the
# installed version.
dtest = xgb.DMatrix(df_protein_test_feat)
Pred = bst.predict(dtest)

# Attach the predictions to the submission table and write it out.
df_affinity_test_toBePredicted['Ki'] = Pred
df_affinity_test_toBePredicted.to_csv('baseline_4_12.csv', index=False)
没有合适的资源?快使用搜索试试~ 我知道了~
DC平台蛋白质分子匹配预测.zip
共2个文件
py:2个
需积分: 5 0 下载量 46 浏览量
2024-05-20
17:13:03
上传
评论
收藏 3KB ZIP 举报
温馨提示
蛋白质是生物体中普遍存在的一类重要生物大分子,由天然氨基酸通过肽键连接而成。它具有复杂的分子结构和特定的生物功能,是表达生物遗传性状的一类主要物质。 蛋白质的结构可分为四级:一级结构是组成蛋白质多肽链的线性氨基酸序列;二级结构是依靠不同氨基酸之间的C=O和N-H基团间的氢键形成的稳定结构,主要为α螺旋和β折叠;三级结构是通过多个二级结构元素在三维空间的排列所形成的一个蛋白质分子的三维结构;四级结构用于描述由不同多肽链(亚基)间相互作用形成具有功能的蛋白质复合物分子。 蛋白质在生物体内具有多种功能,包括提供能量、维持电解质平衡、信息交流、构成人的身体以及免疫等。例如,蛋白质分解可以为人体提供能量,每克蛋白质能产生4千卡的热能;血液里的蛋白质能帮助维持体内的酸碱平衡和血液的渗透压;蛋白质是组成人体器官组织的重要物质,可以修复受损的器官功能,以及维持细胞的生长和更新;蛋白质也是构成多种生理活性的物质,如免疫球蛋白,具有维持机体正常免疫功能的作用。 蛋白质的合成是指生物按照从脱氧核糖核酸(DNA)转录得到的信使核糖核酸(mRNA)上的遗传信息合成蛋白质的过程。这个过程包括氨基酸的活化、多肽链合成的起始、肽链的延长、肽链的终止和释放以及蛋白质合成后的加工修饰等步骤。 蛋白质降解是指食物中的蛋白质经过蛋白质降解酶的作用降解为多肽和氨基酸然后被人体吸收的过程。这个过程在细胞的生理活动中发挥着极其重要的作用,例如将蛋白质降解后成为小分子的氨基酸,并被循环利用;处理错误折叠的蛋白质以及多余组分,使之降解,以防机体产生错误应答。 总的来说,蛋白质是生物体内不可或缺的一类重要物质,对于维持生物体的正常生理功能具有至关重要的作用。
资源推荐
资源详情
资源评论
收起资源包目录
DC平台蛋白质分子匹配预测.zip (2个子文件)
content
feature.py 2KB
baseline.py 8KB
共 2 条
- 1
资源评论
生瓜蛋子
- 粉丝: 3828
- 资源: 5678
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功