#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
#1.csv,2.csv,3.csv文件为LABELEVENTS_mini_0,1,2文件经过po2,pco2指标筛选过且删除第一列,以及po2,pco2删除指标为Error行之后的数据表格
df=pd.read_csv('1.csv',na_values=[' '])#导入数据与缺失值处理
df['CHARTTIME']=pd.to_datetime(df['CHARTTIME'],format='%Y-%m-%d %H:%M:%S')#时间标准化
#按照病人编号和时间排序
df=df.sort_values(by=['SUBJECT_ID','CHARTTIME'])
#表格中相同时间的po2,pc02在两行,处理为同一行
df['VALUE_2']=[0]*len(df)
df['VALUENUM_2']=[0]*len(df)
df['FLAG_2']=range(0,len(df),1)
for i in range(len(df)-1):
#for j in range(10):
if df.iat[i,1]==df.iat[i+1,1] and df.iat[i,4]==df.iat[i+1,4]:
df.iat[i,9]=df.iat[i+1,5]
df.iat[i,10]=df.iat[i+1,6]
df.iat[i,11]=df.iat[i+1,8]
df=df[~df['VALUE_2'].isin([0])]
print(df.head(10))
#同理数据1
df1=pd.read_csv('2.csv',na_values=[' '])
df1['CHARTTIME']=pd.to_datetime(df1['CHARTTIME'],format='%Y-%m-%d %H:%M:%S')
df1=df1.sort_values(by=['SUBJECT_ID','CHARTTIME'])
df1['VALUE_2']=[0]*len(df1)
df1['VALUENUM_2']=[0]*len(df1)
df1['FLAG_2']=range(0,len(df1),1)
for i in range(len(df1)-1):
#for j in range(10):
if df1.iat[i,1]==df1.iat[i+1,1] and df1.iat[i,4]==df1.iat[i+1,4]:
df1.iat[i,9]=df1.iat[i+1,5]
df1.iat[i,10]=df1.iat[i+1,6]
df1.iat[i,11]=df1.iat[i+1,8]
df1=df1[~df1['VALUE_2'].isin([0])]
print(df1.head(10))
#同理数据1
df2=pd.read_csv('3.csv',na_values=[' '])
df2['CHARTTIME']=pd.to_datetime(df2['CHARTTIME'],format='%Y-%m-%d %H:%M:%S')
df2=df2.sort_values(by=['SUBJECT_ID','CHARTTIME'])
df2['VALUE_2']=[0]*len(df2)
df2['VALUENUM_2']=[0]*len(df2)
df2['FLAG_2']=range(0,len(df2),1)
for i in range(len(df2)-1):
#for j in range(10):
if df2.iat[i,1]==df2.iat[i+1,1] and df2.iat[i,4]==df2.iat[i+1,4]:
df2.iat[i,9]=df2.iat[i+1,5]
df2.iat[i,10]=df2.iat[i+1,6]
df2.iat[i,11]=df2.iat[i+1,8]
df2=df2[~df2['VALUE_2'].isin([0])]
print(df2.head(10))
df=df.append(df1,ignore_index=True)
df=df.append(df2,ignore_index=True)
#df=df.sort_values(by=['SUBJECT_ID','CHARTTIME'])
df1=df.drop(columns=['ROW_ID','ITEMID','VALUENUM','VALUE_2','VALUEUOM','FLAG','FLAG_2'])
df1.rename(columns={'VALUE':'PCO2','VALUENUM_2':'PO2'},inplace=True)
df1.head()
df4=pd.read_csv('4.csv',na_values=[' ','ERROR'])#ERROR变为Nan
df5=pd.read_csv('5.csv',na_values=[' ','ERROR','GREATER THAN 800','CLOTTED SPECIMEN','DISREGARD PREVIOUS RESULT OF 156'])
df6=pd.read_csv('6.csv',na_values=[' '])
df7=pd.read_csv('7.csv',na_values=[' ','ERROR'])
def chuli(df):
df['CHARTTIME']=pd.to_datetime(df['CHARTTIME'],format='%Y-%m-%d %H:%M:%S')#时间标准化
df=df.drop(columns=['ROW_ID','ITEMID','VALUENUM','VALUEUOM','FLAG'])
df.rename(columns={'VALUE':'PCO2'},inplace=True)
df['PO2']=[0]*len(df)
df=df.fillna(0)#缺失值处理,Nan变为0
a=len(df)-1
for i in range(a):
if df.iat[i,0]==df.iat[i+1,0] and df.iat[i,2]==df.iat[i+1,2]:
df.iat[i,4]=df.iat[i+1,3]
df=df[~df['PO2'].isin([0])] #删去多余行以及删去错误行
return df
df4=chuli(df4)
df5=chuli(df5)
df6=chuli(df6)
df7=chuli(df7)
df7=df7.append(df4,ignore_index=True)
df7=df7.append(df5,ignore_index=True)
df7=df7.append(df5,ignore_index=True)
#与前3个数据合并
# df1=df1.append(df7,ignore_index=True)
#df7=df7.sort_values(by=['SUBJECT_ID','CHARTTIME'])
df7.head(40)
#将53个chartevents放在了一张表
df8=pd.read_csv('chartevents.csv',na_values=[' ','VALUE','no data','########'])
df8=df8.fillna(0)
#df8['CHARTTIME']=pd.to_datetime(df8['CHARTTIME'].values,unit='ms')
df8=df8.drop(columns=['VALUENUM','VALUEUOM','WARNING','ERROR','RESULTSTATUS','STOPPED','STORETIME','CGID','ICUSTAY_ID'])
df8.rename(columns={'VALUE':'PCO2'},inplace=True)
df8['PO2']=[0]*len(df8)
for i in range(len(df8)-1):
if df8.iat[i,0]==df8.iat[i+1,0] and df8.iat[i,3]==df8.iat[i+1,3]:
df8.iat[i,5]=df8.iat[i+1,4]
df8.iat[i,5]=int(df8.iat[i,5])
df8=df8[~df8['PO2'].isin([0])] #删去多余行以及删去错误行
df8=df8[~df8['CHARTTIME'].isin([0])]
df8=df8.drop(columns=['ITEMID'])
#df8['CHARTTIME']=pd.to_datetime(df8['CHARTTIME'])
type(df8.iat[1,2])
df1=df1.append(df7,ignore_index=True)
#df1=df1.append(df8,ignore_index=True)
df1=df1.sort_values(by=['SUBJECT_ID','CHARTTIME'])
df1.head(40)
a=df1.iat[1,2]-df1.iat[0,2]
n=0
if a<=pd.Timedelta('1 days'):
print(True)
for i in range(1,(len(df1)-1)):
if df1.iat[i,0]==df1.iat[i+1,0] and (df1.iat[i+1,2]-df1.iat[i,2])<=pd.Timedelta('1 days') :
a=df1.iat[i+1,2]-df1.iat[i,2]+a
n=n+1
m=a/n #m=Timedelta('0 days 04:19:42.881419'),求得平均差值时间
mean_value=pd.Timedelta('0 days 04:00:00') #近似集中两次采样时间为两个小时半
for i in range(len(df1)-1):
if type(df1.iat[i,4])==str :
df1.iat[i,4]=int(df1.iat[i,4])
elif type(df1.iat[i,3])==str:
df1.iat[i,3]=int(df1.iat[i,3])
type(df1.iat[65674,0])
#插值
for i in range(len(df1)-1):
if df1.iat[i,0]==df1.iat[i+1,0]and (df1.iat[i+1,2]-df1.iat[i,2])<=pd.Timedelta('1 days') :
if (df1.iat[i+1,2]-df1.iat[i,2])>=pd.Timedelta('0 days 04:00:00') and (df1.iat[i+1,2]-df1.iat[i,2])<=pd.Timedelta('0 days 08:00:00') :
# 差距大于5个小时且小于7个半小时插一个点
s = {'SUBJECT_ID':df1.iat[i,0],'HADM_ID':df1.iat[i,1], 'CHARTTIME':df1.iat[i+1,2]-pd.Timedelta('0 days 04:00:00'), 'PCO2': int((df1.iat[i,3]+df1.iat[i+1,3])/2 ),'PO2': int((df1.iat[i,4]+df1.iat[i+1,4])/2 )}
s = pd.Series(s)
s.name = 'a'
df1 = df1.append(s)
n=n+1
elif (df1.iat[i+1,2]-df1.iat[i,2])>=pd.Timedelta('0 days 08:00:00') :# 差距大于7个半小时插两个点
s = {'SUBJECT_ID':df1.iat[i,0],'HADM_ID':df1.iat[i,1], 'CHARTTIME':df1.iat[i+1,2]-pd.Timedelta('0 days 08:00:00'), 'PCO2':int(df1.iat[i,3]+(df1.iat[i+1,3]-df1.iat[i,3])/3),'PO2': int(df1.iat[i,4]+(df1.iat[i+1,4]-df1.iat[i,4])/3)}
s = pd.Series(s)
s.name = 'c'
s1 = {'SUBJECT_ID':df1.iat[i,0],'HADM_ID':df1.iat[i,1], 'CHARTTIME':df1.iat[i+1,2]-pd.Timedelta('0 days 04:00:00'), 'PCO2':int(df1.iat[i,3]+2*(df1.iat[i+1,3]-df1.iat[i,3])/3),'PO2':int(df1.iat[i,4]+2*(df1.iat[i+1,4]-df1.iat[i,4])/3)}
s1 = pd.Series(s1)
s1.name = 'd'
df1 = df1.append(s)
df1 = df1.append(s1)
n=n+2
df1=df1.sort_values(by=['SUBJECT_ID','CHARTTIME'])
df1.head(50)
df1=df1.append(df8,ignore_index=True)
#df1=df1.sort_values(by=['SUBJECT_ID','CHARTTIME'])
df1.head()
#离群点
import matplotlib.pyplot as plt
import numpy as np
x=df1.iloc[:,0]
y=df1.iloc[:,3]
z=df1.iloc[:,4]
y2=np.array([130]*len(df1))#PCO2离群点划线
z2=np.array([520]*len(df1))#PO2离群点划线
fig=plt.figure()
ax1=fig.add_subplot(1,1,1)
ax1.set_title('PCO2_1')
ax1.set_xlabel('SUBJECT_ID')
ax1.set_ylabel('PCO2')
ax1.scatter(x,y,marker='.')
ax1.plot(x,y2,c='r')
plt.show()
fig=plt.figure()
ax2=fig.add_subplot(1,1,1)
ax2.set_title('PO2_1')
ax2.set_xlabel('SUBJECT_ID')
ax2.set_ylabel('PO2')
ax2.scatter(x,z,c='m',marker='.')
ax2.plot(x,z2,c='k')
plt.show()
#离群点
for i in range(1,(len(df1)-1)):
if df1.iat[i,0]==df1.iat[i+1,0]:
if df1.iat[i,3]>130:
df1.iat[i,3]=(df1.iat[i-1,3]+df1.iat[i+1,3])/2#pco2离群点上下两个数据均值
elif df1.iat[i,4]>520:
df1.iat[i,4]=(df1.iat[i-1,4]+df1.iat[i+1,4])/2#po2离群点上下两个数据均值
x=df1.iloc[:,0]
y=df1.iloc[:,3]
z=df1.iloc[:,4]
y2=np.array([130]*len(df1))#PCO2离群点划线
z2=np.array([520]*len(df1))#PO2离群点划线
fig=plt.figure()
ax1=fig.add_subplot(1,1,1)
ax1.set_title('PCO2_2')
ax1.set_xlabel('SUBJECT_ID')
ax1.set_ylabel('PCO2')
ax1.scatter(x,y,marker='.')
ax1.plot(x,y2,c='r')
plt.show()
fig=plt.figure()
ax2=fig.add_subplot(1,1,1)
ax
评论0