import pandas as pd
import os
# ## 1. Generate genus-level count tables
# import os
# import pandas as pd
# import math
#
# inputpath_mngs=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-new20230629\宏基因组'
# inputpath_mngs_base=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-new20230629\宏基因组\base.xls'
#
# inputpath_capture=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-new20230629\捕获'
# inputpath_capture_base=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-new20230629\捕获\base.xls'
#
# # outputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-greengene\species_input'
# outputpath2=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-new20230629\genus_input_count'
#
# df_mngs_base=pd.read_csv(inputpath_mngs_base,sep='\t')
# dict_mngs_base=dict(zip(df_mngs_base['name'],df_mngs_base['base']))
# dict_mngs_base={ item: dict_mngs_base[item]/1000000000 for item in dict_mngs_base}
#
# print(dict_mngs_base)
# df_capture_base=pd.read_csv(inputpath_capture_base,sep='\t')
# dict_capture_base=dict(zip(df_capture_base['name'],df_capture_base['base']))
# dict_capture_base={ item: dict_capture_base[item]/1000000000 for item in dict_capture_base}
#
# ## Step 1: find the species shared between the datasets
# ## mNGS data
# list_mngs=[item for item in os.listdir(inputpath_mngs) if item.endswith('.classify.last.xls')]
# list_mngs_sample=list(set([item.split('.',1)[0] for item in list_mngs]))
# print(list_mngs_sample)
#
# list_mngs_need_species=[]
# for i in list_mngs_sample:
# for j in list_mngs:
# if j.startswith(i):
# df1=pd.read_csv(inputpath_mngs+'/'+j,sep='\t')
# df1=df1[df1['G/S']=='genus'].copy()
# df1.sort_values(by=['count'],ascending=False,inplace=True)
#
# if len(df1) <= 30 :
# list_mngs_need_species+=df1['Ename'].tolist()
# else:
# df1=df1.iloc[0:30,:]
# list_mngs_need_species += df1['Ename'].tolist()
#
#
#
# ## capture data
# list_capture=[item for item in os.listdir(inputpath_capture) if item.endswith('.classify.last.xls')]
# list_capture_sample=list(set([item.split('.',1)[0] for item in list_capture]))
# print(list_capture_sample)
#
# list_capture_need_species=[]
# for i in list_capture_sample:
# for j in list_capture:
# if j.startswith(i):
# df1=pd.read_csv(inputpath_capture+'/'+j,sep='\t')
# df1=df1[df1['G/S']=='genus'].copy()
# df1.sort_values(by=['count'],ascending=False,inplace=True)
#
# if len(df1) <= 30 :
# list_capture_need_species +=df1['Ename'].tolist()
# else:
# df1=df1.iloc[0:30,:]
# list_capture_need_species += df1['Ename'].tolist()
#
# list_need_species=list(set(list_capture_need_species+list_mngs_need_species))
#
# print(len(list_need_species))
#
# ## Step 2: generate the tables
#
# ## mNGS data
# list_mngs=[item for item in os.listdir(inputpath_mngs) if item.endswith('.classify.last.xls')]
# list_mngs_sample=list(set([item.split('.',1)[0] for item in list_mngs]))
# print(list_mngs_sample)
#
# # with open(outputpath1,'a') as f1:
# for i in list_mngs_sample:
# with open(outputpath2+'/'+i+'.mngs.input.xls','w') as f1:
# f1.write('%s\t%s\t%s\t%s\n' %('x','y','value','group'))
# for j in list_mngs:
# if j.startswith(i):
# df1=pd.read_csv(inputpath_mngs+'/'+j,sep='\t')
# df1=df1[df1['Ename'].isin(list_need_species)].copy()
#
# df1.sort_values(by=['count'],ascending=False,inplace=True)
# dict_name_to_count = dict(zip(df1['Ename'], df1['count']))
#
# for item in dict_name_to_count:
# # print(type(item),type(dict_name_to_count[item]))
# # print(j.split('.')[1]+'\t'+item+'\t'+str(dict_name_to_count[item]/int(dict_mngs_base[item]))
# f1.write('{}\t{}\t{}\t{}\n'.format(j.split('.')[1],item,str(dict_name_to_count[item]),'mNGS'))
# # print(j)
#
# ## capture data
# list_capture=[item for item in os.listdir(inputpath_capture) if item.endswith('.classify.last.xls')]
# list_capture_sample=list(set([item.split('.',1)[0] for item in list_capture]))
# print(list_capture_sample)
#
# # with open(outputpath1,'a') as f1
# for i in list_capture_sample:
# with open(outputpath2+'/'+i+'.capture.input.xls','w') as f1:
# f1.write('%s\t%s\t%s\t%s\n' %('x','y','value','group'))
# for j in list_capture:
# if j.startswith(i):
# df1=pd.read_csv(inputpath_capture+'/'+j,sep='\t')
# df1=df1[df1['Ename'].isin(list_need_species)].copy()
# df1.sort_values(by=['count'],ascending=False,inplace=True)
#
# dict_name_to_count = dict(zip(df1['Ename'], df1['count']))
#
# for item in dict_name_to_count:
# f1.write('{}\t{}\t{}\t{}\n'.format(j.split('.')[1],item,str(dict_name_to_count[item]),'16S'))
#
#
# list_merge=[item for item in os.listdir(outputpath2) if item.endswith('.xls')]
# list_merge_sample=list(set([item.split('.',1)[0] for item in list_mngs]))
#
# list_kmer=[2,5,8,10,15,20,25,30,40]
#
#
# for i in list_merge:
# df_add = pd.DataFrame(columns=['x'])
# df2=pd.read_csv(outputpath2+'/'+i,sep='\t')
# for j in list_kmer:
# df_part=df2[df2['x']==j].copy()
# df_add=df_add.append(df_part)
# df_add.to_csv(outputpath2 + '/' + i , sep='\t', index=None)
#
#
#
# for i in list_merge_sample:
# df_add=pd.DataFrame(columns=['x'])
# for j in list_merge:
# if j.startswith(i):
# df2=pd.read_csv(outputpath2+'/'+j,sep='\t')
# df_add=df_add.append(df2)
# df_add['value']=df_add['value'].apply(lambda x:int(x))
#
#
# df_add.columns=['kmer','genus','value','group']
# # df_add.sort_values(by=['kmer'],ascending=True,inplace=True)
# df_add.to_csv(outputpath2+'/'+i+'.input.txt',sep='\t',index=None)
# ## 2. Generate genus-level normalized tables
# import os
# import pandas as pd
# import math
#
# inputpath_mngs=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-new20230629\宏基因组'
# inputpath_mngs_base=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-new20230629\宏基因组\base.xls'
#
# inputpath_capture=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-new20230629\捕获'
# inputpath_capture_base=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-new20230629\捕获\base.xls'
#
# # outputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图2\species_input'
# outputpath2=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-new20230629\genus_input'
#
# df_mngs_base=pd.read_csv(inputpath_mngs_base,sep='\t')
# dict_mngs_base=dict(zip(df_mngs_base['name'],df_mngs_base['base']))
# dict_mngs_base={ item: dict_mngs_base[item]/1000000000 for item in dict_mngs_base}
#
# print(dict_mngs_base)
# df_capture_base=pd.read_csv(inputpath_capture_base,sep='\t')
# dict_capture_base=dict(zip(df_capture_base['name'],df_capture_base['base']))
# dict_capture_base={ item: dict_capture_base[item]/1000000000 for item in dict_capture_base}
#
# ## Step 1: find the species shared between the datasets
# ## mNGS data
# list_mngs=[item for item in os.listdir(inputpath_mngs) if item.endswith('.classify.last.xls')]
# list_mngs_sample=list(set([item.split('.',1)[0] for item in list_mngs]))
# print(list_mngs_sample)
#
# list_mngs_need_species=[]
# for i in list_mngs_sample:
# for j in list_mngs:
# if j.startswith(i):
# df1=pd.read_csv(inputpath_mngs+'/'+j,sep='\t')
#