数据预处理，python读取excel数据，分类属性数值化

共3个文件

py：3个

python

3星 · 超过75%的资源需积分: 40 197 浏览量 2018-10-07 10:06:45 上传评论 9 收藏 3KB RAR 举报

资源推荐

资源详情

资源评论

收起资源包目录

Data Preprocessing.rar （3个子文件）

Data Preprocessing

Categorical Attributes Digitalization.py 4KB

__init__.py 0B

Excel Reading Example.py 2KB

"""Convert categorical (string-valued) Excel data into numeric codes.

The input sheet is read with rows as data objects and columns as attributes.
Every attribute is categorical and stored as text; each column is re-encoded
independently so that its distinct values become 0, 1, 2, ... in order of
first appearance.  The encoded table is then written to a new workbook.
"""

import sys

# Paths used when the script is run without command-line arguments.
DEFAULT_INPUT_PATH = 'E:/Academic Research/HABOC相关（LISS2018）/data/HABOS数据库/Voting_NoLabelData.xlsx'
DEFAULT_OUTPUT_PATH = 'E:/Academic Research/HABOC相关（LISS2018）/data/HABOS数据库/Voting_NoLabelDataDigitalization.xlsx'


def readExcel(path):
    """Read the first sheet of the workbook at *path* into a list of rows.

    Prints the workbook's sheet names and the name of the sheet being read.

    NOTE(review): xlrd 2.0 and later read only legacy ``.xls`` files; reading
    ``.xlsx`` input needs an older xlrd or a different reader -- confirm
    against the installed version.

    Args:
        path: Location of the Excel file to read.

    Returns:
        A list with one inner list of cell values per sheet row.
    """
    # Imported here so the pure conversion helpers in this module remain
    # usable on machines without the Excel libraries installed.
    import xlrd

    # The context manager releases the workbook's resources even on error.
    with xlrd.open_workbook(path) as workbook:
        print(workbook.sheet_names())
        data_sheet = workbook.sheet_by_index(0)
        print(data_sheet.name)
        return [data_sheet.row_values(row) for row in range(data_sheet.nrows)]


def _encode_first_seen(values):
    """Return ``(categories, codes)`` for *values*.

    ``categories`` holds the distinct values in order of first appearance and
    ``codes[k]`` is the position of ``values[k]`` inside ``categories``.
    """
    try:
        # Fast path: one dict lookup per value instead of a list scan.
        code_of = {}
        codes = [code_of.setdefault(value, len(code_of)) for value in values]
        return list(code_of), codes
    except TypeError:
        # Unhashable values (e.g. nested lists): fall back to equality scans.
        categories = []
        codes = []
        for value in values:
            if value not in categories:
                categories.append(value)
            codes.append(categories.index(value))
        return categories, codes


def digitalization(list):
    """Replace every categorical value by a per-column numeric code.

    Each column is encoded independently; within a column the distinct values
    are numbered 0, 1, 2, ... in order of first appearance.  The column count
    is taken from the first row.  For every column the first value and the
    list of distinct values are printed, which documents the code mapping.

    Args:
        list: Table as a list of rows (name kept for caller compatibility).

    Returns:
        A new list of rows with the same shape whose entries are floats,
        matching the float matrix the previous implementation produced.
        An empty table yields an empty list.
    """
    if not list:
        return []

    column_count = len(list[0])
    encoded = [[0.0] * column_count for _ in list]
    for col in range(column_count):
        column = getColValues(list, col)
        categories, codes = _encode_first_seen(column)
        print(column[0])
        print(categories)
        for encoded_row, code in zip(encoded, codes):
            encoded_row[col] = float(code)
    return encoded


def unique(list):
    """Return the distinct elements of *list* in order of first appearance."""
    return _encode_first_seen(list)[0]


def getColValues(list, i):
    """Return column *i* of the two-dimensional *list* as a flat list."""
    return [row[i] for row in list]


def writeExcel(list, path):
    """Write the rows in *list* to a new workbook at *path*.

    xlsxwriter is used instead of xlwt because xlwt fails beyond 65536 rows
    ("ValueError: row index (65536) not an int in range(65536)"), whereas
    xlsxwriter supports up to 1048576 rows and 16384 columns.

    Args:
        list: Rows to write; row ``k`` is written starting at cell ``A(k+1)``.
        path: Destination path of the ``.xlsx`` file.
    """
    import xlsxwriter

    # The context manager closes (and thereby saves) the workbook on exit.
    with xlsxwriter.Workbook(path) as workbook:
        worksheet = workbook.add_worksheet('sheet1')
        for row_index, row in enumerate(list):
            worksheet.write_row(row_index, 0, row)


def main(argv=None):
    """Read the input workbook, encode it and write the encoded workbook.

    Args:
        argv: Optional ``[input_path, output_path]``; defaults to the
            command-line arguments.  Missing entries fall back to
            ``DEFAULT_INPUT_PATH`` / ``DEFAULT_OUTPUT_PATH``.
    """
    args = sys.argv[1:] if argv is None else argv
    pathData = args[0] if len(args) > 0 else DEFAULT_INPUT_PATH
    pathExcel = args[1] if len(args) > 1 else DEFAULT_OUTPUT_PATH

    # The cells are text and must be turned into numbers before writing.
    dataList = readExcel(pathData)
    dataListDigit = digitalization(dataList)
    print(type(dataListDigit))
    writeExcel(dataListDigit, pathExcel)


if __name__ == '__main__':
    main()

评论收藏

内容反馈