import pandas as pd
import numpy as np
from pandas import DataFrame
from pylab import *
import matplotlib.pyplot as plt
import sys
import random
import time
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
## normalize
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
def gaussian_norm(dataframe):
    """Gaussian (z-score) normalize the feature columns: zero mean, unit variance.

    All columns except the last (the target) are normalized using the
    sample mean and standard deviation from ``DataFrame.describe()``.
    Returns a normalized copy; the input DataFrame is NOT modified.
    (The original aliased the input via ``norm_dataframe = dataframe`` and
    mutated the caller's data — fixed with an explicit copy.)
    """
    norm_dataframe = dataframe.copy()
    summary = dataframe.describe()
    print(dataframe.shape)
    for i in range(dataframe.shape[1] - 1):
        # describe() rows: 0=count, 1=mean, 2=std (sample std, ddof=1)
        mean = summary.iloc[:, i].values[1]
        sd = summary.iloc[:, i].values[2]
        norm_dataframe.iloc[:, i] = (dataframe.iloc[:, i].values - mean) / sd
    return norm_dataframe
def generate_train_test(dataframe, train_pct):
    """Randomly split the rows of ``dataframe`` into train and test sets.

    train_pct: fraction of rows (0..1) sampled without replacement for training.
    Returns ``(train_df, test_df)`` as new DataFrames re-indexed from 0.

    Uses positional ``.iloc`` indexing (the original used ``.loc`` with row
    positions, which only works when the index happens to be 0..n-1).
    """
    row_positions = list(range(dataframe.shape[0]))
    train_positions = random.sample(row_positions, int(train_pct * len(row_positions)))
    test_positions = list(set(row_positions) - set(train_positions))
    train_array = dataframe.iloc[train_positions, :].values
    test_array = dataframe.iloc[test_positions, :].values
    return (pd.DataFrame(train_array), pd.DataFrame(test_array))
def MSE(y_pred, y):
    """Return the mean squared error between predictions ``y_pred`` and truth ``y``.

    Vectorized with NumPy (the original summed a Python list comprehension);
    accepts any array-likes of equal length.
    """
    diff = np.asarray(y_pred) - np.asarray(y)
    return float(np.mean(diff * diff))
def train_pred_MSE(train_dataframe, test_dataframe, alpha_val):
    """Train a Ridge regression with regularization strength ``alpha_val`` on
    the training set and return the mean squared error on the test set.

    The last column of each DataFrame is assumed to be the target; all
    preceding columns are features.
    """
    target_col = train_dataframe.shape[1] - 1
    features_train = train_dataframe.iloc[:, :target_col]
    target_train = train_dataframe.iloc[:, target_col]
    target_col_test = test_dataframe.shape[1] - 1
    features_test = test_dataframe.iloc[:, :target_col_test]
    target_test = test_dataframe.iloc[:, target_col_test]
    ridge = linear_model.Ridge(alpha=alpha_val)
    ridge.fit(features_train, target_train)
    predictions = ridge.predict(features_test)
    return MSE(predictions, target_test)
def train_pred_MSE_model(train_dataframe, test_dataframe, model):
    """Fit ``model`` (any sklearn-style regressor) on the training set and
    evaluate on the test set.

    The last column of each DataFrame is assumed to be the target.
    Returns a ``(test_mse, fitted_coefficients)`` tuple.
    """
    target_col = train_dataframe.shape[1] - 1
    features_train = train_dataframe.iloc[:, :target_col]
    target_train = train_dataframe.iloc[:, target_col]
    target_col_test = test_dataframe.shape[1] - 1
    features_test = test_dataframe.iloc[:, :target_col_test]
    target_test = test_dataframe.iloc[:, target_col_test]
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)
    return (MSE(predictions, target_test), model.coef_)
def generate_nfolds(dataframe, foldK, total_folds):
    """Return ``(train_df, test_df)`` for fold ``foldK`` of a
    ``total_folds``-fold cross-validation split.

    The test fold is a contiguous block of ``n // total_folds`` rows starting
    at ``foldK * fold_size``; all remaining rows form the training set.
    NOTE: remainder rows (``n % total_folds``) never land in a test fold.

    Uses positional ``.iloc`` indexing (the original ``.loc`` required a
    default RangeIndex) and sorts the training positions explicitly
    (``list(set(...))`` order is not guaranteed).
    """
    fold_size = dataframe.shape[0] // total_folds
    start = fold_size * foldK
    test_positions = list(range(start, start + fold_size))
    train_positions = sorted(set(range(dataframe.shape[0])) - set(test_positions))
    train_array = dataframe.iloc[train_positions, :].values
    test_array = dataframe.iloc[test_positions, :].values
    return (pd.DataFrame(train_array), pd.DataFrame(test_array))
def run_CV(dataframe, model, total_folds):
    """Run ``total_folds``-fold cross-validation of ``model`` on ``dataframe``.

    For each fold, trains via ``train_pred_MSE_model`` and collects the test
    MSE and fitted coefficients. Prints the fold-averaged coefficient vector
    (tagged with ``model.alpha``) and returns the average test MSE.
    """
    mse_per_fold = []
    coef_per_fold = []
    for foldK in range(total_folds):
        train_df, test_df = generate_nfolds(dataframe, foldK, total_folds)
        fold_mse, fold_coef = train_pred_MSE_model(train_df, test_df, model)
        mse_per_fold.append(fold_mse)
        coef_per_fold.append(fold_coef)
    # Element-wise average of the per-fold coefficient vectors.
    avg_coef = np.sum(np.array(coef_per_fold), axis=0) / total_folds
    print("alpha="+str(model.alpha), avg_coef)
    return sum(mse_per_fold) / total_folds
def runCV_SK(input_dataframe, total_folds):
    """Select the Lasso alpha via sklearn's built-in cross-validation
    (``LassoCV``) and plot the per-fold MSE path against -log10(alpha).

    input_dataframe: features in all but the last column, target in the last.
    Shows a matplotlib figure; returns None.

    Bug fix: the original referenced the undefined name ``train_dataframe``
    when slicing X and y — it must use the ``input_dataframe`` parameter.
    """
    n_cols = input_dataframe.shape[1]
    X = input_dataframe.iloc[:, 0:n_cols - 1]
    y = input_dataframe.iloc[:, n_cols - 1]
    t1 = time.time()
    model = linear_model.LassoCV(cv=total_folds).fit(X, y)
    t_lasso_cv = time.time() - t1
    # Display results
    m_log_alphas = -np.log10(model.alphas_)
    plt.figure()
    plt.plot(m_log_alphas, model.mse_path_, ':')
    plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha: CV estimate')
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean square error')
    plt.title('Mean square error on each fold: coordinate descent ' '(train time: %.2fs)' % t_lasso_cv)
    plt.axis('tight')
    plt.show()
def show_regularize():
    """Demonstrate the effect of regularization using polynomial regression."""
    # Dense grid of x values used only for drawing the fitted curves.
    x_plot = np.linspace(-1, 5, num=30)[:,np.newaxis]
    n_samples = [5,10]
    # Generate 2 groups of points.
    # The regression target is to learn a curve passing through both groups.
    # Polynomial regression transforms the raw feature x into [x, x^2, x^3, ...]
    # and fits the linear model y ~ x + x^2 + x^3 + ...
    # Mapping x to [x, x^2, x^3, ...] amounts to a nonlinear transformation.
    #
    # The goal of regularization is to pick a model with small variation
    # (by constraining the parameters): while still fitting the data, the
    # predictions should not oscillate too sensitively (a small change in x
    # should not cause a large change in y).
    #
    # Group 1: 5 points near x in [0, 1] with y = 1.
    X1 = np.random.uniform(0, 1, size=n_samples[0])[:, np.newaxis]
    y1 = np.empty(n_samples[0])
    y1.fill(1)
    # Group 2: 10 points near x in [3, 4] with y = 3.
    X2 = np.random.uniform(3, 4, size=n_samples[1])[:, np.newaxis]
    y2 = np.empty(n_samples[1])
    y2.fill(3)
    X=np.concatenate((X1,X2),axis=0)
    y=np.concatenate((y1,y2),axis=0)
    # One subplot per polynomial degree in a 2x3 grid.
    row_plt_num=2
    col_plt_num=3
    figure, ax_list = plt.subplots(row_plt_num, col_plt_num, sharex=True)
    #print(X.shape,y.shape)
    # Scatter the sample points on every subplot.
    for row in range(row_plt_num):
        for col in range(col_plt_num):
            ax_list[row,col].scatter(X, y, s=20)
    degree_list=[0,1,3,5,8,10]
    for idx in range(len(degree_list)):
        degree=degree_list[idx]
        # Fit the data with an UNregularized polynomial regression model.
        model = make_pipeline(PolynomialFeatures(degree), linear_model.LinearRegression())
        model.fit(X, y)
        y_plot = model.predict(x_plot)
        # Map the flat degree index onto the 2x3 subplot grid.
        row_idx=idx//col_plt_num
        col_idx=idx%col_plt_num
        ax_list[row_idx,col_idx].plot(x_plot, y_plot)
        alpha=100
        # Fit the data with a REGULARIZED (Ridge) polynomial regression model
        # of the same degree, for visual comparison on the same axes.
        modelR = make_pipeline(PolynomialFeatures(degree), linear_model.Ridge(alpha))
        modelR.fit(X, y)
        y_plot = modelR.predict(x_plot)
        ax_list[row_idx,col_idx].plot(x_plot, y_plot, label="d=%d, Regularized" % degree)
        ax_list[row_idx,col_idx].set_title("d=%d" % degree, fontsize=20)
    plt.show()
def demo_step_forward(global_X_index,train_dataframe,test_dataframe):
### 正向逐步回归
### 顺序进行特征选择
# 计划添加多少个属性进去,该值人工确定
step=10
# 剩余可加特征
feature_idx_set=set(global_X_index)
step_feature_idx_list=[] # 已加入特征
step_err_list=[]
# 最多特征数
nstep=min(step,len(feature_idx_set))
for k in range(nstep):
## 找到使 MSE 下降最大的 特征
min_err=None
besti=None
for feature_idx in feature_idx_set:
feature_list=[idx for idx in step_feature_idx_list]
feature_list.append(feature_idx)
X_train=train_dataframe.iloc[:,feature_list]
y_train=np.concatenate(train_dataframe.iloc[:,global_y_index].values)
X_test=test_dataframe.iloc[:,feature_list]
y_test=np.concatenate(test_dataframe.iloc[:,global_y_index].values)
model=linear_model.LinearRegression()
model.fit(X_train, y_train)
err=MSE(model.predict(
智能系统开发课程设计--红酒品质预测
版权申诉
5星 · 超过95%的资源 185 浏览量
2022-07-02
13:16:02
上传
评论 3
收藏 839KB RAR 举报
爱吃一口松饼的柠檬茶
- 粉丝: 1
- 资源: 17
评论2