import pandas as pd
import numpy as np
from pandas import DataFrame
from pylab import *
import matplotlib.pyplot as plt
import sys
import random
import time
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
## normalize
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
def gaussian_norm(dataframe):
    """Gaussian (z-score) normalize the feature columns: zero mean, unit variance.

    All columns except the last (the target) are normalized using the
    sample mean and standard deviation from ``DataFrame.describe()``.
    Returns a normalized copy; the input DataFrame is NOT modified.
    (The original aliased the input via ``norm_dataframe = dataframe`` and
    mutated the caller's data — fixed with an explicit copy.)
    """
    norm_dataframe = dataframe.copy()
    summary = dataframe.describe()
    print(dataframe.shape)
    for i in range(dataframe.shape[1] - 1):
        # describe() rows: 0=count, 1=mean, 2=std (sample std, ddof=1)
        mean = summary.iloc[:, i].values[1]
        sd = summary.iloc[:, i].values[2]
        norm_dataframe.iloc[:, i] = (dataframe.iloc[:, i].values - mean) / sd
    return norm_dataframe
def generate_train_test(dataframe, train_pct):
    """Randomly split the rows of ``dataframe`` into train and test sets.

    train_pct: fraction of rows (0..1) sampled without replacement for training.
    Returns ``(train_df, test_df)`` as new DataFrames re-indexed from 0.

    Uses positional ``.iloc`` indexing (the original used ``.loc`` with row
    positions, which only works when the index happens to be 0..n-1).
    """
    row_positions = list(range(dataframe.shape[0]))
    train_positions = random.sample(row_positions, int(train_pct * len(row_positions)))
    test_positions = list(set(row_positions) - set(train_positions))
    train_array = dataframe.iloc[train_positions, :].values
    test_array = dataframe.iloc[test_positions, :].values
    return (pd.DataFrame(train_array), pd.DataFrame(test_array))
def MSE(y_pred, y):
    """Return the mean squared error between predictions ``y_pred`` and truth ``y``.

    Vectorized with NumPy (the original summed a Python list comprehension);
    accepts any array-likes of equal length.
    """
    diff = np.asarray(y_pred) - np.asarray(y)
    return float(np.mean(diff * diff))
def train_pred_MSE(train_dataframe, test_dataframe, alpha_val):
    """Train a Ridge regression with regularization strength ``alpha_val`` on
    the training set and return the mean squared error on the test set.

    The last column of each DataFrame is assumed to be the target; all
    preceding columns are features.
    """
    target_col = train_dataframe.shape[1] - 1
    features_train = train_dataframe.iloc[:, :target_col]
    target_train = train_dataframe.iloc[:, target_col]
    target_col_test = test_dataframe.shape[1] - 1
    features_test = test_dataframe.iloc[:, :target_col_test]
    target_test = test_dataframe.iloc[:, target_col_test]
    ridge = linear_model.Ridge(alpha=alpha_val)
    ridge.fit(features_train, target_train)
    predictions = ridge.predict(features_test)
    return MSE(predictions, target_test)
def train_pred_MSE_model(train_dataframe, test_dataframe, model):
    """Fit ``model`` (any sklearn-style regressor) on the training set and
    evaluate on the test set.

    The last column of each DataFrame is assumed to be the target.
    Returns a ``(test_mse, fitted_coefficients)`` tuple.
    """
    target_col = train_dataframe.shape[1] - 1
    features_train = train_dataframe.iloc[:, :target_col]
    target_train = train_dataframe.iloc[:, target_col]
    target_col_test = test_dataframe.shape[1] - 1
    features_test = test_dataframe.iloc[:, :target_col_test]
    target_test = test_dataframe.iloc[:, target_col_test]
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)
    return (MSE(predictions, target_test), model.coef_)
def generate_nfolds(dataframe, foldK, total_folds):
    """Return ``(train_df, test_df)`` for fold ``foldK`` of a
    ``total_folds``-fold cross-validation split.

    The test fold is a contiguous block of ``n // total_folds`` rows starting
    at ``foldK * fold_size``; all remaining rows form the training set.
    NOTE: remainder rows (``n % total_folds``) never land in a test fold.

    Uses positional ``.iloc`` indexing (the original ``.loc`` required a
    default RangeIndex) and sorts the training positions explicitly
    (``list(set(...))`` order is not guaranteed).
    """
    fold_size = dataframe.shape[0] // total_folds
    start = fold_size * foldK
    test_positions = list(range(start, start + fold_size))
    train_positions = sorted(set(range(dataframe.shape[0])) - set(test_positions))
    train_array = dataframe.iloc[train_positions, :].values
    test_array = dataframe.iloc[test_positions, :].values
    return (pd.DataFrame(train_array), pd.DataFrame(test_array))
def run_CV(dataframe, model, total_folds):
    """Run ``total_folds``-fold cross-validation of ``model`` on ``dataframe``.

    For each fold, trains via ``train_pred_MSE_model`` and collects the test
    MSE and fitted coefficients. Prints the fold-averaged coefficient vector
    (tagged with ``model.alpha``) and returns the average test MSE.
    """
    mse_per_fold = []
    coef_per_fold = []
    for foldK in range(total_folds):
        train_df, test_df = generate_nfolds(dataframe, foldK, total_folds)
        fold_mse, fold_coef = train_pred_MSE_model(train_df, test_df, model)
        mse_per_fold.append(fold_mse)
        coef_per_fold.append(fold_coef)
    # Element-wise average of the per-fold coefficient vectors.
    avg_coef = np.sum(np.array(coef_per_fold), axis=0) / total_folds
    print("alpha="+str(model.alpha), avg_coef)
    return sum(mse_per_fold) / total_folds
def runCV_SK(input_dataframe, total_folds):
    """Select the Lasso alpha via sklearn's built-in cross-validation
    (``LassoCV``) and plot the per-fold MSE path against -log10(alpha).

    input_dataframe: features in all but the last column, target in the last.
    Shows a matplotlib figure; returns None.

    Bug fix: the original referenced the undefined name ``train_dataframe``
    when slicing X and y — it must use the ``input_dataframe`` parameter.
    """
    n_cols = input_dataframe.shape[1]
    X = input_dataframe.iloc[:, 0:n_cols - 1]
    y = input_dataframe.iloc[:, n_cols - 1]
    t1 = time.time()
    model = linear_model.LassoCV(cv=total_folds).fit(X, y)
    t_lasso_cv = time.time() - t1
    # Display results
    m_log_alphas = -np.log10(model.alphas_)
    plt.figure()
    plt.plot(m_log_alphas, model.mse_path_, ':')
    plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha: CV estimate')
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean square error')
    plt.title('Mean square error on each fold: coordinate descent ' '(train time: %.2fs)' % t_lasso_cv)
    plt.axis('tight')
    plt.show()
def show_regularize():
    """Demonstrate the effect of regularization using polynomial regression."""
    # Dense grid of x values used only for drawing the fitted curves.
    x_plot = np.linspace(-1, 5, num=30)[:,np.newaxis]
    n_samples = [5,10]
    # Generate 2 groups of points.
    # The regression target is to learn a curve passing through both groups.
    # Polynomial regression transforms the raw feature x into [x, x^2, x^3, ...]
    # and fits the linear model y ~ x + x^2 + x^3 + ...
    # Mapping x to [x, x^2, x^3, ...] amounts to a nonlinear transformation.
    #
    # The goal of regularization is to pick a model with small variation
    # (by constraining the parameters): while still fitting the data, the
    # predictions should not oscillate too sensitively (a small change in x
    # should not cause a large change in y).
    #
    # Group 1: 5 points near x in [0, 1] with y = 1.
    X1 = np.random.uniform(0, 1, size=n_samples[0])[:, np.newaxis]
    y1 = np.empty(n_samples[0])
    y1.fill(1)
    # Group 2: 10 points near x in [3, 4] with y = 3.
    X2 = np.random.uniform(3, 4, size=n_samples[1])[:, np.newaxis]
    y2 = np.empty(n_samples[1])
    y2.fill(3)
    X=np.concatenate((X1,X2),axis=0)
    y=np.concatenate((y1,y2),axis=0)
    # One subplot per polynomial degree in a 2x3 grid.
    row_plt_num=2
    col_plt_num=3
    figure, ax_list = plt.subplots(row_plt_num, col_plt_num, sharex=True)
    #print(X.shape,y.shape)
    # Scatter the sample points on every subplot.
    for row in range(row_plt_num):
        for col in range(col_plt_num):
            ax_list[row,col].scatter(X, y, s=20)
    degree_list=[0,1,3,5,8,10]
    for idx in range(len(degree_list)):
        degree=degree_list[idx]
        # Fit the data with an UNregularized polynomial regression model.
        model = make_pipeline(PolynomialFeatures(degree), linear_model.LinearRegression())
        model.fit(X, y)
        y_plot = model.predict(x_plot)
        # Map the flat degree index onto the 2x3 subplot grid.
        row_idx=idx//col_plt_num
        col_idx=idx%col_plt_num
        ax_list[row_idx,col_idx].plot(x_plot, y_plot)
        alpha=100
        # Fit the data with a REGULARIZED (Ridge) polynomial regression model
        # of the same degree, for visual comparison on the same axes.
        modelR = make_pipeline(PolynomialFeatures(degree), linear_model.Ridge(alpha))
        modelR.fit(X, y)
        y_plot = modelR.predict(x_plot)
        ax_list[row_idx,col_idx].plot(x_plot, y_plot, label="d=%d, Regularized" % degree)
        ax_list[row_idx,col_idx].set_title("d=%d" % degree, fontsize=20)
    plt.show()
def demo_step_forward(global_X_index,train_dataframe,test_dataframe):
### 正向逐步回归
### 顺序进行特征选择
# 计划添加多少个属性进去,该值人工确定
step=10
# 剩余可加特征
feature_idx_set=set(global_X_index)
step_feature_idx_list=[] # 已加入特征
step_err_list=[]
# 最多特征数
nstep=min(step,len(feature_idx_set))
for k in range(nstep):
## 找到使 MSE 下降最大的 特征
min_err=None
besti=None
for feature_idx in feature_idx_set:
feature_list=[idx for idx in step_feature_idx_list]
feature_list.append(feature_idx)
X_train=train_dataframe.iloc[:,feature_list]
y_train=np.concatenate(train_dataframe.iloc[:,global_y_index].values)
X_test=test_dataframe.iloc[:,feature_list]
y_test=np.concatenate(test_dataframe.iloc[:,global_y_index].values)
model=linear_model.LinearRegression()
model.fit(X_train, y_train)
err=MSE(model.predict(
智能系统开发课程设计--红酒品质预测
版权申诉
5星 · 超过95%的资源 185 浏览量
2022-07-02
13:16:02
上传
评论 3
收藏 839KB RAR 举报
爱吃一口松饼的柠檬茶
- 粉丝: 1
- 资源: 17
评论2