# -*- coding: utf-8 -*-
import pandas as pd
import datetime
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer
def dataProcessing(data_dir=r'E:\pycharm files\sales prediction\data'):
    """Build the one-hot encoded training table and save it as traindataNew.csv.

    Reads ``train.csv`` and ``features.csv`` from ``data_dir``, left-joins
    them on (Date, Store, IsHoliday), converts IsHoliday to 0/1, replaces
    Date with a week offset (modulo 52) measured from the first row's date,
    min-max scales Temperature and Fuel_Price, keeps the first seven columns
    and appends one indicator column per Store value and per Dept value.

    Args:
        data_dir: Directory that holds the input CSV files and receives the
            output file. Defaults to the location the script was written for.

    Returns:
        The assembled DataFrame, i.e. the same table that is written to
        ``traindataNew.csv``. (``DataFrame.to_csv`` returns None when given
        a path, so its result is not what gets returned here.)
    """
    import os  # local import: the module header does not import os

    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    feature = pd.read_csv(os.path.join(data_dir, 'features.csv'))

    # Unify the Date format: features.csv uses '/', train.csv uses '-'.
    feature['Date'] = [datetime.datetime.strptime(x, '%Y/%m/%d').date()
                       for x in feature['Date']]
    train['Date'] = [datetime.datetime.strptime(y, '%Y-%m-%d').date()
                     for y in train['Date']]

    # Combine train and feature into a single table.
    merged = pd.merge(train, feature, how='left',
                      on=['Date', 'Store', 'IsHoliday'])

    # Turn the IsHoliday booleans into 0/1.
    merged['IsHoliday'] = [conVertbool(x) for x in merged['IsHoliday']]

    # Turn dates into numbers a regressor can consume: weeks since the first
    # row, modulo 52. The reference date is looked up once, not once per row.
    first_date = merged['Date'][0]
    merged['Date'] = [float((d - first_date).days / 7 % 52)
                      for d in merged['Date']]

    # Min-max normalization. CPI and Unemployment are not scaled because the
    # column selection below discards them anyway.
    merged['Temperature'] = regularization(list(merged['Temperature']))
    merged['Fuel_Price'] = regularization(list(merged['Fuel_Price']))

    # Keep only the first seven columns. Per the original note this drops the
    # five MarkDown columns plus CPI and Unemployment, which have missing
    # values in the test set.
    # NOTE(review): relies on the column order of the input CSVs - confirm.
    merged = merged.iloc[:, list(range(7))]

    # One indicator column per store and per department.
    store_dummies = pd.get_dummies(merged.Store, prefix='Store')
    dept_dummies = pd.get_dummies(merged.Dept, prefix='Dept')
    dataNew = pd.concat([merged, store_dummies, dept_dummies], axis=1)

    # The index is written as well; downstream positional column selection
    # in this script counts that extra leading column.
    dataNew.to_csv(os.path.join(data_dir, 'traindataNew.csv'))
    return dataNew
# Convert a boolean flag to 0/1.
def conVertbool(x):
    """Return 1 when ``x`` is truthy and 0 otherwise."""
    return 1 if x else 0
# Min-max normalization.
def regularization(List):
    """Scale the values of ``List`` linearly so they span [0, 1].

    NaN entries are ignored when determining the minimum and maximum and are
    passed through unchanged; the builtin ``max``/``min`` give
    order-dependent answers when NaN is present, which could turn every
    output into NaN.

    Args:
        List: Iterable of numbers (may contain NaN).

    Returns:
        A new list of the same length. An empty input yields ``[]``. When all
        non-NaN values are equal there is no range to scale by, so those
        values map to 0.0 instead of dividing by zero.
    """
    values = list(List)
    # NaN is the only value that is not equal to itself.
    known = [v for v in values if v == v]
    if not known:
        # Empty input, or nothing but NaN: there is nothing to scale.
        return values
    low = min(known)
    span = max(known) - low
    if span == 0:
        return [0.0 if v == v else v for v in values]
    return [(v - low) / span for v in values]
# Train the model.
def train_modle():
    """Fit a random forest on an 80/20 split of traindataNew.csv and plot it.

    The table is split with a fixed random seed, a 25-tree
    RandomForestRegressor is fitted on the training part, its ``score`` on
    the held-out part is printed, and predicted versus actual Weekly_Sales
    of the held-out rows are plotted.

    Returns:
        The fitted RandomForestRegressor.
    """
    traindataNew = pd.read_csv(r'E:\pycharm files\sales prediction\data\traindataNew.csv')

    # Randomly pick the training rows and the held-out rows.
    train_sample, test_sample = train_test_split(traindataNew, test_size=0.2, random_state=42)

    # Feature columns by position: column 3 plus columns 5 up to (not
    # including) 136. Column 4 is skipped - presumably the Weekly_Sales
    # target; verify against the CSV layout. The upper bound is clipped to
    # the actual column count so a table with fewer indicator columns does
    # not raise IndexError. list(range(...)) keeps the concatenation valid
    # on Python 3, where range is not a list.
    last_col = min(136, traindataNew.shape[1])
    feature_cols = [3] + list(range(5, last_col))

    xtrain = train_sample.iloc[:, feature_cols]
    ytrain = train_sample.Weekly_Sales
    # Evaluate on the held-out rows; taking these from train_sample would
    # report a training score instead of a validation score.
    xtest = test_sample.iloc[:, feature_cols]
    ytest = test_sample.Weekly_Sales

    # Random forest.
    wmLinear = ensemble.RandomForestRegressor(n_estimators=25, oob_score=True)
    wmLinear.fit(xtrain, ytrain)
    score = wmLinear.score(xtest, ytest)
    print(score)

    # Red dots: predictions; blue line: the ideal y = x reference.
    ypredict = wmLinear.predict(xtest)
    plt.plot(ytest, ypredict, 'ro')
    plt.plot(ytest, ytest, 'b-')
    plt.show()
    return wmLinear
def testdata_processing(data_dir=r'E:\pycharm files\sales prediction\data'):
    """Build the one-hot encoded test table and save it as testdataNew.csv.

    Reads ``test.csv`` and ``features.csv`` from ``data_dir``, left-joins
    them on (Date, Store, IsHoliday), converts IsHoliday to 0/1, replaces
    Date with a week offset (modulo 52) measured from the first date in
    ``train.csv``, min-max scales Temperature and Fuel_Price, keeps the
    first six columns and appends one indicator column per Store value and
    per Dept value.

    Args:
        data_dir: Directory that holds the input CSV files and receives the
            output file. Defaults to the location the script was written for.

    Returns:
        The assembled DataFrame, i.e. the same table that is written to
        ``testdataNew.csv``. (``DataFrame.to_csv`` returns None when given
        a path, so its result is not what gets returned here.)
    """
    import os  # local import: the module header does not import os

    testdata = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    feature = pd.read_csv(os.path.join(data_dir, 'features.csv'))

    # Only the first training date is used (as the week-offset origin), so
    # read just that one cell instead of parsing every training row.
    first_train_row = pd.read_csv(os.path.join(data_dir, 'train.csv'),
                                  usecols=['Date'], nrows=1)
    origin = datetime.datetime.strptime(first_train_row['Date'][0],
                                        '%Y-%m-%d').date()

    # Unify the Date format: features.csv uses '/', test.csv uses '-'.
    feature['Date'] = [datetime.datetime.strptime(x, '%Y/%m/%d').date()
                       for x in feature['Date']]
    testdata['Date'] = [datetime.datetime.strptime(y, '%Y-%m-%d').date()
                        for y in testdata['Date']]

    merged = pd.merge(testdata, feature, how='left',
                      on=['Date', 'Store', 'IsHoliday'])
    merged['IsHoliday'] = [conVertbool(x) for x in merged['IsHoliday']]

    # Weeks since the first training date, modulo 52.
    merged['Date'] = [float((d - origin).days / 7 % 52)
                      for d in merged['Date']]

    # Min-max normalization. CPI and Unemployment are not scaled because the
    # column selection below discards them anyway.
    # NOTE(review): the scaling uses the test set's own min/max, which need
    # not match the range used for the training table - confirm this is
    # intended.
    merged['Temperature'] = regularization(list(merged['Temperature']))
    merged['Fuel_Price'] = regularization(list(merged['Fuel_Price']))

    # Keep only the first six columns.
    # NOTE(review): relies on the column order of the input CSVs - confirm.
    merged = merged.iloc[:, list(range(6))]

    # One indicator column per store and per department.
    # NOTE(review): the columns produced depend on the Store/Dept values
    # present in test.csv; verify they line up with the training table.
    store_dummies = pd.get_dummies(merged.Store, prefix='Store')
    dept_dummies = pd.get_dummies(merged.Dept, prefix='Dept')
    dataNew = pd.concat([merged, store_dummies, dept_dummies], axis=1)

    # The index is written as well; downstream positional column selection
    # in this script counts that extra leading column.
    dataNew.to_csv(os.path.join(data_dir, 'testdataNew.csv'))
    return dataNew
# K-fold cross-validation
# dataProcessing()
# traindataNew = pd.read_csv(r'E:\pycharm files\sales prediction\data\traindataNew.csv')
# xtrain = traindataNew.iloc[:, ([3] + range(5, 136))]
# ytrain = traindataNew.Weekly_Sales
# wmLinear = ensemble.RandomForestRegressor(n_estimators=25)
# score=cross_val_score(wmLinear,xtrain,ytrain,cv=2)
# print score
# print sum(score)/10
# Actual prediction: rebuild both tables, fit on every training row and write
# the predicted Weekly_Sales into a copy of the sample submission.
dataProcessing()
testdata_processing()

# Feature columns are selected by position in the saved CSV files.
# list(range(...)) keeps the selection valid on Python 3, where a range
# object cannot be concatenated to a list.
testdataNew = pd.read_csv(r'E:\pycharm files\sales prediction\data\testdataNew.csv')
xtestt = testdataNew.iloc[:, list(range(3, 133))]
traindataNew = pd.read_csv(r'E:\pycharm files\sales prediction\data\traindataNew.csv')
xtrain = traindataNew.iloc[:, [3] + list(range(5, 134))]
ytrain = traindataNew.Weekly_Sales

wmLinear = ensemble.RandomForestRegressor(n_estimators=150, oob_score=True, n_jobs=-1)
wmLinear.fit(xtrain, ytrain)
# Out-of-bag score of the fitted forest.
print(wmLinear.oob_score_)

salespredict = wmLinear.predict(xtestt)
result = pd.read_csv(r'E:\pycharm files\sales prediction\data\sampleSubmission.csv')
result['Weekly_Sales'] = list(salespredict)
# NOTE(review): the DataFrame index is written as an extra leading column -
# confirm the submission format accepts that.
result.to_csv(r'E:\pycharm files\sales prediction\data\result.csv')
# # Out-of-bag score
# dataProcessing()
# traindataNew = pd.read_csv(r'E:\pycharm files\sales prediction\data\traindataNew.csv')
# xtrain = traindataNew.iloc[:, ([3] + range(5, 136))]
# ytrain = traindataNew.Weekly_Sales
# wmLinear = ensemble.RandomForestRegressor(n_estimators=100, oob_score=True)
# wmLinear.fit(xtrain, ytrain)
# print wmLinear.oob_score_
# # Plotting
# train = pd.read_csv(r'E:\pycharm files\sales prediction\data\train.csv')
# train['Date'] = [datetime.datetime.strptime(y, '%Y-%m-%d').date() for y in train['Date']]
# plt.plot(train.iloc[range(0,145),[2]],train.iloc[range(0,145),[3]],'b-')
# plt.show()
# plt.plot(train.iloc[range(0,52),[2]],train.iloc[range(0,52),[3]],'b-')
# plt.show()
# plt.plot(train.iloc[range(52,104),[2]],train.iloc[range(52,104),[3]],'b-')
# plt.show()
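# ---------------------------------------------------------------------------
# Residue from the download page this file was extracted from. It is not part
# of the script; it is kept verbatim but commented out so the file can parse.
# ---------------------------------------------------------------------------
# data processing.zip_electricityzoo_gentle61j_kaggle_sales-predic
# 版权申诉
# 129 浏览量
# 2022-09-21
# 00:25:37
# 上传
# 评论 1
# 收藏 2KB ZIP 举报
# 寒泊
# - 粉丝: 75
# - 资源: 1万+
# 最新资源
# - 基于JavaScript开发的智慧养老微信小程序源码(毕设项目).zip
# - kmp算法-基于C语言kmp算法实现的字符串匹配.zip
# - kmp算法-使用kmp算法在Golang中实现字符串匹配.zip
# - LTC2756 +LTC6244+LTC6655 设计18位乘法串行输入电流输出DAC数模转换器模块硬件(原理图+PCB)工程
# - kmp算法-基于C语言实现KMP算法.zip
# - DMS智能座舱项目-Python基于深度学习实现驾驶员分心行为监测系统源码.zip
# - 2023年数控机床与编程知识点总结归纳(精选试题附答案).docx
# - 2023年数控机床与编程考点总结.pdf
# - 2023年数控机床与编程考点题型与解题方法(精选试题附答案).docx
# - 3CHISS统计软件介绍(Master).pdf
# 资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
# 点击此处反馈
# 评论0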