# -*- coding: utf-8 -*-
"""
Created on Thu May 18 13:55:11 2017
@author: dys
来自36大数据
http://www.36dsj.com/archives/43811
"""
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
rcParams['figure.figsize'] = 15, 6 # 修改默认更新图表大小
# 导入数据集,查看数据类型
data = pd.read_csv('AirPassengers.csv')
print (data.head())
print ('\n Data Types:')
print (data.dtypes)
# 为了将读取的数据作为时间序列,必须通过特殊的参数读取csv指令
dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m')
parse =['Month']
data = pd.read_csv('AirPassengers.csv', parse_dates=parse, index_col='Month',date_parser=dateparse)
print (data.head())
'''
parse_dates:指定含有时间数据信息的列,正如上面所说,列的名称为“Month”
index_col:使用pandas的时间序列数据背后的关键思想是:目录成为描述时间信息的变量。\
所以该参数告诉pandas使用"Month"的列作为索引。
date_parser:指定将输入的字符串转换为可变的时间数据 。pandas默认的数据读取格式是
'YYYY-MM-DD HH:MM:SS'。如需要读取的数据没有默认的格式,就要人工定义。这\
和dataparse的功能部分相似,这里的定义可以为这一目的服务。
'''
ts = data['#Passengers'] #将列转换为序列对象,这样当我每次使用时间序列的时候,就不需要每次都要提及列名称。
print(ts.head(10))
# 开始分析时间序列
ts['1949']
plt.plot(ts) # ts从1949年1月到1960年12月
print ("稳定性分析")
def test_stationarity(timeseries):
#Determing rolling statistics
rolmean = pd.rolling_mean(timeseries, window=12)
rolstd = pd.rolling_std(timeseries, window=12)
#Plot rolling statistics:
orig = plt.plot(timeseries, color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
#Perform Dickey-Fuller test:
print ('Results of Dickey-Fuller Test:')
dftest = adfuller(timeseries, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print (dfoutput)
test_stationarity(ts)
# Estimating&Eliminating Trend
ts_log = np.log(ts)
plt.plot(ts_log)
# smoothing--moving average
moving_avg = pd.rolling_mean(ts_log,12) #计算前12个的均值,故最前11个为NaN
plt.plot(ts_log)
plt.plot(moving_avg,color='red')
ts_log_moving_avg_diff = ts_log - moving_avg
ts_log_moving_avg_diff.head(12)
#删除NaN
ts_log_moving_avg_diff.dropna(inplace=True)#对于一个 Series,dropna 返回一个仅含非空数据和索引值的 Series
#凡是会对数组作出修改并返回一个新数组的,往往都有一个 replace=False 的可选参数。如果手动设定为 True,那么原数组就可以被替换。
test_stationarity(ts_log_moving_avg_diff)
# exponentially weighted moving average
expwighted_avg = pd.ewma(ts_log,halflife=12)
plt.plot(ts_log)
plt.plot(expwighted_avg,color='red')
ts_log_ewma_diff = ts_log - expwighted_avg
test_stationarity(ts_log_ewma_diff)
#Take first difference:
ts_log_diff = ts_log - ts_log.shift()
plt.plot(ts_log_diff)
ts_log_diff.dropna(inplace=True)
test_stationarity(ts_log_diff)
decomposition = seasonal_decompose(ts_log)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
plt.subplot(411)
plt.plot(ts_log, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()
ts_log_decompose = residual
ts_log_decompose.dropna(inplace=True)
test_stationarity(ts_log_decompose)
#ACF and PACF plots:
lag_acf = acf(ts_log_diff, nlags=20)
lag_pacf = pacf(ts_log_diff, nlags=20, method='ols')
#Plot ACF:
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
#Plot PACF:
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
#MA model:
model = ARIMA(ts_log, order=(2, 1, 0))
results_AR = model.fit(disp=-1)
plt.plot(ts_log_diff)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-ts_log_diff)**2))
model = ARIMA(ts_log, order=(0, 1, 2))
results_MA = model.fit(disp=-1)
plt.plot(ts_log_diff)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_MA.fittedvalues-ts_log_diff)**2))
model = ARIMA(ts_log, order=(2, 1, 2))
results_ARIMA = model.fit(disp=-1)
plt.plot(ts_log_diff)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-ts_log_diff)**2))
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
print predictions_ARIMA_diff.head()
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
print predictions_ARIMA_diff_cumsum.head()
predictions_ARIMA_log = pd.Series(ts_log.ix[0], index=ts_log.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
predictions_ARIMA_log.head()
plt.plot(ts_log)
plt.plot(predictions_ARIMA_log)
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.plot(ts)
plt.plot(predictions_ARIMA)
plt.title('RMSE: %.4f'% np.sqrt(sum((predictions_ARIMA-ts)**2)/len(ts)))