import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error

# FiveThirtyEight styling for all plots below.
plt.style.use('fivethirtyeight')
def create_features(df, label=None):
    """
    Create time series features from the datetime index of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a DatetimeIndex; calendar columns are added in place.
    label : str, optional
        Name of the target column to return alongside the features.

    Returns
    -------
    X : pandas.DataFrame of calendar features, or
    (X, y) : when *label* is given, y is ``df[label]``.
    """
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    # Series.dt.weekofyear was removed in pandas 1.1+;
    # isocalendar().week is the supported replacement (UInt32 -> int for parity).
    df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)

    X = df[['hour', 'dayofweek', 'quarter', 'month', 'year',
            'dayofyear', 'dayofmonth', 'weekofyear']]
    if label:
        y = df[label]
        return X, y
    return X
# Train a gradient-boosted tree regressor on the calendar features.
# n_estimators=1000 is an upper bound: training stops early once the
# eval-set error has not improved for 50 consecutive rounds.
# NOTE(review): X_train/y_train/X_test/y_test are defined elsewhere in
# this notebook; fit(early_stopping_rounds=...) is the pre-2.0 xgboost
# API — newer releases moved early stopping to the constructor/callbacks.
reg = xgb.XGBRegressor(n_estimators=1000)
reg.fit(X_train, y_train,
eval_set=[(X_train, y_train),(X_test, y_test)],
early_stopping_rounds=50,
verbose=False)# Change verbose to True if you want to see it train
D:\Anaconda3\lib\site-packages\xgboost\core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version
if getattr(data, 'base', None) is not None and \
D:\Anaconda3\lib\site-packages\xgboost\core.py:588: FutureWarning: Series.base is deprecated and will be removed in a future version
data.base is not None and isinstance(data, np.ndarray) \
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=1, gamma=0, importance_type='gain',
learning_rate=0.1, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
subsample=1)
Feature Importance
Feature importance is a great way to get a general idea about which features the model is relying on most to make the prediction. This is a metric that simply sums up how many times each feature is split on.
We can see that the day of year was most commonly used to split trees, while hour and year came in next. Quarter has low importance due to the fact that it could be created by different dayofyear splits.
I like using mean absolute percent error because it gives an easy-to-interpret percentage showing how far off the predictions are.
MAPE isn’t included in sklearn so we need to use a custom function.
def mean_absolute_percentage_error(y_true, y_pred):
    """
    Calculate MAPE (in percent) given *y_true* and *y_pred*.

    Inputs are converted to numpy arrays, so any array-like works.
    NOTE: undefined when any element of y_true is zero (division by zero).
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Compare the model's forecast against the actual load for January 2015.
fig, axis = plt.subplots(1, figsize=(15, 5))
_ = pjme_all[['MW_Prediction','PJME_MW']].plot(ax=axis, style=['-','.'])
axis.set_xbound(lower='01-01-2015', upper='02-01-2015')
axis.set_ylim(0, 60000)
plot = plt.suptitle('January 2015 Forecast vs Actuals')
# Zoom in on the first week of January: forecast vs. actuals.
fig, axis = plt.subplots(1, figsize=(15, 5))
_ = pjme_all[['MW_Prediction','PJME_MW']].plot(ax=axis, style=['-','.'])
axis.set_xbound(lower='01-01-2015', upper='01-08-2015')
axis.set_ylim(0, 60000)
plot = plt.suptitle('First Week of January Forecast vs Actuals')
# Same view for the first week of July (summer demand pattern).
fig, axis = plt.subplots(1, figsize=(15, 5))
_ = pjme_all[['MW_Prediction','PJME_MW']].plot(ax=axis, style=['-','.'])
axis.set_ylim(0, 60000)
axis.set_xbound(lower='07-01-2015', upper='07-08-2015')
plot = plt.suptitle('First Week of July Forecast vs Actuals')