example with xgboost¶
Links: notebook, html, PDF, python, slides, GitHub
Test XGBoost after it was compiled: train models, pickle them, and unpickle them.
# Notebook helper from jyquickhelper; presumably inserts a navigation menu
# into the notebook -- verify against the jyquickhelper documentation.
from jyquickhelper import add_notebook_menu
add_notebook_menu()
This is an example taken from the XGBoost website.
import pickle
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris, load_digits, load_diabetes
import xgboost as xgb
Zeros and Ones from the Digits dataset: binary classification¶
# Two-fold cross-validation of a default XGBClassifier on the digits 0 and 1.
# The seeded RandomState is reused by the later sections' KFold splitters.
rng = np.random.RandomState(31337)
# n_class is passed by keyword: recent scikit-learn releases made it
# keyword-only, so the positional form load_digits(2) raises a TypeError.
digits = load_digits(n_class=2)
y = digits['target']
X = digits['data']
conf = []
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
    # Fit on one fold, then score the held-out fold with a confusion matrix.
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    conf.append(confusion_matrix(actuals, predictions))
# Display the per-fold confusion matrices as the cell result.
conf
[array([[87, 0],
[ 1, 92]], dtype=int64), array([[91, 0],
[ 3, 86]], dtype=int64)]
Iris: multiclass classification¶
# Two-fold cross-validation of a default XGBClassifier on the iris data,
# collecting one confusion matrix per held-out fold.
iris = load_iris()
X, y = iris['data'], iris['target']
conf = []
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
    # Build the classifier first, then train it on the current training fold.
    xgb_model = xgb.XGBClassifier()
    xgb_model = xgb_model.fit(X[train_index], y[train_index])
    actuals = y[test_index]
    predictions = xgb_model.predict(X[test_index])
    conf.append(confusion_matrix(actuals, predictions))
# Display the per-fold confusion matrices as the cell result.
conf
[array([[19, 0, 0],
[ 0, 31, 3],
[ 0, 1, 21]], dtype=int64), array([[31, 0, 0],
[ 0, 16, 0],
[ 0, 3, 25]], dtype=int64)]
Diabetes: regression¶
# Two-fold cross-validation of a default XGBRegressor on the diabetes data,
# recording the mean squared error of each held-out fold.
data = load_diabetes()
X, y = data['data'], data['target']
err = []
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
    # Build the regressor first, then train it on the current training fold.
    xgb_model = xgb.XGBRegressor()
    xgb_model = xgb_model.fit(X[train_index], y[train_index])
    actuals = y[test_index]
    predictions = xgb_model.predict(X[test_index])
    err.append(mean_squared_error(actuals, predictions))
# Display the per-fold errors as the cell result.
err
[9.860776812557337, 15.942418468446029]
Parameter optimization¶
import joblib  # to check you can parallelize GridSearchCV

# Grid-search max_depth and n_estimators for an XGBRegressor with 5-fold CV.
# `data` is the dataset loaded with load_diabetes() in the regression section;
# this cell previously read from an undefined `boston` variable, which raised
# a NameError.
y = data['target']
X = data['data']
xgb_model = xgb.XGBRegressor()
param_grid = {
    'max_depth': [2, 4, 6],
    'n_estimators': [50, 100, 200],
}
clf = GridSearchCV(xgb_model, param_grid,
                   verbose=1, n_jobs=1, pre_dispatch=1, cv=5)
clf.fit(X, y)
# Display the best cross-validated score and the parameters that achieved it.
clf.best_score_, clf.best_params_
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 45 out of 45 | elapsed: 2.3s finished
c:\python370_x64\lib\site-packages\sklearn\model_selection\_search.py:841: DeprecationWarning: The default of the iid parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
(0.6699572097100618, {'max_depth': 2, 'n_estimators': 100})
Pickling sklearn API models¶
# The sklearn API models are picklable.
# The file must be opened in binary mode to pickle. The `with` blocks close
# each handle deterministically; the bare open() calls used before left the
# files open until garbage collection.
with open("best_boston.pkl", "wb") as f:
    pickle.dump(clf, f)
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# you trust (this one was written by the lines just above).
with open("best_boston.pkl", "rb") as f:
    clf2 = pickle.load(f)
# The restored search object must predict exactly like the original one.
np.allclose(clf.predict(X), clf2.predict(X))
True
Early stopping¶
# Train an XGBClassifier on the digits data loaded earlier, using a held-out
# split as the validation set that drives early stopping on AUC.
X, y = digits['data'], digits['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
validation_sets = [(X_test, y_test)]
clf = xgb.XGBClassifier()
# NOTE(review): newer XGBoost releases appear to expect early_stopping_rounds
# and eval_metric on the constructor rather than on fit() -- confirm against
# the installed version before upgrading.
clf.fit(
    X_train,
    y_train,
    early_stopping_rounds=10,
    eval_metric="auc",
    eval_set=validation_sets,
)
[0] validation_0-auc:0.999497
Will train until validation_0-auc hasn't improved in 10 rounds.
[1] validation_0-auc:0.999497
[2] validation_0-auc:0.999497
[3] validation_0-auc:0.999749
[4] validation_0-auc:0.999749
[5] validation_0-auc:0.999749
[6] validation_0-auc:0.999749
[7] validation_0-auc:0.999749
[8] validation_0-auc:0.999749
[9] validation_0-auc:0.999749
[10] validation_0-auc:1
[11] validation_0-auc:1
[12] validation_0-auc:1
[13] validation_0-auc:1
[14] validation_0-auc:1
[15] validation_0-auc:1
[16] validation_0-auc:1
[17] validation_0-auc:1
[18] validation_0-auc:1
[19] validation_0-auc:1
[20] validation_0-auc:1
Stopping. Best iteration:
[10] validation_0-auc:1
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=True, subsample=1)