# Get the data: feature matrix and target labels of the digits dataset
from sklearn.datasets import load_digits

digits = load_digits()
X, y = digits.data, digits.target


# Instantiate and train the classifier
from sklearn.neighbors import KNeighborsClassifier
# n_neighbors=1: a prediction is decided by the single nearest training sample.
clf = KNeighborsClassifier(n_neighbors=1)
# Fit on the complete dataset; no samples are held out at this point.
# Left as a bare expression so the notebook displays the fitted estimator.
clf.fit(X, y)

KNeighborsClassifier(n_neighbors=1)

KNeighborsClassifier(n_neighbors=1)


# Check the results using metrics
from sklearn import metrics

# Predictions are made on the very samples the classifier was fitted on, so
# the matrix below measures how well the training data was memorised, not
# how well the model generalises.
y_pred = clf.predict(X)


# confusion_matrix expects (y_true, y_pred): rows are the true labels and
# columns the predicted ones. Passing them the other way round transposes
# the matrix.
print(metrics.confusion_matrix(y, y_pred))

[[178   0   0   0   0   0   0   0   0   0]
 [  0 182   0   0   0   0   0   0   0   0]
 [  0   0 177   0   0   0   0   0   0   0]
 [  0   0   0 183   0   0   0   0   0   0]
 [  0   0   0   0 181   0   0   0   0   0]
 [  0   0   0   0   0 182   0   0   0   0]
 [  0   0   0   0   0   0 181   0   0   0]
 [  0   0   0   0   0   0   0 179   0   0]
 [  0   0   0   0   0   0   0   0 174   0]
 [  0   0   0   0   0   0   0   0   0 180]]


%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np


from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor

# Fit a regression tree with default settings and predict the same samples
# it was trained on.
data = load_diabetes()
expected = data.target
clf = DecisionTreeRegressor()
clf.fit(data.data, expected)
predicted = clf.predict(data.data)

# True against predicted target values; the dashed black diagonal marks
# perfect agreement.
plt.scatter(expected, predicted)
plt.plot([0, 350], [0, 350], '--k')
plt.axis('tight')
plt.xlabel('True Progression')
plt.ylabel('Predicted Progression');


from sklearn.model_selection import train_test_split

# Hold out a quarter of the digits samples for testing; the fixed seed keeps
# the split identical from run to run.
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Shapes of the full, training and test feature matrices.
print("%r, %r, %r" % (X.shape, X_train.shape, X_test.shape))

(1797, 64), (1347, 64), (450, 64)


# Refit on the training portion only, then predict the held-out samples.
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


# Confusion matrix on the held-out samples (true labels first, predictions
# second).
print(metrics.confusion_matrix(y_test, y_pred))

[[37  0  0  0  0  0  0  0  0  0]
 [ 0 43  0  0  0  0  0  0  0  0]
 [ 0  0 43  1  0  0  0  0  0  0]
 [ 0  0  0 45  0  0  0  0  0  0]
 [ 0  0  0  0 38  0  0  0  0  0]
 [ 0  0  0  0  0 47  0  0  0  1]
 [ 0  0  0  0  0  0 52  0  0  0]
 [ 0  0  0  0  0  0  0 48  0  0]
 [ 0  0  0  0  0  0  0  0 48  0]
 [ 0  0  0  1  0  1  0  0  0 45]]


# Per-class precision, recall, F1 and support for the held-out predictions.
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       1.00      1.00      1.00        43
           2       1.00      0.98      0.99        44
           3       0.96      1.00      0.98        45
           4       1.00      1.00      1.00        38
           5       0.98      0.98      0.98        48
           6       1.00      1.00      1.00        52
           7       1.00      1.00      1.00        48
           8       1.00      1.00      1.00        48
           9       0.98      0.96      0.97        47

    accuracy                           0.99       450
   macro avg       0.99      0.99      0.99       450
weighted avg       0.99      0.99      0.99       450


# Macro-averaged F1 on the held-out samples (bare expression: the notebook
# displays the value).
metrics.f1_score(y_test, y_pred, average="macro")

0.9913675218842191


# Same metric computed on the samples the classifier was fitted on, for
# comparison with the held-out score.
metrics.f1_score(y_train, clf.predict(X_train), average="macro")

1.0


from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import warnings  # suppress warnings from older versions of KNeighbors
# NOTE: `message` is a regular expression matched against the start of the
# warning text, not a glob pattern.
warnings.filterwarnings('ignore', message='kneighbors*')

# Same split parameters as the earlier train_test_split call
# (test_size=0.25, random_state=0).
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Baseline: every estimator with its default hyper-parameters, scored by
# macro-averaged F1 on the held-out samples.
# NOTE(review): the recorded run shows a liblinear ConvergenceWarning for
# LinearSVC, so its score comes from a fit that did not converge.
for Model in [LinearSVC, GaussianNB, KNeighborsClassifier]:
    clf = Model().fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(Model.__name__,
          metrics.f1_score(y_test, y_pred, average="macro"))
    
print('------------------')

# test LinearSVC penalties: 'l1' vs 'l2'. The loss is 'squared_hinge' in both
# cases; dual is False for 'l1' and True for 'l2'.
for loss, p, dual in [('squared_hinge', 'l1', False), ('squared_hinge', 'l2', True)]:
    clf = LinearSVC(penalty=p, loss=loss, dual=dual)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("LinearSVC(penalty='{0}', loss='{1}')".format(p, loss),
          metrics.f1_score(y_test, y_pred, average="macro"))
    
print('-------------------')
    
# test K-neighbors for n_neighbors = 1..10
for n_neighbors in range(1, 11):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("KNeighbors(n_neighbors={0})".format(n_neighbors),
          metrics.f1_score(y_test, y_pred, average="macro"))

C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(

LinearSVC 0.9257041879239652
GaussianNB 0.8332741681010101
KNeighborsClassifier 0.9804562804949924
------------------

C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(

LinearSVC(penalty='l1', loss='squared_hinge') 0.9447242283258508
LinearSVC(penalty='l2', loss='squared_hinge') 0.9385749925598466
-------------------
KNeighbors(n_neighbors=1) 0.9913675218842191
KNeighbors(n_neighbors=2) 0.9848442068835102
KNeighbors(n_neighbors=3) 0.9867753449543099
KNeighbors(n_neighbors=4) 0.9803719053818863
KNeighbors(n_neighbors=5) 0.9804562804949924
KNeighbors(n_neighbors=6) 0.9757924194139573
KNeighbors(n_neighbors=7) 0.9780645792142071
KNeighbors(n_neighbors=8) 0.9780645792142071
KNeighbors(n_neighbors=9) 0.9780645792142071
KNeighbors(n_neighbors=10) 0.9755550897728812


from sklearn.model_selection import cross_val_score

# Score a default K-neighbors classifier with cv=5 on the full dataset; the
# bare call makes the notebook display one score per fold.
clf = KNeighborsClassifier()
cross_val_score(clf, X, y, cv=5)

array([0.94722222, 0.95555556, 0.96657382, 0.98050139, 0.9637883 ])


from sklearn.model_selection import ShuffleSplit

# Five independent random train/test splits. The seed is pinned, like the
# train_test_split calls in this file, so the scores are repeatable; without
# it every execution draws different splits and reports different numbers.
cv = ShuffleSplit(n_splits=5, random_state=0)
cross_val_score(clf, X, y, cv=cv)

array([0.98333333, 0.98333333, 0.98888889, 0.98333333, 1.        ])


from sklearn.datasets import load_diabetes

# From here on X and y refer to the diabetes regression data instead of the
# digits data.
data = load_diabetes()
X = data.data
y = data.target
print(X.shape)

(442, 10)


from sklearn.linear_model import Ridge, Lasso

# Mean cross-validated score of each linear model with its default settings.
for Model in (Ridge, Lasso):
    model = Model()
    print(Model.__name__, cross_val_score(model, X, y).mean())

Ridge 0.410174971340889
Lasso 0.3375593674654274


# 30 regularisation strengths, log-spaced between 1e-3 and 1e-1.
alphas = np.logspace(-3, -1, 30)

# One curve per model: the mean score over cv=3 folds at each alpha.
for Model in (Lasso, Ridge):
    scores = [
        cross_val_score(Model(alpha=alpha), X, y, cv=3).mean()
        for alpha in alphas
    ]
    plt.plot(alphas, scores, label=Model.__name__)
plt.legend(loc='lower left');


from sklearn.model_selection import GridSearchCV


# Search the same alpha grid with cv=3 and report the best value per model.
for Model in (Ridge, Lasso):
    gscv = GridSearchCV(Model(), param_grid={'alpha': alphas}, cv=3)
    gscv.fit(X, y)
    print(Model.__name__, gscv.best_params_)

Ridge {'alpha': 0.06210169418915616}
Lasso {'alpha': 0.01268961003167922}


from sklearn.linear_model import RidgeCV, LassoCV

# The *CV estimators pick alpha from the grid themselves while fitting; the
# selected value is exposed as `alpha_`.
for Model in (RidgeCV, LassoCV):
    model = Model(alphas=alphas, cv=3)
    model.fit(X, y)
    print(Model.__name__, model.alpha_)

RidgeCV 0.06210169418915616
LassoCV 0.01268961003167922


# Nested cross-validation: the outer cv=3 loop scores an estimator that
# itself selects alpha with an inner cv=3 search over `alphas`.
for Model in (RidgeCV, LassoCV):
    scores = cross_val_score(Model(alphas=alphas, cv=3), X, y, cv=3)
    print(Model.__name__, scores.mean())

RidgeCV 0.48916033973224776
LassoCV 0.4854908670556423

2A.ML101.5: Measuring prediction performance

Using the K-neighbors classifier

A Better Approach: Using a validation set

Application: Model Selection via Validation

Cross-validation

Hyperparameter optimization with cross-validation

Basic Hyperparameter Optimization

Automatically Performing Grid Search

Built-in Hyperparameter Search

Nested cross-validation