Classification multi-classe

Links: notebook, html, PDF, python, slides, GitHub

On cherche à prédire la note d’un vin avec un classifieur multi-classe.

%matplotlib inline
from papierstat.datasets import load_wines_dataset
df = load_wines_dataset()
X = df.drop(['quality', 'color'], axis=1)
y = df['quality']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
from sklearn.linear_model import LogisticRegression
clr = LogisticRegression()
clr.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
import numpy

# Overall accuracy in percent: fraction of test wines whose predicted
# grade equals the true grade.
100 * numpy.mean(y_test.ravel() == clr.predict(X_test).ravel())
53.84615384615385

On regarde la matrice de confusion.

import pandas
from sklearn.metrics import confusion_matrix

# Rows are true grades, columns are predicted grades; a good classifier
# concentrates the counts on the diagonal.
pandas.DataFrame(confusion_matrix(y_test, clr.predict(X_test)))
0 1 2 3 4 5 6
0 0 0 6 0 0 0 0
1 0 0 39 14 1 0 0
2 0 0 338 208 2 0 0
3 0 0 195 517 17 0 0
4 0 0 19 200 20 0 0
5 0 0 2 38 8 0 0
6 0 0 0 1 0 0 0

On l’affiche différemment avec le nom des classes.

# Display the confusion matrix again, labelled with the real grade values.
pred = clr.predict(X_test)
conf = confusion_matrix(y_test, pred)
# confusion_matrix orders its rows/columns by the sorted union of the labels
# observed in y_test and in the predictions.  That union can differ from
# clr.classes_: grade 9 is very rare, so it is sometimes absent from the
# training set while still appearing in the test set.  Deriving the labels
# from the observed data guarantees they always match the matrix dimensions,
# instead of padding or truncating clr.classes_ by hand.
labels = numpy.unique(numpy.concatenate([numpy.asarray(y_test), pred]))
dfconf = pandas.DataFrame(conf, index=labels, columns=labels)
dfconf
3 4 5 6 7 8 9
3 0 0 6 0 0 0 0
4 0 0 39 14 1 0 0
5 0 0 338 208 2 0 0
6 0 0 195 517 17 0 0
7 0 0 19 200 20 0 0
8 0 0 2 38 8 0 0
9 0 0 0 1 0 0 0

Pas extraordinaire. On applique la stratégie OneVsRestClassifier.

from sklearn.multiclass import OneVsRestClassifier

# One binary logistic regression per class, each separating that class
# from all the others.
clr = OneVsRestClassifier(LogisticRegression())
clr.fit(X_train, y_train)
OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)
# Accuracy (%) of the one-vs-rest strategy on the held-out set.
100 * numpy.mean(y_test.ravel() == clr.predict(X_test).ravel())
53.784615384615385

Le modèle de régression logistique multi-classe est équivalent à la stratégie OneVsRest. Voyons l’autre stratégie.

from sklearn.multiclass import OneVsOneClassifier

# One binary classifier per pair of classes; the final prediction is
# decided by a vote among all the pairwise models.
clr = OneVsOneClassifier(LogisticRegression())
clr.fit(X_train, y_train)
OneVsOneClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)
# Accuracy (%) of the one-vs-one strategy on the held-out set.
100 * numpy.mean(y_test.ravel() == clr.predict(X_test).ravel())
53.47692307692308
# Labelled confusion matrix for the one-vs-one classifier.
pred = clr.predict(X_test)
conf = confusion_matrix(y_test, pred)
# Label the axes with the grades actually observed in the test data or in
# the predictions: this union always matches the matrix dimensions, unlike
# clr.classes_ when a rare grade (such as 9) is missing from the train split.
labels = numpy.unique(numpy.concatenate([numpy.asarray(y_test), pred]))
dfconf = pandas.DataFrame(conf, index=labels, columns=labels)
dfconf
3 4 5 6 7 8 9
3 0 0 6 0 0 0 0
4 0 0 38 15 1 0 0
5 0 0 335 208 5 0 0
6 0 0 197 491 41 0 0
7 0 0 20 176 43 0 0
8 0 0 1 34 13 0 0
9 0 0 0 1 0 0 0

À peu près pareil, mais sans doute pas de manière significative. Voyons avec un arbre de décision.

from sklearn.tree import DecisionTreeClassifier

# A single decision tree handles multi-class targets natively.
clr = DecisionTreeClassifier()
clr.fit(X_train, y_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
# Accuracy (%) of the plain decision tree.
100 * numpy.mean(y_test.ravel() == clr.predict(X_test).ravel())
59.50769230769231

Et avec OneVsRestClassifier :

# One decision tree per class, in a one-vs-rest scheme.
estimator = DecisionTreeClassifier()
clr = OneVsRestClassifier(estimator)
clr.fit(X_train, y_train)
OneVsRestClassifier(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          n_jobs=1)
# Accuracy (%) of one-vs-rest over decision trees.
100 * numpy.mean(y_test.ravel() == clr.predict(X_test).ravel())
52.92307692307693

Et avec OneVsOneClassifier

# One decision tree per pair of classes (one-vs-one).
estimator = DecisionTreeClassifier()
clr = OneVsOneClassifier(estimator)
clr.fit(X_train, y_train)
OneVsOneClassifier(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          n_jobs=1)
# Accuracy (%) of one-vs-one over decision trees.
100 * numpy.mean(y_test.ravel() == clr.predict(X_test).ravel())
60.12307692307692

Mieux.

from sklearn.ensemble import RandomForestClassifier

# A bagged ensemble of decision trees, also natively multi-class.
clr = RandomForestClassifier()
clr.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
# Accuracy (%) of the random forest.
100 * numpy.mean(y_test.ravel() == clr.predict(X_test).ravel())
66.46153846153847
# A random forest per class, wrapped in a one-vs-rest scheme.
estimator = RandomForestClassifier()
clr = OneVsRestClassifier(estimator)
clr.fit(X_train, y_train)
OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          n_jobs=1)
# Accuracy (%) of one-vs-rest over random forests.
100 * numpy.mean(y_test.ravel() == clr.predict(X_test).ravel())
65.90769230769232

Proche, il faut affiner avec une validation croisée.

from sklearn.neural_network import MLPClassifier

# A small neural network: one hidden layer of 30 units; max_iter raised
# from the default so the optimizer has room to converge.
clr = MLPClassifier(hidden_layer_sizes=30, max_iter=600)
clr.fit(X_train, y_train)
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=30, learning_rate='constant',
       learning_rate_init=0.001, max_iter=600, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
# Accuracy (%) of the neural network.
100 * numpy.mean(y_test.ravel() == clr.predict(X_test).ravel())
51.323076923076925
# The same network wrapped in a one-vs-rest scheme.
estimator = MLPClassifier(hidden_layer_sizes=30, max_iter=600)
clr = OneVsRestClassifier(estimator)
clr.fit(X_train, y_train)
OneVsRestClassifier(estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=30, learning_rate='constant',
       learning_rate_init=0.001, max_iter=600, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
          n_jobs=1)
# Accuracy (%) of one-vs-rest over neural networks.
100 * numpy.mean(y_test.ravel() == clr.predict(X_test).ravel())
47.56923076923077

Pas foudroyant.