from jyquickhelper import add_notebook_menu
add_notebook_menu()


%matplotlib inline


import numpy, numpy.random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix

N = 1000

res = []

for n in [1, 2, 5, 10, 20, 50, 80, 90, 100, 110]:
    print("n=", n)
    for k in range(10):

        X = numpy.zeros((N, 2))
        X[:, 0] = numpy.random.randint(0, 2, (N,))
        X[:, 1] = numpy.random.randint(0, n+1, (N,))
        Y = X[:, 0] + X[:, 1] + numpy.random.normal(size=(N,)) / 2
        Y[Y < 1.5] = 0
        Y[Y >= 1.5] = 1

        X_train, X_test, y_train, y_test = train_test_split(X, Y)

        stat = dict(N=N, n=n, ratio_train=y_train.sum()/y_train.shape[0],
                    k=k, ratio_test=y_test.sum()/y_test.shape[0])
        
        for model in [LogisticRegression(solver="liblinear"),
                      MLPClassifier(max_iter=500),
                      RandomForestClassifier(n_estimators=10),
                      AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=10)]:
            obs = stat.copy()
            obs["model"] = model.__class__.__name__
            if obs["model"] == "AdaBoostClassifier":
                obs["model"] = "AdaB-" + model.base_estimator.__class__.__name__
            try:
                model.fit(X_train, y_train)
            except ValueError as e:
                obs["erreur"] = str(e)
                res.append(obs)
                continue
            sc = model.score(X_test, y_test)
            obs["accuracy"] = sc
            conf = confusion_matrix(y_test, model.predict(X_test))
            try:
                obs["Error-0|1"] = conf[0, 1] / conf[0, :].sum()
                obs["Error-1|0"] = conf[1, 0] / conf[1, :].sum()
            except Exception:
                pass
            res.append(obs)

n= 1
n= 2
n= 5
n= 10
n= 20
n= 50
n= 80
n= 90
n= 100
n= 110


from pandas import DataFrame
df = DataFrame(res)
df = df.sort_values(['n', 'model', 'model', "k"]).reset_index(drop=True)
df["diff_ratio"] = (df["ratio_test"] - df["ratio_train"]).abs()
df.head(n=5)


df.tail(n=5)


df[df.n==100][["n", "ratio_test", "ratio_train"]].head(n=10)


#df.to_excel("data.xlsx")


columns = ["n", "N", "model"]
agg = df.groupby(columns, as_index=False).mean().sort_values(["n", "model"]).reset_index(drop=True)
agg.tail()


import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 2, figsize=(10,4))
agg.plot(x="n", y="diff_ratio", ax=ax[0])
agg.plot(x="n", y="ratio_train", ax=ax[1])
agg.plot(x="n", y="ratio_test", ax=ax[1])
ax[0].set_title("Maximum difference between\nratio of first class on train and test")
ax[1].set_title("Ratio of first class on train and test")
ax[0].legend();


agg2 = agg.copy()
agg2["ratio_test2"] = agg2["ratio_test"] + agg2["n"] / 100000


import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 3, figsize=(14,4))
agg2.pivot("ratio_test2", "model", "accuracy").plot(ax=ax[0])
agg2.pivot("ratio_test2", "model", "Error-0|1").plot(ax=ax[1])
agg2.pivot("ratio_test2", "model", "Error-1|0").plot(ax=ax[2])
ax[0].plot([0.5, 1.0], [0.5, 1.0], '--', label="constant")
ax[0].set_title("Accuracy")
ax[1].set_title("Error-0|1")
ax[2].set_title("Error-1|0")
ax[0].legend();


agg2.pivot("ratio_test2", "model", "Error-0|1")


from sklearn.datasets import load_diabetes
data = load_diabetes()
X, y = data.data, data.target


X_train, X_test, y_train, y_test = train_test_split(X, y)


from sklearn.ensemble import RandomForestRegressor

model = None
res = []
for i in range(0, 20):
    if model is None:
        model = RandomForestRegressor(n_estimators=1, warm_start=True)
    else:
        model.set_params(**dict(n_estimators=model.n_estimators+1))
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    res.append(dict(n_estimators=model.n_estimators, score=score))


df = DataFrame(res)
df.head()


ax = df.plot(x="n_estimators", y="score")
ax.set_title("Apprentissage continu\nmesure de la performance à chaque itération");

	N	n	ratio_train	k	ratio_test	model	accuracy	Error-0\|1	Error-1\|0	diff_ratio
0	1000	1	0.273333	0	0.300	AdaB-DecisionTreeClassifier	0.860	0.062857	0.320000	0.026667
1	1000	1	0.274667	1	0.328	AdaB-DecisionTreeClassifier	0.916	0.029762	0.195122	0.053333
2	1000	1	0.304000	2	0.284	AdaB-DecisionTreeClassifier	0.860	0.072626	0.309859	0.020000
3	1000	1	0.285333	3	0.268	AdaB-DecisionTreeClassifier	0.896	0.027322	0.313433	0.017333
4	1000	1	0.297333	4	0.256	AdaB-DecisionTreeClassifier	0.888	0.053763	0.281250	0.041333

	N	n	ratio_train	k	ratio_test	model	accuracy	Error-0\|1	Error-1\|0	diff_ratio
395	1000	110	0.982667	5	0.996	RandomForestClassifier	0.996	0.0	0.004016	0.013333
396	1000	110	0.990667	6	0.980	RandomForestClassifier	0.996	0.2	0.000000	0.010667
397	1000	110	0.985333	7	0.988	RandomForestClassifier	1.000	0.0	0.000000	0.002667
398	1000	110	0.985333	8	0.992	RandomForestClassifier	1.000	0.0	0.000000	0.006667
399	1000	110	0.985333	9	0.992	RandomForestClassifier	0.996	0.5	0.000000	0.006667

	n	ratio_test	ratio_train
320	100	0.980	0.992000
321	100	0.984	0.980000
322	100	0.988	0.984000
323	100	0.988	0.986667
324	100	0.976	0.986667
325	100	0.984	0.985333
326	100	0.984	0.981333
327	100	0.988	0.982667
328	100	0.984	0.989333
329	100	0.992	0.989333

	n	N	model	ratio_train	k	ratio_test	accuracy	Error-0\|1	Error-1\|0	diff_ratio
35	100	1000	RandomForestClassifier	0.985733	4.5	0.9848	0.9956	0.185000	0.001216	0.004933
36	110	1000	AdaB-DecisionTreeClassifier	0.986533	4.5	0.9900	0.9972	0.130000	0.000810	0.007200
37	110	1000	LogisticRegression	0.986533	4.5	0.9900	0.9960	0.346667	0.000402	0.007200
38	110	1000	MLPClassifier	0.986533	4.5	0.9900	0.9956	0.346667	0.000810	0.007200
39	110	1000	RandomForestClassifier	0.986533	4.5	0.9900	0.9980	0.090000	0.000810	0.007200

model	AdaB-DecisionTreeClassifier	LogisticRegression	MLPClassifier	RandomForestClassifier
ratio_test2
0.29721	0.052249	0.052249	0.052249	0.052249
0.50682	0.110686	0.110686	0.110686	0.110686
0.75525	0.119578	0.119578	0.119578	0.119578
0.86690	0.099333	0.099333	0.099333	0.099333
0.92900	0.088095	0.113095	0.113095	0.088095
0.96970	0.106349	0.253968	0.220635	0.163492
0.98120	0.125000	0.310000	0.200000	0.175000
0.98490	0.110000	0.155000	0.155000	0.170000
0.98580	0.185000	0.335000	0.268333	0.185000
0.99110	0.130000	0.346667	0.346667	0.090000

2018-10-09 Ensemble, Gradient, Boosting...¶

Skewed split train test¶

Apprentissage continu¶

	n_estimators	score
0	1	0.128906
1	2	0.323854
2	3	0.352876
3	4	0.389476
4	5	0.429992