.. _solution2016creditclementrst:

===========================================================
2016 - Une solution à la compétition de machine learning 2A
===========================================================

.. only:: html

    **Links:** :download:`notebook `, :downloadlink:`html `, :download:`python `, :downloadlink:`slides `, :githublink:`GitHub|_doc/notebooks/notebook_eleves/2016-2017/solution_2016_credit_clement.ipynb|*`

Ce notebook a été proposé par un étudiant pour la compétition organisée
pour ce cours : `classification binaire `__.

.. code:: ipython3

    # Download the competition archive; the call returns the names of the
    # two data files it provides (see the output below).
    from pyensae.datasource import download_data
    download_data("ensae_competition_2016.zip",
                  url="https://github.com/sdpython/ensae_teaching_cs/raw/master/_doc/competitions/2016_ENSAE_2A/")

.. parsed-literal::

    ['ensae_competition_test_X.txt', 'ensae_competition_train.txt']

.. code:: ipython3

    # packages
    import pandas as pd
    import numpy as np
    from sklearn import svm, linear_model, datasets, metrics
    import seaborn as sns
    import matplotlib.pyplot as plt
    %matplotlib inline
    from statsmodels.nonparametric.kde import KDEUnivariate
    from statsmodels.nonparametric import smoothers_lowess

.. code:: ipython3

    # dataframe
    # df = pd.read_excel("default_of_credit_card_clients.xls", header=[0, 1], encoding="utf8", index_col=0, engine='openpyxl')
    # The training file is tab-separated and has two header rows (codes
    # X1..X23 / Y and descriptive names), hence header=[0, 1]; the first
    # column is used as the row index.
    df = pd.read_csv("ensae_competition_train.txt", header=[0, 1], encoding="utf8", index_col=0, sep="\t")
    df.head(10)

.. raw:: html
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 ... X15 X16 X17 X18 X19 X20 X21 X22 X23 Y
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default payment next month
0 180000 1 2 1 47 0 0 0 0 0 ... 99694 65977 67415 3700 3700 4100 2360 2500 2618 0
1 110000 2 2 1 35 0 0 0 0 0 ... 4869 4966 5070 1053 1073 1081 178 184 185 1
2 70000 2 2 2 22 0 0 0 0 0 ... 69927 50579 49483 2501 3001 2608 1777 1792 1793 1
3 200000 2 1 2 27 -2 -2 -2 -2 -2 ... 1665 3370 -36 5610 15616 1673 3385 0 95456 0
4 370000 2 1 1 39 0 0 0 0 0 ... 48216 47675 48074 2157 2000 1668 2000 3000 1000 0
5 260000 2 1 1 29 0 0 0 -2 -2 ... 0 0 0 3090 0 0 0 0 141516 0
6 90000 2 1 1 43 -1 -1 2 -1 -1 ... 7660 21175 4009 4367 9 7660 21175 4009 7452 0
7 220000 2 1 1 43 -1 3 2 0 0 ... 1090 1090 0 167 0 0 0 0 0 1
8 50000 1 2 1 35 1 2 0 0 0 ... 21260 70 29575 0 2052 1800 0 29935 1200 1
9 50000 2 3 2 40 0 0 0 0 0 ... 8292 8465 8650 1271 1130 1000 307 325 436 0

10 rows × 24 columns

.. code:: ipython3

    # The two header rows were read as a two-level column index.
    df.columns

.. parsed-literal::

    MultiIndex(levels=[['X1', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X2', 'X20', 'X21', 'X22', 'X23', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'Y'], ['AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'EDUCATION', 'LIMIT_BAL', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'SEX', 'default payment next month']],
               labels=[[0, 11, 16, 17, 18, 19, 20, 21, 22, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 23], [8, 22, 7, 9, 0, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 16, 17, 18, 19, 20, 21, 23]],
               names=[None, 'ID'])

.. code:: ipython3

    # Drop the second header row (the descriptive names) on a copy, so the
    # columns can be addressed by their short codes X1..X23 and Y.
    df1 = df.copy()
    df1.columns = df1.columns.droplevel(-1)

.. code:: ipython3

    df1.columns

.. parsed-literal::

    Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'Y'],
          dtype='object')

.. 
code:: ipython3

    # Descriptive statistics: history of past payment.
    # One scatter plot per repayment-status column (X6..X11) against the
    # default flag Y, laid out on the first row of a 3 x 6 grid.
    fig = plt.figure(figsize=(12, 6))
    alpha_scatterplot = 0.2

    # (column, panel title), in grid order from left to right.
    # NOTE(review): titles are kept exactly as in the original figure. The
    # header row of the data maps X9..X11 to PAY_4..PAY_6; the UCI
    # description of this dataset lists those as June, May and April 2005 -
    # confirm and relabel if so.
    delay_panels = [
        ("X6", "September 2005"),
        ("X7", "August 2005"),
        ("X8", "July 2005"),
        ("X9", "May 2005"),
        ("X10", "April 2005"),
        ("X11", "March 2005"),
    ]

    for position, (column, month) in enumerate(delay_panels):
        plt.subplot2grid((3, 6), (0, position))
        plt.scatter(df1.Y, df1[column], alpha=alpha_scatterplot)
        plt.xlabel("Default")
        plt.ylabel("Payment delay")
        # The on/off flag is passed positionally: its keyword was named
        # ``b`` in older Matplotlib releases and ``visible`` in newer ones,
        # so the positional form is the one that works with both.
        plt.grid(True, which='major', axis='y')
        plt.title(month)

.. parsed-literal::

.. image:: solution_2016_credit_clement_7_1.png

.. 
code:: ipython3

    # Kernel density estimates of the bill statements (X12..X17) and of the
    # amounts paid (X18..X23), drawn separately for Y == 0 and Y == 1 on
    # rows 1 and 2 of a 3 x 6 grid (each panel spans three grid columns).
    fig = plt.figure(figsize=(12, 6))

    # NOTE(review): legend labels are kept exactly as in the original
    # figure. They skip June, whereas the engineered columns created further
    # down are named PartMay..PartSeptember including June - confirm which
    # month each column really refers to.
    month_labels = ('September','August','July','May','April','March')

    # (grid row, columns to plot, x-axis label, upper x limit)
    kde_rows = [
        (1, ["X12", "X13", "X14", "X15", "X16", "X17"], "Bill statement", 200000),
        (2, ["X18", "X19", "X20", "X21", "X22", "X23"], "Amount of bill payed", 25000),
    ]
    # (grid column, value of Y selected, panel title)
    kde_groups = [
        (0, 0, "People distribution, no default"),
        (3, 1, "People distribution, default"),
    ]

    for grid_row, columns, x_label, x_max in kde_rows:
        for grid_col, default_flag, panel_title in kde_groups:
            axis = plt.subplot2grid((3, 6), (grid_row, grid_col), colspan=3)
            # One density curve per monthly column, restricted to the group.
            for column in columns:
                df1[column][df1.Y == default_flag].plot(kind='kde')
            plt.xlabel(x_label)
            plt.title(panel_title)
            axis.set_xlim(0, x_max)
            plt.legend(month_labels, loc='best')

.. parsed-literal::

.. image:: solution_2016_credit_clement_8_1.png

.. code:: ipython3

    # Correlation matrix of all columns
    sns.set(context="paper", font="monospace")
    corrmat = df1.corr()
    # matplotlib figure
    f, ax = plt.subplots(figsize=(12, 9))
    # Draw the heatmap using seaborn
    sns.heatmap(corrmat, vmax=.8, square=True)

.. parsed-literal::

.. image:: solution_2016_credit_clement_9_1.png

.. code:: ipython3

    # Feature engineering: add aggregated columns to df1.
    # TotalDelay: weighted sum of the six repayment-status columns, the
    # weight doubling from X11 (1) up to X6 (32).
    df1['TotalDelay'] = df1.X11 + 2*df1.X10 + 4*df1.X9 + 8*df1.X8 + 16*df1.X7 + 32*df1.X6
    # TotalPayment: weighted sum of the six amounts paid, weights 1 (X23)
    # to 6 (X18).
    df1['TotalPayment'] = df1.X23 + 2*df1.X22 + 3*df1.X21 + 4*df1.X20 + 5*df1.X19 + 6*df1.X18
    # Part<Month>: (bill - payment) / (bill + 1), pairing each payment
    # column with the bill column of the following index (X22 with X17, ...,
    # X18 with X13).
    # The denominator is 0 when a bill equals -1, which yields an infinite
    # value; later cells replace -inf by 0.
    # NOTE(review): +inf or NaN would not be caught by that replacement -
    # presumably they do not occur because payments are non-negative; verify.
    df1['PartMay'] = -(df1.X22 - df1.X17)/(df1.X17 + 1)
    df1['PartJune'] = -(df1.X21 - df1.X16)/(df1.X16 + 1)
    df1['PartJuly'] = -(df1.X20 - df1.X15)/(df1.X15 + 1)
    df1['PartAugust'] = -(df1.X19 - df1.X14)/(df1.X14 + 1)
    df1['PartSeptember'] = -(df1.X18 - df1.X13)/(df1.X13 + 1)
    df1.head(20)

.. raw:: html
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 ... X22 X23 Y TotalDelay TotalPayment PartMay PartJune PartJuly PartAugust PartSeptember
0 180000 1 2 1 47 0 0 0 0 0 ... 2500 2618 0 0 71798 0.962902 0.964215 0.958865 0.961978 0.961112
1 110000 2 2 1 35 0 0 0 0 0 ... 184 185 1 0 17094 0.963518 0.963962 0.777823 0.721906 0.850305
2 70000 2 2 2 22 0 0 0 0 0 ... 1792 1793 1 0 51151 0.963766 0.964848 0.962690 0.956517 0.962942
3 200000 2 1 2 27 -2 -2 -2 -2 -2 ... 0 95456 0 -126 224043 1.028571 -0.004450 -0.004802 -0.004502 -0.009899
4 370000 2 1 1 39 0 0 0 0 0 ... 3000 1000 0 0 42614 0.937577 0.958029 0.965386 0.959245 0.968143
5 260000 2 1 1 29 0 0 0 -2 -2 ... 0 141516 0 -14 160056 0.000000 0.000000 0.000000 0.000000 0.942813
6 90000 2 1 1 43 -1 -1 2 -1 -1 ... 4009 7452 0 -39 135882 0.000000 0.000000 0.000000 0.997711 0.393333
7 220000 2 1 1 43 -1 3 2 0 0 ... 0 0 1 32 1002 0.000000 0.999083 0.999083 0.999083 0.866455
8 50000 1 2 1 35 1 2 0 0 0 ... 29935 1200 1 63 78530 -0.012172 0.985915 0.915291 0.956269 0.999979
9 50000 2 3 2 40 0 0 0 0 0 ... 325 436 0 0 19283 0.962316 0.963619 0.879296 0.847636 0.806216
10 130000 1 2 1 24 0 0 2 0 0 ... 0 780 0 16 38480 0.997442 0.999978 0.959249 0.999979 0.899013
11 200000 2 1 2 25 -1 -1 -1 -1 -1 ... 4970 8888 0 -63 82748 0.000000 0.000000 0.000000 0.000000 0.000000
12 230000 2 2 1 38 -2 -2 -2 -2 -2 ... 2132 2204 0 -126 111453 0.000000 0.000000 0.000000 0.000000 0.000000
13 90000 2 1 2 29 -2 -2 -2 -2 -2 ... 0 0 0 -126 0 1.004184 1.004184 1.004184 1.004184 1.004184
14 230000 1 3 2 37 -1 0 0 0 -1 ... 5003 3016 0 -34 287273 0.887873 -0.000340 0.769071 0.760863 0.952064
15 130000 1 2 2 33 2 2 -1 -1 -2 ... 0 0 0 78 3578 0.000000 0.000000 0.000000 0.000000 0.993003
16 90000 2 2 1 35 0 0 0 0 0 ... 4000 0 0 2 91108 0.954684 0.883510 0.952754 0.953646 0.952596
17 10000 2 2 1 37 -1 4 3 2 2 ... 0 36 0 70 3236 0.999550 0.848221 0.800478 0.999590 0.999652
18 80000 1 3 1 36 0 0 0 0 0 ... 3000 6200 0 0 73411 0.961214 0.926935 0.962670 0.963936 0.962893
19 320000 2 1 1 36 -1 2 0 0 0 ... 5000 11906 0 0 96906 0.792507 0.755381 0.703680 0.400943 0.999851

20 rows × 31 columns

.. code:: ipython3

    # Correlation heatmap again, this time including the engineered columns.
    sns.set(context="paper", font="monospace")
    corrmat = df1.corr()
    # matplotlib figure hosting the seaborn heatmap
    f, ax = plt.subplots(figsize=(12, 9))
    sns.heatmap(corrmat, vmax=.8, square=True)

.. parsed-literal::

.. image:: solution_2016_credit_clement_11_1.png

.. code:: ipython3

    # Remove the raw monthly columns X7..X11 and X13..X23, now summarised by
    # the engineered features; X6 and X12 are kept.
    dropped_columns = ["X{}".format(n)
                       for n in list(range(7, 12)) + list(range(13, 24))]
    df1 = df1.drop(dropped_columns, axis=1)
    df1.head(20)

.. raw:: html
X1 X2 X3 X4 X5 X6 X12 Y TotalDelay TotalPayment PartMay PartJune PartJuly PartAugust PartSeptember
0 180000 1 2 1 47 0 179253 0 0 71798 0.962902 0.964215 0.958865 0.961978 0.961112
1 110000 2 2 1 35 0 6137 1 0 17094 0.963518 0.963962 0.777823 0.721906 0.850305
2 70000 2 2 2 22 0 66505 1 0 51151 0.963766 0.964848 0.962690 0.956517 0.962942
3 200000 2 1 2 27 -2 4941 0 -126 224043 1.028571 -0.004450 -0.004802 -0.004502 -0.009899
4 370000 2 1 1 39 0 141552 0 0 42614 0.937577 0.958029 0.965386 0.959245 0.968143
5 260000 2 1 1 29 0 71864 0 -14 160056 0.000000 0.000000 0.000000 0.000000 0.942813
6 90000 2 1 1 43 -1 16139 0 -39 135882 0.000000 0.000000 0.000000 0.997711 0.393333
7 220000 2 1 1 43 -1 1090 1 32 1002 0.000000 0.999083 0.999083 0.999083 0.866455
8 50000 1 2 1 35 1 48047 1 63 78530 -0.012172 0.985915 0.915291 0.956269 0.999979
9 50000 2 3 2 40 0 5538 0 0 19283 0.962316 0.963619 0.879296 0.847636 0.806216
10 130000 1 2 1 24 0 46113 0 16 38480 0.997442 0.999978 0.959249 0.999979 0.899013
11 200000 2 1 2 25 -1 8926 0 -63 82748 0.000000 0.000000 0.000000 0.000000 0.000000
12 230000 2 2 1 38 -2 12696 0 -126 111453 0.000000 0.000000 0.000000 0.000000 0.000000
13 90000 2 1 2 29 -2 -240 0 -126 0 1.004184 1.004184 1.004184 1.004184 1.004184
14 230000 1 3 2 37 -1 36571 0 -34 287273 0.887873 -0.000340 0.769071 0.760863 0.952064
15 130000 1 2 2 33 2 2183 0 78 3578 0.000000 0.000000 0.000000 0.000000 0.993003
16 90000 2 2 1 35 0 72112 0 2 91108 0.954684 0.883510 0.952754 0.953646 0.952596
17 10000 2 2 1 37 -1 3305 0 70 3236 0.999550 0.848221 0.800478 0.999590 0.999652
18 80000 1 3 1 36 0 81066 0 0 73411 0.961214 0.926935 0.962670 0.963936 0.962893
19 320000 2 1 1 36 -1 7868 0 0 96906 0.792507 0.755381 0.703680 0.400943 0.999851
.. code:: ipython3

    # Two-component PCA projection of the table, coloured by the target.
    from sklearn.decomposition import PCA
    from numpy import inf
    pca = PCA(n_components=2, svd_solver='randomized')
    dfpca = df1.values
    # The Part* ratios can be -inf (division by zero); neutralise them.
    dfpca[dfpca == -inf] = 0
    # Column 7 of the reduced table is Y.
    y = dfpca[:, 7]
    # NOTE(review): ``:7 + 8:`` is the slice ``:15``, i.e. all 15 columns,
    # so the target column Y is part of the data given to the PCA. If the
    # intent was to project the features only, column 7 has to be removed.
    proj = pca.fit_transform(dfpca[:, :7 + 8:])
    plt.scatter(proj[:, 0], proj[:, 1], c=y)
    plt.colorbar()

.. parsed-literal::

.. image:: solution_2016_credit_clement_13_1.png

.. code:: ipython3

    # training/crossval set
    X = df1.values
    X[X==-inf] = 0
    print(df1.head())
    # training set: every row, target (column 7) split off into Y_train
    # NOTE(review): X_train keeps all rows, including rows 20000 and above
    # that are reused below as the validation set. The confusion matrices
    # printed afterwards are therefore computed on rows the models were
    # fitted on and overestimate out-of-sample performance.
    X_train = X[:, :]
    Y_train = X[:, 7].ravel()
    X_train = np.delete(X_train, 7, axis=1)
    # expected result: target of the rows from index 20000 onwards
    expected = X[20000:, 7].ravel()
    # cross-validation data set: same rows, target column removed
    X_cross = X[20000:, :]
    X_cross = np.delete(X_cross, 7, axis=1)

.. parsed-literal::

           X1  X2  X3  X4  X5  X6     X12  Y  TotalDelay  TotalPayment   PartMay  \
    0  180000   1   2   1  47   0  179253  0           0         71798  0.962902
    1  110000   2   2   1  35   0    6137  1           0         17094  0.963518
    2   70000   2   2   2  22   0   66505  1           0         51151  0.963766
    3  200000   2   1   2  27  -2    4941  0        -126        224043  1.028571
    4  370000   2   1   1  39   0  141552  0           0         42614  0.937577

       PartJune  PartJuly  PartAugust  PartSeptember
    0  0.964215  0.958865    0.961978       0.961112
    1  0.963962  0.777823    0.721906       0.850305
    2  0.964848  0.962690    0.956517       0.962942
    3 -0.004450 -0.004802   -0.004502      -0.009899
    4  0.958029  0.965386    0.959245       0.968143

.. code:: ipython3

    # Gaussian naive Bayes baseline.
    from sklearn.naive_bayes import GaussianNB
    # train the model
    GNB = GaussianNB()
    GNB.fit(X_train, Y_train)
    # use the model to predict the labels of the test data
    predicted = GNB.predict(X_cross)
    print(metrics.confusion_matrix(expected, predicted))

.. parsed-literal::

    [[ 209 1732]
     [  26  533]]

.. code:: ipython3

    # Gradient boosting with default parameters.
    from sklearn.ensemble import GradientBoostingClassifier
    GBR = GradientBoostingClassifier()
    GBR.fit(X_train,Y_train)
    predicted = GBR.predict(X_cross)
    print(metrics.confusion_matrix(expected, predicted))

.. parsed-literal::

    [[1848   93]
     [ 352  207]]

.. 
code:: ipython3

    # k-nearest neighbours with 5 neighbours.
    from sklearn.neighbors import KNeighborsClassifier
    KNC = KNeighborsClassifier(5)
    KNC.fit(X_train, Y_train)
    predicted = KNC.predict(X_cross)
    print(metrics.confusion_matrix(expected, predicted))
    # Class probabilities on the training rows, inspected in the next cell.
    pred = KNC.predict_proba(X_train)

.. parsed-literal::

    [[1864   77]
     [ 368  191]]

.. code:: ipython3

    # Predicted probabilities next to the true labels for the first 10 rows.
    print(pred[:10])
    print(Y_train[:10])

.. parsed-literal::

    [[ 1.   0. ]
     [ 0.6  0.4]
     [ 0.6  0.4]
     [ 1.   0. ]
     [ 1.   0. ]
     [ 0.6  0.4]
     [ 1.   0. ]
     [ 0.4  0.6]
     [ 0.4  0.6]
     [ 0.8  0.2]]
    [ 0.  1.  1.  0.  0.  0.  0.  1.  1.  0.]

.. code:: ipython3

    # neural network
    from sklearn.neural_network import MLPClassifier
    # NOTE(review): r2_score is imported but not used in this cell.
    from sklearn.metrics import r2_score
    from sklearn.model_selection import GridSearchCV
    # Hyper-parameter search. The first grid varies the width of a single
    # hidden layer (20, 30, 40 units), not the number of layers; the second
    # grid varies the regularisation term alpha (0.0 and 0.2). Being two
    # separate dictionaries, the grids are explored independently:
    # 3 + 2 = 5 candidates, as reported in the output below.
    param_grid = [
        {'hidden_layer_sizes': [(nb,) for nb in range(20,50,10)]},
        {'alpha': [a/100 for a in range(0,40,20)]}
    ]
    neural2 = GridSearchCV(MLPClassifier(), param_grid, verbose=1)
    neural2.fit(X_train, Y_train)
    neural2.best_estimator_

.. parsed-literal::

    Fitting 3 folds for each of 5 candidates, totalling 15 fits

.. parsed-literal::

    [Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   27.2s finished

.. parsed-literal::

    MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
           beta_2=0.999, early_stopping=False, epsilon=1e-08,
           hidden_layer_sizes=(20,), learning_rate='constant',
           learning_rate_init=0.001, max_iter=200, momentum=0.9,
           nesterovs_momentum=True, power_t=0.5, random_state=None,
           shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
           verbose=False, warm_start=False)

.. 
code:: ipython3

    # Multi-layer perceptron used for the final submission.
    # NOTE(review): the grid search above selected hidden_layer_sizes=(20,),
    # while this model uses (170,) - presumably chosen by hand; confirm.
    neural = MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
           beta_2=0.999, early_stopping=False, epsilon=1e-08,
           hidden_layer_sizes=(170,), learning_rate='constant',
           learning_rate_init=0.001, max_iter=200, momentum=0.9,
           nesterovs_momentum=True, power_t=0.5, random_state=None,
           shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
           verbose=False, warm_start=False)
    neural.fit(X_train, Y_train)
    predicted = neural.predict(X_cross)
    print(metrics.confusion_matrix(expected, predicted))
    neural.predict_proba(X_cross[:10])

.. parsed-literal::

    [[1780  161]
     [ 378  181]]

.. parsed-literal::

    array([[  1.00000000e+000,   4.54194423e-294],
           [  1.00000000e+000,   1.14673239e-101],
           [  1.00000000e+000,   1.13397258e-051],
           [  1.00000000e+000,   1.89540529e-117],
           [  1.00000000e+000,   8.01811448e-032],
           [  1.00000000e+000,   1.14003085e-160],
           [  1.00000000e+000,   1.02562443e-115],
           [  1.00000000e+000,   9.41507727e-017],
           [  1.00000000e+000,   3.16744761e-026],
           [  6.95744980e-001,   3.04255020e-001]])

.. code:: ipython3

    # Slow models are only run when this flag is switched on.
    if_you_have_time = False

    if if_you_have_time:
        from sklearn.gaussian_process import GaussianProcessClassifier
        GPC = GaussianProcessClassifier()
        GPC.fit(X_train, Y_train)
        predicted = GPC.predict(X_cross)
        print(metrics.confusion_matrix(expected, predicted))
        GPC.predict_proba(X_cross)

.. code:: ipython3

    # Random forest with 5 trees, also used for the final submission.
    # NOTE(review): X_cross is built from rows that are also in X_train, so
    # the almost perfect matrix below presumably reflects memorised
    # training rows rather than generalisation; verify on held-out data.
    from sklearn.ensemble import RandomForestClassifier
    RFC = RandomForestClassifier(5)
    RFC.fit(X_train, Y_train)
    predicted = RFC.predict(X_cross)
    print(metrics.confusion_matrix(expected, predicted))
    print(RFC.predict_proba(X_cross))
    print(expected)

.. parsed-literal::

    [[1928   13]
     [  65  494]]
    [[ 1.   0. ]
     [ 1.   0. ]
     [ 0.   1. ]
     ...,
     [ 1.   0. ]
     [ 1.   0. ]
     [ 0.8  0.2]]
    [ 0.  0.  1. ...,  0.  0.  0.]

.. code:: ipython3

    if if_you_have_time:
        from sklearn.svm import SVC
        # The fitted model gets its own name: assigning it to ``SVC`` would
        # overwrite the class and make a second run of this cell fail.
        svc_model = SVC(probability=True)
        svc_model.fit(X_train, Y_train)
        predicted = svc_model.predict(X_cross)
        print(metrics.confusion_matrix(expected, predicted))
        print(svc_model.predict_proba(X_cross))
        print(expected)

.. 
code:: ipython3

    # Logistic regression with default parameters.
    from sklearn.linear_model import LogisticRegression
    LR = LogisticRegression()
    LR.fit(X_train, Y_train)
    predicted = LR.predict(X_cross)
    print(metrics.confusion_matrix(expected, predicted))
    print(LR.predict_proba(X_train))
    print(Y_train)

.. parsed-literal::

    [[1861   80]
     [ 413  146]]
    [[ 0.85603222  0.14396778]
     [ 0.72020052  0.27979948]
     [ 0.69524708  0.30475292]
     ...,
     [ 0.69884026  0.30115974]
     [ 0.85309221  0.14690779]
     [ 0.75099513  0.24900487]]
    [ 0.  1.  1. ...,  0.  0.  0.]

.. code:: ipython3

    #--------------#
    # final model  #
    #--------------#

    # Load the competition test set and flatten its two-row header.
    dfend = pd.read_csv("ensae_competition_test_X.txt", header=[0, 1], sep='\t', encoding="utf8", index_col=0)
    dfend.columns = dfend.columns.droplevel(-1)
    # Apply exactly the same feature engineering as on the training table,
    # so the columns line up with what the models were fitted on.
    dfend['TotalDelay'] = dfend.X11 + 2*dfend.X10 + 4*dfend.X9 + 8*dfend.X8 + 16*dfend.X7 + 32*dfend.X6
    dfend['TotalPayment'] = dfend.X23 + 2*dfend.X22 + 3*dfend.X21 + 4*dfend.X20 + 5*dfend.X19 + 6*dfend.X18
    dfend['PartMay'] = -(dfend.X22 - dfend.X17)/(dfend.X17 + 1)
    dfend['PartJune'] = -(dfend.X21 - dfend.X16)/(dfend.X16 + 1)
    dfend['PartJuly'] = -(dfend.X20 - dfend.X15)/(dfend.X15 + 1)
    dfend['PartAugust'] = -(dfend.X19 - dfend.X14)/(dfend.X14 + 1)
    dfend['PartSeptember'] = -(dfend.X18 - dfend.X13)/(dfend.X13 + 1)
    dfend = dfend.drop(['X'+str(n) for n in range(7,12)] + ['X'+str(n) for n in range(13,24)], axis=1)
    # dataset as array; -inf comes from the divisions by zero above
    X = dfend.values
    X[X==-inf] = 0

.. code:: ipython3

    # predictions
    # neural network: one integer label per line
    l = neural.predict(X)
    # ``with`` closes the file even if a write fails.
    with open('answerN.txt','w') as text_file:
        for e in l:
            text_file.write(str(int(e)) + '\n')

.. code:: ipython3

    # random forest: same output format
    l = RFC.predict(X)
    with open('answerRF.txt','w') as text_file:
        for e in l:
            text_file.write(str(int(e)) + '\n')