# Notebook setup: add_notebook_menu() is called for its side effect only
# (presumably it inserts a navigation menu into the notebook -- confirm in
# the jyquickhelper documentation).
from jyquickhelper import add_notebook_menu
add_notebook_menu()


# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline


# Load the first variant of the Titanic data and preview its first two rows.
from papierstat.datasets import load_titanic_dataset
data1 = load_titanic_dataset(subset="A")
data1.head(n=2)


# Load the second variant and preview it the same way.
data2 = load_titanic_dataset(subset="B")
data2.head(n=2)


# Every later cell works on subset "A".
df = data1


from sklearn.model_selection import train_test_split

# Predictors: one numeric column (age) and two categorical ones
# (sex, pclass); the target is the survival flag.
feature_columns = ["age", "sex", "pclass"]
X = df[feature_columns]
y = df["survived"]
# No random_state is given, so the partition differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)


# Distinct values taken by each categorical column.
set(df["sex"]), set(df["pclass"])

({'female', 'male'}, {1, 2, 3})


from sklearn.preprocessing import OneHotEncoder
# fit() returns the encoder itself, so construction and fitting can be
# chained; the bare name on the last line displays the fitted encoder.
one = OneHotEncoder().fit(X_train[['sex', 'pclass']])
one

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)


# Encode the two categorical columns of the training set; the encoder
# returns a scipy sparse matrix (one column per category of each feature).
one.transform(X_train[['sex', 'pclass']])

<981x5 sparse matrix of type '<class 'numpy.float64'>'
	with 1962 stored elements in Compressed Sparse Row format>


# Same encoding shown as a dense array. toarray() returns a plain
# numpy.ndarray, whereas todense() returns the legacy numpy.matrix type that
# NumPy discourages and recent scikit-learn versions refuse as estimator input.
one.transform(X_train[['sex', 'pclass']]).toarray()

matrix([[1., 0., 0., 1., 0.],
        [0., 1., 0., 0., 1.],
        [0., 1., 1., 0., 0.],
        ...,
        [1., 0., 0., 0., 1.],
        [0., 1., 0., 0., 1.],
        [0., 1., 0., 0., 1.]])


# Names of the encoded columns, in the order they appear in the encoder output.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old method so the cell
# still runs on older installations.
if hasattr(one, "get_feature_names_out"):
    names = one.get_feature_names_out()
else:
    names = one.get_feature_names()
names

array(['x0_female', 'x0_male', 'x1_1', 'x1_2', 'x1_3'], dtype=object)


import numpy
# Densify with toarray() (plain ndarray) rather than todense(): the latter
# yields numpy.matrix, which would make `feat` a matrix as well, and recent
# scikit-learn versions reject numpy.matrix inputs in fit/predict.
cats = one.transform(X_train[['sex', 'pclass']]).toarray()
age = X_train[['age']]
# Feature layout: raw age first, then the one-hot columns. Missing ages are
# still NaN at this stage.
feat = numpy.hstack([age, cats])
feat[:10]

matrix([[31.,  1.,  0.,  0.,  1.,  0.],
        [28.,  0.,  1.,  0.,  0.,  1.],
        [21.,  0.,  1.,  1.,  0.,  0.],
        [28.,  0.,  1.,  0.,  0.,  1.],
        [nan,  0.,  1.,  0.,  1.,  0.],
        [80.,  0.,  1.,  1.,  0.,  0.],
        [39.,  0.,  1.,  1.,  0.,  0.],
        [12.,  1.,  0.,  0.,  1.,  0.],
        [nan,  0.,  1.,  1.,  0.,  0.],
        [31.,  1.,  0.,  0.,  1.,  0.]])


# Compare the number of rows with and without missing ages.
age_column = df[['age']]
age_column.shape, age_column.dropna().shape

((1309, 1), (1046, 1))


from sklearn.impute import SimpleImputer
# Fill the missing ages with the imputer's default settings; the fill value
# is learned from the training ages only.
imp = SimpleImputer()
age_train = X_train[['age']]
new_age = imp.fit_transform(age_train)
# Rebuild the feature matrix: imputed age first, then the one-hot columns
# computed earlier.
feat = numpy.hstack((new_age, cats))
feat[:10]

matrix([[31.        ,  1.        ,  0.        ,  0.        ,  1.        ,
          0.        ],
        [28.        ,  0.        ,  1.        ,  0.        ,  0.        ,
          1.        ],
        [21.        ,  0.        ,  1.        ,  1.        ,  0.        ,
          0.        ],
        [28.        ,  0.        ,  1.        ,  0.        ,  0.        ,
          1.        ],
        [29.60563707,  0.        ,  1.        ,  0.        ,  1.        ,
          0.        ],
        [80.        ,  0.        ,  1.        ,  1.        ,  0.        ,
          0.        ],
        [39.        ,  0.        ,  1.        ,  1.        ,  0.        ,
          0.        ],
        [12.        ,  1.        ,  0.        ,  0.        ,  1.        ,
          0.        ],
        [29.60563707,  0.        ,  1.        ,  1.        ,  0.        ,
          0.        ],
        [31.        ,  1.        ,  0.        ,  0.        ,  1.        ,
          0.        ]])


from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees trained on the hand-built feature matrix
# (imputed age + one-hot columns).
rf = RandomForestClassifier(n_estimators=100)
# fit() is the last expression of the cell, so the notebook displays the
# fitted estimator.
rf.fit(feat, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


# Build the test features with the transformers already fitted on the
# training set (transform only -- nothing is re-fitted on the test data).
cats_test = one.transform(X_test[['sex', 'pclass']])
age_test = imp.transform(X_test[['age']])
# toarray() gives a plain ndarray; todense() would produce numpy.matrix,
# which recent scikit-learn versions reject in predict().
feat_test = numpy.hstack([age_test, cats_test.toarray()])


from sklearn.metrics import confusion_matrix

# Confusion matrix on the held-out set: rows are the true classes, columns
# the predicted ones.
pred = rf.predict(feat_test)
confusion_matrix(y_test, pred)

array([[177,  25],
       [ 49,  77]], dtype=int64)


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Same preprocessing as the manual steps, expressed declaratively: each
# entry is (name, transformer, columns it applies to). The outputs are
# concatenated in declaration order, i.e. one-hot columns first, age last.
preprocessing = ColumnTransformer([
    ('one', OneHotEncoder(), ['sex', 'pclass']),
    ('imp', SimpleImputer(), ['age']),
])
pipe = Pipeline([('cats', preprocessing)])
pipe.fit(X_train)
pipe.transform(X_test)

array([[ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
        30.        ],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        29.60563707],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        29.60563707],
       ...,
       [ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        25.        ],
       [ 1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        53.        ],
       [ 1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        17.        ]])


# Preprocessing followed by the classifier, so the whole chain is trained
# and applied through a single object.
preprocessing = ColumnTransformer([
    ('one', OneHotEncoder(), ['sex', 'pclass']),
    ('imp', SimpleImputer(), ['age']),
])
forest = RandomForestClassifier(n_estimators=100)
pipe = Pipeline([('cats', preprocessing), ('rf', forest)])

pipe.fit(X_train, y_train)
pipe_pred = pipe.predict(X_test)
confusion_matrix(y_test, pipe_pred)

array([[174,  28],
       [ 47,  79]], dtype=int64)


# Pair every feature name with its importance in the fitted forest (the last
# step of the pipeline).
# The pipeline's ColumnTransformer concatenates its outputs in declaration
# order ('one' then 'imp'), so the one-hot columns come first and age is the
# LAST column; the labels must follow that order to line up with
# feature_importances_.
cols = list(names) + ['age']
list(zip(cols, pipe.steps[-1][1].feature_importances_))

[('age', 0.2279931985187704),
 ('x0_female', 0.21516489235386238),
 ('x0_male', 0.04301082695552079),
 ('x1_1', 0.019360189479876846),
 ('x1_2', 0.06817992681764856),
 ('x1_3', 0.426290965874321)]


# Sample of distinct ticket values that contain a space, i.e. tickets made of
# several tokens. Set iteration order is arbitrary, so the sample may vary.
tickets_with_space = [ticket for ticket in set(df.ticket) if ' ' in ticket]
tickets_with_space[:10]

['SOTON/O.Q. 3101310',
 'A/4 48873',
 'STON/O2. 3101283',
 'PC 17611',
 'CA 31352',
 'C 17368',
 'A/5 3594',
 'W./C. 6608',
 'C.A. 34050',
 'C.A. 24580']


# New split that also keeps the free-text ticket column.
feature_columns = ["age", "sex", "pclass", "ticket"]
X = df[feature_columns]
y = df["survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y)


from sklearn.feature_extraction.text import CountVectorizer

# First attempt: plug CountVectorizer directly into the ColumnTransformer.
# Selecting ['ticket'] with a list hands the vectorizer a 2-D table rather
# than a 1-D sequence of strings, which is presumably why the number of rows
# it returns does not match the other transformers and fit() fails.
column_steps = [
    ('one', OneHotEncoder(), ['sex', 'pclass']),
    ('imp', SimpleImputer(), ['age']),
    ('bow', CountVectorizer(), ['ticket']),
]
pipe = Pipeline([('cats', ColumnTransformer(column_steps))])

# The failure is expected here: show the message instead of stopping.
try:
    pipe.fit(X_train)
except ValueError as err:
    print(err)

all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 981 and the array at index 2 has size 1


# Applied to the 1-D ticket column (a Series rather than a one-column table),
# the vectorizer works and returns one row per training sample.
CountVectorizer().fit_transform(X_train['ticket'])

<981x756 sparse matrix of type '<class 'numpy.int64'>'
	with 1170 stored elements in Compressed Sparse Row format>


from papierstat.mltricks import TextVectorizerTransformer

# Second attempt: wrap the vectorizer in TextVectorizerTransformer so it can
# be used inside the ColumnTransformer with a list column selector.
column_steps = [
    ('one', OneHotEncoder(), ['sex', 'pclass']),
    ('imp', SimpleImputer(), ['age']),
    ('bow', TextVectorizerTransformer(CountVectorizer()), ['ticket']),
]
pipe = Pipeline([('cats', ColumnTransformer(column_steps))])

pipe.fit(X_train)

C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\utils\deprecation.py:144: FutureWarning: The sklearn.cluster.k_means_ module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.cluster. Anything that cannot be imported from sklearn.cluster is now part of the private API.
  warnings.warn(message, FutureWarning)

Pipeline(memory=None,
         steps=[('cats',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('one',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['sex', 'pclass']),
                                                 ('imp',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_...
                                                  TextVectorizerTransformer(estimator=CountVectorizer(analyzer='word',
                                                                                                      binary=False,
                                                                                                      decode_error='strict',
                                                                                                      dtype=<class 'numpy.int64'>,
                                                                                                      encoding='utf-8',
                                                                                                      input='content',
                                                                                                      lowercase=True,
                                                                                                      max_df=1.0,
                                                                                                      max_features=None,
                                                                                                      min_df=1,
                                                                                                      ngram_range=(1,
                                                                                                                   1),
                                                                                                      preprocessor=None,
                                                                                                      stop_words=None,
                                                                                                      strip_accents=None,
                                                                                                      token_pattern='(?u)\\b\\w\\w+\\b',
                                                                                                      tokenizer=None,
                                                                                                      vocabulary=None)),
                                                  ['ticket'])],
                                   verbose=False))],
         verbose=False)


# Apply the fitted preprocessing to the test set; the result is sparse
# because of the bag-of-words columns.
pipe.transform(X_test)

<328x762 sparse matrix of type '<class 'numpy.float64'>'
	with 1174 stored elements in Compressed Sparse Row format>


# Final model: categorical, numeric and text preprocessing followed by a
# random forest, all trained through a single pipeline.
preprocessing = ColumnTransformer([
    ('one', OneHotEncoder(), ['sex', 'pclass']),
    ('imp', SimpleImputer(), ['age']),
    ('bow', TextVectorizerTransformer(CountVectorizer()), ['ticket']),
])
forest = RandomForestClassifier(n_estimators=100)
pipe = Pipeline([('cats', preprocessing), ('rf', forest)])

pipe.fit(X_train, y_train)
pipe_pred = pipe.predict(X_test)
confusion_matrix(y_test, pipe_pred)

array([[191,  14],
       [ 35,  88]], dtype=int64)

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest
0	1	1	Allen, Miss. Elisabeth Walton	female	29.00	0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO
1	1	1	Allison, Master. Hudson Trevor	male	0.92	1	2	113781	151.5500	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON

	row.names	pclass	survived	name	age	embarked	home.dest	room	ticket	boat	sex
0	1	1st	1	Allen, Miss Elisabeth Walton	29.0	Southampton	St Louis, MO	B-5	24160 L221	2	female
1	2	1st	0	Allison, Miss Helen Loraine	2.0	Southampton	Montreal, PQ / Chesterville, ON	C26	NaN	NaN	female

Machine learning avec des catégories et du texte

Les données

Premier modèle de prédiction

Utiliser d'autres variables