2A.i - Serialization - correction

Serialization of objects, dataframes in particular. Speed measurements.

In [1]:
from jyquickhelper import add_notebook_menu
add_notebook_menu()

Exercise 1: serializing a large dataframe

Step 1: build a large dataframe of random numbers

In [2]:
import random
values = [[random.random() for _ in range(20)] for _ in range(100000)]
col = ["col%d" % i for i in range(20)]
In [3]:
import pandas
df = pandas.DataFrame(values, columns=col)

Step 2: save the dataframe in two formats, text and serialized (binary)

In [4]:
df.to_csv("df_text.txt", sep="\t")
In [5]:
df.to_pickle("df_text.bin")
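Before timing anything, it is worth comparing what the two formats cost on disk: the text format prints every float as characters, while the binary format stores 8 bytes per float. A minimal sketch on a smaller dataframe (file names here are illustrative, not the ones used above):

```python
import os
import random
import pandas

# Smaller dataframe than in the exercise, same structure.
values = [[random.random() for _ in range(20)] for _ in range(1000)]
df = pandas.DataFrame(values, columns=["col%d" % i for i in range(20)])

df.to_csv("df_demo.txt", sep="\t")   # text: ~18 characters per float
df.to_pickle("df_demo.bin")          # binary: 8 bytes per float plus overhead

size_txt = os.path.getsize("df_demo.txt")
size_bin = os.path.getsize("df_demo.bin")
print("text: %d bytes, binary: %d bytes" % (size_txt, size_bin))
```

The text file is noticeably larger, which already hints at the loading-time gap measured below.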

Step 3: measure the loading time

In [6]:
%timeit pandas.read_csv("df_text.txt", sep="\t")
1.14 s ± 46.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [7]:
%timeit pandas.read_pickle("df_text.bin")
26.9 ms ± 1.06 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
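Pickle wins by a factor of roughly 40 here because reading the binary file is mostly a memory copy, whereas read_csv must parse every float from text. The measurement can be reproduced without %timeit using time.perf_counter; a sketch on a smaller dataframe (sizes and file names are illustrative):

```python
import random
import time
import pandas

values = [[random.random() for _ in range(20)] for _ in range(10000)]
df = pandas.DataFrame(values, columns=["col%d" % i for i in range(20)])
df.to_csv("df_demo.txt", sep="\t")
df.to_pickle("df_demo.bin")

def best_time(fct, n=5):
    # Best wall-clock time over n runs, roughly what %timeit reports.
    times = []
    for _ in range(n):
        begin = time.perf_counter()
        fct()
        times.append(time.perf_counter() - begin)
    return min(times)

t_csv = best_time(lambda: pandas.read_csv("df_demo.txt", sep="\t"))
t_bin = best_time(lambda: pandas.read_pickle("df_demo.bin"))
print("csv: %.4f s, pickle: %.4f s" % (t_csv, t_bin))
```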

Exercise 2: json

A first attempt.

In [8]:
obj = dict(a=[50, "r"], gg=(5, 't'))

import jsonpickle
frozen = jsonpickle.encode(obj)
frozen
Out[8]:
'{"a": [50, "r"], "gg": {"py/tuple": [5, "t"]}}'

This module is equivalent to the json module on the standard Python types (lists, dictionaries, numbers, ...), but the json module does not work on dataframes.

In [9]:
frozen = jsonpickle.encode(df)
In [10]:
len(frozen), type(frozen), frozen[:55]
Out[10]:
(22586357, str, '{"py/object": "pandas.core.frame.DataFrame", "py/state"')

The to_json method would also give a satisfactory result, but it cannot be applied to a machine learning model produced by scikit-learn.

In [11]:
def to_json(obj, filename):
    frozen = jsonpickle.encode(obj)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(frozen)
        
def read_json(filename):
    with open(filename, "r", encoding="utf-8") as f:
        enc = f.read()
    return jsonpickle.decode(enc)
In [12]:
to_json(df, "df_text.json")
In [13]:
try:
    df = read_json("df_text.json")
except Exception as e:
    print(e)
maximum recursion depth exceeded while calling a Python object

Clearly, this does not work on DataFrames. One would have to draw inspiration from the numpyson module.
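For a plain dataframe, pandas' own to_json / read_json (mentioned above) does handle the round trip. A minimal sketch, assuming the "split" orient; the column names are illustrative:

```python
import io
import pandas

df = pandas.DataFrame({"col0": [0.1, 0.2], "col1": [1.5, 2.5]})

# "split" keeps columns, index and data as separate JSON fields.
js = df.to_json(orient="split")
df2 = pandas.read_json(io.StringIO(js), orient="split")

print(df2.equals(df))
```

This only covers dataframes, though; it says nothing about an arbitrary scikit-learn model, which is the subject of the next section.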

json + scikit-learn

Issue 147 explains the purpose of the next two lines.

In [14]:
import jsonpickle.ext.numpy as jsonpickle_numpy
jsonpickle_numpy.register_handlers()
In [15]:
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target
In [16]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X, y)
Out[16]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
In [17]:
clf.predict_proba([[0.1, 0.2]])
Out[17]:
array([[ 0.49942162,  0.45148936,  0.04908902]])
In [18]:
to_json(clf, "logreg.json")
In [19]:
clf2 = read_json("logreg.json")
In [20]:
clf2.predict_proba([[0.1, 0.2]])
Out[20]:
array([[ 0.49942162,  0.45148936,  0.04908902]])
In [21]:
with open("logreg.json", "r") as f:
    content = f.read()
content
Out[21]:
'{"py/object": "sklearn.linear_model.logistic.LogisticRegression", "py/state": {"C": 1.0, "_sklearn_version": "0.19.1", "class_weight": null, "classes_": {"py/object": "numpy.ndarray", "dtype": "int32", "values": [0, 1, 2]}, "coef_": {"py/object": "numpy.ndarray", "base": {"py/object": "numpy.ndarray", "dtype": "float64", "values": [[-2.4957928882125406, 4.010113006761804, 0.8171393204472739], [0.49709450754556295, -1.6338022222456163, 1.225435620375353], [1.1592140429099165, -1.7773656810121667, -2.2251611854055735]], "order": "F"}, "strides": [8, 24], "shape": [3, 2], "dtype": "float64", "values": [[-2.4957928882125406, 4.010113006761804], [0.49709450754556295, -1.6338022222456163], [1.1592140429099165, -1.7773656810121667]]}, "dual": false, "fit_intercept": true, "intercept_": {"py/object": "numpy.ndarray", "dtype": "float64", "values": [0.8171393204472739, 1.225435620375353, -2.2251611854055735]}, "intercept_scaling": 1, "max_iter": 100, "multi_class": "ovr", "n_iter_": {"py/object": "numpy.ndarray", "dtype": "int32", "values": [8]}, "n_jobs": 1, "penalty": "l2", "random_state": null, "solver": "liblinear", "tol": 0.0001, "verbose": 0, "warm_start": false}}'
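For completeness: outside of JSON, the standard pickle module remains the usual way to serialize a scikit-learn model, using the same mechanism as to_pickle for dataframes, at the cost of a binary, Python-only format. A minimal sketch:

```python
import pickle
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
clf = LogisticRegression().fit(iris.data[:, :2], iris.target)

blob = pickle.dumps(clf)      # binary, not human-readable, Python-only
clf2 = pickle.loads(blob)

# The restored model predicts exactly the same classes.
same = (clf2.predict(iris.data[:, :2]) == clf.predict(iris.data[:, :2])).all()
print(same)
```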