Source code for ensae_projects.hackathon.perf2018

"""
Compute the performance for the hackathon 2018.


:githublink:`%|py|5`
"""
import os
import time
import pandas
import numpy
from PIL import Image
from lightmlrestapi.mlapp.mlstorage import MLStorage
try:
    from ..hackathon.image_helper import enumerate_image_class
except (ImportError, ValueError):
    from ensae_projects.hackathon.image_helper import enumerate_image_class


[docs]class MLStoragePerf2018: """ Computes the performances the a hackathon. :githublink:`%|py|20` """
[docs] def __init__(self, storage, examples, cache_file="cache_file.csv"): """ :param storage: storage location :param examples: deep learning models :githublink:`%|py|26` """ self._storage = self._load_ml_storage(storage) self._examples = examples self._cache_file = cache_file
[docs] def _load_ml_storage(self, root): """ Creates an instance of a `MLStorage <http://www.xavierdupre.fr/app/lightmlrestapi/helpsphinx/lightmlrestapi/mlapp/mlstorage.html # lightmlrestapi.mlapp.mlstorage.MLStorage>`_ based on a folder. :param root: folder :githublink:`%|py|39` """ if not os.path.exists(root): raise FileNotFoundError("Unable to find '{0}'".format(root)) stor = MLStorage(root) return stor
[docs] def _load_cached_performance(self, cache_file=None): """ Retrieves performances already computed. :param cached_file: file :githublink:`%|py|50` """ if cache_file is None: cache_file = self._cache_file if os.path.exists(cache_file): df = pandas.read_csv(cache_file, sep=",", encoding="utf-8") return df else: return None
[docs] def _save_performance(self, df, cache_file=None): """ Saves cached performance. :param df: dataframe :param cache_file: destination :githublink:`%|py|65` """ if cache_file is None: cache_file = self._cache_file df.to_csv(cache_file, sep=',', encoding='utf-8', index=False)
[docs] def compute_performance(self, use_cache=True, fLOG=None): """ Computes the performance for the not cached one if *use_cache* is True. :param use_cache: use cache :param fLOG: logging function :return: dataframe :githublink:`%|py|78` """ cache = None already = set() if use_cache: cache = self._load_cached_performance() if cache is not None: already = set(cache["name"]) rows = [] for i, name in enumerate(sorted(self._storage.enumerate_names())): if i % 30 == 0: print('.') if name not in already: t0 = time.perf_counter() if fLOG: fLOG( "[MLStoragePerf2018] compute perf for {0}: '{1}'".format(i, name)) res = self.compute_perf(name) # pylint: disable=E1111 if fLOG: fLOG( "[MLStoragePerf2018] Done for {0}: {1}".format(name, res)) if 'exc' in res: fLOG("[MLStoragePerf2018] exception for {0}: {1}".format( name, res['exc'])) res["name"] = name res["stime"] = os.stat(self._storage._folder, name).st_mtime t1 = time.perf_counter() - t0 res["time"] = t1 rows.append(res) already.add(name) df = pandas.DataFrame(rows) sc = list(sorted(df.columns)) df = df[sc] if cache is not None: df = pandas.concat([df, cache]) df = df.sort_values("name").copy() self._save_performance(df) return df
[docs] def compute_perf(self, name): """ Computes the performances for every image and one particular model. :githublink:`%|py|126` """ raise NotImplementedError()
[docs]class MLStoragePerf2018Image(MLStoragePerf2018): """ Overloads *compute_perf* for images. Example of use: :: from ensae_projects.hackathon.perf2018 import MLStoragePerf2018Image mstorage = "storage_brgm" mexample = "hackathon_test/sample_labelled_test" mpref = MLStoragePerf2018Image(mstorage, mexample) mres = mpref.compute_performance(fLOG=print, use_cache=True) mres = mres.sort_values("precision", ascending=False) print(mres) mbody = "<html><body><h1>Hackathon EY-ENSAE 2018 - BRGM</h1>\n" mcontent = "{0}{1}</body></html>".format(mbody, mres.to_html()) with open("brgm.html", "w", encoding="utf-8") as f: f.write(mcontent) :githublink:`%|py|148` """
[docs] def __init__(self, storage, examples, cache_file="cache_file.csv"): """ :param storage: storage location :param examples: deep learning models :githublink:`%|py|154` """ MLStoragePerf2018.__init__(self, storage, examples, cache_file)
[docs] def _label_mapping(self, subs): """ Computes the label based on a subfolder name. :githublink:`%|py|160` """ return 1 if subs.endswith('1') else 0
[docs] def compute_perf(self, name): """ Computes the performances for every image and one particular model. :githublink:`%|py|167` """ from keras import backend as K K.clear_session() # pylint: disable=E1101 folder = self._examples try: model = self._storage.load_model(name) vers = self._storage.call_version(name) exc = None except Exception as e: # pylint: disable=W0703 model = None exc = e vers = None rows = [] for img, sub in enumerate_image_class(folder): label = self._label_mapping(sub) obs = dict(image=img, sub=sub, label=label) if model is None: obs = {'exc': Exception("model is None")} pred = None else: X = numpy.array(Image.open(img)) try: pred = self._storage.call_predict( name, X, loaded_model=model) # print("*****",pred) except Exception as e: # pylint: disable=W0703 exc = e pred = None #print('------', e) if pred is None: pass else: if isinstance(pred, float): plabel = 1 if pred > 0.5 else 0 score = pred if isinstance(pred, list): pred = numpy.array(pred) if isinstance(pred, numpy.ndarray): pred = pred.ravel() if len(pred) == 1: plabel = 1 if pred[0] > 0.5 else 0 score = pred[0] elif len(pred) > 1: plabel = numpy.argmax(pred) score = pred[plabel] else: exc = ValueError("No prediction") else: exc = TypeError( "Prediction with the wrong type {0}".format(type(pred))) if exc: obs.update({'exc': exc}) else: obs.update(dict(predicted_label=plabel, score=score)) rows.append(obs) # print(rows) # break final = pandas.DataFrame(rows) columns = list(final.columns) if 'score' in columns: final["score"] = final["score"].fillna(0) final["predicted_label"] = final["predicted_label"].fillna(-1) final["correct"] = final["predicted_label"] == final["label"] final["correcti"] = 0 final.loc[final["correct"], "correcti"] = 1 res = {} if exc: res["exc"] = str(exc) if len(final) > 0: gr = final["correcti"].sum() / final.shape[0] res["precision"] = gr gr = final[["sub", "correcti", "correct"]] gr = gr.groupby("sub", as_index=False) gr = gr.agg({"correct": len, 'correcti': sum}) gr["ratio"] = gr["correcti"] / gr["correct"] for i in range(gr.shape[0]): res["p_%s" % gr.loc[i, "sub"]] = gr.loc[i, "ratio"] else: res["precision"] = 0 else: res = dict(exc=exc, precision=0) if vers is not None: res["version"] = vers return res
[docs]class MLStoragePerf2018TimeSeries(MLStoragePerf2018): """ Overloads *compute_perf* for timeseries. Example of use: :: from ensae_projects.hackathon.perf2018 import MLStoragePerf2018TimeSeries mstorage = "storage_microdon" mexample = "hackathon_test/sample_labelled_test" mpref = MLStoragePerf2018TimeSeries(mstorage, mexample) mres = mpref.compute_performance(fLOG=print, use_cache=True) mres = mres.sort_values("cor", ascending=False) print(mres) mbody = "<html><body><h1>Hackathon EY-ENSAE 2018 - Microdon</h1>\n" mcontent = "{0}{1}</body></html>".format(mbody, mres.to_html()) with open("brgm.html", "w", encoding="utf-8") as f: f.write(mcontent) :githublink:`%|py|281` """
[docs] def __init__(self, storage, examples, cache_file="cache_file.csv"): """ :param storage: storage location :param examples: deep learning models :githublink:`%|py|287` """ MLStoragePerf2018.__init__(self, storage, examples, cache_file) df = pandas.read_csv(examples) sub = df[["year", "week", "campaigns_campaign_id", "collecteur_id", "montant_total", "nb_dons_total", "nb_transac_total"]].copy() dsub = sub.fillna(0) gr = dsub.groupby( ["year", "week", "campaigns_campaign_id"], as_index=False).sum() gr["PARTICIPATION"] = gr["nb_dons_total"] / gr["nb_transac_total"] self._expected = gr
[docs] def _label_mapping(self, subs): """ Computes the label based on a subfolder name. :githublink:`%|py|303` """ return 1 if subs.endswith('1') else 0
[docs] def compute_perf(self, name): """ Computes the performances for every image and one particular model. :githublink:`%|py|310` """ try: model = self._storage.load_model(name) vers = self._storage.call_version(name) exc = None except Exception as e: # pylint: disable=W0703 model = None exc = e vers = None cols = ["year", "week", "campaigns_campaign_id", "PARTICIPATION", "nb_transac_total"] X = self._expected[cols] total = 0 total10 = 0 total100 = 0 total1000 = 0 n1 = 0 n10 = 0 n100 = 0 n1000 = 0 preds = [] exps = [] diffs = {} for i in range(0, X.shape[0]): week = X.iloc[i, 1] camp = X.iloc[i, 2] exp = X.iloc[i, 3] if exp > 1: continue nb_transac_total = X.iloc[i, 4] if nb_transac_total > 0: try: pred = self._storage.call_predict( name, (week, camp), loaded_model=model) except Exception as e: # pylint: disable=W0703 exc = e pred = 0 if isinstance(pred, list): if len(pred) == 1: pred = pred[0] else: exc = Exception( "Returned a list of value when expecting one") elif isinstance(pred, numpy.ndarray): pred = pred.ravel() if len(pred) == 1: pred = pred[0] else: exc = Exception( "Returned a list of value when expecting one") n1 += 1 exps.append(exp) preds.append(pred) diffs[week, camp] = abs(exp - pred) total += (pred - exp) ** 2 if nb_transac_total >= 10: total10 += (pred - exp) ** 2 n10 += 1 if nb_transac_total >= 100: total100 += (pred - exp) ** 2 n100 += 1 if nb_transac_total >= 1000: total1000 += (pred - exp) ** 2 n1000 += 1 res = {} if vers is not None: res["version"] = vers if exc is not None: res["exc"] = exc if n1 > 0: res["score"] = (total / n1) ** 0.5 res["score10"] = (total10 / n10) ** 0.5 res["score100"] = (total100 / n100) ** 0.5 res["score1000"] = (total1000 / n1000) ** 0.5 try: res["cor"] = numpy.corrcoef(numpy.array(preds), numpy.array(exps))[0, 1] except (AttributeError, KeyError): res["cor"] = numpy.nan try: res["pmin"] = numpy.min(preds) res["pmax"] = numpy.max(preds) except (KeyError, ValueError): res["pmin"] = numpy.nan res["pmax"] = numpy.nan resort = [(v, k) for k, v in diffs.items()] try: resort.sort() skip = False except ValueError: skip = True exc = Exception( "Unable to sort differences {0}".format(resort[0])) if not skip: last = resort[-1] res["worst"] = "{0}:{1}".format(last[1], last[0]) best = resort[0] res["best"] = "{0}:{1}".format(best[1], best[0]) return res
if __name__ == "__main__": mstorage = r'/home/jbr/hack35/' mexample = r'./sample_labelled_test' mpref = MLStoragePerf2018Image(mstorage, mexample) mres = mpref.compute_performance(fLOG=print, use_cache=True) mres = mres.sort_values("precision", ascending=False) print(mres) if 'exc' in mres.columns: print(list(mres['exc'])) mbody = "<html><body><h1>Hackathon EY-ENSAE 2018 - BRGM</h1>\n" mcontent = "{0}{1}</body></html>".format(mbody, mres.to_html()) from pyquickhelper.pandashelper.tblformat import df2rst with open("hackathon2018/brgm.rst", "w", encoding="utf-8") as f: f.write(df2rst(mres)) with open("hackathon2018/brgm.html", "w", encoding="utf-8") as f: f.write(mcontent)