Source code for lightmlboard.metrics.regression_custom

"""
Metrics about regressions.


:githublink:`%|py|5`
"""
import io
import numpy
import pandas


def _pairs_to_dict(frame):
    """
    Converts a two-column dataframe (id, value) into a dictionary
    ``{id: value}``.

    :param frame: dataframe whose rows are ``(id, value)`` pairs
    :return: dictionary ``{id: value}``
    :raises KeyError: if the same id appears on more than one row
    """
    pairs = {}
    for key, value in frame.itertuples(name=None, index=False):
        if key in pairs:
            raise KeyError("Key '{}' present at least twice.".format(key))
        pairs[key] = value
    return pairs


def l1_reg_max(exp, val, max_val=180, nomax=False, exc=True):
    """
    Implements a :epkg:`L1` scoring function which does not
    consider error above threshold *max_val*.

    :param exp: expected values, list of values, :epkg:`numpy:array`,
        dictionary ``{id: value}``, filename or stream
    :param val: predicted values, same kind of container as *exp*
    :param max_val: every value above *max_val* is replaced
        by *max_val* before computing the differences
    :param nomax: if True, removes every item whose expected value is
        equal or above *max_val*, then computes the score
    :param exc: only used with dictionaries (and files or streams):
        if True, raises an exception if the number of submitted items
        differs from the number of expected items or if a key is
        missing, if False, a missing prediction counts for
        the maximum error (1)
    :return: score, 0 if there is no item to score,
        *nan* if a value in a dictionary cannot be compared to *max_val*
    :raises ValueError: if dimensions do not match or a key is missing
        (see *exc*)
    :raises KeyError: if an id appears twice in a file or a stream
    :raises TypeError: if *exp* and *val* are not the same
        kind of supported container

    If ``max_val==180``, the function computes:

    .. math::

        E = \\frac{1}{n} \\sum_{i=1}^n \\frac{\\left| \\min (Y_i, 180) - \\min(f(X_i), 180) \\right|}{180}

    The computation is faster if :epkg:`numpy:array` are used
    (for *exp* and *val*). *exp* and *val* can be filenames or streams.
    In that case, the function expects to find two columns: id, value
    in both files or streams, separated by ``;``, without any header.

    :githublink:`%|py|34`
    """
    if isinstance(exp, numpy.ndarray) and isinstance(val, numpy.ndarray):
        if len(exp) != len(val):
            raise ValueError(
                "Dimension mismatch {0} != {1}".format(len(exp), len(val)))
        # A float threshold vector keeps the differences in floating point
        # whatever the dtype of the inputs (no unsigned wrap-around).
        cap = numpy.full((len(exp),), max_val, dtype=numpy.float64)
        mv = numpy.minimum(cap, val)  # pylint: disable=E1111
        me = numpy.minimum(cap, exp)  # pylint: disable=E1111
        if nomax:
            keep = me < max_val  # pylint: disable=W0143
            mv = mv[keep]  # pylint: disable=E1136
            me = me[keep]  # pylint: disable=E1136
        if me.size == 0:
            # Nothing left to score: same answer as the list and
            # dictionary branches instead of the mean of an empty array.
            return 0.0
        return (numpy.abs(mv - me) / max_val).mean()

    if isinstance(exp, dict) and isinstance(val, dict):
        if exc and len(exp) != len(val):
            number_common = len(set(exp) & set(val))
            raise ValueError(
                "Dimension mismatch {0} != {1} (#common={2})".format(
                    len(exp), len(val), number_common))
        total = 0.0
        nb = 0
        for key, expected in exp.items():
            if key not in val:
                if exc:
                    raise ValueError(
                        "Missing key in prediction {0}".format(key))
                # A missing prediction is charged the maximum error.
                total += 1.
                nb += 1
                continue
            try:
                mv = min(val[key], max_val)
                ev = min(expected, max_val)
            except TypeError:
                # Values which cannot be compared to max_val
                # make the whole score undefined.
                return numpy.nan
            if nomax and ev >= max_val:
                continue
            total += 1. * abs(mv - ev) / max_val
            nb += 1
        return total / nb if nb > 0 else 0.0

    if isinstance(exp, (str, io.StringIO)) and isinstance(val, (str, io.StringIO)):
        # We expect filenames or streams, both are read before any check.
        d1 = pandas.read_csv(exp, header=None, sep=";")
        d2 = pandas.read_csv(val, header=None, sep=";")
        return l1_reg_max(_pairs_to_dict(d1), _pairs_to_dict(d2),
                          max_val=max_val, nomax=nomax, exc=exc)

    if isinstance(exp, list) and isinstance(val, list):
        if len(exp) != len(val):
            raise ValueError(
                "Dimension mismatch {0} != {1}".format(len(exp), len(val)))
        total = 0.0
        nb = 0
        for expected, predicted in zip(exp, val):
            ev = min(expected, max_val)
            if nomax and ev >= max_val:
                continue
            mv = min(predicted, max_val)
            total += 1. * abs(mv - ev) / max_val
            nb += 1
        return total / nb if nb > 0 else 0.0

    raise TypeError(
        "Inconsistent types {0} != {1}".format(type(exp), type(val)))