Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
@file
@brief Metrics about classification (ROC AUC, multi-label Jaccard index).
4"""
5import io
6import numpy
7import pandas
8from sklearn.metrics import roc_auc_score
def is_vector(a):
    """
    Tells whether an array should be handled as a vector.

    @param      a       object exposing a *shape* tuple (a numpy array for example)
    @return             True if *a* has exactly one dimension or if its
                        second dimension has length 1, False otherwise
    """
    dims = a.shape
    if len(dims) == 1:
        return True
    # Any other rank is decided by the length of the second axis only.
    return dims[1] == 1
def reshape(exp, val):
    """
    Converts the expected values and the predictions into numpy arrays
    with compatible shapes.

    @param      exp     expected values: list, DataFrame or numpy array
    @param      val     predictions: list, DataFrame or numpy array
    @return             tuple *(exp, val)* of numpy arrays

    Lists are converted with ``numpy.array`` and DataFrames are replaced
    by their ``values``. When the predictions are a matrix and the expected
    values a vector, the expected values are expanded into a 0/1 indicator
    matrix having the shape of the predictions. When the predictions are
    a vector, both arrays are flattened.

    Raises *TypeError* if an input is not a numpy array after conversion
    and *ValueError* if one of the resulting arrays is still two-dimensional
    with a single column.
    """
    def as_array(data):
        "converts lists and dataframes, returns anything else unchanged"
        if isinstance(data, list):
            return numpy.array(data)
        if isinstance(data, pandas.DataFrame):
            return data.values
        return data

    val = as_array(val)
    exp = as_array(exp)
    if not isinstance(val, numpy.ndarray):
        raise TypeError("val is {0} not an array".format(type(val)))
    if not isinstance(exp, numpy.ndarray):
        raise TypeError("exp is {0} not an array".format(type(exp)))

    exp_is_vec = is_vector(exp)
    val_is_vec = is_vector(val)
    if val_is_vec:
        # Predictions are a vector: both sides are flattened,
        # whatever the shape of the expected values.
        exp, val = exp.ravel(), val.ravel()
    elif exp_is_vec:
        # Predictions are a matrix, expected values are a vector of
        # column indices: build the indicator matrix row by row.
        labels = exp.ravel()
        exp = numpy.zeros(val.shape)
        for row, label in enumerate(labels):
            exp[row, int(label)] = 1

    if len(exp.shape) == 2 and exp.shape[1] == 1:
        raise ValueError("exp has two dimensions but one column")
    if len(val.shape) == 2 and val.shape[1] == 1:
        raise ValueError("val has two dimensions but one column")
    return exp, val
def roc_auc_score_micro(exp, val):
    """
    Computes `roc_auc_score <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html>`_
    with *average='micro'*.

    @param      exp     expected values, first argument given to *reshape*
    @param      val     predictions, second argument given to *reshape*
    @return             value returned by *roc_auc_score*
    """
    # reshape returns the pair (exp, val) in the order roc_auc_score expects.
    return roc_auc_score(*reshape(exp, val), average="micro")
def roc_auc_score_macro(exp, val):
    """
    Computes `roc_auc_score <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html>`_
    with *average='macro'*.

    @param      exp     expected values, first argument given to *reshape*
    @param      val     predictions, second argument given to *reshape*
    @return             value returned by *roc_auc_score*
    """
    # reshape returns the pair (exp, val) in the order roc_auc_score expects.
    return roc_auc_score(*reshape(exp, val), average="macro")
def multi_label_jaccard(exp, val, exc=True):
    """
    Applies to a multi-label classification problem.
    Computes the average Jaccard index between two sequences
    of sets of labels
    (see `Multi-label classification <https://en.wikipedia.org/wiki/Multi-label_classification>`_).

    @param      exp     list of tuple or list of set or filename or streams (comma separated values) or dict
    @param      val     list of tuple or list of set or filename or streams (comma separated values) or dict
    @param      exc     raises an exception if not enough submitted items
    @return             score

    .. math::

        E = \\frac{1}{n} \\sum_{i=1}^n \\frac{|C_i \\cap P_i|}{|C_i \\cup P_i|}

    Both arguments must be of the same kind:

    * two filenames or streams: each one is read with *pandas* as a file
      without header made of two columns separated by ``;``, a key and
      the labels; a key appearing twice raises *KeyError*,
    * two dictionaries ``{key: labels}``: items are matched by key; with
      *exc=False*, a key missing from *val* counts for a null score,
    * two lists of the same length: items are matched by position.

    Labels can be a set, a string (labels separated by commas), a number
    (converted into a string) or any iterable. When both sets of labels of
    an item are empty, they are considered identical and the item
    scores 1.

    The function raises *ValueError* if the sizes do not match or if a key
    is missing (dictionaries, only if *exc* is True), *TypeError* if *exp*
    and *val* are not of the same supported kind, and *ZeroDivisionError*
    if *exp* is empty.
    """
    def to_set(v):
        "converts one item into a set of labels"
        if isinstance(v, set):
            return v
        if isinstance(v, str):
            return set(v.split(','))
        if isinstance(v, (float, int)):
            return {str(v)}
        return set(v)

    def jaccard(e, v):
        "Jaccard index of the sets of labels of one item"
        es = to_set(e)
        vs = to_set(v)
        union = es | vs
        if not union:
            # Two empty sets are identical: perfect score rather than 0 / 0.
            return 1.0
        return float(len(es & vs)) / len(union)

    def load(source):
        "reads a file or a stream into a dictionary { key: labels }"
        table = pandas.read_csv(source, header=None, sep=";")
        res = {}
        for k, v in table.itertuples(name=None, index=False):
            if k in res:
                raise KeyError("Key '{}' present at least twice.".format(k))
            res[k] = v
        return res

    if isinstance(exp, (str, io.StringIO)) and isinstance(val, (str, io.StringIO)):
        # Files or streams.
        return multi_label_jaccard(load(exp), load(val), exc=exc)

    if isinstance(exp, dict) and isinstance(val, dict):
        if exc and len(exp) != len(val):
            number_common = len(set(exp) & set(val))
            raise ValueError(
                "Dimension mismatch {0} != {1} (#common={2})".format(len(exp), len(val), number_common))
        r = 0.0
        for k, e in exp.items():
            if k in val:
                r += jaccard(e, val[k])
            elif exc:
                raise ValueError("Missing key in prediction {0}".format(k))
            # Otherwise the missing prediction contributes 0 to the average.
        return r / len(exp)

    if isinstance(exp, list) and isinstance(val, list):
        if len(exp) != len(val):
            raise ValueError(
                "Dimension mismatch {0} != {1}. Use product_id and only_exp.".format(len(exp), len(val)))
        r = 0.0
        for e, v in zip(exp, val):
            r += jaccard(e, v)
        return r / len(exp)

    raise TypeError(
        "Inconsistent types {0} != {1}".format(type(exp), type(val)))