Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2@file 

3@brief Metrics about regressions. 

4""" 

5import io 

6import numpy 

7import pandas 

8from sklearn.metrics import roc_auc_score 

9 

10 

def is_vector(a):
    """
    Checks whether an array should be handled as a vector.

    @param      a       object exposing a *shape* attribute (array)
    @return             True if *a* has exactly one dimension or
                        if its second dimension has length 1,
                        False otherwise
    """
    dims = a.shape
    if len(dims) == 1:
        return True
    # More than one dimension: only the length of the second one matters.
    return dims[1] == 1

16 

17 

def reshape(exp, val):
    """
    Brings the expected values and the predictions to compatible shapes.

    Lists are converted into arrays and dataframes are replaced by their
    values. If the expected values are a vector and the predictions are
    not, the expected values are read as column indices and expanded into
    a 0/1 matrix which has the shape of the predictions. In every other
    case where at least one of them is a vector, both are flattened.
    Two non-vector inputs are returned as they are.

    @param      exp     expected values (list, array or dataframe)
    @param      val     predictions (list, array or dataframe)
    @return             tuple *(exp, val)* of arrays

    Raises *TypeError* if one input is not an array after conversion,
    *ValueError* if one output still has two dimensions and a single column.
    """
    def as_array(data):
        "converts lists and dataframes, returns anything else unchanged"
        if isinstance(data, list):
            return numpy.array(data)
        if isinstance(data, pandas.DataFrame):
            return data.values
        return data

    val = as_array(val)
    exp = as_array(exp)

    if not isinstance(val, numpy.ndarray):
        raise TypeError("val is {0} not an array".format(type(val)))
    if not isinstance(exp, numpy.ndarray):
        raise TypeError("exp is {0} not an array".format(type(exp)))

    exp_is_vector = is_vector(exp)
    val_is_vector = is_vector(val)

    if exp_is_vector and not val_is_vector:
        # One label per row: switch on the matching column of a zero matrix.
        labels = exp.ravel()
        exp = numpy.zeros(val.shape)
        for row, label in enumerate(labels):
            exp[row, int(label)] = 1
    elif exp_is_vector or val_is_vector:
        # Either both are vectors or only the predictions are one.
        exp = exp.ravel()
        val = val.ravel()

    for name, data in (("exp", exp), ("val", val)):
        if len(data.shape) == 2 and data.shape[1] == 1:
            if name == "exp":
                raise ValueError("exp has two dimensions but one column")
            raise ValueError("val has two dimensions but one column")
    return exp, val

52 

53 

def roc_auc_score_micro(exp, val):
    """
    Micro-averaged AUC: calls
    `roc_auc_score <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html>`_
    with *average='micro'* after the inputs went through @see fn reshape.

    @param      exp     expected values
    @param      val     predictions
    @return             value returned by *roc_auc_score*
    """
    expected, predicted = reshape(exp, val)
    score = roc_auc_score(expected, predicted, average="micro")
    return score

61 

62 

def roc_auc_score_macro(exp, val):
    """
    Macro-averaged AUC: calls
    `roc_auc_score <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html>`_
    with *average='macro'* after the inputs went through @see fn reshape.

    @param      exp     expected values
    @param      val     predictions
    @return             value returned by *roc_auc_score*
    """
    expected, predicted = reshape(exp, val)
    score = roc_auc_score(expected, predicted, average="macro")
    return score

70 

71 

def multi_label_jaccard(exp, val, exc=True):
    """
    Applies to a multi-label classification problem.
    Computes the average Jaccard index between two sequences
    of sets of labels
    (see `Multi-label classification <https://en.wikipedia.org/wiki/Multi-label_classification>`_).

    @param      exp     list of tuple or list of set or filename or streams (comma separated values) or dict
    @param      val     list of tuple or list of set or filename or streams (comma separated values) or dict
    @param      exc     raises an exception if not enough submitted items
    @return             score

    .. math::

        E = \\frac{1}{n} \\sum_{i=1}^n \\frac{|C_i \\cap P_i|}{|C_i \\cup P_i|}

    Both inputs must be of the same kind:

    * two filenames or streams: each is read with *pandas.read_csv*
      (no header, separator ``;``) and must contain two columns,
      a unique key and the labels, the function then behaves as if
      two dictionaries were given,
    * two dictionaries *key -> labels*: the sum runs over the keys of
      *exp*, a key missing from *val* raises an exception if *exc* is
      True and counts as 0 otherwise,
    * two lists of the same length, compared item by item.

    Labels are converted into a set: a string is split on ``,``,
    a number becomes a set holding its string representation,
    any other iterable is given to *set*.
    When both sets of a pair are empty, the pair is considered as a
    perfect match and scores 1 (the ratio is otherwise undefined).

    Raises *ValueError* if the sizes differ (dictionaries only if *exc*
    is True) or a key is missing while *exc* is True, *KeyError* if a file
    or stream contains the same key twice, *TypeError* if the types of
    *exp* and *val* are not supported or inconsistent,
    *ZeroDivisionError* if *exp* is empty.
    """
    def to_set(v):
        "as a set"
        if isinstance(v, set):
            return v
        if isinstance(v, str):
            return set(v.split(','))
        if isinstance(v, (float, int)):
            return {str(v)}
        return set(v)

    def jaccard(e, v):
        "Jaccard index between two collections of labels"
        es = to_set(e)
        vs = to_set(v)
        union = es | vs
        if not union:
            # Both sets are empty, hence identical: return a perfect score
            # instead of dividing by zero.
            return 1.0
        return float(len(es & vs)) / len(union)

    def to_dict(df):
        "converts a two-column dataframe into a dictionary, keys must be unique"
        res = {}
        for k, v in df.itertuples(name=None, index=False):
            if k in res:
                raise KeyError("Key '{}' present at least twice.".format(k))
            res[k] = v
        return res

    if isinstance(exp, (str, io.StringIO)) and isinstance(val, (str, io.StringIO)):
        # Files or streams: both are read before any key is checked.
        d1 = pandas.read_csv(exp, header=None, sep=";")
        d2 = pandas.read_csv(val, header=None, sep=";")
        return multi_label_jaccard(to_dict(d1), to_dict(d2), exc=exc)

    if isinstance(exp, dict) and isinstance(val, dict):
        if exc and len(exp) != len(val):
            number_common = len(set(exp) & set(val))
            raise ValueError(
                "Dimension mismatch {0} != {1} (#common={2})".format(len(exp), len(val), number_common))
        r = 0.0
        for k, e in exp.items():
            if k in val:
                r += jaccard(e, val[k])
            elif exc:
                raise ValueError("Missing key in prediction {0}".format(k))
            # Otherwise the missing prediction contributes 0 to the sum.
        return r / len(exp)

    if isinstance(exp, list) and isinstance(val, list):
        if len(exp) != len(val):
            raise ValueError(
                "Dimension mismatch {0} != {1}. Use product_id and only_exp.".format(len(exp), len(val)))
        r = 0.0
        for e, v in zip(exp, val):
            r += jaccard(e, v)
        return r / len(exp)

    raise TypeError(
        "Inconsistent types {0} != {1}".format(type(exp), type(val)))