Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2@file 

3@brief Metrics about regressions. 

4""" 

5import io 

6import numpy 

7import pandas 

8 

9 

def _pairs_to_dict(frame):
    """
    Converts a two-column dataframe (id, value) into a dictionary
    ``{id: value}``.

    @param      frame       dataframe whose rows unpack into ``(id, value)``
    @return                 dictionary ``{id: value}``

    Raises ``KeyError`` if the same id appears in more than one row.
    """
    res = {}
    for k, v in frame.itertuples(name=None, index=False):
        if k in res:
            raise KeyError("Key '{}' present at least twice.".format(k))
        res[k] = v
    return res


def l1_reg_max(exp, val, max_val=180, nomax=False, exc=True):
    """
    Implements a :epkg:`L1` scoring function which does not consider
    error above threshold *max_val*.

    @param      exp         expected values: list, dictionary ``{id: value}``,
                            :epkg:`numpy:array`, filename or stream
    @param      val         predicted values, same kind of container as *exp*
    @param      max_val     every value above *max_val* is replaced by *max_val*
                            before computing the differences
    @param      nomax       if True, removes every item whose expected value
                            is equal or above *max_val*, then computes the score
    @param      exc         only used with dictionaries, filenames or streams:
                            if True, raises an exception when the two sets do not
                            have the same size or when a key is missing
                            in *val*; if False, a missing key counts
                            as the maximal error (1)
    @return                 score, 0 if no item remains to be scored,
                            ``nan`` if a value in a dictionary cannot be compared
                            to *max_val*

    If ``max_val==180``, the function computes:

    .. math::

        E = \\frac{1}{n} \\sum_{i=1}^n \\frac{\\left| \\min (Y_i, 180) - \\min(f(X_i), 180) \\right|}{180}

    The computation is faster if :epkg:`numpy:array` are used
    (for *exp* and *val*). *exp* and *val* can be filenames or streams.
    In that case, the function expects to find two columns: id, value
    in both files or streams (no header, separator ``;``).

    The function raises ``ValueError`` if the dimensions do not match
    (see *exc* for dictionaries), ``KeyError`` if an id is duplicated
    in a file or a stream, ``TypeError`` if *exp* and *val* are not
    the same supported kind of container.
    """
    if isinstance(exp, numpy.ndarray) and isinstance(val, numpy.ndarray):
        if len(exp) != len(val):
            raise ValueError(
                "Dimension mismatch {0} != {1}".format(len(exp), len(val)))
        # A float array filled with the threshold keeps the clipped values
        # as floats, whatever the dtype of the inputs.
        an = numpy.full((len(exp),), max_val, dtype=float)
        mv = numpy.minimum(an, val)  # pylint: disable=E1111
        me = numpy.minimum(an, exp)  # pylint: disable=E1111
        if nomax:
            # The mask is computed once, before *me* is overwritten.
            keep = me < max_val  # pylint: disable=W0143
            mv = mv[keep]  # pylint: disable=E1136
            me = me[keep]  # pylint: disable=E1136
        if mv.size == 0:
            # Same convention as lists and dictionaries: nothing to
            # score gives 0 instead of the mean of an empty array (nan).
            return 0.0
        df = numpy.abs(mv - me) / max_val
        return df.mean()

    if isinstance(exp, dict) and isinstance(val, dict):
        if exc and len(exp) != len(val):
            number_common = len(set(exp) & set(val))
            raise ValueError(
                "Dimension mismatch {0} != {1} (#common={2})".format(
                    len(exp), len(val), number_common))
        r = 0.0
        nb = 0
        for k, e in exp.items():
            if k in val:
                v = val[k]
                try:
                    mv = min(v, max_val)
                    ev = min(e, max_val)
                except TypeError:
                    # A value which cannot be compared to the threshold
                    # invalidates the whole score.
                    return numpy.nan
                if nomax and ev >= max_val:
                    continue
                r += 1. * abs(mv - ev) / max_val
                nb += 1
            elif exc:
                raise ValueError("Missing key in prediction {0}".format(k))
            else:
                # A missing prediction is counted as the maximal error.
                r += 1.
                nb += 1
        return r / nb if nb > 0 else 0.0

    if isinstance(exp, (str, io.StringIO)) and isinstance(val, (str, io.StringIO)):
        # We expect filenames or streams, both are read before
        # any conversion takes place.
        d1 = pandas.read_csv(exp, header=None, sep=";")
        d2 = pandas.read_csv(val, header=None, sep=";")
        dd1 = _pairs_to_dict(d1)
        dd2 = _pairs_to_dict(d2)
        return l1_reg_max(dd1, dd2, max_val=max_val, nomax=nomax, exc=exc)

    if isinstance(exp, list) and isinstance(val, list):
        if len(exp) != len(val):
            raise ValueError(
                "Dimension mismatch {0} != {1}".format(len(exp), len(val)))
        r = 0.0
        nb = 0
        for e, v in zip(exp, val):
            ev = min(e, max_val)
            if nomax and ev >= max_val:
                continue
            mv = min(v, max_val)
            r += 1. * abs(mv - ev) / max_val
            nb += 1
        return r / nb if nb > 0 else 0.0

    raise TypeError(
        "Inconsistent types {0} != {1}".format(type(exp), type(val)))