Coverage for src/ensae_teaching_cs/homeblog/table_formula_stat.py: 56%
63 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
# -*- coding: utf8 -*-
"""
@file
@brief Contains TableFormulaStat.
"""
class _TableFormulaStat:
    """
    Contains various statistical functions.

    The class does not define the members its methods rely on
    (``values``, ``header``, ``select``, ``_interpret_row``,
    ``_private_getclass`` and ``__len__``): they must be provided by
    the class it is combined with.

    ::

        table = TableFormula ("sum_y#1#1#1#1#1#1#1#1#1#1#1".replace(" ", "\\t").replace("#","\\n"))
        gini = table.Gini (lambda v : v["sum_y"])
        print (gini) # expects a value close to 0 (all incomes are equal)

        table = TableFormula ("sum_y#1#1#1#1#1#1#1#1#1#1#1#5#10".replace(" ", "\\t").replace("#","\\n"))
        gini = table.Gini (lambda v : v["sum_y"])
        print (gini) # expects a value strictly between 0 and 1

    """

    def GiniCurve(self, functionY, functionX=None, isXdx=False):
        """
        Computes the Gini curve, takes the following parameters.

        @param      functionY       revenues
        @param      functionX       sum of persons having an income below Y
                                    (or having Y if isXdx is True), if None,
                                    the row position (starting at 1) is used
        @param      isXdx           number of persons equal to Y (True) or inferior (False),
                                    if False, (X, Y) couples are sorted before
                                    being accumulated, if True, they are kept
                                    in the order of the rows
        @return                     a curve (x, Gini(x)) built with the class
                                    returned by ``_private_getclass``

        The method raises ``ValueError`` if one x or y is negative.
        A ``ZeroDivisionError`` is raised if the sum of y is null or if
        the normalisation constant for x is null (no row for example).
        """
        # the curve always starts from the origin
        couples = [(0., 0.)]
        for i, row in enumerate(self.values):
            v = self._interpret_row(row)
            x = functionX(v) if functionX is not None else float(i + 1)
            y = functionY(v)
            if y < 0:
                raise ValueError(
                    "a value should not be negative for y: " + str(y))
            if x < 0:
                raise ValueError(
                    "a value should not be negative for x: " + str(x))
            couples.append((x, y))

        if not isXdx:
            couples.sort()

        # x are already cumulated when isXdx is False: the maximum is the total
        if isXdx:
            sumx = sum(c[0] for c in couples)
        else:
            sumx = max(c[0] for c in couples)
        sumy = sum(c[1] for c in couples)
        couples = [[c[0] / sumx, c[1] / sumy] for c in couples]

        for i in range(1, len(couples)):
            couples[i][1] += couples[i - 1][1]
            if isXdx:
                couples[i][0] += couples[i - 1][0]
            # rounding errors should not push the curve outside [0, 1]
            for k in (0, 1):
                couples[i][k] = min(couples[i][k], 1.)

        return self._private_getclass()(["x", "Gini(x)"], couples)

    def Gini(self, functionY, functionX=None, isXdx=False):
        """
        Computes the Gini, it calls GiniCurve (@see me GiniCurve),
        it takes the following parameters:

        @param      functionY       revenues
        @param      functionX       sum of persons having an income below Y
                                    (or having Y if isXdx is True)
        @param      isXdx           number of persons equal to Y (True) or inferior (False)
        @return                     a float, one minus twice the area under
                                    the curve returned by GiniCurve
                                    (trapezoidal rule)
        """
        giniC = self.GiniCurve(functionY, functionX, isXdx)
        gini = 0.
        previous = giniC.values[0]

        for i in range(1, len(giniC)):
            row = giniC.values[i]
            # dx * (y_i + y_{i-1}) is twice the area of one trapezoid
            gini += (row[0] - previous[0]) * (row[1] + previous[1])
            previous = row

        return 1. - gini

    def summary(self):
        """
        Produces a summary on each column.

        @return     an instance of the class returned by ``_private_getclass``
                    built from the list of dictionaries returned by
                    @see me summary_column, one per column of ``header``
        """
        rows = [self.summary_column(col) for col in self.header]
        return self._private_getclass()(rows)

    def summary_column(self, column_name):
        """
        Produces a summary of a column, if the column is numerical, it
        computes the min, max, quantiles, mean, med, std. If it is not,
        it counts the number of distinct values.
        The function considers an empty column as a non-numerical column.
        The function does not consider None values.

        @param      column_name     column name
        @return                     dictionary, it always contains key ``var``
                                    (the column name) and key ``missing``
                                    (number of None values) only if there
                                    is at least one
        """
        vals = self.select(lambda v: v[column_name])
        vals = [_ for _ in vals if _ is not None]
        missing = len(self) - len(vals)

        if vals:
            nb = len(vals)
            try:
                s = sum(vals)
                s2 = sum([v**2 for v in vals])
                m = s / nb
                # rounding errors can make the variance slightly negative
                # for (almost) constant columns, a negative float at power
                # 0.5 would silently produce a complex number
                variance = max(s2 / nb - m**2, 0.)
                vals.sort()
                res = {"ave": m,
                       "std": variance ** 0.5,
                       "med": vals[nb // 2],
                       "min": vals[0],
                       "max": vals[-1],
                       "1qua": vals[nb * 1 // 4],
                       "3qua": vals[nb * 3 // 4],
                       "02.5%": vals[nb * 25 // 1000],
                       "97.5%": vals[nb * 975 // 1000],
                       }
            except TypeError:
                # non numerical values: only distinct values are counted
                res = {"count": len(set(vals))}
        else:
            res = {"count": 0}

        if missing > 0:
            res["missing"] = missing
        res["var"] = column_name

        return res