Code source de ensae_teaching_cs.homeblog.table_formula_stat
# -*- coding: utf8 -*-
"""
Contains TableFormulaStat.
:githublink:`%|py|6`
"""
class _TableFormulaStat:
    """
    Statistical helpers for a table class: Gini curve, Gini index and
    per-column summaries.

    The methods use members this class does not define itself:
    ``self.values``, ``self.header``, ``self._interpret_row``,
    ``self.select``, ``self._private_getclass`` and ``len(self)``.
    Presumably the class is mixed into ``TableFormula`` -- TODO confirm.

    ::

        table = TableFormula ("sum_y#1#1#1#1#1#1#1#1#1#1#1".replace(" ", "\\t").replace("#","\\n"))
        gini = table.Gini (lambda v : v["sum_y"])
        print (gini) # identical incomes: value close to 0

        table = TableFormula ("sum_y#1#1#1#1#1#1#1#1#1#1#1#5#10".replace(" ", "\\t").replace("#","\\n"))
        gini = table.Gini (lambda v : v["sum_y"])
        print (gini) # unequal incomes: value strictly between 0 and 1

    NOTE(review): the expected values above follow from the arithmetic in
    :meth:`Gini` (``1 - 2 * area under the curve``) and assume the table
    parses the cells as numbers -- confirm against ``TableFormula``.
    """

    def GiniCurve(self, functionY, functionX=None, isXdx=False):
        """
        Computes the Gini (Lorenz) curve.

        :param functionY: function of a row returning the revenue
        :param functionX: function of a row returning the number of persons
            having an income below Y (or exactly Y if *isXdx* is True);
            if None, the 1-based row position is used
        :param isXdx: if True, x is a number of persons per row and is
            accumulated along the rows; if False, x is already cumulated and
            the (x, y) couples are sorted before y is accumulated
        :return: a table with columns ``["x", "Gini(x)"]``, built with the
            class returned by ``_private_getclass()``; both columns are
            normalized and capped at 1
        :raises ValueError: if an x or a y value is negative
        :raises ZeroDivisionError: if the x or y normalization total is zero
            (for instance when the table has no row)
        """
        # The curve always starts at the origin.
        points = [(0., 0.)]
        for position, row in enumerate(self.values):
            v = self._interpret_row(row)
            x = functionX(v) if functionX is not None else float(position + 1)
            y = functionY(v)
            if y < 0:
                raise ValueError(
                    "a value should not be negative for y: " + str(y))
            if x < 0:
                raise ValueError(
                    "a value should not be negative for x: " + str(x))
            points.append((x, y))

        if isXdx:
            # x values are increments: they are normalized by their sum.
            sumx = sum(p[0] for p in points)
        else:
            # x values are cumulated: sort them, normalize by the largest.
            points.sort()
            sumx = max(p[0] for p in points)
        sumy = sum(p[1] for p in points)

        curve = [[x / sumx, y / sumy] for x, y in points]
        for i in range(1, len(curve)):
            previous, current = curve[i - 1], curve[i]
            current[1] += previous[1]
            if isXdx:
                current[0] += previous[0]
            # Rounding errors may push a cumulated value slightly above 1.
            for k in (0, 1):
                current[k] = min(current[k], 1.)

        return self._private_getclass()(["x", "Gini(x)"], curve)

    def Gini(self, functionY, functionX=None, isXdx=False):
        """
        Computes the Gini index from the curve returned by :meth:`GiniCurve`.

        :param functionY: function of a row returning the revenue
        :param functionX: function of a row returning the number of persons
            having an income below Y (or exactly Y if *isXdx* is True)
        :param isXdx: forwarded to :meth:`GiniCurve`
        :return: float, ``1 - sum(dx * (y_i + y_{i-1}))`` over consecutive
            points of the curve, i.e. one minus twice the trapezoidal area
            under the curve
        """
        giniC = self.GiniCurve(functionY, functionX, isXdx)
        doubled_area = 0.
        previous = giniC.values[0]
        for i in range(1, len(giniC)):
            current = giniC.values[i]
            dx = current[0] - previous[0]
            doubled_area += dx * (current[1] + previous[1])
            previous = current
        return 1. - doubled_area

    def summary(self):
        """
        Produces a summary of every column listed in ``self.header``.

        :return: an instance of the class returned by
            ``_private_getclass()``, built from the list of dictionaries
            returned by :meth:`summary_column` (one per column)
        """
        rows = [self.summary_column(col) for col in self.header]
        return self._private_getclass()(rows)

    def summary_column(self, column_name):
        """
        Produces a summary of a column.

        If the column is numerical, the summary contains the average
        (``ave``), the standard deviation (``std``, computed with a division
        by the number of values), the median (``med``), ``min``, ``max``,
        the quartiles (``1qua``, ``3qua``) and the ``02.5%`` / ``97.5%``
        quantiles. Quantiles are elements picked in the sorted values,
        without interpolation. If the column is not numerical (a
        ``TypeError`` is raised by the computation), the summary only holds
        ``count``, the number of distinct values.

        An empty column is considered as non-numerical (``count`` is 0).
        None values are ignored and reported under ``missing`` when there is
        at least one. The column name is stored under ``var``.

        :param column_name: column name
        :return: dictionary
        """
        vals = [_ for _ in self.select(lambda v: v[column_name])
                if _ is not None]
        missing = len(self) - len(vals)

        if vals:
            try:
                n = len(vals)
                total = sum(vals)
                total_sq = sum(v ** 2 for v in vals)
                mean = total / n
                vals.sort()
                # Rounding errors can make the variance slightly negative
                # (constant columns such as [0.1, 0.1, 0.1]); a negative
                # float raised to 0.5 would produce a complex number.
                variance = max(total_sq / n - mean ** 2, 0.)
                res = {
                    "ave": mean,
                    "std": variance ** 0.5,
                    "med": vals[n // 2],
                    "min": vals[0],
                    "max": vals[-1],
                    "1qua": vals[n * 1 // 4],
                    "3qua": vals[n * 3 // 4],
                    "02.5%": vals[n * 25 // 1000],
                    "97.5%": vals[n * 975 // 1000],
                }
            except TypeError:
                # Not a numerical column: count the distinct values.
                res = {"count": len(set(vals))}
        else:
            res = {"count": 0}

        if missing > 0:
            res["missing"] = missing
        res["var"] = column_name
        return res