.. _pvaluesexamplesrst: ======== p-values ======== .. only:: html **Links:** :download:`notebook `, :downloadlink:`html `, :download:`PDF `, :download:`python `, :downloadlink:`slides `, :githublink:`GitHub|_doc/notebooks/metric/pvalues_examples.ipynb|*` Compute p-values and heavy tails estimators. .. code:: ipython3 from jyquickhelper import add_notebook_menu add_notebook_menu() .. contents:: :local: .. code:: ipython3 %matplotlib inline p-value table ------------- .. code:: ipython3 from scipy.stats import norm import pandas from pandas import DataFrame import numpy def pvalue(p, q, N): theta = abs(p-q) var = p*(1-p) bn = (2*N)**0.5 * theta / var**0.5 ret = (1 - norm.cdf(bn))*2 return ret def pvalue_N(p, q, alpha): theta = abs(p-q) var = p*(1-p) rev = abs(norm.ppf (alpha/2)) N = 2 * (rev * var**0.5 / theta)** 2 return int(N+1) def alphatable(ps, dps, alpha): values = [] for p in ps : row=[] for dp in dps : q = p+dp r = pvalue_N(p,q,alpha) if 1 >= q >= 0 else numpy.nan row.append (r) values.append (row) return values def dataframe(ps,dps,table): columns = dps df = pandas.DataFrame(data=table, index=ps) df.columns = dps return df print ("norm.ppf(0.025)",norm.ppf (0.025)) # -1.9599639845400545 ps = [0.001, 0.002] + [ 0.05*i for i in range (1,20) ] dps = [ -0.2, -0.1, -0.02, -0.01, -0.002, -0.001, 0.2, 0.1, 0.02, 0.01, 0.002, 0.001, ] dps.sort() t = alphatable(ps, dps, 0.05) dataframe (ps, dps, t) .. parsed-literal:: norm.ppf(0.025) -1.9599639845400545 .. raw:: html
-0.200 -0.100 -0.020 -0.010 -0.002 -0.001 0.001 0.002 0.010 0.020 0.100 0.200
0.001 NaN NaN NaN NaN NaN 7676 7676 1919 77 20 1.0 1.0
0.002 NaN NaN NaN NaN 3834.0 15336 15336 3834 154 39 2.0 1.0
0.050 NaN NaN 913.0 3650.0 91235.0 364939 364939 91235 3650 913 37.0 10.0
0.100 NaN 70.0 1729.0 6915.0 172866.0 691463 691463 172866 6915 1729 70.0 18.0
0.150 NaN 98.0 2449.0 9796.0 244893.0 979572 979572 244893 9796 2449 98.0 25.0
0.200 31.0 123.0 3074.0 12293.0 307317.0 1229267 1229267 307317 12293 3074 123.0 31.0
0.250 37.0 145.0 3602.0 14406.0 360137.0 1440548 1440548 360137 14406 3602 145.0 37.0
0.300 41.0 162.0 4034.0 16135.0 403354.0 1613413 1613413 403354 16135 4034 162.0 41.0
0.350 44.0 175.0 4370.0 17479.0 436966.0 1747864 1747864 436966 17479 4370 175.0 44.0
0.400 47.0 185.0 4610.0 18440.0 460976.0 1843901 1843901 460976 18440 4610 185.0 47.0
0.450 48.0 191.0 4754.0 19016.0 475381.0 1901523 1901523 475381 19016 4754 191.0 48.0
0.500 49.0 193.0 4802.0 19208.0 480183.0 1920730 1920730 480183 19208 4802 193.0 49.0
0.550 48.0 191.0 4754.0 19016.0 475381.0 1901523 1901523 475381 19016 4754 191.0 48.0
0.600 47.0 185.0 4610.0 18440.0 460976.0 1843901 1843901 460976 18440 4610 185.0 47.0
0.650 44.0 175.0 4370.0 17479.0 436966.0 1747864 1747864 436966 17479 4370 175.0 44.0
0.700 41.0 162.0 4034.0 16135.0 403354.0 1613413 1613413 403354 16135 4034 162.0 41.0
0.750 37.0 145.0 3602.0 14406.0 360137.0 1440548 1440548 360137 14406 3602 145.0 37.0
0.800 31.0 123.0 3074.0 12293.0 307317.0 1229267 1229267 307317 12293 3074 123.0 31.0
0.850 25.0 98.0 2449.0 9796.0 244893.0 979572 979572 244893 9796 2449 98.0 NaN
0.900 18.0 70.0 1729.0 6915.0 172866.0 691463 691463 172866 6915 1729 70.0 NaN
0.950 10.0 37.0 913.0 3650.0 91235.0 364939 364939 91235 3650 913 NaN NaN
p-values in 2D -------------- .. code:: ipython3 import numpy, matplotlib, random, math import matplotlib.pyplot as pylab def matrix_square_root(sigma) : eigen, vect = numpy.linalg.eig(sigma) dim = len(sigma) res = numpy.identity(dim) for i in range(0,dim) : res[i,i] = eigen[i]**0.5 return vect * res * vect.transpose() def chi2_level (alpha = 0.95) : N = 1000 x = [ random.gauss(0,1) for _ in range(0,N) ] y = [ random.gauss(0,1) for _ in range(0,N) ] r = map ( lambda c : (c[0]**2+c[1]**2)**0.5, zip(x,y)) r = list(r) r.sort() res = r [ int (alpha * N) ] return res def square_figure(mat, a) : x = [ ] y = [ ] for i in range (0,100) : x.append( a * mat[0][0]**0.5 ) y.append( (random.random ()-0.5) * a * mat[1][1]**0.5*2 ) x.append( -a * mat[0][0]**0.5 ) y.append( (random.random ()-0.5) * a * mat[1][1]**0.5*2 ) y.append( a * mat[1][1]**0.5 ) x.append( (random.random ()-0.5) * a * mat[0][0]**0.5*2 ) y.append( -a * mat[1][1]**0.5 ) x.append( (random.random ()-0.5) * a * mat[0][0]**0.5*2 ) pylab.plot(x,y, 'ro') x = [ ] y = [ ] for i in range (0,100) : x.append( a ) y.append( (random.random ()-0.5) * a*2 ) x.append( -a ) y.append( (random.random ()-0.5) * a*2 ) y.append( a ) x.append( (random.random ()-0.5) * a*2 ) y.append( -a ) x.append( (random.random ()-0.5) * a*2 ) xs,ys = [],[] for a,b in zip (x,y) : ar = numpy.matrix( [ [a], [b] ] ).transpose() we = ar * root xs.append( we [0,0] ) ys.append( we [0,1] ) pylab.plot(xs,ys, 'bo') pylab.show() def circle_figure (mat, a) : x = [ ] y = [ ] for i in range (0,200) : z = random.random() * math.pi * 2 i = a * mat[0][0]**0.5 * math.cos(z) j = a * mat[0][0]**0.5 * math.sin(z) x.append ( i ) y.append ( j ) pylab.plot(x,y, 'ro') x = [ ] y = [ ] for i in range (0,200) : z = random.random() * math.pi * 2 i = a * math.cos(z) j = a * math.sin(z) x.append ( i ) y.append ( j ) xs,ys = [],[] for a,b in zip (x,y) : ar = numpy.matrix( [ [a], [b] ] ).transpose() we = ar * root xs.append( we [0,0] ) ys.append( we [0,1] ) pylab.plot(xs,ys, 'bo') pylab.show() level = chi2_level () mat = [ [0.1, 0.05], [0.05, 0.2] ] npmat = numpy.matrix(mat) root = matrix_square_root (npmat) square_figure (mat, 1.96) circle_figure (mat, level) .. image:: pvalues_examples_6_0.png .. image:: pvalues_examples_6_1.png p-value ratio ------------- .. code:: ipython3 import random, math def densite_gauss (mu, sigma, x) : e = -(x - mu)**2 / (sigma**2 * 2) d = 1. / ((2*math.pi)**0.5 * sigma) return d * math.exp (e) def simulation_vector (N, mu, sigma) : return [ random.gauss(mu,sigma) for n in range(N) ] def ratio (vector, x, fdensite) : under = 0 above = 0 fx = fdensite(x) for u in vector: f = fdensite (u) if f >= fx: above += 1 else: under += 1 return float(above) / float (above + under) x = 1.96 N = 10000 mu = 0 sigma = 1 v = simulation_vector(N, mu, sigma) g = ratio(v, x, lambda y: densite_gauss (mu, sigma, y) ) print (g) .. parsed-literal:: 0.9487 p-values and EM --------------- See `Applying the EM Algorithm: Binomial Mixtures `__. .. code:: ipython3 from scipy.stats import norm import random, math def average_std_deviation(sample): mean = 0. var = 0. for x in sample: mean += x var += x*x mean /= len(sample) var /= len(sample) var -= mean*mean return mean,var ** 0.5 def bootsample(sample): n = len(sample)-1 return [ sample[ random.randint(0,n) ] for _ in sample ] def bootstrap_difference(sampleX, sampleY, draws=2000, confidence=0.05): diff = [ ] for n in range (0,draws) : if n % 1000 == 0: print(n) sx = bootsample(sampleX) sy = bootsample(sampleY) px = sum(sx) * 1.0/ len(sx) py = sum(sy) * 1.0/ len(sy) diff.append (px-py) diff.sort() n = int(len(diff) * confidence / 2) av = sum(diff) / len(diff) return av, diff [n], diff [len(diff)-n] # generation of a sample def generate_obs(p): x = random.random() if x <= p : return 1 else : return 0 def generate_n_obs(p, n): return [ generate_obs(p) for i in range (0,n) ] # std deviation def diff_std_deviation(px, py): s = px*(1-px) + py*(1-py) return px, py, s**0.5 def pvalue(diff, std, N): theta = abs(diff) bn = (2*N)**0.5 * theta / std pv = (1 - norm.cdf(bn))*2 return pv def omega_i (X, pi, p, q) : np = p * pi if X == 1 else (1-p)*pi nq = q * (1-pi) if X == 1 else (1-q)*(1-pi) return np / (np + nq) def likelihood (X, pi, p, q) : np = p * pi if X == 1 else (1-p)*pi nq = q * (1-pi) if X == 1 else (1-q)*(1-pi) return math.log(np) + math.log(nq) def algoEM (sample): p = random.random() q = random.random() pi = random.random() iter = 0 while iter < 10 : lk = sum ( [ likelihood (x, pi, p, q) for x in sample ] ) wi = [ omega_i (x, pi, p, q) for x in sample ] sw = sum(wi) pin = sum(wi) / len(wi) pn = sum([ x * w for x,w in zip (sample,wi) ]) / sw qn = sum([ x * (1-w) for x,w in zip (sample,wi) ]) / (len(wi) - sw) pi,p,q = pin,pn,qn iter += 1 lk = sum ( [ likelihood (x, pi, p, q) for x in sample ] ) return pi,p,q, lk # mix p,q = 0.20, 0.80 pi = 0.7 N = 1000 na = int(N * pi) nb = N - na print("------- sample") sampleX = generate_n_obs(p, na) + generate_n_obs (q, nb) random.shuffle(sampleX) print("ave", p * pi + q*(1-pi)) print("mea", sum(sampleX)*1./len(sampleX)) lk = sum ( [ likelihood (x, pi, p, q) for x in sampleX ] ) print ("min lk", lk, sum (sampleX)*1. / len(sampleX)) res = [] for k in range (0, 10) : r = algoEM (sampleX) res.append ( (r[-1], r) ) res.sort () rows = [] for r in res: pi,p,q,lk = r[1] rows.append( [p * pi + q*(1-pi)] + list(r[1])) df = pandas.DataFrame(data=rows) df.columns = ["average", "pi", "p", "q", "likelihood"] df .. parsed-literal:: ------- sample ave 0.38 mea 0.373 min lk -3393.2292120130046 0.373 .. raw:: html
average pi p q likelihood
0 0.373 0.000324 0.341877 0.373010 -9358.705695
1 0.373 0.863747 0.284788 0.932204 -4531.967709
2 0.373 0.936083 0.346101 0.766941 -4490.512057
3 0.373 0.123023 0.290964 0.384508 -3563.557269
4 0.373 0.538835 0.053584 0.746213 -3487.438442
5 0.373 0.346351 0.057880 0.539974 -3302.391944
6 0.373 0.797540 0.376491 0.359248 -3144.938682
7 0.373 0.392520 0.592563 0.231131 -2902.915478
8 0.373 0.390241 0.459488 0.317648 -2778.903072
9 0.373 0.609127 0.338062 0.427447 -2764.987703
p-value and heavy tail ---------------------- .. code:: ipython3 from scipy.stats import norm, zipf import sys def generate_n_obs_zipf (tail_index, n) : return list(zipf.rvs(tail_index, size=n)) def hill_estimator (sample) : sample = list(sample) sample.sort(reverse=True) end = len(sample)/10 end = min(end,100) s = 0. res = [] for k in range (0,end) : s += math.log(sample[k]) h = (s - (k+1)*math.log(sample[k+1]))/(k+1) h = 1./h res.append( [k, h] ) return res # mix tail_index = 1.05 N = 10000 sample = generate_n_obs_zipf(tail_index, N) sample[:5] .. parsed-literal:: [357621, 148, 18, 1812876449, 36150] .. code:: ipython3 import pandas def graph_XY(curves, xlabel=None, ylabel=None, marker=True, link_point=False, title=None, format_date="%Y-%m-%d", legend_loc=0, figsize=None, ax=None): if ax is None: import matplotlib.pyplot as plt # pylint: disable=C0415 fig, ax = plt.subplots(1, 1, figsize=figsize) smarker = {(True, True): 'o-', (True, False): 'o', (False, True): '-', # (False, False) :'' }[marker, link_point] has_date = False for xf, yf, label in curves: ax.plot(xf, yf, smarker, label=label) ax.legend(loc=legend_loc) return ax def draw_variance(sample) : avg = 0. std = 0. n = 0. w = 1. add = [] for i,x in enumerate(sample) : x = float (x) avg += x * w std += x*x * w n += w val = (std/n - (avg/n)**2)**0.5 add.append ( [ i, avg/n, val] ) print(add[-1]) table = pandas.DataFrame(add, columns=["index", "avg(n)", "std(n)"]) return graph_XY([ [table['index'], table["avg(n)"], "avg(n)"], [table['index'], table["std(n)"], "std(n)"], ], marker=False, link_point=True) draw_variance(sample); .. parsed-literal:: [9999, 55186871.0339, 233342554.46156308] .. image:: pvalues_examples_13_1.png .. code:: ipython3 def draw_hill_estimator (sample) : res = hill_estimator(sample) table = DataFrame(res, columns=["d", "hill"]) return graph_XY( [[table['d'], table['hill'], "Hill"],], marker=False, link_point=True) draw_hill_estimator(sample); .. image:: pvalues_examples_14_0.png .. code:: ipython3 def draw_heavy_tail (sample) : table = DataFrame([[_] for _ in sample ], columns=["obs"]) std = 1 normal = norm.rvs(size = len(sample)) normal = [ x*std for x in normal ] nortbl = DataFrame([ [_] for _ in normal ], columns=["obs"]) nortbl["iobs"] = (nortbl['obs'] * 10).astype(numpy.int64) histon = nortbl[["iobs", "obs"]].groupby('iobs', as_index=False).count() histon.columns = ['iobs', 'nb'] histon = histon.sort_values("nb", ascending=False).reset_index(drop=True) table["one"] = 1 histo = table.groupby('obs', as_index=False).count() histo.columns = ['obs', 'nb'] histo = histo.sort_values('nb', ascending=False).reset_index(drop=True) histo.reset_index(drop=True, inplace=True) histo["index"] = histo.index + 1 vec = list(histon["nb"]) vec += [0,] * len(histo) histo['nb_normal'] = vec[:len(histo)] histo["log(index)"] = numpy.log(histo["index"]) / numpy.log(10) histo["log(nb)"] = numpy.log(histo["nb"]) / numpy.log(10) histo["log(nb_normal)"] = numpy.log(histo["nb_normal"]) / numpy.log(10) return graph_XY ([ [histo["log(index)"], histo["log(nb)"], "Zipf"], [histo["log(index)"], histo["log(nb_normal)"], "Gauss"], ], marker=False, link_point=True) draw_heavy_tail(sample); .. parsed-literal:: c:\python372_x64\lib\site-packages\pandas\core\series.py:679: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) .. image:: pvalues_examples_15_1.png