from jyquickhelper import add_notebook_menu
add_notebook_menu()


%matplotlib inline


# Universe of keys: the integers 0 .. n-1.
n = 10**4
U = range(n)


# Modulus of the hash family (2**32 - 5) and number of buckets m.
p = 4294967291
m = 10


import random
# random.randint includes BOTH endpoints, so the upper bound has to be p - 1:
# a == p would make a*x vanish modulo p and turn h into a constant function,
# and b == p would merely duplicate b == 0.
a = random.randint(1, p - 1)
b = random.randint(0, p - 1)
def h(x):
    """Return the bucket of integer key ``x``: ``((a*x + b) mod p) mod m``.

    ``a``, ``b``, ``p`` and ``m`` are read from module scope at call time.
    """
    return ((a * x + b) % p) % m


# Draw 500 ordered pairs of two different keys from U; storing them in a set
# collapses any pair that happens to be drawn twice.
couples = set()
for _ in range(500):
    couples.add(tuple(random.sample(U, 2)))
print('Nombre de couples distincts = {}'.format(len(couples)))

Nombre de couples distincts = 500


# Number of sampled pairs whose two keys fall into the same bucket.
c = sum(1 for first, second in couples if h(first) == h(second))


# Fraction of colliding pairs among the sampled pairs, printed as a percentage.
p_collisions = c / len(couples)
print('Probabilité de collision = {:.2f}%'.format(p_collisions * 100.0))

Probabilité de collision = 7.40%


import numpy
collisions = []
# Repeat the experiment 100 times, each time with freshly drawn (a, b).
for _ in range(100):
    # random.randint includes both endpoints: stop at p - 1 so that a is never
    # a multiple of p (which would make h constant) and b stays in [0, p-1].
    a = random.randint(1, p - 1)
    b = random.randint(0, p - 1)

    def h(x):
        """Return ``((a*x + b) mod p) mod m`` for the current ``a`` and ``b``."""
        return ((a * x + b) % p) % m

    # 500 ordered pairs of two different keys; duplicates collapse in the set.
    couples = set()
    for _pair in range(500):
        couples.add(tuple(random.sample(U, 2)))

    # Fraction of sampled pairs that land in the same bucket.
    c = sum(1 for x, y in couples if h(x) == h(y))
    collisions.append(c / len(couples))
p_collision = numpy.mean(collisions)
print('Probabilité de collision moyenne = {:.2f}%'.format(p_collision * 100.0))

Probabilité de collision moyenne = 9.92%


sizes = [10, 25, 50, 100, 250, 500, 750, 1000]
p_collision = []
p = 4294967291

# For every number of buckets m, average the empirical collision rate over
# 100 independently drawn hash functions.
for m in sizes:
    collisions = []
    for _ in range(100):
        # random.randint includes both endpoints: stop at p - 1 so that a is
        # never a multiple of p (h would be constant) and b stays in [0, p-1].
        a = random.randint(1, p - 1)
        b = random.randint(0, p - 1)

        def h(x):
            """Return ``((a*x + b) mod p) mod m`` for the current a, b, m."""
            return ((a * x + b) % p) % m

        # 500 ordered pairs of two different keys; duplicates collapse.
        couples = set()
        for _pair in range(500):
            couples.add(tuple(random.sample(U, 2)))

        # Fraction of sampled pairs that land in the same bucket.
        c = sum(1 for x, y in couples if h(x) == h(y))
        collisions.append(c / len(couples))
    p_collision.append(numpy.mean(collisions))


import matplotlib.pyplot as plt
# Draw on the explicit Axes throughout instead of mixing it with pyplot's
# implicit "current axes" calls; the figure handle is named `fig`.
fig, ax = plt.subplots()
ax.plot(sizes, p_collision)
ax.set_xlabel(r'$m$')
ax.set_title('Ratio des collisions en fonction de la taille de hash')

<matplotlib.text.Text at 0x16ddf9cd7b8>


# Draw N 64-bit integers uniformly from [0, n): the sample therefore holds
# at most n distinct values.
n, N = 10**3, 10**4
universe = numpy.random.randint(low=0, high=n, size=N, dtype=numpy.int64)
# The stream under study is made of the last s draws.
s = 500
stream = universe[-s:]


# Working set B and its capacity, 1 / epsilon**2.
epsilon = 0.1
B_max = 1 / epsilon**2
B = set()


# Modulus used by the hash functions.
p = 2**32 - 5


import random
# Two coefficient pairs (a1, b1) and (a2, b2); sampling without replacement
# guarantees a1 != a2 and b1 != b2.
a1, a2 = random.sample(range(1, p), 2)
b1, b2 = random.sample(range(p), 2)

def h1(x):
    """Hash ``x`` with coefficients ``(a1, b1)``, reduced mod ``p`` then mod ``s``."""
    scrambled = (a1 * x + b1) % p
    return scrambled % s

def h2(x):
    """Hash ``x`` with coefficients ``(a2, b2)``, reduced mod ``p`` then mod ``s``."""
    scrambled = (a2 * x + b2) % p
    return scrambled % s


# The level c starts at 0.
c = 0
# Take the first element of the stream (as an example) and hash it with h1.
x = stream[0]
y = h1(x)
print('x = {}, y = {}'.format(x, y))

x = 207, y = 230


# Lookup table for counting trailing zero bits: for v with lowest set bit
# 2**k (0 <= k <= 31), (v & -v) % 37 is the index whose entry is k.
# Index 0, reached only for v == 0, holds 32.
mod_37bit_position = (32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13, 4,
  7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9, 5,
  20, 8, 19, 18)

# The demo value is deliberately NOT called `s`: h1 and h2 read the global
# `s` as their hash range, so rebinding it here would silently change them.

# A single trailing zero
demo = 2**1
zeros = mod_37bit_position[(-demo & demo) % 37]
print('Decomposition binaire de 2**1 = {}, nombre de zeros a droite = {}'.format(bin(demo), zeros))

# Ten trailing zeros
demo = 2**10
zeros = mod_37bit_position[(-demo & demo) % 37]
print('Decomposition binaire de 2**10 = {}, nombre de zeros a droite = {}'.format(bin(demo), zeros))

Decomposition binaire de 2**1 = 0b10, nombre de zeros a droite = 1
Decomposition binaire de 2**10 = 0b10000000000, nombre de zeros a droite = 10


# y & -y isolates the lowest set bit of y; the table turns it into the
# number of trailing zero bits.
k = mod_37bit_position[(y & -y) % 37]
print('Decomposition binaire de y = {}, nombre de zeros a droite = {}'.format(bin(y), k))

Decomposition binaire de y = 0b11100110, nombre de zeros a droite = 1


# Store the pair (h2(x), k) only when k reaches the current level c.
if k >= c:
    z = h2(x)
    B.add((z, k))

B

{(1002, 1)}


# While B is at or above capacity, raise the level c and evict every stored
# pair whose trailing-zero count is below the new level.
while len(B) >= B_max:
    c += 1
    B.difference_update({pair for pair in B if pair[1] < c})


# Run the same insert / evict steps over the whole stream.
for x in stream:
    y = h1(x)
    k = mod_37bit_position[(y & -y) % 37]
    if k < c:
        # Too few trailing zeros for the current level: skip this element.
        continue
    B.add((h2(x), k))
    while len(B) >= B_max:
        # B is full: raise the level and drop the pairs that fall below it.
        c += 1
        B.difference_update({pair for pair in B if pair[1] < c})

# Report the number of pairs kept in B and the level c reached.
print('Taille de B = {}, c = {}'.format(len(B), c))

Taille de B = 55, c = 3


# The printed estimate is 2**c * len(B).
print('Estimation de la taille de U = {}'.format(2**c * len(B)))

Estimation de la taille de U = 440


def BJKST(stream, epsilon):
    """Estimate the number of distinct elements of ``stream`` (BJKST).

    Every element ``x`` is hashed with ``h1``; ``k`` is the number of trailing
    zero bits of ``h1(x)``. When ``k`` is at least the current level ``c`` the
    pair ``(h2(x), k)`` is stored in ``B``. Each time ``B`` reaches
    ``1 / epsilon**2`` pairs, ``c`` is incremented and the pairs with
    ``k < c`` are dropped.

    Both hash functions are ``((a*x + b) mod p) mod len(stream)`` with
    coefficients drawn afresh on every call. The modulus ``p`` and the table
    ``mod_37bit_position`` are read from module scope.

    Args:
        stream: sized iterable of integers; ``len(stream)`` is the hash range.
        epsilon: accuracy parameter; the capacity of ``B`` is ``1/epsilon**2``.

    Returns:
        The integer ``2**c * len(B)``.

    Raises:
        ValueError: if ``1 / epsilon**2 < 1`` (i.e. ``|epsilon| > 1``).
        ZeroDivisionError: if ``epsilon`` is zero.
    """
    B_max = 1.0 / epsilon**2
    if B_max < 1:
        # Even an empty B would satisfy len(B) >= B_max, so the eviction loop
        # below would never terminate.
        raise ValueError(
            'epsilon must satisfy 0 < |epsilon| <= 1, got {!r}'.format(epsilon))

    s = len(stream)
    a1, a2 = random.sample(range(1, p), 2)
    b1, b2 = random.sample(range(0, p), 2)

    def h1(x):
        return ((a1 * x + b1) % p) % s

    def h2(x):
        return ((a2 * x + b2) % p) % s

    c = 0
    B = set()
    for x in stream:
        # Hash with Python integers: fixed-width NumPy scalars could overflow
        # in a*x for large keys.
        x = int(x)
        y = h1(x)
        # -y & y isolates the lowest set bit; the table maps it to its index.
        k = mod_37bit_position[(-y & y) % 37]
        if k < c:
            continue
        B.add((h2(x), k))
        while len(B) >= B_max:
            # B is full: raise the level and keep only the pairs reaching it.
            c += 1
            B = {pair for pair in B if pair[1] >= c}
    return 2**c * len(B)


epsilons = [0.5, 0.2, 0.1]
sizes = [100, 250, 500, 1000, 2500, 5000]
# For every epsilon and every stream length s, keep the median of 100
# independent BJKST runs on the last s draws of the universe.
estimates = {}
for eps in epsilons:
    estimates[eps] = [
        numpy.median([BJKST(universe[-s:], eps) for _ in range(100)])
        for s in sizes
    ]

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-26-4a3332e0535d> in <module>()
      6     for s in sizes:
      7         stream = universe[-s:]
----> 8         values.append(np.median([BJKST(stream, eps) for _ in range(100)]))
      9     estimates[eps] = values

NameError: name 'np' is not defined


# One curve per epsilon; the dashed red line marks n.
for eps, values in estimates.items():
    # Raw string: '\e' is not a valid escape sequence in a normal literal.
    plt.plot(sizes, values, label=r'$\epsilon$ = {:.1f}'.format(eps))
plt.axhline(y=n, color='r', linestyle='--', label='Vraie valeur de $n$')
plt.title('Estimation de $n$')
plt.xlabel('Taille du stream')
plt.legend()


# Relative error of the median estimate against n, for each stream length.
epsilon = 0.1
for estimate, size in zip(estimates[epsilon], sizes):
    print('Erreur relative = {0:.2f}%, s = {1}'.format(abs(estimate / n - 1.0) * 100.0, size))


import time
epsilon = 0.1
# NOTE(review): size_bound and m are not read in this cell; kept in case
# another cell relies on them.
size_bound = 15
sizes = [100, 250, 500, 1000, 2500, 5000]
m = 100
times = []
for s in sizes:
    stream = universe[-s:]
    # perf_counter is the clock intended for measuring short durations;
    # only the BJKST call itself is timed.
    start = time.perf_counter()
    BJKST(stream, epsilon)
    times.append(time.perf_counter() - start)
times = numpy.array(times)


# Draw on the explicit Axes throughout; the figure handle is named `fig`.
fig, ax = plt.subplots()
ax.plot(sizes, times*1000)
ax.set_title('Temps de calcul (en ms)')
ax.set_xlabel('Taille du stream')


import random
import numpy
import matplotlib.pyplot as plt
%matplotlib inline

# The stream is 0 .. n-1, so every element is distinct and the true count is n.
n = 1000
stream = numpy.arange(n)
# Modulus used by the hash functions.
p = 2**32 - 5

# Lookup table for counting trailing zero bits: for v with lowest set bit
# 2**k (0 <= k <= 31), entry (v & -v) % 37 is k; entry 0 (v == 0) is 32.
mod_37bit_position = (
    32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13, 4,
    7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9, 5,
    20, 8, 19, 18,
)


def BJKST(stream, B_max, h1, h2):
    """Run BJKST over ``stream`` and record its state after every element.

    An element ``x`` is kept as the pair ``(h2(x), k)``, where ``k`` is the
    number of trailing zero bits of ``h1(x)``, whenever ``k`` is at least the
    current level. Each time the kept set reaches ``B_max`` pairs the level
    goes up by one and the pairs below it are evicted.

    The table ``mod_37bit_position`` is read from module scope.

    Returns:
        A NumPy array with one row per stream element:
        ``[total pairs evicted so far, size of the kept set, level]``.
    """
    level = 0
    kept = set()
    evicted = 0
    trace = []
    for item in stream:
        hashed = h1(item)
        # hashed & -hashed isolates the lowest set bit of the hash.
        zeros = mod_37bit_position[(hashed & -hashed) % 37]
        if zeros >= level:
            kept.add((h2(item), zeros))
            while len(kept) >= B_max:
                level += 1
                stale = {pair for pair in kept if pair[1] < level}
                kept -= stale
                evicted += len(stale)
        trace.append([evicted, len(kept), level])
    return numpy.array(trace)


B_max = 200
def h1(x):
    """Identity: the trailing-zero count is taken on ``x`` itself."""
    return x
def h2(x):
    """Identity: pairs are stored under ``x`` itself."""
    return x
R = BJKST(stream, B_max, h1, h2)
# Columns of R: total removed, len(B), c. The estimate uses the last row.
estimate = 2**R[-1,2]*R[-1,1]
print('Estimated = {}, true = {}, c= {}'.format(estimate, n, R[-1,2]))
# D is non-zero at the first row and wherever c changed from the previous row.
D = numpy.concatenate((numpy.array([1]), numpy.diff(R[:,2])))
# Keep the ROW indices of the changes: rows of R must be addressed by
# position, which only coincides with the stream value when stream[i] == i.
change_rows = numpy.nonzero(D)[0]
changes = stream[change_rows]

fig, ax = plt.subplots()
ax.plot(stream, R[:,0], color='red', label='total #removed')
ax.plot(stream, R[:,1], color='blue', label='len B')
for row, x_pos in zip(change_rows, changes):
    ax.annotate('c = {}'.format(R[row, 2]), xy=(x_pos, R[row, 1]),
                xytext=(x_pos + 65, R[row, 1] - 30),
                arrowprops=dict(arrowstyle='->'))
ax.legend(loc=(1.1, 0.9))
ax.set_xlabel('$x$')
ax.set_ylabel('size')


B_max = 200
# Range of h2, here a quarter of the capacity of B.
s = B_max // 4
a1, a2 = random.sample(range(1, p), 2)
b1, b2 = random.sample(range(0, p), 2)
def h1(x):
    """Hash ``x`` with ``(a1, b1)``, reduced mod ``p`` then mod ``n``."""
    return ((a1*x + b1) % p) % n
def h2(x):
    """Hash ``x`` with ``(a2, b2)``, reduced mod ``p`` then mod ``s``."""
    return ((a2*x + b2) % p) % s
R = BJKST(stream, B_max, h1, h2)
# Columns of R: total removed, len(B), c. The estimate uses the last row.
estimate = 2**R[-1,2]*R[-1,1]
print('Estimated = {}, true = {}, c= {}'.format(estimate, n, R[-1,2]))
# D is non-zero at the first row and wherever c changed from the previous row.
D = numpy.concatenate((numpy.array([1]),numpy.diff(R[:,2])))
# Keep the ROW indices of the changes: rows of R must be addressed by
# position, which only coincides with the stream value when stream[i] == i.
change_rows = numpy.nonzero(D)[0]
changes = stream[change_rows]
fig, ax = plt.subplots()
ax.plot(stream, R[:,0], color='red', label='total #removed')
ax.plot(stream, R[:,1], color='blue', label='len B')
for row, x_pos in zip(change_rows, changes):
    ax.annotate('c = {}'.format(R[row, 2]), xy=(x_pos, R[row, 1]),
                xytext=(x_pos + 65, R[row, 1] - 30),
                arrowprops=dict(arrowstyle='->'))
ax.legend(loc=(1.1, 0.9))
ax.set_xlabel('$x$')
ax.set_ylabel('size')


B_max = 200
# Range of h2, here equal to the capacity of B.
s = B_max
a1, a2 = random.sample(range(1, p), 2)
b1, b2 = random.sample(range(0, p), 2)
def h1(x):
    """Hash ``x`` with ``(a1, b1)``, reduced mod ``p`` then mod ``n``."""
    return ((a1*x + b1) % p) % n
def h2(x):
    """Hash ``x`` with ``(a2, b2)``, reduced mod ``p`` then mod ``s``."""
    return ((a2*x + b2) % p) % s
R = BJKST(stream, B_max, h1, h2)
# Columns of R: total removed, len(B), c. The estimate uses the last row.
estimate = 2**R[-1,2]*R[-1,1]
print('Estimated = {}, true = {}, c= {}'.format(estimate, n, R[-1,2]))
# D is non-zero at the first row and wherever c changed from the previous row.
D = numpy.concatenate((numpy.array([1]), numpy.diff(R[:,2])))
# Keep the ROW indices of the changes: rows of R must be addressed by
# position, which only coincides with the stream value when stream[i] == i.
change_rows = numpy.nonzero(D)[0]
changes = stream[change_rows]
fig, ax = plt.subplots()
ax.plot(stream, R[:,0], color='red', label='total #removed')
ax.plot(stream, R[:,1], color='blue', label='len B')
for row, x_pos in zip(change_rows, changes):
    ax.annotate('c = {}'.format(R[row, 2]), xy=(x_pos, R[row, 1]),
                xytext=(x_pos + 65, R[row, 1] - 30),
                arrowprops=dict(arrowstyle='->'))
ax.legend(loc=(1.1, 0.9))
ax.set_xlabel('$x$')
ax.set_ylabel('size')


def BJKST(stream, epsilon):
    """Estimate the number of distinct elements of ``stream`` (BJKST).

    Same scheme as the other versions, except that the range of the second
    hash depends on the precision: ``h1`` maps into ``[0, n)`` and ``h2`` into
    ``[0, int(log(n) / epsilon**2))``. An element ``x`` is stored as the pair
    ``(h2(x), k)``, ``k`` being the number of trailing zero bits of ``h1(x)``,
    when ``k`` is at least the level ``c``; each time ``B`` reaches
    ``1 / epsilon**2`` pairs, ``c`` goes up and pairs with ``k < c`` are dropped.

    ``n``, ``p`` and ``mod_37bit_position`` are read from module scope.

    Args:
        stream: iterable of integers.
        epsilon: accuracy parameter; the capacity of ``B`` is ``1/epsilon**2``.

    Returns:
        The integer ``2**c * len(B)``.

    Raises:
        ValueError: if ``1 / epsilon**2 < 1`` (i.e. ``|epsilon| > 1``).
        ZeroDivisionError: if ``epsilon`` is zero.
    """
    B_max = 1.0 / epsilon**2
    if B_max < 1:
        # Even an empty B would satisfy len(B) >= B_max, so the eviction loop
        # below would never terminate.
        raise ValueError(
            'epsilon must satisfy 0 < |epsilon| <= 1, got {!r}'.format(epsilon))

    a1, a2 = random.sample(range(1, p), 2)
    b1, b2 = random.sample(range(0, p), 2)
    # The size of the second hash value depends on the precision.
    hash_size = int(numpy.log(n) / epsilon**2)

    def h1(x):
        return ((a1 * x + b1) % p) % n

    # The precision-dependent size is applied to the second hash function.
    def h2(x):
        return ((a2 * x + b2) % p) % hash_size

    c = 0
    B = set()
    for x in stream:
        # Hash with Python integers: fixed-width NumPy scalars could overflow
        # in a*x for large keys.
        x = int(x)
        y = h1(x)
        # -y & y isolates the lowest set bit; the table maps it to its index.
        k = mod_37bit_position[(-y & y) % 37]
        if k < c:
            continue
        B.add((h2(x), k))
        while len(B) >= B_max:
            # B is full: raise the level and keep only the pairs reaching it.
            c += 1
            B = {pair for pair in B if pair[1] >= c}
    return 2**c * len(B)


# Number of independent BJKST runs per value of epsilon.
m = 100
epsilons = numpy.array([0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01])
# For each epsilon, the median of the m estimates returned by BJKST on stream.
medians = numpy.array([numpy.median([BJKST(stream, eps) for _ in range(m)]) for eps in epsilons])


import matplotlib.pyplot as plt
%matplotlib inline

# Median estimate against epsilon; the dashed red line marks n.
# (To plot against 1 / epsilon instead, pass 1.0 / epsilons as the abscissa
# and change the title and x label accordingly.)
plt.plot(epsilons, medians)
plt.axhline(y=n, color='r', linestyle='--')
# The title names the quantity actually on the x axis.
plt.title(r'Mediane en fonction de $\epsilon$,  m = {}'.format(m))
plt.xlabel(r'$\epsilon$')
plt.ylabel('Mediane')

2A.algo - Algorithmes de streaming : généralités¶

Introduction¶

Estimer le nombre d'éléments distincts: l'algorithme BJKST¶

Universal hashing¶

Collisions¶

Algorithme BJKST¶

Résultats numériques¶

Temps de calcul en fonction de la taille du stream¶

Un peu plus sur la précision de l'estimateur¶

$h_1$ et $h_2$ égales à l'identité¶

cas où la taille du hash est petite¶

cas où la taille de hash est plus grande¶

la taille de hash dépend de la précision $\epsilon$¶