Correction.
# Notebook helper (third-party): inserts a clickable table of contents into the notebook.
from jyquickhelper import add_notebook_menu
add_notebook_menu()
def dist_hamming(m1, m2):
    """Return the number of positions where *m1* and *m2* differ.

    Positions beyond the shorter word are ignored (``zip`` truncates);
    the next version of this function accounts for the length difference.
    """
    return sum(a != b for a, b in zip(m1, m2))
dist_hamming("close", "cloue")
1
On peut allonger le mot le plus court par des espaces, ce qui revient à ajouter au résultat la différence de taille.
def dist_hamming(m1, m2):
    """Hamming distance extended to words of different lengths.

    The shorter word is implicitly padded, so the length difference
    counts as that many extra mismatches.
    """
    mismatches = sum(a != b for a, b in zip(m1, m2))
    return mismatches + abs(len(m1) - len(m2))
dist_hamming("close", "cloue"), dist_hamming("close", "clouet")
(1, 2)
Comme l'écriture est récursive, on peut essayer même si cela n'est pas optimal (pas optimal du tout).
def distance_edition_rec(m1, m2):
    """Naive recursive edit distance (deliberately not optimal).

    Small words are compared directly with the extended Hamming distance;
    longer words are split at every possible pair of cut points and the
    best decomposition is kept.
    """
    if max(len(m1), len(m2)) <= 2 or min(len(m1), len(m2)) <= 1:
        return dist_hamming(m1, m2)
    return min(
        distance_edition_rec(m1[:i], m2[:j]) + distance_edition_rec(m1[i:], m2[j:])
        for i in range(1, len(m1))
        for j in range(1, len(m2))
    )
distance_edition_rec("longmot", "liongmot")
1
distance_edition_rec("longmot", "longmoit")
1
Que se passe-t-il lorsqu'on enlève la condition `or min(len(m1), len(m2)) <= 1` ?
def distance_edition_rec_cache(m1, m2, cache=None):
    """Memoized version of the naive recursive edit distance.

    *cache* maps a pair of words to its already-computed distance; it is
    created on the first call and threaded through the recursion.
    """
    if cache is None:
        cache = {}
    key = (m1, m2)
    if key not in cache:
        if max(len(m1), len(m2)) <= 2 or min(len(m1), len(m2)) <= 1:
            cache[key] = dist_hamming(m1, m2)
        else:
            cache[key] = min(
                distance_edition_rec_cache(m1[:i], m2[:j], cache)
                + distance_edition_rec_cache(m1[i:], m2[j:], cache)
                for i in range(1, len(m1))
                for j in range(1, len(m2))
            )
    return cache[key]
distance_edition_rec_cache("longmot", "liongmot"), distance_edition_rec_cache("longmot", "longmoit")
(1, 1)
%timeit distance_edition_rec("longmot", "longmoit")
10 loops, best of 3: 48.4 ms per loop
%timeit distance_edition_rec_cache("longmot", "longmoit")
100 loops, best of 3: 4.82 ms per loop
Il apparaît qu'on perd un temps fou dans la première version à recalculer un grand nombre de fois les mêmes distances. Conserver ces résultats permet d'aller beaucoup plus vite.
On reprend la dernière version en la modifiant pour ne tenir compte que des mots insécables.
def distance_edition_rec_cache_insecable(m1, m2, cache=None):
    """Memoized recursive edit distance restricted to "unbreakable" cuts.

    Instead of trying every split point, only the last one or two
    characters of each word are peeled off and compared with the
    extended Hamming distance, which makes the recursion linear-ish.
    """
    if cache is None:
        cache = {}
    key = (m1, m2)
    if key in cache:
        return cache[key]
    if max(len(m1), len(m2)) <= 2 or min(len(m1), len(m2)) <= 1:
        result = dist_hamming(m1, m2)
    else:
        n, p = len(m1), len(m2)
        # Three ways of peeling the tails: (2,1), (1,2) and (1,1) characters.
        candidates = (
            distance_edition_rec_cache_insecable(m1[:n - 2], m2[:p - 1], cache)
            + dist_hamming(m1[n - 2:], m2[p - 1:]),
            distance_edition_rec_cache_insecable(m1[:n - 1], m2[:p - 2], cache)
            + dist_hamming(m1[n - 1:], m2[p - 2:]),
            distance_edition_rec_cache_insecable(m1[:n - 1], m2[:p - 1], cache)
            + dist_hamming(m1[n - 1:], m2[p - 1:]),
        )
        result = min(candidates)
    cache[key] = result
    return result
distance_edition_rec_cache_insecable("longmot", "liongmot"), distance_edition_rec_cache_insecable("longmot", "longmoit")
(1, 1)
%timeit distance_edition_rec_cache_insecable("longmot", "longmoit")
The slowest run took 4.80 times longer than the fastest. This could mean that an intermediate result is being cached 1000 loops, best of 3: 227 µs per loop
C'est encore plus rapide.
La version non récursive est plus simple à envisager dans ce cas.
def distance_edition_insecable(m1, m2, cache=None):
    """Non-recursive (dynamic programming) version of the "insécable" edit distance.

    ``dist[i, j]`` holds the distance between the first *i* characters of
    *m1* and the first *j* characters of *m2* (prefix lengths, not indices).
    Each cell combines the last one or two characters of either word with
    the extended Hamming distance, mirroring the recursive version.

    Fixes two defects of the earlier formulation: with 0-based cell
    indices, slices such as ``m1[i-2:i]`` became empty at ``i in (0, 1)``
    because of Python's negative indexing, and the boundary rows were
    initialized one short (e.g. the original returned 0 for
    ``("ab", "b")`` instead of 1).

    *cache* is unused; it is kept so existing callers keep working.
    """
    def _hamming(s1, s2):
        # Extended Hamming distance, same contract as dist_hamming above.
        return abs(len(s1) - len(s2)) + sum(a != b for a, b in zip(s1, s2))

    dist = {(0, 0): 0}
    # Boundary: transforming an i-character prefix into the empty word
    # (and vice versa) costs exactly i operations.
    for i in range(1, len(m1) + 1):
        dist[i, 0] = i
    for j in range(1, len(m2) + 1):
        dist[0, j] = j
    for i in range(1, len(m1) + 1):
        for j in range(1, len(m2) + 1):
            # Always available: compare the last character of each word.
            options = [dist[i - 1, j - 1] + _hamming(m1[i - 1:i], m2[j - 1:j])]
            if i >= 2:
                # Two characters of m1 against one of m2.
                options.append(dist[i - 2, j - 1] + _hamming(m1[i - 2:i], m2[j - 1:j]))
            if j >= 2:
                # One character of m1 against two of m2.
                options.append(dist[i - 1, j - 2] + _hamming(m1[i - 1:i], m2[j - 2:j]))
            dist[i, j] = min(options)
    return dist[len(m1), len(m2)]
distance_edition_insecable("longmot", "liongmot"), distance_edition_insecable("longmot", "longmoit")
(1, 1)
%timeit distance_edition_insecable("longmot", "longmoit")
The slowest run took 7.55 times longer than the fastest. This could mean that an intermediate result is being cached 1000 loops, best of 3: 354 µs per loop
La distance de Hamming n'est pas présente dans l'algorithme décrit sur la page Wikipedia. C'est parce qu'on décompose la distance de Hamming entre un mot de 1 caractère et un mot de 2 caractères par une comparaison et une insertion (ou une suppression).
def distance_edition(m1, m2, cache=None):
    """Levenshtein (edit) distance between *m1* and *m2* by dynamic programming.

    ``dist[i, j]`` holds the distance between the first *i* characters of
    *m1* and the first *j* characters of *m2* (prefix lengths).

    Fixes an off-by-one in the boundary initialization of the earlier
    formulation (``dist[i, -1] = i`` instead of ``i + 1``), which made
    the original return 0 for ``("ab", "b")`` instead of 1, and a wrong
    result for any comparison against the empty string.

    *cache* is unused; it is kept so existing callers keep working.
    """
    dist = {(0, 0): 0}
    # Boundary: i deletions (resp. j insertions) reach the empty word.
    for i in range(1, len(m1) + 1):
        dist[i, 0] = i
    for j in range(1, len(m2) + 1):
        dist[0, j] = j
    for i, c in enumerate(m1, start=1):
        for j, d in enumerate(m2, start=1):
            dist[i, j] = min(
                dist[i - 1, j] + 1,                          # deletion of c
                dist[i, j - 1] + 1,                          # insertion of d
                dist[i - 1, j - 1] + (0 if c == d else 1),   # substitution / match
            )
    return dist[len(m1), len(m2)]
distance_edition("longmot", "liongmot"), distance_edition("longmot", "longmoit")
(1, 1)
%timeit distance_edition("longmot", "longmoit")
The slowest run took 4.42 times longer than the fastest. This could mean that an intermediate result is being cached 1000 loops, best of 3: 355 µs per loop