.. _imagesdupsrst: ================= Image et doublons ================= .. only:: html **Links:** :download:`notebook `, :downloadlink:`html `, :download:`PDF `, :download:`python `, :downloadlink:`slides `, :githublink:`GitHub|_doc/notebooks/hackathon_2018/images_dups.ipynb|*` Material for the hackathon ENSAE / BRGM / 2018. Les images sont extraites de tweets mais elles sont souvent republiées sans être retweetées, d’où la présence de doublons. .. code:: ipython3 %matplotlib inline import matplotlib.pyplot as plt .. code:: ipython3 from jyquickhelper import add_notebook_menu add_notebook_menu() .. contents:: :local: Séparation des doublons ----------------------- Pour le challenge, il faut repérer les doublons dans les images. Pour cela, je redimensionne chaque image en un carré 50x50 en niveaux de gris, puis j’applique une ACP suivie des k plus proches voisins pour détecter les doublons. Images en gris 50x50 ~~~~~~~~~~~~~~~~~~~~ .. code:: ipython3 folder = "c:/temp/suricatenat_images" .. code:: ipython3 from ensae_projects.hackathon.image_helper import apply_image_transform, image_zoom, img2gray dest_folder = "img5050" list(apply_image_transform(folder, dest_folder, lambda img: image_zoom(img2gray(img), (50, 50)), fLOG=print)) Images en features ~~~~~~~~~~~~~~~~~~ Pas utilisé par la suite. .. code:: ipython3 from ensae_projects.hackathon.image_helper import stream_image2features import numpy dest_folder = "img5050" dest_batch = "batch" for b in stream_image2features(dest_folder, dest_batch, numpy.array, fLOG=print): pass voisins ~~~~~~~ .. code:: ipython3 %matplotlib inline .. code:: ipython3 from ensae_projects.hackathon.image_knn import ImageNearestNeighbors folder = "img5050" knn = ImageNearestNeighbors() knn.fit(folder, fLOG=print) .. 
parsed-literal:: [ImageNearestNeighbors] processing image 0: 'inondation_2016\735614357036519425_CjVtTTrUoAAUUZp.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 1000: 'inondation_2016\737596119933321217_Cjx3w1FVAAAyyY1.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 2000: 'inondation_2016\737891662077255685_Cj2EjjXWUAA8Dhq.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 3000: 'inondation_2016\738050337521709056_Cj4UpFDUoAIR2gD.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 4000: 'inondation_2016\738283056302313472_Cj7oe7VWkAAPwAT.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 5000: 'inondation_2016\738366585526718464_Cj80fFNXEAAx9T2.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 6000: 'inondation_2016\738439428159377408_Cj92vvAUYAARP2A.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 7000: 'inondation_2016\738629637845221376_CkAjvUFVAAErbJF.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 8000: 'inondation_2016\738695722296614912_CkBf1CbXIAAonp1.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 9000: 'inondation_2016\738766013416787968_CkCfqR3XIAEuX8m.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 10000: 'inondation_2016\738894521304526849_CkEUnRhW0AEh1e1.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 11000: 'inondation_2016\739101985295728640_CkHRVZ-WUAAyCrp.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 12000: 'inondation_2016\739400457899114496_CkLgzBAWkAE9hCa.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 13000: 'inondation_2016\739732522427424768_CkQOztKWYAAlOul.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 14000: 'inondation_2016\740054590863859712_CkUzuikWgAAJUTK.jpg' - class 'inondation_2016' [ImageNearestNeighbors] 
processing image 15000: 'inondation_2016\740416207296299008_CkZ8nAnWYAAG7cC.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 16000: 'inondation_2016\740833843914153985_Ckf4dIFWEAANSOT.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 17000: 'inondation_2016\742361701924937728_Ck1mBsLWkAE6EQX.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 18000: 'inondation_2018\955391968712019968_DUI76ywW4AA2J1b.jpg' - class 'inondation_2018' [ImageNearestNeighbors] processing image 19000: 'inondation_2018\956216357934325761_LKmRQ9hLmVxOkWtm.jpg' - class 'inondation_2018' [ImageNearestNeighbors] processing image 20000: 'inondation_2018\957254473604268032_DUjZ2vSWkAAdzd2.jpg' - class 'inondation_2018' [ImageNearestNeighbors] processing image 21000: 'inondation_2018\959020320320565248_DU8fYlpX4AAZIRV.jpg' - class 'inondation_2018' [ImageNearestNeighbors] processing image 22000: 'inondation_2018\964034081381109761_DWDv4vHWsAAMQIS.jpg' - class 'inondation_2018' [ImageNearestNeighbors] processing image 23000: 'seisme_Amatrice\768290329543995392_MwkGcfSrCBzWbxwK.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 24000: 'seisme_Amatrice\768326333034364928_CqmktbfXEAAw2RU.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 25000: 'seisme_Amatrice\768345861646581760_Cqm2eUjWYAAWS68.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 26000: 'seisme_Amatrice\768361403522646016_CqnEgFrWcAANqdO.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 27000: 'seisme_Amatrice\768374709645967361_CqnQt96XEAAew2V.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 28000: 'seisme_Amatrice\768387852862455810_CqncoxLWYAAulnb.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 29000: 'seisme_Amatrice\768401257769865216_CqnlYItWAAAbc7p.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 30000: 
'seisme_Amatrice\768417849652027394_Cqnz5_gXgAAx967.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 31000: 'seisme_Amatrice\768433724564377600_CqoGZC4WAAEh8zG.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 32000: 'seisme_Amatrice\768451168372662272_CqoWQCGW8AQIbHV.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 33000: 'seisme_Amatrice\768468307288743936_Cqol1cDXgAEychm.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 34000: 'seisme_Amatrice\768488406091386880_Cqo4H6GWIAAr4YP.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 35000: 'seisme_Amatrice\768511762429800448_CqpNXTxXYAATvNk.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 36000: 'seisme_Amatrice\768543842845032448_CqplczAWIAAhINz.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 37000: 'seisme_Amatrice\768647190260518912_CqrIhKnUkAAyvf6.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 38000: 'seisme_Amatrice\768716815279063040_CqsH3mqUEAA6gXD.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 39000: 'seisme_Amatrice\768743738634080256_CqsgWwgWcAE0rWO.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 40000: 'seisme_Amatrice\768772807568351232_Cqs6ORfWIAAXlf8.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 41000: 'seisme_Amatrice\768804543748575232_CqtXniSXYAAp7Tt.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 42000: 'seisme_Amatrice\768843712357076993_Cqt7R_1WYAE6tr6.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 43000: 'seisme_Amatrice\768901703898771456_Cquv7mKWgAAn6ZX.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 44000: 'suricatenat_inondation_aude\1052220109740228608_Dpo8nOhXgAYLNEm.jpg' - class 'suricatenat_inondation_aude' .. 
code:: ipython3 from ensae_projects.hackathon.image_helper import enumerate_image_class folder = "img5050" iter = enumerate_image_class(folder) imgs = [_[0] for _ in zip(iter, range(0,1000000))] len(imgs) .. parsed-literal:: 44053 .. code:: ipython3 for i, img in enumerate(imgs): dist, ind = knn.kneighbors(img[0]) if dist[0, 1] <= 10: print("dist =", dist) print("ind =", ind) break .. parsed-literal:: dist = [[ 0. 0. 7.93725393 366.16662874 380.73481585]] ind = [[ 12 3 10 21464 8684]] .. code:: ipython3 knn.plot_neighbors(ind, dist, obs=img[0], folder_or_images=folder); .. image:: images_dups_14_0.png .. code:: ipython3 pairs = [] for i, img in enumerate(imgs): if i % 1000 == 0: print("{0}/{1} done".format(i, len(imgs))) dist, ind = knn.kneighbors(img[0]) sub = ind.ravel()[dist.ravel() <= 10] if len(sub) > 0: for j in sub: pairs.append((i, j)) .. parsed-literal:: 0/44053 done 1000/44053 done 2000/44053 done 3000/44053 done 4000/44053 done 5000/44053 done 6000/44053 done 7000/44053 done 8000/44053 done 9000/44053 done 10000/44053 done 11000/44053 done 12000/44053 done 13000/44053 done 14000/44053 done 15000/44053 done 16000/44053 done 17000/44053 done 18000/44053 done 19000/44053 done 20000/44053 done 21000/44053 done 22000/44053 done 23000/44053 done 24000/44053 done 25000/44053 done 26000/44053 done 27000/44053 done 28000/44053 done 29000/44053 done 30000/44053 done 31000/44053 done 32000/44053 done 33000/44053 done 34000/44053 done 35000/44053 done 36000/44053 done 37000/44053 done 38000/44053 done 39000/44053 done 40000/44053 done 41000/44053 done 42000/44053 done 43000/44053 done 44000/44053 done .. code:: ipython3 pairs[:10] .. parsed-literal:: [(0, 0), (1, 1), (2, 2), (3, 12), (3, 3), (3, 10), (4, 4), (5, 133), (5, 1549), (5, 158)] .. code:: ipython3 pairs2 = [(i,j) for i,j in pairs if i != j] len(pairs), len(pairs2) .. parsed-literal:: (75725, 33675) .. code:: ipython3 pairs2[:10] .. 
parsed-literal:: [(3, 12), (3, 10), (5, 133), (5, 1549), (5, 158), (5, 5632), (5, 16784), (8, 14699), (8, 23), (8, 35)] .. code:: ipython3 dist, ind = knn.kneighbors(imgs[5][0]) knn.plot_neighbors(ind, dist, obs=imgs[5][0], folder_or_images=folder); .. image:: images_dups_19_0.png Composantes connexes ~~~~~~~~~~~~~~~~~~~~ .. code:: ipython3 distincts = [] for i, j in pairs2: distincts.append(i) distincts.append(j) distincts = set(distincts) connex = {} for k in distincts: connex[k] = k n = 0 while n < 10: modif = 0 for i, j in pairs2: a = min(connex[i], connex[j]) if a != connex[i] or a != connex[j]: modif += 1 connex[i] = connex[j] = a print(n, modif) n += 1 .. parsed-literal:: 0 9096 1 6 2 0 3 0 4 0 5 0 6 0 7 0 8 0 9 0 .. code:: ipython3 len(connex), len(set(connex.values())) .. parsed-literal:: (13271, 4185) .. code:: ipython3 names = knn.image_names_ names[:2] .. parsed-literal:: ['inondation_2016/735614357036519425_CjVtTTrUoAAUUZp.jpg', 'inondation_2016/735616090261184512_CjVu73ZVEAAlWmu.jpg'] .. code:: ipython3 dups = [] for i, j in connex.items(): if i != j: dups.append(names[i]) len(dups) .. parsed-literal:: 9086 Images très proches ~~~~~~~~~~~~~~~~~~~ .. code:: ipython3 for i, img in enumerate(imgs): dist, ind = knn.kneighbors(img[0]) if 10 < dist[0, 1] <= 30: print("dist =", dist) print("ind =", ind) break .. parsed-literal:: dist = [[ 0. 21.97726098 21.97726098 21.97726098 161.13348504]] ind = [[ 285 308 351 311 3005]] .. code:: ipython3 obs = imgs[ind[0, 0]][0] knn.plot_neighbors(ind, dist, obs=obs, folder_or_images=folder); .. image:: images_dups_27_0.png Recopie de la base ~~~~~~~~~~~~~~~~~~ .. code:: ipython3 not_allowed = set(dups) len(not_allowed) .. parsed-literal:: 9086 .. code:: ipython3 list(sorted(not_allowed))[:5] .. 
parsed-literal:: ['inondation_2016/735805396657397762_CjYbG-DUgAQTu19.jpg', 'inondation_2016/735829559329853440_CjYxFcrXEAAvjlH.jpg', 'inondation_2016/735870604038045696_CjZWafAXEAA3sOb.jpg', 'inondation_2016/735892072960512000_CjZp8CoWsAIOhL5.jpg', 'inondation_2016/735892650583306240_CjZqdvoXAAEaSRM.jpg']

.. code:: ipython3

    from ensae_projects.hackathon.image_helper import stream_copy_images

    src_folder = "c:/temp/suricatenat_images/"
    dest_folder = "c:/temp/suricatenat_clean/"

    def valid(name):
        """Tell whether image *name* is not in the duplicate set.

        The path is reduced to what follows the last occurrence of
        ``suricatenat_images``, backslashes are turned into forward
        slashes and leading/trailing separators are stripped, so the
        result has the same ``class/file.jpg`` shape as the entries
        stored in ``not_allowed``.

        :param name: path of an image, as handed over by
            ``stream_copy_images``
        :return: ``True`` when the normalised name is absent from
            ``not_allowed``
        """
        spl = name.split("suricatenat_images")[-1].replace("\\", "/").strip("/\\")
        # The duplicate set is called ``not_allowed``; the former name
        # ``allowed`` is not defined anywhere and raised a NameError.
        return spl not in not_allowed

    # NOTE(review): presumably only the files accepted by ``valid`` are
    # copied -- confirm against stream_copy_images.
    for img in stream_copy_images(src_folder, dest_folder, valid, fLOG=print):
        pass

.. parsed-literal:: [stream_copy_images] copy image 0: 'bing\01-9.jpg' - class 'bing' [stream_copy_images] copy image 1000: 'imagenet1\3271012508_955158b073.jpg' - class 'imagenet1' [stream_copy_images] copy image 2000: 'imagenet2\3287016043_987800dc67.jpg' - class 'imagenet2' [stream_copy_images] copy image 3000: 'imagenet4\106994_5349_big_200907_voyager11.jpg' - class 'imagenet4' [stream_copy_images] copy image 4000: 'imagenet5\532346050_dafb11ec86.jpg' - class 'imagenet5' [stream_copy_images] copy image 5000: 'inondation_2016\736966968138473472_Cjo7jTrXAAAeffo.jpg' - class 'inondation_2016' [stream_copy_images] copy image 6000: 'inondation_2016\737629970399252480_CjySiGiUkAUr8TC.jpg' - class 'inondation_2016' [stream_copy_images] copy image 7000: 'inondation_2016\737923554407256064_Cj2hYNwWUAElsOP.jpg' - class 'inondation_2016' [stream_copy_images] copy image 8000: 'inondation_2016\738072076880347136_Cj4opLHXEAAuuGK.jpg' - class 'inondation_2016' [stream_copy_images] copy image 9000: 'inondation_2016\738298504267730945_Cj72k1kUoAAIKUA.jpg' - class 'inondation_2016' [stream_copy_images] copy image 10000: 'inondation_2016\738378724442296321_Cj8_iRUXEAEIYex.jpg' - class 'inondation_2016' [stream_copy_images] copy image 11000: 'inondation_2016\738456441082793984_Cj-GNbPWkAAecmj.jpg' - class 
'inondation_2016' [stream_copy_images] copy image 12000: 'inondation_2016\738642491671379968_CkAvbhyVAAQdhnl.jpg' - class 'inondation_2016' [stream_copy_images] copy image 13000: 'inondation_2016\738708144893927424_CkBrBsFXIAAesMt.jpg' - class 'inondation_2016' [stream_copy_images] copy image 14000: 'inondation_2016\738775822753013760_CkCosKRXEAAL3QS.jpg' - class 'inondation_2016' [stream_copy_images] copy image 15000: 'inondation_2016\738983572388913152_CkFlodbW0AAjH1A.jpg' - class 'inondation_2016' [stream_copy_images] copy image 16000: 'inondation_2016\739133036877467649_CkHtiX3XEAAQ5qt.jpg' - class 'inondation_2016' [stream_copy_images] copy image 17000: 'inondation_2016\739435820709519360_CkMA9WNXAAEBBwW.jpg' - class 'inondation_2016' [stream_copy_images] copy image 18000: 'inondation_2016\739759634534141958_CkQnd1TUUAQli3i.jpg' - class 'inondation_2016' [stream_copy_images] copy image 19000: 'inondation_2016\740101248225935361_CkVVPYDWUAAc8U3.jpg' - class 'inondation_2016' [stream_copy_images] copy image 20000: 'inondation_2016\740462147130556416_CkamZeeXAAIf6ru.jpg' - class 'inondation_2016' [stream_copy_images] copy image 21000: 'inondation_2016\740924772062769152_CkhLHExW0AIpwYC.jpg' - class 'inondation_2016' [stream_copy_images] copy image 22000: 'inondation_2016\742979124050964480_Ck-XkQfXEAE46Wh.jpg' - class 'inondation_2016' [stream_copy_images] copy image 23000: 'inondation_2018\955500762070769664_DUKe4P3WAAEBJFC.jpg' - class 'inondation_2018' [stream_copy_images] copy image 24000: 'inondation_2018\956447069216165890_DUX7giCXUAANfkI.jpg' - class 'inondation_2018' [stream_copy_images] copy image 25000: 'inondation_2018\957555126931279872_DUnrT9aXUAARFxJ.jpg' - class 'inondation_2018' [stream_copy_images] copy image 26000: 'inondation_2018\959394452564598784_DVB0KQsWkAA4Bta.jpg' - class 'inondation_2018' [stream_copy_images] copy image 27000: 'inondation_2018\965549350599487488_DWZSA7cWsAEWGaK.jpg' - class 'inondation_2018' [stream_copy_images] copy 
image 28000: 'seisme_Amatrice\768296828550819841_CqmJ4k4UsAEcTaF.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 29000: 'seisme_Amatrice\768330792049205248_CqmooXdXgAAJ2b3.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 30000: 'seisme_Amatrice\768348574694408192_Cqm4itvWcAAsv_s.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 31000: 'seisme_Amatrice\768363756728516608_CqnGo90WIAAfA1I.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 32000: 'seisme_Amatrice\768376884677738496_CqnOR17WIAAP2Hn.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 33000: 'seisme_Amatrice\768390411228422144_Cqne_UWWYAAnY6V.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 34000: 'seisme_Amatrice\768404063755141120_CqnrVy8XYAAYIGO.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 35000: 'seisme_Amatrice\768420565745106944_Cqn6bjbWIAEAbck.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 36000: 'seisme_Amatrice\768436635444908032_CqoI-OfWIAEpe5T.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 37000: 'seisme_Amatrice\768453842098880512_CqoYsPEXEAARM5o.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 38000: 'seisme_Amatrice\768471447140458496_CqoosvJW8AIjpEA.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 39000: 'seisme_Amatrice\768492129882517506_Cqo7hBpW8AA64OU.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 40000: 'seisme_Amatrice\768516668515577856_CqpR0mDWIAAEGkL.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 41000: 'seisme_Amatrice\768550981206441984_Cqpw-qOWAAAVGTB.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 42000: 'seisme_Amatrice\768679088013778944_CqrlaXlVUAAcqVG.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 43000: 'seisme_Amatrice\768721000015892480_CqsLrL6UkAAofwM.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 44000: 
'seisme_Amatrice\768749206500741120_CqslU7hWEAA_Cyn.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 45000: 'seisme_Amatrice\768777504609931264_Cqs_DzcWAAAZ2_h.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 46000: 'seisme_Amatrice\768810730250461184_CqtdRvNWAAAE_HJ.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 47000: 'seisme_Amatrice\768850688487022592_Cqsp-GyWgAEJ6Kp.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 48000: 'seisme_Amatrice\768916332322648064_Cqu9OPSWgAEzaux.jpg' - class 'seisme_Amatrice' .. code:: ipython3 l1 = list(enumerate_image_class("c:/temp/suricatenat_images/")) .. code:: ipython3 l2 = list(enumerate_image_class("c:/temp/suricatenat_clean/")) .. code:: ipython3 len(l1), len(l2) .. parsed-literal:: (48884, 39798) Takes a random sample ~~~~~~~~~~~~~~~~~~~~~ .. code:: ipython3 from ensae_projects.hackathon.image_helper import stream_random_sample, last_element rnd = last_element(stream_random_sample("c:/temp/suricatenat_clean/", abspath=False)) .. code:: ipython3 rnd[:5] .. parsed-literal:: [('imagenet2\\2611787731_6b65bdaf6a.jpg', 'imagenet2'), ('inondation_2016\\740608740169224192_CkcruUEXIAEsWUl.jpg', 'inondation_2016'), ('inondation_2016\\738614580658606080_CkAWBegUgAA5Z9l.jpg', 'inondation_2016'), ('inondation_2018\\956548703552245760_DUZX5TRWsAAyDqH.jpg', 'inondation_2018'), ('inondation_2018\\956925376936148993_DUeuiGQX4AAocq-.jpg', 'inondation_2018')] .. code:: ipython3 import os import shutil src_folder = "c:/temp/suricatenat_clean/" dest_folder = "c:/temp/suricatenat_sample/" for img, sub in rnd: src = os.path.join(src_folder, img) dst = os.path.join(dest_folder, img) d = os.path.dirname(dst) if not os.path.exists(d): os.makedirs(d) shutil.copy(src, dst)