"""
Implements a way to get close examples based
on the output of a machine learned model.
:githublink:`%|py|6`
"""
import numpy
from .search_engine_predictions import SearchEnginePredictions
class SearchEnginePredictionImages(SearchEnginePredictions):
    """
    Extends class :class:`SearchEnginePredictions
    <mlinsights.search_rank.search_engine_predictions.SearchEnginePredictions>`.
    Vectors are coming from images. The metadata must contain
    information about path names. We assume all images can hold
    in memory. An example can be found in notebook
    :ref:`searchimageskerasrst` or :ref:`searchimagestorchrst`.
    Another example can be found there:
    `search_images_dogcat.py
    <https://github.com/sdpython/ensae_projects/blob/master/src/
    ensae_projects/restapi/search_images_dogcat.py>`_.
    """

    def _prepare_fit(self, data=None, features=None, metadata=None,
                     transform=None, n=None, fLOG=None):
        """
        Stores data in the class itself.

        :param data: a dataframe or None if the
            features and the metadata
            are specified with an array and a
            dictionary
        :param features: features columns or an array
        :param metadata: data
        :param transform: transform each vector before using it
        :param n: takes *n* images (or ``len(iter_images)``)
        :param fLOG: logging function
        :raises NotImplementedError: if a keras iterator does not expose
            filenames or is of an unsupported type
        :raises ValueError: if the keras iterator batch size is not 1
        :raises TypeError: if *data* is neither a torch nor a keras object
        """
        if "torch" in str(type(data)):
            self.module_ = "torch"
            # Delayed import: torch is not necessarily installed.
            from torch.utils.data import DataLoader  # pylint: disable=E0401,C0415,E0611
            dataloader = DataLoader(
                data, batch_size=1, shuffle=False, num_workers=0)
            # Pairs every batch of one image with its (path, class) sample.
            self.iter_images_ = iter_images = iter(
                zip(dataloader, data.samples))
            if n is None:
                n = len(data)
        elif "keras" in str(type(data)):
            self.module_ = "keras"
            iter_images = data
            # We delay the import as keras backend is not necessarily installed.
            from keras.preprocessing.image import Iterator  # pylint: disable=E0401,C0415,E0611
            from keras_preprocessing.image import DirectoryIterator, NumpyArrayIterator  # pylint: disable=E0401,C0415
            if not isinstance(iter_images, (Iterator, DirectoryIterator, NumpyArrayIterator)):
                raise NotImplementedError(  # pragma: no cover
                    "iter_images must be a keras Iterator. No option implemented for type {0}."
                    "".format(type(iter_images)))
            if iter_images.batch_size != 1:
                # Batches bigger than one would break the name/image pairing below.
                raise ValueError(  # pragma: no cover
                    "batch_size must be 1 not {0}".format(
                        iter_images.batch_size))
            self.iter_images_ = iter_images
            if n is None:
                n = len(iter_images)
            if not hasattr(iter_images, "filenames"):
                raise NotImplementedError(  # pragma: no cover
                    "Iterator does not iterate on images but numpy arrays (not implemented).")
        else:
            raise TypeError(  # pragma: no cover
                "Unexpected data type {0}.".format(type(data)))

        def get_current_index(flow):
            "get current index"
            # The keras iterator increments batch_index before yielding,
            # hence the -1 modulo arithmetic to recover the current sample.
            return flow.index_array[(flow.batch_index + flow.n - 1) % flow.n]

        def iterator_feature_meta():
            "iterators on metadata"
            def accessor(iter_images):
                if hasattr(iter_images, 'filenames'):
                    # keras
                    return (lambda i, ite: (ite, iter_images.filenames[get_current_index(iter_images)]))
                # torch
                return (lambda i, ite: (ite[0], ite[1][0]))

            acc = accessor(iter_images)
            # zip(range(n), ...) caps the (possibly infinite) iterator at n images.
            for i, it in zip(range(n), iter_images):
                im, name = acc(i, it)
                if not isinstance(name, str):
                    raise TypeError(  # pragma: no cover
                        "name should be a string, not {0}".format(type(name)))
                yield im[0], dict(name=name, i=i)
                if fLOG and i % 10000 == 0:
                    fLOG(
                        '[SearchEnginePredictionImages.fit] i={}/{} - {}'.format(i, n, name))

        super()._prepare_fit(data=iterator_feature_meta(), transform=transform)

    def fit(self, iter_images, n=None, fLOG=None):
        """
        Processes images through the model and fits a *k-nn*.

        :param iter_images: `Iterator <https://github.com/fchollet/keras/blob/master/keras/preprocessing/image.py#L719>`_
            or a torch dataset
        :param n: takes *n* images (or ``len(iter_images)``)
        :param fLOG: logging function
        :return: result of :meth:`_fit_knn`
        """
        self._prepare_fit(data=iter_images, transform=self.fct, n=n, fLOG=fLOG)
        return self._fit_knn()

    def kneighbors(self, iter_images, n_neighbors=None):
        """
        Searches for neighbors close to the first image
        returned by *iter_images*. It returns the neighbors
        only for the first image.

        :param iter_images: `Iterator <https://github.com/fchollet/keras/blob/master/keras/preprocessing/image.py#L719>`_,
            a torch tensor, a :epkg:`numpy:array` or a list of any of those
        :return: score, ind, meta

        *score* is an array representing the lengths to points,
        *ind* contains the indices of the nearest points in the population matrix,
        *meta* is the metadata.
        """
        if isinstance(iter_images, numpy.ndarray):
            if self.module_ == "keras":
                raise NotImplementedError("Not yet implemented for Keras.")
            if self.module_ == "torch":
                # Delayed import: torch is not necessarily installed.
                from torch import from_numpy  # pylint: disable=E0611,E0401,C0415
                # Prepend a batch dimension of 1.
                X = from_numpy(iter_images[numpy.newaxis, :, :, :])
                return super().kneighbors(X, n_neighbors=n_neighbors)
            raise RuntimeError(
                "Unknown module '{0}'.".format(self.module_))
        elif "keras" in str(iter_images):
            if self.module_ != "keras":
                raise RuntimeError(  # pragma: no cover
                    "Keras object but {0} was used to train the KNN.".format(self.module_))
            # We delay the import as keras backend is not necessarily installed.
            # keras, it expects an iterator.
            from keras.preprocessing.image import Iterator  # pylint: disable=E0401,C0415,E0611
            from keras_preprocessing.image import DirectoryIterator, NumpyArrayIterator  # pylint: disable=E0401,C0415,E0611
            if not isinstance(iter_images, (Iterator, DirectoryIterator, NumpyArrayIterator)):
                raise NotImplementedError(  # pragma: no cover
                    "iter_images must be a keras Iterator. No option implemented for type {0}.".format(type(iter_images)))
            if iter_images.batch_size != 1:
                raise ValueError(  # pragma: no cover
                    "batch_size must be 1 not {0}".format(
                        iter_images.batch_size))
            # Only the first yielded image is searched for.
            for img in iter_images:
                X = img[0]
                break
            return super().kneighbors(X, n_neighbors=n_neighbors)
        elif "torch" in str(type(iter_images)):
            if self.module_ != "torch":
                raise RuntimeError(  # pragma: no cover
                    "Torch object but {0} was used to train the KNN.".format(self.module_))
            # torch: it expects a tensor
            X = iter_images
            return super().kneighbors(X, n_neighbors=n_neighbors)
        elif isinstance(iter_images, list):
            # Recurse on every element, then stack the per-image results.
            res = [self.kneighbors(it, n_neighbors=n_neighbors)
                   for it in iter_images]
            return (numpy.vstack([_[0] for _ in res]),
                    numpy.vstack([_[1] for _ in res]),
                    numpy.vstack([_[2] for _ in res]))
        else:
            raise TypeError(  # pragma: no cover
                "Unexpected type {0} in SearchEnginePredictionImages.kneighbors".format(
                    type(iter_images)))