Source code for mlinsights.search_rank.search_engine_predictions_images

"""
Implements a way to get close examples based
on the output of a machine learned model.


:githublink:`%|py|6`
"""
import numpy
from .search_engine_predictions import SearchEnginePredictions


class SearchEnginePredictionImages(SearchEnginePredictions):
    """
    Extends class :class:`SearchEnginePredictions <mlinsights.search_rank.search_engine_predictions.SearchEnginePredictions>`.
    Vectors are coming from images. The metadata must contain
    information about path names. We assume all images can hold
    in memory. An example can be found in notebook
    :ref:`searchimageskerasrst` or :ref:`searchimagestorchrst`.
    Another example can be found there:
    `search_images_dogcat.py
    <https://github.com/sdpython/ensae_projects/blob/master/src/
    ensae_projects/restapi/search_images_dogcat.py>`_.
    """

    @staticmethod
    def _check_keras_iterator(iter_images):
        """
        Checks that *iter_images* is a keras iterator producing
        batches of exactly one image.

        :param      iter_images: object to validate
        :raises NotImplementedError: if *iter_images* is not one of the
                                 supported keras iterator classes
        :raises ValueError:      if ``iter_images.batch_size`` is not 1
        """
        # We delay the import as keras backend is not necessarily installed.
        from keras.preprocessing.image import Iterator  # pylint: disable=E0401,C0415,E0611
        from keras_preprocessing.image import DirectoryIterator, NumpyArrayIterator  # pylint: disable=E0401,C0415,E0611
        if not isinstance(iter_images, (Iterator, DirectoryIterator, NumpyArrayIterator)):
            raise NotImplementedError(  # pragma: no cover
                "iter_images must be a keras Iterator. No option implemented for type {0}."
                "".format(type(iter_images)))
        if iter_images.batch_size != 1:
            raise ValueError(  # pragma: no cover
                "batch_size must be 1 not {0}".format(
                    iter_images.batch_size))

    def _prepare_fit(self, data=None, features=None, metadata=None,
                     transform=None, n=None, fLOG=None):
        """
        Wraps an image source into a generator of
        ``(image, metadata)`` pairs and hands it to the parent class.

        The backend is detected from the type name of *data* and
        remembered in ``self.module_`` (``"torch"`` or ``"keras"``);
        the underlying iterator is kept in ``self.iter_images_``.

        :param      data:        image source, either an object whose type
                                 name contains ``torch`` (it must expose
                                 ``samples`` and be accepted by
                                 ``torch.utils.data.DataLoader``) or a keras
                                 iterator with ``batch_size == 1`` and a
                                 ``filenames`` attribute
        :param      features:    unused by this implementation, kept for
                                 signature compatibility with the parent class
        :param      metadata:    unused by this implementation, kept for
                                 signature compatibility with the parent class
        :param      transform:   transform each vector before using it,
                                 forwarded to the parent ``_prepare_fit``
        :param      n:           takes *n* images (or ``len(data)``)
        :param      fLOG:        logging function, called every 10000 images
        :raises TypeError:       if the type of *data* is neither torch nor
                                 keras, or if an image name is not a string
        :raises NotImplementedError: if a keras iterator has no ``filenames``
        """
        data_type = str(type(data))
        if "torch" in data_type:
            self.module_ = "torch"
            from torch.utils.data import DataLoader  # pylint: disable=E0401,C0415,E0611
            dataloader = DataLoader(
                data, batch_size=1, shuffle=False, num_workers=0)
            # Every batch is paired with the matching entry of data.samples,
            # the first field of that entry is used as the image name.
            self.iter_images_ = iter_images = iter(
                zip(dataloader, data.samples))
            if n is None:
                n = len(data)
        elif "keras" in data_type:
            self.module_ = "keras"
            iter_images = data
            self._check_keras_iterator(iter_images)
            self.iter_images_ = iter_images
            if n is None:
                n = len(iter_images)
            if not hasattr(iter_images, "filenames"):
                raise NotImplementedError(  # pragma: no cover
                    "Iterator does not iterate on images but numpy arrays (not implemented).")
        else:
            raise TypeError(  # pragma: no cover
                "Unexpected data type {0}.".format(type(data)))

        if hasattr(iter_images, 'filenames'):
            def split_item(item):
                "keras: returns the batch and the file name of the batch just produced"
                flow = iter_images
                # NOTE(review): presumably batch_index already points to the
                # next batch once an item was returned, hence the -1
                # (modulo n) to get back to the current one.
                current = flow.index_array[
                    (flow.batch_index + flow.n - 1) % flow.n]
                return item, flow.filenames[current]
        else:
            def split_item(item):
                "torch: item is (batch, sample), the name is the first field of sample"
                return item[0], item[1][0]

        def iterator_feature_meta():
            "iterators on metadata"
            # zip(range(n), ...) stops after n items without pulling
            # an extra element from iter_images.
            for i, item in zip(range(n), iter_images):
                im, name = split_item(item)
                if not isinstance(name, str):
                    raise TypeError(  # pragma: no cover
                        "name should be a string, not {0}".format(type(name)))
                yield im[0], dict(name=name, i=i)
                if fLOG and i % 10000 == 0:
                    fLOG(
                        '[SearchEnginePredictionImages.fit] i={}/{} - {}'.format(i, n, name))

        super()._prepare_fit(data=iterator_feature_meta(), transform=transform)

    def fit(self, iter_images, n=None, fLOG=None):
        """
        Processes images through the model and fits a *k-nn*.

        :param      iter_images: `Iterator <https://github.com/fchollet/keras/blob/master/keras/preprocessing/image.py#L719>`_
                                 or a torch dataset, see :meth:`_prepare_fit`
        :param      n:           takes *n* images (or ``len(iter_images)``)
        :param      fLOG:        logging function
        :return:                 the result of ``self._fit_knn()``
        """
        self._prepare_fit(data=iter_images, transform=self.fct, n=n, fLOG=fLOG)
        return self._fit_knn()

    def kneighbors(self, iter_images, n_neighbors=None):
        """
        Searches for neighbors close to the first image
        returned by *iter_images*. It returns the neighbors
        only for the first image.

        :param      iter_images: a numpy array (torch backend only), a keras
                                 `Iterator <https://github.com/fchollet/keras/blob/master/keras/preprocessing/image.py#L719>`_,
                                 a torch object, or a list of any of these
        :param      n_neighbors: forwarded to the parent ``kneighbors``
        :return:                 score, ind, meta

        *score* is an array representing the lengths to points,
        *ind* contains the indices of the nearest points in the population matrix,
        *meta* is the metadata. For a list, the results obtained
        for every element are stacked vertically.

        :raises NotImplementedError: numpy array with the keras backend, or
                                 unsupported keras iterator class
        :raises RuntimeError:    the input does not match the backend used
                                 to train the k-nn
        :raises ValueError:      keras iterator with a batch size different
                                 from 1 or producing no image
        :raises TypeError:       unsupported input type
        """
        if isinstance(iter_images, numpy.ndarray):
            if self.module_ == "keras":
                raise NotImplementedError("Not yet implemented for Keras.")
            if self.module_ == "torch":
                from torch import from_numpy  # pylint: disable=E0611,E0401,C0415
                # A leading axis is added so that the image becomes a batch of one.
                X = from_numpy(iter_images[numpy.newaxis, :, :, :])
                return super().kneighbors(X, n_neighbors=n_neighbors)
            raise RuntimeError(
                "Unknown module '{0}'.".format(self.module_))

        # The dispatch relies on the type name, not on the string
        # representation of the value: the latter would format a whole
        # tensor and would send a list of keras iterators to the keras branch.
        type_name = str(type(iter_images))

        if "keras" in type_name:
            if self.module_ != "keras":
                raise RuntimeError(  # pragma: no cover
                    "Keras object but {0} was used to train the KNN.".format(self.module_))
            # keras, it expects an iterator.
            self._check_keras_iterator(iter_images)
            try:
                first = next(iter(iter_images))
            except StopIteration:
                raise ValueError(  # pragma: no cover
                    "iter_images does not produce any image.") from None
            X = first[0]
            return super().kneighbors(X, n_neighbors=n_neighbors)

        if "torch" in type_name:
            if self.module_ != "torch":
                raise RuntimeError(  # pragma: no cover
                    "Torch object but {0} was used to train the KNN.".format(self.module_))
            # torch: it expects a tensor
            X = iter_images
            return super().kneighbors(X, n_neighbors=n_neighbors)

        if isinstance(iter_images, list):
            res = [self.kneighbors(it, n_neighbors=n_neighbors)
                   for it in iter_images]
            return (numpy.vstack([_[0] for _ in res]),
                    numpy.vstack([_[1] for _ in res]),
                    numpy.vstack([_[2] for _ in res]))

        raise TypeError(  # pragma: no cover
            "Unexpected type {0} in SearchEnginePredictionImages.kneighbors".format(
                type(iter_images)))
Source code for mlinsights.search_rank.search_engine_predictions_images

mlinsights

Navigation

Related Topics