Source code for mlinsights.search_rank.search_engine_vectors

"""
Implements a way to find close examples based
on the output of a machine-learned model.


:githublink:`%|py|6`
"""
import json
import zipfile
import pandas
import numpy
from sklearn.neighbors import NearestNeighbors
from pandas_streaming.df import to_zip, read_zip
from ..helpers.parameters import format_function_call


class SearchEngineVectors:
    """
    Implements a kind of local search engine which
    looks for similar results assuming they are vectors.
    The class is using
    :epkg:`sklearn:neighborsNearestNeighbors` to find
    the nearest neighbors of a vector and follows
    the same API.
    The class populates members:

    * ``features_``: vectors used to compute the neighbors
    * ``knn_``: fitted :epkg:`sklearn:neighborsNearestNeighbors`
    * ``metadata_``: metadata, can be None
    """

    def __init__(self, **pknn):
        """
        :param      pknn:        list of parameters, see :epkg:`sklearn:neighborsNearestNeighbors`,
                                 stored as is and forwarded when the neighbors are fitted
        """
        self.pknn = pknn

    def __repr__(self):
        """
        Returns the class name followed by the parameters received
        by the constructor.
        """
        return format_function_call(self.__class__.__name__, self.pknn)

    def _is_iterable(self, data):
        """
        Tells if an object is an iterator or not.
        Lists, tuples, dataframes and arrays can be iterated on
        but are deliberately not considered as iterators here.

        :param      data:        any object
        :return:                 boolean
        """
        try:
            iter(data)
        except TypeError:
            return False
        return not isinstance(data, (list, tuple, pandas.DataFrame, numpy.ndarray))

    def _prepare_fit(self, data=None, features=None, metadata=None, transform=None):
        """
        Stores data in the class itself (members ``features_``
        and ``metadata_``).

        :param      data:        a :epkg:`dataframe`, an iterator on tuples
                                 *(vector, dictionary)* or None if the
                                 features and the metadata
                                 are specified with an array and a
                                 dictionary
        :param      features:    features columns or an array
        :param      metadata:    data
        :param      transform:   transform each vector before using it

        *transform* is a function whose signature::

            def transform(vec, many):
                # Many tells is the functions receives many vectors
                # or just one (many=False).

        Function *transform* is applied only if
        *data* is an iterator, each vector is then transformed
        with ``transform(vec, False)``.

        :raises TypeError: if a row, a vector or *features* has not the expected type
        :raises ValueError: if the parameters are inconsistent or the iterator is empty
        """
        if self._is_iterable(data):
            # Defensive check: None is never seen as an iterator unless
            # a subclass overrides _is_iterable.
            if data is None:
                raise ValueError(  # pragma: no cover
                    "iterator is True, data must be specified.")
            if features is not None:
                raise ValueError(  # pragma: no cover
                    "iterator is True, features must be None.")
            if metadata is not None:
                raise ValueError(  # pragma: no cover
                    "iterator is True, metadata must be None.")
            metas = []
            arrays = []
            for row in data:
                if not isinstance(row, tuple):
                    raise TypeError(  # pragma: no cover
                        'data must be an iterator on tuple')
                if len(row) != 2:
                    raise ValueError(  # pragma: no cover
                        'data must be an iterator on tuple on two elements')
                arr, meta = row
                if not isinstance(meta, dict):
                    raise TypeError(  # pragma: no cover
                        'Second element of the tuple must be a dictionary')
                metas.append(meta)
                tradd = arr if transform is None else transform(arr, False)
                if not isinstance(tradd, numpy.ndarray):
                    if transform is None:
                        raise TypeError(  # pragma: no cover
                            "feature should be of type numpy.array not {}".format(type(tradd)))
                    raise TypeError(  # pragma: no cover
                        "output of method transform ({}) should be of type numpy.array not {}".format(
                            transform, type(tradd)))
                arrays.append(tradd)
            if not arrays:
                # numpy.vstack fails with an obscure message on an empty list,
                # the same exception type is raised with an explicit one.
                raise ValueError(
                    "data is an empty iterator, at least one row is required.")
            self.features_ = numpy.vstack(arrays)
            self.metadata_ = pandas.DataFrame(metas)
        elif data is None:
            if not isinstance(features, numpy.ndarray):
                raise TypeError(  # pragma: no cover
                    "features must be an array if data is None")
            self.features_ = features
            self.metadata_ = metadata
        else:
            if not isinstance(data, pandas.DataFrame):
                raise ValueError(  # pragma: no cover
                    "data should be a dataframe")
            self.features_ = data[features]
            self.metadata_ = data[metadata] if metadata else None

    def fit(self, data=None, features=None, metadata=None):
        """
        Every vector comes with a list of metadata.

        :param      data:        a dataframe or None if the
                                 features and the metadata
                                 are specified with an array and a
                                 dictionary
        :param      features:    features columns or an array
        :param      metadata:    data
        :return:                 self
        """
        self._prepare_fit(data=data, features=features, metadata=metadata)
        return self._fit_knn()

    def _fit_knn(self):
        """
        Fits the nearest neighbors on ``features_`` with the parameters
        given to the constructor and stores the result in ``knn_``.

        :return:                 self
        """
        self.knn_ = NearestNeighbors(**self.pknn)
        self.knn_.fit(self.features_)
        return self

    def _first_pass(self, X, n_neighbors=None):
        """
        Finds the closest *n_neighbors*.

        :param      X:               features, a single vector given as a flat list,
                                     a one dimension array or an array with one row
        :param      n_neighbors:     number of neighbors to get (default is the value passed to the constructor)
        :return:                     *dist*, *ind*

        *dist* is an array representing the lengths to points,
        *ind* contains the indices of the nearest points in the population matrix.
        Both are flattened.

        :raises TypeError: if *X* is empty or holds more than one vector
        """
        if isinstance(X, list):
            if len(X) == 0 or isinstance(X[0], (list, tuple)):
                raise TypeError(  # pragma: no cover
                    "X must be a list or a vector (1)")
            X = [X]
        elif isinstance(X, numpy.ndarray):
            if len(X.shape) > 1 and X.shape[0] != 1:
                raise TypeError(  # pragma: no cover
                    "X must be a list or a vector (2)")
            if len(X.shape) == 1:
                # A flat list is wrapped into one row above, a flat array
                # gets the same treatment as the k-nn expects a 2D input.
                X = X.reshape(1, -1)
        dist, ind = self.knn_.kneighbors(
            X, n_neighbors=n_neighbors, return_distance=True)
        return dist.ravel(), ind.ravel()

    def _second_pass(self, X, dist, ind):
        """
        Reorders the closest *n_neighbors*.
        This implementation returns *dist* and *ind* unchanged.

        :param      X:               features
        :param      dist:            array representing the lengths to points
        :param      ind:             indices of the nearest points in the population matrix
        :return:                     *score*, *ind*

        *score* is an array representing the lengths to points,
        *ind* contains the indices of the nearest points in the population matrix.
        """
        return dist, ind

    def _select_metadata(self, ind):
        """
        Returns the metadata of the rows whose positions are *ind*.

        :param      ind:             positions of the rows to retrieve
        :return:                     None if there is no metadata, the selected rows otherwise

        Positional selection on rows only works for a dataframe,
        a series and an array of any dimension.
        """
        meta = self.metadata_
        if meta is None:
            return None
        if hasattr(meta, 'iloc'):
            # ``iloc[ind]`` selects rows for a dataframe and a series,
            # ``iloc[ind, :]`` fails on a series.
            return meta.iloc[ind]
        return meta[ind]

    def kneighbors(self, X, n_neighbors=None):
        """
        Searches for neighbors close to *X*.

        :param      X:               features
        :param      n_neighbors:     number of neighbors to get (default is the value passed to the constructor)
        :return:                     score, ind, meta

        *score* is an array representing the lengths to points,
        *ind* contains the indices of the nearest points in the population matrix,
        *meta* is the metadata
        """
        dist, ind = self._first_pass(X, n_neighbors=n_neighbors)
        score, ind = self._second_pass(X, dist, ind)
        return score, ind, self._select_metadata(ind)

    def to_zip(self, zipfilename, **kwargs):
        """
        Saves the features and the metadata into a zipfile.
        The function does not save the *k-nn*, only its parameters.

        :param      zipfilename: a :epkg:`*py:zipfile:ZipFile` or a filename
        :param      kwargs:      parameters for :epkg:`pandas:to_csv` (for the metadata),
                                 ``index`` is False unless it is specified
        :return:                 zipfilename

        The function relies on function
        `to_zip <http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/pandas_streaming/df/
        dataframe_io.html#pandas_streaming.df.dataframe_io.to_zip>`_.
        It only works for :epkg:`Python` 3.6+.
        A zipfile opened by this method is always closed,
        a zipfile given by the caller is left open.
        """
        owns_zip = isinstance(zipfilename, str)
        zf = zipfile.ZipFile(zipfilename, 'w') if owns_zip else zipfilename
        # The index is dropped by default but the caller's choice prevails.
        kwargs.setdefault('index', False)
        try:
            to_zip(self.features_, zf, 'SearchEngineVectors-features.npy')
            to_zip(self.metadata_, zf, 'SearchEngineVectors-metadata.csv', **kwargs)
            zf.writestr('SearchEngineVectors-knn.json', json.dumps(self.pknn))
        finally:
            if owns_zip:
                zf.close()
        return zipfilename

    @staticmethod
    def read_zip(zipfilename, **kwargs):
        """
        Restore the features, the metadata to a :class:`SearchEngineVectors <mlinsights.search_rank.search_engine_vectors.SearchEngineVectors>`.
        The *k-nn* is fitted again with the restored parameters.

        :param      zipfilename: a :epkg:`*py:zipfile:ZipFile` or a filename
        :param      kwargs:      parameters for :epkg:`pandas:read_csv`
        :return:                 :class:`SearchEngineVectors <mlinsights.search_rank.search_engine_vectors.SearchEngineVectors>`

        It only works for :epkg:`Python` 3.6+.
        A zipfile opened by this method is always closed,
        a zipfile given by the caller is left open.
        """
        owns_zip = isinstance(zipfilename, str)
        zf = zipfile.ZipFile(zipfilename, 'r') if owns_zip else zipfilename
        try:
            feat = read_zip(zf, 'SearchEngineVectors-features.npy')
            meta = read_zip(zf, 'SearchEngineVectors-metadata.csv', **kwargs)
            knn = json.loads(zf.read('SearchEngineVectors-knn.json'))
        finally:
            if owns_zip:
                zf.close()

        obj = SearchEngineVectors(**knn)
        obj.fit(features=feat, metadata=meta)
        return obj
Source code for mlinsights.search_rank.search_engine_vectors

mlinsights

Navigation

Related Topics