Source code for mlinsights.search_rank.search_engine_vectors

"""
Implements a way to get close examples based
on the output of a machine learned model.


:githublink:`%|py|6`
"""
import json
import zipfile
import pandas
import numpy
from sklearn.neighbors import NearestNeighbors
from pandas_streaming.df import to_zip, read_zip
from ..helpers.parameters import format_function_call


[docs]class SearchEngineVectors: """ Implements a kind of local search engine which looks for similar results assuming they are vectors. The class is using :epkg:`sklearn:neighborsNearestNeighbors` to find the nearest neighbors of a vector and follows the same API. The class populates members: * ``features_``: vectors used to compute the neighbors * ``knn_``: parameters for the :epkg:`sklearn:neighborsNearestNeighbors` * ``metadata_``: metadata, can be None :githublink:`%|py|28` """
[docs] def __init__(self, **pknn): """ :param pknn: list of parameters, see :epkg:`sklearn:neighborsNearestNeighbors` :githublink:`%|py|33` """ self.pknn = pknn
[docs] def __repr__(self): """ usual :githublink:`%|py|39` """ return format_function_call(self.__class__.__name__, self.pknn)
[docs] def _is_iterable(self, data): """ Tells if an objet is an iterator or not. :githublink:`%|py|45` """ try: iter(data) return not isinstance(data, (list, tuple, pandas.DataFrame, numpy.ndarray)) except TypeError: return False
[docs] def _prepare_fit(self, data=None, features=None, metadata=None, transform=None): """ Stores data in the class itself. :param data: a :epkg:`dataframe` or None if the the features and the metadata are specified with an array and a dictionary :param features: features columns or an array :param metadata: data :param transform: transform each vector before using it *transform* is a function whose signature:: def transform(vec, many): # Many tells is the functions receives many vectors # or just one (many=False). Function *transform* is applied only if *data* is not None. :githublink:`%|py|72` """ iterate = self._is_iterable(data) if iterate: if data is None: raise ValueError("iterator is True, data must be specified.") if features is not None: raise ValueError("iterator is True, features must be None.") if metadata is not None: raise ValueError("iterator is True, metadata must be None.") metas = [] arrays = [] for row in data: if not isinstance(row, tuple): raise TypeError('data must be an iterator on tuple') if len(row) != 2: raise ValueError( 'data must be an iterator on tuple on two elements') arr, meta = row if not isinstance(meta, dict): raise TypeError( 'Second element of the tuple must be a dictionary') metas.append(meta) if transform is None: tradd = arr else: tradd = transform(arr, False) if not isinstance(tradd, numpy.ndarray): if transform is None: raise TypeError( "feature should be of type numpy.array not {}".format(type(tradd))) else: raise TypeError("output of method transform ({}) should be of type numpy.array not {}".format( transform, type(tradd))) arrays.append(tradd) self.features_ = numpy.vstack(arrays) self.metadata_ = pandas.DataFrame(metas) elif data is None: if not isinstance(features, numpy.ndarray): raise TypeError("features must be an array if data is None") self.features_ = features self.metadata_ = metadata else: if not isinstance(data, pandas.DataFrame): raise ValueError("data should be a dataframe") self.features_ = data[features] self.metadata_ = data[metadata] if metadata else None
[docs] def fit(self, data=None, features=None, metadata=None): """ Every vector comes with a list of metadata. :param data: a dataframe or None if the the features and the metadata are specified with an array and a dictionary :param features: features columns or an array :param metadata: data :githublink:`%|py|129` """ self._prepare_fit(data=data, features=features, metadata=metadata) return self._fit_knn()
[docs] def _fit_knn(self): """ Fits the nearest neighbors. :githublink:`%|py|136` """ self.knn_ = NearestNeighbors(**self.pknn) self.knn_.fit(self.features_) return self
[docs] def _first_pass(self, X, n_neighbors=None): """ Finds the closest *n_neighbors*. :param X: features :param n_neighbors: number of neighbors to get (default is the value passed to the constructor) :return: *dist*, *ind* *dist* is an array representing the lengths to points, *ind* contains the indices of the nearest points in the population matrix. :githublink:`%|py|151` """ if isinstance(X, list): if len(X) == 0 or isinstance(X[0], (list, tuple)): raise TypeError("X must be a list or a vector (1)") X = [X] if isinstance(X, numpy.ndarray) and (len(X.shape) > 1 and X.shape[0] != 1): raise TypeError("X must be a list or a vector (2)") dist, ind = self.knn_.kneighbors( X, n_neighbors=n_neighbors, return_distance=True) ind = ind.ravel() dist = dist.ravel() return dist, ind
[docs] def _second_pass(self, X, dist, ind): """ Reorders the closest *n_neighbors*. :param X: features :param dist: array representing the lengths to points :param ind: indices of the nearest points in the population matrix :return: *score*, *ind* *score* is an array representing the lengths to points, *ind* contains the indices of the nearest points in the population matrix. :githublink:`%|py|175` """ return dist, ind
[docs] def kneighbors(self, X, n_neighbors=None): """ Searches for neighbors close to *X*. :param X: features :return: score, ind, meta *score* is an array representing the lengths to points, *ind* contains the indices of the nearest points in the population matrix, *meta* is the metadata :githublink:`%|py|188` """ dist, ind = self._first_pass(X, n_neighbors=n_neighbors) score, ind = self._second_pass(X, dist, ind) rind = ind if self.metadata_ is None: rmeta = None elif hasattr(self.metadata_, 'iloc'): rmeta = self.metadata_.iloc[ind, :] elif len(self.metadata_.shape) == 1: rmeta = self.metadata_[ind] else: rmeta = self.metadata_[ind, :] return score, rind, rmeta
[docs] def to_zip(self, zipfilename, **kwargs): """ Saves the features and the metadata into a zipfile. The function does not save the *k-nn*. :param zipfilename: a :epkg:`*py:zipfile:ZipFile` or a filename :param kwargs: parameters for :epkg:`pandas:to_csv` (for the metadata) :return: zipfilename The function relies on function `to_zip <http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/pandas_streaming/df/ dataframe_io.html#pandas_streaming.df.dataframe_io.to_zip>`_. It only works for :epkg:`Python` 3.6+. :githublink:`%|py|215` """ if isinstance(zipfilename, str): zf = zipfile.ZipFile(zipfilename, 'w') close = True else: zf = zipfilename close = False if 'index' is not kwargs: kwargs['index'] = False to_zip(self.features_, zf, 'SearchEngineVectors-features.npy') to_zip(self.metadata_, zf, 'SearchEngineVectors-metadata.csv', **kwargs) js = json.dumps(self.pknn) zf.writestr('SearchEngineVectors-knn.json', js) if close: zf.close()
[docs] @staticmethod def read_zip(zipfilename, **kwargs): """ Restore the features, the metadata to a :class:`SearchEngineVectors <mlinsights.search_rank.search_engine_vectors.SearchEngineVectors>`. :param zipfilename: a :epkg:`*py:zipfile:ZipFile` or a filename :param zname: a filename in th zipfile :param kwargs: parameters for :epkg:`pandas:read_csv` :return: :class:`SearchEngineVectors <mlinsights.search_rank.search_engine_vectors.SearchEngineVectors>` It only works for :epkg:`Python` 3.6+. :githublink:`%|py|242` """ if isinstance(zipfilename, str): zf = zipfile.ZipFile(zipfilename, 'r') close = True else: zf = zipfilename close = False feat = read_zip(zf, 'SearchEngineVectors-features.npy') meta = read_zip(zf, 'SearchEngineVectors-metadata.csv', **kwargs) js = zf.read('SearchEngineVectors-knn.json') knn = json.loads(js) if close: zf.close() obj = SearchEngineVectors(**knn) obj.fit(features=feat, metadata=meta) return obj