Source code for mlinsights.timeseries.patterns

"""
Find patterns in timeseries.


:githublink:`%|py|5`
"""
import numpy
import pandas
from sklearn.cluster import KMeans
from .agg import aggregate_timeseries


def find_ts_group_pattern(ttime, values, names, name_subset=None,
                          per='week', unit='half-hour', agg='sum',
                          estimator=None, fLOG=None):
    """
    Clusters times series to find similar patterns.

    The rows are split by group (distinct values of *names*). Each group
    is aggregated with ``aggregate_timeseries``, the aggregated frames are
    aligned on their index, and each group is flattened into a single
    feature vector. The estimator is fitted on these vectors (one row per
    group) and its prediction is broadcast back to every input row.

    :param ttime: time column, a :class:`numpy.ndarray`
    :param values: features to use to cluster, a :class:`numpy.ndarray`
    :param names: column which holds group name,
        a :class:`numpy.ndarray`
    :param name_subset: subset of groups to study, None for all
    :param per: aggregation period (default is per week),
        forwarded to ``aggregate_timeseries``
    :param unit: time unit, forwarded to ``aggregate_timeseries``
    :param agg: aggregation function, forwarded to
        ``aggregate_timeseries``
    :param estimator: estimator used to find patterns, it must implement
        ``fit``, ``predict`` and ``transform``, by default a
        :epkg:`sklearn:cluster:KMeans` built with its default parameters
    :param fLOG: logging function
    :return: found clusters, distances, both have one row per row of
        *ttime*; rows whose group was not studied receive cluster ``-1``
        and distances filled with ``nan``
    :raises TypeError: if *ttime*, *values* or *names* is not a
        :class:`numpy.ndarray`
    :raises ValueError: if there is no group to study

    :githublink:`%|py|27`
    """
    for var, na in zip([ttime, values, names], ['ttime', 'values', 'names']):
        if not isinstance(var, numpy.ndarray):
            raise TypeError(
                "'{}' must be an array not {}".format(na, type(var)))

    # Groups are taken in order of first appearance rather than in set
    # order: set order depends on hash randomization, which made the row
    # order of the feature matrix (and so the clustering) vary between runs.
    gr_names = list(dict.fromkeys(names))
    if name_subset is not None:
        subset = set(name_subset)
        gr_names = [name for name in gr_names if name in subset]
    if not gr_names:
        # Fail early with an explicit message instead of letting
        # pandas.concat raise on an empty list further down.
        raise ValueError(
            "[find_ts_group_pattern] no group to study: 'names' is empty "
            "or shares no value with 'name_subset'")

    # builds features
    if fLOG:
        fLOG(  # pragma: no cover
            '[find_ts_group_pattern] build features, {} groups'.format(
                len(gr_names)))
    to_merge = []
    for name in gr_names:
        indices = names == name
        gr = aggregate_timeseries(None, ttime[indices], values[indices],
                                  unit=unit, agg=agg, per=per)
        # The first column becomes the index so that the concatenation
        # below aligns all groups on it (presumably the aggregated time
        # key -- to be confirmed against aggregate_timeseries).
        to_merge.append(gr.set_index(gr.columns[0]))

    if fLOG:
        fLOG(  # pragma: no cover
            '[find_ts_group_pattern] merge features')
    # Keys missing for a group are filled with 0 after the alignment.
    all_merged = pandas.concat(to_merge, axis=1).fillna(0)
    # Every group is expected to contribute the same number of columns.
    ncol = all_merged.shape[1] // len(gr_names)
    gr_feats = numpy.vstack([
        all_merged.iloc[:, i * ncol:(i + 1) * ncol].values.ravel()
        for i in range(len(gr_names))])

    # cluster
    if fLOG:
        fLOG(  # pragma: no cover
            '[find_ts_group_pattern] clustering, shape={}'.format(
                gr_feats.shape))
    if estimator is None:
        estimator = KMeans()
    estimator.fit(gr_feats)

    # predicted clusters
    pred = estimator.predict(gr_feats)
    dist = estimator.transform(gr_feats)
    if fLOG:
        fLOG(  # pragma: no cover
            '[find_ts_group_pattern] number of clusters: {}'.format(
                len(set(pred))))

    # Broadcasts the result obtained for each group to all of its rows.
    row_name = {n: i for i, n in enumerate(gr_names)}
    n_rows = ttime.shape[0]
    clusters = numpy.empty(n_rows, dtype=pred.dtype)
    dists = numpy.empty((n_rows, dist.shape[1]), dtype=dist.dtype)
    for i in range(n_rows):
        index = row_name.get(names[i])
        if index is None:
            # group excluded by name_subset
            clusters[i] = -1
            dists[i, :] = numpy.nan
        else:
            clusters[i] = pred[index]
            dists[i, :] = dist[index, :]
    return clusters, dists