# Source code for pyensae.graphhelper.corrplot

"""
CorrPlot functionalities.

It comes from `corrplot.py <https://raw.githubusercontent.com/biokit/biokit/master/biokit/viz/corrplot.py>`_
which I copied here because the module does not properly work on Python 3 (import issues).
See also `biokit license <https://github.com/biokit/biokit/blob/master/LICENSE>`_.

:author: Thomas Cokelaer
:references: http://cran.r-project.org/web/packages/corrplot/vignettes/corrplot-intro.html


:githublink:`%|py|12`
"""
import numpy
from scipy.cluster.hierarchy import dendrogram, fcluster
import pandas
from .linkage import Linkage
from ._colormap import cmap_builder


class Corrplot(Linkage):
    """
    An implementation of correlation plotting tools (corrplot).

    The class requires `scipy <http://www.scipy.org/>`_ for the hierarchical
    clustering used to reorder rows and columns, and *matplotlib* for the
    drawing itself. Colormaps are built with the ``cmap_builder`` helper
    imported from the local ``_colormap`` module.

    Here is a simple example with a correlation matrix as an input
    (stored in a pandas dataframe):

    .. plot::
        :width: 50%
        :include-source:

        import pandas
        import numpy
        letters = "ABCDEFGHIJKLM"[0:10]
        df = pandas.DataFrame(dict(( (k, numpy.random.random(10)+ord(k)-65) for k in letters)))

        import matplotlib.pyplot as plt
        plt.close('all')
        plt.style.use('ggplot')

        from pyensae.graphhelper import Corrplot
        c = Corrplot(df)
        c.plot()
        plt.show()
    """

    def __init__(self, data, na=0):
        """
        Plots the content of square matrix that contains correlation values.

        :param data: input can be a dataframe (Pandas), or list of lists
            (python) or a numpy matrix. Note, however, that values must be
            between -1 and 1. If not, or if the matrix (or list of lists) is
            not squared, or if row and column labels differ, then the
            correlation is computed. The data or computed correlation is
            stored in :attr:`df` attribute.
        :param na: replace NA values with this value (default 0)

        The :attr:`params` contains some tunable parameters for the colorbar
        in the :meth:`plot` method.

        ::

            # can be a list of lists, the correlation matrix is then a 2x2 matrix
            c = Corrplot([[1,1], [2,4], [3,3], [4,4]])
        """
        super().__init__()
        # kept as an attribute so that :meth:`plot` builds colormaps through it
        self.cmap_builder = cmap_builder
        self.df = pandas.DataFrame(data, copy=True)

        n_rows, n_cols = self.df.shape
        out_of_range = self.df.max().max() > 1 or self.df.min().min() < -1
        not_square = n_rows != n_cols
        labels_differ = list(self.df.index) != list(self.df.columns)
        # anything that does not look like a correlation matrix is converted
        if out_of_range or not_square or labels_differ:
            self.df = self.df.corr()

        # replace NA with the requested value (zero by default)
        self.df.fillna(na, inplace=True)

        #: tunable parameters for the :meth:`plot` method.
        self.params = {
            'colorbar.N': 100,
            'colorbar.shrink': .8,
            'colorbar.orientation': 'vertical'}

    def _set_default_cmap(self):
        """
        Sets :attr:`cm` to the default colormap built by ``cmap_builder``
        from ``'#AA0000'``, ``'white'`` and ``'darkblue'``.
        """
        self.cm = self.cmap_builder('#AA0000', 'white', 'darkblue')

    def order(self, method='complete', metric='euclidean', inplace=False):
        """
        Rearranges the order of rows and columns after clustering.

        :param method: any scipy method (e.g., single, average, centroid,
            median, ward). See scipy.cluster.hierarchy.linkage
        :param metric: any scipy distance (euclidean, hamming, jaccard)
            See scipy.spatial.distance or scipy.cluster.hierarchy
        :param bool inplace: if set to True, the dataframe is replaced
        :return: the reordered dataframe; :attr:`df` itself (unchanged) when
            *method* or *metric* is None or when there are fewer than two
            rows to cluster

        The linkage matrix, the dendrogram, the leaves order and the flat
        clusters are stored in attributes ``Y``, ``Z``, ``idx1`` and ``ind1``.

        You probably do not need to use that method. Use :meth:`plot` and
        the two parameters order_metric and order_method instead.
        """
        if method is None or metric is None:
            return self.df
        if self.df.shape[0] < 2:
            # hierarchical clustering needs at least two observations
            return self.df

        # self.linkage is inherited from Linkage (defined in another module)
        Y = self.linkage(self.df, method=method, metric=metric)
        ind1 = fcluster(Y, 0.7 * max(Y[:, 2]), 'distance')
        Z = dendrogram(Y, no_plot=True)
        idx1 = Z['leaves']
        # same permutation applied to the rows, then to the columns
        cor2 = self.df.iloc[idx1].T.iloc[idx1].T

        # always keep the clustering results; an early return used to skip
        # these assignments whenever inplace was not True
        self.Y = Y
        self.Z = Z
        self.idx1 = idx1
        self.ind1 = ind1

        if inplace is True:
            self.df = cor2
        return cor2

    def plot(self, fig=None, grid=True, rotation=30, lower=None, upper=None,
             shrink=0.9, axisbg='white', colorbar=True, label_color='black',
             fontsize='small', edgecolor='black', method='ellipse',
             order_method='complete', order_metric='euclidean',
             cmap=None, ax=None, binarise_color=False, figsize=None):
        """
        Plots the correlation matrix from the content of :attr:`df`
        (dataframe).

        By default, the correlation is shown on the upper and lower triangle
        and is symmetric with respect to the diagonal. The symbols are
        ellipses. The symbols can be changed to e.g. rectangle. The symbols
        are shown on upper and lower sides but you could choose a symbol for
        the upper side and another for the lower side using the **lower**
        and **upper** parameters.

        :param fig: Create a new figure by default. If an instance of an
            existing figure is provided, the corrplot is overlayed on the
            figure provided. Can also be the number of the figure.
        :param grid: add grid (Defaults to grey color). You can set it to
            False or a color.
        :param rotation: rotate labels on the x-axis
        :param lower: if set to a valid method, plots the data on the lower
            left triangle
        :param upper: if set to a valid method, plots the data on the upper
            right triangle
        :param float shrink: maximum space used (in percent) by a symbol.
            If negative values are provided, the absolute value is taken.
            If greater than 1, the symbols will overlap.
        :param axisbg: color of the background (defaults to white).
        :param colorbar: add the colorbar (defaults to True). No colorbar is
            drawn when no patch was drawn (methods 'number' and 'text').
        :param str label_color: (defaults to black).
        :param fontsize: size of the fonts defaults to 'small'.
        :param edgecolor: edge color set on every patch (defaults to black).
        :param method: shape to be used in 'ellipse', 'square', 'rectangle',
            'color', 'text', 'circle', 'number', 'pie'.
        :param order_method: see
            :meth:`order <pyensae.graphhelper.corrplot.Corrplot.order>`.
        :param order_metric: see :meth:`order`.
        :param cmap: a colormap name or a sequence of colors given to
            ``cmap_builder`` (e.g., 'jet', or 'copper'). Default is
            red/white/blue colors. If the colormap cannot be built, the
            default one is used.
        :param binarise_color: two colors only, negative, positive
        :param ax: a matplotlib axes. It is cleared before drawing.
        :param figsize: gives that parameter to the new created figure
        :return: ax (matplotlib axes)
        :raises RuntimeError: if a colorbar is requested and
            ``params['colorbar.N']`` is lower than 1

        The colorbar can be tuned with the parameters stored in
        :attr:`params`. Here is an example. See notebook for other examples:

        ::

            c = corrplot.Corrplot(dataframe)
            c.plot(cmap=('Orange', 'white', 'green'))
            c.plot(method='circle')
            c.plot(colorbar=False, shrink=.8, upper='circle' )
        """
        import matplotlib.pyplot as plt  # pylint: disable=C0415

        if cmap is None:
            self._set_default_cmap()
        else:
            try:
                if isinstance(cmap, str):
                    self.cm = self.cmap_builder(cmap)
                else:
                    self.cm = self.cmap_builder(*cmap)
            except Exception:  # pylint: disable=W0703
                # deliberate best effort: an invalid colormap falls back to
                # the default one instead of failing the whole plot
                self._set_default_cmap()

        self.shrink = abs(shrink)
        self.fontsize = fontsize
        self.edgecolor = edgecolor
        self.binarise_color = binarise_color
        # reset so that a collection left by a previous call is never reused
        self.collection = None

        df = self.order(method=order_method, metric=order_metric)

        if ax is not None and fig is None:
            # draw in the axes given by the caller; creating a figure here
            # would only leave an empty, never closed, figure behind
            plt.sca(ax)
            ax.clear()
        else:
            # figure can be a number or an instance; otherwise creates it
            params = dict(facecolor=axisbg)
            if fig is None:
                params["num"] = None
            elif isinstance(fig, int):
                # an integer is already the figure number
                params["num"] = fig
            else:
                params["num"] = fig.number
            if figsize is not None:
                params["figsize"] = figsize
            fig = plt.figure(**params)

            # do we have an axes to plot the data in ?
            if ax is None:
                ax = plt.subplot(1, 1, 1, aspect='equal', facecolor=axisbg)
            else:
                # if so, clear the axes. Colorbar cannot be removed easily.
                plt.sca(ax)
                ax.clear()

            # subplot resets the bg color, let us set it again
            fig.set_facecolor(axisbg)

        width, height = df.shape
        labels = df.columns

        if upper is None and lower is None:
            mode = 'method'
        elif upper and lower:
            mode = 'both'
        elif lower is not None:
            mode = 'lower'
        else:
            mode = 'upper'

        if mode == 'upper':
            self._add_patches(df, upper, 'upper', ax, diagonal=True)
        elif mode == 'lower':
            self._add_patches(df, lower, 'lower', ax, diagonal=True)
        elif mode == 'method':
            self._add_patches(df, method, 'both', ax, diagonal=True)
        else:
            self._add_patches(df, upper, 'upper', ax, diagonal=False)
            self._add_patches(df, lower, 'lower', ax, diagonal=False)

        # shift the limits to englobe the patches correctly
        ax.set_xlim(-0.5, width - .5)
        ax.set_ylim(-0.5, height - .5)

        # set xticks/xlabels on top
        ax.xaxis.tick_top()
        xtickslocs = numpy.arange(len(labels))
        ax.set_xticks(xtickslocs)
        ax.set_xticklabels(labels, rotation=rotation, color=label_color,
                           fontsize=fontsize, ha='left')

        ax.invert_yaxis()
        ytickslocs = numpy.arange(len(labels))
        ax.set_yticks(ytickslocs)
        ax.set_yticklabels(labels, fontsize=fontsize, color=label_color)
        plt.tight_layout()

        if grid is not False:
            if grid is True:
                grid = 'grey'
            for i in range(0, width):
                ratio1 = float(i) / width
                ratio2 = float(i + 2) / width
                if mode == 'lower':
                    ax.axvline(i + .5, ymin=1 - ratio1, ymax=0., color=grid)
                    ax.axhline(i + .5, xmin=0, xmax=ratio2, color=grid)
                if mode == 'upper':
                    ax.axvline(i + .5, ymin=1 - ratio2, ymax=1, color=grid)
                    ax.axhline(i + .5, xmin=ratio1, xmax=1, color=grid)
                if mode in ['method', 'both']:
                    ax.axvline(i + .5, color=grid)
                    ax.axhline(i + .5, color=grid)

            # NOTE(review): the frame and the diagonal labels below are kept
            # under the grid test as in the upstream biokit layout; the
            # flattened listing does not show the indentation - confirm.
            # can probably be simplified
            if mode == 'lower':
                ax.axvline(-.5, ymin=0, ymax=1, color='grey')
                ax.axvline(width - .5, ymin=0, ymax=1. / width,
                           color='grey', lw=2)
                ax.axhline(width - .5, xmin=0, xmax=1, color='grey', lw=2)
                ax.axhline(-.5, xmin=0, xmax=1. / width, color='grey', lw=2)
                # Axes has no xticks method, set_xticks is the Axes API
                ax.set_xticks([])
                for i in range(0, width):
                    ax.text(i, i - .6, labels[i], fontsize=fontsize,
                            color=label_color, rotation=rotation,
                            verticalalignment='bottom')
                    ax.text(-.6, i, labels[i], fontsize=fontsize,
                            color=label_color, rotation=0,
                            horizontalalignment='right')
                ax.set_axis_off()
            # can probably be simplified
            elif mode == 'upper':
                ax.axvline(width - .5, ymin=0, ymax=1, color='grey', lw=2)
                ax.axvline(-.5, ymin=1 - 1. / width, ymax=1,
                           color='grey', lw=2)
                ax.axhline(-.5, xmin=0, xmax=1, color='grey', lw=2)
                ax.axhline(width - .5, xmin=1 - 1. / width, xmax=1,
                           color='grey', lw=2)
                # Axes has no yticks method, set_yticks is the Axes API
                ax.set_yticks([])
                for i in range(0, width):
                    ax.text(-.6 + i, i, labels[i], fontsize=fontsize,
                            color=label_color, horizontalalignment='right',
                            rotation=0)
                    ax.text(i, -.5, labels[i], fontsize=fontsize,
                            color=label_color, rotation=rotation,
                            verticalalignment='bottom')
                ax.set_axis_off()

        # set all ticks length to zero
        ax.tick_params(axis='both', which='both', length=0)

        if colorbar:
            N = self.params['colorbar.N'] + 1
            if N < 2:
                raise RuntimeError("No colorbar to draw.")
            # methods 'number' and 'text' draw no patch: nothing to map
            if self.collection is not None:
                cb = plt.gcf().colorbar(
                    self.collection,
                    orientation=self.params['colorbar.orientation'],
                    shrink=self.params['colorbar.shrink'],
                    boundaries=numpy.linspace(0, 1, N),
                    ticks=[0, .25, 0.5, 0.75, 1])
                # values are stored as (corr + 1) / 2, relabel them as
                # correlations; set_ticklabels follows the orientation
                cb.set_ticklabels([-1, -.5, 0, .5, 1])

        return ax

    def _add_patches(self, df, method, fill, ax, diagonal=True):
        """
        Draws one symbol per cell of *df* into *ax*.

        :param df: dataframe of values, read with ``df.iloc[x, y]``
        :param method: symbol, one of 'ellipse', 'square', 'rectangle',
            'color', 'circle', 'number', 'text', 'pie'
        :param fill: ``'lower'`` skips cells where ``x > y``, ``'upper'``
            skips cells where ``x < y``, any other value keeps all cells
        :param ax: matplotlib axes receiving the texts and the collection
        :param diagonal: if False, cells where ``x == y`` are skipped
        :raises ValueError: if *method* is not a known symbol

        The method reads attributes ``shrink``, ``edgecolor``, ``fontsize``,
        ``cm`` and ``binarise_color`` set by :meth:`plot`. When at least one
        patch is created, the resulting collection is stored in attribute
        ``collection``.
        """
        from matplotlib.patches import Ellipse, Circle, Rectangle, Wedge
        from matplotlib.collections import PatchCollection

        width, height = df.shape

        patches = []
        colors = []
        for x in range(width):
            for y in range(height):
                if fill == 'lower' and x > y:
                    continue
                if fill == 'upper' and x < y:
                    continue
                if diagonal is False and x == y:
                    continue

                d = df.iloc[x, y]
                d_abs = numpy.abs(d)
                # value mapped from [-1, 1] to [0, 1] for the colormap
                datum = (d + 1.) / 2.

                if method in ['ellipse', 'square', 'rectangle', 'color',
                              'circle']:
                    if method == 'ellipse':
                        # the ellipse gets thinner as |d| grows and leans
                        # one way or the other depending on the sign
                        rotate = -45 if d > 0 else +45
                        patch = Ellipse(
                            (x, y), width=1 * self.shrink,
                            height=(self.shrink - d_abs * self.shrink),
                            angle=rotate)
                    elif method == 'circle':
                        patch = Circle((x, y),
                                       radius=d_abs * self.shrink / 2.)
                    else:
                        if method == 'color':
                            # the whole cell is filled
                            w = h = 1
                            offset = 0
                        else:
                            w = h = d_abs * self.shrink
                            offset = (1 - w) / 2.
                        patch = Rectangle((x + offset - .5, y + offset - .5),
                                          width=w, height=h, angle=0)
                    # NOTE(review): properties set on individual patches may
                    # be ignored by PatchCollection unless match_original
                    # is used - confirm.
                    if self.edgecolor:
                        patch.set_edgecolor(self.edgecolor)
                    colors.append(datum)
                    if d_abs > 0.05:
                        patch.set_linestyle('dotted')
                    patches.append(patch)
                elif method in ['number', 'text']:
                    # one of the two extreme colors depending on the sign
                    edgecolor = self.cm(-1.0) if d < 0 else self.cm(1.0)
                    d_str = "{:.2f}".format(d).replace(
                        "0.", ".").replace(".00", "")
                    # alpha must stay within [0, 1] even if |d| exceeds 1
                    # by a rounding error; the withdash argument was removed
                    # from matplotlib and is not given anymore
                    ax.text(x, y, d_str, color=edgecolor,
                            fontsize=self.fontsize,
                            horizontalalignment='center',
                            weight='bold',
                            alpha=min(1., max(0.5, d_abs)))
                elif method == 'pie':
                    S = 360 * d_abs
                    radius = 1 * self.shrink / 2.
                    filled = Wedge((x, y), radius, -90, S - 90)
                    rest = Wedge((x, y), radius, S - 90, 360 - 90)
                    colors.append(datum)
                    # 0.5 is the middle of the colormap
                    colors.append(0.5)
                    if self.edgecolor:
                        filled.set_edgecolor(self.edgecolor)
                        rest.set_edgecolor(self.edgecolor)
                    patches.append(filled)
                    patches.append(rest)
                else:
                    raise ValueError(
                        'Method for the symbols is not known. Use e.g, square, circle')

        if self.binarise_color:
            # only the two ends of the 0..1 color scale are used
            colors = [1. if color > 0.5 else 0. for color in colors]

        if patches:
            col1 = PatchCollection(
                patches, array=numpy.array(colors), cmap=self.cm)
            # pin the scale so that 0 / 0.5 / 1 always mean -1 / 0 / +1,
            # whatever the range of the values actually drawn
            col1.set_clim(0, 1)
            ax.add_collection(col1)
            self.collection = col1