.. _bigdatashaderrst:

==========
datashader
==========

.. only:: html

    **Links:** :download:`notebook `, :downloadlink:`html `, :download:`PDF `, :download:`python `, :downloadlink:`slides `, :githublink:`GitHub|_doc/notebooks/2016/pydata/big_datashader.ipynb|*`

`datashader `__ plots huge volumes of data.
`documentation `__ `source `__ `tutorial `__

.. code:: ipython3

    from jyquickhelper import add_notebook_menu
    add_notebook_menu()

.. contents::
    :local:

.. code:: ipython3

    import bokeh.plotting as bp
    bp.output_notebook()

.. raw:: html
Loading BokehJS ...
.. code:: ipython3

    import datashader
    datashader.__version__

.. parsed-literal::

    '0.6.4dev1'

The version should be higher than ``0.6.4``.

short example
-------------

From `4_Trajectories.ipynb `__.

.. code:: ipython3

    import pandas as pd
    import numpy as np
    import xarray as xr
    # On Windows, you must run the notebook with admin rights,
    # otherwise the following instruction never completes.
    import datashader
    import datashader as ds
    import datashader.transfer_functions as tf

.. code:: ipython3

    # Constants
    np.random.seed(1)
    n = 1000000 # Number of points
    f = filter_width = 5000 # momentum or smoothing parameter, for a moving average filter

    # filtered random walk
    # (np.convolve in its default 'full' mode returns n+f-1 samples,
    # which is why the arrays below are sized n-1+f)
    xs = np.convolve(np.random.normal(0, 0.1, size=n), np.ones(f)/f).cumsum()
    ys = np.convolve(np.random.normal(0, 0.1, size=n), np.ones(f)/f).cumsum()

    # Add "mechanical" wobble on the x axis
    xs += 0.1*np.sin(0.1*np.array(range(n-1+f)))

    # Add "measurement" noise
    xs += np.random.normal(0, 0.005, size=n-1+f)
    ys += np.random.normal(0, 0.005, size=n-1+f)

    # Add a completely incorrect value (a single outlier in the middle of the series)
    xs[int(len(xs)/2)] = 100
    ys[int(len(xs)/2)] = 0

    # Create a dataframe
    df = pd.DataFrame(dict(x=xs,y=ys))

    # Default plot ranges:
    x_range = (xs.min(), xs.max())
    y_range = (ys.min(), ys.max())

    df.tail()

.. raw:: html
x y
1004994 65.164829 -105.064056
1004995 65.177603 -105.069781
1004996 65.190898 -105.071699
1004997 65.194054 -105.054657
1004998 65.204752 -105.073366
.. code:: ipython3

    def create_image(x_range=x_range, y_range=y_range, w=500, h=500):
        """Rasterize the line joining the points of ``df`` and shade it.

        The default *x_range* and *y_range* are the module-level ranges as
        they are bound when this cell runs; *w* and *h* are passed to the
        canvas as ``plot_width`` and ``plot_height``.
        Returns the result of ``tf.shade``.
        """
        cvs = ds.Canvas(x_range=x_range, y_range=y_range, plot_height=h, plot_width=w)
        agg = cvs.line(df, 'x', 'y', agg=ds.any())
        return tf.shade(agg)

.. code:: ipython3

    %time create_image()

.. parsed-literal::

    Wall time: 1.1 s

.. image:: big_datashader_10_1.png

.. code:: ipython3

    from datashader.bokeh_ext import InteractiveImage
    import bokeh.plotting as bp

    def base_plot(tools='pan,wheel_zoom,reset'):
        """Return an empty 500x500 bokeh figure without borders or grid lines.

        The figure uses the module-level ``x_range`` and ``y_range``.
        """
        p = bp.figure(tools=tools, plot_width=500, plot_height=500,
                      x_range=x_range, y_range=y_range, outline_line_color=None,
                      min_border=0, min_border_left=0, min_border_right=0,
                      min_border_top=0, min_border_bottom=0)
        p.xgrid.grid_line_color = None
        p.ygrid.grid_line_color = None
        return p

    p = base_plot()
    # create_image is handed over as a callback taking the ranges and pixel size
    InteractiveImage(p, create_image)

.. raw:: html
NYC taxi
--------

`NYC taxi `__

without datashader
~~~~~~~~~~~~~~~~~~

.. code:: ipython3

    import pandas as pd
    import os
    if os.path.exists('green_tripdata_2015-12.csv'):
        # Full file available: keep the coordinates and the passenger count,
        # drop rows whose longitudes are not below -10, then save a
        # 100000-row sample for later runs.
        df = pd.read_csv('green_tripdata_2015-12.csv', usecols=['Pickup_longitude', 'Pickup_latitude', 'Dropoff_longitude', 'Dropoff_latitude', 'Passenger_count'])
        df = df [(df.Dropoff_longitude < -10) & (df.Pickup_longitude < -10)]
        # to_csv also writes the index, hence the 'Unnamed: 0' column
        # of the reloaded sample
        df.sample(100000).to_csv("green_tripdata_2015-12_sample.csv")
    else:
        # Otherwise fall back on the previously saved sample
        df = pd.read_csv("green_tripdata_2015-12_sample.csv")
    df.tail()

.. raw:: html
Unnamed: 0 Pickup_longitude Pickup_latitude Dropoff_longitude Dropoff_latitude Passenger_count
99995 1505945 -73.916512 40.777092 -74.008743 40.704510 2
99996 968290 -73.830383 40.759563 -73.820259 40.751740 1
99997 1274687 -73.899574 40.746056 -73.899651 40.746105 2
99998 1023243 -73.948578 40.789158 -73.957741 40.776196 1
99999 261195 -73.943939 40.711861 -73.994743 40.684658 1
.. code:: ipython3

    # Random subset of 1000 rows
    samples = df.sample(n=1000)
    samples.head()

.. raw:: html
Unnamed: 0 Pickup_longitude Pickup_latitude Dropoff_longitude Dropoff_latitude Passenger_count
21828 824149 -73.937645 40.679783 -73.973488 40.680565 1
43531 622111 -73.899452 40.743587 -73.882927 40.741615 5
63837 291124 -73.922501 40.708939 -73.956100 40.688629 1
8116 1516644 -73.995628 40.686577 -73.986610 40.680191 1
73979 1283648 -73.840294 40.695374 -73.824905 40.706371 1
.. code:: ipython3

    # Summary statistics of the sampled rows
    samples.describe()

.. raw:: html
Unnamed: 0 Pickup_longitude Pickup_latitude Dropoff_longitude Dropoff_latitude Passenger_count
count 1.000000e+03 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 8.017301e+05 -73.934097 40.745278 -73.931417 40.741332 1.322000
std 4.551613e+05 0.044140 0.055953 0.051746 0.057068 0.946156
min 1.376000e+03 -74.026848 40.587414 -74.030685 40.578484 1.000000
25% 4.124792e+05 -73.960981 40.694023 -73.965458 40.692575 1.000000
50% 8.012435e+05 -73.945271 40.742716 -73.944492 40.742437 1.000000
75% 1.206654e+06 -73.914671 40.799161 -73.904083 40.781639 1.000000
max 1.608100e+06 -73.776367 40.887508 -73.722435 40.909649 6.000000
.. code:: ipython3

    from bokeh.plotting import figure, output_notebook, show

    # Plot ranges: bounding box of the sampled drop-off coordinates
    x_range=(samples.Dropoff_longitude.min(), samples.Dropoff_longitude.max())
    y_range=(samples.Dropoff_latitude.min(), samples.Dropoff_latitude.max())

    def base_plot(tools='pan,wheel_zoom,reset',plot_width=900, plot_height=600, **plot_args):
        """Return an empty bokeh figure with hidden axes, no borders and no grid.

        The figure uses the module-level ``x_range`` and ``y_range``;
        any extra keyword argument is forwarded to ``figure``.
        """
        p = figure(tools=tools, plot_width=plot_width, plot_height=plot_height,
            x_range=x_range, y_range=y_range, outline_line_color=None,
            min_border=0, min_border_left=0, min_border_right=0,
            min_border_top=0, min_border_bottom=0, **plot_args)
        p.axis.visible = False
        p.xgrid.grid_line_color = None
        p.ygrid.grid_line_color = None
        return p

.. code:: ipython3

    from bokeh.tile_providers import STAMEN_TERRAIN, get_provider

    p = base_plot()
    # Map tiles as a background layer
    tile_terrain = get_provider(STAMEN_TERRAIN)
    p.add_tile(tile_terrain)
    # Marker style shared by the scatter plots
    options = dict(line_color=None, fill_color='blue', size=5)
    p.circle(x=samples['Dropoff_longitude'], y=samples['Dropoff_latitude'], **options)
    show(p)

.. raw:: html
.. code:: ipython3

    # Scatter plot of 10000 sampled drop-offs, with the current ``options``
    samples = df.sample(n=10000)
    p = base_plot()
    p.circle(x=samples['Dropoff_longitude'], y=samples['Dropoff_latitude'], **options)
    show(p)

.. raw:: html
.. code:: ipython3

    # Smaller, mostly transparent markers for the 100000-row sample
    options = dict(line_color=None, fill_color='blue', size=1, alpha=0.1)
    samples = df.sample(n=100000)
    p = base_plot()
    p.circle(x=samples['Dropoff_longitude'], y=samples['Dropoff_latitude'], **options)
    show(p)

.. raw:: html
with datashader
~~~~~~~~~~~~~~~

See `nyc_taxi.ipynb `__. This part should be run with a bigger sample than the previous one.

.. code:: ipython3

    import pandas as pd
    import os
    if os.path.exists('green_tripdata_2015-12.csv'):
        # NOTE(review): these usecols names differ from the ones used when
        # the same file is read in the "without datashader" section
        # ('Pickup_longitude', ...) -- confirm which header the file has.
        df = pd.read_csv('green_tripdata_2015-12.csv', usecols=['pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'passenger_count', 'tpep_pickup_datetime'])
        # Filter on both longitudes, as the first loading cell does.
        # dropoff_y holds latitudes (about 40.7 in the table below), so
        # testing it against -10 would discard every row.
        df = df[(df.dropoff_x < -10) & (df.pickup_x < -10)]
        df.sample(100000).to_csv("green_tripdata_2015-12_sample.csv")
    else:
        df = pd.read_csv("green_tripdata_2015-12_sample.csv")
        # The first column of the saved sample is the index written by to_csv
        df.columns = ['?', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'passenger_count']
    df.tail()

.. raw:: html
? pickup_x pickup_y dropoff_x dropoff_y passenger_count
99995 1505945 -73.916512 40.777092 -74.008743 40.704510 2
99996 968290 -73.830383 40.759563 -73.820259 40.751740 1
99997 1274687 -73.899574 40.746056 -73.899651 40.746105 2
99998 1023243 -73.948578 40.789158 -73.957741 40.776196 1
99999 261195 -73.943939 40.711861 -73.994743 40.684658 1
.. code:: ipython3

    import datashader as ds
    from datashader import transfer_functions as tf
    from datashader.colors import Greys9
    # Reversed grey palette, without its last two entries
    Greys9_r = list(reversed(Greys9))[:-2]

.. code:: ipython3

    plot_width = int(750)
    plot_height = int(plot_width//1.2)

.. code:: ipython3

    # Count the rows falling into each pixel of the canvas
    cvs = ds.Canvas(plot_width=plot_width, plot_height=plot_height, x_range=x_range, y_range=y_range)
    agg = cvs.points(df, 'dropoff_x', 'dropoff_y', ds.count('passenger_count'))
    img = tf.shade(agg, cmap=["white", 'darkblue'], how='linear')

.. code:: ipython3

    img

.. image:: big_datashader_26_0.png

.. code:: ipython3

    import numpy as np

    def histogram(x,colors=None):
        """Show a 100-bin histogram of *x* and print its minimum and maximum.

        *colors* is accepted but not used by the body.
        """
        hist,edges = np.histogram(x, bins=100)
        p = figure(y_axis_label="Pixels",
                   tools='', height=130, outline_line_color=None,
                   min_border=0, min_border_left=0, min_border_right=0,
                   min_border_top=0, min_border_bottom=0)
        # The first bin is left out of the drawing
        p.quad(top=hist[1:], bottom=0, left=edges[1:-1], right=edges[2:])
        print("min: {}, max: {}".format(np.min(x),np.max(x)))
        show(p)

.. code:: ipython3

    histogram(agg.values)

.. parsed-literal::

    min: 0, max: 175

.. raw:: html
.. code:: ipython3

    # Same histogram on log1p(v) = log(1 + v), which keeps empty pixels at 0
    histogram(np.log1p(agg.values))
    tf.shade(agg, cmap=Greys9_r, how='log')

.. parsed-literal::

    min: 0.0, max: 5.170483995038151

.. raw:: html
.. image:: big_datashader_29_3.png

.. code:: ipython3

    # Chained assignment: rebinds x_range and y_range and keeps the pair as NYC.
    # NOTE(review): these bounds look like Web-Mercator metres whereas the
    # dropoff_x / dropoff_y values shown in the table above are degrees --
    # confirm the ranges match the data being plotted.
    NYC = x_range, y_range = ((-8242000,-8210000), (4965000,4990000))

.. code:: ipython3

    import datashader as ds
    from datashader.bokeh_ext import InteractiveImage
    from functools import partial
    from datashader.utils import export_image
    from datashader.colors import colormap_select, Greys9, Hot, viridis, inferno
    from IPython.core.display import HTML, display

    background = "black"
    # Shortcuts with the export folder and the background colour pre-filled
    export = partial(export_image, export_path="export", background=background)
    cm = partial(colormap_select, reverse=(background=="black"))

    def create_image(x_range, y_range, w=plot_width, h=plot_height):
        """Count drop-offs per pixel and shade the counts with the Hot colormap.

        Returns the shaded image after ``tf.dynspread``.
        """
        cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
        agg = cvs.points(df, 'dropoff_x', 'dropoff_y', ds.count('passenger_count'))
        img = tf.shade(agg, cmap=Hot, how='eq_hist')
        return tf.dynspread(img, threshold=0.5, max_px=4)

    p = base_plot(background_fill_color=background)
    export(create_image(*NYC),"NYCT_hot")
    InteractiveImage(p, create_image)

.. raw:: html
.. code:: ipython3

    import numpy as np
    from functools import partial

    def create_image90(x_range, y_range, w=plot_width, h=plot_height):
        """Count drop-offs per pixel and shade the counts with the inferno colormap.

        Returns the shaded image after ``tf.dynspread``.
        """
        cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
        agg = cvs.points(df, 'dropoff_x', 'dropoff_y', ds.count('passenger_count'))
        # The 90th-percentile filter, agg.where(agg>np.percentile(agg,90)),
        # is disabled: df is already a sample and it removes too many rows.
        img = tf.shade(agg, cmap=inferno, how='eq_hist')
        return tf.dynspread(img, threshold=0.3, max_px=4)

    p = base_plot()
    p.add_tile(tile_terrain)
    # Export the image built by this cell's function, not the Hot one
    export(create_image90(*NYC),"NYCT_90th")
    InteractiveImage(p, create_image90)

.. raw:: html
.. code:: ipython3

    def merged_images(x_range, y_range, w=plot_width, h=plot_height, how='log'):
        """Stack a red image of pickup counts and a blue image of drop-off counts.

        *how* is forwarded to both ``tf.shade`` calls.
        Returns the stacked image after ``tf.dynspread``.
        """
        cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
        picks = cvs.points(df, 'pickup_x', 'pickup_y', ds.count('passenger_count'))
        drops = cvs.points(df, 'dropoff_x', 'dropoff_y', ds.count('passenger_count'))
        # already a sample and the following filter removes too many rows,
        # you should use a bigger sample
        more_drops = tf.shade(drops # .where(drops > picks)
                              , cmap=["darkblue", 'cornflowerblue'], how=how)
        more_picks = tf.shade(picks # .where(picks > drops)
                              , cmap=["darkred", 'orangered'], how=how)
        img = tf.stack(more_picks,more_drops)
        return tf.dynspread(img, threshold=0.3, max_px=4)

    p = base_plot(background_fill_color=background)
    export(merged_images(*NYC),"NYCT_pickups_vs_dropoffs")
    InteractiveImage(p, merged_images)

.. raw:: html