from jyquickhelper import add_notebook_menu
# Call the helper imported from jyquickhelper; presumably it renders a
# navigation menu for the notebook -- TODO confirm.
add_notebook_menu()


import psutil

def find_process(name):
    """Find the first running process whose name contains ``name``.

    :param name: substring looked for in the process names
    :return: tuple ``(pinfo, proc)`` where ``pinfo`` is the dictionary
        holding the ``'pid'`` and ``'name'`` of the process and ``proc``
        is the matching process object, or ``(None, None)`` when no
        process matches
    """
    for proc in psutil.process_iter():
        try:
            pinfo = proc.as_dict(attrs=['pid', 'name'])
        except psutil.NoSuchProcess:
            # The process ended between enumeration and inspection.
            continue
        proc_name = pinfo["name"]
        # as_dict() stores None for an attribute it is not allowed to read
        # (access denied, zombie process); skip such entries instead of
        # raising TypeError on the substring test.
        if proc_name is not None and name in proc_name:
            return pinfo, proc
    return None, None


# Search the running processes for one whose name contains 'ipcluster';
# both values are None when nothing matches.
i, p = find_process('ipcluster')
i, p

(None, None)


import os
import sys
# Use sys.real_prefix when the interpreter defines it, otherwise
# sys.base_exec_prefix.
exe = sys.real_prefix if hasattr(sys, 'real_prefix') else sys.base_exec_prefix
# "Scripts" sub-folder of that location.
f = os.path.join(exe, "Scripts")
f

'c:\\python35_x64\\Scripts'


# Names found in folder f that contain the substring "cluster".
[ _ for _ in os.listdir(f) if "cluster" in _ ]

['ipcluster.exe']


# ipcluster start -n 2


from pyquickhelper.helpgen import NbImage
# Build an NbImage from the local file "cmdipc.png"; presumably this
# displays the picture in the notebook -- TODO confirm.
NbImage("cmdipc.png")


if find_process("ipcluster")[0] is not None:
    # A process whose name contains "ipcluster" was found: only report it.
    print("déjà démarré", find_process("ipcluster"))
else:
    print("start ipcluster")
    from pyquickhelper.loghelper import run_cmd
    # On Windows the executable is addressed through folder f, elsewhere the
    # bare command name is used.
    on_windows = sys.platform.startswith("win")
    cmd = "{0} start -n 2".format(
        os.path.join(f, "ipcluster") if on_windows else "ipcluster")
    # The command is submitted with wait=False.
    run_cmd(cmd, wait=False)

start ipcluster


import time
# Pause for five seconds; presumably this gives the cluster started with
# wait=False enough time to come up before a client connects.
time.sleep(5)


from ipyparallel import Client
# Create the ipyparallel client with its default arguments.
clients = Client()
clients.block = True  # use synchronous computations
print(clients.ids)    # two clients are expected

[0, 1]


from IPython.core.display import Image
# Build an Image from the remote URL with width=300.
Image("http://ipython.org/ipython-doc/stable/_images/wideView.png", width=300)


# Run the lambda through apply_sync on the view clients[:]; the recorded
# output holds one "Hello, World" per engine.
clients[:].apply_sync(lambda : "Hello, World")

['Hello, World', 'Hello, World']


def addition(m):
    """Return ``m`` incremented by one."""
    incremented = m + 1
    return incremented


# Apply addition to every integer of range(32) through map_sync on the
# view clients[:].
parallel_result = clients[:].map_sync(addition, range(32))


# First five results.
parallel_result[:5]

[1, 2, 3, 4, 5]


def inverse_matrice(m):
    """Return the inverse of matrix ``m`` computed by ``numpy.linalg.inv``.

    ``numpy`` is resolved as a global name when the function is called.
    """
    invert = numpy.linalg.inv
    return invert(m)


import numpy
# Ten 5x5 matrices filled by numpy.random.random.
ms = [ numpy.random.random ( (5,5) )  for i in range(0,10) ]


# mat = clients[:].map_sync(inverse_matrice, ms)


# Import numpy inside the sync_imports() context of the view clients[:];
# the recorded output reports the import being done on the engine(s).
with clients[:].sync_imports():
    import numpy

importing numpy on engine(s)


# Invert every matrix of ms through map_sync on the view clients[:].
mat = clients[:].map_sync(inverse_matrice, ms)


# First two inverted matrices.
mat[:2]

[array([[  0.44355592,  -1.51704114,   0.78806645,   1.46110877,
          -0.91563883],
        [  2.75547106,  -7.29685701,   6.23665899,   7.79916234,
         -15.77432008],
        [ -2.86686696,   4.24283253,  -2.76788208,  -4.2635678 ,
          10.16315185],
        [  3.66888355,  -4.75685452,   4.39946704,   4.01960793,
         -12.05135127],
        [ -7.28746322,  16.05911264, -13.53209716, -14.15627803,
          33.21478412]]),
 array([[  6.25664148,  15.66280127,  12.06488389, -13.3586936 ,
         -16.76660498],
        [  0.35637738,  -4.10233268,  -4.27594773,   2.7287904 ,
           4.91983962],
        [-15.0833757 , -30.39031239, -18.9426754 ,  28.14733333,
          30.08489302],
        [  0.2343882 ,  -0.76741284,  -3.21725047,   1.67819449,
           2.19797219],
        [  6.31315111,  13.48306181,   9.00514844, -13.46040096,
         -12.49950715]])]


# Load-balanced view obtained from the client.
view = clients.load_balanced_view()


# Map inverse_matrice over ms through the load-balanced view and count the
# results.
results = view.map(inverse_matrice, ms)
len(results)

10


# Map inverse_matrice over ms through the load-balanced view and count the
# results.
results = view.map(inverse_matrice, ms)
len(results)

10


# Time the inversion of ms with the built-in map (no cluster involved).
%timeit list(map(inverse_matrice, ms))

The slowest run took 33.95 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 296 µs per loop


# Time the same mapping submitted through the load-balanced view.
%timeit view.map(inverse_matrice, ms)

1 loop, best of 3: 294 ms per loop


def average_random_matrix(i, nb_draws=100000):
    """Average ``nb_draws`` 10x10 matrices drawn with ``numpy.random.random``.

    :param i: unused; only present so that the function can be mapped over
        a sequence
    :param nb_draws: number of random matrices to average (defaults to
        100000)
    :return: 10x10 ``numpy`` array, element-wise mean of the draws
    :raises ValueError: if ``nb_draws`` is not positive
    """
    if nb_draws <= 0:
        raise ValueError("nb_draws must be a positive integer")
    total = numpy.zeros((10, 10))
    for _ in range(nb_draws):
        total += numpy.random.random((10, 10))
    # Divide by the number of draws, not by the last loop index (which is
    # one less than that number).
    return total / nb_draws


# Time ten calls of average_random_matrix with the built-in map.
%timeit list(map(average_random_matrix, range(0,10)))

1 loop, best of 3: 5.16 s per loop


# Time the same ten calls submitted through the load-balanced view.
%timeit view.map(average_random_matrix, range(0,10))

1 loop, best of 3: 3.23 s per loop


import numpy
# 100000 matrices of shape 5x5 filled by numpy.random.random.
ms = [ numpy.random.random ( (5,5) )  for i in range(0,100000) ]

def inverse_matrice(m):
    """Invert the matrix ``m`` with ``numpy.linalg.inv`` and return it."""
    inverted = numpy.linalg.inv(m)
    return inverted

from ensae_teaching_cs.td_2a import ParallelThread
# Time ParallelThread.parallel on the same inversions; every matrix is
# wrapped in a one-element tuple.
# NOTE(review): 4 is presumably the number of threads -- confirm against
# ParallelThread.parallel.
%timeit res = ParallelThread.parallel( inverse_matrice, [ (m,) for m in ms ], 4, delay_sec = 0.1 )

1 loop, best of 3: 3.12 s per loop


# Time the inversion of the same list with the built-in map.
%timeit list(map(inverse_matrice, ms))

1 loop, best of 3: 2.95 s per loop


import pyensae.datasource
# Call download_data for "td8_velib.zip"; the recorded output lists two
# text files.
pyensae.datasource.download_data("td8_velib.zip")

['stations.txt', 'td8_velib.txt']


import pandas
# Read the tab-separated file "td8_velib.txt" with pandas.read_csv.
df = pandas.read_csv("td8_velib.txt", sep="\t")


# Show the shape attribute of df.
df.shape

(1103787, 7)


# Show the column labels of df.
df.columns

Index(['collect_date', 'last_update', 'available_bike_stands',
       'available_bikes', 'number', 'heure', 'minute'],
      dtype='object')


# Sort df by the "collect_date" column, then call head() on the result.
df = df.sort_values("collect_date")
df.head()


import os
import sys
try:
    # Preferred location when the interpreter defines sys.real_prefix.
    exe = sys.real_prefix
except AttributeError:
    exe = sys.base_exec_prefix
# "Scripts" sub-folder of that location.
f = os.path.join(exe, "Scripts")
f

'c:\\python35_x64\\Scripts'


if find_process("ipcluster")[0] is not None:
    print("stop ipcluster")
    from pyquickhelper.loghelper import run_cmd
    # On Windows the executable is addressed through folder f, elsewhere the
    # bare command name is used.
    if sys.platform.startswith("win"):
        cmd = os.path.join(f, "ipcluster")
    else:
        cmd = "ipcluster"
    cmd += " stop"
    out, err = run_cmd(cmd, wait=True)
    # Mask the login name in the displayed output. USERNAME is typically a
    # Windows variable, so fall back on USER; when neither is set the output
    # is printed unchanged (replacing an empty string would corrupt it).
    user = os.environ.get("USERNAME") or os.environ.get("USER")
    if user:
        out = out.replace(user, "USERNAME")
    print(out)
else:
    print("aucun processus ipcluster trouvé")

stop ipcluster

2A.i - Jupyter et calcul distribué¶

Une fonction utile pour récupérer les processus qui tournent¶

Calcul distribué¶

Exemple simple¶

Exemple plus subtil¶

Autre écriture et mesure de temps¶

Autres options pour distribuer les calculs¶

Un peu plus sur les threads¶

Exercice 1 : Distribuer un calcul¶

Fin : on arrête le cluster¶

	collect_date	last_update	available_bike_stands	available_bikes	number	heure	minute
40729	2013-09-10 11:27:43.390054	2013-09-10 11:25:19	7	15	22005	11	25
40467	2013-09-10 11:27:43.390054	2013-09-10 11:25:19	19	0	13037	11	25
40525	2013-09-10 11:27:43.390054	2013-09-10 11:25:19	20	2	9021	11	25
40351	2013-09-10 11:27:43.390054	2013-09-10 11:25:19	16	2	19038	11	25
40070	2013-09-10 11:27:43.390054	2013-09-10 11:20:19	19	0	13037	11	20