Correction.
from jyquickhelper import add_notebook_menu
add_notebook_menu()
import pyensae
from pyquickhelper.ipythonhelper import open_html_form
params={"server":"df...fr", "username":"", "password":""}
open_html_form(params=params,title="server + credentials", key_save="params")
import pyensae
%load_ext pyensae
%load_ext pyenbc
password = params["password"]
server = params["server"]
username = params["username"]
client = %remote_open
client
<pyensae.remote.ssh_remote_connection.ASSHClient at 0x9e20910>
On ajoute deux paramètres pour construire l'histogramme entre deux valeurs a
,b
. Ajouter ces deux paramètres au nom du fichier de sortie peut paraître raisonnable mais l'interpréteur a du mal à identifier les paramètres Undefined parameter : bins_
. On utilise des tirets.
%%PIG histogramab.pig
values = LOAD 'random/random.sample.txt' USING PigStorage('\t') AS (x:double);
values_f = FILTER values BY x >= $a AND x <= $b ; -- ligne ajoutée
values_h = FOREACH values_f GENERATE x, ((int)(x / $bins)) * $bins AS h ;
hist_group = GROUP values_h BY h ;
hist = FOREACH hist_group GENERATE group, COUNT(values_h) AS nb ;
STORE hist INTO 'random/histo_$bins-$a-$b.txt' USING PigStorage('\t') ;
if client.dfs_exists("random/histo_0.1-0.2-0.8.txt"):
client.dfs_rm("random/histo_0.1-0.2-0.8.txt", recursive=True)
client.pig_submit("histogramab.pig", redirection="redirection",
params =dict(bins="0.1", a="0.2", b="0.8") )
('', '')
%remote_cmd tail redirection.err
Spillable Memory Manager spill count : 0 Total bags proactively spilled: 0 Total records proactively spilled: 0 Job DAG: job_1414491244634_0189 2014-12-03 22:55:47,929 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server 2014-12-03 22:55:48,031 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!
%dfs_ls random
attributes | code | alias | folder | size | date | time | name | isdir | |
---|---|---|---|---|---|---|---|---|---|
0 | drwxr-xr-x | - | xavierdupre | xavierdupre | 0 | 2014-12-03 | 22:55 | random/histo_0.1-0.2-0.8.txt | True |
1 | drwxr-xr-x | - | xavierdupre | xavierdupre | 0 | 2014-11-28 | 00:11 | random/histo_0.1.txt | True |
2 | -rw-r--r-- | 3 | xavierdupre | xavierdupre | 202586 | 2014-11-27 | 23:38 | random/random.sample.txt | False |
if os.path.exists("histo.txt") : os.remove("histo.txt")
client.download_cluster("random/histo_0.1-0.2-0.8.txt","histo.txt", merge=True)
'random/histo_0.1-0.2-0.8.txt'
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas
df = pandas.read_csv("histo.txt", sep="\t",names=["bin","nb"])
df.plot(x="bin",y="nb",kind="bar")
<matplotlib.axes._subplots.AxesSubplot at 0xa0c5c70>