%matplotlib inline
import matplotlib.pyplot as plt


from jyquickhelper import add_notebook_menu
# Display the notebook menu provided by jyquickhelper
# (presumably a table of contents built from the section titles - TODO confirm).
add_notebook_menu()


import pandas
from ensae_teaching_cs.data import donnees_enquete_2003_television
# Read the data set returned by the helper as tab-separated text, using the
# pure-Python parser engine, then preview the first rows.
df = pandas.read_csv(donnees_enquete_2003_television(), sep="\t", engine="python")
df.head()


# Keep only the columns whose name does not contain "Unnamed"
# (the preview of the raw file shows many such columns filled with NaN).
df = df.loc[:, [name for name in df.columns if "Unnamed" not in name]]
df.head()


# Keep the rows where cLT2FREQ is filled in
# (same selection as  df[~df.cLT2FREQ.isnull()]).
notnull = df[df.cLT2FREQ.notnull()]
# Row counts before and after the filter.
print(df.shape[0], notnull.shape[0])
notnull.tail()

8403 7386


# Write the filtered rows to an Excel workbook in the current directory.
notnull.to_excel("data.xlsx")  # question 4


# Hand the workbook's file name to the shell through IPython's %system magic.
# NOTE(review): presumably relies on the OS opening the file with its associated
# application (Windows behaviour) - confirm on other platforms.
%system "data.xlsx"

[]


from IPython.display import Image
# Display the picture stored next to the notebook
# (presumably a screenshot of the exported workbook - TODO confirm).
Image("td10exc.png")


def delta(x, y):
    """Return the larger of ``x`` and ``y`` minus the smaller one.

    :param x: first value
    :param y: second value
    :return: ``max(x, y) - min(x, y)``
    """
    largest = max(x, y)
    smallest = min(x, y)
    return largest - smallest


# Same computation as the ``delta`` function, written as an anonymous
# function bound to a name (the lambda form is the point of the exercise).
delta = lambda x, y: max(x, y) - min(x, y)


# Quick check of delta: max(4, 5) - min(4, 5) evaluates to 1.
delta(4,5)

1


import random

# Tag every row with an integer drawn uniformly from 1..10 (both bounds
# included); the row itself is not used by the draw.
df["select"] = df.apply(lambda _row: random.randint(1, 10), axis=1)
# Rows tagged 1 form the sample, i.e. roughly one row out of ten.
echantillon = df[df["select"] == 1]
echantillon.shape, df.shape

((851, 5), (8403, 5))


from ensae_teaching_cs.data import marathon
import pandas
# Tab-separated data read with explicitly supplied column names
# (ville, annee, temps, secondes), then a preview of the first rows.
df = pandas.read_csv(marathon(), sep="\t", names=["ville", "annee", "temps","secondes"])
df.head()


# Step 1: mean of "secondes" for each "ville".
# By default groupby turns the grouping key into the index of the result;
# as_index=False keeps "ville" as a regular column instead.
gr = df.loc[:, ["ville", "secondes"]].groupby(by="ville", as_index=False).mean()
gr.head()


# Step 2: attach the per-city mean to every row by joining on "ville".
# Both frames have a "secondes" column, so the result carries
# "secondes_x" (from df) and "secondes_y" (from gr).
tout = pandas.merge(df, gr, on="ville")
tout.head()


# Step 3: reshape to one row per "annee" and one column per "ville".
# "secondes_x" is the "secondes" column of the left-hand frame of the merge.
# The arguments are passed by keyword: since pandas 2.0 every parameter of
# DataFrame.pivot is keyword-only, so the positional form raises TypeError.
piv = tout.pivot(index="annee", columns="ville", values="secondes_x")
piv.tail()


# Label the per-city means with the pseudo-year "moyenne" so that they pivot
# into a single row laid out like the year-by-city table.
gr["annee"] = "moyenne"
# Keyword arguments are required by pandas >= 2.0 (positional form removed).
pivmean = gr.pivot(index="annee", columns="ville", values="secondes")
pivmean


# Year-by-city table built straight from df, with the "moyenne" row appended
# at the bottom. Keyword arguments are required by pandas >= 2.0, where
# DataFrame.pivot no longer accepts positional arguments.
piv = df.pivot(index="annee", columns="ville", values="secondes")
pandas.concat([piv, pivmean]).tail()


import pandas
import urllib.request
from ensae_teaching_cs.data import marathon

# Whole exercise in a single cell: load the data, build the year-by-city
# table, compute the per-city mean and append it as a final "moyenne" row.
df = pandas.read_csv(marathon(filename=True),
                     sep="\t", names=["ville", "annee", "temps", "secondes"])
# DataFrame.pivot only accepts keyword arguments since pandas 2.0; the
# positional form used previously raises TypeError there.
piv = df.pivot(index="annee", columns="ville", values="secondes")
# as_index=False keeps "ville" as a column, which the second pivot needs.
gr = df[["ville", "secondes"]].groupby("ville", as_index=False).mean()
gr["annee"] = "moyenne"
pivmean = gr.pivot(index="annee", columns="ville", values="secondes")

pandas.concat([piv, pivmean]).tail()


import urllib.request
import zipfile
import http.client

def download_and_save(name, root_url):
    """Download ``root_url + name`` and save it as ``name`` in the current directory.

    When the first request fails with a timeout, a URL error or a bad HTTP
    status line, the same file name is requested once from a backup location;
    an error raised by that second attempt propagates to the caller.

    :param name: file name, appended to the root URL and reused as the
        local output path
    :param root_url: URL prefix of the primary download location
    """
    # Function-scope imports keep the cell runnable without touching the
    # notebook's import cell.
    import shutil
    from urllib.error import URLError

    try:
        response = urllib.request.urlopen(root_url + name)
    except (TimeoutError, URLError, http.client.BadStatusLine):
        # back up plan
        backup_url = "http://www.xavierdupre.fr/enseignement/complements/"
        response = urllib.request.urlopen(backup_url + name)
    # Entering the response as a context manager guarantees it is closed, and
    # copying in chunks avoids holding the whole archive in memory.
    with response, open(name, "wb") as outfile:
        shutil.copyfileobj(response, outfile)

def unzip(name):
    """Extract every member of the zip archive ``name`` into the current directory.

    :param name: path of the zip archive to read
    """
    with zipfile.ZipFile(name, mode="r") as archive:
        archive.extractall(path=".")

# Archives to fetch from the INSEE download site (presumably the 2012
# marriage, birth and death files, judging by their names - TODO confirm).
filenames = [
    "etatcivil2012_mar2012_dbase.zip",
    "etatcivil2012_nais2012_dbase.zip",
    "etatcivil2012_dec2012_dbase.zip",
]
root_url = 'http://telechargement.insee.fr/fichiersdetail/etatcivil2012/dbase/'

# Download each archive, extract it in place, then report progress.
for filename in filenames:
    download_and_save(filename, root_url)
    unzip(filename)
    print("Download of {}: DONE!".format(filename))

Download of etatcivil2012_mar2012_dbase.zip: DONE!
Download of etatcivil2012_nais2012_dbase.zip: DONE!
Download of etatcivil2012_dec2012_dbase.zip: DONE!


import pandas
# Two ways of loading the data: read the dBase file directly when the DBF
# reader can be imported, otherwise fall back to a zipped text version.
# NOTE(review): the module name `dbfread_` ends with an underscore; the
# recorded run printed "use of zipped version", so this import presumably
# failed - confirm whether the name is deliberate (to force the fallback)
# or a typo.
try:
    from dbfread_ import DBF
    use_dbfread = True
except ImportError as e :
    use_dbfread = False
    
if use_dbfread:
    print("use of dbfread")
    def dBase2df(dbase_filename):
        # Load the records of the dBase file, decoded as cp437, into a DataFrame.
        table = DBF(dbase_filename, load=True, encoding="cp437")
        return pandas.DataFrame(table.records)

    df = dBase2df('mar2012.dbf')
else :
    print("use of zipped version")
    import pyensae.datasource
    # The first item returned by download_data is read as a tab-separated,
    # UTF-8 encoded file.
    data = pyensae.datasource.download_data("mar2012.zip")
    df = pandas.read_csv(data[0], sep="\t", encoding="utf8", low_memory=False)  
    
print(df.shape, df.columns)
df.head()

use of zipped version
(246123, 16) Index(['AMAR', 'ANAISF', 'ANAISH', 'DEPDOM', 'DEPMAR', 'DEPNAISF', 'DEPNAISH',
       'ETAMATF', 'ETAMATH', 'INDNATF', 'INDNATH', 'JSEMAINE', 'MMAR',
       'NBENFCOM', 'TUCOM', 'TUDOM'],
      dtype='object')


# Age columns derived from ANAISH / ANAISF (presumably the husband's and
# wife's birth years - TODO confirm against the data dictionary).
# The cast and the subtraction are done on whole columns: a row-wise
# df.apply builds one Series per row, which is needlessly slow on a table
# of this size.
# NOTE(review): the reference year 2014 is hard-coded while the displayed
# rows show AMAR == 2012 - confirm which year the ages should refer to.
df["ageH"] = 2014 - df["ANAISH"].astype("int64")
df["ageF"] = 2014 - df["ANAISF"].astype("int64")
df.head()


# Scatter plot of ageF against ageH, one point per row.
df.plot.scatter(x="ageH", y="ageF")

<matplotlib.axes._subplots.AxesSubplot at 0x2ee646da358>


# Same pair of columns as a hexagonal-bin plot, where the colour of a cell
# reflects how many rows fall into it.
df.plot.hexbin(x="ageH", y="ageF")

<matplotlib.axes._subplots.AxesSubplot at 0x2ee6591b780>


# Cast both columns to integers in one vectorised step; a row-wise df.apply
# calling int() builds one Series per row and is needlessly slow here.
df["ANAISH"] = df["ANAISH"].astype("int64")
df["ANAISF"] = df["ANAISF"].astype("int64")
# Difference ANAISH - ANAISF for every row.
df["differenceHF"] = df.ANAISH - df.ANAISF
# Constant column used as a counter when grouping.
df["nb"] = 1
# Number of rows for each distinct difference.
dist = df[["nb", "differenceHF"]].groupby("differenceHF", as_index=False).count()
# Histogram of the differences, drawn by pandas.
df["differenceHF"].hist(figsize=(16, 6), bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x2ee656d7c88>


# Count the rows for each value of JSEMAINE using a constant counter column.
df["nb"] = 1
dissem = df[["JSEMAINE", "nb"]].groupby("JSEMAINE", as_index=False).sum()
total = dissem["nb"].sum()
# Cumulative sums of every column; only "nb" is used afterwards, rescaled
# into a cumulative share of the total.
repsem = dissem.cumsum()
repsem["nb"] = repsem["nb"] / total

# Bars for the counts, with the cumulative share drawn on a secondary axis.
ax = dissem["nb"].plot.bar()
repsem["nb"].plot(ax=ax, secondary_y=True)
ax.set_title("distribution des mariages par jour de la semaine")

<matplotlib.text.Text at 0x2ee65958860>

	POIDLOG	POIDSF	cLT1FREQ	cLT2FREQ	Unnamed: 4	Unnamed: 5	Unnamed: 6	Unnamed: 7	Unnamed: 8	Unnamed: 9	...	Unnamed: 22	Unnamed: 23	Unnamed: 24	Unnamed: 25	Unnamed: 26	Unnamed: 27	Unnamed: 28	Unnamed: 29	Unnamed: 30	Unnamed: 31
0	0.889422	4766.865201	2	1.0	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	2.310209	12381.589746	30	1.0	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	2.740070	14685.431344	6	2.0	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	1.775545	9516.049939	1	1.0	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	0.732512	3925.907588	3	1.0	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	POIDLOG	POIDSF	cLT1FREQ	cLT2FREQ
0	0.889422	4766.865201	2	1.0
1	2.310209	12381.589746	30	1.0
2	2.740070	14685.431344	6	2.0
3	1.775545	9516.049939	1	1.0
4	0.732512	3925.907588	3	1.0

	POIDLOG	POIDSF	cLT1FREQ	cLT2FREQ
8397	0.502091	2690.961176	3	1.0
8398	0.306852	1644.574141	6	1.0
8399	2.501181	13405.104689	6	1.0
8400	1.382758	7410.905653	1	1.0
8401	0.343340	1840.132652	3	1.0

	ville	annee	temps	secondes
0	PARIS	2011	02:06:29	7589
1	PARIS	2010	02:06:41	7601
2	PARIS	2009	02:05:47	7547
3	PARIS	2008	02:06:40	7600
4	PARIS	2007	02:07:17	7637

	ville	secondes
0	AMSTERDAM	7883.371429
1	BERLIN	7922.315789
2	BOSTON	7891.061224
3	CHICAGO	7815.909091
4	FUKUOKA	8075.187500

2A.data - DataFrame et Graphes - correction

Exercice 1 : créer un fichier Excel

Questions

Exercice 2 : fonction lambda

Exercice 3 : moyennes par groupes

Exercice 4 : écart entre les mariés

Exercice 5 : graphe de la distribution avec pandas

Exercice 6 : distribution des mariages par jour

ville	AMSTERDAM	BERLIN	BOSTON	CHICAGO	FUKUOKA	LONDON	NEW YORK	PARIS	STOCKOLM
annee
2007	7589.0	7466.0	8053.0	7871.0	7599.0	7661.0	7744.0	7637.0	8456.0
2008	7672.0	7439.0	7665.0	7585.0	7570.0	7515.0	7723.0	7600.0	8163.0
2009	7578.0	7568.0	7722.0	7541.0	7518.0	7510.0	7755.0	7547.0	8134.0
2010	7544.0	7508.0	7552.0	7583.0	7704.0	7519.0	7694.0	7601.0	7968.0
2011	NaN	7418.0	7382.0	NaN	NaN	7480.0	NaN	7589.0	8047.0

ville	AMSTERDAM	BERLIN	BOSTON	CHICAGO	FUKUOKA	LONDON	NEW YORK	PARIS	STOCKOLM
annee
moyenne	7883.371429	7922.315789	7891.061224	7815.909091	8075.1875	7695.16129	7928.560976	7937.028571	8133.393939

ville	AMSTERDAM	BERLIN	BOSTON	CHICAGO	FUKUOKA	LONDON	NEW YORK	PARIS	STOCKOLM
annee
2008	7672.000000	7439.000000	7665.000000	7585.000000	7570.0000	7515.00000	7723.000000	7600.000000	8163.000000
2009	7578.000000	7568.000000	7722.000000	7541.000000	7518.0000	7510.00000	7755.000000	7547.000000	8134.000000
2010	7544.000000	7508.000000	7552.000000	7583.000000	7704.0000	7519.00000	7694.000000	7601.000000	7968.000000
2011	NaN	7418.000000	7382.000000	NaN	NaN	7480.00000	NaN	7589.000000	8047.000000
moyenne	7883.371429	7922.315789	7891.061224	7815.909091	8075.1875	7695.16129	7928.560976	7937.028571	8133.393939

	AMAR	ANAISF	ANAISH	DEPDOM	DEPMAR	DEPNAISF	DEPNAISH	ETAMATF	ETAMATH	INDNATF	INDNATH	JSEMAINE	MMAR	NBENFCOM	TUCOM	TUDOM
0	2012	1984	1982	99	29	99	75	1	1	2	1	1	1	N	NaN	9
1	2012	1969	1956	99	75	99	69	4	4	2	2	3	1	N	NaN	9
2	2012	1992	1982	99	34	99	99	1	1	1	2	5	1	N	NaN	9
3	2012	1987	1985	99	13	84	99	1	1	1	2	4	1	N	NaN	9
4	2012	1963	1968	99	26	99	99	1	1	2	2	6	1	N	NaN	9