1A.soft - Notions de SQL - correction#

Links: notebook, html, python, slides, GitHub

Correction des exercices du premier notebooks relié au SQL.

from jyquickhelper import add_notebook_menu
add_notebook_menu()

%matplotlib inline

Recupérer les données #

import os
if not os.path.exists("td8_velib.db3"):
    from pyensae.datasource import download_data
    download_data("td8_velib.zip", website = 'xd')
    from pyensae.sql import import_flatfile_into_database
    dbf = "td8_velib.db3"
    import_flatfile_into_database(dbf, "td8_velib.txt")                   # 2 secondes
    import_flatfile_into_database(dbf, "stations.txt", table="stations")  # 2 minutes

  TextFile: opening file  td8_velib.txt
  TextFile.guess_columns: processing file  td8_velib.txt
  TextFile: opening file  td8_velib.txt
  TextFile.guess_columns: using  101  lines
  TextFile: closing file  td8_velib.txt
  TextFile.guess_columns: sep  't' nb cols 7  bestnb  101  more  {('t', 6): 101, (' ', 2): 100}
  TextFile.guess_columns: header  True  columns  {0: ('collect_date', <class 'str'>), 1: ('last_update', <class 'str'>), 2: ('available_bike_stands', <class 'int'>), 3: ('available_bikes', <class 'int'>), 4: ('number', <class 'int'>), 5: ('heure', <class 'int'>), 6: ('minute', <class 'int'>)}
  compiling ^(?P<collect_date>.*)t(?P<last_update>.*)t(?P<available_bike_stands>([-]?[1-9][0-9]*?)|(0?))t(?P<available_bikes>([-]?[1-9][0-9]*?)|(0?))t(?P<number>([-]?[1-9][0-9]*?)|(0?))t(?P<heure>([-]?[1-9][0-9]*?)|(0?))t(?P<minute>([-]?[1-9][0-9]*?)|(0?))$
  TextFile.guess_columns: regex  ^(?P<collect_date>.*)t(?P<last_update>.*)t(?P<available_bike_stands>([-]?[1-9][0-9]*?)|(0?))t(?P<available_bikes>([-]?[1-9][0-9]*?)|(0?))t(?P<number>([-]?[1-9][0-9]*?)|(0?))t(?P<heure>([-]?[1-9][0-9]*?)|(0?))t(?P<minute>([-]?[1-9][0-9]*?)|(0?))$
  TextFile.guess_columns: header  True  columns  {0: ('collect_date', (<class 'str'>, 52)), 1: ('last_update', (<class 'str'>, 38)), 2: ('available_bike_stands', <class 'int'>), 3: ('available_bikes', <class 'int'>), 4: ('number', <class 'int'>), 5: ('heure', <class 'int'>), 6: ('minute', <class 'int'>)}
  [_guess_columns] sep=['t']
  TextFile: closing file  td8_velib.txt
  [_guess_columns] columns_name=None
   guess with  1001 lines
   count_types  {0: {<class 'str'>: 1000}, 1: {<class 'str'>: 1000}, 2: {<class 'int'>: 1000}, 3: {<class 'int'>: 1000}, 4: {<class 'int'>: 1000}, 5: {<class 'int'>: 1000}, 6: {<class 'int'>: 1000}}
   columns  {0: ('collect_date', <class 'str'>), 1: ('last_update', <class 'str'>), 2: ('available_bike_stands', <class 'int'>), 3: ('available_bikes', <class 'int'>), 4: ('number', <class 'int'>), 5: ('heure', <class 'int'>), 6: ('minute', <class 'int'>)}
   guess {0: ('collect_date', (<class 'str'>, 52)), 1: ('last_update', (<class 'str'>, 38)), 2: ('available_bike_stands', <class 'int'>), 3: ('available_bikes', <class 'int'>), 4: ('number', <class 'int'>), 5: ('heure', <class 'int'>), 6: ('minute', <class 'int'>)}
SQL  'CREATE TABLE td8_velib(collect_date TEXT,'
'       last_update TEXT,'
'       available_bike_stands INTEGER,'
'       available_bikes INTEGER,'
'       number INTEGER,'
'       heure INTEGER,'
'       minute INTEGER);'
   column_has_space False ['collect_date', 'last_update', 'available_bike_stands', 'available_bikes', 'number', 'heure', 'minute']
   changes {}
    TextFileColumns (2): regex:  {0: ('collect_date', (<class 'str'>, 52)), 1: ('last_update', (<class 'str'>, 38)), 2: ('available_bike_stands', <class 'int'>), 3: ('available_bikes', <class 'int'>), 4: ('number', <class 'int'>), 5: ('heure', <class 'int'>), 6: ('minute', <class 'int'>)}
  TextFile.guess_columns: processing file  td8_velib.txt
  TextFile: opening file  td8_velib.txt
  TextFile.guess_columns: using  101  lines
  TextFile: closing file  td8_velib.txt
  TextFile.guess_columns: sep  't' nb cols 7  bestnb  101  more  {('t', 6): 101, (' ', 2): 100}
  TextFile.guess_columns: header  True  columns  {0: ('collect_date', <class 'str'>), 1: ('last_update', <class 'str'>), 2: ('available_bike_stands', <class 'int'>), 3: ('available_bikes', <class 'int'>), 4: ('number', <class 'int'>), 5: ('heure', <class 'int'>), 6: ('minute', <class 'int'>)}
  compiling ^(?P<collect_date>.*)t(?P<last_update>.*)t(?P<available_bike_stands>([-]?[1-9][0-9]*?)|(0?))t(?P<available_bikes>([-]?[1-9][0-9]*?)|(0?))t(?P<number>([-]?[1-9][0-9]*?)|(0?))t(?P<heure>([-]?[1-9][0-9]*?)|(0?))t(?P<minute>([-]?[1-9][0-9]*?)|(0?))$
  TextFile.guess_columns: regex  ^(?P<collect_date>.*)t(?P<last_update>.*)t(?P<available_bike_stands>([-]?[1-9][0-9]*?)|(0?))t(?P<available_bikes>([-]?[1-9][0-9]*?)|(0?))t(?P<number>([-]?[1-9][0-9]*?)|(0?))t(?P<heure>([-]?[1-9][0-9]*?)|(0?))t(?P<minute>([-]?[1-9][0-9]*?)|(0?))$
  TextFile.guess_columns: header  True  columns  {0: ('collect_date', (<class 'str'>, 52)), 1: ('last_update', (<class 'str'>, 38)), 2: ('available_bike_stands', <class 'int'>), 3: ('available_bikes', <class 'int'>), 4: ('number', <class 'int'>), 5: ('heure', <class 'int'>), 6: ('minute', <class 'int'>)}
  TextFile: opening file  td8_velib.txt
adding 100000 lines into table td8_velib
adding 200000 lines into table td8_velib
adding 300000 lines into table td8_velib
adding 400000 lines into table td8_velib
adding 500000 lines into table td8_velib
adding 600000 lines into table td8_velib
adding 700000 lines into table td8_velib
adding 800000 lines into table td8_velib
adding 900000 lines into table td8_velib
adding 1000000 lines into table td8_velib
adding 1100000 lines into table td8_velib
^(?P<collect_date>.*)t(?P<last_update>.*)t(?P<available_bike_stands>([-]?[1-9][0-9]*?)|(0?))t(?P<available_bikes>([-]?[1-9][0-9]*?)|(0?))t(?P<number>([-]?[1-9][0-9]*?)|(0?))t(?P<heure>([-]?[1-9][0-9]*?)|(0?))t(?P<minute>([-]?[1-9][0-9]*?)|(0?))$
error regex 0 unable to interpret line  1103788 :  ''
  TextFile: closing file  td8_velib.txt
1103787  lines imported
  TextFile: opening file  stations.txt
  TextFile.guess_columns: processing file  stations.txt
  TextFile: opening file  stations.txt
  TextFile.guess_columns: using  101  lines
  TextFile: closing file  stations.txt
  TextFile.guess_columns: sep  't' nb cols 6  bestnb  101  more  {('t', 5): 101, (' ', 13): 3, (' ', 21): 1, (' ', 14): 2, (' ', 12): 11, (' ', 16): 2, (' ', 9): 24, (' ', 10): 21, (' ', 8): 13, (' ', 11): 10, (' ', 7): 12, (' ', 6): 1}
  TextFile.guess_columns: header  True  columns  {0: ('address', <class 'str'>), 1: ('contract_name', <class 'str'>), 2: ('lat', <class 'float'>), 3: ('lng', <class 'float'>), 4: ('name', <class 'str'>), 5: ('number', <class 'int'>)}
  compiling ^(?P<address>.*)t(?P<contract_name>.*)t(?P<lat>[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)t(?P<lng>[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)t(?P<name>.*)t(?P<number>([-]?[1-9][0-9]*?)|(0?))$
  TextFile.guess_columns: regex  ^(?P<address>.*)t(?P<contract_name>.*)t(?P<lat>[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)t(?P<lng>[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)t(?P<name>.*)t(?P<number>([-]?[1-9][0-9]*?)|(0?))$
  TextFile.guess_columns: header  True  columns  {0: ('address', (<class 'str'>, 134)), 1: ('contract_name', (<class 'str'>, 10)), 2: ('lat', <class 'float'>), 3: ('lng', <class 'float'>), 4: ('name', (<class 'str'>, 98)), 5: ('number', <class 'int'>)}
  [_guess_columns] sep=['t']
  TextFile: closing file  stations.txt
  [_guess_columns] columns_name=None
   guess with  935 lines
   count_types  {0: {<class 'str'>: 934}, 1: {<class 'str'>: 934}, 2: {<class 'float'>: 934}, 3: {<class 'float'>: 934}, 4: {<class 'str'>: 934}, 5: {<class 'int'>: 934}}
   columns  {0: ('address', <class 'str'>), 1: ('contract_name', <class 'str'>), 2: ('lat', <class 'float'>), 3: ('lng', <class 'float'>), 4: ('name', <class 'str'>), 5: ('number', <class 'int'>)}
   guess {0: ('address', (<class 'str'>, 152)), 1: ('contract_name', (<class 'str'>, 10)), 2: ('lat', <class 'float'>), 3: ('lng', <class 'float'>), 4: ('name', (<class 'str'>, 98)), 5: ('number', <class 'int'>)}
SQL  'CREATE TABLE stations(address TEXT,'
'       contract_name TEXT,'
'       lat FLOAT,'
'       lng FLOAT,'
'       name TEXT,'
'       number INTEGER);'
   column_has_space False ['address', 'contract_name', 'lat', 'lng', 'name', 'number']
   changes {}
    TextFileColumns (2): regex:  {0: ('address', (<class 'str'>, 152)), 1: ('contract_name', (<class 'str'>, 10)), 2: ('lat', <class 'float'>), 3: ('lng', <class 'float'>), 4: ('name', (<class 'str'>, 98)), 5: ('number', <class 'int'>)}
  TextFile.guess_columns: processing file  stations.txt
  TextFile: opening file  stations.txt
  TextFile.guess_columns: using  101  lines
  TextFile: closing file  stations.txt
  TextFile.guess_columns: sep  't' nb cols 6  bestnb  101  more  {('t', 5): 101, (' ', 13): 3, (' ', 21): 1, (' ', 14): 2, (' ', 12): 11, (' ', 16): 2, (' ', 9): 24, (' ', 10): 21, (' ', 8): 13, (' ', 11): 10, (' ', 7): 12, (' ', 6): 1}
  TextFile.guess_columns: header  True  columns  {0: ('address', <class 'str'>), 1: ('contract_name', <class 'str'>), 2: ('lat', <class 'float'>), 3: ('lng', <class 'float'>), 4: ('name', <class 'str'>), 5: ('number', <class 'int'>)}
  compiling ^(?P<address>.*)t(?P<contract_name>.*)t(?P<lat>[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)t(?P<lng>[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)t(?P<name>.*)t(?P<number>([-]?[1-9][0-9]*?)|(0?))$
  TextFile.guess_columns: regex  ^(?P<address>.*)t(?P<contract_name>.*)t(?P<lat>[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)t(?P<lng>[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)t(?P<name>.*)t(?P<number>([-]?[1-9][0-9]*?)|(0?))$
  TextFile.guess_columns: header  True  columns  {0: ('address', (<class 'str'>, 134)), 1: ('contract_name', (<class 'str'>, 10)), 2: ('lat', <class 'float'>), 3: ('lng', <class 'float'>), 4: ('name', (<class 'str'>, 98)), 5: ('number', <class 'int'>)}
  TextFile: opening file  stations.txt
^(?P<address>.*)t(?P<contract_name>.*)t(?P<lat>[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)t(?P<lng>[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)t(?P<name>.*)t(?P<number>([-]?[1-9][0-9]*?)|(0?))$
error regex 0 unable to interpret line  1232 :  ''
  TextFile: closing file  stations.txt
1231  lines imported

%load_ext pyensae

%SQL_connect td8_velib.db3

<pyensae.sql.sql_interface_database.InterfaceSQLDatabase at 0x206823f4100>

Exercice 1 #

%%SQL
SELECT COUNT(*) FROM (
    SELECT DISTINCT last_update FROM td8_velib
) ;

	COUNT(*)
0	15294

%%SQL
SELECT MIN(last_update), MAX(last_update) FROM td8_velib ;

	MIN(last_update)	MAX(last_update)
0	2013-07-22 09:00:19	2013-09-13 11:25:19

Exercice 2 #

%%SQL
SELECT number, COUNT(*) AS nb
FROM td8_velib
WHERE available_bikes==0 AND last_update >= '2013-09-10 11:30:19'
GROUP BY number
ORDER BY nb DESC

	number	nb
0	42704	864
1	41101	864
2	35008	864
3	35005	864
4	35003	864
5	33012	864
6	33011	864
7	33006	864
8	33005	864
9	32603	864

Exercice 3 : plage horaires de cinq minutes où il n’y a aucun vélo disponible #

%%SQL
SELECT nb, COUNT(*) AS nb_station
FROM (
  -- requête de l'exercice précédent
  SELECT number, COUNT(*) AS nb
  FROM td8_velib
  WHERE available_bikes==0 AND last_update >= '2013-09-10 11:30:19'
  GROUP BY number
)
GROUP BY nb

	nb	nb_station
0	1	7
1	2	7
2	3	5
3	4	6
4	5	6
5	6	5
6	7	8
7	8	9
8	9	1
9	10	7

Exercice 4 : distribution horaire par station et par tranche de 5 minutes #

%%SQL
SELECT A.number, A.heure, A.minute, 1.0 * A.nb_velo / B.nb_velo_tot  AS distribution_temporelle
FROM (
  SELECT number, heure, minute, SUM(available_bikes) AS nb_velo
  FROM td8_velib
  WHERE last_update >= '2013-09-10 11:30:19'
  GROUP BY heure, minute, number
) AS A
JOIN (
  SELECT number, heure, minute, SUM(available_bikes) AS nb_velo_tot
  FROM td8_velib
  WHERE last_update >= '2013-09-10 11:30:19'
  GROUP BY number
) AS B
ON A.number == B.number
--WHERE A.number in (8001, 8003, 15024, 15031)  -- pour n'afficher que quelques stations
ORDER BY A.number, A.heure, A.minute

	number	heure	minute	distribution_temporelle
0	901	0	0	0.001104
1	901	0	5	0.001104
2	901	0	10	0.001104
3	901	0	15	0.001104
4	901	0	20	0.001104
5	901	0	25	0.000946
6	901	0	30	0.000946
7	901	0	35	0.000946
8	901	0	40	0.000946
9	901	0	45	0.000946

Zones de travail et zones de résidence #

%%SQL --df=df
SELECT number, SUM(distribution_temporelle) AS velo_jour
FROM (
  -- requête de l'exercice 4
    SELECT A.number, A.heure, A.minute, 1.0 * A.nb_velo / B.nb_velo_tot  AS distribution_temporelle
    FROM (
        SELECT number, heure, minute, SUM(available_bikes) AS nb_velo
        FROM td8_velib
        WHERE last_update >= '2013-09-10 11:30:19'
        GROUP BY heure, minute, number
    ) AS A
    JOIN (
        SELECT number, heure, minute, SUM(available_bikes) AS nb_velo_tot
        FROM td8_velib
        WHERE last_update >= '2013-09-10 11:30:19'
        GROUP BY number
    ) AS B
    ON A.number == B.number
)
WHERE heure >= 10 AND heure <= 16
GROUP BY number

	number	velo_jour
0	901	0.555188
1	903	0.473137
2	904	0.357640
3	905	0.323395
4	906	0.117566
5	908	0.501727
6	1001	0.486796
7	1002	0.355434
8	1003	0.364749
9	1004	0.440549

df = df.sort_values("velo_jour").reset_index()
df["index"] = range(0, df.shape[0])
df.head()

	index	number	velo_jour
0	0	18113	0.000000
1	1	35013	0.000000
2	2	10010	0.000000
3	3	20037	0.007382
4	4	20119	0.010623

df.plot(x="index", y="velo_jour")

<AxesSubplot:xlabel='index'>

../_images/td1a_sql_correction_19_1.png

JOIN avec la table stations et les stations “travail”#

On trouve les arrondissements où les stations de vélib sont les plus remplies en journée au centre de Paris.

%%SQL
SELECT C.number, name, lat, lng, velo_jour FROM
(
  -- requête de la partie précédente
    SELECT number, SUM(distribution_temporelle) AS velo_jour
    FROM (
        -- requête de l'exercice 4
        SELECT A.number, A.heure, A.minute, 1.0 * A.nb_velo / B.nb_velo_tot  AS distribution_temporelle
        FROM (
            SELECT number, heure, minute, SUM(available_bikes) AS nb_velo
            FROM td8_velib
            WHERE last_update >= '2013-09-10 11:30:19'
            GROUP BY heure, minute, number
        ) AS A
        JOIN (
            SELECT number, heure, minute, SUM(available_bikes) AS nb_velo_tot
            FROM td8_velib
            WHERE last_update >= '2013-09-10 11:30:19'
            GROUP BY number
        ) AS B
        ON A.number == B.number
    )
    WHERE heure >= 10 AND heure <= 16
    GROUP BY number
) AS C
INNER JOIN stations
ON C.number == stations.number

	number	name	lat	lng	velo_jour
0	901	00901 - PORT SOLFÉRINO (STATION MOBILE)	48.861380	2.324420	0.555188
1	903	00903 - QUAI MAURIAC / PONT DE BERCY	48.837134	2.374341	0.473137
2	904	00904 - PLACE JOFFRE / ECOLE MILITAIRE	48.852136	2.301961	0.357640
3	905	00905 - CONCORDE/BERGES DE SEINE (STATION MOBILE)	48.863140	2.316690	0.323395
4	906	00906 - GARE DE L'EST	48.876420	2.358630	0.117566
5	908	00908 - PORT DU GROS CAILLOU (STATION MOBILE)	48.862880	2.306520	0.501727
6	1001	01001 - ILE DE LA CITE PONT NEUF	48.857092	2.341748	0.486796
7	1002	01002 - PLACE DU CHATELET	48.857940	2.347010	0.355434
8	1003	01003 - RIVOLI SAINT DENIS	48.859150	2.347620	0.364749
9	1004	01004 - MARGUERITE DE NAVARRE	48.859896	2.346757	0.440549