.. _td1asqlcorrectionrst: ===================================== 1A.soft - Notions de SQL - correction ===================================== .. only:: html **Links:** :download:`notebook `, :downloadlink:`html `, :download:`python `, :downloadlink:`slides `, :githublink:`GitHub|_doc/notebooks/td1a_soft/td1a_sql_correction.ipynb|*` Correction des exercices du premier notebooks relié au SQL. .. code:: ipython3 from jyquickhelper import add_notebook_menu add_notebook_menu() .. contents:: :local: .. code:: ipython3 %matplotlib inline Recupérer les données --------------------- .. code:: ipython3 import os if not os.path.exists("td8_velib.db3"): from pyensae.datasource import download_data download_data("td8_velib.zip", website = 'xd') from pyensae.sql import import_flatfile_into_database dbf = "td8_velib.db3" import_flatfile_into_database(dbf, "td8_velib.txt") # 2 secondes import_flatfile_into_database(dbf, "stations.txt", table="stations") # 2 minutes .. parsed-literal:: TextFile: opening file td8_velib.txt TextFile.guess_columns: processing file td8_velib.txt TextFile: opening file td8_velib.txt TextFile.guess_columns: using 101 lines TextFile: closing file td8_velib.txt TextFile.guess_columns: sep '\t' nb cols 7 bestnb 101 more {('\t', 6): 101, (' ', 2): 100} TextFile.guess_columns: header True columns {0: ('collect_date', ), 1: ('last_update', ), 2: ('available_bike_stands', ), 3: ('available_bikes', ), 4: ('number', ), 5: ('heure', ), 6: ('minute', )} compiling ^(?P.*)\t(?P.*)\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))$ TextFile.guess_columns: regex ^(?P.*)\t(?P.*)\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))$ TextFile.guess_columns: header True columns {0: ('collect_date', (, 52)), 1: ('last_update', (, 38)), 2: ('available_bike_stands', ), 3: ('available_bikes', ), 4: ('number', ), 5: ('heure', ), 6: ('minute', )} [_guess_columns] sep=['\t'] TextFile: closing file td8_velib.txt [_guess_columns] columns_name=None guess with 1001 lines count_types {0: {: 1000}, 1: {: 1000}, 2: {: 1000}, 3: {: 1000}, 4: {: 1000}, 5: {: 1000}, 6: {: 1000}} columns {0: ('collect_date', ), 1: ('last_update', ), 2: ('available_bike_stands', ), 3: ('available_bikes', ), 4: ('number', ), 5: ('heure', ), 6: ('minute', )} guess {0: ('collect_date', (, 52)), 1: ('last_update', (, 38)), 2: ('available_bike_stands', ), 3: ('available_bikes', ), 4: ('number', ), 5: ('heure', ), 6: ('minute', )} SQL 'CREATE TABLE td8_velib(collect_date TEXT,' ' last_update TEXT,' ' available_bike_stands INTEGER,' ' available_bikes INTEGER,' ' number INTEGER,' ' heure INTEGER,' ' minute INTEGER);' column_has_space False ['collect_date', 'last_update', 'available_bike_stands', 'available_bikes', 'number', 'heure', 'minute'] changes {} TextFileColumns (2): regex: {0: ('collect_date', (, 52)), 1: ('last_update', (, 38)), 2: ('available_bike_stands', ), 3: ('available_bikes', ), 4: ('number', ), 5: ('heure', ), 6: ('minute', )} TextFile.guess_columns: processing file td8_velib.txt TextFile: opening file td8_velib.txt TextFile.guess_columns: using 101 lines TextFile: closing file td8_velib.txt TextFile.guess_columns: sep '\t' nb cols 7 bestnb 101 more {('\t', 6): 101, (' ', 2): 100} TextFile.guess_columns: header True columns {0: ('collect_date', ), 1: ('last_update', ), 2: ('available_bike_stands', ), 3: ('available_bikes', ), 4: ('number', ), 5: ('heure', ), 6: ('minute', )} compiling ^(?P.*)\t(?P.*)\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))$ TextFile.guess_columns: regex ^(?P.*)\t(?P.*)\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))$ TextFile.guess_columns: header True columns {0: ('collect_date', (, 52)), 1: ('last_update', (, 38)), 2: ('available_bike_stands', ), 3: ('available_bikes', ), 4: ('number', ), 5: ('heure', ), 6: ('minute', )} TextFile: opening file td8_velib.txt adding 100000 lines into table td8_velib adding 200000 lines into table td8_velib adding 300000 lines into table td8_velib adding 400000 lines into table td8_velib adding 500000 lines into table td8_velib adding 600000 lines into table td8_velib adding 700000 lines into table td8_velib adding 800000 lines into table td8_velib adding 900000 lines into table td8_velib adding 1000000 lines into table td8_velib adding 1100000 lines into table td8_velib ^(?P.*)\t(?P.*)\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))\t(?P([-]?[1-9][0-9]*?)|(0?))$ error regex 0 unable to interpret line 1103788 : '' TextFile: closing file td8_velib.txt 1103787 lines imported TextFile: opening file stations.txt TextFile.guess_columns: processing file stations.txt TextFile: opening file stations.txt TextFile.guess_columns: using 101 lines TextFile: closing file stations.txt TextFile.guess_columns: sep '\t' nb cols 6 bestnb 101 more {('\t', 5): 101, (' ', 13): 3, (' ', 21): 1, (' ', 14): 2, (' ', 12): 11, (' ', 16): 2, (' ', 9): 24, (' ', 10): 21, (' ', 8): 13, (' ', 11): 10, (' ', 7): 12, (' ', 6): 1} TextFile.guess_columns: header True columns {0: ('address', ), 1: ('contract_name', ), 2: ('lat', ), 3: ('lng', ), 4: ('name', ), 5: ('number', )} compiling ^(?P

.*)\t(?P.*)\t(?P[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)\t(?P[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)\t(?P.*)\t(?P([-]?[1-9][0-9]*?)|(0?))$ TextFile.guess_columns: regex ^(?P

.*)\t(?P.*)\t(?P[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)\t(?P[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)\t(?P.*)\t(?P([-]?[1-9][0-9]*?)|(0?))$ TextFile.guess_columns: header True columns {0: ('address', (, 134)), 1: ('contract_name', (, 10)), 2: ('lat', ), 3: ('lng', ), 4: ('name', (, 98)), 5: ('number', )} [_guess_columns] sep=['\t'] TextFile: closing file stations.txt [_guess_columns] columns_name=None guess with 935 lines count_types {0: {: 934}, 1: {: 934}, 2: {: 934}, 3: {: 934}, 4: {: 934}, 5: {: 934}} columns {0: ('address', ), 1: ('contract_name', ), 2: ('lat', ), 3: ('lng', ), 4: ('name', ), 5: ('number', )} guess {0: ('address', (, 152)), 1: ('contract_name', (, 10)), 2: ('lat', ), 3: ('lng', ), 4: ('name', (, 98)), 5: ('number', )} SQL 'CREATE TABLE stations(address TEXT,' ' contract_name TEXT,' ' lat FLOAT,' ' lng FLOAT,' ' name TEXT,' ' number INTEGER);' column_has_space False ['address', 'contract_name', 'lat', 'lng', 'name', 'number'] changes {} TextFileColumns (2): regex: {0: ('address', (, 152)), 1: ('contract_name', (, 10)), 2: ('lat', ), 3: ('lng', ), 4: ('name', (, 98)), 5: ('number', )} TextFile.guess_columns: processing file stations.txt TextFile: opening file stations.txt TextFile.guess_columns: using 101 lines TextFile: closing file stations.txt TextFile.guess_columns: sep '\t' nb cols 6 bestnb 101 more {('\t', 5): 101, (' ', 13): 3, (' ', 21): 1, (' ', 14): 2, (' ', 12): 11, (' ', 16): 2, (' ', 9): 24, (' ', 10): 21, (' ', 8): 13, (' ', 11): 10, (' ', 7): 12, (' ', 6): 1} TextFile.guess_columns: header True columns {0: ('address', ), 1: ('contract_name', ), 2: ('lat', ), 3: ('lng', ), 4: ('name', ), 5: ('number', )} compiling ^(?P

.*)\t(?P.*)\t(?P[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)\t(?P[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)\t(?P.*)\t(?P([-]?[1-9][0-9]*?)|(0?))$ TextFile.guess_columns: regex ^(?P

.*)\t(?P.*)\t(?P[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)\t(?P[-]?[0-9]*?([.][0-9]*?)?([eE][-]?[0-9]{0,4})?)\t(?P.*)\t(?P([-]?[1-9][0-9]*?)|(0?))$ error regex 0 unable to interpret line 1232 : '' TextFile: closing file stations.txt 1231 lines imported .. code:: ipython3 %load_ext pyensae .. code:: ipython3 %SQL_connect td8_velib.db3 .. parsed-literal:: Exercice 1 ---------- .. code:: ipython3 %%SQL SELECT COUNT(*) FROM ( SELECT DISTINCT last_update FROM td8_velib ) ; .. raw:: html

	COUNT(*)
0	15294

.. code:: ipython3 %%SQL SELECT MIN(last_update), MAX(last_update) FROM td8_velib ; .. raw:: html

	MIN(last_update)	MAX(last_update)
0	2013-07-22 09:00:19	2013-09-13 11:25:19

Exercice 2 ---------- .. code:: ipython3 %%SQL SELECT number, COUNT(*) AS nb FROM td8_velib WHERE available_bikes==0 AND last_update >= '2013-09-10 11:30:19' GROUP BY number ORDER BY nb DESC .. raw:: html

	number	nb
0	42704	864
1	41101	864
2	35008	864
3	35005	864
4	35003	864
5	33012	864
6	33011	864
7	33006	864
8	33005	864
9	32603	864

Exercice 3 : plage horaires de cinq minutes où il n’y a aucun vélo disponible ----------------------------------------------------------------------------- .. code:: ipython3 %%SQL SELECT nb, COUNT(*) AS nb_station FROM ( -- requête de l'exercice précédent SELECT number, COUNT(*) AS nb FROM td8_velib WHERE available_bikes==0 AND last_update >= '2013-09-10 11:30:19' GROUP BY number ) GROUP BY nb .. raw:: html

	nb	nb_station
0	1	7
1	2	7
2	3	5
3	4	6
4	5	6
5	6	5
6	7	8
7	8	9
8	9	1
9	10	7

Exercice 4 : distribution horaire par station et par tranche de 5 minutes ------------------------------------------------------------------------- .. code:: ipython3 %%SQL SELECT A.number, A.heure, A.minute, 1.0 * A.nb_velo / B.nb_velo_tot AS distribution_temporelle FROM ( SELECT number, heure, minute, SUM(available_bikes) AS nb_velo FROM td8_velib WHERE last_update >= '2013-09-10 11:30:19' GROUP BY heure, minute, number ) AS A JOIN ( SELECT number, heure, minute, SUM(available_bikes) AS nb_velo_tot FROM td8_velib WHERE last_update >= '2013-09-10 11:30:19' GROUP BY number ) AS B ON A.number == B.number --WHERE A.number in (8001, 8003, 15024, 15031) -- pour n'afficher que quelques stations ORDER BY A.number, A.heure, A.minute .. raw:: html

	number	minute	distribution_temporelle
0	901	0	0.001104
1	901	5	0.001104
2	901	10	0.001104
3	901	15	0.001104
4	901	20	0.001104
5	901	25	0.000946
6	901	30	0.000946
7	901	35	0.000946
8	901	40	0.000946
9	901	45	0.000946

Zones de travail et zones de résidence -------------------------------------- .. code:: ipython3 %%SQL --df=df SELECT number, SUM(distribution_temporelle) AS velo_jour FROM ( -- requête de l'exercice 4 SELECT A.number, A.heure, A.minute, 1.0 * A.nb_velo / B.nb_velo_tot AS distribution_temporelle FROM ( SELECT number, heure, minute, SUM(available_bikes) AS nb_velo FROM td8_velib WHERE last_update >= '2013-09-10 11:30:19' GROUP BY heure, minute, number ) AS A JOIN ( SELECT number, heure, minute, SUM(available_bikes) AS nb_velo_tot FROM td8_velib WHERE last_update >= '2013-09-10 11:30:19' GROUP BY number ) AS B ON A.number == B.number ) WHERE heure >= 10 AND heure <= 16 GROUP BY number .. raw:: html

	number	velo_jour
0	901	0.555188
1	903	0.473137
2	904	0.357640
3	905	0.323395
4	906	0.117566
5	908	0.501727
6	1001	0.486796
7	1002	0.355434
8	1003	0.364749
9	1004	0.440549

.. code:: ipython3 df = df.sort_values("velo_jour").reset_index() df["index"] = range(0, df.shape[0]) df.head() .. raw:: html

	index	number	velo_jour
0	0	18113	0.000000
1	1	35013	0.000000
2	2	10010	0.000000
3	3	20037	0.007382
4	4	20119	0.010623

.. code:: ipython3 df.plot(x="index", y="velo_jour") .. parsed-literal:: .. image:: td1a_sql_correction_19_1.png JOIN avec la table stations et les stations “travail” ----------------------------------------------------- On trouve les arrondissements où les stations de vélib sont les plus remplies en journée au centre de Paris. .. code:: ipython3 %%SQL SELECT C.number, name, lat, lng, velo_jour FROM ( -- requête de la partie précédente SELECT number, SUM(distribution_temporelle) AS velo_jour FROM ( -- requête de l'exercice 4 SELECT A.number, A.heure, A.minute, 1.0 * A.nb_velo / B.nb_velo_tot AS distribution_temporelle FROM ( SELECT number, heure, minute, SUM(available_bikes) AS nb_velo FROM td8_velib WHERE last_update >= '2013-09-10 11:30:19' GROUP BY heure, minute, number ) AS A JOIN ( SELECT number, heure, minute, SUM(available_bikes) AS nb_velo_tot FROM td8_velib WHERE last_update >= '2013-09-10 11:30:19' GROUP BY number ) AS B ON A.number == B.number ) WHERE heure >= 10 AND heure <= 16 GROUP BY number ) AS C INNER JOIN stations ON C.number == stations.number .. raw:: html

	number	name	lat	lng	velo_jour
0	901	00901 - PORT SOLFÉRINO (STATION MOBILE)	48.861380	2.324420	0.555188
1	903	00903 - QUAI MAURIAC / PONT DE BERCY	48.837134	2.374341	0.473137
2	904	00904 - PLACE JOFFRE / ECOLE MILITAIRE	48.852136	2.301961	0.357640
3	905	00905 - CONCORDE/BERGES DE SEINE (STATION MOBILE)	48.863140	2.316690	0.323395
4	906	00906 - GARE DE L'EST	48.876420	2.358630	0.117566
5	908	00908 - PORT DU GROS CAILLOU (STATION MOBILE)	48.862880	2.306520	0.501727
6	1001	01001 - ILE DE LA CITE PONT NEUF	48.857092	2.341748	0.486796
7	1002	01002 - PLACE DU CHATELET	48.857940	2.347010	0.355434
8	1003	01003 - RIVOLI SAINT DENIS	48.859150	2.347620	0.364749
9	1004	01004 - MARGUERITE DE NAVARRE	48.859896	2.346757	0.440549