Times Series#
Links: notebook
, html, PDF
, python
, slides, GitHub
Material for the hackathon ENSAE / Red Cross / 2015. Extract and plot a time series.
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
Populating the interactive namespace from numpy and matplotlib
from jyquickhelper import add_notebook_menu
add_notebook_menu()
Connection to the cluster#
from pyquickhelper.loghelper import get_password
hackathon = {}
hackathon["blob_storage"] = get_password("CRCREDENTIALS", "blobstorage")
hackathon["password"] = get_password("CRCREDENTIALS", "password")
blobstorage = hackathon["blob_storage"]
blobpassword = hackathon["password"]
%load_ext pyensae
cl, bs = %blob_open
cl, bs
(<pyensae.remote.azure_connection.AzureClient at 0xa145d30>,
<azure.storage.blob.blobservice.BlobService at 0xa13a6a0>)
Download data#
%blob_ls croix-rouge
name | last_modified | content_type | content_length | blob_type | |
---|---|---|---|---|---|
0 | build/SINVOICE_M.csv | Wed, 18 Nov 2015 18:56:27 GMT | application/octet-stream | 533771533 | BlockBlob |
1 | build/Test_CRFFOR.GACCTMPD.csv | Sun, 22 Nov 2015 21:53:38 GMT | application/octet-stream | 822231942 | BlockBlob |
2 | data/ITMMASTER.schema.txt | Mon, 16 Nov 2015 23:00:34 GMT | application/octet-stream | 5658 | BlockBlob |
3 | data/ITMMASTER.txt | Mon, 09 Nov 2015 21:41:00 GMT | application/octet-stream | 103096479 | BlockBlob |
4 | data/SINVOICE.schema.txt | Mon, 16 Nov 2015 23:00:35 GMT | application/octet-stream | 10252 | BlockBlob |
5 | data/SINVOICE.txt | Mon, 09 Nov 2015 21:42:32 GMT | application/octet-stream | 1362433753 | BlockBlob |
6 | data/SINVOICEV.schema.txt | Mon, 16 Nov 2015 23:00:35 GMT | application/octet-stream | 7999 | BlockBlob |
7 | data/SINVOICEV.txt | Mon, 09 Nov 2015 21:44:08 GMT | application/octet-stream | 1252461865 | BlockBlob |
8 | data/SINVOICEV_.txt | Tue, 24 Nov 2015 15:35:49 GMT | application/octet-stream | 1252461865 | BlockBlob |
9 | data/SINVOICE_.txt | Tue, 24 Nov 2015 15:34:54 GMT | application/octet-stream | 1362433753 | BlockBlob |
10 | data/enseignes_france.csv | Mon, 09 Nov 2015 21:40:54 GMT | application/octet-stream | 6303836 | BlockBlob |
11 | data/stojou.csv | Mon, 09 Nov 2015 21:55:23 GMT | application/octet-stream | 8821375868 | BlockBlob |
12 | readme.txt | Sun, 22 Nov 2015 18:00:34 GMT | application/octet-stream | 45 | BlockBlob |
%blob_down croix-rouge/data/SINVOICE.schema.txt SINVOICE.schema.txt
'SINVOICE.schema.txt'
import pandas
df = pandas.read_csv("SINVOICE.schema.txt", sep="\t", encoding="utf8")
df.to_excel("SINVOICE.schema.xlsx")
df.head(n=1)
Zone | Typ | Menu | Long | Act | Dim | Intitulé normal | Intitulé abrégé | Intitulé long | Options | Table liée | Expression de lien | Copie législation | Annulation | Vérification | Obligatoire | RAZ | Mot-clé d'aide | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | SIVTYP | TSV | 0 | NaN | NaN | 1 | Type facture | Type fac | Type facture | NaN | TABSIVTYP | NaN | NaN | Bloquant | Oui | Non | Non | NaN |
%blob_down croix-rouge/data/SINVOICE_.txt SINVOICE_.txt
Operations, conversion, and some others stuff#
change encoding#
from ensae_projects.datainc import change_encoding
change_encoding("SINVOICE_.txt", "SINVOICE_.utf8.txt", enc1="latin-1")
1446850
import pyensae
%head SINVOICE_.utf8.txt --n=2
"SIVTYP_0" "INVTYP_0" "NUM_0" "ORIMOD_0" "BPR_0" "BPRSAC_0" "CPY_0" "FCY_0" "GTE_0" "JOU_0" "ACCDAT_0" "ACCNUM_0" "BPRDAT_0" "BPRVCR_0" "CUR_0" "CURTYP_0" "LED_0" "LED_1" "LED_2" "LED_3" "LED_4" "LED_5" "LED_6" "LED_7" "LED_8" "LED_9" "CURLED_0" "CURLED_1" "CURLED_2" "CURLED_3" "CURLED_4" "CURLED_5" "CURLED_6" "CURLED_7" "CURLED_8" "CURLED_9" "RATMLT_0" "RATMLT_1" "RATMLT_2" "RATMLT_3" "RATMLT_4" "RATMLT_5" "RATMLT_6" "RATMLT_7" "RATMLT_8" "RATMLT_9" "RATDIV_0" "RATDIV_1" "RATDIV_2" "RATDIV_3" "RATDIV_4" "RATDIV_5" "RATDIV_6" "RATDIV_7" "RATDIV_8" "RATDIV_9" "RATDAT_0" "BPRPAY_0" "BPAPAY_0" "BPYNAM_0" "BPYNAM_1" "BPYADDLIG_0" "BPYADDLIG_1" "BPYADDLIG_2" "BPYPOSCOD_0" "BPYCTY_0" "BPYSAT_0" "BPYCRY_0" "BPYCRYNAM_0" "BPRFCT_0" "FCTVCR_0" "FCTVCRFLG_0" "QTCACCNUM_0" "STRDUDDAT_0" "PTE_0" "DEP_0" "VAC_0" "DIRINVFLG_0" "EECNUMDEB_0" "STA_0" "DES_0" "DES_1" "DES_2" "DES_3" "DES_4" "INVNUM_0" "SNS_0" "AMTATI_0" "AMTNOT_0" "AMTNOTL_0" "AMTATIL_0" "VATDAT_0" "NBRTAX_0" "TAX_0" "TAX_1" "TAX_2" "TAX_3" "TAX_4" "TAX_5" "TAX_6" "TAX_7" "TAX_8" "TAX_9" "BASTAX_0" "BASTAX_1" "BASTAX_2" "BASTAX_3" "BASTAX_4" "BASTAX_5" "BASTAX_6" "BASTAX_7" "BASTAX_8" "BASTAX_9" "AMTTAX_0" "AMTTAX_1" "AMTTAX_2" "AMTTAX_3" "AMTTAX_4" "AMTTAX_5" "AMTTAX_6" "AMTTAX_7" "AMTTAX_8" "AMTTAX_9" "THEAMTTAX_0" "EXEAMTTAX_0" "EXEAMTTAX_1" "EXEAMTTAX_2" "EXEAMTTAX_3" "EXEAMTTAX_4" "EXEAMTTAX_5" "EXEAMTTAX_6" "EXEAMTTAX_7" "EXEAMTTAX_8" "EXEAMTTAX_9" "DIE_0" "DIE_1" "DIE_2" "DIE_3" "DIE_4" "DIE_5" "DIE_6" "DIE_7" "DIE_8" "DIE_9" "DIE_10" "DIE_11" "DIE_12" "DIE_13" "DIE_14" "DIE_15" "DIE_16" "DIE_17" "DIE_18" "DIE_19" "CCE_0" "CCE_1" "CCE_2" "CCE_3" "CCE_4" "CCE_5" "CCE_6" "CCE_7" "CCE_8" "CCE_9" "CCE_10" "CCE_11" "CCE_12" "CCE_13" "CCE_14" "CCE_15" "CCE_16" "CCE_17" "CCE_18" "CCE_19" "BPAINV_0" "BPRNAM_0" "BPRNAM_1" "BPAADDLIG_0" "BPAADDLIG_1" "BPAADDLIG_2" "POSCOD_0" "CTY_0" "SAT_0" "CRY_0" "CRYNAM_0" "BILVCR_0" "TRSFAM_0" "FIY_0" "PER_0" "STRDATSVC_0" "ENDDATSVC_0" "LASDATSVC_0" "AMTTAXUSA_0" "CAI_0" "DATVLYCAI_0" "WRHE_0" "EXPNUM_0" "SINUM_0" "STARPT_0" "ISEXTDOC_0" "CREDAT_0" "CREUSR_0" "UPDDAT_0" "YRESTEAV_0" "UPDUSR_0" "YNBR_0" "YTYPOFAM_0" "YCSP_0" "YSITPAR_0" "YSITTRA_0" "YSITLOG_0" "YSSITLOG_0" "YCRY_0" "YT1_0" "YT2_0" "YT3_0" "YT4_0" "YT5_0" "YT6_0" "YT7_0" "YT8_0" "YSEXMAL_0" "YSEXFEM_0" "YSOLDINITIAL_0" "YSOLDEACTUAL_0" "YMTSANSREDUC_0" "YMTAVECREDUC_0" "YMTREDUCTION_0" "YMTVALMER_0" "YORIG1_0" "YORIG2_0" "YORIG3_0" "YORIG4_0" "YORIG5_0" "YORIG6_0" "YORIG7_0" "YORIG8_0" "YORIG9_0" "YFREQPASS_0" "YCSPCHEF_0" "YTYPPROV_0" "YREVREF_0" "YQUOTFAM_0" "YCNTTYP_0" "DIS" 1 "DIS101028000093" 5 "100000104" "C2" "CRF" "U8201" "FACLI" "DIS" 28/10/10 217 28/10/10 " " "EUR" 1 "FRA" " " " " " " " " " " " " " " " " " " "EUR" " " " " " " " " " " " " " " " " " " 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 28/10/10 "100000104" "A01" "supprimé" "supprimé" " " " " " " " " " " " " " " " " " " " " 0 0 28/10/10 "COMPTANT" " " " " 2 0 3 " " " " " " " " " " " " 1 0 0 0 0 28/10/10 0 " " " " " " " " " " " " " " " " " " " " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "A01" "supprimé" "supprimé" " " " " " " " " " " " " " " " " " " " " 0 0 31/12/99 31/12/99 31/12/99 0 " " 31/12/99 " " 1 " " 2 0 28/10/10 "REYC" 16/04/15 18,83666666666667 "YANONIMISATION" 1 3 1 1 3 2 2 "FR" 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 "1F1S" 1 2 0 0 1
enumerate rows as dictionary#
from ensae_projects.datainc import enumerate_text_lines
def clean_column_name(s):
return s.replace("_0", "")
for i, row in enumerate(enumerate_text_lines("SINVOICE_.utf8.txt",
encoding="utf-8", header=True, quotes_as_str=True,
clean_column_name=clean_column_name)):
print(list(sorted(row.keys())))
break
['ACCDAT', 'ACCNUM', 'AMTATI', 'AMTATIL', 'AMTNOT', 'AMTNOTL', 'AMTTAX', 'AMTTAXUSA', 'AMTTAX_1', 'AMTTAX_2', 'AMTTAX_3', 'AMTTAX_4', 'AMTTAX_5', 'AMTTAX_6', 'AMTTAX_7', 'AMTTAX_8', 'AMTTAX_9', 'BASTAX', 'BASTAX_1', 'BASTAX_2', 'BASTAX_3', 'BASTAX_4', 'BASTAX_5', 'BASTAX_6', 'BASTAX_7', 'BASTAX_8', 'BASTAX_9', 'BILVCR', 'BPAADDLIG', 'BPAADDLIG_1', 'BPAADDLIG_2', 'BPAINV', 'BPAPAY', 'BPR', 'BPRDAT', 'BPRFCT', 'BPRNAM', 'BPRNAM_1', 'BPRPAY', 'BPRSAC', 'BPRVCR', 'BPYADDLIG', 'BPYADDLIG_1', 'BPYADDLIG_2', 'BPYCRY', 'BPYCRYNAM', 'BPYCTY', 'BPYNAM', 'BPYNAM_1', 'BPYPOSCOD', 'BPYSAT', 'CAI', 'CCE', 'CCE_1', 'CCE_10', 'CCE_11', 'CCE_12', 'CCE_13', 'CCE_14', 'CCE_15', 'CCE_16', 'CCE_17', 'CCE_18', 'CCE_19', 'CCE_2', 'CCE_3', 'CCE_4', 'CCE_5', 'CCE_6', 'CCE_7', 'CCE_8', 'CCE_9', 'CPY', 'CREDAT', 'CREUSR', 'CRY', 'CRYNAM', 'CTY', 'CUR', 'CURLED', 'CURLED_1', 'CURLED_2', 'CURLED_3', 'CURLED_4', 'CURLED_5', 'CURLED_6', 'CURLED_7', 'CURLED_8', 'CURLED_9', 'CURTYP', 'DATVLYCAI', 'DEP', 'DES', 'DES_1', 'DES_2', 'DES_3', 'DES_4', 'DIE', 'DIE_1', 'DIE_10', 'DIE_11', 'DIE_12', 'DIE_13', 'DIE_14', 'DIE_15', 'DIE_16', 'DIE_17', 'DIE_18', 'DIE_19', 'DIE_2', 'DIE_3', 'DIE_4', 'DIE_5', 'DIE_6', 'DIE_7', 'DIE_8', 'DIE_9', 'DIRINVFLG', 'EECNUMDEB', 'ENDDATSVC', 'EXEAMTTAX', 'EXEAMTTAX_1', 'EXEAMTTAX_2', 'EXEAMTTAX_3', 'EXEAMTTAX_4', 'EXEAMTTAX_5', 'EXEAMTTAX_6', 'EXEAMTTAX_7', 'EXEAMTTAX_8', 'EXEAMTTAX_9', 'EXPNUM', 'FCTVCR', 'FCTVCRFLG', 'FCY', 'FIY', 'GTE', 'INVNUM', 'INVTYP', 'ISEXTDOC', 'JOU', 'LASDATSVC', 'LED', 'LED_1', 'LED_2', 'LED_3', 'LED_4', 'LED_5', 'LED_6', 'LED_7', 'LED_8', 'LED_9', 'NBRTAX', 'NUM', 'ORIMOD', 'PER', 'POSCOD', 'PTE', 'QTCACCNUM', 'RATDAT', 'RATDIV', 'RATDIV_1', 'RATDIV_2', 'RATDIV_3', 'RATDIV_4', 'RATDIV_5', 'RATDIV_6', 'RATDIV_7', 'RATDIV_8', 'RATDIV_9', 'RATMLT', 'RATMLT_1', 'RATMLT_2', 'RATMLT_3', 'RATMLT_4', 'RATMLT_5', 'RATMLT_6', 'RATMLT_7', 'RATMLT_8', 'RATMLT_9', 'SAT', 'SINUM', 'SIVTYP', 'SNS', 'STA', 'STARPT', 'STRDATSVC', 'STRDUDDAT', 'TAX', 'TAX_1', 'TAX_2', 'TAX_3', 'TAX_4', 'TAX_5', 'TAX_6', 'TAX_7', 'TAX_8', 'TAX_9', 'THEAMTTAX', 'TRSFAM', 'UPDDAT', 'UPDUSR', 'VAC', 'VATDAT', 'WRHE', 'YCNTTYP', 'YCRY', 'YCSP', 'YCSPCHEF', 'YFREQPASS', 'YMTAVECREDUC', 'YMTREDUCTION', 'YMTSANSREDUC', 'YMTVALMER', 'YNBR', 'YORIG1', 'YORIG2', 'YORIG3', 'YORIG4', 'YORIG5', 'YORIG6', 'YORIG7', 'YORIG8', 'YORIG9', 'YQUOTFAM', 'YRESTEAV', 'YREVREF', 'YSEXFEM', 'YSEXMAL', 'YSITLOG', 'YSITPAR', 'YSITTRA', 'YSOLDEACTUAL', 'YSOLDINITIAL', 'YSSITLOG', 'YT1', 'YT2', 'YT3', 'YT4', 'YT5', 'YT6', 'YT7', 'YT8', 'YTYPOFAM', 'YTYPPROV']
select a few columns and look at them#
bigfile = enumerate_text_lines("SINVOICE_.utf8.txt", encoding="utf-8", header=True, quotes_as_str=True,
clean_column_name=clean_column_name, fLOG=print)
l = map(lambda row: { "BPR":row["BPR"], "BPRDAT":row["BPRDAT"]}, bigfile)
l = list(l)
len(l)
SINVOICE_.utf8.txt - 100000 lines
SINVOICE_.utf8.txt - 200000 lines
SINVOICE_.utf8.txt - 300000 lines
SINVOICE_.utf8.txt - 400000 lines
SINVOICE_.utf8.txt - 500000 lines
SINVOICE_.utf8.txt - 600000 lines
SINVOICE_.utf8.txt - 700000 lines
SINVOICE_.utf8.txt - 800000 lines
SINVOICE_.utf8.txt - 900000 lines
SINVOICE_.utf8.txt - 1000000 lines
SINVOICE_.utf8.txt - 1100000 lines
SINVOICE_.utf8.txt - 1200000 lines
SINVOICE_.utf8.txt - 1300000 lines
SINVOICE_.utf8.txt - 1400000 lines
1446850
import pandas
df = pandas.DataFrame(l)
df.head()
BPR | BPRDAT | |
---|---|---|
0 | 100000104 | 28/10/10 |
1 | 100000177 | 28/10/10 |
2 | 100000161 | 28/10/10 |
3 | 100000280 | 28/10/10 |
4 | 100000198 | 28/10/10 |
df.groupby("BPRDAT").count().plot()
<matplotlib.axes._subplots.AxesSubplot at 0xa3d3a20>