Times Series

Links: notebook, html, PDF, python, slides, slides(2), GitHub

Material for the hackathon ENSAE / Red Cross / 2015. Extract and plot a time series.

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
Populating the interactive namespace from numpy and matplotlib
from jyquickhelper import add_notebook_menu
add_notebook_menu()

Connection to the cluster

import keyring
hackathon = {}
hackathon["blob_storage"] = keyring.get_password("CRCREDENTIALS", "blobstorage")
hackathon["password"] = keyring.get_password("CRCREDENTIALS", "password")
blobstorage = hackathon["blob_storage"]
blobpassword = hackathon["password"]
%load_ext pyensae
cl, bs = %blob_open
cl, bs
(<pyensae.remote.azure_connection.AzureClient at 0xa145d30>,
 <azure.storage.blob.blobservice.BlobService at 0xa13a6a0>)

Download data

%blob_ls croix-rouge
name last_modified content_type content_length blob_type
0 build/SINVOICE_M.csv Wed, 18 Nov 2015 18:56:27 GMT application/octet-stream 533771533 BlockBlob
1 build/Test_CRFFOR.GACCTMPD.csv Sun, 22 Nov 2015 21:53:38 GMT application/octet-stream 822231942 BlockBlob
2 data/ITMMASTER.schema.txt Mon, 16 Nov 2015 23:00:34 GMT application/octet-stream 5658 BlockBlob
3 data/ITMMASTER.txt Mon, 09 Nov 2015 21:41:00 GMT application/octet-stream 103096479 BlockBlob
4 data/SINVOICE.schema.txt Mon, 16 Nov 2015 23:00:35 GMT application/octet-stream 10252 BlockBlob
5 data/SINVOICE.txt Mon, 09 Nov 2015 21:42:32 GMT application/octet-stream 1362433753 BlockBlob
6 data/SINVOICEV.schema.txt Mon, 16 Nov 2015 23:00:35 GMT application/octet-stream 7999 BlockBlob
7 data/SINVOICEV.txt Mon, 09 Nov 2015 21:44:08 GMT application/octet-stream 1252461865 BlockBlob
8 data/SINVOICEV_.txt Tue, 24 Nov 2015 15:35:49 GMT application/octet-stream 1252461865 BlockBlob
9 data/SINVOICE_.txt Tue, 24 Nov 2015 15:34:54 GMT application/octet-stream 1362433753 BlockBlob
10 data/enseignes_france.csv Mon, 09 Nov 2015 21:40:54 GMT application/octet-stream 6303836 BlockBlob
11 data/stojou.csv Mon, 09 Nov 2015 21:55:23 GMT application/octet-stream 8821375868 BlockBlob
12 readme.txt Sun, 22 Nov 2015 18:00:34 GMT application/octet-stream 45 BlockBlob
%blob_down croix-rouge/data/SINVOICE.schema.txt SINVOICE.schema.txt
'SINVOICE.schema.txt'
import pandas
df = pandas.read_csv("SINVOICE.schema.txt", sep="\t", encoding="utf8")
df.to_excel("SINVOICE.schema.xlsx")
df.head(n=1)
Zone Typ Menu Long Act Dim Intitulé normal Intitulé abrégé Intitulé long Options Table liée Expression de lien Copie législation Annulation Vérification Obligatoire RAZ Mot-clé d'aide
0 SIVTYP TSV 0 NaN NaN 1 Type facture Type fac Type facture NaN TABSIVTYP NaN NaN Bloquant Oui Non Non NaN
%blob_down croix-rouge/data/SINVOICE_.txt SINVOICE_.txt

Operations, conversion, and some others stuff

change encoding

from ensae_projects.datainc import change_encoding
change_encoding("SINVOICE_.txt", "SINVOICE_.utf8.txt", enc1="latin-1")
1446850
import pyensae
%head SINVOICE_.utf8.txt --n=2
"SIVTYP_0"  "INVTYP_0"      "NUM_0" "ORIMOD_0"      "BPR_0" "BPRSAC_0"      "CPY_0" "FCY_0" "GTE_0" "JOU_0" "ACCDAT_0"      "ACCNUM_0"      "BPRDAT_0"      "BPRVCR_0"      "CUR_0" "CURTYP_0"      "LED_0" "LED_1" "LED_2" "LED_3" "LED_4" "LED_5" "LED_6" "LED_7" "LED_8" "LED_9" "CURLED_0"      "CURLED_1"      "CURLED_2"      "CURLED_3"      "CURLED_4"      "CURLED_5"      "CURLED_6"      "CURLED_7"      "CURLED_8"      "CURLED_9"      "RATMLT_0"      "RATMLT_1"      "RATMLT_2"      "RATMLT_3"      "RATMLT_4"      "RATMLT_5"      "RATMLT_6"      "RATMLT_7"      "RATMLT_8"      "RATMLT_9"      "RATDIV_0"      "RATDIV_1"      "RATDIV_2"      "RATDIV_3"      "RATDIV_4"      "RATDIV_5"      "RATDIV_6"      "RATDIV_7"      "RATDIV_8"      "RATDIV_9"      "RATDAT_0"      "BPRPAY_0"      "BPAPAY_0"      "BPYNAM_0"      "BPYNAM_1"      "BPYADDLIG_0"   "BPYADDLIG_1"   "BPYADDLIG_2"   "BPYPOSCOD_0"   "BPYCTY_0"      "BPYSAT_0"      "BPYCRY_0"      "BPYCRYNAM_0"   "BPRFCT_0"      "FCTVCR_0"      "FCTVCRFLG_0"   "QTCACCNUM_0"   "STRDUDDAT_0"   "PTE_0" "DEP_0" "VAC_0" "DIRINVFLG_0"   "EECNUMDEB_0"   "STA_0" "DES_0" "DES_1" "DES_2" "DES_3" "DES_4" "INVNUM_0"      "SNS_0" "AMTATI_0"      "AMTNOT_0"      "AMTNOTL_0"     "AMTATIL_0"     "VATDAT_0"      "NBRTAX_0"      "TAX_0" "TAX_1" "TAX_2" "TAX_3" "TAX_4" "TAX_5" "TAX_6" "TAX_7" "TAX_8" "TAX_9" "BASTAX_0"      "BASTAX_1"      "BASTAX_2"      "BASTAX_3"      "BASTAX_4"      "BASTAX_5"      "BASTAX_6"      "BASTAX_7"      "BASTAX_8"      "BASTAX_9"      "AMTTAX_0"      "AMTTAX_1"      "AMTTAX_2"      "AMTTAX_3"      "AMTTAX_4"      "AMTTAX_5"      "AMTTAX_6"      "AMTTAX_7"      "AMTTAX_8"      "AMTTAX_9"      "THEAMTTAX_0"   "EXEAMTTAX_0"   "EXEAMTTAX_1"   "EXEAMTTAX_2"   "EXEAMTTAX_3"   "EXEAMTTAX_4"   "EXEAMTTAX_5"   "EXEAMTTAX_6"   "EXEAMTTAX_7"   "EXEAMTTAX_8"   "EXEAMTTAX_9"   "DIE_0" "DIE_1" "DIE_2" "DIE_3" "DIE_4" "DIE_5" "DIE_6" "DIE_7" "DIE_8" "DIE_9" "DIE_10"        "DIE_11"        "DIE_12"        "DIE_13"        "DIE_14"        "DIE_15"        "DIE_16"        "DIE_17"        "DIE_18"        "DIE_19"        "CCE_0" "CCE_1" "CCE_2" "CCE_3" "CCE_4" "CCE_5" "CCE_6" "CCE_7" "CCE_8" "CCE_9" "CCE_10"        "CCE_11"        "CCE_12"        "CCE_13"        "CCE_14"        "CCE_15"        "CCE_16"        "CCE_17"        "CCE_18"        "CCE_19"        "BPAINV_0"      "BPRNAM_0"      "BPRNAM_1"      "BPAADDLIG_0"   "BPAADDLIG_1"   "BPAADDLIG_2"   "POSCOD_0"      "CTY_0" "SAT_0" "CRY_0" "CRYNAM_0"      "BILVCR_0"      "TRSFAM_0"      "FIY_0" "PER_0" "STRDATSVC_0"   "ENDDATSVC_0"   "LASDATSVC_0"   "AMTTAXUSA_0"   "CAI_0" "DATVLYCAI_0"   "WRHE_0"        "EXPNUM_0"      "SINUM_0"       "STARPT_0"      "ISEXTDOC_0"    "CREDAT_0"      "CREUSR_0"      "UPDDAT_0"      "YRESTEAV_0"    "UPDUSR_0"      "YNBR_0"        "YTYPOFAM_0"    "YCSP_0"        "YSITPAR_0"     "YSITTRA_0"     "YSITLOG_0"     "YSSITLOG_0"    "YCRY_0"        "YT1_0" "YT2_0" "YT3_0" "YT4_0" "YT5_0" "YT6_0" "YT7_0" "YT8_0" "YSEXMAL_0"     "YSEXFEM_0"     "YSOLDINITIAL_0"        "YSOLDEACTUAL_0"        "YMTSANSREDUC_0"        "YMTAVECREDUC_0"        "YMTREDUCTION_0"        "YMTVALMER_0"   "YORIG1_0"      "YORIG2_0"      "YORIG3_0"      "YORIG4_0"      "YORIG5_0"      "YORIG6_0"      "YORIG7_0"      "YORIG8_0"      "YORIG9_0"      "YFREQPASS_0"   "YCSPCHEF_0"    "YTYPPROV_0"    "YREVREF_0"     "YQUOTFAM_0"    "YCNTTYP_0"
"DIS"       1       "DIS101028000093"       5       "100000104"     "C2"    "CRF"   "U8201" "FACLI" "DIS"   28/10/10        217     28/10/10        " "     "EUR"   1       "FRA"   " "     " "     " "     " "     " "     " "     " "     " "     " "     "EUR"   " "     " "     " "     " "     " "     " "     " "     " "     " "     1       0       0       0       0       0       0       0       0       0       1       0       0       0       0       0       0       0       0       0       28/10/10        "100000104"     "A01"   "supprimé"      "supprimé"      " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     0       0       28/10/10        "COMPTANT"      " "     " "     2       0       3       " "     " "     " "     " "     " "     " "     1       0       0       0       0       28/10/10        0       " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     "A01"   "supprimé"      "supprimé"      " "     " "     " "     " "     " "     " "     " "     " "     " "     " "     0       0       31/12/99        31/12/99        31/12/99        0       " "     31/12/99        " "     1       " "     2       0       28/10/10        "REYC"  16/04/15        18,83666666666667       "YANONIMISATION"        1       3       1       1       3       2       2       "FR"    1       0       0       0       0       0       0       0       1       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       "1F1S"  1       2       0       0       1

enumerate rows as dictionary

from ensae_projects.datainc import enumerate_text_lines
def clean_column_name(s):
    return s.replace("_0", "")
for i, row in enumerate(enumerate_text_lines("SINVOICE_.utf8.txt",
                                             encoding="utf-8", header=True, quotes_as_str=True,
                       clean_column_name=clean_column_name)):

    print(list(sorted(row.keys())))
    break
['ACCDAT', 'ACCNUM', 'AMTATI', 'AMTATIL', 'AMTNOT', 'AMTNOTL', 'AMTTAX', 'AMTTAXUSA', 'AMTTAX_1', 'AMTTAX_2', 'AMTTAX_3', 'AMTTAX_4', 'AMTTAX_5', 'AMTTAX_6', 'AMTTAX_7', 'AMTTAX_8', 'AMTTAX_9', 'BASTAX', 'BASTAX_1', 'BASTAX_2', 'BASTAX_3', 'BASTAX_4', 'BASTAX_5', 'BASTAX_6', 'BASTAX_7', 'BASTAX_8', 'BASTAX_9', 'BILVCR', 'BPAADDLIG', 'BPAADDLIG_1', 'BPAADDLIG_2', 'BPAINV', 'BPAPAY', 'BPR', 'BPRDAT', 'BPRFCT', 'BPRNAM', 'BPRNAM_1', 'BPRPAY', 'BPRSAC', 'BPRVCR', 'BPYADDLIG', 'BPYADDLIG_1', 'BPYADDLIG_2', 'BPYCRY', 'BPYCRYNAM', 'BPYCTY', 'BPYNAM', 'BPYNAM_1', 'BPYPOSCOD', 'BPYSAT', 'CAI', 'CCE', 'CCE_1', 'CCE_10', 'CCE_11', 'CCE_12', 'CCE_13', 'CCE_14', 'CCE_15', 'CCE_16', 'CCE_17', 'CCE_18', 'CCE_19', 'CCE_2', 'CCE_3', 'CCE_4', 'CCE_5', 'CCE_6', 'CCE_7', 'CCE_8', 'CCE_9', 'CPY', 'CREDAT', 'CREUSR', 'CRY', 'CRYNAM', 'CTY', 'CUR', 'CURLED', 'CURLED_1', 'CURLED_2', 'CURLED_3', 'CURLED_4', 'CURLED_5', 'CURLED_6', 'CURLED_7', 'CURLED_8', 'CURLED_9', 'CURTYP', 'DATVLYCAI', 'DEP', 'DES', 'DES_1', 'DES_2', 'DES_3', 'DES_4', 'DIE', 'DIE_1', 'DIE_10', 'DIE_11', 'DIE_12', 'DIE_13', 'DIE_14', 'DIE_15', 'DIE_16', 'DIE_17', 'DIE_18', 'DIE_19', 'DIE_2', 'DIE_3', 'DIE_4', 'DIE_5', 'DIE_6', 'DIE_7', 'DIE_8', 'DIE_9', 'DIRINVFLG', 'EECNUMDEB', 'ENDDATSVC', 'EXEAMTTAX', 'EXEAMTTAX_1', 'EXEAMTTAX_2', 'EXEAMTTAX_3', 'EXEAMTTAX_4', 'EXEAMTTAX_5', 'EXEAMTTAX_6', 'EXEAMTTAX_7', 'EXEAMTTAX_8', 'EXEAMTTAX_9', 'EXPNUM', 'FCTVCR', 'FCTVCRFLG', 'FCY', 'FIY', 'GTE', 'INVNUM', 'INVTYP', 'ISEXTDOC', 'JOU', 'LASDATSVC', 'LED', 'LED_1', 'LED_2', 'LED_3', 'LED_4', 'LED_5', 'LED_6', 'LED_7', 'LED_8', 'LED_9', 'NBRTAX', 'NUM', 'ORIMOD', 'PER', 'POSCOD', 'PTE', 'QTCACCNUM', 'RATDAT', 'RATDIV', 'RATDIV_1', 'RATDIV_2', 'RATDIV_3', 'RATDIV_4', 'RATDIV_5', 'RATDIV_6', 'RATDIV_7', 'RATDIV_8', 'RATDIV_9', 'RATMLT', 'RATMLT_1', 'RATMLT_2', 'RATMLT_3', 'RATMLT_4', 'RATMLT_5', 'RATMLT_6', 'RATMLT_7', 'RATMLT_8', 'RATMLT_9', 'SAT', 'SINUM', 'SIVTYP', 'SNS', 'STA', 'STARPT', 'STRDATSVC', 'STRDUDDAT', 'TAX', 'TAX_1', 'TAX_2', 'TAX_3', 'TAX_4', 'TAX_5', 'TAX_6', 'TAX_7', 'TAX_8', 'TAX_9', 'THEAMTTAX', 'TRSFAM', 'UPDDAT', 'UPDUSR', 'VAC', 'VATDAT', 'WRHE', 'YCNTTYP', 'YCRY', 'YCSP', 'YCSPCHEF', 'YFREQPASS', 'YMTAVECREDUC', 'YMTREDUCTION', 'YMTSANSREDUC', 'YMTVALMER', 'YNBR', 'YORIG1', 'YORIG2', 'YORIG3', 'YORIG4', 'YORIG5', 'YORIG6', 'YORIG7', 'YORIG8', 'YORIG9', 'YQUOTFAM', 'YRESTEAV', 'YREVREF', 'YSEXFEM', 'YSEXMAL', 'YSITLOG', 'YSITPAR', 'YSITTRA', 'YSOLDEACTUAL', 'YSOLDINITIAL', 'YSSITLOG', 'YT1', 'YT2', 'YT3', 'YT4', 'YT5', 'YT6', 'YT7', 'YT8', 'YTYPOFAM', 'YTYPPROV']

select a few columns and look at them

bigfile = enumerate_text_lines("SINVOICE_.utf8.txt", encoding="utf-8", header=True, quotes_as_str=True,
                              clean_column_name=clean_column_name, fLOG=print)
l = map(lambda row: { "BPR":row["BPR"], "BPRDAT":row["BPRDAT"]}, bigfile)
l = list(l)
len(l)
SINVOICE_.utf8.txt - 100000 lines
SINVOICE_.utf8.txt - 200000 lines
SINVOICE_.utf8.txt - 300000 lines
SINVOICE_.utf8.txt - 400000 lines
SINVOICE_.utf8.txt - 500000 lines
SINVOICE_.utf8.txt - 600000 lines
SINVOICE_.utf8.txt - 700000 lines
SINVOICE_.utf8.txt - 800000 lines
SINVOICE_.utf8.txt - 900000 lines
SINVOICE_.utf8.txt - 1000000 lines
SINVOICE_.utf8.txt - 1100000 lines
SINVOICE_.utf8.txt - 1200000 lines
SINVOICE_.utf8.txt - 1300000 lines
SINVOICE_.utf8.txt - 1400000 lines
1446850
import pandas
df = pandas.DataFrame(l)
df.head()
BPR BPRDAT
0 100000104 28/10/10
1 100000177 28/10/10
2 100000161 28/10/10
3 100000280 28/10/10
4 100000198 28/10/10
df.groupby("BPRDAT").count().plot()
<matplotlib.axes._subplots.AxesSubplot at 0xa3d3a20>
../_images/times_series_25_1.png