#!/usr/bin/env python # -*- coding: utf-8 -*- # # Import a flat file into a SQLite database # # Importing a flatfile can be done with *pandas*. *pyensae* proposes a function which does so by guessing the schema over the first lines. # In[1]: import pyensae from jyquickhelper import add_notebook_menu add_notebook_menu() # ### Mix SQLite and DataFrame # When a dataset is huge (~3Gb), it takes some time to load it into a DataFrame. It is difficult to look at it in any tool (Python, Excel, ...) One option I usually do is to load it a SQL server if you have one. If you do not, then [SQLite](http://www.sqlite.org/) is the best option. Let's see how it works with a custom datasets. # In[2]: import pyensae import pyensae.datasource pyensae.datasource.download_data("velib_vanves.zip", website = "xd") # As this file is small (just an example), let's see how it looks like with a DataFrame. # In[3]: import pandas df = pandas.read_csv("velib_vanves.txt",sep="\t") df.head(n=2) # Then we import it into a SQLite3 database. The following function automatically guesses the table schema. # In[4]: from pyensae.sql import import_flatfile_into_database import_flatfile_into_database("velib_vanves.db3", "velib_vanves.txt", add_key="key") # We check the database exists: # In[5]: import os os.listdir(".") # On Windows, you can use [SQLiteSpy](http://www.yunqa.de/delphi/doku.php/products/sqlitespy/index) to visualize the created table. We use [pymysintall](http://www.xavierdupre.fr/app/pymyinstall/helpsphinx/index.html) to download it. # In[6]: try: from pymyinstall.installcustom import install_sqlitespy exe = install_sqlitespy() except: # we skip an exception # the website can be down... exe = None exe # We just need to run it (see [run_cmd](http://www.xavierdupre.fr/app/pyquickhelper/helpsphinx/pyquickhelper/loghelper/flog.html?highlight=run_cmd#loghelper.flog.run_cmd)). # In[7]: if exe: from pyquickhelper import run_cmd run_cmd("SQLiteSpy.exe velib_vanves.db3") # You should be able to see something like (on Windows): # In[8]: from pyquickhelper.helpgen import NbImage NbImage('img_nb_sqlitespy.png') # It is easier to use that tool to extract a sample of the data. Once it is ready, you can execute the SQL query in Python and converts the results into a DataFrame. The following code extracts a random sample from the original sets. # In[9]: sql = """SELECT * FROM velib_vanves WHERE key IN ({0})""" import random from pyquickhelper.loghelper import noLOG from pyensae.sql import Database db = Database("velib_vanves.db3", LOG = noLOG) db.connect() mx = db.execute_view("SELECT MAX(key) FROM velib_vanves")[0][0] rnd_ids = [ random.randint(1,mx) for i in range(0,100) ] # liste de 100 id aléatoires strids = ",".join( str(_) for _ in rnd_ids ) res = db.execute_view(sql.format (strids)) df = db.to_df(sql.format (strids)) db.close() df.head()[["key","last_update","available_bike_stands","available_bikes"]] #