Cheat Sheet on files#

Links: notebook, html, PDF, python, slides, GitHub

Cheat sheet on files.

# Render a clickable table of contents at the top of the notebook.
from jyquickhelper import add_notebook_menu
add_notebook_menu()

change the encoding of a file#

# Create a small sample file encoded in latin-1 (accented characters
# make the encoding visible when converting later).
with open("essai.txt", "w", encoding="latin-1") as f:
    f.write("ée\n")
    f.write("àà")
from ensae_projects.datainc import change_encoding

# Convert the latin-1 file into a UTF-8 copy.
source, target = "essai.txt", "essai.utf8.txt"
change_encoding(source, target, enc1="latin-1", enc2="utf-8")
1
# Read the converted file back with the new encoding to check the result.
with open("essai.utf8.txt", "r", encoding="utf8") as handle:
    content = handle.read()
print(content)
ée
àà

select a subset of columns from a tsv file#

import pyensae.datasource
%load_ext pyensae
# Fetch and unzip the UCI "Online News Popularity" dataset (network I/O).
source_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00332/"
files = pyensae.datasource.download_data("OnlineNewsPopularity.zip",
                                         website=source_url)
files[1]
'OnlineNewsPopularity/OnlineNewsPopularity.csv'
%head OnlineNewsPopularity/OnlineNewsPopularity.csv -n 2
url, timedelta, n_tokens_title, n_tokens_content, n_unique_tokens, n_non_stop_words, n_non_stop_unique_tokens, num_hrefs, num_self_hrefs, num_imgs, num_videos, average_token_length, num_keywords, data_channel_is_lifestyle, data_channel_is_entertainment, data_channel_is_bus, data_channel_is_socmed, data_channel_is_tech, data_channel_is_world, kw_min_min, kw_max_min, kw_avg_min, kw_min_max, kw_max_max, kw_avg_max, kw_min_avg, kw_max_avg, kw_avg_avg, self_reference_min_shares, self_reference_max_shares, self_reference_avg_sharess, weekday_is_monday, weekday_is_tuesday, weekday_is_wednesday, weekday_is_thursday, weekday_is_friday, weekday_is_saturday, weekday_is_sunday, is_weekend, LDA_00, LDA_01, LDA_02, LDA_03, LDA_04, global_subjectivity, global_sentiment_polarity, global_rate_positive_words, global_rate_negative_words, rate_positive_words, rate_negative_words, avg_positive_polarity, min_positive_polarity, max_positive_polarity, avg_negative_polarity, min_negative_polarity, max_negative_polarity, title_subjectivity, title_sentiment_polarity, abs_title_subjectivity, abs_title_sentiment_polarity, shares
http://mashable.com/2013/01/07/amazon-instant-video-browser/, 731.0, 12.0, 219.0, 0.663594466988, 0.999999992308, 0.815384609112, 4.0, 2.0, 1.0, 0.0, 4.6803652968, 5.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 496.0, 496.0, 496.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.500331204081, 0.378278929586, 0.0400046751006, 0.0412626477296, 0.0401225435029, 0.521617145481, 0.0925619834711, 0.0456621004566, 0.013698630137, 0.769230769231, 0.230769230769, 0.378636363636, 0.1, 0.7, -0.35, -0.6, -0.2, 0.5, -0.1875, 0.0, 0.1875, 593

from ensae_projects.datainc import enumerate_text_lines
def clean_column_name(s):
    """Return the header field *s* without surrounding whitespace."""
    cleaned = s.strip()
    return cleaned
# Stream the big CSV row by row (never fully in memory) and keep only
# the two columns of interest, one dict per row.
bigfile = enumerate_text_lines("OnlineNewsPopularity/OnlineNewsPopularity.csv",
                               encoding="utf-8", header=True, quotes_as_str=False,
                               sep=",",
                               clean_column_name=clean_column_name, fLOG=print)
res = [{"LDA_00": row["LDA_00"],
        "title_sentiment_polarity": row["title_sentiment_polarity"]}
       for row in bigfile]
len(res)
39644
import pandas

# Materialize the extracted rows as a dataframe for a quick look.
df = pandas.DataFrame(data=res)
df.head()
LDA_00 title_sentiment_polarity
0 0.500331204081 -0.1875
1 0.799755687423 0.0
2 0.217792288518 0.0
3 0.0285732164707 0.0
4 0.0286328101715 0.136363636364

look at the head or tail of a file#

We use magic commands %head and %tail.

%load_ext pyensae
The pyensae extension is already loaded. To reload it, use:
  %reload_ext pyensae
%head essai.txt -n 1 -s ignore
e

%tail essai.txt -n 1 -s ignore
e

select lines of a flat file based on a regular expression#

%load_ext pyensae
The pyensae extension is already loaded. To reload it, use:
  %reload_ext pyensae
%grep essai.utf8.txt .*é.*
ée

More complex, we extract all lines containing a substring and we add the header to the file to make it look like a dataframe. We do that usually when we cannot load a big file into memory with pandas due to the lack of memory. This code relies on magic command grep and function enumerate_grep.

import pandas
import pyensae
# Build a tiny two-row dataframe and dump it to a UTF-8 CSV without the index.
records = [
    {"name": "Dupré", "first_name": "Xavier"},
    {"name": "Dupré", "first_name": "Sloane"},
]
df = pandas.DataFrame(records)
df.to_csv("data.txt", encoding="utf8", index=False)
%head data.txt
first_name,name
Xavier,Dupré
Sloane,Dupré

raw = %grep data.txt Xavier --raw
raw
'Xavier,Dupré\n'
header = %head data.txt -n 1 --raw
header
'first_name,name\n'
# Re-assemble a valid CSV: the original header line followed by the
# grep-selected rows.
with open("data_xavier.txt", "w", encoding="utf8") as out:
    out.write(header + raw)

%head data_xavier.txt
first_name,name
Xavier,Dupré

pandas.read_csv("data_xavier.txt")
first_name name
0 Xavier Dupré