missingno visualizes the missing values of a dataframe.
from jyquickhelper import add_notebook_menu
add_notebook_menu()
%matplotlib inline
The data is taken from the NYPD Motor Vehicle Collisions dataset.
import pandas, os
# Build a 10,000-row sample once, so later runs only need the small file
# and not the full export.
if not os.path.exists("NYPD_Motor_Vehicle_Collisions_sample.csv"):
    # the full file is 153 Mb
    df = pandas.read_csv("NYPD_Motor_Vehicle_Collisions.csv")
    # NOTE: to_csv also writes the index; it is read back below as the
    # extra "Unnamed: 0" column.
    df.sample(10000).to_csv("NYPD_Motor_Vehicle_Collisions_sample.csv")

# Always reload from the sample file so `df` has the same columns whether
# the sample was just created or already existed.
df = pandas.read_csv("NYPD_Motor_Vehicle_Collisions_sample.csv")
df.dtypes
Unnamed: 0 int64 DATE object TIME object BOROUGH object ZIP CODE float64 LATITUDE float64 LONGITUDE float64 LOCATION object ON STREET NAME object CROSS STREET NAME object OFF STREET NAME object NUMBER OF PERSONS INJURED int64 NUMBER OF PERSONS KILLED int64 NUMBER OF PEDESTRIANS INJURED int64 NUMBER OF PEDESTRIANS KILLED int64 NUMBER OF CYCLIST INJURED int64 NUMBER OF CYCLIST KILLED int64 NUMBER OF MOTORIST INJURED int64 NUMBER OF MOTORIST KILLED int64 CONTRIBUTING FACTOR VEHICLE 1 object CONTRIBUTING FACTOR VEHICLE 2 object CONTRIBUTING FACTOR VEHICLE 3 object CONTRIBUTING FACTOR VEHICLE 4 object CONTRIBUTING FACTOR VEHICLE 5 object UNIQUE KEY int64 VEHICLE TYPE CODE 1 object VEHICLE TYPE CODE 2 object VEHICLE TYPE CODE 3 object VEHICLE TYPE CODE 4 object VEHICLE TYPE CODE 5 object dtype: object
import missingno

# Work on a random 250-row subset of the data; the same subset `sam` is
# reused by the plots that follow.
sam = df.sample(n=250)

# Nullity matrix of the subset.
missingno.matrix(sam)
# The heatmap call is guarded against KeyError; when it is raised, a hint
# is printed instead and the notebook keeps running.
try:
    missingno.heatmap(sam)
except KeyError:
    print("Maybe a mismatch between pandas and missingno.")

# The dendrogram is drawn regardless of whether the heatmap succeeded.
missingno.dendrogram(sam)
# Restrict the plot to a subset of columns selected by their nullity.
# Per the missingno documentation, filter='bottom' selects the least
# complete columns: here those at most 99.9% complete (p), capped at 15 (n).
filtered_data = missingno.nullity_filter(sam, filter='bottom', n=15, p=0.999)
missingno.matrix(filtered_data)

# Reorder the rows by nullity before plotting.
sorted_data = missingno.nullity_sort(sam, sort='descending')
# Plot the sorted frame directly: `sam` already has 250 rows, so calling
# .sample(250) on it would only shuffle the rows and undo the sort.
missingno.matrix(sorted_data)