OnlineNewsPopularity (data from UCI)#


This notebook suggests a couple of ways to explore the data of a machine learning problem.

from jyquickhelper import add_notebook_menu
add_notebook_menu()
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn
import numpy
import pandas

Download data#

Online News Popularity Data Set

import pyensae.datasource
pyensae.datasource.download_data("OnlineNewsPopularity.zip",
                      url="https://archive.ics.uci.edu/ml/machine-learning-databases/00332/")
['OnlineNewsPopularity/OnlineNewsPopularity.names',
 'OnlineNewsPopularity/OnlineNewsPopularity.csv']
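If pyensae is not available, the same archive can be fetched with the standard library; a minimal sketch (same UCI URL as above):

import urllib.request
import zipfile

url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/"
       "00332/OnlineNewsPopularity.zip")
urllib.request.urlretrieve(url, "OnlineNewsPopularity.zip")  # download the archive
with zipfile.ZipFile("OnlineNewsPopularity.zip") as z:
    z.extractall(".")  # writes OnlineNewsPopularity/OnlineNewsPopularity.csv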
data = pandas.read_csv("OnlineNewsPopularity/OnlineNewsPopularity.csv")
data.columns = [c.strip() for c in data.columns]  # remove spaces around column names
data.head()
url timedelta n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words n_non_stop_unique_tokens num_hrefs num_self_hrefs num_imgs ... min_positive_polarity max_positive_polarity avg_negative_polarity min_negative_polarity max_negative_polarity title_subjectivity title_sentiment_polarity abs_title_subjectivity abs_title_sentiment_polarity shares
0 http://mashable.com/2013/01/07/amazon-instant-... 731.0 12.0 219.0 0.663594 1.0 0.815385 4.0 2.0 1.0 ... 0.100000 0.7 -0.350000 -0.600 -0.200000 0.500000 -0.187500 0.000000 0.187500 593
1 http://mashable.com/2013/01/07/ap-samsung-spon... 731.0 9.0 255.0 0.604743 1.0 0.791946 3.0 1.0 1.0 ... 0.033333 0.7 -0.118750 -0.125 -0.100000 0.000000 0.000000 0.500000 0.000000 711
2 http://mashable.com/2013/01/07/apple-40-billio... 731.0 9.0 211.0 0.575130 1.0 0.663866 3.0 1.0 1.0 ... 0.100000 1.0 -0.466667 -0.800 -0.133333 0.000000 0.000000 0.500000 0.000000 1500
3 http://mashable.com/2013/01/07/astronaut-notre... 731.0 9.0 531.0 0.503788 1.0 0.665635 9.0 0.0 1.0 ... 0.136364 0.8 -0.369697 -0.600 -0.166667 0.000000 0.000000 0.500000 0.000000 1200
4 http://mashable.com/2013/01/07/att-u-verse-apps/ 731.0 13.0 1072.0 0.415646 1.0 0.540890 19.0 19.0 20.0 ... 0.033333 1.0 -0.220192 -0.500 -0.050000 0.454545 0.136364 0.045455 0.136364 505

5 rows × 61 columns

data.shape
(39644, 61)
import numpy
numeric = [c for i,c in enumerate(data.columns) if data.dtypes[i] in [numpy.float64, numpy.int64]]
len(numeric)
60
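pandas can make the same selection more directly with select_dtypes; the sketch below should be equivalent (only the url column is not numeric):

numeric = list(data.select_dtypes(include=[numpy.number]).columns)
len(numeric)  # 60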

Correlations, pair plots and scales#

cmap = seaborn.diverging_palette(h_neg=210, h_pos=350, s=90, l=30, as_cmap=True, center="light")
seaborn.clustermap(data[numeric].corr(), figsize=(14, 14), cmap=cmap);
../_images/online_news_popylarity_9_0.png
numeric[:5]
['timedelta',
 'n_tokens_title',
 'n_tokens_content',
 'n_unique_tokens',
 'n_non_stop_words']
data_numeric5 = data[numeric[:5]]
seaborn.clustermap(data_numeric5.corr(), figsize=(6, 6), cmap=cmap);
../_images/online_news_popylarity_12_0.png
data_numeric5[::100].shape
(397, 5)

We take a subsample as the whole dataframe takes time to plot.

def my_pair_plot(df):
    g = seaborn.PairGrid(df, diag_sharey=False)
    g.map_upper(seaborn.scatterplot, s=15)
    g.map_lower(seaborn.kdeplot)
    g.map_diag(seaborn.kdeplot, lw=2)
    return g

my_pair_plot(data_numeric5[::100]);
../_images/online_news_popylarity_15_0.png
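Slicing with [::100] keeps every 100th row; since the rows seem ordered by publication date (see timedelta), a random subsample avoids picking a regular pattern. A sketch with DataFrame.sample:

my_pair_plot(data_numeric5.sample(n=400, random_state=0));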

The pair plots look odd; maybe that is because of a few outliers.

data[data.n_unique_tokens > 100]
url timedelta n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words n_non_stop_unique_tokens num_hrefs num_self_hrefs num_imgs ... min_positive_polarity max_positive_polarity avg_negative_polarity min_negative_polarity max_negative_polarity title_subjectivity title_sentiment_polarity abs_title_subjectivity abs_title_sentiment_polarity shares
31037 http://mashable.com/2014/08/18/ukraine-civilia... 142.0 9.0 1570.0 701.0 1042.0 650.0 11.0 10.0 51.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5900

1 rows × 61 columns

We remove this row as it is clearly an outlier (n_unique_tokens is a ratio and should stay below 1):

data_clean = data[data.n_unique_tokens < 100].copy()
my_pair_plot(data_clean[numeric[:5]][::100]);
../_images/online_news_popylarity_20_0.png
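The outlier above was spotted by eye; a quantile filter generalizes the idea. A sketch (the chosen columns and the 0.999 threshold are arbitrary, for illustration only):

def clip_extremes(df, columns, q=0.999):
    # keep only the rows below the q-th quantile on every given column
    mask = pandas.Series(True, index=df.index)
    for c in columns:
        mask &= df[c] <= df[c].quantile(q)
    return df[mask]

clip_extremes(data, ["n_unique_tokens", "n_non_stop_words"]).shape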
data_clean.hist(figsize=(16,16));
../_images/online_news_popylarity_21_0.png
desc = data_clean.describe().T
desc["log"] = (desc["max"] > desc["50%"] * 9) & (desc["max"] > 1)
desc["scale"] = ""
desc.loc[desc["log"],"scale"] = "log"
desc[["mean", "min", "50%", "max", "scale"]]
mean min 50% max scale
timedelta 354.535832 8.000000 339.000000 731.000000
n_tokens_title 10.398784 2.000000 10.000000 23.000000
n_tokens_content 546.488914 0.000000 409.000000 8474.000000 log
n_unique_tokens 0.530547 0.000000 0.539216 1.000000
n_non_stop_words 0.970209 0.000000 1.000000 1.000000
n_non_stop_unique_tokens 0.672796 0.000000 0.690476 1.000000
num_hrefs 10.883687 0.000000 8.000000 304.000000 log
num_self_hrefs 3.293469 0.000000 3.000000 116.000000 log
num_imgs 4.542971 0.000000 1.000000 128.000000 log
num_videos 1.249905 0.000000 0.000000 91.000000 log
average_token_length 4.548236 0.000000 4.664078 8.041534
num_keywords 7.223772 1.000000 7.000000 10.000000
data_channel_is_lifestyle 0.052948 0.000000 0.000000 1.000000
data_channel_is_entertainment 0.177989 0.000000 0.000000 1.000000
data_channel_is_bus 0.157859 0.000000 0.000000 1.000000
data_channel_is_socmed 0.058598 0.000000 0.000000 1.000000
data_channel_is_tech 0.185304 0.000000 0.000000 1.000000
data_channel_is_world 0.212572 0.000000 0.000000 1.000000
kw_min_min 26.107484 -1.000000 -1.000000 377.000000 log
kw_max_min 1153.961166 0.000000 660.000000 298400.000000 log
kw_avg_min 312.371221 -1.000000 235.500000 42827.857143 log
kw_min_max 13612.114774 0.000000 1400.000000 843300.000000 log
kw_max_max 752321.771813 0.000000 843300.000000 843300.000000
kw_avg_max 259280.143039 0.000000 244566.666667 843300.000000
kw_min_avg 1117.113731 -1.000000 1023.619048 3613.039820
kw_max_avg 5657.265804 0.000000 4355.694105 298400.000000 log
kw_avg_avg 3135.864283 0.000000 2870.047184 43567.659946 log
self_reference_min_shares 3998.836211 0.000000 1200.000000 843300.000000 log
self_reference_max_shares 10329.473218 0.000000 2800.000000 843300.000000 log
self_reference_avg_sharess 6401.684395 0.000000 2200.000000 843300.000000 log
weekday_is_monday 0.168025 0.000000 0.000000 1.000000
weekday_is_tuesday 0.186389 0.000000 0.000000 1.000000
weekday_is_wednesday 0.187549 0.000000 0.000000 1.000000
weekday_is_thursday 0.183311 0.000000 0.000000 1.000000
weekday_is_friday 0.143808 0.000000 0.000000 1.000000
weekday_is_saturday 0.061877 0.000000 0.000000 1.000000
weekday_is_sunday 0.069041 0.000000 0.000000 1.000000
is_weekend 0.130918 0.000000 0.000000 1.000000
LDA_00 0.184604 0.018182 0.033387 0.926994
LDA_01 0.141259 0.018182 0.033345 0.925947
LDA_02 0.216326 0.018182 0.040004 0.919999
LDA_03 0.223775 0.018182 0.040001 0.926534
LDA_04 0.234035 0.018182 0.040727 0.927191
global_subjectivity 0.443381 0.000000 0.453458 1.000000
global_sentiment_polarity 0.119312 -0.393750 0.119119 0.727841
global_rate_positive_words 0.039626 0.000000 0.039024 0.155488
global_rate_negative_words 0.016613 0.000000 0.015337 0.184932
rate_positive_words 0.682167 0.000000 0.710526 1.000000
rate_negative_words 0.287941 0.000000 0.280000 1.000000
avg_positive_polarity 0.353834 0.000000 0.358760 1.000000
min_positive_polarity 0.095448 0.000000 0.100000 1.000000
max_positive_polarity 0.756747 0.000000 0.800000 1.000000
avg_negative_polarity -0.259531 -1.000000 -0.253333 0.000000
min_negative_polarity -0.521957 -1.000000 -0.500000 0.000000
max_negative_polarity -0.107503 -1.000000 -0.100000 0.000000
title_subjectivity 0.282360 0.000000 0.150000 1.000000
title_sentiment_polarity 0.071427 -1.000000 0.000000 1.000000
abs_title_subjectivity 0.341851 0.000000 0.500000 0.500000
abs_title_sentiment_polarity 0.156068 0.000000 0.000000 1.000000
shares 3395.317004 1.000000 1400.000000 843300.000000 log
numpy.log(data_clean["shares"]).hist(bins=50);
../_images/online_news_popylarity_23_0.png
shares = data_clean[[c for c in numeric if "share" in c]].copy()
for c in shares.columns:
    shares[c] = numpy.log(shares[c] + 1)
seaborn.pairplot(shares);
../_images/online_news_popylarity_24_0.png
kw = data_clean[[c for c in numeric if "kw" in c]].copy()
seaborn.pairplot(kw);
../_images/online_news_popylarity_25_0.png
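The kw_* pair plot suffers from the same scale issue; a variant of the shares cell applies the logarithm first (a sketch, shifting by +2 for the columns whose minimum is -1, as in the scale table below):

kw_log = kw.copy()
for c in kw_log.columns:
    shift = 2 if kw_log[c].min() < 0 else 1  # log+2 when the column reaches -1
    kw_log[c] = numpy.log(kw_log[c] + shift)
seaborn.pairplot(kw_log);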

Outcome: cleaning and scaling#

Cleaning: remove the outlier found above.

data_clean = data[data.n_unique_tokens < 100].copy()

Scaling: if the maximum is far away from the median, we switch to a logarithmic scale, as the distribution is far from Gaussian. This heuristic is specific to this problem.

desc = data_clean.describe().T
desc["log"] = (desc["max"] > desc["50%"] * 9) & (desc["max"] > 1)
desc["log+2"] = desc["log"] & (desc["min"] < 0)
desc["scale"] = ""
desc.loc[desc["log"],"scale"] = "log"
desc.loc[desc["log+2"],"scale"] = "log+2"
desc[["mean", "min", "50%", "max", "scale"]]
mean min 50% max scale
timedelta 354.535832 8.000000 339.000000 731.000000
n_tokens_title 10.398784 2.000000 10.000000 23.000000
n_tokens_content 546.488914 0.000000 409.000000 8474.000000 log
n_unique_tokens 0.530547 0.000000 0.539216 1.000000
n_non_stop_words 0.970209 0.000000 1.000000 1.000000
n_non_stop_unique_tokens 0.672796 0.000000 0.690476 1.000000
num_hrefs 10.883687 0.000000 8.000000 304.000000 log
num_self_hrefs 3.293469 0.000000 3.000000 116.000000 log
num_imgs 4.542971 0.000000 1.000000 128.000000 log
num_videos 1.249905 0.000000 0.000000 91.000000 log
average_token_length 4.548236 0.000000 4.664078 8.041534
num_keywords 7.223772 1.000000 7.000000 10.000000
data_channel_is_lifestyle 0.052948 0.000000 0.000000 1.000000
data_channel_is_entertainment 0.177989 0.000000 0.000000 1.000000
data_channel_is_bus 0.157859 0.000000 0.000000 1.000000
data_channel_is_socmed 0.058598 0.000000 0.000000 1.000000
data_channel_is_tech 0.185304 0.000000 0.000000 1.000000
data_channel_is_world 0.212572 0.000000 0.000000 1.000000
kw_min_min 26.107484 -1.000000 -1.000000 377.000000 log+2
kw_max_min 1153.961166 0.000000 660.000000 298400.000000 log
kw_avg_min 312.371221 -1.000000 235.500000 42827.857143 log+2
kw_min_max 13612.114774 0.000000 1400.000000 843300.000000 log
kw_max_max 752321.771813 0.000000 843300.000000 843300.000000
kw_avg_max 259280.143039 0.000000 244566.666667 843300.000000
kw_min_avg 1117.113731 -1.000000 1023.619048 3613.039820
kw_max_avg 5657.265804 0.000000 4355.694105 298400.000000 log
kw_avg_avg 3135.864283 0.000000 2870.047184 43567.659946 log
self_reference_min_shares 3998.836211 0.000000 1200.000000 843300.000000 log
self_reference_max_shares 10329.473218 0.000000 2800.000000 843300.000000 log
self_reference_avg_sharess 6401.684395 0.000000 2200.000000 843300.000000 log
weekday_is_monday 0.168025 0.000000 0.000000 1.000000
weekday_is_tuesday 0.186389 0.000000 0.000000 1.000000
weekday_is_wednesday 0.187549 0.000000 0.000000 1.000000
weekday_is_thursday 0.183311 0.000000 0.000000 1.000000
weekday_is_friday 0.143808 0.000000 0.000000 1.000000
weekday_is_saturday 0.061877 0.000000 0.000000 1.000000
weekday_is_sunday 0.069041 0.000000 0.000000 1.000000
is_weekend 0.130918 0.000000 0.000000 1.000000
LDA_00 0.184604 0.018182 0.033387 0.926994
LDA_01 0.141259 0.018182 0.033345 0.925947
LDA_02 0.216326 0.018182 0.040004 0.919999
LDA_03 0.223775 0.018182 0.040001 0.926534
LDA_04 0.234035 0.018182 0.040727 0.927191
global_subjectivity 0.443381 0.000000 0.453458 1.000000
global_sentiment_polarity 0.119312 -0.393750 0.119119 0.727841
global_rate_positive_words 0.039626 0.000000 0.039024 0.155488
global_rate_negative_words 0.016613 0.000000 0.015337 0.184932
rate_positive_words 0.682167 0.000000 0.710526 1.000000
rate_negative_words 0.287941 0.000000 0.280000 1.000000
avg_positive_polarity 0.353834 0.000000 0.358760 1.000000
min_positive_polarity 0.095448 0.000000 0.100000 1.000000
max_positive_polarity 0.756747 0.000000 0.800000 1.000000
avg_negative_polarity -0.259531 -1.000000 -0.253333 0.000000
min_negative_polarity -0.521957 -1.000000 -0.500000 0.000000
max_negative_polarity -0.107503 -1.000000 -0.100000 0.000000
title_subjectivity 0.282360 0.000000 0.150000 1.000000
title_sentiment_polarity 0.071427 -1.000000 0.000000 1.000000
abs_title_subjectivity 0.341851 0.000000 0.500000 0.500000
abs_title_sentiment_polarity 0.156068 0.000000 0.000000 1.000000
shares 3395.317004 1.000000 1400.000000 843300.000000 log
import numpy
new_data = data_clean.copy()
for c in desc.index[desc["scale"] == "log"]:
    new_data[c] = numpy.log(new_data[c] + 1)
for c in desc.index[desc["scale"] == "log+2"]:
    new_data[c] = numpy.log(new_data[c] + 2)
new_data.shape
(39643, 61)
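The target shares is now log(shares + 1), so any prediction must go through the inverse transform before being read as a number of shares; a minimal sketch:

def to_shares(log_pred):
    # inverse of numpy.log(x + 1)
    return numpy.expm1(log_pred)

to_shares(numpy.log(1400 + 1))  # recovers 1400, the median number of shares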
set(new_data.dtypes)
{dtype('float64'), dtype('O')}
from sklearn.model_selection import train_test_split

features = new_data[[c for c in numeric if c != "shares"]]
target = new_data["shares"]
X_train, X_test, y_train, y_test = train_test_split(features, target)
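train_test_split draws a random split, so the scores below change from run to run; fixing random_state (the seed value is an arbitrary addition) makes them reproducible:

X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=0)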

Learning#

RandomForestRegressor

from sklearn.ensemble import RandomForestRegressor
clr = RandomForestRegressor(min_samples_leaf=20, n_estimators=50, min_weight_fraction_leaf=0.01, min_samples_split=10)
clr.fit(X_train, y_train)
RandomForestRegressor(min_samples_leaf=20, min_samples_split=10,
                      min_weight_fraction_leaf=0.01, n_estimators=50)
tpredicted = clr.predict(X_train)
df = pandas.DataFrame()
df["train_predicted"] = tpredicted
df["train_expected"] = y_train
df.corr()
train_predicted train_expected
train_predicted 1.000000 0.004091
train_expected 0.004091 1.000000
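This correlation is suspiciously low for a model evaluated on its own training set. A likely cause: tpredicted is a plain array, so df gets a fresh 0..n-1 index, while y_train keeps the shuffled row labels of the original dataframe, and pandas aligns the two columns on these mismatched indices. A sketch of the aligned version (the same caveat applies to the similar cells below; its output would differ from the table above):

df = pandas.DataFrame()
df["train_predicted"] = tpredicted
df["train_expected"] = y_train.values  # .values drops the index so rows pair up positionally
df.corr()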
df = pandas.DataFrame()
df["test_predicted"] = clr.predict(X_test)
df["test_expected"] = y_test
df.corr()
test_predicted test_expected
test_predicted 1.00000 -0.00921
test_expected -0.00921 1.00000
df.plot(x ="test_expected", y="test_predicted", kind="scatter");
../_images/online_news_popylarity_38_0.png
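Correlation is only one view; sklearn.metrics computes the usual regression scores from the same predictions, and works on raw arrays so index alignment is not an issue (a sketch):

from sklearn.metrics import mean_absolute_error, r2_score

pred_test = clr.predict(X_test)
r2_score(y_test, pred_test), mean_absolute_error(y_test, pred_test)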

GradientBoostingRegressor

from sklearn.ensemble import GradientBoostingRegressor
est = GradientBoostingRegressor(min_samples_leaf=20, n_estimators=50, min_weight_fraction_leaf=0.01, min_samples_split=10)
est.fit(X_train, y_train)
GradientBoostingRegressor(min_samples_leaf=20, min_samples_split=10,
                          min_weight_fraction_leaf=0.01, n_estimators=50)
tpredicted = est.predict(X_train)
df = pandas.DataFrame()
df["train_predicted"] = tpredicted
df["train_expected"] = y_train
df.corr()
train_predicted train_expected
train_predicted 1.000000 0.008707
train_expected 0.008707 1.000000
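The gradient boosting model is only checked against its training set above; the test-set counterpart would mirror the random forest cells (a sketch, output omitted):

df = pandas.DataFrame()
df["test_predicted"] = est.predict(X_test)
df["test_expected"] = y_test.values
df.corr()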

XGBRegressor

import xgboost
clxg = xgboost.XGBRegressor(max_depth=10, learning_rate=0.3, n_estimators=50)
clxg.fit(X_train, y_train)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.3, max_delta_step=0, max_depth=10,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=50, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
xgpredicted = clxg.predict(X_train)
df = pandas.DataFrame()
df["train_predicted"] = xgpredicted
df["train_expected"] = y_train
df.corr()
train_predicted train_expected
train_predicted 1.000000 0.000811
train_expected 0.000811 1.000000
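Beyond raw scores, tree ensembles expose feature importances; a quick way to list the most influential columns from the fitted XGBRegressor (a sketch):

importances = pandas.Series(clxg.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=False).head(10)  # the ten most used features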
# too long to run
#from sklearn import tree
#from sklearn.ensemble import AdaBoostRegressor
#clfr = tree.DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=10)
#clf2 = AdaBoostRegressor(clfr, n_estimators=800, learning_rate=0.5)
#clf2.fit(X_train, y_train)

t-SNE#

See Comparison of Manifold Learning methods and t-distributed Stochastic Neighbor Embedding (t-SNE). t-SNE does not scale well with the number of points, so we first carve a smaller sample out of the training set.

from sklearn.model_selection import train_test_split
X_1, X_2, y_1, y_2 = train_test_split(X_train.reset_index(drop=True),
                        y_train.reset_index(drop=True), test_size=0.2, random_state=42)
X_1.shape, X_2.shape
((23785, 59), (5947, 59))
from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0)
model
TSNE(random_state=0)
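The model runs with scikit-learn defaults; perplexity (default 30) sets the effective neighborhood size and can reshape the embedding noticeably, so it is worth varying (a sketch):

model = TSNE(n_components=2, perplexity=50, random_state=0)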
W_2 = model.fit_transform(X_2)
i_2 = y_2.astype(int)  # integer buckets of log(shares), used to color the points
W_2.shape, X_2.shape, y_2.shape, i_2.shape
((5947, 2), (5947, 59), (5947,), (5947,))
mini, maxi = min(i_2), max(i_2)+1
import matplotlib.pyplot as plt
f, ax = plt.subplots()

for i in range(mini, maxi):
    ind = numpy.array(numpy.where(i_2 == i)).T
    print(i, ind.shape)
    if i in (6, 7, 8):
        continue  # skip the three most crowded buckets so the rarer ones stay visible
    r = 1.0 * i / maxi
    ax.plot(W_2[ind, 0], W_2[ind, 1], "o", color=(r, 1 - r, 0.0), label=str(i))
ax.legend();
0 (1, 1)
1 (1, 1)
2 (0, 1)
3 (3, 1)
4 (16, 1)
5 (38, 1)
6 (1821, 1)
7 (2646, 1)
8 (978, 1)
9 (321, 1)
10 (96, 1)
11 (24, 1)
12 (2, 1)
../_images/online_news_popylarity_54_1.png