Exploration des logs

Links: notebook, html, PDF, python, slides, GitHub

Traitements de logs générés par un QCM.

from jyquickhelper import add_notebook_menu
add_notebook_menu()
%matplotlib inline

Observations brutes

import os
# Build the path of every entry of the "logs" folder whose name contains
# '.log' (substring test, not a strict extension check).
names = [os.path.join("logs", _) for _ in os.listdir("logs") if '.log' in _]
# Only the first matching file is kept for the rest of the notebook.
names = names[:1]
names
['logs\\QCMApp.log']
with open(names[0], 'r', encoding="utf-8") as f:
    lines = f.readlines()
lines[5:10]
['2018-12-12 17:56:29,989,INFO,[DATA],{"msg":"qcm","session":{"alias":"xavierd"},"client":["167.220.197.38",6274],"game":"simple_french_qcm","qn":"0"}\n',
 '2018-12-12 17:56:33,130,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["167.220.197.38",6274],"events":["focus:true,game:simple_french_qcm,qn:0"]}\n',
 '2018-12-12 17:56:34,145,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["167.220.197.38",6274],"events":["focus:true,game:simple_french_qcm,qn:0"]}\n',
 '2018-12-12 17:56:34,224,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["167.220.196.38",52686],"events":["focus:true,game:simple_french_qcm,qn:0"]}\n',
 '2018-12-12 17:56:34,255,INFO,[DATA],{"msg":"answer","session":{"alias":"xavierd"},"client":["167.220.197.38",6274],"data":{"a0":"on","b":"ok","game":"simple_french_qcm","qn":"0","next":"1","events":"-a0,on"}}\n']
from mathenjeu.datalog import enumerate_qcmlog
obs = list(enumerate_qcmlog(names))
obs[:5]
[{'person_id': 'c241c15008614ea67480',
  'alias': 'xavierd',
  'time': datetime.datetime(2018, 12, 12, 17, 56, 29, 989000),
  'qtime': 'begin'},
 {'person_id': 'c241c15008614ea67480',
  'alias': 'xavierd',
  'time': datetime.datetime(2018, 12, 12, 17, 56, 34, 255000),
  'qtime': 'end',
  'simple_french_qcm-0-a0': 'on',
  'simple_french_qcm-0-b': 'ok',
  'game': 'simple_french_qcm',
  'qn': '0',
  'next': '1',
  'events': '-a0,on',
  'simple_french_qcm-0-nbvisit': 1.0,
  'simple_french_qcm-0-duration': datetime.timedelta(seconds=4, microseconds=266000)},
 {'person_id': '32606f02fa0df6aac111',
  'alias': 'xavierd',
  'time': datetime.datetime(2018, 12, 12, 17, 56, 34, 302000),
  'qtime': 'begin'},
 {'person_id': '32606f02fa0df6aac111',
  'alias': 'xavierd',
  'time': datetime.datetime(2018, 12, 12, 17, 56, 37, 645000),
  'qtime': 'end',
  'simple_french_qcm-1-a2': 'on',
  'simple_french_qcm-1-b': 'ok',
  'game': 'simple_french_qcm',
  'qn': '1',
  'next': '2',
  'events': '-a2,on',
  'simple_french_qcm-1-nbvisit': 1.0,
  'simple_french_qcm-1-duration': datetime.timedelta(seconds=3, microseconds=343000)},
 {'person_id': '32606f02fa0df6aac111',
  'alias': 'xavierd',
  'time': datetime.datetime(2018, 12, 12, 17, 56, 37, 677000),
  'qtime': 'begin'}]
import pandas
df = pandas.DataFrame(obs)
df.shape
(81, 58)
df.head().T
0 1 2 3 4
person_id c241c15008614ea67480 c241c15008614ea67480 32606f02fa0df6aac111 32606f02fa0df6aac111 32606f02fa0df6aac111
alias xavierd xavierd xavierd xavierd xavierd
time 2018-12-12 17:56:29.989000 2018-12-12 17:56:34.255000 2018-12-12 17:56:34.302000 2018-12-12 17:56:37.645000 2018-12-12 17:56:37.677000
qtime begin end begin end begin
simple_french_qcm-0-a0 NaN on NaN NaN NaN
simple_french_qcm-0-b NaN ok NaN NaN NaN
game NaN simple_french_qcm NaN simple_french_qcm NaN
qn NaN 0 NaN 1 NaN
next NaN 1 NaN 2 NaN
events NaN -a0,on NaN -a2,on NaN
simple_french_qcm-0-nbvisit NaN 1 NaN NaN NaN
simple_french_qcm-0-duration NaT 0 days 00:00:04.266000 NaT NaT NaT
simple_french_qcm-1-a2 NaN NaN NaN on NaN
simple_french_qcm-1-b NaN NaN NaN ok NaN
simple_french_qcm-1-nbvisit NaN NaN NaN 1 NaN
simple_french_qcm-1-duration NaT NaT NaT 0 days 00:00:03.343000 NaT
simple_french_qcm-2-a2 NaN NaN NaN NaN NaN
simple_french_qcm-2-b NaN NaN NaN NaN NaN
simple_french_qcm-2-nbvisit NaN NaN NaN NaN NaN
simple_french_qcm-2-duration NaT NaT NaT NaT NaT
simple_french_qcm-3-a2 NaN NaN NaN NaN NaN
simple_french_qcm-3-a3 NaN NaN NaN NaN NaN
simple_french_qcm-3-b NaN NaN NaN NaN NaN
simple_french_qcm-3-nbvisit NaN NaN NaN NaN NaN
simple_french_qcm-3-duration NaT NaT NaT NaT NaT
simple_french_qcm-4-a2 NaN NaN NaN NaN NaN
simple_french_qcm-4-b NaN NaN NaN NaN NaN
simple_french_qcm-4-nbvisit NaN NaN NaN NaN NaN
simple_french_qcm-4-duration NaT NaT NaT NaT NaT
simple_french_qcm-5-a0 NaN NaN NaN NaN NaN
simple_french_qcm-5-a1 NaN NaN NaN NaN NaN
simple_french_qcm-5-a2 NaN NaN NaN NaN NaN
simple_french_qcm-5-b NaN NaN NaN NaN NaN
simple_french_qcm-5-nbvisit NaN NaN NaN NaN NaN
simple_french_qcm-5-duration NaT NaT NaT NaT NaT
simple_french_qcm-6-a3 NaN NaN NaN NaN NaN
simple_french_qcm-6-b NaN NaN NaN NaN NaN
simple_french_qcm-6-nbvisit NaN NaN NaN NaN NaN
simple_french_qcm-6-duration NaT NaT NaT NaT NaT
simple_french_qcm-7-a2 NaN NaN NaN NaN NaN
simple_french_qcm-7-b NaN NaN NaN NaN NaN
simple_french_qcm-7-nbvisit NaN NaN NaN NaN NaN
simple_french_qcm-7-duration NaT NaT NaT NaT NaT
simple_french_qcm-8-ANS NaN NaN NaN NaN NaN
simple_french_qcm-8-b NaN NaN NaN NaN NaN
simple_french_qcm-8-nbvisit NaN NaN NaN NaN NaN
simple_french_qcm-8-duration NaT NaT NaT NaT NaT
simple_french_qcm-3-a0 NaN NaN NaN NaN NaN
simple_french_qcm-6-a2 NaN NaN NaN NaN NaN
simple_french_qcm-1-a1 NaN NaN NaN NaN NaN
simple_french_qcm-4-a0 NaN NaN NaN NaN NaN
simple_french_qcm-6-a5 NaN NaN NaN NaN NaN
simple_french_qcm-7-a0 NaN NaN NaN NaN NaN
simple_french_qcm-0-a1 NaN NaN NaN NaN NaN
-a1 NaN NaN NaN NaN NaN
on-a2 NaN NaN NaN NaN NaN
on NaN NaN NaN NaN NaN
simple_french_qcm-4-a3 NaN NaN NaN NaN NaN
set(df.alias)
{'Clemence', 'thierry-d', 'xavierd', 'xavierg'}

Préparation des données

df2 = df[df.qtime == 'end'].copy()
df2.head().T
1 3 5 7 9
person_id c241c15008614ea67480 32606f02fa0df6aac111 32606f02fa0df6aac111 32606f02fa0df6aac111 32606f02fa0df6aac111
alias xavierd xavierd xavierd xavierd xavierd
time 2018-12-12 17:56:34.255000 2018-12-12 17:56:37.645000 2018-12-12 17:56:44.427000 2018-12-12 17:56:54.317000 2018-12-12 17:57:04.052000
qtime end end end end end
simple_french_qcm-0-a0 on NaN NaN NaN NaN
simple_french_qcm-0-b ok NaN NaN NaN NaN
game simple_french_qcm simple_french_qcm simple_french_qcm simple_french_qcm simple_french_qcm
qn 0 1 2 3 4
next 1 2 3 4 5
events -a0,on -a2,on -a2,on -a2,on-a2,on-a3,on-a2,on -a2,on
simple_french_qcm-0-nbvisit 1 NaN NaN NaN NaN
simple_french_qcm-0-duration 0 days 00:00:04.266000 NaT NaT NaT NaT
simple_french_qcm-1-a2 NaN on NaN NaN NaN
simple_french_qcm-1-b NaN ok NaN NaN NaN
simple_french_qcm-1-nbvisit NaN 1 NaN NaN NaN
simple_french_qcm-1-duration NaT 0 days 00:00:03.343000 NaT NaT NaT
simple_french_qcm-2-a2 NaN NaN on NaN NaN
simple_french_qcm-2-b NaN NaN ok NaN NaN
simple_french_qcm-2-nbvisit NaN NaN 1 NaN NaN
simple_french_qcm-2-duration NaT NaT 0 days 00:00:06.750000 NaT NaT
simple_french_qcm-3-a2 NaN NaN NaN on NaN
simple_french_qcm-3-a3 NaN NaN NaN on NaN
simple_french_qcm-3-b NaN NaN NaN ok NaN
simple_french_qcm-3-nbvisit NaN NaN NaN 0.5 NaN
simple_french_qcm-3-duration NaT NaT NaT 1 days 00:00:00 NaT
simple_french_qcm-4-a2 NaN NaN NaN NaN on
simple_french_qcm-4-b NaN NaN NaN NaN ok
simple_french_qcm-4-nbvisit NaN NaN NaN NaN 1
simple_french_qcm-4-duration NaT NaT NaT NaT 0 days 00:00:09.688000
simple_french_qcm-5-a0 NaN NaN NaN NaN NaN
simple_french_qcm-5-a1 NaN NaN NaN NaN NaN
simple_french_qcm-5-a2 NaN NaN NaN NaN NaN
simple_french_qcm-5-b NaN NaN NaN NaN NaN
simple_french_qcm-5-nbvisit NaN NaN NaN NaN NaN
simple_french_qcm-5-duration NaT NaT NaT NaT NaT
simple_french_qcm-6-a3 NaN NaN NaN NaN NaN
simple_french_qcm-6-b NaN NaN NaN NaN NaN
simple_french_qcm-6-nbvisit NaN NaN NaN NaN NaN
simple_french_qcm-6-duration NaT NaT NaT NaT NaT
simple_french_qcm-7-a2 NaN NaN NaN NaN NaN
simple_french_qcm-7-b NaN NaN NaN NaN NaN
simple_french_qcm-7-nbvisit NaN NaN NaN NaN NaN
simple_french_qcm-7-duration NaT NaT NaT NaT NaT
simple_french_qcm-8-ANS NaN NaN NaN NaN NaN
simple_french_qcm-8-b NaN NaN NaN NaN NaN
simple_french_qcm-8-nbvisit NaN NaN NaN NaN NaN
simple_french_qcm-8-duration NaT NaT NaT NaT NaT
simple_french_qcm-3-a0 NaN NaN NaN NaN NaN
simple_french_qcm-6-a2 NaN NaN NaN NaN NaN
simple_french_qcm-1-a1 NaN NaN NaN NaN NaN
simple_french_qcm-4-a0 NaN NaN NaN NaN NaN
simple_french_qcm-6-a5 NaN NaN NaN NaN NaN
simple_french_qcm-7-a0 NaN NaN NaN NaN NaN
simple_french_qcm-0-a1 NaN NaN NaN NaN NaN
-a1 NaN NaN NaN NaN NaN
on-a2 NaN NaN NaN NaN NaN
on NaN NaN NaN NaN NaN
simple_french_qcm-4-a3 NaN NaN NaN NaN NaN
# Split the 'end' rows into one frame per family of columns.  Every frame
# keeps 'alias' so that it can be grouped by user further down.
# `cols` is overwritten at each step; after this cell it holds the duration
# columns.

# Columns whose name contains '-a' (lower case, so '-ANS' is not matched).
cols = ['alias'] + [c for c in df2.columns if "simple_french_qcm" in c and '-a' in c]
df_question = df2[cols]
# Columns whose name contains '-b'.
cols = ['alias'] + [c for c in df2.columns if "simple_french_qcm" in c and '-b' in c]
df_bouton = df2[cols]
# Columns whose name contains '-nb' (the '...-nbvisit' columns).
cols = ['alias'] + [c for c in df2.columns if "simple_french_qcm" in c and '-nb' in c]
df_visit = df2[cols]
# Columns whose name contains '-ANS'.
cols = ['alias'] + [c for c in df2.columns if "simple_french_qcm" in c and '-ANS' in c]
df_ans = df2[cols]
# Columns whose name contains '-dur' (the '...-duration' columns).
cols = ['alias'] + [c for c in df2.columns if "simple_french_qcm" in c and '-dur' in c]
df_dur = df2[cols]
df_dur.head().T
1 3 5 7 9
alias xavierd xavierd xavierd xavierd xavierd
simple_french_qcm-0-duration 0 days 00:00:04.266000 NaT NaT NaT NaT
simple_french_qcm-1-duration NaT 0 days 00:00:03.343000 NaT NaT NaT
simple_french_qcm-2-duration NaT NaT 0 days 00:00:06.750000 NaT NaT
simple_french_qcm-3-duration NaT NaT NaT 1 days 00:00:00 NaT
simple_french_qcm-4-duration NaT NaT NaT NaT 0 days 00:00:09.688000
simple_french_qcm-5-duration NaT NaT NaT NaT NaT
simple_french_qcm-6-duration NaT NaT NaT NaT NaT
simple_french_qcm-7-duration NaT NaT NaT NaT NaT
simple_french_qcm-8-duration NaT NaT NaT NaT NaT
import numpy

def aggnotnan_serie(values):
    """Aggregate the non-missing entries of *values* into a single value.

    Missing entries (``None``, ``NaN``, ``NaT``) are ignored.  The strings
    ``'ok'`` and ``'on'`` are counted as ``1`` and ``'skip'`` as ``1000``
    before aggregating.

    Parameters
    ----------
    values : iterable
        Scalars to aggregate (numbers, strings, timedeltas, ...).

    Returns
    -------
    object
        * ``numpy.nan`` when nothing is left once missing entries are dropped;
        * the kept entries joined with ``","`` when the first one is a string;
        * otherwise the sum of the kept entries (the entry itself when only
          one is left), or ``0`` when they cannot be added together.
    """
    res = []
    for v in values:
        # pandas.isnull covers None, float NaN and NaT in a single test.
        if pandas.isnull(v):
            continue
        if v in ('ok', 'on'):
            v = 1
        elif v == 'skip':
            v = 1000
        res.append(v)

    if not res:
        return numpy.nan
    if isinstance(res[0], str):
        return ",".join(str(_) for _ in res)

    # Accumulate starting from the first entry rather than from the integer 0
    # used by a plain sum(): ``0 + timedelta`` raises TypeError, which used to
    # turn every multi-entry duration into 0.
    total = res[0]
    try:
        for v in res[1:]:
            total = total + v
    except TypeError:
        # Entries of incompatible types (e.g. a number and free text):
        # keep the historical best-effort fallback.
        return 0
    return total


def aggnotnan(values):
    """Aggregate a Series, or each column of a DataFrame, with aggnotnan_serie.

    Parameters
    ----------
    values : pandas.Series or pandas.DataFrame
        Data to aggregate.

    Returns
    -------
    object or pandas.DataFrame
        For a Series, the value returned by ``aggnotnan_serie``.  For a
        DataFrame, a one-row DataFrame with the same columns, holding the
        aggregate of each column.
    """
    if isinstance(values, pandas.Series):
        return aggnotnan_serie(values)

    res = [aggnotnan_serie(list(values[col])) for col in values.columns]
    # This branch used to reference an undefined name ``columns`` (NameError),
    # passed positionally where DataFrame expects ``index``.  The aggregates
    # are laid out as a single row under the input's column names.
    return pandas.DataFrame([res], columns=values.columns)

gr_ans = df_ans.groupby("alias").agg(aggnotnan)
gr_ans
simple_french_qcm-8-ANS
alias
Clemence Prout
thierry-d Astérix et Cléopâtre
xavierd thalès
xavierg
gr_dur = df_dur.groupby("alias").agg(aggnotnan)
gr_dur
simple_french_qcm-0-duration simple_french_qcm-1-duration simple_french_qcm-2-duration simple_french_qcm-3-duration simple_french_qcm-4-duration simple_french_qcm-5-duration simple_french_qcm-6-duration simple_french_qcm-7-duration simple_french_qcm-8-duration
alias
Clemence 0 days 00:00:16.530000 0 days 00:00:14.010000 0 days 00:00:28.765000 0 days 00:00:19.492000 0 days 00:03:19.593000 0 days 00:00:11.740000 0 days 00:00:21.868000 0 days 00:00:20.923000 0 days 00:00:14.483000
thierry-d 0 days 00:00:06.904000 0 0 days 00:00:31.978000 0 days 00:00:19.246000 0 days 00:00:21.230000 0 days 00:00:10.153000 0 days 00:00:20.314000 0 days 00:00:17.141000 0 days 00:03:02.506000
xavierd 0 days 00:00:04.266000 0 days 00:00:03.343000 0 days 00:00:06.750000 1 days 00:00:00 0 days 00:00:09.688000 1 days 00:00:00 1 days 00:00:00 0 days 00:00:06.390000 0 days 00:00:04.390000
xavierg 0 days 00:00:02.920000 NaN NaT 0 days 00:00:09.323000 0 0 days 00:00:02.247000 0 days 00:00:00.750000 0 days 00:00:01.188000 0 days 00:00:01.422000
gr_dur.T.plot();
c:\python372_x64\lib\site-packages\pandas\plotting\_matplotlib\core.py:1235: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(xticklabels)
../_images/example_logs_16_1.png
gr_question = df_question.groupby("alias").agg(aggnotnan)
gr_question.T
alias Clemence thierry-d xavierd xavierg
simple_french_qcm-0-a0 1.0 1.0 1.0 NaN
simple_french_qcm-1-a2 NaN 2.0 1.0 NaN
simple_french_qcm-2-a2 1.0 1.0 1.0 NaN
simple_french_qcm-3-a2 NaN NaN 1.0 1.0
simple_french_qcm-3-a3 NaN NaN 1.0 1.0
simple_french_qcm-4-a2 NaN 1.0 1.0 1.0
simple_french_qcm-5-a0 1.0 NaN 1.0 NaN
simple_french_qcm-5-a1 NaN 1.0 1.0 NaN
simple_french_qcm-5-a2 NaN NaN 1.0 NaN
simple_french_qcm-6-a3 NaN NaN 1.0 NaN
simple_french_qcm-7-a2 NaN 1.0 1.0 NaN
simple_french_qcm-3-a0 1.0 1.0 NaN NaN
simple_french_qcm-6-a2 NaN 1.0 NaN NaN
simple_french_qcm-1-a1 1.0 NaN NaN NaN
simple_french_qcm-4-a0 1.0 NaN NaN NaN
simple_french_qcm-6-a5 1.0 NaN NaN NaN
simple_french_qcm-7-a0 1.0 NaN NaN NaN
simple_french_qcm-0-a1 NaN NaN NaN 1.0
simple_french_qcm-4-a3 NaN NaN NaN 1.0
gr_bouton = df_bouton.groupby("alias").agg(aggnotnan)
gr_bouton.T
alias Clemence thierry-d xavierd xavierg
simple_french_qcm-0-b 1.0 1.0 1.0 1.0
simple_french_qcm-1-b 1.0 1001.0 1.0 NaN
simple_french_qcm-2-b 1.0 1.0 1.0 NaN
simple_french_qcm-3-b 1.0 1.0 1.0 1000.0
simple_french_qcm-4-b 1.0 1.0 1.0 2.0
simple_french_qcm-5-b 1.0 1.0 1.0 1000.0
simple_french_qcm-6-b 1.0 1.0 1.0 1.0
simple_french_qcm-7-b 1.0 1.0 1.0 1.0
simple_french_qcm-8-b 1.0 1.0 1.0 1.0
gr_visit = df_visit.groupby("alias").agg(aggnotnan)
gr_visit.T
alias Clemence thierry-d xavierd xavierg
simple_french_qcm-0-nbvisit 1.0 1.0 1.0 1.0
simple_french_qcm-1-nbvisit 1.0 3.0 1.0 NaN
simple_french_qcm-2-nbvisit 1.0 2.0 1.0 NaN
simple_french_qcm-3-nbvisit 1.0 1.0 0.5 3.0
simple_french_qcm-4-nbvisit 1.0 1.0 1.0 5.0
simple_french_qcm-5-nbvisit 1.0 1.0 0.5 2.0
simple_french_qcm-6-nbvisit 1.0 1.0 0.5 1.0
simple_french_qcm-7-nbvisit 1.0 1.0 1.0 1.0
simple_french_qcm-8-nbvisit 1.0 1.0 1.0 1.0

Histogrammes

# Replace missing answers with 0 so that every cell can be drawn as a bar.
nonan_question = gr_question.fillna(0)
import matplotlib.pyplot as plt

# One subplot per row of nonan_question, stacked vertically.  The frame comes
# from a groupby on "alias", so each row is one alias.
fig, ax = plt.subplots(nonan_question.shape[0], 1,
                       figsize=(8, nonan_question.shape[0]))
for i in range(0, nonan_question.shape[0]):
    # Label the subplot with the index value (alias) of the row it shows.
    ax[i].set_ylabel(nonan_question.index[i])
    # One bar per column, in column order.
    ax[i].bar(list(range(nonan_question.shape[1])),
              nonan_question.iloc[i,:])
../_images/example_logs_22_0.png

Clustering

nonan_question = gr_question.fillna(0)
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2)
km.fit(nonan_question)
KMeans(n_clusters=2)
pred = km.predict(nonan_question)
pred
array([1, 0, 0, 1])
solution = pandas.DataFrame(data=pred, columns=["cluster"], index=nonan_question.index)
solution
cluster
alias
Clemence 1
thierry-d 0
xavierd 0
xavierg 1

ACP

nonan_question.shape
(4, 19)
from sklearn.decomposition import PCA
acp = PCA(n_components=2, svd_solver='arpack')
acp.fit(nonan_question)
PCA(n_components=2, svd_solver='arpack')
coord = acp.transform(nonan_question)
data = pandas.DataFrame(data=coord, columns=['X1', 'X2'], index=nonan_question.index)
data["cluster"] = solution
data
X1 X2 cluster
alias
Clemence 1.745710 -1.484954 1
thierry-d -1.574330 -0.883546 0
xavierd -1.057572 0.461806 0
xavierg 0.886193 1.906695 1
import matplotlib.pyplot as plt
# Scatter plot of the two PCA coordinates, drawn on a single axis with one
# colour and one legend entry per cluster.
fig, ax = plt.subplots(1, 1)
colors = ['red', 'blue', 'orange', 'green']
# Only clusters 0 and 1 are drawn, matching KMeans(n_clusters=2) above; the
# last two colours are unused.
for i in range(0, 2):
    data[data.cluster==i].plot(x="X1", y="X2",
                               kind="scatter",
                               ax=ax, label="c%d" % i,
                               color=colors[i])
../_images/example_logs_34_0.png
data.columns
Index(['X1', 'X2', 'cluster', 'weight'], dtype='object')
data.loc[data.cluster == 0, 'X1'].values, data.loc[data.cluster == 0, 'X2'].values
(array([-1.57433033, -1.05757229]), array([-0.88354622,  0.46180558]))
data["weight"] = 10
# Plot the two PCA coordinates (X1, X2) with one series per cluster, then
# annotate every point with its index label (the alias).
fig, ax = plt.subplots(1, 1)
ax.plot(data.loc[data.cluster == 0, 'X1'].values,
        data.loc[data.cluster == 0, 'X2'].values, 'o', label='c0')
ax.plot(data.loc[data.cluster == 1, 'X1'].values,
        data.loc[data.cluster == 1, 'X2'].values, 'o', label='c1')
ind = list(data.index)
for i in range(0, data.shape[0]):
    # Columns 0 and 1 of `data` are X1 and X2.
    ax.text(data.iloc[i, 0], data.iloc[i, 1], ind[i])
ax.set_title('Clustering représenté en 2 dimensions');
../_images/example_logs_37_0.png

Prédiction

nonan_question.T
alias Clemence thierry-d xavierd xavierg
simple_french_qcm-0-a0 1.0 1.0 1.0 0.0
simple_french_qcm-1-a2 0.0 2.0 1.0 0.0
simple_french_qcm-2-a2 1.0 1.0 1.0 0.0
simple_french_qcm-3-a2 0.0 0.0 1.0 1.0
simple_french_qcm-3-a3 0.0 0.0 1.0 1.0
simple_french_qcm-4-a2 0.0 1.0 1.0 1.0
simple_french_qcm-5-a0 1.0 0.0 1.0 0.0
simple_french_qcm-5-a1 0.0 1.0 1.0 0.0
simple_french_qcm-5-a2 0.0 0.0 1.0 0.0
simple_french_qcm-6-a3 0.0 0.0 1.0 0.0
simple_french_qcm-7-a2 0.0 1.0 1.0 0.0
simple_french_qcm-3-a0 1.0 1.0 0.0 0.0
simple_french_qcm-6-a2 0.0 1.0 0.0 0.0
simple_french_qcm-1-a1 1.0 0.0 0.0 0.0
simple_french_qcm-4-a0 1.0 0.0 0.0 0.0
simple_french_qcm-6-a5 1.0 0.0 0.0 0.0
simple_french_qcm-7-a0 1.0 0.0 0.0 0.0
simple_french_qcm-0-a1 0.0 0.0 0.0 1.0
simple_french_qcm-4-a3 0.0 0.0 0.0 1.0
# Features: the first 15 columns.  Target: the column at position 16.
# The column at position 15 is used neither as a feature nor as the target.
xcols = nonan_question.columns[:15]
ycol = nonan_question.columns[16]
# Show the target's name and the distinct values it takes.
ycol, set(nonan_question[ycol])
('simple_french_qcm-7-a0', {0.0, 1.0})
from sklearn.model_selection import train_test_split
# Hold out half of the rows for testing (test_size=0.5).  No random_state is
# passed, so the split is not pinned by this cell.
X_train, X_test, y_train, y_test = train_test_split(nonan_question[xcols], nonan_question[ycol], test_size=0.5)
from sklearn.ensemble import RandomForestClassifier
# Random forest with the library's default hyper-parameters.
clr = RandomForestClassifier()
clr.fit(X_train, y_train)
RandomForestClassifier()
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, clr.predict(X_test))
array([[1, 1],
       [0, 0]], dtype=int64)