# -*- coding: utf-8 -*-
"""
Helpers to process data from logs.
:githublink:`%|py|6`
"""
import re
from datetime import datetime
import hashlib
import numpy
import pandas
import ujson
def _duration(seq):
    """
    Sums the time spent between matching ``'enter'`` and ``'leave'`` events.

    :param seq: iterable of pairs ``(time, event)``, *event* is
        ``'enter'`` or ``'leave'``, any other value is ignored
    :return: cumulated difference ``leave - enter`` over all pairs,
        ``None`` if no pair was completed, a duration of one day
        if a ``'leave'`` is met without any pending ``'enter'``
    """
    # Sentinel returned for an inconsistent sequence instead of raising.
    wrong_logging = datetime(2018, 1, 2) - datetime(2018, 1, 1)
    total = None
    entered = None
    for stamp, event in seq:
        if event == 'enter':
            # A second 'enter' in a row replaces the pending one.
            entered = stamp
        elif event == 'leave':
            if entered is None:
                return wrong_logging
            spent = stamp - entered
            total = spent if total is None else total + spent
            entered = None
    return total
def _enumerate_processed_row(rows, data, cache, last_key, set_expected_answers=None):
    """
    Converts one logged record (a dictionary) into one or several
    flat rows (dictionaries).

    Only records with a ``'session'`` and a ``'msg'`` equal to ``'qcm'``
    or ``'answer'`` produce rows. The first row has ``qtime`` equal to
    ``'begin'`` (``'qcm'``) or ``'end'`` (``'answer'``), it is followed
    by one row with ``qtime='event'`` per item found in ``data['events']``.
    NOTE(review): the nesting of this function was rebuilt from a source
    whose indentation was lost, the event rows are assumed to belong
    to each of the two branches.

    :param rows: previous rows (not used, kept for compatibility)
    :param data: record as a dictionary, it must contain ``'time'``
    :param cache: dictionary ``{key: [(time, 'enter' | 'leave'), ...]}``
        where key is ``(person_id, alias, game, qn)``, updated in place
    :param last_key: list holding at most one key, the last displayed
        question, updated in place
    :param set_expected_answers: set of expected answers,
        adds a field ``<game>-<qn>-good`` if one is found,
        None is the same as an empty set
    :return: iterator on clean rows
    :raises ValueError: if the ip address is None or an event
        cannot be parsed
    """
    def comma_semi(st):
        # 'a:1,b' -> {'a': '1', 'b': True}
        if st is None:
            return {}
        res = {}
        for val in st.split(','):
            spl = val.split(':')
            if len(spl) == 1:
                res[spl[0]] = True
            elif len(spl) == 2:
                res[spl[0]] = spl[1]
            else:
                raise ValueError(  # pragma: no cover
                    "Unable to parse value '{0}'".format(st))
        return res

    def hash4alias(st):
        # Keeps the first 20 characters of the sha256 hexadecimal digest.
        return hashlib.sha256(st.encode("utf-8")).hexdigest()[:20]

    def event_rows(base):
        # One row per event, fields accumulate from one event to the next.
        events = data.get('events', None)
        if events is None:
            return
        if not isinstance(events, list):
            events = [events]
        row = base.copy()
        row['qtime'] = 'event'
        for one_event in events:
            row.update(comma_semi(one_event))
            # A snapshot is yielded, otherwise every stored row would be
            # the same dictionary modified by the following events.
            yield row.copy()

    if set_expected_answers is None:
        # The default value must behave like "no expected answer".
        set_expected_answers = ()

    session = data.get('session', None)
    ipadd = data.get('client', ['NN.NN.NN.NN'])[0]
    if ipadd is None:
        raise ValueError(  # pragma: no cover
            "Unable to extract an ip address from {0}".format(data))
    if session is None:
        return

    # Fields of an answer copied as is, the others get prefixed
    # by '<game>-<qn>-'.
    keys = {'qn', 'game', 'next', 'events'}
    alias = session['alias']
    person_id = hash4alias(alias + ipadd)
    res = dict(person_id=person_id, alias=alias, time=data['time'])
    event = data.get('msg', None)

    if event == 'qcm':
        res['qtime'] = 'begin'
        key = person_id, alias, data['game'], data['qn']
        # The previous question is left before the new one is entered.
        # When the same question is displayed twice in a row, the other
        # order produces enter, enter, leave and the first visit is lost.
        if last_key:
            cache[last_key[0]].append((data['time'], 'leave'))
            last_key.clear()
        cache.setdefault(key, []).append((data['time'], 'enter'))
        last_key.append(key)
        yield res
        yield from event_rows(res)

    elif event == "answer":
        res["qtime"] = 'end'
        q = data.get('data', None)
        good = {}
        if q is not None:
            qn = q['qn']
            game = q['game']
            key_short = "{0}-{1}".format(game, qn)
            for k, v in q.items():
                if k in keys:
                    res[k] = v
                    continue
                key = "{0}-{1}-{2}".format(game, qn, k)
                res[key] = v
                if key in set_expected_answers:
                    good[key_short] = 1
                elif key_short not in good:
                    good[key_short] = 0
            key = person_id, alias, game, qn
            visits = cache.setdefault(key, [])
            visits.append((data['time'], 'leave'))
            # Every visit is expected to log one 'enter' and one 'leave'.
            res["{0}-{1}-{2}".format(game, qn, 'nbvisit')] = len(visits) * 0.5
            res["{0}-{1}-{2}".format(game, qn, 'duration')] = _duration(visits)
        for k, v in good.items():
            res[k + '-good'] = v
        last_key.clear()
        yield res
        yield from event_rows(res)
def enumerate_qcmlog(files, expected_answers=None):
    """
    Processes many files of logs produced by application
    :class:`QCMApp <mathenjeu.apps.qcm.qcm_app.QCMApp>`.

    Only the lines containing ``[DATA]`` are processed. Each of them
    is split into a timestamp and a JSON payload, the parsed payload
    receives the timestamp in field ``'time'`` and is converted
    into rows by ``_enumerate_processed_row``.

    :param files: list of filenames
    :param expected_answers: expected answers, an iterable of
        iterables of keys, None for no expected answer
    :return: iterator on observations as dictionary
    :raises ValueError: if the payload of a line cannot be parsed

    Example of data it processes::

        2018-12-12 17:56:42,833,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:2"]}
        2018-12-12 17:56:44,270,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:2"]}
        2018-12-12 17:56:44,349,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:2"]}
        2018-12-12 17:56:44,458,INFO,[DATA],{"msg":"qcm","session":{"alias":"xavierd"},"game":"simple_french_qcm","qn":"3"}
        2018-12-12 17:56:49,427,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:50,817,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:50,864,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:53,302,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:53,333,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:54,208,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:54,239,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
    """
    def load_payload(sdata):
        # Parses the payload as JSON, then falls back on two repairs
        # for records logged as a python representation.
        try:
            return ujson.loads(sdata)  # pylint: disable=E1101
        except ValueError as exc:
            first_error = exc

        if '"' in sdata or "'" not in sdata:
            # No repair applies: failing here avoids silently reusing
            # the record parsed from the previous line.
            raise ValueError(
                "Unable to process line\n{}".format(sdata)) from first_error

        # Second attempt: single quotes instead of double quotes.
        sdata2 = sdata.replace("'", '"')
        try:
            return ujson.loads(sdata2)  # pylint: disable=E1101
        except ValueError as exc:
            second_error = exc

        if '"msg": "finish"' not in sdata2:
            raise ValueError(
                "Unable to process line\n{}\n{}".format(
                    sdata, sdata2)) from second_error

        # Third attempt: 'finish' records hold a tuple and
        # a QueryParams object. Fix the code somewhere else.
        sdata3 = sdata2.replace(
            '"client": ("', '"client": ["')
        sdata3 = sdata3.replace(
            '), "data": QueryParams', '], "data": QueryParams')
        sdata3 = re.sub(
            'QueryParams\\(\\"game=([a-z_]+)\\"\\)', '{"game":"\\1"}', sdata3)
        try:
            return ujson.loads(sdata3)  # pylint: disable=E1101
        except ValueError as exc:
            raise ValueError(
                "Unable to process line\n{}\n{}\n{}".format(
                    sdata, sdata2, sdata3)) from exc

    set_expected_answers = set()
    if expected_answers is not None:
        for answers in expected_answers:
            set_expected_answers.update(answers)

    separator = ",INFO,[DATA],"
    rows = []
    cache = {}
    last_key = []
    for name in files:
        with open(name, "r", encoding="utf-8") as f:
            # The file is read lazily, one line at a time.
            for line in f:
                if "[DATA]" not in line:
                    continue
                # The payload may contain the separator itself,
                # only the first occurrence splits the line.
                ti, _, sdata = line.strip("\n\r").partition(separator)
                data = load_payload(sdata)
                data['time'] = datetime.strptime(ti, '%Y-%m-%d %H:%M:%S,%f')
                yield from _enumerate_processed_row(
                    rows, data, cache, last_key, set_expected_answers)
                rows.append(data)
                if len(rows) > 2000:
                    # Keeps the history bounded even within one big file.
                    del rows[:-1000]
def _aggnotnan_serie(values):
    """
    Aggregates a sequence of values after removing missing ones.

    Missing values (NaN, None, anything :func:`pandas.isnull` detects)
    are skipped, ``'ok'`` and ``'on'`` are replaced by 1,
    ``'skip'`` by 1000.

    :param values: iterable of scalar values
    :return: ``numpy.nan`` if nothing is left, the values joined
        with a comma if the first one is a string, the value itself
        if only one is left, otherwise the sum of the values
        or 0 if they cannot be added
    """
    kept = []
    for v in values:
        if isinstance(v, float) and numpy.isnan(v):
            continue
        if pandas.isnull(v):
            continue
        if v in ('ok', 'on'):
            v = 1
        elif v == 'skip':
            v = 1000
        kept.append(v)

    if not kept:
        return numpy.nan
    if isinstance(kept[0], str):
        return ",".join(str(_) for _ in kept)
    if len(kept) == 1:
        return kept[0]
    try:
        # The sum starts from the first value and not from integer 0,
        # otherwise durations (timedelta) always fail and end up as 0.
        return sum(kept[1:], kept[0])
    except TypeError:
        return 0
def _aggnotnan(values):
    """
    Aggregates values with ``_aggnotnan_serie``.

    :param values: a :class:`pandas.Series` or an object exposing
        ``columns`` and column access such as a :class:`pandas.DataFrame`
    :return: the aggregated value for a series, otherwise a dataframe
        indexed by the column names with one aggregated value per column
    """
    if isinstance(values, pandas.Series):
        return _aggnotnan_serie(values)
    aggregated = [_aggnotnan_serie(list(values[col]))
                  for col in values.columns]
    return pandas.DataFrame(aggregated, index=values.columns)
def enumerate_qcmlogdf(files, expected_answers=None):
    """
    Processes many files of logs produced by application
    :class:`QCMApp <mathenjeu.apps.qcm.qcm_app.QCMApp>` in dataframe.

    The rows produced by ``enumerate_qcmlog`` are stacked per
    ``person_id``. A person is released as soon as more than 500 rows
    were read since its last row, the remaining ones are released
    once all files are read.

    :param files: list of filenames
    :param expected_answers: expected answers
    :return: iterator on dataframes, one per released person,
        built from the rows with ``qtime == 'end'``, restricted to the
        columns whose name contains ``'-'``, grouped by ``person_id``
        and aggregated with ``_aggnotnan``

    Example of data it processes::

        2018-12-12 17:56:42,833,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:2"]}
        2018-12-12 17:56:44,270,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:2"]}
        2018-12-12 17:56:44,349,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:2"]}
        2018-12-12 17:56:44,458,INFO,[DATA],{"msg":"qcm","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"game":"sfq","qn":"3"}
        2018-12-12 17:56:49,427,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:50,817,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:50,864,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:53,302,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:53,333,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:54,208,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:54,239,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
    """
    def prepare_df(rows):
        # Keeps the answers only, then one aggregated line per person.
        df = pandas.DataFrame(rows)
        answers = df[df['qtime'] == 'end']
        # Columns '<game>-<qn>-...' are the only ones holding a '-'.
        question_cols = sorted(c for c in answers.columns if "-" in c)
        df_question = answers[['person_id'] + question_cols]
        return df_question.groupby("person_id").agg(_aggnotnan)

    stack = {}  # person_id -> rows of this person
    index = {}  # person_id -> position of its last row
    for i, row in enumerate(enumerate_qcmlog(files, expected_answers)):
        person_id = row.get('person_id', None)
        if person_id is None:
            continue
        index[person_id] = i
        stack.setdefault(person_id, []).append(row)

        # Persons not seen for more than 500 rows are considered done.
        # The list is built first as both dictionaries are modified.
        done = [k for k, ind in index.items() if i - ind > 500]
        for k in done:
            yield prepare_df(stack[k])
            del stack[k]
            del index[k]

    for rows in stack.values():
        yield prepare_df(rows)