Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# -*- coding: utf-8 -*-
"""
@file
@brief Helpers to process data from logs.
"""
6import re
7from datetime import datetime
8import hashlib
9import numpy
10import pandas
11import ujson
14def _duration(seq):
15 dt = None
16 t1 = None
17 for t, e in seq:
18 if e == 'enter':
19 t1 = t
20 elif e == 'leave':
21 if t1 is None:
22 # raise RuntimeError("Wrong logging {0}".format(seq))
23 return datetime(2018, 1, 2) - datetime(2018, 1, 1)
24 if dt is None:
25 dt = t - t1
26 else:
27 dt += t - t1
28 t1 = None
29 return dt
32def _enumerate_processed_row(rows, data, cache, last_key, set_expected_answers=None):
33 """
34 Converts time, data as dictionary into other data
35 as dictionary.
37 @param rows previous rows
38 @param data data as dictionaries
39 @param cache cache events
40 @param last_key last seen key
41 @param set_expected_answers set of expected answers,
42 adds a field if one is found
43 @return iterator on clean rows
44 """
45 def comma_semi(st):
46 if st is None:
47 return {}
48 res = {}
49 for val in st.split(','):
50 spl = val.split(':')
51 if len(spl) == 1:
52 res[spl[0]] = True
53 elif len(spl) == 2:
54 res[spl[0]] = spl[1]
55 else:
56 raise ValueError( # pragma: no cover
57 "Unable to parse value '{0}'".format(st))
58 return res
60 def hash4alias(st):
61 by = st.encode("utf-8")
62 m = hashlib.sha256()
63 m.update(by)
64 res = m.hexdigest()
65 return res[:20] if len(res) > 20 else res
67 session = data.get('session', None)
68 ipadd = data.get('client', ['NN.NN.NN.NN'])[0]
69 if ipadd is None:
70 raise ValueError( # pragma: no cover
71 "Unable to extract an ip address from {0}".format(data))
72 keys = {'qn', 'game', 'next', 'events'}
73 if session is not None: # pylint: disable=R1702
74 alias = session['alias']
75 person_id = hash4alias(alias + ipadd)
77 res = dict(person_id=person_id, alias=alias, time=data['time'])
78 event = data.get('msg', None)
79 if event == 'qcm':
80 res['qtime'] = 'begin'
81 key = person_id, alias, data['game'], data['qn']
82 if key not in cache:
83 cache[key] = []
84 cache[key].append((data['time'], 'enter'))
85 if len(last_key) > 0:
86 cache[last_key[0]].append((data['time'], 'leave'))
87 last_key.clear()
88 last_key.append(key)
89 yield res
91 events = data.get('events', None)
92 res0 = res.copy()
93 res0['qtime'] = 'event'
94 if events is not None:
95 if not isinstance(events, list):
96 events = [events]
97 res = res0.copy()
98 for event in events:
99 ev = comma_semi(event)
100 res.update(ev)
101 yield res
103 elif event == "answer":
104 res["qtime"] = 'end'
105 q = data.get('data', None)
106 good = {}
107 if q is not None:
108 qn = q['qn']
109 game = q['game']
110 q2 = {}
111 for k, v in q.items():
112 if k in keys:
113 q2[k] = v
114 else:
115 key = "{0}-{1}-{2}".format(game, qn, k)
116 q2[key] = v
117 key_short = "{0}-{1}".format(game, qn)
118 if key in set_expected_answers:
119 good[key_short] = 1
120 elif key_short not in good:
121 good[key_short] = 0
123 res.update(q2)
124 key = person_id, alias, q['game'], q['qn']
125 if key not in cache:
126 cache[key] = []
127 cache[key].append((data['time'], 'leave'))
128 duration = _duration(cache[key])
129 res["{0}-{1}-{2}".format(
130 game, qn, 'nbvisit')] = len(cache[key]) * 0.5
131 res["{0}-{1}-{2}".format(game, qn, 'duration')] = duration
132 for k, v in good.items():
133 res[k + '-good'] = v
134 last_key.clear()
135 yield res
137 events = data.get('events', None)
138 res0 = res.copy()
139 res0['qtime'] = 'event'
140 if events is not None:
141 if not isinstance(events, list):
142 events = [events]
143 res = res0.copy()
144 for event in events:
145 ev = comma_semi(event)
146 res.update(ev)
147 yield res
def _parse_qcmlog_payload(sdata):
    """
    Decodes the payload which follows ``,INFO,[DATA],`` in a line of logs.

    Three attempts are made: the payload as it is, the payload after
    single quotes were replaced by double quotes (only if it contains
    single quotes and no double quote), and, for records ``finish``,
    the payload after the ``client`` tuple and the ``QueryParams(...)``
    expression were rewritten into JSON.

    @param      sdata       payload as a string
    @return                 decoded payload
    @raises     ValueError  if none of the attempts succeeds, the message
                            contains every string which was tried
    """
    try:
        return ujson.loads(sdata)  # pylint: disable=E1101
    except ValueError as exc:
        first_error = exc

    if '"' in sdata or "'" not in sdata:
        # No known repair applies. This case used to fall through and the
        # caller went on with the record decoded from the previous line.
        raise ValueError(
            "Unable to process line\n{}".format(sdata)) from first_error

    sdata2 = sdata.replace("'", '"')
    try:
        return ujson.loads(sdata2)  # pylint: disable=E1101
    except ValueError as exc:
        second_error = exc

    if '"msg": "finish"' not in sdata2:
        raise ValueError(
            "Unable to process line\n{}\n{}".format(
                sdata, sdata2)) from second_error

    # Fix the code somewhere else.
    sdata3 = sdata2.replace(
        '"client": ("', '"client": ["')
    sdata3 = sdata3.replace(
        '), "data": QueryParams', '], "data": QueryParams')
    sdata3 = re.sub(
        'QueryParams\\(\\"game=([a-z_]+)\\"\\)', '{"game":"\\1"}', sdata3)
    try:
        return ujson.loads(sdata3)  # pylint: disable=E1101
    except ValueError as e:
        raise ValueError(
            "Unable to process line\n{}\n{}\n{}".format(
                sdata, sdata2, sdata3)) from e


def enumerate_qcmlog(files, expected_answers=None):
    """
    Processes many files of logs produced by application
    @see cl QCMApp.

    :param files: list of filenames
    :param expected_answers: expected answers, an iterable of iterables,
        every element of every iterable is an expected answer
    :return: iterator on observations as dictionary
    :raises ValueError: if the payload of a line ``[DATA]`` cannot be decoded

    Only lines containing ``[DATA]`` are considered, the text before
    ``,INFO,[DATA],`` must be a time formatted as ``%Y-%m-%d %H:%M:%S,%f``,
    the text after is decoded and given to ``_enumerate_processed_row``
    with key ``'time'`` added.

    Example of data it processes::

        2018-12-12 17:56:42,833,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:2"]}
        2018-12-12 17:56:44,270,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:2"]}
        2018-12-12 17:56:44,349,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:2"]}
        2018-12-12 17:56:44,458,INFO,[DATA],{"msg":"qcm","session":{"alias":"xavierd"},"game":"simple_french_qcm","qn":"3"}
        2018-12-12 17:56:49,427,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:50,817,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:50,864,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:53,302,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:53,333,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:54,208,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:54,239,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
    """
    set_expected_answers = set()
    if expected_answers is not None:
        for answers in expected_answers:
            set_expected_answers.update(answers)

    separator = ",INFO,[DATA],"
    rows = []
    cache = {}
    last_key = []
    for name in files:
        # Only the most recent records are kept from one file to the next.
        if len(rows) > 1000:
            rows = rows[-1000:]
        with open(name, "r", encoding="utf-8") as f:
            for line in f:
                if "[DATA]" not in line:
                    continue
                # Everything after the first separator is the payload,
                # even if the separator shows up again inside it.
                ti, _, sdata = line.strip("\n\r").partition(separator)
                data = _parse_qcmlog_payload(sdata)
                data['time'] = datetime.strptime(ti, '%Y-%m-%d %H:%M:%S,%f')
                yield from _enumerate_processed_row(
                    rows, data, cache, last_key, set_expected_answers)
                rows.append(data)
230def _aggnotnan_serie(values):
231 res = []
232 for v in values:
233 if isinstance(v, float) and numpy.isnan(v):
234 continue
235 if pandas.isnull(v):
236 continue
237 if v in ('ok', 'on'):
238 v = 1
239 elif v == 'skip':
240 v = 1000
241 res.append(v)
242 if len(res) > 0:
243 if isinstance(res[0], str):
244 r = ",".join(str(_) for _ in res)
245 else:
246 if len(res) == 1:
247 r = res[0]
248 else:
249 try:
250 r = sum(res)
251 except TypeError:
252 r = 0
253 else:
254 r = numpy.nan
255 return r
def _aggnotnan(values):
    """
    Aggregates values with ``_aggnotnan_serie``.

    @param      values      a series or a dataframe
    @return                 the aggregated value for a series,
                            for a dataframe, a dataframe with one row
                            per column of *values*, indexed by the column
                            names, which holds the aggregated value
                            of each column
    """
    if isinstance(values, pandas.Series):
        return _aggnotnan_serie(values)
    per_column = [_aggnotnan_serie(list(values[name]))
                  for name in values.columns]
    return pandas.DataFrame(per_column, index=values.columns)
def enumerate_qcmlogdf(files, expected_answers=None):
    """
    Processes many files of logs produced by application
    @see cl QCMApp in dataframe.

    :param files: list of filenames
    :param expected_answers: expected answers
    :return: iterator on dataframes, each of them aggregates by
        ``person_id`` the rows ``qtime == 'end'`` of one person,
        only columns whose name contains ``'-'`` are kept

    The rows of a person are released as soon as more than 500 rows
    were read since the last row of this person, whatever is left
    is released once all files were read.

    Example of data it processes::

        2018-12-12 17:56:42,833,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:2"]}
        2018-12-12 17:56:44,270,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:2"]}
        2018-12-12 17:56:44,349,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:2"]}
        2018-12-12 17:56:44,458,INFO,[DATA],{"msg":"qcm","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"game":"sfq","qn":"3"}
        2018-12-12 17:56:49,427,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:50,817,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:50,864,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:53,302,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:53,333,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:54,208,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:54,239,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
    """
    def prepare_df(rows):
        frame = pandas.DataFrame(rows)
        answered = frame[frame.qtime == 'end']
        # Columns built as 'game-qn-field' are the only ones with a dash.
        question_cols = sorted(c for c in answered.columns if "-" in c)
        per_question = answered[['person_id'] + question_cols]
        return per_question.groupby("person_id").agg(_aggnotnan)

    # person_id -> [position of the last row seen, rows of this person]
    pending = {}
    for position, row in enumerate(enumerate_qcmlog(files, expected_answers)):

        person_id = row.get('person_id', None)
        if person_id is None:
            continue

        entry = pending.setdefault(person_id, [position, []])
        entry[0] = position
        entry[1].append(row)

        stale = [pid for pid, (last_seen, _) in pending.items()
                 if position - last_seen > 500]
        for pid in stale:
            yield prepare_df(pending[pid][1])
            del pending[pid]
    for _, rows in pending.values():
        yield prepare_df(rows)