Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Helpers to process data from logs. 

5""" 

6import re 

7from datetime import datetime 

8import hashlib 

9import numpy 

10import pandas 

11import ujson 

12 

13 

def _duration(seq):
    """
    Adds up the time elapsed between each ``'enter'`` mark and the
    ``'leave'`` mark which follows it.

    @param      seq     iterable of pairs ``(time, mark)``, marks other than
                        ``'enter'`` and ``'leave'`` are ignored
    @return             sum of all ``leave - enter`` differences,
                        *None* if no interval was ever closed,
                        a difference of exactly one day as soon as a
                        ``'leave'`` is met without any pending ``'enter'``
    """
    total = None
    entered = None
    for stamp, mark in seq:
        if mark == 'enter':
            entered = stamp
            continue
        if mark != 'leave':
            continue
        if entered is None:
            # Inconsistent logging: the function answers with an arbitrary
            # one-day duration instead of raising an exception.
            return datetime(2018, 1, 2) - datetime(2018, 1, 1)
        elapsed = stamp - entered
        total = elapsed if total is None else total + elapsed
        # The interval is closed, a new 'enter' is needed before the
        # next 'leave'.
        entered = None
    return total

30 

31 

def _enumerate_processed_row(rows, data, cache, last_key, set_expected_answers=None):
    """
    Converts time, data as dictionary into other data
    as dictionary.

    @param      rows                    previous rows (not read by this function)
    @param      data                    data as dictionaries, one parsed log
                                        record with key ``'time'`` already set
    @param      cache                   cache events, dictionary
                                        ``(person_id, alias, game, qn)`` ->
                                        list of ``(time, 'enter' or 'leave')``,
                                        updated in place
    @param      last_key                last seen key, a list holding at most
                                        one key (the one of the last ``'qcm'``
                                        message), updated in place
    @param      set_expected_answers    set of expected answers,
                                        adds a field if one is found,
                                        *None* means no expected answer
    @return                             iterator on clean rows

    Records without a ``'session'`` or whose ``'msg'`` is neither
    ``'qcm'`` nor ``'answer'`` produce no row. Every yielded row is an
    independent dictionary.
    """
    if set_expected_answers is None:
        # The default value must behave as "nothing is expected":
        # ``key in None`` would raise a TypeError on the first answer.
        set_expected_answers = frozenset()

    def comma_semi(st):
        # Parses "k1:v1,k2" into {'k1': 'v1', 'k2': True}.
        if st is None:
            return {}
        res = {}
        for val in st.split(','):
            spl = val.split(':')
            if len(spl) == 1:
                res[spl[0]] = True
            elif len(spl) == 2:
                res[spl[0]] = spl[1]
            else:
                raise ValueError(  # pragma: no cover
                    "Unable to parse value '{0}'".format(st))
        return res

    def hash4alias(st):
        # First 20 hexadecimal characters of the SHA-256 of the string.
        return hashlib.sha256(st.encode("utf-8")).hexdigest()[:20]

    def event_rows(base):
        # One row per entry of data['events'], fields accumulate from one
        # event to the next. A copy is yielded so that rows collected by
        # the caller do not all point to the same dictionary.
        events = data.get('events', None)
        if events is None:
            return
        if not isinstance(events, list):
            events = [events]
        row = base.copy()
        row['qtime'] = 'event'
        for event in events:
            row.update(comma_semi(event))
            yield row.copy()

    session = data.get('session', None)
    ipadd = data.get('client', ['NN.NN.NN.NN'])[0]
    if ipadd is None:
        raise ValueError(  # pragma: no cover
            "Unable to extract an ip address from {0}".format(data))
    if session is None:
        return

    # Fields of an answer which are copied as is, every other field
    # is prefixed by "<game>-<qn>-".
    keys = {'qn', 'game', 'next', 'events'}
    alias = session['alias']
    person_id = hash4alias(alias + ipadd)
    res = dict(person_id=person_id, alias=alias, time=data['time'])
    msg = data.get('msg', None)

    if msg == 'qcm':
        res['qtime'] = 'begin'
        key = person_id, alias, data['game'], data['qn']
        cache.setdefault(key, []).append((data['time'], 'enter'))
        if last_key:
            # Displaying a new question means leaving the previous one.
            cache[last_key[0]].append((data['time'], 'leave'))
            last_key.clear()
        last_key.append(key)
        yield res
        yield from event_rows(res)

    elif msg == "answer":
        res["qtime"] = 'end'
        q = data.get('data', None)
        good = {}
        if q is not None:
            qn = q['qn']
            game = q['game']
            key_short = "{0}-{1}".format(game, qn)
            q2 = {}
            for k, v in q.items():
                if k in keys:
                    q2[k] = v
                    continue
                key = "{0}-{1}-{2}".format(game, qn, k)
                q2[key] = v
                if key in set_expected_answers:
                    good[key_short] = 1
                elif key_short not in good:
                    good[key_short] = 0

            res.update(q2)
            key = person_id, alias, game, qn
            cache.setdefault(key, []).append((data['time'], 'leave'))
            duration = _duration(cache[key])
            # Every visit logs two marks (enter, leave).
            res["{0}-{1}-{2}".format(
                game, qn, 'nbvisit')] = len(cache[key]) * 0.5
            res["{0}-{1}-{2}".format(game, qn, 'duration')] = duration
        for k, v in good.items():
            res[k + '-good'] = v
        last_key.clear()
        yield res
        yield from event_rows(res)

148 

149 

def _load_qcm_payload(sdata):
    """
    Decodes the JSON payload found after the ``[DATA]`` marker of a log line.

    Three attempts are made: the raw text, the text with single quotes
    replaced by double quotes (only if it contains no double quote),
    and finally, for ``finish`` messages, a version where the logged
    tuple and ``QueryParams(...)`` are rewritten into valid JSON.

    :param sdata: payload as a string
    :return: decoded payload
    :raises ValueError: if none of the attempts produces valid JSON
    """
    try:
        return ujson.loads(sdata)  # pylint: disable=E1101
    except ValueError as exc:
        first_error = exc

    if '"' in sdata or "'" not in sdata:
        # No repair applies. Raising avoids silently processing the
        # payload of the previous line a second time.
        raise ValueError(
            "Unable to process line\n{}".format(sdata)) from first_error

    sdata2 = sdata.replace("'", '"')
    try:
        return ujson.loads(sdata2)  # pylint: disable=E1101
    except ValueError as exc:
        second_error = exc

    if '"msg": "finish"' not in sdata2:
        raise ValueError(
            "Unable to process line\n{}\n{}".format(
                sdata, sdata2)) from second_error

    # Fix the code somewhere else.
    sdata3 = sdata2.replace(
        '"client": ("', '"client": ["')
    sdata3 = sdata3.replace(
        '), "data": QueryParams', '], "data": QueryParams')
    sdata3 = re.sub(
        'QueryParams\\(\\"game=([a-z_]+)\\"\\)', '{"game":"\\1"}', sdata3)
    try:
        return ujson.loads(  # pylint: disable=E1101
            sdata3)
    except ValueError as e:
        raise ValueError(
            "Unable to process line\n{}\n{}\n{}".format(
                sdata, sdata2, sdata3)) from e


def enumerate_qcmlog(files, expected_answers=None):
    """
    Processes many files of logs produced by application
    @see cl QCMApp.

    :param files: list of filenames
    :param expected_answers: expected answers, an iterable of iterables,
        all their items are merged into a single set
    :return: iterator on observations as dictionary
    :raises ValueError: if the payload of a ``[DATA]`` line cannot be
        decoded or if its timestamp does not follow
        ``%Y-%m-%d %H:%M:%S,%f``

    Only lines containing ``[DATA]`` are processed, files are read
    line by line.

    Example of data it processes::

        2018-12-12 17:56:42,833,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:2"]}
        2018-12-12 17:56:44,270,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:2"]}
        2018-12-12 17:56:44,349,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:2"]}
        2018-12-12 17:56:44,458,INFO,[DATA],{"msg":"qcm","session":{"alias":"xavierd"},"game":"simple_french_qcm","qn":"3"}
        2018-12-12 17:56:49,427,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:50,817,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:50,864,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:53,302,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:53,333,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:54,208,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
        2018-12-12 17:56:54,239,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"events":["game:simple_french_qcm,qn:3"]}
    """
    set_expected_answers = set()
    if expected_answers is not None:
        for group in expected_answers:
            set_expected_answers.update(group)

    separator = ",INFO,[DATA],"
    rows = []
    cache = {}
    last_key = []
    for name in files:
        # Keeps the history of processed records bounded between files.
        if len(rows) > 1000:
            rows = rows[-1000:]
        with open(name, "r", encoding="utf-8") as f:
            for line in f:
                if "[DATA]" not in line:
                    continue
                # Timestamp before the first separator, payload after it.
                ti, _, sdata = line.strip("\n\r").partition(separator)
                data = _load_qcm_payload(sdata)
                data['time'] = datetime.strptime(ti, '%Y-%m-%d %H:%M:%S,%f')
                yield from _enumerate_processed_row(
                    rows, data, cache, last_key, set_expected_answers)
                rows.append(data)

228 

229 

def _aggnotnan_serie(values):
    """
    Aggregates a sequence of values after removing the missing ones.

    @param      values      iterable of values
    @return                 ``numpy.nan`` if nothing is left,
                            the values joined with ``','`` if the first
                            remaining one is a string,
                            the value itself if only one is left,
                            otherwise their sum (0 if they cannot be added)

    ``'ok'`` and ``'on'`` are counted as 1, ``'skip'`` as 1000.
    """
    kept = []
    for item in values:
        missing = (isinstance(item, float) and numpy.isnan(item)) or \
            pandas.isnull(item)
        if missing:
            continue
        if item in ('ok', 'on'):
            item = 1
        elif item == 'skip':
            item = 1000
        kept.append(item)

    if not kept:
        return numpy.nan
    if isinstance(kept[0], str):
        return ",".join(map(str, kept))
    if len(kept) == 1:
        return kept[0]
    try:
        return sum(kept)
    except TypeError:
        return 0

256 

257 

def _aggnotnan(values):
    """
    Applies @see fn _aggnotnan_serie to a series or to every column
    of a dataframe.

    @param      values      series or dataframe
    @return                 the aggregated value for a series,
                            for a dataframe, a dataframe indexed by the
                            column names holding one aggregated value
                            per column
    """
    if isinstance(values, pandas.core.series.Series):
        return _aggnotnan_serie(values)
    aggregated = [_aggnotnan_serie(list(values[name]))
                  for name in values.columns]
    return pandas.DataFrame(aggregated, values.columns)

268 

269 

def enumerate_qcmlogdf(files, expected_answers=None):
    """
    Processes many files of logs produced by application
    @see cl QCMApp and gathers the observations of every
    person into dataframes.

    :param files: list of filenames
    :param expected_answers: expected answers
    :return: iterator on dataframes, each of them is built from the rows
        of one ``person_id`` whose ``qtime`` is ``'end'``, grouped by
        ``person_id``, only columns whose name contains ``'-'`` are kept

    The rows of a person are released as soon as more than 500 rows
    were read since the last row of that person, the remaining ones
    are released once all files are processed.

    Example of data it processes::

        2018-12-12 17:56:42,833,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:2"]}
        2018-12-12 17:56:44,270,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:2"]}
        2018-12-12 17:56:44,349,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:2"]}
        2018-12-12 17:56:44,458,INFO,[DATA],{"msg":"qcm","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"game":"sfq","qn":"3"}
        2018-12-12 17:56:49,427,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:50,817,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:50,864,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:53,302,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:53,333,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:54,208,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
        2018-12-12 17:56:54,239,INFO,[DATA],{"msg":"event","session":{"alias":"xavierd"},"client":["N.N.N.N",N],"events":["game:sfq,qn:3"]}
    """
    def prepare_df(rows):
        # Keeps the final rows of a person and aggregates the columns
        # attached to a question (their name contains '-').
        frame = pandas.DataFrame(rows)
        ended = frame[frame.qtime == 'end']
        question_cols = sorted(c for c in ended.columns if "-" in c)
        selected = ended[['person_id'] + question_cols]
        return selected.groupby("person_id").agg(_aggnotnan)

    stack = {}
    index = {}
    for i, row in enumerate(enumerate_qcmlog(files, expected_answers)):
        person_id = row.get('person_id', None)
        if person_id is None:
            continue

        # Position of the last row seen for that person.
        index[person_id] = i
        stack.setdefault(person_id, []).append(row)

        # Persons not seen for more than 500 rows are considered done.
        stale = [k for k, ind in index.items() if i - ind > 500]
        for k in stale:
            done = stack.pop(k)
            del index[k]
            yield prepare_df(done)

    for rows in stack.values():
        yield prepare_df(rows)