Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2@file 

3@brief Compute the performance for the hackathon 2018. 

4""" 

5import os 

6import time 

7import pandas 

8import numpy 

9from PIL import Image 

10from lightmlrestapi.mlapp.mlstorage import MLStorage 

11try: 

12 from ..hackathon.image_helper import enumerate_image_class 

13except (ImportError, ValueError): 

14 from ensae_projects.hackathon.image_helper import enumerate_image_class 

15 

16 

class MLStoragePerf2018:
    """
    Computes the performances of the models submitted to a hackathon.

    The class walks through every model name exposed by the storage,
    calls @see me compute_perf on the ones which are not in the cache yet
    and keeps all results in a CSV file.
    Subclasses must overload @see me compute_perf.
    """

    def __init__(self, storage, examples, cache_file="cache_file.csv"):
        """
        @param      storage     storage location (an existing folder)
        @param      examples    labelled examples used to evaluate the models,
                                its interpretation is left to the subclass
        @param      cache_file  CSV file which stores performances
                                already computed
        """
        self._storage = self._load_ml_storage(storage)
        self._examples = examples
        self._cache_file = cache_file

    def _load_ml_storage(self, root):
        """
        Creates an instance of a
        `MLStorage <http://www.xavierdupre.fr/app/lightmlrestapi/helpsphinx/lightmlrestapi/mlapp/mlstorage.html#lightmlrestapi.mlapp.mlstorage.MLStorage>`_
        based on a folder.

        @param      root        folder
        @return                 storage instance
        @raises     FileNotFoundError   if *root* does not exist
        """
        if not os.path.exists(root):
            raise FileNotFoundError("Unable to find '{0}'".format(root))
        return MLStorage(root)

    def _load_cached_performance(self, cache_file=None):
        """
        Retrieves performances already computed.

        @param      cache_file  file, *None* to use the one given
                                to the constructor
        @return                 dataframe or *None* if the file does not exist
        """
        if cache_file is None:
            cache_file = self._cache_file
        if not os.path.exists(cache_file):
            return None
        return pandas.read_csv(cache_file, sep=",", encoding="utf-8")

    def _save_performance(self, df, cache_file=None):
        """
        Saves cached performance.

        @param      df          dataframe
        @param      cache_file  destination, *None* to use the one given
                                to the constructor
        """
        if cache_file is None:
            cache_file = self._cache_file
        df.to_csv(cache_file, sep=',', encoding='utf-8', index=False)

    def compute_performance(self, use_cache=True, fLOG=None):
        """
        Computes the performance for the not cached one if
        *use_cache* is True.

        @param      use_cache   use cache
        @param      fLOG        logging function
        @return                 dataframe, one row per model, sorted by name,
                                empty if there is neither a model nor a cache
        """
        cache = self._load_cached_performance() if use_cache else None
        already = set() if cache is None else set(cache["name"])

        rows = []
        for i, name in enumerate(sorted(self._storage.enumerate_names())):
            if i % 30 == 0:
                # coarse progress indicator
                print('.')
            if name in already:
                continue
            t0 = time.perf_counter()
            if fLOG:
                fLOG(
                    "[MLStoragePerf2018] compute perf for {0}: '{1}'".format(i, name))
            res = self.compute_perf(name)  # pylint: disable=E1111
            if fLOG:
                fLOG(
                    "[MLStoragePerf2018] Done for {0}: {1}".format(name, res))
                # Only log the exception when a logging function is given.
                if 'exc' in res:
                    fLOG("[MLStoragePerf2018] exception for {0}: {1}".format(
                        name, res['exc']))
            res["name"] = name
            # os.stat only accepts the path as a positional argument:
            # the folder and the name must be joined first.
            # NOTE(review): assumes a model is stored in a subfolder of the
            # storage root named after the model - confirm against MLStorage.
            try:
                res["stime"] = os.stat(
                    os.path.join(self._storage._folder, name)).st_mtime
            except OSError:
                # a missing timestamp should not discard a computed result
                res["stime"] = float("nan")
            res["time"] = time.perf_counter() - t0
            rows.append(res)
            already.add(name)

        frames = []
        if rows:
            fresh = pandas.DataFrame(rows)
            frames.append(fresh[sorted(fresh.columns)])
        if cache is not None:
            frames.append(cache)

        if not frames:
            # Nothing computed, nothing cached: there is no column "name"
            # to sort on and an empty CSV could not be read back.
            return pandas.DataFrame()

        df = frames[0] if len(frames) == 1 else pandas.concat(frames)
        df = df.sort_values("name").copy()
        self._save_performance(df)
        return df

    def compute_perf(self, name):
        """
        Computes the performances for every example and one
        particular model. Must be overloaded.

        @param      name        model name
        @return                 dictionary of metrics
        @raises     NotImplementedError     always in this base class
        """
        raise NotImplementedError()

128 

129 

class MLStoragePerf2018Image(MLStoragePerf2018):
    """
    Overloads *compute_perf* for images.
    Example of use:

    ::

        from ensae_projects.hackathon.perf2018 import MLStoragePerf2018Image
        mstorage = "storage_brgm"
        mexample = "hackathon_test/sample_labelled_test"
        mpref = MLStoragePerf2018Image(mstorage, mexample)
        mres = mpref.compute_performance(fLOG=print, use_cache=True)
        mres = mres.sort_values("precision", ascending=False)
        print(mres)
        mbody = "<html><body><h1>Hackathon EY-ENSAE 2018 - BRGM</h1>\\n"
        mcontent = "{0}{1}</body></html>".format(mbody, mres.to_html())
        with open("brgm.html", "w", encoding="utf-8") as f:
            f.write(mcontent)
    """

    def __init__(self, storage, examples, cache_file="cache_file.csv"):
        """
        @param      storage     storage location
        @param      examples    folder of labelled images, it is given to
                                *enumerate_image_class*
        @param      cache_file  CSV file which stores performances
                                already computed
        """
        MLStoragePerf2018.__init__(self, storage, examples, cache_file)

    def _label_mapping(self, subs):
        """
        Computes the label based on a subfolder name.

        @param      subs        subfolder name
        @return                 1 if the name ends with ``'1'``, 0 otherwise
        """
        return 1 if subs.endswith('1') else 0

    @staticmethod
    def _decode_prediction(pred):
        """
        Converts the output of a model into a label and a score.

        @param      pred        float, list or :epkg:`numpy` array
        @return                 tuple *(predicted label, score)*
        @raises     ValueError  if the prediction is *None* or empty
        @raises     TypeError   if the type of the prediction is not supported

        A single value is a probability thresholded at 0.5,
        several values are scores per class and the highest one wins.
        """
        if pred is None:
            raise ValueError("No prediction")
        if isinstance(pred, list):
            pred = numpy.array(pred)
        if isinstance(pred, numpy.ndarray):
            flat = pred.ravel()
            if len(flat) == 0:
                raise ValueError("No prediction")
            if len(flat) == 1:
                return (1 if flat[0] > 0.5 else 0), flat[0]
            best = numpy.argmax(flat)
            return best, flat[best]
        # numpy.float32 is not a subclass of float
        if isinstance(pred, (float, numpy.floating)):
            return (1 if pred > 0.5 else 0), pred
        raise TypeError(
            "Prediction with the wrong type {0}".format(type(pred)))

    def compute_perf(self, name):
        """
        Computes the performances for every image and one
        particular model.

        @param      name        model name
        @return                 dictionary with *precision* (ratio of
                                correctly labelled images), one ratio
                                ``p_<sub>`` per subfolder, *version* if
                                available and *exc*, the last exception
                                met, if any

        A failure on one image is stored for this image only,
        the following images are still evaluated.
        """
        from keras import backend as K
        K.clear_session()  # pylint: disable=E1101
        folder = self._examples

        try:
            model = self._storage.load_model(name)
            vers = self._storage.call_version(name)
            exc = None
        except Exception as e:  # pylint: disable=W0703
            model = None
            exc = e
            vers = None

        if model is None:
            # No prediction is possible without a model.
            return dict(exc=exc, precision=0)

        rows = []
        for img, sub in enumerate_image_class(folder):
            obs = dict(image=img, sub=sub, label=self._label_mapping(sub))
            try:
                # the context manager closes the file once it is converted
                with Image.open(img) as handle:
                    X = numpy.array(handle)
                pred = self._storage.call_predict(
                    name, X, loaded_model=model)
                plabel, score = self._decode_prediction(pred)
            except Exception as e:  # pylint: disable=W0703
                # the model or the image is faulty, not the whole evaluation
                exc = e
                obs['exc'] = e
            else:
                obs.update(dict(predicted_label=plabel, score=score))
            rows.append(obs)

        final = pandas.DataFrame(rows)
        if 'score' not in final.columns:
            # no image or no valid prediction at all
            res = dict(exc=exc, precision=0)
        else:
            final["score"] = final["score"].fillna(0)
            # -1 never matches a label: a failed image counts as wrong
            final["predicted_label"] = final["predicted_label"].fillna(-1)
            final["correct"] = final["predicted_label"] == final["label"]
            final["correcti"] = final["correct"].astype(int)

            res = {}
            if exc:
                res["exc"] = str(exc)
            if len(final) > 0:
                res["precision"] = final["correcti"].sum() / final.shape[0]
                # ratio of correct answers per subfolder
                ratios = final.groupby("sub")["correcti"].mean()
                for sub, ratio in ratios.items():
                    res["p_%s" % sub] = ratio
            else:
                res["precision"] = 0

        if vers is not None:
            res["version"] = vers
        return res

260 

261 

class MLStoragePerf2018TimeSeries(MLStoragePerf2018):
    """
    Overloads *compute_perf* for timeseries.

    Example of use:

    ::

        from ensae_projects.hackathon.perf2018 import MLStoragePerf2018TimeSeries
        mstorage = "storage_microdon"
        mexample = "hackathon_test/sample_labelled_test"
        mpref = MLStoragePerf2018TimeSeries(mstorage, mexample)
        mres = mpref.compute_performance(fLOG=print, use_cache=True)
        mres = mres.sort_values("cor", ascending=False)
        print(mres)
        mbody = "<html><body><h1>Hackathon EY-ENSAE 2018 - Microdon</h1>\\n"
        mcontent = "{0}{1}</body></html>".format(mbody, mres.to_html())
        with open("microdon.html", "w", encoding="utf-8") as f:
            f.write(mcontent)
    """

    def __init__(self, storage, examples, cache_file="cache_file.csv"):
        """
        @param      storage     storage location
        @param      examples    CSV file with the expected values, it must
                                contain columns *year*, *week*,
                                *campaigns_campaign_id*, *collecteur_id*,
                                *montant_total*, *nb_dons_total*,
                                *nb_transac_total*
        @param      cache_file  CSV file which stores performances
                                already computed
        """
        MLStoragePerf2018.__init__(self, storage, examples, cache_file)

        df = pandas.read_csv(examples)

        sub = df[["year", "week", "campaigns_campaign_id", "collecteur_id",
                  "montant_total", "nb_dons_total", "nb_transac_total"]].copy()
        dsub = sub.fillna(0)
        # one row per (year, week, campaign)
        gr = dsub.groupby(
            ["year", "week", "campaigns_campaign_id"], as_index=False).sum()
        # the value to predict: ratio of donations over transactions
        gr["PARTICIPATION"] = gr["nb_dons_total"] / gr["nb_transac_total"]
        self._expected = gr

    def _label_mapping(self, subs):
        """
        Computes the label based on a subfolder name.

        @param      subs        subfolder name
        @return                 1 if the name ends with ``'1'``, 0 otherwise
        """
        return 1 if subs.endswith('1') else 0

    def compute_perf(self, name):
        """
        Computes the performances for every couple (week, campaign)
        and one particular model.

        @param      name        model name
        @return                 dictionary with *score* (root mean squared
                                error), *score10*, *score100*, *score1000*
                                (same error restricted to rows with at least
                                10, 100, 1000 transactions, *nan* if there is
                                none), *cor*, *pmin*, *pmax*, *best*, *worst*,
                                *version* if available and *exc*, the last
                                exception met, if any

        Rows with an expected ratio above 1 or without any transaction
        are skipped. A prediction which fails or does not hold a single
        value is replaced by 0.
        """
        try:
            model = self._storage.load_model(name)
            vers = self._storage.call_version(name)
            exc = None
        except Exception as e:  # pylint: disable=W0703
            model = None
            exc = e
            vers = None

        cols = ["year", "week", "campaigns_campaign_id",
                "PARTICIPATION", "nb_transac_total"]
        X = self._expected[cols]

        total = 0
        n1 = 0
        # squared errors restricted to rows with enough transactions
        thresholds = (10, 100, 1000)
        totals = dict.fromkeys(thresholds, 0)
        counts = dict.fromkeys(thresholds, 0)

        preds = []
        exps = []
        diffs = {}

        for i in range(0, X.shape[0]):
            week = X.iloc[i, 1]
            camp = X.iloc[i, 2]
            exp = X.iloc[i, 3]
            nb_transac_total = X.iloc[i, 4]
            if exp > 1 or not nb_transac_total > 0:
                continue

            try:
                pred = self._storage.call_predict(
                    name, (week, camp), loaded_model=model)
            except Exception as e:  # pylint: disable=W0703
                exc = e
                pred = 0

            if isinstance(pred, (list, numpy.ndarray)):
                flat = numpy.asarray(pred).ravel()
                if len(flat) == 1:
                    pred = flat[0]
                else:
                    exc = Exception(
                        "Returned a list of value when expecting one")
                    # same penalty as a failing prediction, the sequence
                    # cannot be compared to a single expected value
                    pred = 0

            err2 = (pred - exp) ** 2
            n1 += 1
            exps.append(exp)
            preds.append(pred)
            diffs[week, camp] = abs(exp - pred)
            total += err2
            for th in thresholds:
                if nb_transac_total >= th:
                    totals[th] += err2
                    counts[th] += 1

        res = {}
        if vers is not None:
            res["version"] = vers

        if n1 > 0:
            res["score"] = (total / n1) ** 0.5
            for th in thresholds:
                # no row may reach the threshold
                res["score%d" % th] = ((totals[th] / counts[th]) ** 0.5
                                       if counts[th] > 0 else numpy.nan)
            try:
                res["cor"] = numpy.corrcoef(numpy.array(preds),
                                            numpy.array(exps))[0, 1]
            except (AttributeError, KeyError, TypeError, ValueError):
                res["cor"] = numpy.nan
            try:
                res["pmin"] = numpy.min(preds)
                res["pmax"] = numpy.max(preds)
            except (KeyError, TypeError, ValueError):
                res["pmin"] = numpy.nan
                res["pmax"] = numpy.nan
        else:
            # nothing was predicted
            res["cor"] = numpy.nan
            res["pmin"] = numpy.nan
            res["pmax"] = numpy.nan

        resort = [(v, k) for k, v in diffs.items()]
        if resort:
            try:
                resort.sort()
            except ValueError:
                exc = Exception(
                    "Unable to sort differences {0}".format(resort[0]))
            else:
                last = resort[-1]
                res["worst"] = "{0}:{1}".format(last[1], last[0])
                best = resort[0]
                res["best"] = "{0}:{1}".format(best[1], best[0])

        # stored last so that every exception met above is reported
        if exc is not None:
            res["exc"] = exc
        return res

416 

417 

if __name__ == "__main__":
    # Evaluates every model of a local storage on a folder of labelled
    # images and dumps the ranking as RST and HTML.
    mstorage = r'/home/jbr/hack35/'
    mexample = r'./sample_labelled_test'
    mpref = MLStoragePerf2018Image(mstorage, mexample)
    mres = mpref.compute_performance(fLOG=print, use_cache=True)
    mres = mres.sort_values("precision", ascending=False)
    print(mres)
    if 'exc' in mres.columns:
        print(list(mres['exc']))
    mbody = "<html><body><h1>Hackathon EY-ENSAE 2018 - BRGM</h1>\n"
    mcontent = "{0}{1}</body></html>".format(mbody, mres.to_html())
    from pyquickhelper.pandashelper.tblformat import df2rst
    # The reports are the only output of a long computation:
    # make sure the destination folder exists before writing.
    os.makedirs("hackathon2018", exist_ok=True)
    with open("hackathon2018/brgm.rst", "w", encoding="utf-8") as f:
        f.write(df2rst(mres))
    with open("hackathon2018/brgm.html", "w", encoding="utf-8") as f:
        f.write(mcontent)