Coverage for mlprodict/onnxrt/validate/validate_helper.py : 97%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1"""

2@file

3@brief Validates runtime for many :epkg:`scikit-learn` operators.

4The submodule relies on :epkg:`onnxconverter_common`,

5:epkg:`sklearn-onnx`.

6"""

7import math

8import copy

9from timeit import Timer

10import os

11import warnings

12from importlib import import_module

13import pickle

14from time import perf_counter

15import numpy

16from sklearn.base import BaseEstimator

17from sklearn.linear_model._base import LinearModel

18from sklearn.model_selection import train_test_split

19from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version

20from .validate_problems import _problems

class RuntimeBadResultsError(RuntimeError):
    """
    Raised when the results are too different from
    :epkg:`scikit-learn`.
    """

    def __init__(self, msg, obs):
        """
        :param msg: message to display
        :param obs: observations, stored on the exception
            as attribute *obs*
        """
        super().__init__(msg)
        self.obs = obs

38def _dictionary2str(di):

39 el = []

40 for k in sorted(di):

41 el.append('{}={}'.format(k, di[k]))

42 return '/'.join(el)

def modules_list():
    """
    Returns modules and versions currently used.

    .. runpython::
        :showcode:
        :rst:
        :warningout: DeprecationWarning

        from mlprodict.onnxrt.validate.validate_helper import modules_list
        from pyquickhelper.pandashelper import df2rst
        from pandas import DataFrame
        print(df2rst(DataFrame(modules_list())))

    :return: list of dictionaries sorted by module name,
        every dictionary contains key *name* and key *version*
        if the module defines ``__version__``, modules which
        cannot be imported are skipped
    """
    candidates = ['pandas', 'numpy', 'sklearn', 'mlprodict',
                  'skl2onnx', 'onnxmltools', 'onnx', 'onnxruntime',
                  'scipy']
    rows = []
    for name in sorted(candidates):
        try:
            module = import_module(name)
        except ImportError:  # pragma: no cover
            continue
        row = dict(name=name)
        if hasattr(module, '__version__'):
            row['version'] = module.__version__
        rows.append(row)
    return rows

77def _dispsimple(arr, fLOG):

78 if isinstance(arr, (tuple, list)):

79 for i, a in enumerate(arr):

80 fLOG("output %d" % i)

81 _dispsimple(a, fLOG)

82 elif hasattr(arr, 'shape'):

83 if len(arr.shape) == 1:

84 threshold = 8

85 else:

86 threshold = min(

87 50, min(50 // arr.shape[1], 8) * arr.shape[1])

88 fLOG(numpy.array2string(arr, max_line_width=120,

89 suppress_small=True,

90 threshold=threshold))

91 else: # pragma: no cover

92 s = str(arr)

93 if len(s) > 50:

94 s = s[:50] + "..."

95 fLOG(s)

98def _merge_options(all_conv_options, aoptions):

99 if aoptions is None:

100 return copy.deepcopy(all_conv_options)

101 if not isinstance(aoptions, dict):

102 return copy.deepcopy(aoptions) # pragma: no cover

103 merged = {}

104 for k, v in all_conv_options.items():

105 if k in aoptions:

106 merged[k] = _merge_options(v, aoptions[k])

107 else:

108 merged[k] = copy.deepcopy(v)

109 for k, v in aoptions.items():

110 if k in all_conv_options:

111 continue

112 merged[k] = copy.deepcopy(v)

113 return merged

114

115

def sklearn_operators(subfolder=None, extended=False,
                      experimental=True):
    """
    Builds the list of operators from :epkg:`scikit-learn`.
    The function goes through the list of submodules
    and gets the list of classes which inherit from
    :epkg:`scikit-learn:base:BaseEstimator`.

    :param subfolder: look into only one subfolder
    :param extended: extends the list to the list of operators
        this package implements a converter for
    :param experimental: includes experimental module from
        :epkg:`scikit-learn` (see `sklearn.experimental
        <https://github.com/scikit-learn/scikit-learn/
        tree/master/sklearn/experimental>`_)
    :return: the list of found operators, every operator is
        a dictionary with keys *name*, *cl*, *package* and
        *subfolder* (*sub* for the operators added by
        *extended*)
    """
    if experimental:
        from sklearn.experimental import (  # pylint: disable=W0611
            enable_hist_gradient_boosting,
            enable_iterative_imputer)

    # classes never returned, whatever the module they come from
    skipped_names = {'Pipeline', 'ColumnTransformer',
                     'FeatureUnion', 'BaseEstimator',
                     'BaseEnsemble', 'BaseDecisionTree',
                     'CustomScorerTransform'}
    # submodules where only the classes whose name
    # contains 'Calibrated' are kept
    restricted_subs = {'calibration', 'dummy', 'manifold'}

    candidates = []
    for subm in sorted(sklearn__all__ + ['mlprodict.onnx_conv']):
        if isinstance(subm, list):
            continue  # pragma: no cover
        if subfolder is not None and subm != subfolder:
            continue

        subs = [subm]
        if subm == 'feature_extraction':
            subs.append('feature_extraction.text')

        for sub in subs:
            if '.' in sub and sub != 'feature_extraction.text':
                # already a full module name
                module_name = sub
            else:
                module_name = "{0}.{1}".format("sklearn", sub)
            try:
                module = import_module(module_name)
            except ModuleNotFoundError:
                continue

            if hasattr(module, "register_converters"):
                objects = module.register_converters()
            else:
                exported = getattr(module, "__all__", None)
                if exported is None:
                    exported = list(module.__dict__)
                objects = [module.__dict__[key] for key in exported]

            for obj in objects:
                try:
                    is_estimator = issubclass(obj, BaseEstimator)
                except TypeError:
                    # not a class
                    continue
                if not is_estimator:
                    continue
                if obj.__name__ in skipped_names:
                    continue
                if (sub in restricted_subs and
                        'Calibrated' not in obj.__name__):
                    continue
                if sub in sklearn__all__:
                    pack = "sklearn"
                else:
                    pack = obj.__module__.split('.')[0]
                candidates.append(
                    dict(name=obj.__name__, subfolder=sub, cl=obj,
                         package=pack))

    if extended:
        from ...onnx_conv import register_converters
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ResourceWarning)
            models = register_converters(True)

        # names found before looking into the extended list
        known = {row['name'] for row in candidates}
        for model in models:
            try:
                path = model.__module__.split('.')
            except AttributeError as e:  # pragma: no cover
                raise AttributeError(
                    "Unexpected value, m={}".format(model)) from e
            if model.__name__ in known:
                continue
            candidates.append(
                dict(name=model.__name__, cl=model, package=path[0],
                     sub='.'.join(path[1:])))

    def _can_predict(cl):
        # tells if the class exposes a method usable on new data
        if hasattr(cl, 'fit_predict') and not hasattr(cl, 'predict'):
            return False
        if hasattr(cl, 'fit_transform') and not hasattr(cl, 'transform'):
            return False
        return (hasattr(cl, 'transform') or
                hasattr(cl, 'predict') or
                hasattr(cl, 'decision_function'))

    # let's remove models which cannot predict
    return [row for row in candidates if _can_predict(row['cl'])]

223

224

225def _measure_time(fct, repeat=1, number=1, first_run=True):

226 """

227 Measures the execution time for a function.

228

229 :param fct: function to measure

230 :param repeat: number of times to repeat

231 :param number: number of times between two measures

232 :param first_run: if True, runs the function once before measuring

233 :return: last result, average, values

234 """

235 res = None

236 values = []

237 if first_run:

238 fct()

239 for __ in range(repeat):

240 begin = perf_counter()

241 for _ in range(number):

242 res = fct()

243 end = perf_counter()

244 values.append(end - begin)

245 if repeat * number == 1:

246 return res, values[0], values

247 return res, sum(values) / (repeat * number), values # pragma: no cover

248

249

250def _shape_exc(obj):

251 if hasattr(obj, 'shape'):

252 return obj.shape

253 if isinstance(obj, (list, dict, tuple)):

254 return "[{%d}]" % len(obj)

255 return None

256

257

def dump_into_folder(dump_folder, obs_op=None, is_error=True,
                     **kwargs):
    """
    Dumps information when an error was detected
    using :epkg:`*py:pickle`.

    :param dump_folder: existing folder where the file is written
    :param obs_op: dictionary of information, it must contain
        keys *runtime*, *name*, *scenario*, *problem*,
        keys *optim*, *opset*, *n_features* are optional,
        all of them are used to build the filename
    :param is_error: is it an error or not? It changes
        the prefix of the filename (``dump-ERROR-`` or ``dump-i-``).
    :param kwargs: additional parameters, pickled with *obs_op*
    :return: name of the written file
    :raises ValueError: if *dump_folder* or *obs_op* is None

    The pickled object is the dictionary *kwargs* extended with key
    *obs_op*. Keys of *obs_op* starting with ``'lambda'`` are removed
    from the dumped copy, *obs_op* itself is not modified.
    """
    if dump_folder is None:
        raise ValueError("dump_folder cannot be None.")
    if obs_op is None:
        raise ValueError("obs_op cannot be None.")

    # Builds a short token from the optimisation options
    # after removing class decorations and punctuation.
    optim = str(obs_op.get('optim', ''))
    for prefix in ("<class 'sklearn.", "<class '"):
        optim = optim.replace(prefix, "")
    optim = optim.translate(str.maketrans('', '', " >={}:'/\\"))

    parts = (obs_op['runtime'], obs_op['name'], obs_op['scenario'],
             obs_op['problem'], optim,
             "op" + str(obs_op.get('opset', '-')),
             "nf" + str(obs_op.get('n_features', '-')))
    name = "dump-{}-{}.pkl".format(
        "ERROR" if is_error else "i",
        "-".join(map(str, parts)))
    name = os.path.join(dump_folder, name)

    # Works on a copy to leave the caller's dictionary untouched.
    # NOTE(review): 'lambda*' entries presumably hold functions
    # which cannot be pickled -- confirm with the callers.
    obs_op = obs_op.copy()
    for key in [k for k in obs_op if k.startswith('lambda')]:
        del obs_op[key]
    kwargs['obs_op'] = obs_op
    with open(name, "wb") as f:
        pickle.dump(kwargs, f)
    return name

301

302

def default_time_kwargs():
    """
    Returns default values *number* and *repeat* to measure
    the execution of a function.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning

        from mlprodict.onnxrt.validate.validate_helper import default_time_kwargs
        import pprint
        pprint.pprint(default_time_kwargs())

    keys define the number of rows,
    values defines *number* and *repeat*.
    A new dictionary is built for every call.
    """
    # (number of rows, number, repeat)
    settings = ((1, 30, 20),
                (10, 20, 20),
                (100, 8, 10),
                (1000, 5, 5),
                (10000, 3, 3))
    return {n_rows: dict(number=number, repeat=repeat)
            for n_rows, number, repeat in settings}

326

327

def measure_time(stmt, x, repeat=10, number=50, div_by_number=False, first_run=True):
    """
    Measures a statement and returns the results as a dictionary.

    :param stmt: function to measure, called as ``stmt(x)``
    :param x: matrix, the only argument given to *stmt*
    :param repeat: average over *repeat* experiment
    :param number: number of executions in one row
    :param div_by_number: divide by the number of executions
    :param first_run: if True, runs the function once before measuring
    :return: dictionary with keys *average*, *deviation*, *min_exec*,
        *max_exec*, *repeat*, *number*, *total*
    :raises ValueError: if *x* is None
    :raises RuntimeError: if the first call ``stmt(x)`` raises
        a *RuntimeError*, the message contains the type of *x*
        and its dtype if it has one

    See `Timer.repeat <https://docs.python.org/3/library/timeit.html?timeit.Timer.repeat>`_
    for a better understanding of parameter *repeat* and *number*.
    The function returns a duration corresponding to
    *number* times the execution of the main statement
    unless *div_by_number* is True. *total* is always the sum
    of the undivided durations.
    """
    if x is None:
        raise ValueError("x cannot be None")  # pragma: no cover

    # Checks the statement runs before measuring anything.
    try:
        stmt(x)
    except RuntimeError as e:  # pragma: no cover
        # x may not have a dtype (list, ...), the original error
        # must not be hidden by an AttributeError.
        raise RuntimeError("{}-{}".format(
            type(x), getattr(x, 'dtype', None))) from e

    def fct():
        stmt(x)

    if first_run:
        fct()
    tim = Timer(fct)
    res = numpy.array(tim.repeat(repeat=repeat, number=number))
    # total is computed before the optional division
    total = numpy.sum(res)
    if div_by_number:
        res /= number
    mean = numpy.mean(res)
    dev = numpy.mean(res ** 2)
    # max(0, .) protects the square root against rounding errors
    dev = max(0, (dev - mean**2)) ** 0.5
    mes = dict(average=mean, deviation=dev, min_exec=numpy.min(res),
               max_exec=numpy.max(res), repeat=repeat, number=number,
               total=total)
    return mes

370

371

372def _multiply_time_kwargs(time_kwargs, time_kwargs_fact, inst):

373 """

374 Multiplies values in *time_kwargs* following strategy

375 *time_kwargs_fact* for a given model *inst*.

376

377 :param time_kwargs: see below

378 :param time_kwargs_fact: see below

379 :param inst: :epkg:`scikit-learn` model

380 :return: new *time_kwargs*

381

382 Possible values for *time_kwargs_fact*:

383

384 - a integer: multiplies *number* by this number

385 - `'lin'`: multiplies value *number* for linear models depending

386 on the number of rows to process (:math:`\\propto 1/\\log_{10}(n)`)

387

388 .. runpython::

389 :showcode:

390 :warningout: DeprecationWarning

391

392 from pprint import pprint

393 from sklearn.linear_model import LinearRegression

394 from mlprodict.onnxrt.validate.validate_helper import (

395 default_time_kwargs, _multiply_time_kwargs)

396

397 lr = LinearRegression()

398 kw = default_time_kwargs()

399 pprint(kw)

400

401 kw2 = _multiply_time_kwargs(kw, 'lin', lr)

402 pprint(kw2)

403 """

404 if time_kwargs is None:

405 raise ValueError("time_kwargs cannot be None.") # pragma: no cover

406 if time_kwargs_fact in ('', None):

407 return time_kwargs

408 try:

409 vi = int(time_kwargs_fact)

410 time_kwargs_fact = vi

411 except (TypeError, ValueError):

412 pass

413 if isinstance(time_kwargs_fact, int):

414 time_kwargs_modified = copy.deepcopy(time_kwargs)

415 for k in time_kwargs_modified:

416 time_kwargs_modified[k]['number'] *= time_kwargs_fact

417 return time_kwargs_modified

418 if time_kwargs_fact == 'lin':

419 if isinstance(inst, LinearModel):

420 time_kwargs_modified = copy.deepcopy(time_kwargs)

421 for k in time_kwargs_modified:

422 kl = max(int(math.log(k) / math.log(10) + 1e-5), 1)

423 f = max(int(10 / kl + 0.5), 1)

424 time_kwargs_modified[k]['number'] *= f

425 time_kwargs_modified[k]['repeat'] *= 1

426 return time_kwargs_modified

427 return time_kwargs

428 raise ValueError( # pragma: no cover

429 "Unable to interpret time_kwargs_fact='{}'.".format(

430 time_kwargs_fact))

431

432

def _get_problem_data(prob, n_features):
    """
    Retrieves the data of a problem and splits it into a train
    and a test set.

    :param prob: problem name, a key of dictionary *_problems*
    :param n_features: number of features, given to the function
        building the problem
    :return: tuple *(X_train, X_test, y_train, y_test, Xort_test,
        init_types, conv_options, method_name, output_index,
        dofit, predict_kwargs)*

    The function building the problem returns 6 or 7 values,
    the seventh one is *dofit* which is True by default.
    *init_types* and *method* may be tuples, they are then
    unpacked into *(init_types, conv_options)* and
    *(method_name, predict_kwargs)*.
    """
    data_problem = _problems[prob](n_features=n_features)
    n_items = len(data_problem)
    if n_items not in (6, 7):
        raise RuntimeError(  # pragma: no cover
            "Unable to interpret problem '{}'.".format(prob))
    X_, y_, init_types, method, output_index, Xort_ = data_problem[:6]
    dofit = data_problem[6] if n_items == 7 else True

    if (len(X_.shape) == 2 and X_.shape[1] != n_features and
            n_features is not None):
        raise RuntimeError(  # pragma: no cover
            "Problem '{}' with n_features={} returned {} features"
            "(func={}).".format(prob, n_features, X_.shape[1],
                                _problems[prob]))

    # Same seed in both cases, the training part of Xort_ is not used.
    if y_ is None:
        X_train, X_test, _, Xort_test = train_test_split(
            X_, Xort_, random_state=42)
        y_train = y_test = None
    else:
        (X_train, X_test, y_train, y_test,
         _, Xort_test) = train_test_split(
            X_, y_, Xort_, random_state=42)

    conv_options = None
    if isinstance(init_types, tuple):
        init_types, conv_options = init_types

    if isinstance(method, tuple):
        method_name, predict_kwargs = method
    else:
        method_name, predict_kwargs = method, {}

    return (X_train, X_test, y_train,
            y_test, Xort_test,
            init_types, conv_options, method_name,
            output_index, dofit, predict_kwargs)

Coverage for mlprodict/onnxrt/validate/validate_helper.py : 97%

234 statements

Coverage for mlprodict/onnxrt/validate/validate_helper.py : 97%

234 statements 227 run 7 missing 19 excluded

234 statements