Source code for mlprodict.onnxrt.ops_cpu.op_dict_vectorizer
# -*- encoding: utf-8 -*-
# pylint: disable=E0203,E1101,C0111
"""
Runtime operator.
:githublink:`%|py|7`
"""
import numpy
from scipy.sparse import coo_matrix
from ._op import OpRun, RuntimeTypeError
from ..shape_object import ShapeObject
class DictVectorizer(OpRun):
    """
    Runtime for a dictionary vectorizer: converts a sequence of
    dictionaries into a sparse matrix whose columns follow the order of
    the vocabulary given by attribute *int64_vocabulary* or
    *string_vocabulary*.
    """

    # Default values of the operator attributes.
    # ``numpy.str`` was a deprecated alias removed in NumPy 1.24,
    # ``numpy.str_`` is the supported name for the unicode dtype.
    atts = {'int64_vocabulary': numpy.empty(0, dtype=numpy.int64),
            'string_vocabulary': numpy.empty(0, dtype=numpy.str_)}

    def __init__(self, onnx_node, desc=None, **options):
        """
        Builds the mapping *key -> column index* from the vocabulary.

        :param onnx_node: forwarded to ``OpRun.__init__``
        :param desc: forwarded to ``OpRun.__init__``
        :param options: forwarded to ``OpRun.__init__``
        :raises RuntimeError: if both vocabularies are empty
        """
        OpRun.__init__(self, onnx_node, desc=desc,
                       expected_attributes=DictVectorizer.atts,
                       **options)
        self.dict_labels = {}
        # ``len`` is used on purpose: the truth value of a numpy array
        # with more than one element is ambiguous.
        if len(self.int64_vocabulary) > 0:
            for i, v in enumerate(self.int64_vocabulary):
                self.dict_labels[v] = i
            self.is_int = True
        else:
            for i, v in enumerate(self.string_vocabulary):
                # Entries may be stored as bytes or already be decoded
                # strings; only bytes need decoding.
                if isinstance(v, bytes):
                    v = v.decode('utf-8')
                self.dict_labels[v] = i
            self.is_int = False
        if len(self.dict_labels) == 0:
            raise RuntimeError(  # pragma: no cover
                "int64_vocabulary and string_vocabulary cannot be both empty.")

    def _run(self, x):  # pylint: disable=W0221
        """
        Converts *x* into a sparse matrix.

        :param x: list or numpy array of dictionaries
            ``{key: value}``, every key must belong to the vocabulary
        :return: tuple with one ``coo_matrix`` of shape
            ``(len(x), len(self.dict_labels))``
        :raises RuntimeTypeError: if *x* is neither a list nor an array
        :raises KeyError: if a key is not part of the vocabulary
        """
        if not isinstance(x, (numpy.ndarray, list)):
            raise RuntimeTypeError(  # pragma: no cover
                "x must be iterable not {}.".format(type(x)))
        values = []
        rows = []
        cols = []
        for i, row in enumerate(x):
            for k, v in row.items():
                values.append(v)
                rows.append(i)
                cols.append(self.dict_labels[k])
        values = numpy.array(values)
        # An explicit integer dtype keeps the indices valid even when
        # no entry was collected (an empty list would become float64).
        rows = numpy.array(rows, dtype=numpy.int64)
        cols = numpy.array(cols, dtype=numpy.int64)
        return (coo_matrix((values, (rows, cols)),
                           shape=(len(x), len(self.dict_labels))), )

    def _infer_shapes(self, x):  # pylint: disable=W0221
        """
        Returns a two-dimensional shape with symbolic dimension names
        made unique with the identifier of this instance, the dtype is
        the one of *x*.
        """
        pref = str(hex(id(self))[2:])
        return (ShapeObject(["ndv%s_0" % pref, "N%s_1" % pref],
                            dtype=x.dtype), )