Sérialisation avec protobuf

Links: notebook, html, PDF, python, slides, GitHub

protobuf optimise la sérialisation de deux façons. Elle accélère l’écriture et la lecture des données et permet aussi un accès rapide à une information précise sans désérialiser les autres. Elle réalise cela en imposant un schéma strict de données.

from jyquickhelper import add_notebook_menu
# NOTE(review): presumably renders a navigation menu for this notebook — confirm
# against the jyquickhelper documentation.
add_notebook_menu()

Schéma

On récupère l’exemple du tutorial.

# Protocol Buffers schema (proto2 syntax, package ``tutorial``) kept as text:
# a Person message with a nested PhoneNumber message and PhoneType enum, and
# an AddressBook message holding a repeated list of Person entries.
schema = """
syntax = "proto2";

package tutorial;

message Person {
  required string name = 1;
  required int32 id = 2;
  optional string email = 3;

  enum PhoneType {
    MOBILE = 0;
    HOME = 1;
    WORK = 2;
  }

  message PhoneNumber {
    required string number = 1;
    optional PhoneType type = 2 [default = HOME];
  }

  repeated PhoneNumber phones = 4;
}

message AddressBook {
  repeated Person people = 1;
}
"""

Compilation

Il faut d’abord récupérer le compilateur. Cela peut se faire depuis le site de protobuf ou, sur Linux (Ubuntu/Debian), avec la commande apt-get install protobuf-compiler, pour obtenir le programme protoc.

import google.protobuf as gp
# The installed protobuf version is reused further down to build the name of
# the protoc archive to download.
# NOTE(review): "3.5.2.post1" is mapped to "3.5.1", presumably because no
# protoc release carries that exact version string — confirm.
version = "3.5.1" if gp.__version__ == "3.5.2.post1" else gp.__version__
version
'3.5.1'
import sys, os

# Pick the protoc archive and executable name for the current platform; the
# download URL is derived from the archive name so both cannot drift apart.
if sys.platform.startswith("win"):
    name = "protoc-{0}-win32.zip".format(version)
    exe = 'protoc.exe'
else:
    name = "protoc-{0}-linux-x86_64.zip".format(version)
    exe = 'protoc'
url = "https://github.com/google/protobuf/releases/download/v{0}/{1}".format(version, name)

# Path where the compiler is looked up once the archive has been extracted.
protoc = os.path.join("bin", exe)
if not os.path.exists(name):
    # Download only when the archive is not already in the working directory.
    from pyquickhelper.filehelper import download
    try:
        download(url)
    except Exception as e:
        # Chain the original error explicitly so the root cause stays attached
        # to the re-raised exception.
        raise Exception("Unable to download '{0}'\nERROR\n{1}".format(url, e)) from e
else:
    print(name)
protoc-3.5.1-win32.zip
# Extract the archive only when the compiler is missing, then verify that the
# extraction really produced the expected executable.
if not os.path.exists(protoc):
    from pyquickhelper.filehelper import unzip_files
    unzip_files(name, where_to='.')
    if not os.path.exists(protoc):
        raise FileNotFoundError(protoc)

On écrit le format sur disque.

# Write the schema text to disk so that protoc can compile it. The encoding is
# set explicitly instead of relying on the platform default.
with open('schema.proto', 'w', encoding='utf-8') as f:
    f.write(schema)

Et on peut compiler.

from pyquickhelper.loghelper import run_cmd
# First attempt: run the compiler located at the path computed earlier.
cmd = '{0} --python_out=. schema.proto'.format(protoc)
try:
    out, err = run_cmd(cmd=cmd, wait=True)
except PermissionError as e:
    if sys.platform.startswith("win"):
        # No fallback on Windows: report the command that failed.
        mes = "CMD: {0}".format(cmd)
        raise Exception("Unable to use {0}\n{1}".format(protoc, mes)) from e
    # On non-Windows platforms, when bin/protoc cannot be executed, fall back
    # to the protoc command itself, assuming the protobuf-compiler package
    # has been installed.
    protoc = "protoc"
    cmd = '{0} --python_out=. schema.proto'.format(protoc)
    try:
        out, err = run_cmd(cmd=cmd, wait=True)
    except Exception as inner:
        mes = "CMD: {0}".format(cmd)
        raise Exception("Unable to use {0}\n{1}".format(protoc, mes)) from inner
# Show standard output and standard error separated by a dashed line.
print("\n----\n".join([out, err]))
----

Un fichier a été généré.

# List the Python files of the working directory. The suffix is tested rather
# than a substring so that names such as "x.pyc" or "x.py.bak" are excluded.
[fname for fname in os.listdir(".") if fname.endswith('.py')]
['schema_pb2.py']
# Read the generated module and display its first 1000 characters.
with open('schema_pb2.py', 'r') as f:
    content = f.read()
print(content[:1000])
# Generated by the protocol buffer compiler.  DO NOT EDIT!
# source: schema.proto
import sys
_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import descriptor_pb2
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()



DESCRIPTOR = _descriptor.FileDescriptor(
  name='schema.proto',
  package='tutorial',
  syntax='proto2',
  serialized_pb=_b('nx0cschema.protox12x08tutorial"xdbx01nx06Personx12x0cnx04namex18x01 x02(tx12nnx02idx18x02 x02(x05x12rnx05x65mailx18x03 x01(tx12,nx06phonesx18x04 x03(x0bx32x1c.tutorial.Person.PhoneNumberx1aMnx0bPhoneNumberx12x0enx06numberx18x01 x02(tx12.nx04typex18x02 x01(x0ex32x1a.tutorial.Person.PhoneType:x04HOME"

Import du module créé

Pour utiliser protobuf, il faut importer le module créé.

import schema_pb2

On crée un enregistrement.

# Build the record with keyword arguments: scalar fields go to the message
# constructor, the phone entry is appended to the repeated ``phones`` field.
person = schema_pb2.Person(id=1234, name="John Doe", email="jdoe@example.com")
phone = person.phones.add(number="555-4321", type=schema_pb2.Person.HOME)
person
name: "John Doe"
id: 1234
email: "jdoe@example.com"
phones {
  number: "555-4321"
  type: HOME
}

Sérialisation en chaîne de caractères

# Serialize the message; the displayed result shows that ``res`` is a bytes object.
res = person.SerializeToString()
type(res), res
(bytes,
 b'nx08John Doex10xd2tx1ax10jdoe@example.com"x0cnx08555-4321x10x01')
%timeit person.SerializeToString()
4.56 µs ± 447 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
# Rebuild a Person directly from the serialized bytes.
pers = schema_pb2.Person.FromString(res)
pers
name: "John Doe"
id: 1234
email: "jdoe@example.com"
phones {
  number: "555-4321"
  type: HOME
}
# Alternative: create an empty Person first, then fill it from the bytes.
pers = schema_pb2.Person()
pers.ParseFromString(res)
pers
name: "John Doe"
id: 1234
email: "jdoe@example.com"
phones {
  number: "555-4321"
  type: HOME
}
%timeit schema_pb2.Person.FromString(res)
3.44 µs ± 696 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit pers.ParseFromString(res)
3.13 µs ± 633 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

Plusieurs chaînes de caractères

# Two sample records, described as data and converted to Person messages
# one after the other.
db = []
records = [
    (1234, "John Doe", "jdoe@example.com", "555-4321", schema_pb2.Person.HOME),
    (5678, "Johnette Doette", "jtdoet@example2.com", "777-1234", schema_pb2.Person.MOBILE),
]
for ident, full_name, email, number, phone_type in records:
    person = schema_pb2.Person()
    person.id = ident
    person.name = full_name
    person.email = email
    phone = person.phones.add()
    phone.number = number
    phone.type = phone_type
    db.append(person)
import struct
from io import BytesIO
# Concatenate all messages in one byte stream. Each message is preceded by its
# size packed with the struct format 'i', which lets a reader know how many
# bytes to consume for the next message.
# NOTE(review): 'i' uses the platform's native byte order, so the stream is
# presumably not portable across architectures — confirm if that matters.
buffer = BytesIO()
for p in db:
    size = p.ByteSize()
    buffer.write(struct.pack('i', size) + p.SerializeToString())
res = buffer.getvalue()
res
b'-x00x00x00nx08John Doex10xd2tx1ax10jdoe@example.com"x0cnx08555-4321x10x017x00x00x00nx0fJohnette Doettex10xae,x1ax13jtdoet@example2.com"x0cnx08777-1234x10x00'
from google.protobuf.internal.decoder import _DecodeVarint32
# Read back the stream of size-prefixed messages stored in ``res``.
db2 = []
buffer = BytesIO(res)
# Size of the prefix written with struct format 'i', computed rather than
# hard-coded.
header_size = struct.calcsize('i')
while True:
    bsize = buffer.read(header_size)
    if not bsize:
        # End of the stream.
        break
    if len(bsize) < header_size:
        raise ValueError(
            "Truncated size prefix: expected {0} bytes, got {1}".format(
                header_size, len(bsize)))
    size = struct.unpack('i', bsize)[0]
    data = buffer.read(size)
    if len(data) != size:
        # Also catches a negative size, for which read() returns the whole
        # remainder of the stream.
        raise ValueError(
            "Truncated message: expected {0} bytes, got {1}".format(
                size, len(data)))
    p = schema_pb2.Person.FromString(data)
    db2.append(p)
db2[0], db2[1]
(name: "John Doe"
 id: 1234
 email: "jdoe@example.com"
 phones {
   number: "555-4321"
   type: HOME
 }, name: "Johnette Doette"
 id: 5678
 email: "jtdoet@example2.com"
 phones {
   number: "777-1234"
   type: MOBILE
 })

Sérialisation JSON

from google.protobuf.json_format import MessageToJson
# Display the JSON text produced for the message.
print(MessageToJson(pers))
{
  "name": "John Doe",
  "id": 1234,
  "email": "jdoe@example.com",
  "phones": [
    {
      "number": "555-4321",
      "type": "HOME"
    }
  ]
}
%timeit MessageToJson(pers)
76.4 µs ± 7.48 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
from google.protobuf.json_format import Parse as ParseJson
js = MessageToJson(pers)
# Parse the JSON text back into a freshly created Person message.
res = ParseJson(js, message=schema_pb2.Person())
res
name: "John Doe"
id: 1234
email: "jdoe@example.com"
phones {
  number: "555-4321"
  type: HOME
}
%timeit ParseJson(js, message=schema_pb2.Person())
75 µs ± 7.77 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)