10.3 Binary formats#

Estimated time for this notebook: 15 minutes

Binary file formats#

Now we’re getting toward a numerically-based data structure, using integers for object keys, we should think about binary serialisation.

Binary file formats are much smaller than human-readable text based formats, so important when handling really big datasets.

One can compress a textual file format, of course, and with good compression algorithms this will be similar in size to the binary file. However, this has performance implications.

A hand-designed binary format is fast and small, at the loss of human readability.

Let’s remind ourselves of the model we have been using previously.

class Element:
    def __init__(self, symbol, id):
        self.symbol = symbol
        self.id = id


class Molecule:
    def __init__(self, id):
        self.elements = {}
        self.id = id

    def add_element(self, element, number):
        self.elements[element] = number

    def to_struct(self):
        return {element.symbol: number for element, number in self.elements.items()}


class Reaction:
    def __init__(self):
        self.reactants = {}
        self.products = {}

    def add_reactant(self, reactant, stoichiometry):
        self.reactants[reactant] = stoichiometry

    def add_product(self, product, stoichiometry):
        self.products[product] = stoichiometry

    def to_struct(self):
        return {
            "reactants": [x.to_struct() for x in self.reactants],
            "products": [x.to_struct() for x in self.products],
            "stoichiometries": list(self.reactants.values())
            + list(self.products.values()),
        }


class System:
    def __init__(self):
        self.reactions = []
        self.elements = []
        self.molecules = []

    def add_element(self, symbol):
        new_element = Element(symbol, len(self.elements))
        self.elements.append(new_element)
        return new_element

    def add_molecule(self):
        new_molecule = Molecule(len(self.molecules))
        self.molecules.append(new_molecule)
        return new_molecule

    def add_reaction(self):
        new_reaction = Reaction()
        self.reactions.append(new_reaction)
        return new_reaction

    def save(self):

        result = {
            "elements": [element.symbol for element in self.elements],
            "molecules": {
                molecule.id: {
                    element.id: number for element, number in molecule.elements.items()
                }
                for molecule in self.molecules
            },
            "reactions": [
                {
                    "reactants": {
                        reactant.id: stoich
                        for reactant, stoich in reaction.reactants.items()
                    },
                    "products": {
                        product.id: stoich
                        for product, stoich in reaction.products.items()
                    },
                }
                for reaction in self.reactions
            ],
        }

        return result
s = System()

c = s.add_element("C")
o = s.add_element("O")
h = s.add_element("H")

co2 = s.add_molecule()
co2.add_element(c, 1)
co2.add_element(o, 2)

h2o = s.add_molecule()
h2o.add_element(h, 2)
h2o.add_element(o, 1)

o2 = s.add_molecule()
o2.add_element(o, 2)

h2 = s.add_molecule()
h2.add_element(h, 2)

glucose = s.add_molecule()
glucose.add_element(c, 6)
glucose.add_element(h, 12)
glucose.add_element(o, 6)

combustion_glucose = s.add_reaction()
combustion_glucose.add_reactant(glucose, 1)
combustion_glucose.add_reactant(o2, 6)
combustion_glucose.add_product(co2, 6)
combustion_glucose.add_product(h2o, 6)
combustion_hydrogen = s.add_reaction()
combustion_hydrogen.add_reactant(h2, 2)
combustion_hydrogen.add_reactant(o2, 1)
combustion_hydrogen.add_product(h2o, 2)

The problem with binary file formats, is that, lacking complex data structures, one needs to supply the length of an item before that item:

class FakeBinarySavingSystem:
    # Pretend binary-style writing to a list to make it easier to read at first.
    def save(self, system, buffer):
        buffer.append(len(system.elements))
        for element in system.elements:
            buffer.append(element.symbol)

        buffer.append(len(system.molecules))
        for molecule in system.molecules:
            buffer.append(len(molecule.elements))
            for element, number in molecule.elements.items():
                buffer.append(element.id)
                buffer.append(number)

        buffer.append(len(system.reactions))
        for reaction in system.reactions:
            buffer.append(len(reaction.reactants))
            for reactant, stoich in reaction.reactants.items():
                buffer.append(reactant.id)
                buffer.append(stoich)
            buffer.append(len(reaction.products))
            for product, stoich in reaction.products.items():
                buffer.append(product.id)
                buffer.append(stoich)
characterarray = []
FakeBinarySavingSystem().save(s, characterarray)
characterarray
[3,
 'C',
 'O',
 'H',
 5,
 2,
 0,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 3,
 0,
 6,
 2,
 12,
 1,
 6,
 2,
 2,
 4,
 1,
 2,
 6,
 2,
 0,
 6,
 1,
 6,
 2,
 3,
 2,
 2,
 1,
 1,
 1,
 2]

Deserialisation is left as an exercise for the reader :).

Endian-robust binary file formats#

Having prepared our data as a sequence of data which can be recorded in a single byte, we might think a binary file format on disk is as simple as saving each number in one byte:

# First, turn symbol characters to equivalent integers (ascii)
intarray = [x.encode("ascii")[0] if isinstance(x, str) else x for x in characterarray]
intarray
[3,
 67,
 79,
 72,
 5,
 2,
 0,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 3,
 0,
 6,
 2,
 12,
 1,
 6,
 2,
 2,
 4,
 1,
 2,
 6,
 2,
 0,
 6,
 1,
 6,
 2,
 3,
 2,
 2,
 1,
 1,
 1,
 2]
bytearray(intarray)
bytearray(b'\x03COH\x05\x02\x00\x01\x01\x02\x02\x02\x02\x01\x01\x01\x01\x02\x01\x02\x02\x03\x00\x06\x02\x0c\x01\x06\x02\x02\x04\x01\x02\x06\x02\x00\x06\x01\x06\x02\x03\x02\x02\x01\x01\x01\x02')
with open("system.mol", "wb") as binfile:
    binfile.write(bytearray(intarray))

However, this misses out on an unfortunate problem if we end up with large enough numbers to need more than one byte per integer, or we want to represent floats: different computer designs will put the most-significant bytes of a multi-byte integer or float either at the beginning (‘big endian’ systems) or at the end (‘little endian’ systems).

To get around this, we need to use a portable standard for making binary files.

One possible choice is XDR (standing for eXternal Data Representation). XDR is a standard data serialization format that accounts for endian differences between systems.

import xdrlib


class XDRSavingSystem(System):
    def __init__(self, system):
        super().__init__()
        # Shallow Copy constructor
        self.elements = system.elements
        self.reactions = system.reactions
        self.molecules = system.molecules
        self.buffer = xdrlib.Packer()

    def _pack_pair(self, item):
        self.buffer.pack_int(item[0].id)
        self.buffer.pack_int(item[1])

    def _pack_molecule(self, mol):
        self.buffer.pack_array(mol.elements.items(), self._pack_pair)

    def _pack_reaction(self, reaction):
        self.buffer.pack_array(reaction.reactants.items(), self._pack_pair)
        self.buffer.pack_array(reaction.products.items(), self._pack_pair)

    def save(self):
        el_symbols = list(map(lambda x: x.symbol.encode("utf-8"), self.elements))
        # Note that pack_array AUTOMATICALLY packs the length of the array first!
        self.buffer.pack_array(el_symbols, self.buffer.pack_string)
        self.buffer.pack_array(self.molecules, self._pack_molecule)
        self.buffer.pack_array(self.reactions, self._pack_reaction)
        return self.buffer
xdrsys = XDRSavingSystem(s)
xdrbuffer = xdrsys.save()
xdrbuffer.get_buffer()
b'\x00\x00\x00\x03\x00\x00\x00\x01C\x00\x00\x00\x00\x00\x00\x01O\x00\x00\x00\x00\x00\x00\x01H\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x01\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x02'

A higher level approach to binary file formats: HDF5#

This was quite painful. We’ve shown you it because it is very likely you will encounter this kind of unpleasant binary file format in your work.

However, one recommended approach to building binary file formats is to use HDF5 (Hierarchical Data Format), a much higher level binary file format.

HDF5’s approach requires you to represent your system in terms of high-dimensional matrices, like NumPy arrays. It then saves these, and handles all the tedious number-of-field management for you.

import h5py
import numpy as np


class HDF5SavingSystem(System):
    def __init__(self, system):
        super().__init__()
        # Shallow Copy constructor
        self.elements = system.elements
        self.reactions = system.reactions
        self.molecules = system.molecules

    def element_symbols(self):
        return list(map(lambda x: x.symbol.encode("ascii"), self.elements))

    def molecule_matrix(self):
        molecule_matrix = np.zeros((len(self.elements), len(self.molecules)), dtype=int)

        for molecule in self.molecules:
            for element, n in molecule.elements.items():
                molecule_matrix[element.id, molecule.id] = n

        return molecule_matrix

    def reaction_matrix(self):
        reaction_matrix = np.zeros(
            (len(self.molecules), len(self.reactions)), dtype=int
        )

        for i, reaction in enumerate(self.reactions):
            for reactant, n in reaction.reactants.items():
                reaction_matrix[reactant.id, i] = -1 * n

            for product, n in reaction.products.items():
                reaction_matrix[product.id, i] = n

        return reaction_matrix

    def write(self, filename):
        hdf = h5py.File(filename, "w")
        string_type = h5py.special_dtype(vlen=bytes)
        hdf.create_dataset(
            "symbols", (len(self.elements), 1), string_type, self.element_symbols()
        )
        hdf.create_dataset("molecules", data=self.molecule_matrix())
        hdf.create_dataset("reactions", data=self.reaction_matrix())
        hdf.close()
saver = HDF5SavingSystem(s)
saver.element_symbols()
[b'C', b'O', b'H']
saver.molecule_matrix()
array([[ 1,  0,  0,  0,  6],
       [ 2,  1,  2,  0,  6],
       [ 0,  2,  0,  2, 12]])
saver.reaction_matrix()
array([[ 6,  0],
       [ 6,  2],
       [-6, -1],
       [ 0, -2],
       [-1,  0]])
saver.write("foo.hdf5")

Note that this binary representation is not human readable at all.

with open("foo.hdf5", "rb") as f_in:
    bytes = f_in.read()
bytes[0:100]
b'\x89HDF\r\n\x1a\n\x00\x00\x00\x00\x00\x08\x08\x00\x04\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xf8\x18\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00`\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x88\x00\x00\x00\x00\x00\x00\x00\xa8\x02\x00\x00\x00\x00\x00\x00\x01\x00\x01\x00'
hdf_load = h5py.File("foo.hdf5")
np.array(hdf_load["reactions"])
array([[ 6,  0],
       [ 6,  2],
       [-6, -1],
       [ 0, -2],
       [-1,  0]])

Using a sparse matrix storage would be even better here, but we don’t have time for that!