XClose
Menu

Deserialisation

Consider a straightforward YAML serialisation for our model:

In [1]:
class Element:
    """A chemical element, described only by its symbol (for example ``"C"``)."""

    def __init__(self, symbol):
        # No __eq__/__hash__ is defined, so instances compare and hash by
        # identity when they are used as dictionary keys.
        self.symbol = symbol
    
class Molecule:
    """A molecule, held as a mapping from element objects to atom counts."""

    def __init__(self):
        # Maps each element object to the number of that element in the molecule.
        self.elements = {}

    def add_element(self, element, number):
        """Set the count of ``element`` to ``number`` (replacing any earlier count)."""
        self.elements[element] = number

    def to_struct(self):
        """Return a plain ``{symbol: count}`` dict in insertion order."""
        return {
            element.symbol: count
            for element, count in self.elements.items()
        }
    
class Reaction:
    """A reaction: reactant and product molecules with their stoichiometries."""

    def __init__(self):
        self.reactants = {}  # Map from reactants to stoichiometries
        self.products = {}  # Map from products to stoichiometries

    def add_reactant(self, reactant, stoichiometry):
        """Register ``reactant`` on the left-hand side with its coefficient."""
        self.reactants[reactant] = stoichiometry

    def add_product(self, product, stoichiometry):
        """Register ``product`` on the right-hand side with its coefficient."""
        self.products[product] = stoichiometry

    def to_struct(self):
        """Return a plain-data view of the reaction.

        The coefficients are flattened into one list: reactant coefficients
        first, then product coefficients, each in insertion order, so they
        line up positionally with the two molecule lists.
        """
        reactant_structs = [molecule.to_struct() for molecule in self.reactants]
        product_structs = [molecule.to_struct() for molecule in self.products]
        coefficients = [*self.reactants.values(), *self.products.values()]
        return {
            'reactants': reactant_structs,
            'products': product_structs,
            'stoichiometries': coefficients,
        }
     
class System:
    """An ordered collection of reactions."""

    def __init__(self):
        self.reactions = []

    def add_reaction(self, reaction):
        """Append ``reaction`` to the end of the system."""
        self.reactions.append(reaction)

    def to_struct(self):
        """Return the list of each reaction's ``to_struct()`` result, in order."""
        return [reaction.to_struct() for reaction in self.reactions]
In [2]:
# Build the example system used throughout. Dictionaries remember insertion
# order, so the order of the add_* calls below is what fixes the order of the
# molecules and stoichiometries in the serialised output.

# The three elements shared by every molecule below.
c=Element("C")
o=Element("O")
h=Element("H")

# Carbon dioxide: 1 C, 2 O.
co2 = Molecule()
co2.add_element(c,1)
co2.add_element(o,2)

# Water: 2 H, 1 O.
h2o = Molecule()
h2o.add_element(h,2)
h2o.add_element(o,1)

# Molecular oxygen: 2 O.
o2 = Molecule()
o2.add_element(o,2)

# Molecular hydrogen: 2 H.
h2 = Molecule()
h2.add_element(h,2)

# Glucose: 6 C, 12 H, 6 O.
glucose = Molecule()
glucose.add_element(c,6)
glucose.add_element(h,12)
glucose.add_element(o,6)

# 1 glucose + 6 O2 -> 6 CO2 + 6 H2O
combustion_glucose = Reaction()
combustion_glucose.add_reactant(glucose,  1)
combustion_glucose.add_reactant(o2, 6)
combustion_glucose.add_product(co2, 6)
combustion_glucose.add_product(h2o, 6)

# 2 H2 + 1 O2 -> 2 H2O (re-uses the same o2 and h2o objects as above)
combustion_hydrogen = Reaction()
combustion_hydrogen.add_reactant(h2,2)
combustion_hydrogen.add_reactant(o2,1)
combustion_hydrogen.add_product(h2o,2)

s=System()
s.add_reaction(combustion_glucose)
s.add_reaction(combustion_hydrogen)

# Last expression of the cell: the notebook displays the nested structure.
s.to_struct()
Out[2]:
[{'reactants': [{'C': 6, 'H': 12, 'O': 6}, {'O': 2}],
  'products': [{'C': 1, 'O': 2}, {'H': 2, 'O': 1}],
  'stoichiometries': [1, 6, 6, 6]},
 {'reactants': [{'H': 2}, {'O': 2}],
  'products': [{'H': 2, 'O': 1}],
  'stoichiometries': [2, 1, 2]}]
In [3]:
import yaml
print(yaml.dump(s.to_struct()))
- products:
  - {C: 1, O: 2}
  - {H: 2, O: 1}
  reactants:
  - {C: 6, H: 12, O: 6}
  - {O: 2}
  stoichiometries: [1, 6, 6, 6]
- products:
  - {H: 2, O: 1}
  reactants:
  - {H: 2}
  - {O: 2}
  stoichiometries: [2, 1, 2]

Deserialising non-normal data structures

We can see that this data structure, although seemingly sensible, is horribly non-normal.

  • The stoichiometries information requires us to align each one to the corresponding molecule in order.
  • Each element is described multiple times: we will have to ensure that each mention of C comes back to the same constructed element object.
In [4]:
class DeSerialiseStructure:
    """Rebuild a ``System`` from the nested structure made by ``System.to_struct``.

    The structure mentions the same element and molecule many times, so the
    de-serialiser keeps two registries and always hands back the object it
    created the first time a given element or molecule was seen.
    """

    def __init__(self):
        self.elements = {}   # element symbol -> Element
        self.molecules = {}  # sorted tuple of (symbol, count) pairs -> Molecule

    def add_element(self, candidate):
        """Return the ``Element`` for symbol ``candidate``, creating it once."""
        if candidate not in self.elements:
            self.elements[candidate] = Element(candidate)
        return self.elements[candidate]

    def add_molecule(self, candidate):
        """Return the ``Molecule`` for composition dict ``candidate``.

        The registry key is the sorted tuple of ``(symbol, count)`` pairs, so
        the same composition written with its elements in a different order
        still resolves to the same molecule object.
        """
        key = tuple(sorted(candidate.items()))
        if key not in self.molecules:
            m = Molecule()
            for symbol, number in candidate.items():
                m.add_element(self.add_element(symbol), number)
            self.molecules[key] = m
        return self.molecules[key]

    def parse_system(self, system):
        """Build and return a ``System`` from a list of reaction dicts.

        Each reaction dict holds ``'reactants'`` and ``'products'`` (lists of
        composition dicts) and ``'stoichiometries'`` (one flat list:
        reactant coefficients first, then product coefficients).

        The input structure is only read, never modified.

        Raises:
            ValueError: if a reaction's number of stoichiometries differs
                from its number of reactants plus products.
        """
        s = System()
        for reaction in system:
            reactants = reaction['reactants']
            products = reaction['products']
            stoichiometries = reaction['stoichiometries']
            if len(stoichiometries) != len(reactants) + len(products):
                raise ValueError(
                    "expected %d stoichiometries, found %d"
                    % (len(reactants) + len(products), len(stoichiometries)))

            r = Reaction()
            # Align coefficients by position instead of popping them off the
            # caller's list, which would empty the structure being parsed.
            split = len(reactants)
            for molecule, stoich in zip(reactants, stoichiometries[:split]):
                r.add_reactant(self.add_molecule(molecule), stoich)
            for molecule, stoich in zip(products, stoichiometries[split:]):
                r.add_product(self.add_molecule(molecule), stoich)
            s.add_reaction(r)
        return s
In [5]:
de_serialiser = DeSerialiseStructure()
round_trip = de_serialiser.parse_system(s.to_struct())
In [6]:
round_trip.to_struct()
Out[6]:
[{'reactants': [{'C': 6, 'H': 12, 'O': 6}, {'O': 2}],
  'products': [{'C': 1, 'O': 2}, {'H': 2, 'O': 1}],
  'stoichiometries': [1, 6, 6, 6]},
 {'reactants': [{'H': 2}, {'O': 2}],
  'products': [{'H': 2, 'O': 1}],
  'stoichiometries': [2, 1, 2]}]
In [7]:
de_serialiser.elements
Out[7]:
{'C': <__main__.Element at 0x2b3f937797b8>,
 'H': <__main__.Element at 0x2b3f937798d0>,
 'O': <__main__.Element at 0x2b3f93779940>}
In [8]:
de_serialiser.molecules
Out[8]:
{(('C', 6), ('H', 12), ('O', 6)): <__main__.Molecule at 0x2b3f93779a58>,
 (('O', 2),): <__main__.Molecule at 0x2b3f93779828>,
 (('C', 1), ('O', 2)): <__main__.Molecule at 0x2b3f93779c50>,
 (('H', 2), ('O', 1)): <__main__.Molecule at 0x2b3f93779cc0>,
 (('H', 2),): <__main__.Molecule at 0x2b3f93779dd8>}
In [9]:
list(round_trip.reactions[0].reactants.keys())[1]
Out[9]:
<__main__.Molecule at 0x2b3f93779828>
In [10]:
list(round_trip.reactions[1].reactants.keys())[1]
Out[10]:
<__main__.Molecule at 0x2b3f93779828>

In making this, we ended up choosing primary keys for our datatypes:

In [11]:
list(de_serialiser.molecules.keys())
Out[11]:
[(('C', 6), ('H', 12), ('O', 6)),
 (('O', 2),),
 (('C', 1), ('O', 2)),
 (('H', 2), ('O', 1)),
 (('H', 2),)]

Again, we note that a combination of columns uniquely defining an item is a valid key - there is a key correspondence between a candidate key in the database sense and a "hashable" data structure that can be used as a key in a dict.

Note that to make this example even reasonably doable, we didn't add additional data to the objects (mass, rate etc)

Normalising a YAML structure

To make this structure easier to de-serialise, we can make a normalised file-format, by defining primary keys (hashable types) for each entity on write:

In [12]:
class SaveSystem:
    """Serialise a system into a normalised structure with readable keys.

    Elements are keyed by their symbol and molecules by a string built from
    their composition (for example ``C6H12O6``). The entities seen so far are
    collected in ``self.elements`` and ``self.molecules``.
    """

    def __init__(self):
        self.elements = set()
        self.molecules = set()

    def element_key(self, element):
        """Return the primary key of ``element``: its symbol."""
        return element.symbol

    def molecule_key(self, molecule):
        """Return the primary key of ``molecule``.

        The key is each element's symbol followed by its count, concatenated
        in the molecule's own element order.
        """
        return ''.join(
            element.symbol + str(number)
            for element, number in molecule.elements.items())

    def _stoichiometry_table(self, side):
        """Map molecule keys to coefficients for one side of a reaction."""
        return {
            self.molecule_key(molecule): stoich
            for molecule, stoich in side.items()
        }

    def save(self, system):
        """Return the normalised ``elements``/``molecules``/``reactions`` dict.

        Elements and molecules are written in sorted key order. They are held
        in sets of objects hashed by identity, whose iteration order is not
        reproducible between runs, so sorting keeps the output stable.
        """
        for reaction in system.reactions:
            for side in (reaction.reactants, reaction.products):
                for molecule in side:
                    self.molecules.add(molecule)
                    self.elements.update(molecule.elements)

        return {
            'elements': sorted(self.element_key(element)
                               for element in self.elements),
            'molecules': {
                self.molecule_key(molecule): {
                    self.element_key(element): number
                    for element, number in molecule.elements.items()
                }
                for molecule in sorted(self.molecules, key=self.molecule_key)
            },
            'reactions': [
                {
                    'reactants': self._stoichiometry_table(reaction.reactants),
                    'products': self._stoichiometry_table(reaction.products),
                }
                for reaction in system.reactions
            ],
        }
In [13]:
saver = SaveSystem()
print(yaml.dump(saver.save(s)))
elements: [C, H, O]
molecules:
  C1O2: {C: 1, O: 2}
  C6H12O6: {C: 6, H: 12, O: 6}
  H2: {H: 2}
  H2O1: {H: 2, O: 1}
  O2: {O: 2}
reactions:
- products: {C1O2: 6, H2O1: 6}
  reactants: {C6H12O6: 1, O2: 6}
- products: {H2O1: 2}
  reactants: {H2: 2, O2: 1}

We can see that to make an easily parsed file format, without having to guess-recognise repeated entities based on their names (which is highly subject to data entry error), we effectively recover the same tables as found for the database model.

An alternative is to use a simple integer for such a primary key:

In [14]:
class SaveSystemI:
    """Serialise a system using small integers as primary keys.

    Each element and molecule receives the next free integer the first time
    it is seen; the two dictionaries map objects to those integers.
    """

    def __init__(self):
        self.elements = {}
        self.molecules = {}

    def add_element(self, element):
        """Return the integer key of ``element``, assigning one if it is new."""
        # The default is evaluated before insertion, so a new element gets
        # the current size of the registry as its key.
        return self.elements.setdefault(element, len(self.elements))

    def add_molecule(self, molecule):
        """Return the integer key of ``molecule``, assigning one if it is new."""
        return self.molecules.setdefault(molecule, len(self.molecules))

    def element_key(self, element):
        """Look up the key of an already registered element."""
        return self.elements[element]

    def molecule_key(self, molecule):
        """Look up the key of an already registered molecule."""
        return self.molecules[molecule]

    def save(self, system):
        """Return the normalised ``elements``/``molecules``/``reactions`` dict."""
        # First pass: number every molecule and element in order of appearance
        # (reactants before products, each molecule before its elements).
        for reaction in system.reactions:
            for side in (reaction.reactants, reaction.products):
                for molecule in side:
                    self.add_molecule(molecule)
                    for element in molecule.elements:
                        self.add_element(element)

        # Second pass: write everything out in terms of those integer keys.
        symbols = [element.symbol for element in self.elements]

        compositions = {
            self.molecule_key(molecule): {
                self.element_key(element): number
                for element, number in molecule.elements.items()
            }
            for molecule in self.molecules
        }

        reactions = [
            {
                'reactants': {
                    self.molecule_key(reactant): stoich
                    for reactant, stoich in reaction.reactants.items()
                },
                'products': {
                    self.molecule_key(product): stoich
                    for product, stoich in reaction.products.items()
                },
            }
            for reaction in system.reactions
        ]

        return {
            'elements': symbols,
            'molecules': compositions,
            'reactions': reactions,
        }
In [15]:
saver = SaveSystemI()
print(yaml.dump(saver.save(s)))
elements: [C, H, O]
molecules:
  0: {0: 6, 1: 12, 2: 6}
  1: {2: 2}
  2: {0: 1, 2: 2}
  3: {1: 2, 2: 1}
  4: {1: 2}
reactions:
- products: {2: 6, 3: 6}
  reactants: {0: 1, 1: 6}
- products: {3: 2}
  reactants: {1: 1, 4: 2}

Reference counting

The above approach of using a dictionary to determine the integer keys for objects is a bit clunky.

Another good approach is to use counted objects either via a static member or by using a factory pattern:

In [16]:
class Element:
    """A chemical element carrying its symbol and a caller-supplied id."""

    def __init__(self, symbol, id):
        # The parameter is called ``id`` (shadowing the builtin inside this
        # method only); it is stored unchanged as the element's key.
        self.id = id
        self.symbol = symbol
    
class Molecule:
    """A molecule with a caller-supplied id and a map of element counts."""

    def __init__(self, id):
        # Map from element to number of that element in the molecule.
        self.elements = {}
        self.id = id

    def add_element(self, element, number):
        """Set the count of ``element`` in this molecule to ``number``."""
        self.elements[element] = number

    def to_struct(self):
        """Return a plain ``{symbol: count}`` dict in insertion order."""
        return {
            element.symbol: count
            for element, count in self.elements.items()
        }
    
class Reaction:
    """Two maps, reactants and products, from molecule to stoichiometry."""

    def __init__(self):
        self.reactants = {}  # Map from reactants to stoichiometries
        self.products = {}  # Map from products to stoichiometries

    def add_reactant(self, reactant, stoichiometry):
        """Store the coefficient of a molecule consumed by the reaction."""
        self.reactants[reactant] = stoichiometry

    def add_product(self, product, stoichiometry):
        """Store the coefficient of a molecule produced by the reaction."""
        self.products[product] = stoichiometry

    def to_struct(self):
        """Return a plain-data view of the reaction.

        ``'stoichiometries'`` is a single list holding the reactant
        coefficients followed by the product coefficients.
        """
        struct = {}
        struct['reactants'] = [molecule.to_struct() for molecule in self.reactants]
        struct['products'] = [molecule.to_struct() for molecule in self.products]
        struct['stoichiometries'] = (
            list(self.reactants.values()) + list(self.products.values()))
        return struct
     
class System:  # This will be our factory
    """Factory and container for elements, molecules and reactions.

    Elements and molecules are created through this class so that each one
    receives its position in the corresponding list as its id.
    """

    def __init__(self):
        self.reactions = []
        self.elements = []
        self.molecules = []

    def add_element(self, symbol):
        """Create, register and return an element whose id is its list index."""
        element = Element(symbol, len(self.elements))
        self.elements.append(element)
        return element

    def add_molecule(self):
        """Create, register and return an empty molecule with the next id."""
        molecule = Molecule(len(self.molecules))
        self.molecules.append(molecule)
        return molecule

    def add_reaction(self):
        """Create, register and return an empty reaction."""
        reaction = Reaction()
        self.reactions.append(reaction)
        return reaction

    def save(self):
        """Return the normalised structure, keyed by the objects' own ids."""
        symbols = [element.symbol for element in self.elements]

        compositions = {
            molecule.id: {
                element.id: number
                for element, number in molecule.elements.items()
            }
            for molecule in self.molecules
        }

        reactions = [
            {
                'reactants': {
                    reactant.id: stoich
                    for reactant, stoich in reaction.reactants.items()
                },
                'products': {
                    product.id: stoich
                    for product, stoich in reaction.products.items()
                },
            }
            for reaction in self.reactions
        ]

        return {
            'elements': symbols,
            'molecules': compositions,
            'reactions': reactions,
        }
In [17]:
# Rebuild the example through the factory. Ids are handed out in creation
# order (each add_* call uses the current list length), so the order of the
# calls below determines every integer key in the saved structure.
s2=System()

# Elements: C gets id 0, O gets id 1, H gets id 2.
c=s2.add_element("C")
o=s2.add_element("O")
h=s2.add_element("H")

# Molecule 0: carbon dioxide.
co2 = s2.add_molecule()
co2.add_element(c,1)
co2.add_element(o,2)

# Molecule 1: water.
h2o = s2.add_molecule()
h2o.add_element(h,2)
h2o.add_element(o,1)

# Molecule 2: molecular oxygen.
o2 = s2.add_molecule()
o2.add_element(o,2)

# Molecule 3: molecular hydrogen.
h2 = s2.add_molecule()
h2.add_element(h,2)

# Molecule 4: glucose.
glucose = s2.add_molecule()
glucose.add_element(c,6)
glucose.add_element(h,12)
glucose.add_element(o,6)

# 1 glucose + 6 O2 -> 6 CO2 + 6 H2O
combustion_glucose = s2.add_reaction()
combustion_glucose.add_reactant(glucose,  1)
combustion_glucose.add_reactant(o2, 6)
combustion_glucose.add_product(co2, 6)
combustion_glucose.add_product(h2o, 6)
In [18]:
# 2 H2 + 1 O2 -> 2 H2O, registered as the second reaction of s2.
combustion_hydrogen = s2.add_reaction()
combustion_hydrogen.add_reactant(h2,2)
combustion_hydrogen.add_reactant(o2,1)
combustion_hydrogen.add_product(h2o,2)
In [19]:
s2.save()
Out[19]:
{'elements': ['C', 'O', 'H'],
 'molecules': {0: {0: 1, 1: 2},
  1: {2: 2, 1: 1},
  2: {1: 2},
  3: {2: 2},
  4: {0: 6, 2: 12, 1: 6}},
 'reactions': [{'reactants': {4: 1, 2: 6}, 'products': {0: 6, 1: 6}},
  {'reactants': {3: 2, 2: 1}, 'products': {1: 2}}]}

Binary file formats

Now that we're getting toward a numerically-based data structure, using integers for object keys, we should think about binary serialisation.

Binary file formats are much smaller than human-readable text-based formats, which is important when handling really big datasets.

One can compress a textual file format, of course, and with good compression algorithms this will be similar in size to the binary file. (C.f. discussions of Shannon information density!) However, this has performance implications.

A hand-designed binary format is fast and small, at the loss of human readability.

The problem with binary file formats is that, lacking complex data structures, one needs to supply the length of an item before that item:

In [20]:
class FakeSaveBinary:  # Pretend binary-style writing to a list
    # to make it easier to read at first.
    """Write a system as a flat, length-prefixed sequence of values.

    Every variable-length section is preceded by its item count, which is
    what a reader of a binary file needs in order to know where it ends.
    """

    @staticmethod
    def _append_counted_pairs(buffer, mapping):
        """Append ``len(mapping)`` and then ``key.id, value`` for each entry."""
        buffer.append(len(mapping))
        for keyed_object, amount in mapping.items():
            buffer.append(keyed_object.id)
            buffer.append(amount)

    def save(self, system, buffer):
        """Append the elements, molecules and reactions of ``system`` to ``buffer``."""
        # Section 1: element symbols.
        elements = system.elements
        buffer.append(len(elements))
        for element in elements:
            buffer.append(element.symbol)

        # Section 2: each molecule as counted (element id, count) pairs.
        molecules = system.molecules
        buffer.append(len(molecules))
        for molecule in molecules:
            self._append_counted_pairs(buffer, molecule.elements)

        # Section 3: each reaction as two counted lists of
        # (molecule id, stoichiometry) pairs, reactants first.
        reactions = system.reactions
        buffer.append(len(reactions))
        for reaction in reactions:
            self._append_counted_pairs(buffer, reaction.reactants)
            self._append_counted_pairs(buffer, reaction.products)
In [21]:
import io
arraybuffer = []
FakeSaveBinary().save(s2, arraybuffer)
In [22]:
arraybuffer
Out[22]:
[3,
 'C',
 'O',
 'H',
 5,
 2,
 0,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 3,
 0,
 6,
 2,
 12,
 1,
 6,
 2,
 2,
 4,
 1,
 2,
 6,
 2,
 0,
 6,
 1,
 6,
 2,
 3,
 2,
 2,
 1,
 1,
 1,
 2]

Deserialisation is left as an exercise for the reader.

Endian-robust binary file formats

Having prepared our data as a sequence of values, each of which can be recorded in a single byte, we might think a binary file format on disk is as simple as saving each number in one byte:

In [23]:
# First, turn symbol characters to equivalent integers (ascii).
# Only the first byte of each string is kept, so this relies on every
# symbol in arraybuffer being a single ASCII character.
intarray = [x.encode('ascii')[0] if isinstance(x, str) else x
            for x in arraybuffer]
In [24]:
bytearray(intarray)
Out[24]:
bytearray(b'\x03COH\x05\x02\x00\x01\x01\x02\x02\x02\x02\x01\x01\x01\x01\x02\x01\x02\x02\x03\x00\x06\x02\x0c\x01\x06\x02\x02\x04\x01\x02\x06\x02\x00\x06\x01\x06\x02\x03\x02\x02\x01\x01\x01\x02')
In [25]:
with open('system.mol','bw') as binfile:
    binfile.write(bytearray(intarray))

However, this misses out on an unfortunate problem if we end up with large enough numbers to need more than one byte per integer, or we want to represent floats: different computer designs put the most-significant bytes of a multi-byte integer or float at the beginning or end ('big endian' or 'little endian' data).

To get around this, we need to use a portable standard for making binary files.

One choice is XDR:

In [26]:
class XDRSavingSystem(System):
    """A view of an existing system that can pack itself into XDR form."""

    def __init__(self, system):
        # Shallow Copy constructor: the three lists are shared with
        # ``system``, not copied.
        self.molecules = system.molecules
        self.reactions = system.reactions
        self.elements = system.elements

    def save(self):
        """Pack elements, molecules and reactions; return the ``xdrlib.Packer``.

        Callers obtain the bytes with ``get_buffer()`` on the returned packer.
        """
        # NOTE(review): xdrlib was deprecated by PEP 594 and is absent from
        # newer Python releases - confirm the target interpreter provides it.
        import xdrlib

        packer = xdrlib.Packer()

        def _pack_pair(pair):
            # ``pair`` is (object with an integer .id, integer amount).
            keyed_object, amount = pair
            packer.pack_int(keyed_object.id)
            packer.pack_int(amount)

        def _pack_molecule(molecule):
            packer.pack_array(molecule.elements.items(), _pack_pair)

        def _pack_reaction(reaction):
            packer.pack_array(reaction.reactants.items(), _pack_pair)
            packer.pack_array(reaction.products.items(), _pack_pair)

        # pack_array AUTOMATICALLY packs the length of the array first!
        encoded_symbols = [element.symbol.encode('utf-8')
                           for element in self.elements]
        packer.pack_array(encoded_symbols, packer.pack_string)
        packer.pack_array(self.molecules, _pack_molecule)
        packer.pack_array(self.reactions, _pack_reaction)
        return packer
In [27]:
xdrsys = XDRSavingSystem(s2)
In [28]:
xdrbuff = xdrsys.save()
xdrbuff.get_buffer()
Out[28]:
b'\x00\x00\x00\x03\x00\x00\x00\x01C\x00\x00\x00\x00\x00\x00\x01O\x00\x00\x00\x00\x00\x00\x01H\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x01\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x02'

A higher level approach to binary file formats: HDF5

This was quite painful. We've shown it to you because it is very likely you will encounter this kind of unpleasant binary file format in your work.

However, the recommended approach to building binary file formats is to use HDF5, a much higher level binary file format.

HDF5's approach requires you to represent your system in terms of high-dimensional matrices, like NumPy arrays. It then saves these, and handles all the tedious number-of-field management for you.

In [29]:
import numpy as np

class HDF5SavingSystem(System):
    """A view of an existing system that can write itself to an HDF5 file.

    The system is expressed as arrays: a list of element symbols, an
    element-by-molecule composition matrix and a molecule-by-reaction
    stoichiometry matrix. Rows and columns are indexed by the ``id``
    attributes of elements and molecules and by reaction position.
    """

    def __init__(self, system):
        # Shallow Copy constructor: the lists are shared with ``system``.
        self.elements = system.elements
        self.reactions = system.reactions
        self.molecules = system.molecules

    def element_symbols(self):
        """Return the element symbols as a list of ASCII-encoded ``bytes``."""
        return [element.symbol.encode('ascii') for element in self.elements]

    def molecule_matrix(self):
        """Return an integer array of shape (n_elements, n_molecules).

        Entry ``[e, m]`` is the number of atoms of element ``e`` in
        molecule ``m``; elements a molecule does not contain stay zero.
        """
        molecule_matrix = np.zeros((len(self.elements),
                                    len(self.molecules)), dtype=int)

        for molecule in self.molecules:
            for element, n in molecule.elements.items():
                molecule_matrix[element.id, molecule.id] = n

        return molecule_matrix

    def reaction_matrix(self):
        """Return an integer array of shape (n_molecules, n_reactions).

        Reactant coefficients are stored negated and product coefficients
        as given; molecules not taking part in a reaction stay zero.
        """
        reaction_matrix = np.zeros((len(self.molecules),
                                    len(self.reactions)), dtype=int)

        for i, reaction in enumerate(self.reactions):
            for reactant, n in reaction.reactants.items():
                reaction_matrix[reactant.id, i] = -1 * n

            for product, n in reaction.products.items():
                reaction_matrix[product.id, i] = n

        return reaction_matrix

    def write(self, filename):
        """Write the ``symbols``, ``molecules`` and ``reactions`` datasets.

        The file is opened in ``'w'`` mode and is closed on exit from the
        ``with`` block, including when creating a dataset raises.
        """
        import h5py
        with h5py.File(filename, 'w') as hdf:
            string_type = h5py.special_dtype(vlen=bytes)
            hdf.create_dataset('symbols', (len(self.elements), 1),
                               string_type, self.element_symbols())
            hdf.create_dataset('molecules', data=self.molecule_matrix())
            hdf.create_dataset('reactions', data=self.reaction_matrix())
In [30]:
saver=HDF5SavingSystem(s2)
In [31]:
saver.element_symbols()
Out[31]:
[b'C', b'O', b'H']
In [32]:
saver.molecule_matrix()
Out[32]:
array([[ 1,  0,  0,  0,  6],
       [ 2,  1,  2,  0,  6],
       [ 0,  2,  0,  2, 12]])
In [33]:
saver.reaction_matrix()
Out[33]:
array([[ 6,  0],
       [ 6,  2],
       [-6, -1],
       [ 0, -2],
       [-1,  0]])
In [34]:
saver.write('foo.hdf5')
In [35]:
import h5py
# Open explicitly read-only: the default mode differs between h5py versions,
# and this cell only reads the datasets back.
hdf_load = h5py.File('foo.hdf5', 'r')
In [36]:
np.array(hdf_load['reactions'])
Out[36]:
array([[ 6,  0],
       [ 6,  2],
       [-6, -1],
       [ 0, -2],
       [-1,  0]])

Using a sparse matrix storage would be even better here, but we don't have time for that!

In [ ]: