XClose
Menu

Domain specific languages

Lex and Yacc

Let's go back to our nice looks-like LaTeX file format:

In [1]:
%%writefile system.py

class Element:
    def __init__(self, symbol):
        self.symbol = symbol
        
    def __str__(self):
        return str(self.symbol)
        
class Molecule:
    def __init__(self):
        self.elements= {} # Map from element to number of that element in the molecule
        
    def add_element(self, element, number):
        self.elements[element] = number
    
    @staticmethod
    def as_subscript(number):
        if number==1:
            return ""
        
        if number<10:
            return "_"+str(number)
        else:
            return "_{"+str(number)+"}"
    
    def __str__(self):
        return ''.join(
            [str(element)+Molecule.as_subscript(self.elements[element])
             for element in self.elements])  

class Side:
    def __init__(self):
        self.molecules={}
    def add(self, reactant, stoichiometry):
        self.molecules[reactant]=stoichiometry
        
    @staticmethod
    def print_if_not_one(number):
        if number==1:
            return ''
        else: return str(number)
    
    def __str__(self):
        return " + ".join(
            [Side.print_if_not_one(self.molecules[molecule]) + 
             str(molecule) for molecule in self.molecules])
    
class Reaction:
    def __init__(self):
        self.reactants = Side() 
        self.products = Side()
        
    def __str__(self):
        return (str(self.reactants) + 
                  " \\rightarrow " + 
                  str(self.products))

class System:       
    def __init__(self):
        self.reactions=[]

    def add_reaction(self, reaction):
        self.reactions.append(reaction)
        
    def __str__(self):
        return "\\\\ \n".join(map(str,self.reactions))
Writing system.py
In [2]:
from system import *
s2=System()

c=Element("C")
o=Element("O")
h=Element("H")

co2 = Molecule()
co2.add_element(c,1)
co2.add_element(o,2)

h2o = Molecule()
h2o.add_element(h,2)
h2o.add_element(o,1)

o2 = Molecule()
o2.add_element(o,2)

h2 = Molecule()
h2.add_element(h,2)

glucose = Molecule()
glucose.add_element(c,6)
glucose.add_element(h,12)
glucose.add_element(o,6)

combustion_glucose=Reaction()
combustion_glucose.reactants.add(glucose,  1)
combustion_glucose.reactants.add(o2, 6)
combustion_glucose.products.add(co2, 6)
combustion_glucose.products.add(h2o, 6)
s2.add_reaction(combustion_glucose)


combustion_hydrogen = Reaction()
combustion_hydrogen.reactants.add(h2,2)
combustion_hydrogen.reactants.add(o2,1)
combustion_hydrogen.products.add(h2o,2)
s2.add_reaction(combustion_hydrogen)

print(s2)
C_6H_{12}O_6 + 6O_2 \rightarrow 6CO_2 + 6H_2O\\ 
2H_2 + O_2 \rightarrow 2H_2O
In [3]:
from IPython.display import display, Math
display(Math(str(s2)))
$\displaystyle C_6H_{12}O_6 + 6O_2 \rightarrow 6CO_2 + 6H_2O\\ 2H_2 + O_2 \rightarrow 2H_2O$

How might we write a parser for this? Clearly we'll encounter the problems we previously solved in ensuring each molecule is the and atom only gets one object instance, but we solved this by using an appropriate primary key. (The above implementation is designed to make this easy, learning from the previous lecture.)

But we'll also run into a bunch of problems which are basically string parsing : noting, for example, that _2 and _{12} make a number of atoms in a molecule, or that + joins molecules.

This will be very hard to do with straightforward python string processing.

In [4]:
import ply

Instead, we will use a tool called "Lexx and Yacc", which allows us to define the grammar of our file format.

The theory of "context free grammars" is rich and deep, and we will just scratch the surface here.

Tokenising with Lex

First, we need to turn our file into a "token stream", using regular expressions to match the kinds of symbol in our source:

In [5]:
%%writefile lexreactions.py

from ply import lex

tokens = (
'ELEMENT','NUMBER','SUBSCRIPT','LBRACE','RBRACE',
'PLUS','ARROW','NEWLINE','TEXNEWLINE'
)

# Tokens

t_PLUS    = r'\+'
t_SUBSCRIPT = r'_'
t_LBRACE  = r'\{'
t_RBRACE  = r'\}'
t_TEXNEWLINE = r'\\\\'
t_ARROW = r'\\rightarrow'
t_ELEMENT    = r'[A-Z][a-z]*?'

def t_NUMBER(t):
    r'\d+'
    t.value = int(t.value)
    return t

t_ignore  = ' '

def t_NEWLINE(t):
    r'\n+'
    return t

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
lexer = lex.lex()
Writing lexreactions.py
In [6]:
from lexreactions import lexer
In [7]:
tokens=[]
lexer.input(str(s2))
while True:
    tok = lexer.token()
    if not tok: 
        break      # No more input
    tokens.append(tok)
In [8]:
tokens
Out[8]:
[LexToken(ELEMENT,'C',1,0),
 LexToken(SUBSCRIPT,'_',1,1),
 LexToken(NUMBER,6,1,2),
 LexToken(ELEMENT,'H',1,3),
 LexToken(SUBSCRIPT,'_',1,4),
 LexToken(LBRACE,'{',1,5),
 LexToken(NUMBER,12,1,6),
 LexToken(RBRACE,'}',1,8),
 LexToken(ELEMENT,'O',1,9),
 LexToken(SUBSCRIPT,'_',1,10),
 LexToken(NUMBER,6,1,11),
 LexToken(PLUS,'+',1,13),
 LexToken(NUMBER,6,1,15),
 LexToken(ELEMENT,'O',1,16),
 LexToken(SUBSCRIPT,'_',1,17),
 LexToken(NUMBER,2,1,18),
 LexToken(ARROW,'\\rightarrow',1,20),
 LexToken(NUMBER,6,1,32),
 LexToken(ELEMENT,'C',1,33),
 LexToken(ELEMENT,'O',1,34),
 LexToken(SUBSCRIPT,'_',1,35),
 LexToken(NUMBER,2,1,36),
 LexToken(PLUS,'+',1,38),
 LexToken(NUMBER,6,1,40),
 LexToken(ELEMENT,'H',1,41),
 LexToken(SUBSCRIPT,'_',1,42),
 LexToken(NUMBER,2,1,43),
 LexToken(ELEMENT,'O',1,44),
 LexToken(TEXNEWLINE,'\\\\',1,45),
 LexToken(NEWLINE,'\n',1,48),
 LexToken(NUMBER,2,1,49),
 LexToken(ELEMENT,'H',1,50),
 LexToken(SUBSCRIPT,'_',1,51),
 LexToken(NUMBER,2,1,52),
 LexToken(PLUS,'+',1,54),
 LexToken(ELEMENT,'O',1,56),
 LexToken(SUBSCRIPT,'_',1,57),
 LexToken(NUMBER,2,1,58),
 LexToken(ARROW,'\\rightarrow',1,60),
 LexToken(NUMBER,2,1,72),
 LexToken(ELEMENT,'H',1,73),
 LexToken(SUBSCRIPT,'_',1,74),
 LexToken(NUMBER,2,1,75),
 LexToken(ELEMENT,'O',1,76)]

Writing a grammar

So, how do we express our algebra for chemical reactions as a grammar?

We write a series of production rules, expressing how a system is made up of equations, an equation is made up of molecules etc:

system : equation
system : system TEXNEWLINE NEWLINE equation
equation : side ARROW side
side : molecules
molecules : molecule
molecules : NUMBER molecule
side : side PLUS molecules
molecule : countedelement
countedelement : ELEMENT
countedelement : ELEMENT atomcount
molecule : molecule countedelement
atomcount : SUBSCRIPT NUMBER
atomcount : SUBSCRIPT LBRACE NUMBER RBRACE

Note how we right that a system is made of more than one equation:

system : equation # A system could be one equation
system : system NEWLINE equation # Or it could be a system then an equation

... which implies, recursively, that a system could also be:

system: equation NEWLINE equation NEWLINE equation ...

Parsing with Yacc

A parser defined with Yacc builds up the final object, by breaking down the file according to the rules of the grammar, and then building up objects as the individual tokens coalesce into the full grammar.

Here, we will for clarity not attempt to solve the problem of having multiple molecule instances for the same molecule - the normalisation problem solved last lecture.

In [9]:
%%writefile parsereactions.py

# Yacc example
from system import *

import ply.yacc as yacc

# Get the token map from the lexer.  This is required.
from lexreactions import tokens

def p_expression_system(p):
    'system : equation'
    p[0]=System()
    p[0].add_reaction(p[1])

def p_expression_combine_system(p):
    'system : system TEXNEWLINE NEWLINE equation'
    p[0]=p[1]
    p[0].add_reaction(p[4])

def p_equation(p):
    'equation : side ARROW side'
    p[0] = Reaction()
    p[0].reactants = p[1]
    p[0].products = p[3]

def p_side(p):
    'side : molecules'
    p[0] = Side()
    p[0].add(p[1][0],p[1][1])

def p_molecules(p):
    'molecules : molecule'
    p[0]=(p[1],1)

def p_stoichiometry(p):
    'molecules : NUMBER molecule'
    p[0]=(p[2],p[1])

def p_plus(p):
    'side : side PLUS molecules'
    p[0]=p[1]
    p[0].add(p[3][0],p[3][1])

def p_molecule(p):
    'molecule : countedelement'
    p[0]= Molecule()
    p[0].add_element(p[1][0],p[1][1])

def p_countedelement(p):
    'countedelement : ELEMENT'
    p[0]=(p[1], 1)
    
def p_ncountedelement(p):
    'countedelement : ELEMENT atomcount'
    p[0]=(p[1], p[2])

def p_multi_element(p):
    'molecule : molecule countedelement'
    p[0]=p[1]
    p[0].add_element(p[2][0],p[2][1])

def p_multi_atoms(p):
    'atomcount : SUBSCRIPT NUMBER'
    p[0]=int(p[2])

def p_many_atoms(p):
    'atomcount : SUBSCRIPT LBRACE NUMBER RBRACE'
    p[0]=int(p[3])

# Error rule for syntax errors
def p_error(p):
    print("Syntax error in input!")   

# Build the parser
parser = yacc.yacc()
Writing parsereactions.py
In [10]:
from parsereactions import parser

roundtrip_system=parser.parse(str(s2))
Generating LALR tables
In [11]:
#!cat parser.out
In [12]:
display(Math(str(roundtrip_system)))
$\displaystyle C_6H_{12}O_6 + 6O_2 \rightarrow 6CO_2 + 6H_2O\\ 2H_2 + O_2 \rightarrow 2H_2O$
In [13]:
with open('system.tex','w') as texfile:
    texfile.write(str(roundtrip_system))

Internal DSLs

In doing the above, we have defined what is called an "external DSL": our code is in Python, but the file format is a language with a grammar of its own.

However, we can use the language itself to define something almost as fluent, without having to write our own grammar, by using operator overloading and metaprogramming tricks:

In [14]:
%%writefile reactionsdsl.py


class Element:
    def __init__(self, symbol):
        self.symbol = symbol
        
    def __str__(self):
        return str(self.symbol)
    
    def __truediv__(self, number):
        res = Molecule()
        res.add_element(self, number)
        return res
        
class Molecule:
    def __init__(self):
        self.elements= {} # Map from element to number of that element in the molecule
        
    def add_element(self, element, number):
        self.elements[element] = number
    
    @staticmethod
    def as_subscript(number):
        if number==1:
            return ""
        
        if number<10:
            return "_"+str(number)
        else:
            return "_{"+str(number)+"}"
    
    def __str__(self):
        return ''.join(
            [str(element)+Molecule.as_subscript(self.elements[element])
             for element in self.elements])
    
    def __mul__(self, other):
        if type(other)==Molecule:
            self.elements.update(other.elements)
        else:
            self.add_element(other,1)
        return self
    
    def __rmul__(self, stoich):
        res = Side()
        res.add(self, stoich)
        return res
    
    def __add__(self, other):
        if type(other)==Side:
            other.molecules[self]=1
            return other
        else:
            res=Side()
            res.add(self,1)
            res.add(other,1)

class Side:
    def __init__(self):
        self.molecules={}
        
    def add(self, reactant, stoichiometry):
        self.molecules[reactant]=stoichiometry
        
    @staticmethod
    def print_if_not_one(number):
        if number==1:
            return ''
        else: return str(number)
    
    def __str__(self):
        return " + ".join(
            [Side.print_if_not_one(self.molecules[molecule]) + 
             str(molecule) for molecule in self.molecules])
    
    def __add__(self, other):
        self.molecules.update(other.molecules)
        return self
    
    def __eq__(self, other):
        res = Reaction()
        res.reactants = self
        res.products = other
        current_system.add_reaction(res) # Closure!
        return "Created"
    
class Reaction:
    def __init__(self):
        self.reactants = Side() 
        self.products = Side()
        
    def __str__(self):
        return (str(self.reactants) + 
                  " \\rightarrow " + 
                  str(self.products))

class System:       
    def __init__(self):
        self.reactions=[]

    def add_reaction(self, reaction):
        self.reactions.append(reaction)
        
    def __str__(self):
        return "\\\\ \n".join(map(str,self.reactions))

def elements(mvars, *elements):
    for element in elements:
        mvars[element]=Element(element)
        
current_system = System()
Writing reactionsdsl.py
In [15]:
from reactionsdsl import *
In [16]:
elements(globals(),'C','N','O','H')
In [17]:
C
Out[17]:
<reactionsdsl.Element at 0x2b45c22d7438>
In [18]:
O/2+2*(H/2) == 2*(H/2*O)
Out[18]:
'Created'
In [19]:
display(Math(str(current_system)))
$\displaystyle 2H_2 + O_2 \rightarrow 2H_2O$

Python is not perfect for this, because it lacks the idea of parenthesis- free function dispatch and other things that make internal DSLs pretty.

In [ ]: