XClose
Menu

Markup Languages

XML and its relatives are based on the idea of marking up content with labels that describe its purpose:

<name>James</name> is a <job>Programmer</job>

One of the easiest ways to produce a file format based on a markup language is to use a templating language.

In [1]:
import mako

from parsereactions import parser

from IPython.display import display, Math
system=parser.parse(open('system.tex').read())
display(Math(str(system)))
$\displaystyle C_6H_{12}O_6 + 6O_2 \rightarrow 6CO_2 + 6H_2O\\ 2H_2 + O_2 \rightarrow 2H_2O$
In [2]:
%%writefile chemistry_template.mko
<?xml version="1.0" encoding="UTF-8"?>
<system>
    %for reaction in reactions:
    <reaction>
        <reactants>
        %for molecule in reaction.reactants.molecules:
            <molecule stoichiometry="${reaction.reactants.molecules[molecule]}">
                % for element in molecule.elements:
                    <element symbol="${element.symbol}" number="${molecule.elements[element]}"/>
                % endfor
            </molecule>
        %endfor
        </reactants>
        <products>
        %for molecule in reaction.products.molecules:
            <molecule stoichiometry="${reaction.products.molecules[molecule]}">
            % for element in molecule.elements:
                    <element symbol="${element.symbol}" number="${molecule.elements[element]}"/>
            % endfor
            </molecule>
        %endfor
        </products>
    </reaction>
    %endfor
</system>
Writing chemistry_template.mko
In [3]:
from mako.template import Template

mytemplate = Template(filename='chemistry_template.mko')
with open('system.xml','w') as xmlfile:
    xmlfile.write((mytemplate.render( **vars(system))))
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-3-6df4e1bc4785> in <module>
      3 mytemplate = Template(filename='chemistry_template.mko')
      4 with open('system.xml','w') as xmlfile:
----> 5     xmlfile.write((mytemplate.render( **vars(system))))

~/virtualenv/python3.6.3/lib/python3.6/site-packages/mako/template.py in render(self, *args, **data)
    460 
    461         """
--> 462         return runtime._render(self, self.callable_, args, data)
    463 
    464     def render_unicode(self, *args, **data):

~/virtualenv/python3.6.3/lib/python3.6/site-packages/mako/runtime.py in _render(template, callable_, args, data, as_unicode)
    836 
    837     _render_context(template, callable_, context, *args,
--> 838                     **_kwargs_for_callable(callable_, data))
    839     return context._pop_buffer().getvalue()
    840 

~/virtualenv/python3.6.3/lib/python3.6/site-packages/mako/runtime.py in _render_context(tmpl, callable_, context, *args, **kwargs)
    871         # if main render method, call from the base of the inheritance stack
    872         (inherit, lclcontext) = _populate_self_namespace(context, tmpl)
--> 873         _exec_template(inherit, lclcontext, args=args, kwargs=kwargs)
    874     else:
    875         # otherwise, call the actual rendering method specified

~/virtualenv/python3.6.3/lib/python3.6/site-packages/mako/runtime.py in _exec_template(callable_, context, args, kwargs)
    897             _render_error(template, context, e)
    898     else:
--> 899         callable_(context, *args, **kwargs)
    900 
    901 

chemistry_template_mko in render_body(context, **pageargs)

AttributeError: 'str' object has no attribute 'symbol'
In [4]:
!cat system.xml

Markup languages are verbose (jokingly called the "angle bracket tax") but very clear.

Data as text

The above serialisation specifies all data as XML "Attributes". An alternative is to put the data in the text:

In [5]:
%%writefile chemistry_template2.mko
<?xml version="1.0" encoding="UTF-8"?>
<system>
    %for reaction in reactions:
    <reaction>
        <reactants>
        %for molecule in reaction.reactants.molecules:
            <molecule stoichiometry="${reaction.reactants.molecules[molecule]}">
                % for element in molecule.elements:
                    <element symbol="${element.symbol}">${molecule.elements[element]}</element>
                % endfor
            </molecule>
        %endfor
        </reactants>
        <products>
        %for molecule in reaction.products.molecules:
            <molecule stoichiometry="${reaction.products.molecules[molecule]}">
            % for element in molecule.elements:
                    <element symbol="${element.symbol}">${molecule.elements[element]}</element>
            % endfor
            </molecule>
        %endfor
        </products>
    </reaction>
    %endfor
</system>
Writing chemistry_template2.mko
In [6]:
from mako.template import Template

mytemplate = Template(filename='chemistry_template2.mko')
with open('system2.xml','w') as xmlfile:
    xmlfile.write((mytemplate.render( **vars(system))))
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-6-33b9a5f81e70> in <module>
      3 mytemplate = Template(filename='chemistry_template2.mko')
      4 with open('system2.xml','w') as xmlfile:
----> 5     xmlfile.write((mytemplate.render( **vars(system))))

~/virtualenv/python3.6.3/lib/python3.6/site-packages/mako/template.py in render(self, *args, **data)
    460 
    461         """
--> 462         return runtime._render(self, self.callable_, args, data)
    463 
    464     def render_unicode(self, *args, **data):

~/virtualenv/python3.6.3/lib/python3.6/site-packages/mako/runtime.py in _render(template, callable_, args, data, as_unicode)
    836 
    837     _render_context(template, callable_, context, *args,
--> 838                     **_kwargs_for_callable(callable_, data))
    839     return context._pop_buffer().getvalue()
    840 

~/virtualenv/python3.6.3/lib/python3.6/site-packages/mako/runtime.py in _render_context(tmpl, callable_, context, *args, **kwargs)
    871         # if main render method, call from the base of the inheritance stack
    872         (inherit, lclcontext) = _populate_self_namespace(context, tmpl)
--> 873         _exec_template(inherit, lclcontext, args=args, kwargs=kwargs)
    874     else:
    875         # otherwise, call the actual rendering method specified

~/virtualenv/python3.6.3/lib/python3.6/site-packages/mako/runtime.py in _exec_template(callable_, context, args, kwargs)
    897             _render_error(template, context, e)
    898     else:
--> 899         callable_(context, *args, **kwargs)
    900 
    901 

chemistry_template2_mko in render_body(context, **pageargs)

AttributeError: 'str' object has no attribute 'symbol'
In [7]:
!cat system2.xml

Parsing XML

XML is normally parsed by building a tree-structure of all the tags in the file, called a DOM or Document Object Model.

In [8]:
from lxml import etree
In [9]:
tree = etree.parse(open('system.xml'))
Traceback (most recent call last):

  File "/home/travis/virtualenv/python3.6.3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)

  File "<ipython-input-9-67b89becf134>", line 1, in <module>
    tree = etree.parse(open('system.xml'))

  File "src/lxml/etree.pyx", line 3424, in lxml.etree.parse

  File "src/lxml/parser.pxi", line 1861, in lxml.etree._parseDocument

  File "src/lxml/parser.pxi", line 1881, in lxml.etree._parseFilelikeDocument

  File "src/lxml/parser.pxi", line 1776, in lxml.etree._parseDocFromFilelike

  File "src/lxml/parser.pxi", line 1187, in lxml.etree._BaseParser._parseDocFromFilelike

  File "src/lxml/parser.pxi", line 601, in lxml.etree._ParserContext._handleParseResultDoc

  File "src/lxml/parser.pxi", line 711, in lxml.etree._handleParseResult

  File "src/lxml/parser.pxi", line 640, in lxml.etree._raiseParseError

  File "/home/travis/build/alan-turing-institute/rsd-engineeringcourse/ch09fileformats/system.xml", line 1
XMLSyntaxError: Document is empty, line 1, column 1
In [10]:
print(etree.tostring(tree, pretty_print=True, encoding=str))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-2bd695001e8a> in <module>
----> 1 print(etree.tostring(tree, pretty_print=True, encoding=str))

NameError: name 'tree' is not defined

We can navigate the tree, with each element being an iterable yielding its children:

In [11]:
tree.getroot()[0][0][1].attrib['stoichiometry']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-a89836e5168c> in <module>
----> 1 tree.getroot()[0][0][1].attrib['stoichiometry']

NameError: name 'tree' is not defined

Searching XML

xpath is a sophisticated tool for searching XML DOMs:

In [12]:
tree.xpath('//molecule/element[@number="1"]/@symbol')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-9a9f01ed5bc8> in <module>
----> 1 tree.xpath('//molecule/element[@number="1"]/@symbol')

NameError: name 'tree' is not defined

It is useful to understand grammars like these using the "FOR-LET-WHERE-ORDER-RETURN" (FLWOR, pronounced "flower") model.

The above says: "For each element in each molecule, where number is one, return symbol", roughly equivalent to [element.symbol for molecule in document for element in molecule if element.number==1] in Python.

In [13]:
etree.parse(open('system2.xml')).xpath('//molecule[element=1]//@symbol')
Traceback (most recent call last):

  File "/home/travis/virtualenv/python3.6.3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)

  File "<ipython-input-13-a5f1b225e455>", line 1, in <module>
    etree.parse(open('system2.xml')).xpath('//molecule[element=1]//@symbol')

  File "src/lxml/etree.pyx", line 3424, in lxml.etree.parse

  File "src/lxml/parser.pxi", line 1861, in lxml.etree._parseDocument

  File "src/lxml/parser.pxi", line 1881, in lxml.etree._parseFilelikeDocument

  File "src/lxml/parser.pxi", line 1776, in lxml.etree._parseDocFromFilelike

  File "src/lxml/parser.pxi", line 1187, in lxml.etree._BaseParser._parseDocFromFilelike

  File "src/lxml/parser.pxi", line 601, in lxml.etree._ParserContext._handleParseResultDoc

  File "src/lxml/parser.pxi", line 711, in lxml.etree._handleParseResult

  File "src/lxml/parser.pxi", line 640, in lxml.etree._raiseParseError

  File "/home/travis/build/alan-turing-institute/rsd-engineeringcourse/ch09fileformats/system2.xml", line 1
XMLSyntaxError: Document is empty, line 1, column 1

Note how we select on text content rather than attributes by using the element tag directly. The above says "for every molecule where at least one element is present with just a single atom, return all the symbols of all the elements in that molecule."

Transforming XML : XSLT

Two technologies (XSLT and XQuery) provide the capability to produce text output from an XML tree.

We'll look at XSLT as support is more widespread, including in the Python library we're using. XQuery is probably easier to use and understand, but has less support.

However, XSLT is a beautiful functional declarative language, once you read past the angle-brackets.

Here's an XSLT to transform our reaction system into a LaTeX representation:

In [14]:
%%writefile xmltotex.xsl

<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" indent="yes" omit-xml-declaration="yes" />
        
    <xsl:template match="//reaction">
        <xsl:apply-templates select="reactants"/>
        <xsl:text> \rightarrow </xsl:text>
        <xsl:apply-templates select="products"/>
        <xsl:text>\\&#xa;</xsl:text>
    </xsl:template>
        
    <xsl:template match="//molecule[position()!=1]">
        <xsl:text> + </xsl:text>
        <xsl:apply-templates select="@stoichiometry"/>
        <xsl:apply-templates/>
    </xsl:template>
        
    <xsl:template match="@stoichiometry[.='1']"/>
    <!-- do not copy 1-stoichiometries -->
    
    <!-- Otherwise, use the default template for attributes, which is just to copy value -->
        
    <xsl:template match="//molecule[position()=1]">
        <xsl:apply-templates select="@* | *"/> 
    </xsl:template>
    
    <xsl:template match="//element">
        <xsl:value-of select="@symbol"/>
        <xsl:apply-templates select="@number"/>
    </xsl:template>
        
    <xsl:template match="@number[.=1]"/>
    <!-- do not copy 1-numbers -->
    
    <xsl:template match="@number[.!=1][10>.]">
        <xsl:text>_</xsl:text>
        <xsl:value-of select="."/>
    </xsl:template>
        
    <xsl:template match="@number[.!=1][.>9]">
        <xsl:text>_{</xsl:text>
        <xsl:value-of select="."/>
        <xsl:text>}</xsl:text>          
    </xsl:template>
        
    <xsl:template match="text()" />
    <!-- Do not copy input whitespace to output -->
</xsl:stylesheet>
Writing xmltotex.xsl
In [15]:
transform=etree.XSLT(etree.XML(open("xmltotex.xsl").read()))
In [16]:
print(str(transform(tree)))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-16-61fb34f0ad1e> in <module>
----> 1 print(str(transform(tree)))

NameError: name 'tree' is not defined
In [17]:
display(Math(str(transform(tree))))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-17-2d24b1acdacb> in <module>
----> 1 display(Math(str(transform(tree))))

NameError: name 'tree' is not defined

Validating XML : Schema

XML Schema is a way to define what structure an XML file is allowed to have: which attributes and tags should exist where.

You should always define one of these when using an XML file format.

In [18]:
%%writefile reactions.xsd

<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">

<xs:element name="element">
<xs:complexType>
    <xs:attribute name="symbol" type="xs:string"/>
    <xs:attribute name="number" type="xs:integer"/>
</xs:complexType>
</xs:element>
    
<xs:element name="molecule">
    <xs:complexType>
        <xs:sequence>
            <xs:element ref="element" maxOccurs="unbounded"/>
        </xs:sequence>
        <xs:attribute name="stoichiometry" type="xs:integer"/>
    </xs:complexType>
</xs:element>
    
<xs:element name="reaction">
    <xs:complexType>
        <xs:sequence>
        <xs:element name="reactants">
            <xs:complexType>
                <xs:sequence>
                    <xs:element ref="molecule" maxOccurs="unbounded"/>
                </xs:sequence>
            </xs:complexType>
        </xs:element>
        <xs:element name="products">
            <xs:complexType>
                <xs:sequence>
                    <xs:element ref="molecule" maxOccurs="unbounded"/>
                </xs:sequence>
            </xs:complexType>
        </xs:element>
        </xs:sequence>
    </xs:complexType>
</xs:element>

<xs:element name="system">
<xs:complexType>
    <xs:sequence>
        <xs:element ref="reaction" maxOccurs="unbounded"/>
    </xs:sequence>
</xs:complexType>
</xs:element>   
    
</xs:schema>
Writing reactions.xsd
In [19]:
schema = etree.XMLSchema(etree.XML(open("reactions.xsd").read()))
In [20]:
parser = etree.XMLParser(schema = schema)
In [21]:
tree = etree.parse(open('system.xml'),parser)
Traceback (most recent call last):

  File "/home/travis/virtualenv/python3.6.3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)

  File "<ipython-input-21-908014410f7b>", line 1, in <module>
    tree = etree.parse(open('system.xml'),parser)

  File "src/lxml/etree.pyx", line 3424, in lxml.etree.parse

  File "src/lxml/parser.pxi", line 1861, in lxml.etree._parseDocument

  File "src/lxml/parser.pxi", line 1881, in lxml.etree._parseFilelikeDocument

  File "src/lxml/parser.pxi", line 1776, in lxml.etree._parseDocFromFilelike

  File "src/lxml/parser.pxi", line 1187, in lxml.etree._BaseParser._parseDocFromFilelike

  File "src/lxml/parser.pxi", line 601, in lxml.etree._ParserContext._handleParseResultDoc

  File "src/lxml/parser.pxi", line 711, in lxml.etree._handleParseResult

  File "src/lxml/parser.pxi", line 649, in lxml.etree._raiseParseError

  File "/home/travis/build/alan-turing-institute/rsd-engineeringcourse/ch09fileformats/system.xml", line 1
XMLSyntaxError: line 1: b'Document is empty'

Compare parsing something that is not valid under the schema:

In [24]:
%%writefile invalid_system.xml

<system>
    <reaction>
        <reactants>
            <molecule stoichiometry="two">
                    <element symbol="H" number="2"/>
            </molecule>
            <molecule stoichiometry="1">
                    <element symbol="O" number="2"/>
            </molecule>
        </reactants>
        <products>
            <molecule stoichiometry="2">
                    <element symbol="H" number="2"/>
                    <element symbol="O" number="1"/>
            </molecule>
        </products>
    </reaction>
</system>
In [22]:
tree = etree.parse(open('invalid_system.xml'),parser)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-22-84e81d0435a0> in <module>
----> 1 tree = etree.parse(open('invalid_system.xml'),parser)

FileNotFoundError: [Errno 2] No such file or directory: 'invalid_system.xml'