#!/usr/bin/env python
# Copyright (c) 2015-2016, Juniper Networks, Inc.
# All rights reserved.
#
# Copyright (C) 2012 Martin Blech and individual contributors.
#
# See the LICENSE file for further information.
"""Module that provides XML parsing."""
from __future__ import absolute_import
from xml.parsers import expat
from . import parser_defaults, parsing_increment, StringIO, _unicode
from ._parsehandler import _DictSAXHandler
# Compatibility shims: pick a bytes-oriented buffer class and a "bytes" type
# that both exist on whichever interpreter is running this module.
try: # pragma no cover
    from io import BytesIO # pylint: disable=wrong-import-order
except ImportError: # pragma no cover
    # No io.BytesIO available: reuse the StringIO imported from this package.
    BytesIO = StringIO
try: # pragma no cover
    _bytes = bytes
except NameError: # pragma no cover
    # No ``bytes`` builtin on this interpreter: treat ``str`` as the byte type.
    _bytes = str

# Names exported by ``from <this module> import *``.
__all__ = ['Parser', 'parse']
class Parser(object):
    """Creates Python data structures from raw XML.

    This class creates a callable object used to parse XML into Python data
    structures. You can provide optional parameters at the class creation time.
    These parameters modify the default behavior of the parser. When you invoke
    the callable object to parse a document, you can supply additional
    parameters to override the values specified when the :py:class:`Parser`
    object was created.

    General usage is::

        >>> myparser = Parser()
        >>> root = myparser("<a>foo</a>")

    Calling a :py:class:`Parser` object returns an :py:class:`XMLDictNode`
    containing the parsed XML tree.

    In this example, ``root`` is an :py:class:`XMLDictNode` which contains a
    representation of the parsed XML::

        >>> isinstance(root, XMLDictNode)
        True
        >>> root.prettyprint()
        {u'a': u'foo'}
        >>> print root.emit_xml()
        <?xml version="1.0" encoding="utf-8"?>
        <a>foo</a>

    If you will just be using a parser once, you can just use the
    :py:meth:`parse` method, which is a shortcut way of creating a
    :py:class:`Parser` class and calling it all in one call. You can provide
    the same arguments to the :py:meth:`parse` method that you provide to the
    :py:class:`Parser` class.

    For example::

        >>> root = jxmlease.parse('<a x="y"><b>1</b><b>2</b><b>3</b></a>')
        >>> root.prettyprint()
        {u'a': {u'b': [u'1', u'2', u'3']}}

    It is possible to call a :py:class:`Parser` object as a generator by
    specifying the :py:obj:`generator` parameter. The :py:obj:`generator`
    parameter contains a list of paths to match. If paths are provided in this
    parameter, the behavior of the parser is changed. Instead of returning the
    root node of a parsed XML hierarchy, the parser returns a generator object.
    On each call to the generator object, it will return the next node that
    matches one of the provided paths.

    Paths are provided in a format similar to XPath expressions. For example,
    ``/a/b`` will match node ``<b>`` in this XML::

        <a>
            <b/>
        </a>

    If a path begins with a ``/``, it must exactly match the full path to a
    node. If a path does not begin with a ``/``, it must exactly match the
    "right side" of the path to a node. For example, consider this XML::

        <a>
            <b>
                <c/>
            </b>
        </a>

    In this example, ``/a/b/c``, ``c``, ``b/c``, and ``a/b/c`` all match the
    ``<c>`` node.

    For each match, the generator returns a tuple of:
    ``(path,match_string,xml_node)``, where the *path* is
    the calculated absolute path to the matching node, *match_string* is the
    user-supplied match string that triggered the match, and *xml_node* is the
    object representing that node (an instance of a :py:class:`XMLNodeBase`
    subclass).

    For example::

        >>> xml = '<a x="y"><b>1</b><b>2</b><b>3</b></a>'
        >>> myparser = Parser(generator=["/a/b"])
        >>> for (path, match, value) in myparser(xml):
        ...   print "%s: %s" % (path, value)
        ...
        /a/b: 1
        /a/b: 2
        /a/b: 3

    When calling the parser, you can specify all of these parameters. When
    creating a parsing instance, you can specify all of these parameters
    except :py:obj:`xml_input`:

    Args:
        xml_input (string or file-like object): Contains the XML to parse.
        encoding (string or None): The input's encoding. If not provided, this
            defaults to 'utf-8'.
        expat (An expat, or equivalent, parser class): Used for parsing the XML
            input. If not provided, defaults to the expat parser in
            :py:data:`xml.parsers`.
        process_namespaces (bool): If True, namespaces in tags and attributes
            are converted to their full URL value. If False (the default), the
            namespaces in tags and attributes are left unchanged.
        namespace_separator (string): If :py:obj:`process_namespaces` is True,
            this specifies the separator that expat should use between
            namespaces and identifiers in tags and attributes.
        xml_attribs (bool): If True (the default), include XML attributes.
            If False, ignore them.
        strip_whitespace (bool): If True (the default), strip whitespace
            at the start and end of CDATA. If False, keep all whitespace.
        namespaces (`dict`): A remapping for namespaces. If supplied, identifiers
            with a namespace prefix will have their namespace prefix rewritten
            based on the dictionary. The code will look for
            :py:obj:`namespaces[current_namespace]`. If found,
            :py:obj:`current_namespace` will be replaced with the result of
            the lookup.
        strip_namespace (bool): If True, the namespace prefix will be
            removed from all identifiers. If False (the default), the namespace
            prefix will be retained.
        cdata_separator (string): When encountering "semi-structured" XML
            (where the XML has CDATA and tags intermixed at the same level), the
            :py:obj:`cdata_separator` will be placed between the different
            groups of CDATA. By default, the :py:obj:`cdata_separator`
            parameter is '', which results in the CDATA groups being
            concatenated without separator.
        generator (list of strings): A list of paths to match. If paths are
            provided here, the behavior of the parser is changed. Instead of
            returning the root node of a parsed XML hierarchy, the parser
            returns a :py:obj:`generator` object. On each call to the
            :py:obj:`generator` object, it will return the next node that
            matches one of the provided paths.

    Returns:
        A callable instance of the :py:class:`Parser` class.

        Calling a :py:class:`Parser` object returns an :py:class:`XMLDictNode`
        containing the parsed XML tree.

        Alternatively, if the :py:obj:`generator` parameter is specified, a
        :py:obj:`generator` object is returned.
    """

    def __init__(self, **kwargs):
        """See class documentation."""
        # Layer the argument sources from lowest to highest priority:
        # built-in defaults, then the package-wide ``parser_defaults``, then
        # the arguments given to this constructor. The merged result is kept
        # so that every later call can start from it.
        self._default_kwargs = dict(encoding=None, expat=expat,
                                    process_namespaces=False,
                                    namespace_separator=":")
        self._default_kwargs.update(parser_defaults)
        self._default_kwargs.update(kwargs)

        # Split the merged arguments into parser-level and handler-level ones.
        self._process_args()

        # Build a handler right away so that invalid handler arguments are
        # reported at construction time rather than on the first parse.
        self._make_handler()

        # Likewise build a trial parser to validate the parser arguments. It
        # is thrown away (a new one is created on every call) because the
        # encoding actually needed is not known until there is input.
        if not self._encoding:
            self._encoding = 'utf-8'
        self._make_parser()

        # Keep the handler built above so the first plain call can reuse it.
        self._default_handler = self._handler
        self._handler = None

    def _process_args(self, **kwargs):
        """Merge per-call overrides into the stored defaults.

        Sets ``self._kwargs`` to a copy of the default arguments updated with
        ``kwargs``, then removes ``encoding``, ``expat`` and
        ``process_namespaces`` from it and stores them on the instance.
        Whatever remains in ``self._kwargs`` is what gets passed to the
        handler class.
        """
        # Work on a copy so the stored defaults are never modified by a call.
        self._kwargs = dict(self._default_kwargs)
        self._kwargs.update(kwargs)
        # These three configure the parser, not the handler.
        self._encoding = self._kwargs.pop('encoding')
        self._expat = self._kwargs.pop('expat')
        self._process_namespaces = self._kwargs.pop('process_namespaces')

    def _make_handler(self):
        """Create a new handler from the current handler arguments."""
        # pylint: disable=unexpected-keyword-arg
        self._handler = _DictSAXHandler(**self._kwargs)

    def _make_parser(self):
        """Create a new parser and connect it to the current handler."""
        # We don't need a namespace separator if we're not processing
        # namespaces.
        if self._process_namespaces:
            namespace_separator = self._kwargs['namespace_separator']
        else:
            namespace_separator = None
        self._parser = self._expat.ParserCreate(
            self._encoding, namespace_separator
        )
        # Setup some parser attributes
        self._parser.buffer_text = True
        try:
            self._parser.ordered_attributes = True
        except AttributeError: # pragma no cover
            # Jython's expat does not support ordered_attributes
            pass
        # Assign the handler methods to the parser
        self._parser.StartElementHandler = self._handler.start_element
        self._parser.EndElementHandler = self._handler.end_element
        self._parser.CharacterDataHandler = self._handler.characters

    @staticmethod
    def _is_empty_document_error(error, handler):
        """Tell whether ``error`` only reports that the document was empty.

        Args:
            error: The exception raised by the parser.
            handler: The handler that was attached to that parser.

        Returns:
            True if the text of ``error`` starts with expat's
            ``XML_ERROR_NO_ELEMENTS`` message followed by ``":"`` and the
            handler has not started processing. False otherwise, including
            when this expat build does not expose that message.
        """
        errors = getattr(expat, "errors", None)
        no_elements = getattr(errors, "XML_ERROR_NO_ELEMENTS", None)
        if no_elements is None:
            return False
        return (str(error).startswith(no_elements + ":") and
                not handler.processing_started)

    def _parse_generator(self, xml_input, parser=None, handler=None):
        """Parse ``xml_input`` incrementally, yielding matches as found.

        Args:
            xml_input: A string, a bytes object, or an object with a
                ``read(size)`` method.
            parser: The parser to feed. Defaults to ``self._parser``.
            handler: The handler attached to ``parser``. Defaults to
                ``self._handler``.

        Yields:
            Each item returned by ``handler.pop_matches()`` after every chunk
            of input has been fed to the parser.

        Raises:
            expat.ExpatError: For any parse error other than an empty
                document detected at end of input.
        """
        # The body of a generator function does not run until the first
        # item is requested. ``__call__`` therefore passes in the parser and
        # handler it just built; reading them from ``self`` at this point
        # would pick up the objects of a later call if the same Parser was
        # called again before this generator was consumed.
        if parser is None:
            parser = self._parser
        if handler is None:
            handler = self._handler

        # Wrap in-memory input so it can be read in fixed-size chunks.
        if isinstance(xml_input, (str, _unicode)):
            io_obj = StringIO(xml_input)
        elif isinstance(xml_input, _bytes):
            io_obj = BytesIO(xml_input)
        else:
            io_obj = xml_input

        at_eof = False
        while not at_eof:
            buf = io_obj.read(parsing_increment)
            if not buf:
                # An empty read marks the end of the input; the parser is
                # still called once more so it can finalize the document.
                at_eof = True
            try:
                parser.Parse(buf, at_eof)
            except expat.ExpatError as err:
                # If the only error was parsing an empty document, ignore
                # the error and carry on as if the document had ended.
                if not (at_eof and
                        self._is_empty_document_error(err, handler)):
                    raise
            if at_eof:
                handler.end_document()
            for match in handler.pop_matches():
                yield match

    def __call__(self, xml_input, **kwargs):
        """Parse ``xml_input``. See class documentation.

        Args:
            xml_input: A string, a bytes object, or a file-like object
                containing the XML to parse.
            **kwargs: Overrides, for this call only, of the arguments given
                when the parser was created.

        Returns:
            ``item`` of the handler used for this call, or a generator when
            the ``generator`` argument is set to a true value.

        Raises:
            expat.ExpatError: For any parse error other than an empty
                document.
        """
        # Make a copy of the default arguments and update that copy with
        # our new arguments.
        self._process_args(**kwargs)

        # The handler built by __init__ matches the default arguments, so it
        # can be used as long as no overrides were given and it has not been
        # handed out yet. It is used once only; every other call gets a
        # new handler.
        if not kwargs and self._default_handler is not None:
            self._handler = self._default_handler
            self._default_handler = None
        else:
            self._make_handler()

        # Make sure our unicode text (if any) is properly encoded.
        if isinstance(xml_input, _unicode):
            if not self._encoding:
                self._encoding = 'utf-8'
            xml_input = xml_input.encode(self._encoding)

        # Create our parser.
        self._make_parser()

        # Bind this call's objects to local names so that the rest of the
        # parse (in particular a lazily evaluated generator) does not depend
        # on instance attributes that a later call will overwrite.
        parser = self._parser
        handler = self._handler

        # Do the actual parsing.
        if self._kwargs.get("generator", False):
            return self._parse_generator(xml_input, parser, handler)

        try:
            if isinstance(xml_input, (str, _unicode, _bytes)):
                parser.Parse(xml_input, True)
            else:
                parser.ParseFile(xml_input)
        except expat.ExpatError as err:
            # If the only error was parsing an empty document, ignore the
            # error and return what the handler holds for an empty document.
            if not self._is_empty_document_error(err, handler):
                raise
        return handler.item
def parse(xml_input, **kwargs):
    """Create Python data structures from raw XML in a single call.

    A one-shot parser is built from ``kwargs`` and immediately applied to
    ``xml_input``; the result of that call is returned unchanged.

    See the :py:class:`Parser` class documentation for the accepted
    arguments and the possible return values.
    """
    one_shot_parser = Parser(**kwargs)
    return one_shot_parser(xml_input)