Source code for jxmlease.xmlparser

#!/usr/bin/env python
# Copyright (c) 2015-2016, Juniper Networks, Inc.
# All rights reserved.
#
# Copyright (C) 2012 Martin Blech and individual contributors.
#
# See the LICENSE file for further information.
"""Module that provides XML parsing."""
from __future__ import absolute_import

from xml.parsers import expat
from . import parser_defaults, parsing_increment, StringIO, _unicode
from ._parsehandler import _DictSAXHandler
# Binary in-memory buffer used for byte-string input. If ``io`` cannot be
# imported, fall back to the StringIO object supplied by this package.
try: # pragma no cover
    from io import BytesIO # pylint: disable=wrong-import-order
except ImportError: # pragma no cover
    BytesIO = StringIO
# Alias for the byte-string type: ``bytes`` where that builtin exists,
# otherwise ``str``.
try:  # pragma no cover
    _bytes = bytes
except NameError:  # pragma no cover
    _bytes = str

__all__ = ['Parser', 'parse']

class Parser(object):
    """Creates Python data structures from raw XML.

    This class creates a callable object used to parse XML into Python data
    structures. You can provide optional parameters at the class creation
    time. These parameters modify the default behavior of the parser. When
    you invoke the callable object to parse a document, you can supply
    additional parameters to override the values specified when the
    :py:class:`Parser` object was created.

    General usage is::

        >>> myparser = Parser()
        >>> root = myparser("<a>foo</a>")

    Calling a :py:class:`Parser` object returns an :py:class:`XMLDictNode`
    containing the parsed XML tree.

    In this example, ``root`` is an :py:class:`XMLDictNode` which contains a
    representation of the parsed XML::

        >>> isinstance(root, XMLDictNode)
        True
        >>> root.prettyprint()
        {u'a': u'foo'}
        >>> print(root.emit_xml())
        <?xml version="1.0" encoding="utf-8"?>
        <a>foo</a>

    If you will just be using a parser once, you can just use the
    :py:meth:`parse` method, which is a shortcut way of creating a
    :py:class:`Parser` class and calling it all in one call. You can provide
    the same arguments to the :py:meth:`parse` method that you provide to
    the :py:class:`Parser` class.

    For example::

        >>> root = jxmlease.parse('<a x="y"><b>1</b><b>2</b><b>3</b></a>')
        >>> root.prettyprint()
        {u'a': {u'b': [u'1', u'2', u'3']}}

    It is possible to call a :py:class:`Parser` object as a generator by
    specifying the :py:obj:`generator` parameter. The :py:obj:`generator`
    parameter contains a list of paths to match. If paths are provided in
    this parameter, the behavior of the parser is changed. Instead of
    returning the root node of a parsed XML hierarchy, the parser returns a
    generator object. On each call to the generator object, it will return
    the next node that matches one of the provided paths.

    Paths are provided in a format similar to XPath expressions. For
    example, ``/a/b`` will match node ``<b>`` in this XML::

        <a>
            <b/>
        </a>

    If a path begins with a ``/``, it must exactly match the full path to a
    node. If a path does not begin with a ``/``, it must exactly match the
    "right side" of the path to a node. For example, consider this XML::

        <a>
            <b>
                <c/>
            </b>
        </a>

    In this example, ``/a/b/c``, ``c``, ``b/c``, and ``a/b/c`` all match the
    ``<c>`` node.

    For each match, the generator returns a tuple of:
    ``(path,match_string,xml_node)``, where the *path* is the calculated
    absolute path to the matching node, *match_string* is the user-supplied
    match string that triggered the match, and *xml_node* is the object
    representing that node (an instance of a :py:class:`XMLNodeBase`
    subclass).

    For example::

        >>> xml = '<a x="y"><b>1</b><b>2</b><b>3</b></a>'
        >>> myparser = Parser(generator=["/a/b"])
        >>> for (path, match, value) in myparser(xml):
        ...   print("%s: %s" % (path, value))
        ...
        /a/b: 1
        /a/b: 2
        /a/b: 3

    Each generator returned by the parser keeps its own expat parser and
    handler, so calling the same :py:class:`Parser` object again does not
    disturb a generator that has not been fully consumed yet.

    When calling the parser, you can specify all of these parameters. When
    creating a parsing instance, you can specify all of these parameters
    except :py:obj:`xml_input`:

    Args:
        xml_input (string or file-like object): Contains the XML to parse.
        encoding (string or None): The input's encoding. If not provided,
            this defaults to 'utf-8'.
        expat (An expat, or equivalent, parser class): Used for parsing the
            XML input. If not provided, defaults to the expat parser in
            :py:data:`xml.parsers`.
        process_namespaces (bool): If True, namespaces in tags and
            attributes are converted to their full URL value. If False (the
            default), the namespaces in tags and attributes are left
            unchanged.
        namespace_separator (string): If :py:obj:`process_namespaces` is
            True, this specifies the separator that expat should use
            between namespaces and identifiers in tags and attributes.
        xml_attribs (bool): If True (the default), include XML attributes.
            If False, ignore them.
        strip_whitespace (bool): If True (the default), strip whitespace at
            the start and end of CDATA. If False, keep all whitespace.
        namespaces (`dict`): A remapping for namespaces. If supplied,
            identifiers with a namespace prefix will have their namespace
            prefix rewritten based on the dictionary. The code will look
            for :py:obj:`namespaces[current_namespace]`. If found,
            :py:obj:`current_namespace` will be replaced with the result of
            the lookup.
        strip_namespace (bool): If True, the namespace prefix will be
            removed from all identifiers. If False (the default), the
            namespace prefix will be retained.
        cdata_separator (string): When encountering "semi-structured" XML
            (where the XML has CDATA and tags intermixed at the same
            level), the :py:obj:`cdata_separator` will be placed between
            the different groups of CDATA. By default, the
            :py:obj:`cdata_separator` parameter is '', which results in the
            CDATA groups being concatenated without separator.
        generator (list of strings): A list of paths to match. If paths are
            provided here, the behavior of the parser is changed. Instead
            of returning the root node of a parsed XML hierarchy, the
            parser returns a :py:obj:`generator` object. On each call to
            the :py:obj:`generator` object, it will return the next node
            that matches one of the provided paths.

    Returns:
        A callable instance of the :py:class:`Parser` class.

        Calling a :py:class:`Parser` object returns an
        :py:class:`XMLDictNode` containing the parsed XML tree.

        Alternatively, if the :py:obj:`generator` parameter is specified, a
        :py:obj:`generator` object is returned.
    """

    def __init__(self, **kwargs):
        """See class documentation."""
        # Populate a dictionary with default arguments.
        self._default_kwargs = dict(
            encoding=None,
            expat=expat,
            process_namespaces=False,
            namespace_separator=":",
        )

        # Update the dictionary with user-provided defaults.
        self._default_kwargs.update(parser_defaults)

        # Update the dictionary with the provided arguments. We will save
        # the arguments for later use.
        self._default_kwargs.update(kwargs)

        # Process the arguments.
        self._process_args()

        # Make a default handler, which will also try the arguments to
        # catch argument errors now.
        self._make_handler()

        # Try the arguments to catch argument errors now. We will
        # throw this parser away (as the encoding is unpredictable).
        if not self._encoding:
            self._encoding = 'utf-8'
        self._make_parser()

        # Stash the handler for future use; the first call that supplies
        # no overrides picks it up instead of building a new one.
        self._default_handler = self._handler
        self._handler = None

    def _process_args(self, **kwargs):
        """Merge per-call ``kwargs`` over the stored defaults.

        The merged dictionary is kept in ``self._kwargs``. The ``encoding``,
        ``expat`` and ``process_namespaces`` entries are removed from it and
        stored as attributes, because they must not be passed to the
        handler class.
        """
        # Make a copy of the default kwargs database.
        self._kwargs = dict(self._default_kwargs)

        # Update the dictionary with the provided arguments.
        self._kwargs.update(kwargs)

        # Pop off and save the arguments that we don't want to pass to
        # the handler class.
        self._encoding = self._kwargs.pop('encoding')
        self._expat = self._kwargs.pop('expat')
        self._process_namespaces = self._kwargs.pop('process_namespaces')

    def _make_handler(self):
        """Create the handler from the current handler arguments."""
        # pylint: disable=unexpected-keyword-arg
        self._handler = _DictSAXHandler(**self._kwargs)

    def _make_parser(self):
        """Create an expat parser and connect it to the current handler."""
        # We don't need a namespace separator if we're not processing
        # namespaces.
        if not self._process_namespaces:
            namespace_separator = None
        else:
            namespace_separator = self._kwargs['namespace_separator']

        self._parser = self._expat.ParserCreate(
            self._encoding,
            namespace_separator
        )

        # Setup some parser attributes
        self._parser.buffer_text = True
        try:
            self._parser.ordered_attributes = True
        except AttributeError:  # pragma no cover
            # Jython's expat does not support ordered_attributes
            pass

        # Assign the handler methods to the parser
        self._parser.StartElementHandler = self._handler.start_element
        self._parser.EndElementHandler = self._handler.end_element
        self._parser.CharacterDataHandler = self._handler.characters

    @staticmethod
    def _is_empty_document_error(error, handler):
        """Tell whether ``error`` only reports an empty document.

        Returns True when the expat error message starts with the "no
        elements" message and ``handler`` has not started processing, in
        which case the caller may ignore the error. Returns False when the
        ``expat`` module does not expose that message.
        """
        errors = getattr(expat, "errors", None)
        no_elements = getattr(errors, "XML_ERROR_NO_ELEMENTS", None)
        if no_elements is None:
            return False
        return (str(error).startswith(no_elements + ":") and
                not handler.processing_started)

    def _parse_generator(self, xml_input):
        """Return a generator that yields matches while parsing.

        This is deliberately a plain method, not a generator function: a
        generator body only starts running on first iteration, so reading
        ``self._parser`` and ``self._handler`` inside it would pick up
        whatever a later call to this object installed. Binding them here
        ties the generator to the parser and handler created for this call.
        """
        return self._iter_matches(xml_input, self._parser, self._handler)

    def _iter_matches(self, xml_input, parser, handler):
        """Feed ``xml_input`` to ``parser`` in chunks, yielding matches.

        String and byte-string input is wrapped in an in-memory buffer; any
        other object is read through its ``read()`` method.
        """
        if isinstance(xml_input, (str, _unicode)):
            io_obj = StringIO(xml_input)
        elif isinstance(xml_input, _bytes):
            io_obj = BytesIO(xml_input)
        else:
            io_obj = xml_input

        at_eof = False
        while not at_eof:
            buf = io_obj.read(parsing_increment)
            if not buf:
                # An empty read marks the end of input; the final Parse()
                # call below tells expat that the document is complete.
                at_eof = True
            try:
                parser.Parse(buf, at_eof)
            except expat.ExpatError as err:
                # If the only error was parsing an empty document, ignore
                # the error. Anything else is re-raised.
                if not (at_eof and
                        self._is_empty_document_error(err, handler)):
                    raise

            if at_eof:
                handler.end_document()

            # Hand out whatever matched so far before reading more input.
            for match in handler.pop_matches():
                yield match

    def __call__(self, xml_input, **kwargs):
        """See class documentation."""
        # Make a copy of the default arguments and update that copy with
        # our new arguments.
        self._process_args(**kwargs)

        # Did we get keyword arguments? If so, we need to recreate the
        # default handler. Otherwise, we can try to use it (if the default
        # handler still exists). It is only ever used once.
        if not kwargs and self._default_handler is not None:
            self._handler = self._default_handler
            self._default_handler = None
        else:
            self._make_handler()

        # Make sure our unicode text (if any) is properly encoded.
        if isinstance(xml_input, _unicode):
            if not self._encoding:
                self._encoding = 'utf-8'
            xml_input = xml_input.encode(self._encoding)

        # Create our parser.
        self._make_parser()

        # Do the actual parsing.
        if self._kwargs.get("generator", False):
            return self._parse_generator(xml_input)

        try:
            if isinstance(xml_input, (str, _unicode, _bytes)):
                self._parser.Parse(xml_input, True)
            else:
                self._parser.ParseFile(xml_input)
        except expat.ExpatError as err:
            # If the only error was parsing an empty document, ignore
            # the error and return the handler's (empty) result.
            if not self._is_empty_document_error(err, self._handler):
                raise

        return self._handler.item
def parse(xml_input, **kwargs):
    """Create Python data structures from raw XML.

    Shortcut that builds a single-use :py:class:`Parser` from ``kwargs``
    and immediately calls it on ``xml_input``, returning whatever that call
    returns. See the :py:class:`Parser` class documentation.
    """
    one_shot_parser = Parser(**kwargs)
    return one_shot_parser(xml_input)