#!/usr/bin/python # Copyright (c) 2005 by Mark T. Holder, Florida State University. (see end of file) # some xml reading code taken from: # pyxml/test/test_sax.py. # The header from that file # regression test for SAX 2.0 # '''Uses SAX to marshal XML into python objects that. Requires maintaining a script that creates a _CTH member for saxConstructible classes. This data member maps tags to python classes see cipres_registry_from_xml for an example''' import re from PIPRes.util.io import toCamelCaseRE from xml.sax import make_parser, ContentHandler, SAXException, SAXReaderNotAvailable, SAXParseException try: make_parser() except SAXReaderNotAvailable: raise ImportError("no XML parsers available") from xml.sax.saxutils import XMLGenerator, escape, quoteattr, XMLFilterBase from xml.sax.expatreader import create_parser import copy #import logging #_LOG = logging.getLogger('pipres.util.xml_to_obj') def adict(**kwds): return kwds def writeSAXConstructibleToXML(outStr, elName, atts, text, subElements): outStr.write('<%s' % elName) for k, v in atts.iteritems(): outStr.write(' %s="%s"' % (k, v)) outStr.write('>%s\n' % text) for element in subElements: element.writeXML(outStr) outStr.write('\n' % elName) class _SubTreeContentHandler(ContentHandler): '''Class that installs itself as ContentHandler in the __init__() call and reinstalls the parent when it gets the matching endElement call (__init__ is assumed to be called in response to a startElement call).''' def __init__(self, elName, parseContext, depth = 1): '''Sets self to the parser's ContentHandler until the subtree is read, and then installs parent as the ContentHandler. Pass depth or 0 if the object will receive a startElement for the element to ignore, and 1 if the startElement has already been read.''' self._parent = parseContext.get('parent') self._parser = parseContext.get('parser') self._depth = depth self._parser.setContentHandler(self) def startElement(self, name, attrs): self._depth += 1 def endElement(self, name): self._depth -= 1 if self._depth == 0: self._parser.setContentHandler(self._parent) class XMLEchoHandler(_SubTreeContentHandler): '''Prints element name and characters with indentation indicating element depth''' _nTabs = 0 def startElement(self, name, attrs): for i in xrange(self._nTabs): print '\t', print name.encode('ascii') self._nTabs += 1 _SubTreeContentHandler.startElement(self, name, attrs) def endElement(self, name): self._nTabs -= 1 _SubTreeContentHandler.endElement(self, name) def characters(self, ch): print ch.strip() class GenXMLEchoHandler(ContentHandler): '''ContentHandler that will create an XMLEchoHandler if it receives a startElement event.''' def __init__(self, parser): self.parser = parser def startElement(self, name, attrs): XMLEchoHandler(name, {'parser':self.parser, 'parent':self}) class SkipSubTreeHandler(_SubTreeContentHandler): '''Class to absorb SAX events for a subtree.''' def characters(self, ch): pass class ChildrenToHandle(object): '''LIsts attributes to read and maps children elements to SAXConstructible class to handle the element''' def __init__(self, **children): '''Take dictionary mapping attr, singleEl, and multiEl to a list and 2 dictionaries.''' self.attributesToRead = children.get('attr', []) self.singleElementDict = children.get('singleEl', {}) self.multiElementDict = children.get('multiEl', {}) def getChildElementsCTor(self, name): '''Returns the a sub-class of SAXConstructible to handle an element 'name' or None.''' childTypePair = self.singleElementDict.get(name) if childTypePair is None: return self.multiElementDict.get(name) return childTypePair def isSingleElement(self, name): '''Returns True if the 'name' is the name of an element that is expected as a child once''' return self.singleElementDict.get(name) is not None def getUnion(self, other): a = copy.copy(self.attributesToRead) a.extend(other.attributesToRead) s = copy.copy(self.singleElementDict) s.update(other.singleElementDict) m = copy.copy(self.multiElementDict) m.update(other.multiElementDict) return ChildrenToHandle(attr = a, singleEl = s, multiEl = m) def __str__(self): a = 'attr = ' + str(self.attributesToRead) s = ', '.join([i for i in self.singleElementDict.iterkeys()]) m = ', '.join([i for i in self.multiElementDict.iterkeys()]) return a + '\nsingle elements = ' + s + '\nmulti elements = ' + m class SAXConstructible(ContentHandler, object): illegalAttrCharsRE = re.compile(r'[_\-.]') # not an exhaustive list, others shouldn't be used in xml attribute names ''' Abstract class for Sax ContententHandlers which delegate parsing of elements to contained SAXConstructible objects. Methods startSelfElement, childDone are pure virtual''' # set to True to use the derived class's _attr, _singleEl, and _multiEl # to intialize the _childrenToHandle field initializeFromClassStatic = True # endElement is used to purge attributes added by SAXConstructible # (meaning the object cannot be used for parsing again and the base __str__ cannot be used) purgeParsingVars = False def __init__(self, elName, parseContext, childrenToHandle = None): ''' Sets _elName to elName and calls startSelfElement if attrs were sent in parseContext.''' self._elName = elName if SAXConstructible.initializeFromClassStatic: self._childrenToHandle = self.__class__._CTH else: if childrenToHandle is None: raise TypeError, 'must use childrenToHandle arg in SAXConstructible.__init__() when SAXConstructible.initializeFromClassStatic is False' self._childrenToHandle = childrenToHandle self.clear() self._parent = parseContext.get('parent') self._parser = parseContext.get('parser') if self._parser is not None: self._parser.setContentHandler(self) if parseContext.get('attrs') is not None: self.callStartSelfElement(self._elName, parseContext.get('attrs')) def version(): return '0.1' version = staticmethod(version) def getSubParser(self, inParser, name, inAttrs): ''' Creates a new ContentHandler of the class specified in _childrenToHandle. If an element of type name found in _childrenToHandle dictionary will be used to return a new object of the correct type. If it is not found a SkipSubTreeHandler will be returned.''' cTorArg = self._childrenToHandle.getChildElementsCTor(name) parseContext = adict(parser = inParser, parent = self, attrs = inAttrs) if cTorArg is None: return SkipSubTreeHandler(name, parseContext) return cTorArg(name, parseContext) def getElementName(self): ''' Returns the elName passed in as to __init__.''' return self._elName def getObject(self): ''' Returns self or None (if the element has not been read.''' if self._saxRead or self.hasChars(): return self return None def hasChars(self): '''Returns True if self._rawChars is not empty.''' return len(self._rawChars) > 0 def clear(self): ''' Resets _rawChars, all elements and attributes and flags.''' self._saxRead = False for attTag in self._childrenToHandle.attributesToRead: self.__dict__[toCamelCaseRE(attTag, SAXConstructible.illegalAttrCharsRE)] = '' for elTag in self._childrenToHandle.singleElementDict.iterkeys(): self.__dict__[toCamelCaseRE(elTag, SAXConstructible.illegalAttrCharsRE)] = None for elTag in self._childrenToHandle.multiElementDict.iterkeys(): self.__dict__[toCamelCaseRE(elTag, SAXConstructible.illegalAttrCharsRE)] = [] self._allElements = [] self._rawChars = '' def parseFile(self, fileName): ''' Creates a new parser for fileName, using self as the initial ContentHandler.''' return self.parseFileObj(open(fileName, 'rU')) def parseFileObj(self, fileObj): ''' Creates a new parser for the fileObj, using self as the initial ContentHandler.''' self._parent = None self._parser = create_parser() self._parser.setContentHandler(self) self._parser.parse(fileObj) return True def callStartSelfElement(self, name, attrs): ''' Calls startSelfElement(), sets self._saxRead to True, and calls postStartSelfElement()''' self.startSelfElement(name, attrs) self._saxRead = True self.postStartSelfElement() def startSelfElement(self, name, attrs): ''' Adds all attributes in _childrenToHandle.attributesToRead to self's dictionary ''' for attTag in self._childrenToHandle.attributesToRead: pyAttName = toCamelCaseRE(attTag, SAXConstructible.illegalAttrCharsRE) self.__dict__[pyAttName] = str(attrs.get(attTag, "")) def postStartSelfElement(self): ''' Hook for post-startSelfElement() behavior''' pass def startElement(self, name, attrs): ''' Spawns a subParser or calls startSelfElement if self has not parsed and name matches getElementName().''' if name == self._elName and self.getObject() is None: self.callStartSelfElement(name, attrs) else: self._subParser = self.getSubParser(self._parser, name, attrs) def characters(self, ch): ''' Strips and then append ch.''' stripped = ch.strip() if (len(stripped) > 0): self._rawChars += stripped.encode('ascii') def childDone(self, obj, name): ''' Appends the obj to the list of _allElements and to the element data member 'name'.''' self._allElements.append(obj) pyAttName = toCamelCaseRE(name, SAXConstructible.illegalAttrCharsRE) if self._childrenToHandle.isSingleElement(name): self.__dict__[pyAttName] = obj elif self.__dict__.get(pyAttName) is None: return else: self.__dict__[pyAttName].append(obj) def endSelfElement(self, name): ''' Hook for post-endElement() behavior.''' pass def endElement(self, name): ''' Calls endSelfElement() then parent's childDone() method with getObject() as an argument, and sets the parent as the parser's ContentHandler ''' if name != self._elName: raise ValueError, 'Unexpected name in endElement' if self._parent is not None: obj = self.getObject() self.endSelfElement(name) parent, parser = self._parent, self._parser if parent is not None: parent.childDone(obj, name) if parser is not None: parser.setContentHandler(parent) elif parser is not None: parser.setContentHandler(GenXMLEchoHandler(parser)) if SAXConstructible.purgeParsingVars: SAXConstructible.delParsingAttributes(self) else: self._parent = None self._parser = None def delParsingAttributes(self): del self._parent del self._parser del self._elName del self._saxRead del self._rawChars if hasattr(self, '_allElements'): del self._allElements if hasattr(self, '_subParser'): del self._subParser del self._childrenToHandle delParsingAttributes = staticmethod(delParsingAttributes) def getRawChars(self): ''' Returns stripped concatenation of all characters.''' return self._rawChars def getEscapedChars(self): ''' Returns stripped concatenation of all characters, with &, < and > escaped.''' return escape(self._rawChars) def __str__(self): ''' Concatenates self._rawChars and string for of every member self._allElement.''' return self._rawChars + ''.join([str(x) for x in self._allElements]) def writeXML(self, outStr): '''Writes the content of the object to XML preserving element order, but not order of characters relative to elements''' filledAttDict = {} for rawAtt in self._childrenToHandle.attributesToRead: attTag = toCamelCaseRE(rawAtt, SAXConstructible.illegalAttrCharsRE) if self.__dict__[attTag] != '': filledAttDict[attTag] = self.__dict__[attTag] writeSAXConstructibleToXML(outStr, self._elName, filledAttDict, self.getEscapedChars(), self._allElements) def ignorableWhitespace(self, content): '''Ignores content.''' pass def processingInstruction(self, target, data): '''Ignores the processing content.''' pass class TextOnlyElement(SAXConstructible): '''Only handles character content.''' automaticallyConvertToString = False _CTH = ChildrenToHandle() def __init__(self, elName, parseContext, translateFunc = None): super(TextOnlyElement, self).__init__(elName, parseContext, TextOnlyElement._CTH) self._translateFunc = translateFunc def childDone(self, obj, n): raise ValueError, 'Unexpected Child in TextOnlyElement' def endSelfElement(self, name): if self._translateFunc is not None: self._rawChars = self._translateFunc(self._rawChars) def getObject(self): if TextOnlyElement.automaticallyConvertToString: return str(self) return self class TranslateTextOnlyElement(SAXConstructible): '''Callable class that creates instances of TextOnlyElement with a stored translation function.''' def __init__(self, translateFunc): self._translateFunc = translateFunc def __call__(self, elName, parseContext): return TextOnlyElement(elName, parseContext, self._translateFunc) class IgnoreContentElement(SAXConstructible): '''Only handles character content.''' _CTH = ChildrenToHandle() def __init__(self, elName, parseContext): super(IgnoreContentElement, self).__init__(elName, parseContext, IgnoreContentElement._CTH) def saxEchoParseFileObj(fileObj): parser = create_parser() echoer = GenXMLEchoHandler(parser) parser.setContentHandler(echoer) parser.parse(fileObj) # This file is part of the PIPRes library # The PIPRes library is free software; you can redistribute it # and/or modify it under the terms of the GNU Lesser General # Public License as published by the Free Software Foundation; # either version 2.1 of the License, or (at your option) any later # version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free # Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA