1 """ A SAX2 driver for libxml2, on top of its XmlReader API
4 # put this file (drv_libxml2.py) in PYTHONPATH
6 reader = xml.sax.make_parser(["drv_libxml2"])
7 # ...and the rest is standard python sax.
10 - Lexical handlers are supported, except for start/endEntity
11 (waiting for XmlReader.ResolveEntity) and start/endDTD
12 - Error callbacks are not exactly synchronous: they tend
13 to be invoked before the corresponding content callback,
14 because the underlying reader interface parses
15 data in chunks of 512 bytes
19 - some ErrorHandler events (warning)
20 - some ContentHandler events (setDocumentLocator, skippedEntity)
21 - EntityResolver (using libxml2.?)
22 - DTDHandler (if/when libxml2 exposes such node types)
23 - DeclHandler (if/when libxml2 exposes such node types)
24 - property_xml_string?
25 - feature_string_interning?
27 - additional performance tuning:
28 - one might cache callbacks to avoid some name lookups
29 - one might implement a smarter way to pass attributes to startElement
30 (some kind of lazy evaluation?)
31 - there might be room for improvement in start/endPrefixMapping
36 __author__ = u"Stéphane Bidoul <sbi@skynet.be>"
41 from types import StringType, UnicodeType
42 StringTypes = (StringType,UnicodeType)
44 from xml.sax._exceptions import *
45 from xml.sax import xmlreader, saxutils
46 from xml.sax.handler import \
48 feature_namespace_prefixes, \
49 feature_string_interning, \
51 feature_external_ges, \
52 feature_external_pes, \
53 property_lexical_handler, \
54 property_declaration_handler, \
58 # libxml2 returns strings as UTF8
59 _decoder = codecs.lookup("utf8")[1]
68 except ImportError, e:
69 raise SAXReaderNotAvailable("libxml2 not available: " \
70 "import error was: %s" % e)
class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator.

    Wraps the locator object handed to the reader's error callback and
    exposes it through the standard SAX ``Locator`` interface.  Only the
    line number and the system identifier are taken from the wrapped
    object (via its ``LineNumber()`` and ``BaseURI()`` methods); the other
    two accessors return the SAX "not available" values.
    """

    def __init__(self, locator):
        """Remember the wrapped libxml2 locator object."""
        self.__locator = locator

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        The wrapped locator is not queried for a column here, so this
        returns -1, the value the base ``xml.sax.xmlreader.Locator`` uses
        for an unavailable position (instead of falling through to None).
        """
        return -1

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        return self.__locator.LineNumber()

    def getPublicId(self):
        """Return the public identifier for the current event.

        No public identifier is read from the wrapped locator, so this is
        always None.
        """
        return None

    def getSystemId(self):
        """Return the system identifier for the current event.

        This is the base URI reported by the wrapped locator.
        """
        return self.__locator.BaseURI()
94 class LibXml2Reader(xmlreader.XMLReader):
97 xmlreader.XMLReader.__init__(self)
104 # additional handlers
105 self.__lex_handler = None
106 self.__decl_handler = None
107 # error messages accumulator
110 def _errorHandler(self,arg,msg,severity,locator):
111 if self.__errors is None:
113 self.__errors.append((severity,
114 SAXParseException(msg,None,
117 def _reportErrors(self,fatal):
118 for severity,exception in self.__errors:
119 if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
120 libxml2.PARSER_SEVERITY_WARNING):
121 self._err_handler.warning(exception)
123 # when fatal is set, the parse will stop;
124 # we consider that the last error reported
126 if fatal and exception is self.__errors[-1][1]:
127 self._err_handler.fatalError(exception)
129 self._err_handler.error(exception)
132 def parse(self, source):
135 # prepare source and create reader
136 if type(source) in StringTypes:
137 reader = libxml2.newTextReaderFilename(source)
139 source = saxutils.prepare_input_source(source)
140 input = libxml2.inputBuffer(source.getByteStream())
141 reader = input.newTextReader(source.getSystemId())
142 reader.SetErrorHandler(self._errorHandler,None)
144 reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
145 reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
146 reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
147 reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
148 # we reuse attribute maps (for a slight performance gain)
150 attributesNSImpl = xmlreader.AttributesNSImpl({},{})
152 attributesImpl = xmlreader.AttributesImpl({})
153 # prefixes to pop (for endPrefixMapping)
156 self._cont_handler.startDocument()
161 if not self.__errors is None:
162 self._reportErrors(0)
164 if not self.__errors is None:
165 self._reportErrors(0)
168 if not self.__errors is None:
169 self._reportErrors(1)
171 self._err_handler.fatalError(\
172 SAXException("Read failed (no details available)"))
173 break # fatal parse error
175 nodeType = reader.NodeType()
179 eltName = (_d(reader.NamespaceUri()),\
180 _d(reader.LocalName()))
181 eltQName = _d(reader.Name())
182 attributesNSImpl._attrs = attrs = {}
183 attributesNSImpl._qnames = qnames = {}
185 while reader.MoveToNextAttribute():
186 qname = _d(reader.Name())
187 value = _d(reader.Value())
188 if qname.startswith("xmlns"):
190 newPrefix = qname[6:]
193 newPrefixes.append(newPrefix)
194 self._cont_handler.startPrefixMapping(\
197 continue # don't report xmlns attribute
198 attName = (_d(reader.NamespaceUri()),
199 _d(reader.LocalName()))
200 qnames[attName] = qname
201 attrs[attName] = value
202 reader.MoveToElement()
203 self._cont_handler.startElementNS( \
204 eltName,eltQName,attributesNSImpl)
205 if reader.IsEmptyElement():
206 self._cont_handler.endElementNS(eltName,eltQName)
207 for newPrefix in newPrefixes:
208 self._cont_handler.endPrefixMapping(newPrefix)
210 prefixes.append(newPrefixes)
212 eltName = _d(reader.Name())
213 attributesImpl._attrs = attrs = {}
214 while reader.MoveToNextAttribute():
215 attName = _d(reader.Name())
216 attrs[attName] = _d(reader.Value())
217 reader.MoveToElement()
218 self._cont_handler.startElement( \
219 eltName,attributesImpl)
220 if reader.IsEmptyElement():
221 self._cont_handler.endElement(eltName)
225 self._cont_handler.endElementNS( \
226 (_d(reader.NamespaceUri()),_d(reader.LocalName())),
228 for prefix in prefixes.pop():
229 self._cont_handler.endPrefixMapping(prefix)
231 self._cont_handler.endElement(_d(reader.Name()))
234 self._cont_handler.characters(_d(reader.Value()))
237 self._cont_handler.ignorableWhitespace(_d(reader.Value()))
238 # SignificantWhitespace
240 self._cont_handler.characters(_d(reader.Value()))
243 if not self.__lex_handler is None:
244 self.__lex_handler.startCDATA()
245 self._cont_handler.characters(_d(reader.Value()))
246 if not self.__lex_handler is None:
247 self.__lex_handler.endCDATA()
250 if not self.__lex_handler is None:
251 self.startEntity(_d(reader.Name()))
252 reader.ResolveEntity()
255 if not self.__lex_handler is None:
256 self.endEntity(_d(reader.Name()))
257 # ProcessingInstruction
259 self._cont_handler.processingInstruction( \
260 _d(reader.Name()),_d(reader.Value()))
263 if not self.__lex_handler is None:
264 self.__lex_handler.comment(_d(reader.Value()))
267 #if not self.__lex_handler is None:
268 # self.__lex_handler.startDTD()
269 pass # TODO (how to detect endDTD? on first non-dtd event?)
275 pass # TODO (entity decl)
279 # Attribute (never in this loop)
282 # Document (not exposed)
285 # DocumentFragment (never returned by XmlReader)
286 #elif nodeType == 11:
293 raise SAXException("Unexpected node type %d" % nodeType)
295 self._cont_handler.endDocument()
def setDTDHandler(self, handler):
    """Refuse to register a DTDHandler.

    Always raises SAXNotSupportedException; ``handler`` is not stored.
    """
    # TODO (when supported, the inherited method works just fine)
    reason = "DTDHandler not supported"
    raise SAXNotSupportedException(reason)
def setEntityResolver(self, resolver):
    """Refuse to register an EntityResolver.

    Always raises SAXNotSupportedException; ``resolver`` is not stored.
    """
    # TODO (when supported, the inherited method works just fine)
    reason = "EntityResolver not supported"
    raise SAXNotSupportedException(reason)
308 def getFeature(self, name):
309 if name == feature_namespaces:
311 elif name == feature_namespace_prefixes:
313 elif name == feature_validation:
314 return self.__validate
315 elif name == feature_external_ges:
316 return 1 # TODO (does that relate to PARSER_LOADDTD)?
317 elif name == feature_external_pes:
318 return 1 # TODO (does that relate to PARSER_LOADDTD)?
320 raise SAXNotRecognizedException("Feature '%s' not recognized" % \
323 def setFeature(self, name, state):
325 raise SAXNotSupportedException("Cannot set feature %s " \
326 "while parsing" % name)
327 if name == feature_namespaces:
329 elif name == feature_namespace_prefixes:
331 elif name == feature_validation:
332 self.__validate = state
333 elif name == feature_external_ges:
335 # TODO (does that relate to PARSER_LOADDTD)?
336 raise SAXNotSupportedException("Feature '%s' not supported" % \
338 elif name == feature_external_pes:
340 # TODO (does that relate to PARSER_LOADDTD)?
341 raise SAXNotSupportedException("Feature '%s' not supported" % \
344 raise SAXNotRecognizedException("Feature '%s' not recognized" % \
347 def getProperty(self, name):
348 if name == property_lexical_handler:
349 return self.__lex_handler
350 elif name == property_declaration_handler:
351 return self.__decl_handler
353 raise SAXNotRecognizedException("Property '%s' not recognized" % \
356 def setProperty(self, name, value):
357 if name == property_lexical_handler:
358 self.__lex_handler = value
359 elif name == property_declaration_handler:
360 # TODO: remove if/when libxml2 supports dtd events
361 raise SAXNotSupportedException("Property '%s' not supported" % \
363 self.__decl_handler = value
365 raise SAXNotRecognizedException("Property '%s' not recognized" % \
369 return LibXml2Reader()