Initial revision
[TestXSLT.git] / libxml2 / python / drv_libxml2.py
1 """ A SAX2 driver for libxml2, on top of its XmlReader API
2
3 USAGE
4     # put this file (drv_libxml2.py) in PYTHONPATH
5     import xml.sax
6     reader = xml.sax.make_parser(["drv_libxml2"])
7     # ...and the rest is standard python sax.
8
9 CAVEATS
10     - Lexical handlers are supported, except for start/endEntity
11       (waiting for XmlReader.ResolveEntity) and start/endDTD
12     - Error callbacks are not exactly synchronous, they tend
13       to be invoked before the corresponding content callback,
14       because the underlying reader interface parses
15       data by chunks of 512 bytes
16     
17 TODO
18     - search for TODO
19     - some ErrorHandler events (warning)
20     - some ContentHandler events (setDocumentLocator, skippedEntity)
21     - EntityResolver (using libxml2.?)
22     - DTDHandler (if/when libxml2 exposes such node types)
23     - DeclHandler (if/when libxml2 exposes such node types)
24     - property_xml_string?
25     - feature_string_interning?
26     - Incremental parser
27     - additional performance tuning:
28       - one might cache callbacks to avoid some name lookups
29       - one might implement a smarter way to pass attributes to startElement
30         (some kind of lazy evaluation?)
31       - there might be room for improvement in start/endPrefixMapping
32       - other?
33
34 """
35
36 __author__  = u"Stéphane Bidoul <sbi@skynet.be>"
37 __version__ = "0.3"
38
39 import codecs
40 import sys
41 from types import StringType, UnicodeType
42 StringTypes = (StringType,UnicodeType)
43
44 from xml.sax._exceptions import *
45 from xml.sax import xmlreader, saxutils
46 from xml.sax.handler import \
47      feature_namespaces, \
48      feature_namespace_prefixes, \
49      feature_string_interning, \
50      feature_validation, \
51      feature_external_ges, \
52      feature_external_pes, \
53      property_lexical_handler, \
54      property_declaration_handler, \
55      property_dom_node, \
56      property_xml_string
57
58 # libxml2 returns strings as UTF8
59 _decoder = codecs.lookup("utf8")[1]
60 def _d(s):
61     if s is None:
62         return s
63     else:
64         return _decoder(s)[0]
65
66 try:
67     import libxml2
68 except ImportError, e:
69     raise SAXReaderNotAvailable("libxml2 not available: " \
70                                 "import error was: %s" % e)
71
class Locator(xmlreader.Locator):
    """Expose a libxml2.xmlTextReaderLocator through the SAX Locator API.

    Only the line number and the system identifier come from the wrapped
    object; the column number and public identifier are fixed values.
    """

    def __init__(self, locator):
        # the wrapped object is asked for LineNumber() and BaseURI() on demand
        self.__wrapped = locator

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        wrapped = self.__wrapped
        return wrapped.LineNumber()

    def getColumnNumber(self):
        """Return the column number where the current event ends (always -1)."""
        return -1

    def getSystemId(self):
        """Return the system identifier for the current event."""
        wrapped = self.__wrapped
        return wrapped.BaseURI()

    def getPublicId(self):
        """Return the public identifier for the current event (always None)."""
        return None
93
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that drives a libxml2 text reader.

    Namespace processing, namespace-prefix reporting and validation are
    all off by default and can be switched with setFeature().  A lexical
    handler can be installed with setProperty(); DTD handlers, entity
    resolvers and declaration handlers are rejected as unsupported.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator (None when nothing is pending)
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Queue one error reported by the libxml2 reader.

        Registered with reader.SetErrorHandler() in parse().  The error is
        not dispatched here; it is stored as (severity, SAXParseException)
        and forwarded to the SAX ErrorHandler by _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward all queued errors to the SAX ErrorHandler.

        fatal -- true when the reader has stopped; the last queued
                 non-warning error is then reported through fatalError().
        """
        # Detach the queue before dispatching: ErrorHandler methods may
        # raise, and entries left behind would be replayed by a later
        # parse() on this reader.
        errors = self.__errors
        self.__errors = None
        if not errors:
            return
        lastException = errors[-1][1]
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            elif fatal and exception is lastException:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse *source* and emit SAX events to the registered handlers.

        source -- a file name / URL string, or anything accepted by
                  saxutils.prepare_input_source().

        Raises SAXException for a node type this driver does not know,
        plus whatever the registered handlers raise.  The libxml2 reader
        is closed and the parsing flag cleared even when an exception
        propagates.
        """
        self.__parsing = 1
        # forget errors a previous, aborted parse may have left queued
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                # keep a name bound to the buffer for the whole parse
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
            reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
            reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
            reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping), one list per open element
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # only "xmlns" and "xmlns:<prefix>" declare a
                            # namespace; a name that merely begins with
                            # those letters is an ordinary attribute
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows: close it now
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                             (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                             _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity boundaries are lexical events: they go to the
                    # lexical handler, the reader itself has no such method
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): ResolveEntity is not defined in this
                    # file; confirm the libxml2 binding in use provides it
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            # endDocument is only sent when the input was read to its end
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # release the reader on every exit path, including exceptions
            # raised by content or error handlers
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Always raise SAXNotSupportedException (no DTD events yet)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Always raise SAXNotSupportedException (no entity resolution yet)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the state of feature *name*.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is running or when
        an external-entity feature is switched off, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of property *name*.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set property *name* to *value*.

        Only the lexical handler can be set.  The declaration handler
        raises SAXNotSupportedException, any other name raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler in self.__decl_handler instead of
            # raising if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
367
def create_parser():
    """Return a fresh LibXml2Reader (factory looked up by xml.sax.make_parser)."""
    parser = LibXml2Reader()
    return parser
370