# The strict feed parser that interfaces with an XML parsing library
# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from ..exceptions import UndeclaredNamespace


  29. class _StrictFeedParser(object):
  30. def __init__(self, baseuri, baselang, encoding):
  31. self.bozo = 0
  32. self.exc = None
  33. self.decls = {}
  34. self.baseuri = baseuri or ''
  35. self.lang = baselang
  36. self.encoding = encoding
  37. super(_StrictFeedParser, self).__init__()
  38. @staticmethod
  39. def _normalize_attributes(kv):
  40. k = kv[0].lower()
  41. v = k in ('rel', 'type') and kv[1].lower() or kv[1]
  42. return k, v
  43. def startPrefixMapping(self, prefix, uri):
  44. if not uri:
  45. return
  46. # Jython uses '' instead of None; standardize on None
  47. prefix = prefix or None
  48. self.track_namespace(prefix, uri)
  49. if prefix and uri == 'http://www.w3.org/1999/xlink':
  50. self.decls['xmlns:' + prefix] = uri
  51. def startElementNS(self, name, qname, attrs):
  52. namespace, localname = name
  53. lowernamespace = str(namespace or '').lower()
  54. if lowernamespace.find('backend.userland.com/rss') != -1:
  55. # match any backend.userland.com namespace
  56. namespace = 'http://backend.userland.com/rss'
  57. lowernamespace = namespace
  58. if qname and qname.find(':') > 0:
  59. givenprefix = qname.split(':')[0]
  60. else:
  61. givenprefix = None
  62. prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
  63. if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespaces_in_use:
  64. raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
  65. localname = str(localname).lower()
  66. # qname implementation is horribly broken in Python 2.1 (it
  67. # doesn't report any), and slightly broken in Python 2.2 (it
  68. # doesn't report the xml: namespace). So we match up namespaces
  69. # with a known list first, and then possibly override them with
  70. # the qnames the SAX parser gives us (if indeed it gives us any
  71. # at all). Thanks to MatejC for helping me test this and
  72. # tirelessly telling me that it didn't work yet.
  73. attrsD, self.decls = self.decls, {}
  74. if localname == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML':
  75. attrsD['xmlns'] = namespace
  76. if localname == 'svg' and namespace == 'http://www.w3.org/2000/svg':
  77. attrsD['xmlns'] = namespace
  78. if prefix:
  79. localname = prefix.lower() + ':' + localname
  80. elif namespace and not qname: # Expat
  81. for name, value in self.namespaces_in_use.items():
  82. if name and value == namespace:
  83. localname = name + ':' + localname
  84. break
  85. for (namespace, attrlocalname), attrvalue in attrs.items():
  86. lowernamespace = (namespace or '').lower()
  87. prefix = self._matchnamespaces.get(lowernamespace, '')
  88. if prefix:
  89. attrlocalname = prefix + ':' + attrlocalname
  90. attrsD[str(attrlocalname).lower()] = attrvalue
  91. for qname in attrs.getQNames():
  92. attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
  93. localname = str(localname).lower()
  94. self.unknown_starttag(localname, list(attrsD.items()))
  95. def characters(self, text):
  96. self.handle_data(text)
  97. def endElementNS(self, name, qname):
  98. namespace, localname = name
  99. lowernamespace = str(namespace or '').lower()
  100. if qname and qname.find(':') > 0:
  101. givenprefix = qname.split(':')[0]
  102. else:
  103. givenprefix = ''
  104. prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
  105. if prefix:
  106. localname = prefix + ':' + localname
  107. elif namespace and not qname: # Expat
  108. for name, value in self.namespaces_in_use.items():
  109. if name and value == namespace:
  110. localname = name + ':' + localname
  111. break
  112. localname = str(localname).lower()
  113. self.unknown_endtag(localname)
  114. def error(self, exc):
  115. self.bozo = 1
  116. self.exc = exc
  117. # drv_libxml2 calls warning() in some cases
  118. warning = error
  119. def fatalError(self, exc):
  120. self.error(exc)
  121. raise exc