# sgml.py
# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import re

import sgmllib

  29. __all__ = [
  30. 'sgmllib',
  31. 'charref',
  32. 'tagfind',
  33. 'attrfind',
  34. 'entityref',
  35. 'incomplete',
  36. 'interesting',
  37. 'shorttag',
  38. 'shorttagopen',
  39. 'starttagopen',
  40. 'endbracket',
  41. ]
  42. # sgmllib defines a number of module-level regular expressions that are
  43. # insufficient for the XML parsing feedparser needs. Rather than modify
  44. # the variables directly in sgmllib, they're defined here using the same
  45. # names, and the compiled code objects of several sgmllib.SGMLParser
  46. # methods are copied into _BaseHTMLProcessor so that they execute in
  47. # feedparser's scope instead of sgmllib's scope.
  48. charref = re.compile(r'&#(\d+|[xX][0-9a-fA-F]+);')
  49. tagfind = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*')
  50. attrfind = re.compile(
  51. r"""\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*"""
  52. r"""('[^']*'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$()_#=~'"@]*))?"""
  53. )
  54. # Unfortunately, these must be copied over to prevent NameError exceptions
  55. entityref = sgmllib.entityref
  56. incomplete = sgmllib.incomplete
  57. interesting = sgmllib.interesting
  58. shorttag = sgmllib.shorttag
  59. shorttagopen = sgmllib.shorttagopen
  60. starttagopen = sgmllib.starttagopen
  61. class _EndBracketRegEx:
  62. def __init__(self):
  63. # Overriding the built-in sgmllib.endbracket regex allows the
  64. # parser to find angle brackets embedded in element attributes.
  65. self.endbracket = re.compile(
  66. r'('
  67. r"""[^'"<>]"""
  68. r"""|"[^"]*"(?=>|/|\s|\w+=)"""
  69. r"""|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])"""
  70. r"""|.*?(?=[<>]"""
  71. r')'
  72. )
  73. def search(self, target, index=0):
  74. match = self.endbracket.match(target, index)
  75. if match is not None:
  76. # Returning a new object in the calling thread's context
  77. # resolves a thread-safety.
  78. return EndBracketMatch(match)
  79. return None
  80. class EndBracketMatch:
  81. def __init__(self, match):
  82. self.match = match
  83. def start(self, n):
  84. return self.match.end(n)
  85. endbracket = _EndBracketRegEx()