html.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
  2. # Copyright 2002-2008 Mark Pilgrim
  3. # All rights reserved.
  4. #
  5. # This file is a part of feedparser.
  6. #
  7. # Redistribution and use in source and binary forms, with or without
  8. # modification, are permitted provided that the following conditions are met:
  9. #
  10. # * Redistributions of source code must retain the above copyright notice,
  11. # this list of conditions and the following disclaimer.
  12. # * Redistributions in binary form must reproduce the above copyright notice,
  13. # this list of conditions and the following disclaimer in the documentation
  14. # and/or other materials provided with the distribution.
  15. #
  16. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
  17. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26. # POSSIBILITY OF SUCH DAMAGE.
  27. import html.entities
  28. import re
  29. from .sgml import *
  30. _cp1252 = {
  31. 128: '\u20ac', # euro sign
  32. 130: '\u201a', # single low-9 quotation mark
  33. 131: '\u0192', # latin small letter f with hook
  34. 132: '\u201e', # double low-9 quotation mark
  35. 133: '\u2026', # horizontal ellipsis
  36. 134: '\u2020', # dagger
  37. 135: '\u2021', # double dagger
  38. 136: '\u02c6', # modifier letter circumflex accent
  39. 137: '\u2030', # per mille sign
  40. 138: '\u0160', # latin capital letter s with caron
  41. 139: '\u2039', # single left-pointing angle quotation mark
  42. 140: '\u0152', # latin capital ligature oe
  43. 142: '\u017d', # latin capital letter z with caron
  44. 145: '\u2018', # left single quotation mark
  45. 146: '\u2019', # right single quotation mark
  46. 147: '\u201c', # left double quotation mark
  47. 148: '\u201d', # right double quotation mark
  48. 149: '\u2022', # bullet
  49. 150: '\u2013', # en dash
  50. 151: '\u2014', # em dash
  51. 152: '\u02dc', # small tilde
  52. 153: '\u2122', # trade mark sign
  53. 154: '\u0161', # latin small letter s with caron
  54. 155: '\u203a', # single right-pointing angle quotation mark
  55. 156: '\u0153', # latin small ligature oe
  56. 158: '\u017e', # latin small letter z with caron
  57. 159: '\u0178', # latin capital letter y with diaeresis
  58. }
  59. class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
  60. special = re.compile("""[<>'"]""")
  61. bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
  62. elements_no_end_tag = {
  63. 'area',
  64. 'base',
  65. 'basefont',
  66. 'br',
  67. 'col',
  68. 'command',
  69. 'embed',
  70. 'frame',
  71. 'hr',
  72. 'img',
  73. 'input',
  74. 'isindex',
  75. 'keygen',
  76. 'link',
  77. 'meta',
  78. 'param',
  79. 'source',
  80. 'track',
  81. 'wbr',
  82. }
  83. def __init__(self, encoding=None, _type='application/xhtml+xml'):
  84. if encoding:
  85. self.encoding = encoding
  86. self._type = _type
  87. self.pieces = []
  88. super(_BaseHTMLProcessor, self).__init__()
  89. def reset(self):
  90. self.pieces = []
  91. super(_BaseHTMLProcessor, self).reset()
  92. def _shorttag_replace(self, match):
  93. """
  94. :type match: Match[str]
  95. :rtype: str
  96. """
  97. tag = match.group(1)
  98. if tag in self.elements_no_end_tag:
  99. return '<' + tag + ' />'
  100. else:
  101. return '<' + tag + '></' + tag + '>'
  102. # By declaring these methods and overriding their compiled code
  103. # with the code from sgmllib, the original code will execute in
  104. # feedparser's scope instead of sgmllib's. This means that the
  105. # `tagfind` and `charref` regular expressions will be found as
  106. # they're declared above, not as they're declared in sgmllib.
  107. def goahead(self, i):
  108. raise NotImplementedError
  109. # Replace goahead with SGMLParser's goahead() code object.
  110. try:
  111. goahead.__code__ = sgmllib.SGMLParser.goahead.__code__
  112. except AttributeError:
  113. # Python 2
  114. # noinspection PyUnresolvedReferences
  115. goahead.func_code = sgmllib.SGMLParser.goahead.func_code
  116. def __parse_starttag(self, i):
  117. raise NotImplementedError
  118. # Replace __parse_starttag with SGMLParser's parse_starttag() code object.
  119. try:
  120. __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
  121. except AttributeError:
  122. # Python 2
  123. # noinspection PyUnresolvedReferences
  124. __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
  125. def parse_starttag(self, i):
  126. j = self.__parse_starttag(i)
  127. if self._type == 'application/xhtml+xml':
  128. if j > 2 and self.rawdata[j-2:j] == '/>':
  129. self.unknown_endtag(self.lasttag)
  130. return j
  131. def feed(self, data):
  132. """
  133. :type data: str
  134. :rtype: None
  135. """
  136. data = re.sub(r'<!((?!DOCTYPE|--|\[))', r'&lt;!\1', data, re.IGNORECASE)
  137. data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
  138. data = data.replace('&#39;', "'")
  139. data = data.replace('&#34;', '"')
  140. super(_BaseHTMLProcessor, self).feed(data)
  141. super(_BaseHTMLProcessor, self).close()
  142. @staticmethod
  143. def normalize_attrs(attrs):
  144. """
  145. :type attrs: List[Tuple[str, str]]
  146. :rtype: List[Tuple[str, str]]
  147. """
  148. if not attrs:
  149. return attrs
  150. # utility method to be called by descendants
  151. # Collapse any duplicate attribute names and values by converting
  152. # *attrs* into a dictionary, then convert it back to a list.
  153. attrs_d = {k.lower(): v for k, v in attrs}
  154. attrs = [
  155. (k, k in ('rel', 'type') and v.lower() or v)
  156. for k, v in attrs_d.items()
  157. ]
  158. attrs.sort()
  159. return attrs
  160. def unknown_starttag(self, tag, attrs):
  161. """
  162. :type tag: str
  163. :type attrs: List[Tuple[str, str]]
  164. :rtype: None
  165. """
  166. # Called for each start tag
  167. # attrs is a list of (attr, value) tuples
  168. # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
  169. uattrs = []
  170. strattrs = ''
  171. if attrs:
  172. for key, value in attrs:
  173. value = value.replace('>', '&gt;')
  174. value = value.replace('<', '&lt;')
  175. value = value.replace('"', '&quot;')
  176. value = self.bare_ampersand.sub("&amp;", value)
  177. uattrs.append((key, value))
  178. strattrs = ''.join(
  179. ' %s="%s"' % (key, value)
  180. for key, value in uattrs
  181. )
  182. if tag in self.elements_no_end_tag:
  183. self.pieces.append('<%s%s />' % (tag, strattrs))
  184. else:
  185. self.pieces.append('<%s%s>' % (tag, strattrs))
  186. def unknown_endtag(self, tag):
  187. """
  188. :type tag: str
  189. :rtype: None
  190. """
  191. # Called for each end tag, e.g. for </pre>, tag will be 'pre'
  192. # Reconstruct the original end tag.
  193. if tag not in self.elements_no_end_tag:
  194. self.pieces.append("</%s>" % tag)
  195. def handle_charref(self, ref):
  196. """
  197. :type ref: str
  198. :rtype: None
  199. """
  200. # Called for each character reference, e.g. '&#160;' will extract '160'
  201. # Reconstruct the original character reference.
  202. ref = ref.lower()
  203. if ref.startswith('x'):
  204. value = int(ref[1:], 16)
  205. else:
  206. value = int(ref)
  207. if value in _cp1252:
  208. self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
  209. else:
  210. self.pieces.append('&#%s;' % ref)
  211. def handle_entityref(self, ref):
  212. """
  213. :type ref: str
  214. :rtype: None
  215. """
  216. # Called for each entity reference, e.g. '&copy;' will extract 'copy'
  217. # Reconstruct the original entity reference.
  218. if ref in html.entities.name2codepoint or ref == 'apos':
  219. self.pieces.append('&%s;' % ref)
  220. else:
  221. self.pieces.append('&amp;%s' % ref)
  222. def handle_data(self, text):
  223. """
  224. :type text: str
  225. :rtype: None
  226. """
  227. # called for each block of plain text, i.e. outside of any tag and
  228. # not containing any character or entity references
  229. # Store the original text verbatim.
  230. self.pieces.append(text)
  231. def handle_comment(self, text):
  232. """
  233. :type text: str
  234. :rtype: None
  235. """
  236. # Called for HTML comments, e.g. <!-- insert Javascript code here -->
  237. # Reconstruct the original comment.
  238. self.pieces.append('<!--%s-->' % text)
  239. def handle_pi(self, text):
  240. """
  241. :type text: str
  242. :rtype: None
  243. """
  244. # Called for each processing instruction, e.g. <?instruction>
  245. # Reconstruct original processing instruction.
  246. self.pieces.append('<?%s>' % text)
  247. def handle_decl(self, text):
  248. """
  249. :type text: str
  250. :rtype: None
  251. """
  252. # called for the DOCTYPE, if present, e.g.
  253. # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
  254. # "http://www.w3.org/TR/html4/loose.dtd">
  255. # Reconstruct original DOCTYPE
  256. self.pieces.append('<!%s>' % text)
  257. _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
  258. def _scan_name(self, i, declstartpos):
  259. """
  260. :type i: int
  261. :type declstartpos: int
  262. :rtype: Tuple[Optional[str], int]
  263. """
  264. rawdata = self.rawdata
  265. n = len(rawdata)
  266. if i == n:
  267. return None, -1
  268. m = self._new_declname_match(rawdata, i)
  269. if m:
  270. s = m.group()
  271. name = s.strip()
  272. if (i + len(s)) == n:
  273. return None, -1 # end of buffer
  274. return name.lower(), m.end()
  275. else:
  276. self.handle_data(rawdata)
  277. # self.updatepos(declstartpos, i)
  278. return None, -1
  279. @staticmethod
  280. def convert_charref(name):
  281. """
  282. :type name: str
  283. :rtype: str
  284. """
  285. return '&#%s;' % name
  286. @staticmethod
  287. def convert_entityref(name):
  288. """
  289. :type name: str
  290. :rtype: str
  291. """
  292. return '&%s;' % name
  293. def output(self):
  294. """Return processed HTML as a single string.
  295. :rtype: str
  296. """
  297. return ''.join(self.pieces)
  298. def parse_declaration(self, i):
  299. """
  300. :type i: int
  301. :rtype: int
  302. """
  303. try:
  304. return sgmllib.SGMLParser.parse_declaration(self, i)
  305. except sgmllib.SGMLParseError:
  306. # Escape the doctype declaration and continue parsing.
  307. self.handle_data('&lt;')
  308. return i+1