# encodings.py
  1. # Character encoding routines
  2. # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
  3. # Copyright 2002-2008 Mark Pilgrim
  4. # All rights reserved.
  5. #
  6. # This file is a part of feedparser.
  7. #
  8. # Redistribution and use in source and binary forms, with or without modification,
  9. # are permitted provided that the following conditions are met:
  10. #
  11. # * Redistributions of source code must retain the above copyright notice,
  12. # this list of conditions and the following disclaimer.
  13. # * Redistributions in binary form must reproduce the above copyright notice,
  14. # this list of conditions and the following disclaimer in the documentation
  15. # and/or other materials provided with the distribution.
  16. #
  17. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
  18. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  22. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  23. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  24. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  25. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  26. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  27. # POSSIBILITY OF SUCH DAMAGE.
  28. import codecs
  29. import re
  30. import typing as t
  31. try:
  32. try:
  33. import cchardet as chardet
  34. except ImportError:
  35. import chardet
  36. except ImportError:
  37. chardet = None
  38. lazy_chardet_encoding = None
  39. else:
  40. def lazy_chardet_encoding(data):
  41. return chardet.detect(data)['encoding'] or ''
  42. from .exceptions import (
  43. CharacterEncodingOverride,
  44. CharacterEncodingUnknown,
  45. NonXMLContentType,
  46. )
  47. # Each marker represents some of the characters of the opening XML
  48. # processing instruction ('<?xm') in the specified encoding.
  49. EBCDIC_MARKER = b'\x4C\x6F\xA7\x94'
  50. UTF16BE_MARKER = b'\x00\x3C\x00\x3F'
  51. UTF16LE_MARKER = b'\x3C\x00\x3F\x00'
  52. UTF32BE_MARKER = b'\x00\x00\x00\x3C'
  53. UTF32LE_MARKER = b'\x3C\x00\x00\x00'
  54. ZERO_BYTES = '\x00\x00'
  55. # Match the opening XML declaration.
  56. # Example: <?xml version="1.0" encoding="utf-8"?>
  57. RE_XML_DECLARATION = re.compile(r'^<\?xml[^>]*?>')
  58. # Capture the value of the XML processing instruction's encoding attribute.
  59. # Example: <?xml version="1.0" encoding="utf-8"?>
  60. RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
  61. def parse_content_type(line: str) -> t.Tuple[str, str]:
  62. """Parse an HTTP Content-Type header.
  63. The return value will be a tuple of strings:
  64. the MIME type, and the value of the "charset" (if any).
  65. This is a custom replacement for Python's cgi.parse_header().
  66. The cgi module will be removed in Python 3.13.
  67. """
  68. chunks = line.split(";")
  69. if not chunks:
  70. return "", ""
  71. mime_type = chunks[0].strip()
  72. charset_value = ""
  73. for chunk in chunks[1:]:
  74. key, _, value = chunk.partition("=")
  75. if key.strip().lower() == "charset":
  76. charset_value = value.strip().strip("\"'")
  77. return mime_type, charset_value
def convert_to_utf8(http_headers: t.Dict[str, str], data: bytes, result: t.Dict[str, t.Any]) -> bytes:
    """Detect and convert the character encoding to UTF-8.

    :param http_headers: a dictionary of HTTP headers; only the
        'content-type' key is consulted (the lookup is exact, so keys are
        presumably already lower-cased — TODO confirm against callers)
    :param data: the raw document as bytes (not Unicode)
    :param result: a dictionary updated in place: 'encoding' is always set,
        and 'bozo' / 'bozo_exception' are set when a problem was detected
    :returns: the document re-encoded as UTF-8 bytes, with its XML
        declaration rewritten to declare utf-8; the original bytes are
        returned unchanged if no candidate encoding could decode them
    """

    # This is so much trickier than it sounds, it's not even funny.
    # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    # is application/xml, application/*+xml,
    # application/xml-external-parsed-entity, or application/xml-dtd,
    # the encoding given in the charset parameter of the HTTP Content-Type
    # takes precedence over the encoding given in the XML prefix within the
    # document, and defaults to 'utf-8' if neither are specified. But, if
    # the HTTP Content-Type is text/xml, text/*+xml, or
    # text/xml-external-parsed-entity, the encoding given in the XML prefix
    # within the document is ALWAYS IGNORED and only the encoding given in
    # the charset parameter of the HTTP Content-Type header should be
    # respected, and it defaults to 'us-ascii' if not specified.

    # Furthermore, discussion on the atom-syntax mailing list with the
    # author of RFC 3023 leads me to the conclusion that any document
    # served with a Content-Type of text/* and no charset parameter
    # must be treated as us-ascii. (We now do this.) And also that it
    # must always be flagged as non-well-formed. (We now do this too.)

    # If Content-Type is unspecified (input was local file or non-HTTP source)
    # or unrecognized (server just got it totally wrong), then go by the
    # encoding given in the XML prefix of the document and default to
    # 'iso-8859-1' as per the HTTP specification (RFC 2616).

    # Then, assuming we didn't find a character encoding in the HTTP headers
    # (and the HTTP Content-type allowed us to look in the body), we need
    # to sniff the first few bytes of the XML data and try to determine
    # whether the encoding is ASCII-compatible. Section F of the XML
    # specification shows the way here:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    # If the sniffed encoding is not ASCII-compatible, we need to make it
    # ASCII compatible so that we can sniff further into the XML declaration
    # to find the encoding attribute, which will tell us the true encoding.

    # Of course, none of this guarantees that we will be able to parse the
    # feed in the declared character encoding (assuming it was declared
    # correctly, which many are not). iconv_codec can help a lot;
    # you should definitely install it if you can.
    # http://cjkpython.i18n.org/

    bom_encoding = ''
    xml_encoding = ''

    # Look at the first few bytes of the document to guess what
    # its encoding may be. We only need to decode enough of the
    # document that we can use an ASCII-compatible regular
    # expression to search for an XML encoding declaration.
    # The heuristic follows the XML specification, section F:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    # Check for BOMs first.
    if data[:4] == codecs.BOM_UTF32_BE:
        bom_encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == codecs.BOM_UTF32_LE:
        bom_encoding = 'utf-32le'
        data = data[4:]
    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
        bom_encoding = 'utf-16be'
        data = data[2:]
    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
        bom_encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == codecs.BOM_UTF8:
        bom_encoding = 'utf-8'
        data = data[3:]
    # Check for the characters '<?xm' in several encodings.
    # (No bytes are stripped here: the data is still in its original
    # encoding and will be decoded below to read the XML declaration.)
    elif data[:4] == EBCDIC_MARKER:
        bom_encoding = 'cp037'
    elif data[:4] == UTF16BE_MARKER:
        bom_encoding = 'utf-16be'
    elif data[:4] == UTF16LE_MARKER:
        bom_encoding = 'utf-16le'
    elif data[:4] == UTF32BE_MARKER:
        bom_encoding = 'utf-32be'
    elif data[:4] == UTF32LE_MARKER:
        bom_encoding = 'utf-32le'

    # Re-encode a sniffed non-ASCII-compatible prefix to UTF-8 so the
    # ASCII-compatible regex below can find the declared encoding.
    tempdata = data
    try:
        if bom_encoding:
            tempdata = data.decode(bom_encoding).encode('utf-8')
    except (UnicodeDecodeError, LookupError):
        # feedparser recognizes UTF-32 encodings that aren't
        # available in Python 2.4 and 2.5, so it's possible to
        # encounter a LookupError during decoding.
        xml_encoding_match = None
    else:
        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)

    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
        # Normalize the xml_encoding if necessary.
        # A BOM is more specific than a generic family name like 'utf-16'
        # (the BOM fixes the byte order), so the BOM wins here.
        if bom_encoding and (xml_encoding in (
            'u16', 'utf-16', 'utf16', 'utf_16',
            'u32', 'utf-32', 'utf32', 'utf_32',
            'iso-10646-ucs-2', 'iso-10646-ucs-4',
            'csucs4', 'csunicode', 'ucs-2', 'ucs-4'
        )):
            xml_encoding = bom_encoding

    # Find the HTTP Content-Type and, hopefully, a character
    # encoding provided by the server. The Content-Type is used
    # to choose the "correct" encoding among the BOM encoding,
    # XML declaration encoding, and HTTP encoding, following the
    # heuristic defined in RFC 3023.
    http_content_type = http_headers.get('content-type') or ''
    http_content_type, http_encoding = parse_content_type(http_content_type)

    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd',
                                 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (
            http_content_type in application_content_types
            or (
                http_content_type.startswith('application/')
                and http_content_type.endswith('+xml')
            )
    ):
        acceptable_content_type = 1
        rfc3023_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (
            http_content_type in text_content_types
            or (
                http_content_type.startswith('text/')
                and http_content_type.endswith('+xml')
            )
    ):
        acceptable_content_type = 1
        rfc3023_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        rfc3023_encoding = http_encoding or 'us-ascii'
    elif http_headers and 'content-type' not in http_headers:
        # Headers exist but carry no Content-Type: HTTP's default (RFC 2616).
        rfc3023_encoding = xml_encoding or 'iso-8859-1'
    else:
        # No headers at all (local file / non-HTTP source), or an
        # unrecognized Content-Type.
        rfc3023_encoding = xml_encoding or 'utf-8'

    # gb18030 is a superset of gb2312, so always replace gb2312
    # with gb18030 for greater compatibility.
    if rfc3023_encoding.lower() == 'gb2312':
        rfc3023_encoding = 'gb18030'
    if xml_encoding.lower() == 'gb2312':
        xml_encoding = 'gb18030'

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    error = None

    if http_headers and (not acceptable_content_type):
        # Flag the document as bozo, but keep going: a later encoding
        # failure/override below may replace this error.
        if 'content-type' in http_headers:
            msg = '%s is not an XML media type' % http_headers['content-type']
        else:
            msg = 'no Content-type specified'
        error = NonXMLContentType(msg)

    # determine character encoding
    known_encoding = 0
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    # (lazy_chardet_encoding may be None when chardet is not installed;
    # a falsy candidate is simply skipped below).
    for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
                              lazy_chardet_encoding, 'utf-8', 'windows-1252', 'iso-8859-2'):
        if callable(proposed_encoding):
            # chardet is only invoked on demand, and only if installed.
            proposed_encoding = proposed_encoding(data)
        if not proposed_encoding:
            continue
        if proposed_encoding in tried_encodings:
            continue
        tried_encodings.append(proposed_encoding)
        try:
            data = data.decode(proposed_encoding)
        except (UnicodeDecodeError, LookupError):
            pass
        else:
            known_encoding = 1
            # Update the encoding in the opening XML processing instruction.
            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
            if RE_XML_DECLARATION.search(data):
                data = RE_XML_DECLARATION.sub(new_declaration, data)
            else:
                data = new_declaration + '\n' + data
            data = data.encode('utf-8')
            break

    # if still no luck, give up
    if not known_encoding:
        error = CharacterEncodingUnknown(
            'document encoding unknown, I tried ' +
            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
            (rfc3023_encoding, xml_encoding))
        rfc3023_encoding = ''
    elif proposed_encoding != rfc3023_encoding:
        # NOTE: `proposed_encoding` deliberately keeps the value it had when
        # the loop above broke (the encoding that actually worked).
        error = CharacterEncodingOverride(
            'document declared as %s, but parsed as %s' %
            (rfc3023_encoding, proposed_encoding))
        rfc3023_encoding = proposed_encoding

    result['encoding'] = rfc3023_encoding
    if error:
        result['bozo'] = True
        result['bozo_exception'] = error
    return data