http.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
  2. # Copyright 2002-2008 Mark Pilgrim
  3. # All rights reserved.
  4. #
  5. # This file is a part of feedparser.
  6. #
  7. # Redistribution and use in source and binary forms, with or without
  8. # modification, are permitted provided that the following conditions are met:
  9. #
  10. # * Redistributions of source code must retain the above copyright notice,
  11. # this list of conditions and the following disclaimer.
  12. # * Redistributions in binary form must reproduce the above copyright notice,
  13. # this list of conditions and the following disclaimer in the documentation
  14. # and/or other materials provided with the distribution.
  15. #
  16. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
  17. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26. # POSSIBILITY OF SUCH DAMAGE.
  27. import base64
  28. import datetime
  29. import gzip
  30. import io
  31. import re
  32. import struct
  33. import urllib.parse
  34. import urllib.request
  35. import zlib
  36. from .datetimes import _parse_date
  37. from .urls import convert_to_idn
# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
# XML feed types are preferred (q defaults to 1.0); generic XML is acceptable
# (q=0.9/0.2) and anything else is a last resort (*/*;q=0.1).
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
  41. class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
  42. def http_error_default(self, req, fp, code, msg, headers):
  43. # The default implementation just raises HTTPError.
  44. # Forget that.
  45. fp.status = code
  46. return fp
  47. def http_error_301(self, req, fp, code, msg, hdrs):
  48. result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs)
  49. if not result:
  50. return fp
  51. result.status = code
  52. result.newurl = result.geturl()
  53. return result
  54. # The default implementations in urllib.request.HTTPRedirectHandler
  55. # are identical, so hardcoding a http_error_301 call above
  56. # won't affect anything
  57. http_error_300 = http_error_301
  58. http_error_302 = http_error_301
  59. http_error_303 = http_error_301
  60. http_error_307 = http_error_301
  61. def http_error_401(self, req, fp, code, msg, headers):
  62. # Check if
  63. # - server requires digest auth, AND
  64. # - we tried (unsuccessfully) with basic auth, AND
  65. # If all conditions hold, parse authentication information
  66. # out of the Authorization header we sent the first time
  67. # (for the username and password) and the WWW-Authenticate
  68. # header the server sent back (for the realm) and retry
  69. # the request with the appropriate digest auth headers instead.
  70. # This evil genius hack has been brought to you by Aaron Swartz.
  71. host = urllib.parse.urlparse(req.get_full_url())[1]
  72. if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers:
  73. return self.http_error_default(req, fp, code, msg, headers)
  74. auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode()).decode()
  75. user, passw = auth.split(':')
  76. realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
  77. self.add_password(realm, host, user, passw)
  78. retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
  79. self.reset_retry_count()
  80. return retry
  81. def _build_urllib2_request(url, agent, accept_header, etag, modified, referrer, auth, request_headers):
  82. request = urllib.request.Request(url)
  83. request.add_header('User-Agent', agent)
  84. if etag:
  85. request.add_header('If-None-Match', etag)
  86. if isinstance(modified, str):
  87. modified = _parse_date(modified)
  88. elif isinstance(modified, datetime.datetime):
  89. modified = modified.utctimetuple()
  90. if modified:
  91. # format into an RFC 1123-compliant timestamp. We can't use
  92. # time.strftime() since the %a and %b directives can be affected
  93. # by the current locale, but RFC 2616 states that dates must be
  94. # in English.
  95. short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
  96. months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
  97. request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
  98. if referrer:
  99. request.add_header('Referer', referrer)
  100. request.add_header('Accept-encoding', 'gzip, deflate')
  101. if auth:
  102. request.add_header('Authorization', 'Basic %s' % auth)
  103. if accept_header:
  104. request.add_header('Accept', accept_header)
  105. # use this for whatever -- cookies, special headers, etc
  106. # [('Cookie','Something'),('x-special-header','Another Value')]
  107. for header_name, header_value in request_headers.items():
  108. request.add_header(header_name, header_value)
  109. request.add_header('A-IM', 'feed') # RFC 3229 support
  110. return request
  111. def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, result=None):
  112. if handlers is None:
  113. handlers = []
  114. elif not isinstance(handlers, list):
  115. handlers = [handlers]
  116. if request_headers is None:
  117. request_headers = {}
  118. # Deal with the feed URI scheme
  119. if url.startswith('feed:http'):
  120. url = url[5:]
  121. elif url.startswith('feed:'):
  122. url = 'http:' + url[5:]
  123. if not agent:
  124. from . import USER_AGENT
  125. agent = USER_AGENT
  126. # Test for inline user:password credentials for HTTP basic auth
  127. auth = None
  128. if not url.startswith('ftp:'):
  129. url_pieces = urllib.parse.urlparse(url)
  130. if url_pieces.username:
  131. new_pieces = list(url_pieces)
  132. new_pieces[1] = url_pieces.hostname
  133. if url_pieces.port:
  134. new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}'
  135. url = urllib.parse.urlunparse(new_pieces)
  136. auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}'.encode()).decode()
  137. # iri support
  138. if not isinstance(url, bytes):
  139. url = convert_to_idn(url)
  140. # Prevent UnicodeEncodeErrors caused by Unicode characters in the path.
  141. bits = []
  142. for c in url:
  143. try:
  144. c.encode('ascii')
  145. except UnicodeEncodeError:
  146. bits.append(urllib.parse.quote(c))
  147. else:
  148. bits.append(c)
  149. url = ''.join(bits)
  150. # try to open with urllib2 (to use optional headers)
  151. request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)
  152. opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()]))
  153. opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
  154. f = opener.open(request)
  155. data = f.read()
  156. f.close()
  157. # lowercase all of the HTTP headers for comparisons per RFC 2616
  158. result['headers'] = {k.lower(): v for k, v in f.headers.items()}
  159. # if feed is gzip-compressed, decompress it
  160. if data and 'gzip' in result['headers'].get('content-encoding', ''):
  161. try:
  162. data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
  163. except (EOFError, IOError, struct.error) as e:
  164. # IOError can occur if the gzip header is bad.
  165. # struct.error can occur if the data is damaged.
  166. result['bozo'] = True
  167. result['bozo_exception'] = e
  168. if isinstance(e, struct.error):
  169. # A gzip header was found but the data is corrupt.
  170. # Ideally, we should re-request the feed without the
  171. # 'Accept-encoding: gzip' header, but we don't.
  172. data = None
  173. elif data and 'deflate' in result['headers'].get('content-encoding', ''):
  174. try:
  175. data = zlib.decompress(data)
  176. except zlib.error:
  177. try:
  178. # The data may have no headers and no checksum.
  179. data = zlib.decompress(data, -15)
  180. except zlib.error as e:
  181. result['bozo'] = True
  182. result['bozo_exception'] = e
  183. # save HTTP headers
  184. if 'etag' in result['headers']:
  185. etag = result['headers'].get('etag', '')
  186. if isinstance(etag, bytes):
  187. etag = etag.decode('utf-8', 'ignore')
  188. if etag:
  189. result['etag'] = etag
  190. if 'last-modified' in result['headers']:
  191. modified = result['headers'].get('last-modified', '')
  192. if modified:
  193. result['modified'] = modified
  194. result['modified_parsed'] = _parse_date(modified)
  195. if isinstance(f.url, bytes):
  196. result['href'] = f.url.decode('utf-8', 'ignore')
  197. else:
  198. result['href'] = f.url
  199. result['status'] = getattr(f, 'status', None) or 200
  200. # Stop processing if the server sent HTTP 304 Not Modified.
  201. if getattr(f, 'code', 0) == 304:
  202. result['version'] = ''
  203. result['debug_message'] = 'The feed has not changed since you last checked, ' + \
  204. 'so the server sent no data. This is a feature, not a bug!'
  205. return data