urls.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
  2. # Copyright 2002-2008 Mark Pilgrim
  3. # All rights reserved.
  4. #
  5. # This file is a part of feedparser.
  6. #
  7. # Redistribution and use in source and binary forms, with or without
  8. # modification, are permitted provided that the following conditions are met:
  9. #
  10. # * Redistributions of source code must retain the above copyright notice,
  11. # this list of conditions and the following disclaimer.
  12. # * Redistributions in binary form must reproduce the above copyright notice,
  13. # this list of conditions and the following disclaimer in the documentation
  14. # and/or other materials provided with the distribution.
  15. #
  16. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
  17. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26. # POSSIBILITY OF SUCH DAMAGE.
  27. import re
  28. import urllib.parse
  29. from .html import _BaseHTMLProcessor
  30. # If you want feedparser to allow all URL schemes, set this to ()
  31. # List culled from Python's urlparse documentation at:
  32. # http://docs.python.org/library/urlparse.html
  33. # as well as from "URI scheme" at Wikipedia:
  34. # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
  35. # Many more will likely need to be added!
  36. ACCEPTABLE_URI_SCHEMES = (
  37. 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
  38. 'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
  39. 'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
  40. 'wais',
  41. # Additional common-but-unofficial schemes
  42. 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
  43. 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
  44. )
  45. _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
  46. def _urljoin(base, uri):
  47. uri = _urifixer.sub(r'\1\3', uri)
  48. try:
  49. uri = urllib.parse.urljoin(base, uri)
  50. except ValueError:
  51. uri = ''
  52. return uri
  53. def convert_to_idn(url):
  54. """Convert a URL to IDN notation"""
  55. # this function should only be called with a unicode string
  56. # strategy: if the host cannot be encoded in ascii, then
  57. # it'll be necessary to encode it in idn form
  58. parts = list(urllib.parse.urlsplit(url))
  59. try:
  60. parts[1].encode('ascii')
  61. except UnicodeEncodeError:
  62. # the url needs to be converted to idn notation
  63. host = parts[1].rsplit(':', 1)
  64. newhost = []
  65. port = ''
  66. if len(host) == 2:
  67. port = host.pop()
  68. for h in host[0].split('.'):
  69. newhost.append(h.encode('idna').decode('utf-8'))
  70. parts[1] = '.'.join(newhost)
  71. if port:
  72. parts[1] += ':' + port
  73. return urllib.parse.urlunsplit(parts)
  74. else:
  75. return url
  76. def make_safe_absolute_uri(base, rel=None):
  77. # bail if ACCEPTABLE_URI_SCHEMES is empty
  78. if not ACCEPTABLE_URI_SCHEMES:
  79. return _urljoin(base, rel or '')
  80. if not base:
  81. return rel or ''
  82. if not rel:
  83. try:
  84. scheme = urllib.parse.urlparse(base)[0]
  85. except ValueError:
  86. return ''
  87. if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
  88. return base
  89. return ''
  90. uri = _urljoin(base, rel)
  91. if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
  92. return ''
  93. return uri
  94. class RelativeURIResolver(_BaseHTMLProcessor):
  95. relative_uris = {
  96. ('a', 'href'),
  97. ('applet', 'codebase'),
  98. ('area', 'href'),
  99. ('audio', 'src'),
  100. ('blockquote', 'cite'),
  101. ('body', 'background'),
  102. ('del', 'cite'),
  103. ('form', 'action'),
  104. ('frame', 'longdesc'),
  105. ('frame', 'src'),
  106. ('iframe', 'longdesc'),
  107. ('iframe', 'src'),
  108. ('head', 'profile'),
  109. ('img', 'longdesc'),
  110. ('img', 'src'),
  111. ('img', 'usemap'),
  112. ('input', 'src'),
  113. ('input', 'usemap'),
  114. ('ins', 'cite'),
  115. ('link', 'href'),
  116. ('object', 'classid'),
  117. ('object', 'codebase'),
  118. ('object', 'data'),
  119. ('object', 'usemap'),
  120. ('q', 'cite'),
  121. ('script', 'src'),
  122. ('source', 'src'),
  123. ('video', 'poster'),
  124. ('video', 'src'),
  125. }
  126. def __init__(self, baseuri, encoding, _type):
  127. _BaseHTMLProcessor.__init__(self, encoding, _type)
  128. self.baseuri = baseuri
  129. def resolve_uri(self, uri):
  130. return make_safe_absolute_uri(self.baseuri, uri.strip())
  131. def unknown_starttag(self, tag, attrs):
  132. attrs = self.normalize_attrs(attrs)
  133. attrs = [(key, ((tag, key) in self.relative_uris) and self.resolve_uri(value) or value) for key, value in attrs]
  134. super(RelativeURIResolver, self).unknown_starttag(tag, attrs)
  135. def resolve_relative_uris(html_source, base_uri, encoding, type_):
  136. p = RelativeURIResolver(base_uri, encoding, type_)
  137. p.feed(html_source)
  138. return p.output()