| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277 |
- # The public API for feedparser
- # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
- # Copyright 2002-2008 Mark Pilgrim
- # All rights reserved.
- #
- # This file is a part of feedparser.
- #
- # Redistribution and use in source and binary forms, with or without modification,
- # are permitted provided that the following conditions are met:
- #
- # * Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright notice,
- # this list of conditions and the following disclaimer in the documentation
- # and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
- # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- # POSSIBILITY OF SUCH DAMAGE.
- import io
- import urllib.error
- import urllib.parse
- import xml.sax
- from .datetimes import registerDateHandler, _parse_date
- from .encodings import convert_to_utf8
- from .exceptions import *
- from .html import _BaseHTMLProcessor
- from . import http
- from . import mixin
- from .mixin import _FeedParserMixin
- from .parsers.loose import _LooseFeedParser
- from .parsers.strict import _StrictFeedParser
- from .sanitizer import replace_doctype
- from .sgml import *
- from .urls import convert_to_idn, make_safe_absolute_uri
- from .util import FeedParserDict
# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# Checked in parse() before attempting strict (SAX) parsing. Hard-coded True
# here; NOTE(review): presumably a switch kept for builds/environments without
# a working SAX parser — confirm before removing.
_XML_AVAILABLE = True

# Maps the internal version codes assigned by the parser mixins to
# human-readable feed format names. parse() exposes the code (the key) in
# result['version']; the empty string means the format could not be detected.
SUPPORTED_VERSIONS = {
    '': 'unknown',
    'rss090': 'RSS 0.90',
    'rss091n': 'RSS 0.91 (Netscape)',
    'rss091u': 'RSS 0.91 (Userland)',
    'rss092': 'RSS 0.92',
    'rss093': 'RSS 0.93',
    'rss094': 'RSS 0.94',
    'rss20': 'RSS 2.0',
    'rss10': 'RSS 1.0',
    'rss': 'RSS (unknown version)',
    'atom01': 'Atom 0.1',
    'atom02': 'Atom 0.2',
    'atom03': 'Atom 0.3',
    'atom10': 'Atom 1.0',
    'atom': 'Atom (unknown version)',
    'cdf': 'CDF',
}
- def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result):
- """URL, filename, or string --> stream
- This function lets you define parsers that take any input source
- (URL, pathname to local or network file, or actual data as a string)
- and deal with it in a uniform manner. Returned object is guaranteed
- to have all the basic stdio read methods (read, readline, readlines).
- Just .close() the object when you're done with it.
- If the etag argument is supplied, it will be used as the value of an
- If-None-Match request header.
- If the modified argument is supplied, it can be a tuple of 9 integers
- (as returned by gmtime() in the standard Python time module) or a date
- string in any format supported by feedparser. Regardless, it MUST
- be in GMT (Greenwich Mean Time). It will be reformatted into an
- RFC 1123-compliant date and used as the value of an If-Modified-Since
- request header.
- If the agent argument is supplied, it will be used as the value of a
- User-Agent request header.
- If the referrer argument is supplied, it will be used as the value of a
- Referer[sic] request header.
- If handlers is supplied, it is a list of handlers used to build a
- urllib2 opener.
- if request_headers is supplied it is a dictionary of HTTP request headers
- that will override the values generated by FeedParser.
- :return: A bytes object.
- """
- if hasattr(url_file_stream_or_string, 'read'):
- return url_file_stream_or_string.read()
- if isinstance(url_file_stream_or_string, str) \
- and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
- return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
- # try to open with native open function (if url_file_stream_or_string is a filename)
- try:
- with open(url_file_stream_or_string, 'rb') as f:
- data = f.read()
- except (IOError, UnicodeEncodeError, TypeError, ValueError):
- # if url_file_stream_or_string is a str object that
- # cannot be converted to the encoding returned by
- # sys.getfilesystemencoding(), a UnicodeEncodeError
- # will be thrown
- # If url_file_stream_or_string is a string that contains NULL
- # (such as an XML document encoded in UTF-32), TypeError will
- # be thrown.
- pass
- else:
- return data
- # treat url_file_stream_or_string as string
- if not isinstance(url_file_stream_or_string, bytes):
- return url_file_stream_or_string.encode('utf-8')
- return url_file_stream_or_string
class LooseFeedParser(_LooseFeedParser, _FeedParserMixin, _BaseHTMLProcessor, object):
    """Fallback parser built on the SGML-tolerant HTML processor.

    Combines the loose-parsing behavior with the shared feed-handling mixin;
    used when strict XML parsing is unavailable or fails.
    """
class StrictFeedParser(_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object):
    """SAX content handler combining strict parsing with the shared feed mixin.

    Used as the primary parser when the document's encoding is known and a
    SAX parser is available.
    """
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
    """Parse a feed from a URL, file, stream, or string.

    :param url_file_stream_or_string:
        File-like object, URL, file path, or string. Both byte and text strings
        are accepted. If necessary, encoding will be derived from the response
        headers or automatically detected.

        Note that strings may trigger network I/O or filesystem access
        depending on the value. Wrap an untrusted string in
        a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not
        pass untrusted strings to this function.

        When a URL is not passed the feed location to use in relative URL
        resolution should be passed in the ``Content-Location`` response header
        (see ``response_headers`` below).

    :param str etag: HTTP ``ETag`` request header.
    :param modified: HTTP ``Last-Modified`` request header.
    :type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
        :class:`datetime.datetime`
    :param str agent: HTTP ``User-Agent`` request header, which defaults to
        the value of :data:`feedparser.USER_AGENT`.
    :param referrer: HTTP ``Referer`` [sic] request header.
    :param request_headers:
        A mapping of HTTP header name to HTTP header value to add to the
        request, overriding internally generated values.
    :type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
    :param response_headers:
        A mapping of HTTP header name to HTTP header value. Multiple values may
        be joined with a comma. If a HTTP request was made, these headers
        override any matching headers in the response. Otherwise this specifies
        the entirety of the response headers.
    :type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
    :param bool resolve_relative_uris:
        Should feedparser attempt to resolve relative URIs absolute ones within
        HTML content? Defaults to the value of
        :data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
    :param bool sanitize_html:
        Should feedparser skip HTML sanitization? Only disable this if you know
        what you are doing! Defaults to the value of
        :data:`feedparser.SANITIZE_HTML`, which is ``True``.

    :return: A :class:`FeedParserDict`.
    """
    # Import the package lazily, and only when a default is actually needed,
    # to avoid a circular import at module load time.
    if not agent or sanitize_html is None or resolve_relative_uris is None:
        import feedparser
    if not agent:
        agent = feedparser.USER_AGENT
    if sanitize_html is None:
        sanitize_html = feedparser.SANITIZE_HTML
    if resolve_relative_uris is None:
        resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS

    result = FeedParserDict(
        bozo=False,
        entries=[],
        feed=FeedParserDict(),
        headers={},
    )

    try:
        data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
    except urllib.error.URLError as error:
        result.update({
            'bozo': True,
            'bozo_exception': error,
        })
        return result

    # An empty payload (e.g. an HTTP 304 Not Modified) leaves nothing to parse.
    if not data:
        return result

    # overwrite existing headers using response_headers
    result['headers'].update(response_headers or {})

    data = convert_to_utf8(result['headers'], data, result)
    # Strict (SAX) parsing is only attempted when convert_to_utf8() positively
    # identified an encoding.  (Was `... and True or False`; bool() is the
    # modern spelling of the same conversion.)
    use_strict_parser = bool(result['encoding'])

    result['version'], data, entities = replace_doctype(data)

    # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
    contentloc = result['headers'].get('content-location', '')
    href = result.get('href', '')
    baseuri = make_safe_absolute_uri(href, contentloc) or make_safe_absolute_uri(contentloc) or href

    baselang = result['headers'].get('content-language', None)
    # isinstance() already excludes None, so no separate None check is needed.
    if isinstance(baselang, bytes):
        baselang = baselang.decode('utf-8', 'ignore')

    if not _XML_AVAILABLE:
        use_strict_parser = False

    if use_strict_parser:
        # Initialize the SAX parser.  (Named strict_parser rather than
        # `feedparser` so it no longer shadows the module imported above.)
        strict_parser = StrictFeedParser(baseuri, baselang, 'utf-8')
        strict_parser.resolve_relative_uris = resolve_relative_uris
        strict_parser.sanitize_html = sanitize_html
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            # Disable downloading external doctype references, if possible.
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(strict_parser)
        saxparser.setErrorHandler(strict_parser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(io.BytesIO(data))
        try:
            saxparser.parse(source)
        except xml.sax.SAXException as e:
            result['bozo'] = True
            result['bozo_exception'] = strict_parser.exc or e
            # Fall back to the loose parser below.
            use_strict_parser = False

    if use_strict_parser:
        feed_parser = strict_parser
    else:
        # The loose parser runs either because no encoding was identified or
        # because strict parsing raised a SAXException.
        feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feed_parser.resolve_relative_uris = resolve_relative_uris
        feed_parser.sanitize_html = sanitize_html
        feed_parser.feed(data.decode('utf-8', 'replace'))

    result['feed'] = feed_parser.feeddata
    result['entries'] = feed_parser.entries
    result['version'] = result['version'] or feed_parser.version
    result['namespaces'] = feed_parser.namespaces_in_use
    return result
|