| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506 |
- # Support for the Atom, RSS, RDF, and CDF feed formats
- # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
- # Copyright 2002-2008 Mark Pilgrim
- # All rights reserved.
- #
- # This file is a part of feedparser.
- #
- # Redistribution and use in source and binary forms, with or without modification,
- # are permitted provided that the following conditions are met:
- #
- # * Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright notice,
- # this list of conditions and the following disclaimer in the documentation
- # and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
- # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- # POSSIBILITY OF SUCH DAMAGE.
- import copy
- from ..datetimes import _parse_date
- from ..urls import make_safe_absolute_uri
- from ..util import FeedParserDict
- class Namespace(object):
- """Support for the Atom, RSS, RDF, and CDF feed formats.
- The feed formats all share common elements, some of which have conflicting
- interpretations. For simplicity, all of the base feed format support is
- collected here.
- """
- supported_namespaces = {
- '': '',
- 'http://backend.userland.com/rss': '',
- 'http://blogs.law.harvard.edu/tech/rss': '',
- 'http://purl.org/rss/1.0/': '',
- 'http://my.netscape.com/rdf/simple/0.9/': '',
- 'http://example.com/newformat#': '',
- 'http://example.com/necho': '',
- 'http://purl.org/echo/': '',
- 'uri/of/echo/namespace#': '',
- 'http://purl.org/pie/': '',
- 'http://purl.org/atom/ns#': '',
- 'http://www.w3.org/2005/Atom': '',
- 'http://purl.org/rss/1.0/modules/rss091#': '',
- }
- def _start_rss(self, attrs_d):
- versionmap = {
- '0.91': 'rss091u',
- '0.92': 'rss092',
- '0.93': 'rss093',
- '0.94': 'rss094',
- }
- # If we're here then this is an RSS feed.
- # If we don't have a version or have a version that starts with something
- # other than RSS then there's been a mistake. Correct it.
- if not self.version or not self.version.startswith('rss'):
- attr_version = attrs_d.get('version', '')
- version = versionmap.get(attr_version)
- if version:
- self.version = version
- elif attr_version.startswith('2.'):
- self.version = 'rss20'
- else:
- self.version = 'rss'
- def _start_channel(self, attrs_d):
- self.infeed = 1
- self._cdf_common(attrs_d)
- def _cdf_common(self, attrs_d):
- if 'lastmod' in attrs_d:
- self._start_modified({})
- self.elementstack[-1][-1] = attrs_d['lastmod']
- self._end_modified()
- if 'href' in attrs_d:
- self._start_link({})
- self.elementstack[-1][-1] = attrs_d['href']
- self._end_link()
- def _start_feed(self, attrs_d):
- self.infeed = 1
- versionmap = {'0.1': 'atom01',
- '0.2': 'atom02',
- '0.3': 'atom03'}
- if not self.version:
- attr_version = attrs_d.get('version')
- version = versionmap.get(attr_version)
- if version:
- self.version = version
- else:
- self.version = 'atom'
- def _end_channel(self):
- self.infeed = 0
- _end_feed = _end_channel
- def _start_image(self, attrs_d):
- context = self._get_context()
- if not self.inentry:
- context.setdefault('image', FeedParserDict())
- self.inimage = 1
- self.title_depth = -1
- self.push('image', 0)
- def _end_image(self):
- self.pop('image')
- self.inimage = 0
- def _start_textinput(self, attrs_d):
- context = self._get_context()
- context.setdefault('textinput', FeedParserDict())
- self.intextinput = 1
- self.title_depth = -1
- self.push('textinput', 0)
- _start_textInput = _start_textinput
- def _end_textinput(self):
- self.pop('textinput')
- self.intextinput = 0
- _end_textInput = _end_textinput
- def _start_author(self, attrs_d):
- self.inauthor = 1
- self.push('author', 1)
- # Append a new FeedParserDict when expecting an author
- context = self._get_context()
- context.setdefault('authors', [])
- context['authors'].append(FeedParserDict())
- _start_managingeditor = _start_author
- def _end_author(self):
- self.pop('author')
- self.inauthor = 0
- self._sync_author_detail()
- _end_managingeditor = _end_author
- def _start_contributor(self, attrs_d):
- self.incontributor = 1
- context = self._get_context()
- context.setdefault('contributors', [])
- context['contributors'].append(FeedParserDict())
- self.push('contributor', 0)
- def _end_contributor(self):
- self.pop('contributor')
- self.incontributor = 0
- def _start_name(self, attrs_d):
- self.push('name', 0)
- def _end_name(self):
- value = self.pop('name')
- if self.inpublisher:
- self._save_author('name', value, 'publisher')
- elif self.inauthor:
- self._save_author('name', value)
- elif self.incontributor:
- self._save_contributor('name', value)
- elif self.intextinput:
- context = self._get_context()
- context['name'] = value
- def _start_width(self, attrs_d):
- self.push('width', 0)
- def _end_width(self):
- value = self.pop('width')
- try:
- value = int(value)
- except ValueError:
- value = 0
- if self.inimage:
- context = self._get_context()
- context['width'] = value
- def _start_height(self, attrs_d):
- self.push('height', 0)
- def _end_height(self):
- value = self.pop('height')
- try:
- value = int(value)
- except ValueError:
- value = 0
- if self.inimage:
- context = self._get_context()
- context['height'] = value
- def _start_url(self, attrs_d):
- self.push('href', 1)
- _start_homepage = _start_url
- _start_uri = _start_url
- def _end_url(self):
- value = self.pop('href')
- if self.inauthor:
- self._save_author('href', value)
- elif self.incontributor:
- self._save_contributor('href', value)
- _end_homepage = _end_url
- _end_uri = _end_url
- def _start_email(self, attrs_d):
- self.push('email', 0)
- def _end_email(self):
- value = self.pop('email')
- if self.inpublisher:
- self._save_author('email', value, 'publisher')
- elif self.inauthor:
- self._save_author('email', value)
- elif self.incontributor:
- self._save_contributor('email', value)
- def _start_subtitle(self, attrs_d):
- self.push_content('subtitle', attrs_d, 'text/plain', 1)
- _start_tagline = _start_subtitle
- def _end_subtitle(self):
- self.pop_content('subtitle')
- _end_tagline = _end_subtitle
- def _start_rights(self, attrs_d):
- self.push_content('rights', attrs_d, 'text/plain', 1)
- _start_copyright = _start_rights
- def _end_rights(self):
- self.pop_content('rights')
- _end_copyright = _end_rights
- def _start_item(self, attrs_d):
- self.entries.append(FeedParserDict())
- self.push('item', 0)
- self.inentry = 1
- self.guidislink = 0
- self.title_depth = -1
- id = self._get_attribute(attrs_d, 'rdf:about')
- if id:
- context = self._get_context()
- context['id'] = id
- self._cdf_common(attrs_d)
- _start_entry = _start_item
- def _end_item(self):
- self.pop('item')
- self.inentry = 0
- self.hasContent = 0
- _end_entry = _end_item
- def _start_language(self, attrs_d):
- self.push('language', 1)
- def _end_language(self):
- self.lang = self.pop('language')
- def _start_webmaster(self, attrs_d):
- self.push('publisher', 1)
- def _end_webmaster(self):
- self.pop('publisher')
- self._sync_author_detail('publisher')
- def _start_published(self, attrs_d):
- self.push('published', 1)
- _start_issued = _start_published
- _start_pubdate = _start_published
- def _end_published(self):
- value = self.pop('published')
- self._save('published_parsed', _parse_date(value), overwrite=True)
- _end_issued = _end_published
- _end_pubdate = _end_published
- def _start_updated(self, attrs_d):
- self.push('updated', 1)
- _start_modified = _start_updated
- _start_lastbuilddate = _start_updated
- def _end_updated(self):
- value = self.pop('updated')
- parsed_value = _parse_date(value)
- self._save('updated_parsed', parsed_value, overwrite=True)
- _end_modified = _end_updated
- _end_lastbuilddate = _end_updated
- def _start_created(self, attrs_d):
- self.push('created', 1)
- def _end_created(self):
- value = self.pop('created')
- self._save('created_parsed', _parse_date(value), overwrite=True)
- def _start_expirationdate(self, attrs_d):
- self.push('expired', 1)
- def _end_expirationdate(self):
- self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
- def _start_category(self, attrs_d):
- term = attrs_d.get('term')
- scheme = attrs_d.get('scheme', attrs_d.get('domain'))
- label = attrs_d.get('label')
- self._add_tag(term, scheme, label)
- self.push('category', 1)
- _start_keywords = _start_category
- def _end_category(self):
- value = self.pop('category')
- if not value:
- return
- context = self._get_context()
- tags = context['tags']
- if value and len(tags) and not tags[-1]['term']:
- tags[-1]['term'] = value
- else:
- self._add_tag(value, None, None)
- _end_keywords = _end_category
- def _start_cloud(self, attrs_d):
- self._get_context()['cloud'] = FeedParserDict(attrs_d)
- def _start_link(self, attrs_d):
- attrs_d.setdefault('rel', 'alternate')
- if attrs_d['rel'] == 'self':
- attrs_d.setdefault('type', 'application/atom+xml')
- else:
- attrs_d.setdefault('type', 'text/html')
- context = self._get_context()
- attrs_d = self._enforce_href(attrs_d)
- if 'href' in attrs_d:
- attrs_d['href'] = self.resolve_uri(attrs_d['href'])
- expecting_text = self.infeed or self.inentry or self.insource
- context.setdefault('links', [])
- if not (self.inentry and self.inimage):
- context['links'].append(FeedParserDict(attrs_d))
- if 'href' in attrs_d:
- if (
- attrs_d.get('rel') == 'alternate'
- and self.map_content_type(attrs_d.get('type')) in self.html_types
- ):
- context['link'] = attrs_d['href']
- else:
- self.push('link', expecting_text)
- def _end_link(self):
- self.pop('link')
- def _start_guid(self, attrs_d):
- self.guidislink = (attrs_d.get('ispermalink', 'true') == 'true')
- self.push('id', 1)
- _start_id = _start_guid
- def _end_guid(self):
- value = self.pop('id')
- self._save('guidislink', self.guidislink and 'link' not in self._get_context())
- if self.guidislink:
- # guid acts as link, but only if 'ispermalink' is not present or is 'true',
- # and only if the item doesn't already have a link element
- self._save('link', value)
- _end_id = _end_guid
- def _start_title(self, attrs_d):
- if self.svgOK:
- return self.unknown_starttag('title', list(attrs_d.items()))
- self.push_content('title', attrs_d, 'text/plain', self.infeed or self.inentry or self.insource)
- def _end_title(self):
- if self.svgOK:
- return
- value = self.pop_content('title')
- if not value:
- return
- self.title_depth = self.depth
- def _start_description(self, attrs_d):
- context = self._get_context()
- if 'summary' in context and not self.hasContent:
- self._summaryKey = 'content'
- self._start_content(attrs_d)
- else:
- self.push_content('description', attrs_d, 'text/html', self.infeed or self.inentry or self.insource)
- def _start_abstract(self, attrs_d):
- self.push_content('description', attrs_d, 'text/plain', self.infeed or self.inentry or self.insource)
- def _end_description(self):
- if self._summaryKey == 'content':
- self._end_content()
- else:
- self.pop_content('description')
- self._summaryKey = None
- _end_abstract = _end_description
- def _start_info(self, attrs_d):
- self.push_content('info', attrs_d, 'text/plain', 1)
- _start_feedburner_browserfriendly = _start_info
- def _end_info(self):
- self.pop_content('info')
- _end_feedburner_browserfriendly = _end_info
- def _start_generator(self, attrs_d):
- if attrs_d:
- attrs_d = self._enforce_href(attrs_d)
- if 'href' in attrs_d:
- attrs_d['href'] = self.resolve_uri(attrs_d['href'])
- self._get_context()['generator_detail'] = FeedParserDict(attrs_d)
- self.push('generator', 1)
- def _end_generator(self):
- value = self.pop('generator')
- context = self._get_context()
- if 'generator_detail' in context:
- context['generator_detail']['name'] = value
- def _start_summary(self, attrs_d):
- context = self._get_context()
- if 'summary' in context and not self.hasContent:
- self._summaryKey = 'content'
- self._start_content(attrs_d)
- else:
- self._summaryKey = 'summary'
- self.push_content(self._summaryKey, attrs_d, 'text/plain', 1)
- def _end_summary(self):
- if self._summaryKey == 'content':
- self._end_content()
- else:
- self.pop_content(self._summaryKey or 'summary')
- self._summaryKey = None
- def _start_enclosure(self, attrs_d):
- attrs_d = self._enforce_href(attrs_d)
- context = self._get_context()
- attrs_d['rel'] = 'enclosure'
- context.setdefault('links', []).append(FeedParserDict(attrs_d))
- def _start_source(self, attrs_d):
- if 'url' in attrs_d:
- # This means that we're processing a source element from an RSS 2.0 feed
- self.sourcedata['href'] = attrs_d['url']
- self.push('source', 1)
- self.insource = 1
- self.title_depth = -1
- def _end_source(self):
- self.insource = 0
- value = self.pop('source')
- if value:
- self.sourcedata['title'] = value
- self._get_context()['source'] = copy.deepcopy(self.sourcedata)
- self.sourcedata.clear()
- def _start_content(self, attrs_d):
- self.hasContent = 1
- self.push_content('content', attrs_d, 'text/plain', 1)
- src = attrs_d.get('src')
- if src:
- self.contentparams['src'] = src
- self.push('content', 1)
- def _start_body(self, attrs_d):
- self.push_content('content', attrs_d, 'application/xhtml+xml', 1)
- _start_xhtml_body = _start_body
- def _start_content_encoded(self, attrs_d):
- self.hasContent = 1
- self.push_content('content', attrs_d, 'text/html', 1)
- _start_fullitem = _start_content_encoded
- def _end_content(self):
- copyToSummary = self.map_content_type(self.contentparams.get('type')) in ({'text/plain'} | self.html_types)
- value = self.pop_content('content')
- if copyToSummary:
- self._save('summary', value)
- _end_body = _end_content
- _end_xhtml_body = _end_content
- _end_content_encoded = _end_content
- _end_fullitem = _end_content
- def _start_newlocation(self, attrs_d):
- self.push('newlocation', 1)
- def _end_newlocation(self):
- url = self.pop('newlocation')
- context = self._get_context()
- # don't set newlocation if the context isn't right
- if context is not self.feeddata:
- return
- context['newlocation'] = make_safe_absolute_uri(self.baseuri, url.strip())
|