# Support for the Atom, RSS, RDF, and CDF feed formats
# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import copy

from ..datetimes import _parse_date
from ..urls import make_safe_absolute_uri
from ..util import FeedParserDict
  32. class Namespace(object):
  33. """Support for the Atom, RSS, RDF, and CDF feed formats.
  34. The feed formats all share common elements, some of which have conflicting
  35. interpretations. For simplicity, all of the base feed format support is
  36. collected here.
  37. """
  38. supported_namespaces = {
  39. '': '',
  40. 'http://backend.userland.com/rss': '',
  41. 'http://blogs.law.harvard.edu/tech/rss': '',
  42. 'http://purl.org/rss/1.0/': '',
  43. 'http://my.netscape.com/rdf/simple/0.9/': '',
  44. 'http://example.com/newformat#': '',
  45. 'http://example.com/necho': '',
  46. 'http://purl.org/echo/': '',
  47. 'uri/of/echo/namespace#': '',
  48. 'http://purl.org/pie/': '',
  49. 'http://purl.org/atom/ns#': '',
  50. 'http://www.w3.org/2005/Atom': '',
  51. 'http://purl.org/rss/1.0/modules/rss091#': '',
  52. }
  53. def _start_rss(self, attrs_d):
  54. versionmap = {
  55. '0.91': 'rss091u',
  56. '0.92': 'rss092',
  57. '0.93': 'rss093',
  58. '0.94': 'rss094',
  59. }
  60. # If we're here then this is an RSS feed.
  61. # If we don't have a version or have a version that starts with something
  62. # other than RSS then there's been a mistake. Correct it.
  63. if not self.version or not self.version.startswith('rss'):
  64. attr_version = attrs_d.get('version', '')
  65. version = versionmap.get(attr_version)
  66. if version:
  67. self.version = version
  68. elif attr_version.startswith('2.'):
  69. self.version = 'rss20'
  70. else:
  71. self.version = 'rss'
  72. def _start_channel(self, attrs_d):
  73. self.infeed = 1
  74. self._cdf_common(attrs_d)
  75. def _cdf_common(self, attrs_d):
  76. if 'lastmod' in attrs_d:
  77. self._start_modified({})
  78. self.elementstack[-1][-1] = attrs_d['lastmod']
  79. self._end_modified()
  80. if 'href' in attrs_d:
  81. self._start_link({})
  82. self.elementstack[-1][-1] = attrs_d['href']
  83. self._end_link()
  84. def _start_feed(self, attrs_d):
  85. self.infeed = 1
  86. versionmap = {'0.1': 'atom01',
  87. '0.2': 'atom02',
  88. '0.3': 'atom03'}
  89. if not self.version:
  90. attr_version = attrs_d.get('version')
  91. version = versionmap.get(attr_version)
  92. if version:
  93. self.version = version
  94. else:
  95. self.version = 'atom'
  96. def _end_channel(self):
  97. self.infeed = 0
  98. _end_feed = _end_channel
  99. def _start_image(self, attrs_d):
  100. context = self._get_context()
  101. if not self.inentry:
  102. context.setdefault('image', FeedParserDict())
  103. self.inimage = 1
  104. self.title_depth = -1
  105. self.push('image', 0)
  106. def _end_image(self):
  107. self.pop('image')
  108. self.inimage = 0
  109. def _start_textinput(self, attrs_d):
  110. context = self._get_context()
  111. context.setdefault('textinput', FeedParserDict())
  112. self.intextinput = 1
  113. self.title_depth = -1
  114. self.push('textinput', 0)
  115. _start_textInput = _start_textinput
  116. def _end_textinput(self):
  117. self.pop('textinput')
  118. self.intextinput = 0
  119. _end_textInput = _end_textinput
  120. def _start_author(self, attrs_d):
  121. self.inauthor = 1
  122. self.push('author', 1)
  123. # Append a new FeedParserDict when expecting an author
  124. context = self._get_context()
  125. context.setdefault('authors', [])
  126. context['authors'].append(FeedParserDict())
  127. _start_managingeditor = _start_author
  128. def _end_author(self):
  129. self.pop('author')
  130. self.inauthor = 0
  131. self._sync_author_detail()
  132. _end_managingeditor = _end_author
  133. def _start_contributor(self, attrs_d):
  134. self.incontributor = 1
  135. context = self._get_context()
  136. context.setdefault('contributors', [])
  137. context['contributors'].append(FeedParserDict())
  138. self.push('contributor', 0)
  139. def _end_contributor(self):
  140. self.pop('contributor')
  141. self.incontributor = 0
  142. def _start_name(self, attrs_d):
  143. self.push('name', 0)
  144. def _end_name(self):
  145. value = self.pop('name')
  146. if self.inpublisher:
  147. self._save_author('name', value, 'publisher')
  148. elif self.inauthor:
  149. self._save_author('name', value)
  150. elif self.incontributor:
  151. self._save_contributor('name', value)
  152. elif self.intextinput:
  153. context = self._get_context()
  154. context['name'] = value
  155. def _start_width(self, attrs_d):
  156. self.push('width', 0)
  157. def _end_width(self):
  158. value = self.pop('width')
  159. try:
  160. value = int(value)
  161. except ValueError:
  162. value = 0
  163. if self.inimage:
  164. context = self._get_context()
  165. context['width'] = value
  166. def _start_height(self, attrs_d):
  167. self.push('height', 0)
  168. def _end_height(self):
  169. value = self.pop('height')
  170. try:
  171. value = int(value)
  172. except ValueError:
  173. value = 0
  174. if self.inimage:
  175. context = self._get_context()
  176. context['height'] = value
  177. def _start_url(self, attrs_d):
  178. self.push('href', 1)
  179. _start_homepage = _start_url
  180. _start_uri = _start_url
  181. def _end_url(self):
  182. value = self.pop('href')
  183. if self.inauthor:
  184. self._save_author('href', value)
  185. elif self.incontributor:
  186. self._save_contributor('href', value)
  187. _end_homepage = _end_url
  188. _end_uri = _end_url
  189. def _start_email(self, attrs_d):
  190. self.push('email', 0)
  191. def _end_email(self):
  192. value = self.pop('email')
  193. if self.inpublisher:
  194. self._save_author('email', value, 'publisher')
  195. elif self.inauthor:
  196. self._save_author('email', value)
  197. elif self.incontributor:
  198. self._save_contributor('email', value)
  199. def _start_subtitle(self, attrs_d):
  200. self.push_content('subtitle', attrs_d, 'text/plain', 1)
  201. _start_tagline = _start_subtitle
  202. def _end_subtitle(self):
  203. self.pop_content('subtitle')
  204. _end_tagline = _end_subtitle
  205. def _start_rights(self, attrs_d):
  206. self.push_content('rights', attrs_d, 'text/plain', 1)
  207. _start_copyright = _start_rights
  208. def _end_rights(self):
  209. self.pop_content('rights')
  210. _end_copyright = _end_rights
  211. def _start_item(self, attrs_d):
  212. self.entries.append(FeedParserDict())
  213. self.push('item', 0)
  214. self.inentry = 1
  215. self.guidislink = 0
  216. self.title_depth = -1
  217. id = self._get_attribute(attrs_d, 'rdf:about')
  218. if id:
  219. context = self._get_context()
  220. context['id'] = id
  221. self._cdf_common(attrs_d)
  222. _start_entry = _start_item
  223. def _end_item(self):
  224. self.pop('item')
  225. self.inentry = 0
  226. self.hasContent = 0
  227. _end_entry = _end_item
  228. def _start_language(self, attrs_d):
  229. self.push('language', 1)
  230. def _end_language(self):
  231. self.lang = self.pop('language')
  232. def _start_webmaster(self, attrs_d):
  233. self.push('publisher', 1)
  234. def _end_webmaster(self):
  235. self.pop('publisher')
  236. self._sync_author_detail('publisher')
  237. def _start_published(self, attrs_d):
  238. self.push('published', 1)
  239. _start_issued = _start_published
  240. _start_pubdate = _start_published
  241. def _end_published(self):
  242. value = self.pop('published')
  243. self._save('published_parsed', _parse_date(value), overwrite=True)
  244. _end_issued = _end_published
  245. _end_pubdate = _end_published
  246. def _start_updated(self, attrs_d):
  247. self.push('updated', 1)
  248. _start_modified = _start_updated
  249. _start_lastbuilddate = _start_updated
  250. def _end_updated(self):
  251. value = self.pop('updated')
  252. parsed_value = _parse_date(value)
  253. self._save('updated_parsed', parsed_value, overwrite=True)
  254. _end_modified = _end_updated
  255. _end_lastbuilddate = _end_updated
  256. def _start_created(self, attrs_d):
  257. self.push('created', 1)
  258. def _end_created(self):
  259. value = self.pop('created')
  260. self._save('created_parsed', _parse_date(value), overwrite=True)
  261. def _start_expirationdate(self, attrs_d):
  262. self.push('expired', 1)
  263. def _end_expirationdate(self):
  264. self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
  265. def _start_category(self, attrs_d):
  266. term = attrs_d.get('term')
  267. scheme = attrs_d.get('scheme', attrs_d.get('domain'))
  268. label = attrs_d.get('label')
  269. self._add_tag(term, scheme, label)
  270. self.push('category', 1)
  271. _start_keywords = _start_category
  272. def _end_category(self):
  273. value = self.pop('category')
  274. if not value:
  275. return
  276. context = self._get_context()
  277. tags = context['tags']
  278. if value and len(tags) and not tags[-1]['term']:
  279. tags[-1]['term'] = value
  280. else:
  281. self._add_tag(value, None, None)
  282. _end_keywords = _end_category
  283. def _start_cloud(self, attrs_d):
  284. self._get_context()['cloud'] = FeedParserDict(attrs_d)
  285. def _start_link(self, attrs_d):
  286. attrs_d.setdefault('rel', 'alternate')
  287. if attrs_d['rel'] == 'self':
  288. attrs_d.setdefault('type', 'application/atom+xml')
  289. else:
  290. attrs_d.setdefault('type', 'text/html')
  291. context = self._get_context()
  292. attrs_d = self._enforce_href(attrs_d)
  293. if 'href' in attrs_d:
  294. attrs_d['href'] = self.resolve_uri(attrs_d['href'])
  295. expecting_text = self.infeed or self.inentry or self.insource
  296. context.setdefault('links', [])
  297. if not (self.inentry and self.inimage):
  298. context['links'].append(FeedParserDict(attrs_d))
  299. if 'href' in attrs_d:
  300. if (
  301. attrs_d.get('rel') == 'alternate'
  302. and self.map_content_type(attrs_d.get('type')) in self.html_types
  303. ):
  304. context['link'] = attrs_d['href']
  305. else:
  306. self.push('link', expecting_text)
  307. def _end_link(self):
  308. self.pop('link')
  309. def _start_guid(self, attrs_d):
  310. self.guidislink = (attrs_d.get('ispermalink', 'true') == 'true')
  311. self.push('id', 1)
  312. _start_id = _start_guid
  313. def _end_guid(self):
  314. value = self.pop('id')
  315. self._save('guidislink', self.guidislink and 'link' not in self._get_context())
  316. if self.guidislink:
  317. # guid acts as link, but only if 'ispermalink' is not present or is 'true',
  318. # and only if the item doesn't already have a link element
  319. self._save('link', value)
  320. _end_id = _end_guid
  321. def _start_title(self, attrs_d):
  322. if self.svgOK:
  323. return self.unknown_starttag('title', list(attrs_d.items()))
  324. self.push_content('title', attrs_d, 'text/plain', self.infeed or self.inentry or self.insource)
  325. def _end_title(self):
  326. if self.svgOK:
  327. return
  328. value = self.pop_content('title')
  329. if not value:
  330. return
  331. self.title_depth = self.depth
  332. def _start_description(self, attrs_d):
  333. context = self._get_context()
  334. if 'summary' in context and not self.hasContent:
  335. self._summaryKey = 'content'
  336. self._start_content(attrs_d)
  337. else:
  338. self.push_content('description', attrs_d, 'text/html', self.infeed or self.inentry or self.insource)
  339. def _start_abstract(self, attrs_d):
  340. self.push_content('description', attrs_d, 'text/plain', self.infeed or self.inentry or self.insource)
  341. def _end_description(self):
  342. if self._summaryKey == 'content':
  343. self._end_content()
  344. else:
  345. self.pop_content('description')
  346. self._summaryKey = None
  347. _end_abstract = _end_description
  348. def _start_info(self, attrs_d):
  349. self.push_content('info', attrs_d, 'text/plain', 1)
  350. _start_feedburner_browserfriendly = _start_info
  351. def _end_info(self):
  352. self.pop_content('info')
  353. _end_feedburner_browserfriendly = _end_info
  354. def _start_generator(self, attrs_d):
  355. if attrs_d:
  356. attrs_d = self._enforce_href(attrs_d)
  357. if 'href' in attrs_d:
  358. attrs_d['href'] = self.resolve_uri(attrs_d['href'])
  359. self._get_context()['generator_detail'] = FeedParserDict(attrs_d)
  360. self.push('generator', 1)
  361. def _end_generator(self):
  362. value = self.pop('generator')
  363. context = self._get_context()
  364. if 'generator_detail' in context:
  365. context['generator_detail']['name'] = value
  366. def _start_summary(self, attrs_d):
  367. context = self._get_context()
  368. if 'summary' in context and not self.hasContent:
  369. self._summaryKey = 'content'
  370. self._start_content(attrs_d)
  371. else:
  372. self._summaryKey = 'summary'
  373. self.push_content(self._summaryKey, attrs_d, 'text/plain', 1)
  374. def _end_summary(self):
  375. if self._summaryKey == 'content':
  376. self._end_content()
  377. else:
  378. self.pop_content(self._summaryKey or 'summary')
  379. self._summaryKey = None
  380. def _start_enclosure(self, attrs_d):
  381. attrs_d = self._enforce_href(attrs_d)
  382. context = self._get_context()
  383. attrs_d['rel'] = 'enclosure'
  384. context.setdefault('links', []).append(FeedParserDict(attrs_d))
  385. def _start_source(self, attrs_d):
  386. if 'url' in attrs_d:
  387. # This means that we're processing a source element from an RSS 2.0 feed
  388. self.sourcedata['href'] = attrs_d['url']
  389. self.push('source', 1)
  390. self.insource = 1
  391. self.title_depth = -1
  392. def _end_source(self):
  393. self.insource = 0
  394. value = self.pop('source')
  395. if value:
  396. self.sourcedata['title'] = value
  397. self._get_context()['source'] = copy.deepcopy(self.sourcedata)
  398. self.sourcedata.clear()
  399. def _start_content(self, attrs_d):
  400. self.hasContent = 1
  401. self.push_content('content', attrs_d, 'text/plain', 1)
  402. src = attrs_d.get('src')
  403. if src:
  404. self.contentparams['src'] = src
  405. self.push('content', 1)
  406. def _start_body(self, attrs_d):
  407. self.push_content('content', attrs_d, 'application/xhtml+xml', 1)
  408. _start_xhtml_body = _start_body
  409. def _start_content_encoded(self, attrs_d):
  410. self.hasContent = 1
  411. self.push_content('content', attrs_d, 'text/html', 1)
  412. _start_fullitem = _start_content_encoded
  413. def _end_content(self):
  414. copyToSummary = self.map_content_type(self.contentparams.get('type')) in ({'text/plain'} | self.html_types)
  415. value = self.pop_content('content')
  416. if copyToSummary:
  417. self._save('summary', value)
  418. _end_body = _end_content
  419. _end_xhtml_body = _end_content
  420. _end_content_encoded = _end_content
  421. _end_fullitem = _end_content
  422. def _start_newlocation(self, attrs_d):
  423. self.push('newlocation', 1)
  424. def _end_newlocation(self):
  425. url = self.pop('newlocation')
  426. context = self._get_context()
  427. # don't set newlocation if the context isn't right
  428. if context is not self.feeddata:
  429. return
  430. context['newlocation'] = make_safe_absolute_uri(self.baseuri, url.strip())