sanitizer.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950
  1. # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
  2. # Copyright 2002-2008 Mark Pilgrim
  3. # All rights reserved.
  4. #
  5. # This file is a part of feedparser.
  6. #
  7. # Redistribution and use in source and binary forms, with or without
  8. # modification, are permitted provided that the following conditions are met:
  9. #
  10. # * Redistributions of source code must retain the above copyright notice,
  11. # this list of conditions and the following disclaimer.
  12. # * Redistributions in binary form must reproduce the above copyright notice,
  13. # this list of conditions and the following disclaimer in the documentation
  14. # and/or other materials provided with the distribution.
  15. #
  16. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
  17. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26. # POSSIBILITY OF SUCH DAMAGE.
  27. import re
  28. from .html import _BaseHTMLProcessor
  29. from .urls import make_safe_absolute_uri
  30. class _HTMLSanitizer(_BaseHTMLProcessor):
  31. acceptable_elements = {
  32. 'a',
  33. 'abbr',
  34. 'acronym',
  35. 'address',
  36. 'area',
  37. 'article',
  38. 'aside',
  39. 'audio',
  40. 'b',
  41. 'big',
  42. 'blockquote',
  43. 'br',
  44. 'button',
  45. 'canvas',
  46. 'caption',
  47. 'center',
  48. 'cite',
  49. 'code',
  50. 'col',
  51. 'colgroup',
  52. 'command',
  53. 'datagrid',
  54. 'datalist',
  55. 'dd',
  56. 'del',
  57. 'details',
  58. 'dfn',
  59. 'dialog',
  60. 'dir',
  61. 'div',
  62. 'dl',
  63. 'dt',
  64. 'em',
  65. 'event-source',
  66. 'fieldset',
  67. 'figcaption',
  68. 'figure',
  69. 'font',
  70. 'footer',
  71. 'form',
  72. 'h1',
  73. 'h2',
  74. 'h3',
  75. 'h4',
  76. 'h5',
  77. 'h6',
  78. 'header',
  79. 'hr',
  80. 'i',
  81. 'img',
  82. 'input',
  83. 'ins',
  84. 'kbd',
  85. 'keygen',
  86. 'label',
  87. 'legend',
  88. 'li',
  89. 'm',
  90. 'map',
  91. 'menu',
  92. 'meter',
  93. 'multicol',
  94. 'nav',
  95. 'nextid',
  96. 'noscript',
  97. 'ol',
  98. 'optgroup',
  99. 'option',
  100. 'output',
  101. 'p',
  102. 'pre',
  103. 'progress',
  104. 'q',
  105. 's',
  106. 'samp',
  107. 'section',
  108. 'select',
  109. 'small',
  110. 'sound',
  111. 'source',
  112. 'spacer',
  113. 'span',
  114. 'strike',
  115. 'strong',
  116. 'sub',
  117. 'sup',
  118. 'table',
  119. 'tbody',
  120. 'td',
  121. 'textarea',
  122. 'tfoot',
  123. 'th',
  124. 'thead',
  125. 'time',
  126. 'tr',
  127. 'tt',
  128. 'u',
  129. 'ul',
  130. 'var',
  131. 'video',
  132. }
  133. acceptable_attributes = {
  134. 'abbr',
  135. 'accept',
  136. 'accept-charset',
  137. 'accesskey',
  138. 'action',
  139. 'align',
  140. 'alt',
  141. 'autocomplete',
  142. 'autofocus',
  143. 'axis',
  144. 'background',
  145. 'balance',
  146. 'bgcolor',
  147. 'bgproperties',
  148. 'border',
  149. 'bordercolor',
  150. 'bordercolordark',
  151. 'bordercolorlight',
  152. 'bottompadding',
  153. 'cellpadding',
  154. 'cellspacing',
  155. 'ch',
  156. 'challenge',
  157. 'char',
  158. 'charoff',
  159. 'charset',
  160. 'checked',
  161. 'choff',
  162. 'cite',
  163. 'class',
  164. 'clear',
  165. 'color',
  166. 'cols',
  167. 'colspan',
  168. 'compact',
  169. 'contenteditable',
  170. 'controls',
  171. 'coords',
  172. 'data',
  173. 'datafld',
  174. 'datapagesize',
  175. 'datasrc',
  176. 'datetime',
  177. 'default',
  178. 'delay',
  179. 'dir',
  180. 'disabled',
  181. 'draggable',
  182. 'dynsrc',
  183. 'enctype',
  184. 'end',
  185. 'face',
  186. 'for',
  187. 'form',
  188. 'frame',
  189. 'galleryimg',
  190. 'gutter',
  191. 'headers',
  192. 'height',
  193. 'hidden',
  194. 'hidefocus',
  195. 'high',
  196. 'href',
  197. 'hreflang',
  198. 'hspace',
  199. 'icon',
  200. 'id',
  201. 'inputmode',
  202. 'ismap',
  203. 'keytype',
  204. 'label',
  205. 'lang',
  206. 'leftspacing',
  207. 'list',
  208. 'longdesc',
  209. 'loop',
  210. 'loopcount',
  211. 'loopend',
  212. 'loopstart',
  213. 'low',
  214. 'lowsrc',
  215. 'max',
  216. 'maxlength',
  217. 'media',
  218. 'method',
  219. 'min',
  220. 'multiple',
  221. 'name',
  222. 'nohref',
  223. 'noshade',
  224. 'nowrap',
  225. 'open',
  226. 'optimum',
  227. 'pattern',
  228. 'ping',
  229. 'point-size',
  230. 'poster',
  231. 'pqg',
  232. 'preload',
  233. 'prompt',
  234. 'radiogroup',
  235. 'readonly',
  236. 'rel',
  237. 'repeat-max',
  238. 'repeat-min',
  239. 'replace',
  240. 'required',
  241. 'rev',
  242. 'rightspacing',
  243. 'rows',
  244. 'rowspan',
  245. 'rules',
  246. 'scope',
  247. 'selected',
  248. 'shape',
  249. 'size',
  250. 'span',
  251. 'src',
  252. 'start',
  253. 'step',
  254. 'style',
  255. 'summary',
  256. 'suppress',
  257. 'tabindex',
  258. 'target',
  259. 'template',
  260. 'title',
  261. 'toppadding',
  262. 'type',
  263. 'unselectable',
  264. 'urn',
  265. 'usemap',
  266. 'valign',
  267. 'value',
  268. 'variable',
  269. 'volume',
  270. 'vrml',
  271. 'vspace',
  272. 'width',
  273. 'wrap',
  274. 'xml:lang',
  275. }
  276. unacceptable_elements_with_end_tag = {
  277. 'applet',
  278. 'script',
  279. 'style',
  280. }
  281. acceptable_css_properties = {
  282. 'azimuth',
  283. 'background-color',
  284. 'border-bottom-color',
  285. 'border-collapse',
  286. 'border-color',
  287. 'border-left-color',
  288. 'border-right-color',
  289. 'border-top-color',
  290. 'clear',
  291. 'color',
  292. 'cursor',
  293. 'direction',
  294. 'display',
  295. 'elevation',
  296. 'float',
  297. 'font',
  298. 'font-family',
  299. 'font-size',
  300. 'font-style',
  301. 'font-variant',
  302. 'font-weight',
  303. 'height',
  304. 'letter-spacing',
  305. 'line-height',
  306. 'overflow',
  307. 'pause',
  308. 'pause-after',
  309. 'pause-before',
  310. 'pitch',
  311. 'pitch-range',
  312. 'richness',
  313. 'speak',
  314. 'speak-header',
  315. 'speak-numeral',
  316. 'speak-punctuation',
  317. 'speech-rate',
  318. 'stress',
  319. 'text-align',
  320. 'text-decoration',
  321. 'text-indent',
  322. 'unicode-bidi',
  323. 'vertical-align',
  324. 'voice-family',
  325. 'volume',
  326. 'white-space',
  327. 'width',
  328. }
  329. # survey of common keywords found in feeds
  330. acceptable_css_keywords = {
  331. '!important',
  332. 'aqua',
  333. 'auto',
  334. 'black',
  335. 'block',
  336. 'blue',
  337. 'bold',
  338. 'both',
  339. 'bottom',
  340. 'brown',
  341. 'center',
  342. 'collapse',
  343. 'dashed',
  344. 'dotted',
  345. 'fuchsia',
  346. 'gray',
  347. 'green',
  348. 'italic',
  349. 'left',
  350. 'lime',
  351. 'maroon',
  352. 'medium',
  353. 'navy',
  354. 'none',
  355. 'normal',
  356. 'nowrap',
  357. 'olive',
  358. 'pointer',
  359. 'purple',
  360. 'red',
  361. 'right',
  362. 'silver',
  363. 'solid',
  364. 'teal',
  365. 'top',
  366. 'transparent',
  367. 'underline',
  368. 'white',
  369. 'yellow',
  370. }
  371. valid_css_values = re.compile(
  372. r'^('
  373. r'#[0-9a-f]+' # Hex values
  374. r'|rgb\(\d+%?,\d*%?,?\d*%?\)?' # RGB values
  375. r'|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?' # Sizes/widths
  376. r')$'
  377. )
  378. mathml_elements = {
  379. 'annotation',
  380. 'annotation-xml',
  381. 'maction',
  382. 'maligngroup',
  383. 'malignmark',
  384. 'math',
  385. 'menclose',
  386. 'merror',
  387. 'mfenced',
  388. 'mfrac',
  389. 'mglyph',
  390. 'mi',
  391. 'mlabeledtr',
  392. 'mlongdiv',
  393. 'mmultiscripts',
  394. 'mn',
  395. 'mo',
  396. 'mover',
  397. 'mpadded',
  398. 'mphantom',
  399. 'mprescripts',
  400. 'mroot',
  401. 'mrow',
  402. 'ms',
  403. 'mscarries',
  404. 'mscarry',
  405. 'msgroup',
  406. 'msline',
  407. 'mspace',
  408. 'msqrt',
  409. 'msrow',
  410. 'mstack',
  411. 'mstyle',
  412. 'msub',
  413. 'msubsup',
  414. 'msup',
  415. 'mtable',
  416. 'mtd',
  417. 'mtext',
  418. 'mtr',
  419. 'munder',
  420. 'munderover',
  421. 'none',
  422. 'semantics',
  423. }
  424. mathml_attributes = {
  425. 'accent',
  426. 'accentunder',
  427. 'actiontype',
  428. 'align',
  429. 'alignmentscope',
  430. 'altimg',
  431. 'altimg-height',
  432. 'altimg-valign',
  433. 'altimg-width',
  434. 'alttext',
  435. 'bevelled',
  436. 'charalign',
  437. 'close',
  438. 'columnalign',
  439. 'columnlines',
  440. 'columnspacing',
  441. 'columnspan',
  442. 'columnwidth',
  443. 'crossout',
  444. 'decimalpoint',
  445. 'denomalign',
  446. 'depth',
  447. 'dir',
  448. 'display',
  449. 'displaystyle',
  450. 'edge',
  451. 'encoding',
  452. 'equalcolumns',
  453. 'equalrows',
  454. 'fence',
  455. 'fontstyle',
  456. 'fontweight',
  457. 'form',
  458. 'frame',
  459. 'framespacing',
  460. 'groupalign',
  461. 'height',
  462. 'href',
  463. 'id',
  464. 'indentalign',
  465. 'indentalignfirst',
  466. 'indentalignlast',
  467. 'indentshift',
  468. 'indentshiftfirst',
  469. 'indentshiftlast',
  470. 'indenttarget',
  471. 'infixlinebreakstyle',
  472. 'largeop',
  473. 'length',
  474. 'linebreak',
  475. 'linebreakmultchar',
  476. 'linebreakstyle',
  477. 'lineleading',
  478. 'linethickness',
  479. 'location',
  480. 'longdivstyle',
  481. 'lquote',
  482. 'lspace',
  483. 'mathbackground',
  484. 'mathcolor',
  485. 'mathsize',
  486. 'mathvariant',
  487. 'maxsize',
  488. 'minlabelspacing',
  489. 'minsize',
  490. 'movablelimits',
  491. 'notation',
  492. 'numalign',
  493. 'open',
  494. 'other',
  495. 'overflow',
  496. 'position',
  497. 'rowalign',
  498. 'rowlines',
  499. 'rowspacing',
  500. 'rowspan',
  501. 'rquote',
  502. 'rspace',
  503. 'scriptlevel',
  504. 'scriptminsize',
  505. 'scriptsizemultiplier',
  506. 'selection',
  507. 'separator',
  508. 'separators',
  509. 'shift',
  510. 'side',
  511. 'src',
  512. 'stackalign',
  513. 'stretchy',
  514. 'subscriptshift',
  515. 'superscriptshift',
  516. 'symmetric',
  517. 'voffset',
  518. 'width',
  519. 'xlink:href',
  520. 'xlink:show',
  521. 'xlink:type',
  522. 'xmlns',
  523. 'xmlns:xlink',
  524. }
  525. # svgtiny - foreignObject + linearGradient + radialGradient + stop
  526. svg_elements = {
  527. 'a',
  528. 'animate',
  529. 'animateColor',
  530. 'animateMotion',
  531. 'animateTransform',
  532. 'circle',
  533. 'defs',
  534. 'desc',
  535. 'ellipse',
  536. 'font-face',
  537. 'font-face-name',
  538. 'font-face-src',
  539. 'foreignObject',
  540. 'g',
  541. 'glyph',
  542. 'hkern',
  543. 'line',
  544. 'linearGradient',
  545. 'marker',
  546. 'metadata',
  547. 'missing-glyph',
  548. 'mpath',
  549. 'path',
  550. 'polygon',
  551. 'polyline',
  552. 'radialGradient',
  553. 'rect',
  554. 'set',
  555. 'stop',
  556. 'svg',
  557. 'switch',
  558. 'text',
  559. 'title',
  560. 'tspan',
  561. 'use',
  562. }
  563. # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
  564. svg_attributes = {
  565. 'accent-height',
  566. 'accumulate',
  567. 'additive',
  568. 'alphabetic',
  569. 'arabic-form',
  570. 'ascent',
  571. 'attributeName',
  572. 'attributeType',
  573. 'baseProfile',
  574. 'bbox',
  575. 'begin',
  576. 'by',
  577. 'calcMode',
  578. 'cap-height',
  579. 'class',
  580. 'color',
  581. 'color-rendering',
  582. 'content',
  583. 'cx',
  584. 'cy',
  585. 'd',
  586. 'descent',
  587. 'display',
  588. 'dur',
  589. 'dx',
  590. 'dy',
  591. 'end',
  592. 'fill',
  593. 'fill-opacity',
  594. 'fill-rule',
  595. 'font-family',
  596. 'font-size',
  597. 'font-stretch',
  598. 'font-style',
  599. 'font-variant',
  600. 'font-weight',
  601. 'from',
  602. 'fx',
  603. 'fy',
  604. 'g1',
  605. 'g2',
  606. 'glyph-name',
  607. 'gradientUnits',
  608. 'hanging',
  609. 'height',
  610. 'horiz-adv-x',
  611. 'horiz-origin-x',
  612. 'id',
  613. 'ideographic',
  614. 'k',
  615. 'keyPoints',
  616. 'keySplines',
  617. 'keyTimes',
  618. 'lang',
  619. 'marker-end',
  620. 'marker-mid',
  621. 'marker-start',
  622. 'markerHeight',
  623. 'markerUnits',
  624. 'markerWidth',
  625. 'mathematical',
  626. 'max',
  627. 'min',
  628. 'name',
  629. 'offset',
  630. 'opacity',
  631. 'orient',
  632. 'origin',
  633. 'overline-position',
  634. 'overline-thickness',
  635. 'panose-1',
  636. 'path',
  637. 'pathLength',
  638. 'points',
  639. 'preserveAspectRatio',
  640. 'r',
  641. 'refX',
  642. 'refY',
  643. 'repeatCount',
  644. 'repeatDur',
  645. 'requiredExtensions',
  646. 'requiredFeatures',
  647. 'restart',
  648. 'rotate',
  649. 'rx',
  650. 'ry',
  651. 'slope',
  652. 'stemh',
  653. 'stemv',
  654. 'stop-color',
  655. 'stop-opacity',
  656. 'strikethrough-position',
  657. 'strikethrough-thickness',
  658. 'stroke',
  659. 'stroke-dasharray',
  660. 'stroke-dashoffset',
  661. 'stroke-linecap',
  662. 'stroke-linejoin',
  663. 'stroke-miterlimit',
  664. 'stroke-opacity',
  665. 'stroke-width',
  666. 'systemLanguage',
  667. 'target',
  668. 'text-anchor',
  669. 'to',
  670. 'transform',
  671. 'type',
  672. 'u1',
  673. 'u2',
  674. 'underline-position',
  675. 'underline-thickness',
  676. 'unicode',
  677. 'unicode-range',
  678. 'units-per-em',
  679. 'values',
  680. 'version',
  681. 'viewBox',
  682. 'visibility',
  683. 'width',
  684. 'widths',
  685. 'x',
  686. 'x-height',
  687. 'x1',
  688. 'x2',
  689. 'xlink:actuate',
  690. 'xlink:arcrole',
  691. 'xlink:href',
  692. 'xlink:role',
  693. 'xlink:show',
  694. 'xlink:title',
  695. 'xlink:type',
  696. 'xml:base',
  697. 'xml:lang',
  698. 'xml:space',
  699. 'xmlns',
  700. 'xmlns:xlink',
  701. 'y',
  702. 'y1',
  703. 'y2',
  704. 'zoomAndPan',
  705. }
  706. svg_attr_map = None
  707. svg_elem_map = None
  708. acceptable_svg_properties = {
  709. 'fill',
  710. 'fill-opacity',
  711. 'fill-rule',
  712. 'stroke',
  713. 'stroke-linecap',
  714. 'stroke-linejoin',
  715. 'stroke-opacity',
  716. 'stroke-width',
  717. }
  718. def __init__(self, encoding=None, _type='application/xhtml+xml'):
  719. super(_HTMLSanitizer, self).__init__(encoding, _type)
  720. self.unacceptablestack = 0
  721. self.mathmlOK = 0
  722. self.svgOK = 0
  723. def reset(self):
  724. super(_HTMLSanitizer, self).reset()
  725. self.unacceptablestack = 0
  726. self.mathmlOK = 0
  727. self.svgOK = 0
  728. def unknown_starttag(self, tag, attrs):
  729. acceptable_attributes = self.acceptable_attributes
  730. keymap = {}
  731. if tag not in self.acceptable_elements or self.svgOK:
  732. if tag in self.unacceptable_elements_with_end_tag:
  733. self.unacceptablestack += 1
  734. # add implicit namespaces to html5 inline svg/mathml
  735. if self._type.endswith('html'):
  736. if not dict(attrs).get('xmlns'):
  737. if tag == 'svg':
  738. attrs.append(('xmlns', 'http://www.w3.org/2000/svg'))
  739. if tag == 'math':
  740. attrs.append(('xmlns', 'http://www.w3.org/1998/Math/MathML'))
  741. # not otherwise acceptable, perhaps it is MathML or SVG?
  742. if tag == 'math' and ('xmlns', 'http://www.w3.org/1998/Math/MathML') in attrs:
  743. self.mathmlOK += 1
  744. if tag == 'svg' and ('xmlns', 'http://www.w3.org/2000/svg') in attrs:
  745. self.svgOK += 1
  746. # chose acceptable attributes based on tag class, else bail
  747. if self.mathmlOK and tag in self.mathml_elements:
  748. acceptable_attributes = self.mathml_attributes
  749. elif self.svgOK and tag in self.svg_elements:
  750. # For most vocabularies, lowercasing is a good idea. Many
  751. # svg elements, however, are camel case.
  752. if not self.svg_attr_map:
  753. lower = [attr.lower() for attr in self.svg_attributes]
  754. mix = [a for a in self.svg_attributes if a not in lower]
  755. self.svg_attributes = lower
  756. self.svg_attr_map = {a.lower(): a for a in mix}
  757. lower = [attr.lower() for attr in self.svg_elements]
  758. mix = [a for a in self.svg_elements if a not in lower]
  759. self.svg_elements = lower
  760. self.svg_elem_map = {a.lower(): a for a in mix}
  761. acceptable_attributes = self.svg_attributes
  762. tag = self.svg_elem_map.get(tag, tag)
  763. keymap = self.svg_attr_map
  764. elif tag not in self.acceptable_elements:
  765. return
  766. # declare xlink namespace, if needed
  767. if self.mathmlOK or self.svgOK:
  768. if any((a for a in attrs if a[0].startswith('xlink:'))):
  769. if not ('xmlns:xlink', 'http://www.w3.org/1999/xlink') in attrs:
  770. attrs.append(('xmlns:xlink', 'http://www.w3.org/1999/xlink'))
  771. clean_attrs = []
  772. for key, value in self.normalize_attrs(attrs):
  773. if key == 'style' and 'style' in acceptable_attributes:
  774. clean_value = self.sanitize_style(value)
  775. if clean_value:
  776. clean_attrs.append((key, clean_value))
  777. elif key in acceptable_attributes:
  778. key = keymap.get(key, key)
  779. # make sure the uri uses an acceptable uri scheme
  780. if key == 'href':
  781. value = make_safe_absolute_uri(value)
  782. clean_attrs.append((key, value))
  783. super(_HTMLSanitizer, self).unknown_starttag(tag, clean_attrs)
  784. def unknown_endtag(self, tag):
  785. if tag not in self.acceptable_elements:
  786. if tag in self.unacceptable_elements_with_end_tag:
  787. self.unacceptablestack -= 1
  788. if self.mathmlOK and tag in self.mathml_elements:
  789. if tag == 'math' and self.mathmlOK:
  790. self.mathmlOK -= 1
  791. elif self.svgOK and tag in self.svg_elements:
  792. tag = self.svg_elem_map.get(tag, tag)
  793. if tag == 'svg' and self.svgOK:
  794. self.svgOK -= 1
  795. else:
  796. return
  797. super(_HTMLSanitizer, self).unknown_endtag(tag)
  798. def handle_pi(self, text):
  799. pass
  800. def handle_decl(self, text):
  801. pass
  802. def handle_data(self, text):
  803. if not self.unacceptablestack:
  804. super(_HTMLSanitizer, self).handle_data(text)
  805. def sanitize_style(self, style):
  806. # disallow urls
  807. style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
  808. # gauntlet
  809. if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
  810. return ''
  811. # This replaced a regexp that used re.match and was prone to
  812. # pathological back-tracking.
  813. if re.sub(r"\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
  814. return ''
  815. clean = []
  816. for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
  817. if not value:
  818. continue
  819. if prop.lower() in self.acceptable_css_properties:
  820. clean.append(prop + ': ' + value + ';')
  821. elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']:
  822. for keyword in value.split():
  823. if (
  824. keyword not in self.acceptable_css_keywords
  825. and not self.valid_css_values.match(keyword)
  826. ):
  827. break
  828. else:
  829. clean.append(prop + ': ' + value + ';')
  830. elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
  831. clean.append(prop + ': ' + value + ';')
  832. return ' '.join(clean)
  833. def parse_comment(self, i, report=1):
  834. ret = super(_HTMLSanitizer, self).parse_comment(i, report)
  835. if ret >= 0:
  836. return ret
  837. # if ret == -1, this may be a malicious attempt to circumvent
  838. # sanitization, or a page-destroying unclosed comment
  839. match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
  840. if match:
  841. return match.end()
  842. # unclosed comment; deliberately fail to handle_data()
  843. return len(self.rawdata)
  844. def _sanitize_html(html_source, encoding, _type):
  845. p = _HTMLSanitizer(encoding, _type)
  846. html_source = html_source.replace('<![CDATA[', '&lt;![CDATA[')
  847. p.feed(html_source)
  848. data = p.output()
  849. data = data.strip().replace('\r\n', '\n')
  850. return data
  851. # Match XML entity declarations.
  852. # Example: <!ENTITY copyright "(C)">
  853. RE_ENTITY_PATTERN = re.compile(br'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)
  854. # Match XML DOCTYPE declarations.
  855. # Example: <!DOCTYPE feed [ ]>
  856. RE_DOCTYPE_PATTERN = re.compile(br'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE)
  857. # Match safe entity declarations.
  858. # This will allow hexadecimal character references through,
  859. # as well as text, but not arbitrary nested entities.
  860. # Example: cubed "&#179;"
  861. # Example: copyright "(C)"
  862. # Forbidden: explode1 "&explode2;&explode2;"
  863. RE_SAFE_ENTITY_PATTERN = re.compile(br'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')
  864. def replace_doctype(data):
  865. """Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
  866. rss_version may be 'rss091n' or None
  867. stripped_data is the same XML document with a replaced DOCTYPE
  868. """
  869. # Divide the document into two groups by finding the location
  870. # of the first element that doesn't begin with '<?' or '<!'.
  871. start = re.search(br'<\w', data)
  872. start = start and start.start() or -1
  873. head, data = data[:start+1], data[start+1:]
  874. # Save and then remove all of the ENTITY declarations.
  875. entity_results = RE_ENTITY_PATTERN.findall(head)
  876. head = RE_ENTITY_PATTERN.sub(b'', head)
  877. # Find the DOCTYPE declaration and check the feed type.
  878. doctype_results = RE_DOCTYPE_PATTERN.findall(head)
  879. doctype = doctype_results and doctype_results[0] or b''
  880. if b'netscape' in doctype.lower():
  881. version = 'rss091n'
  882. else:
  883. version = None
  884. # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
  885. replacement = b''
  886. if len(doctype_results) == 1 and entity_results:
  887. safe_entities = [
  888. e
  889. for e in entity_results
  890. if RE_SAFE_ENTITY_PATTERN.match(e)
  891. ]
  892. if safe_entities:
  893. replacement = b'<!DOCTYPE feed [\n<!ENTITY' \
  894. + b'>\n<!ENTITY '.join(safe_entities) \
  895. + b'>\n]>'
  896. data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
  897. # Precompute the safe entities for the loose parser.
  898. safe_entities = {
  899. k.decode('utf-8'): v.decode('utf-8')
  900. for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)
  901. }
  902. return version, data, safe_entities