| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950 |
- # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
- # Copyright 2002-2008 Mark Pilgrim
- # All rights reserved.
- #
- # This file is a part of feedparser.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # * Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright notice,
- # this list of conditions and the following disclaimer in the documentation
- # and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
- # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- # POSSIBILITY OF SUCH DAMAGE.
- import re
- from .html import _BaseHTMLProcessor
- from .urls import make_safe_absolute_uri
class _HTMLSanitizer(_BaseHTMLProcessor):
    """HTML processor that only lets allow-listed markup through.

    Start and end tags are forwarded to the base processor only when the
    element is in ``acceptable_elements`` or, inside a namespaced ``<math>``
    or ``<svg>`` element, in the MathML / SVG element tables.  Attributes
    are filtered against the matching attribute table, ``style`` values
    are filtered by :meth:`sanitize_style`, and text that appears inside
    ``unacceptable_elements_with_end_tag`` elements is dropped.
    """

    acceptable_elements = {
        'a',
        'abbr',
        'acronym',
        'address',
        'area',
        'article',
        'aside',
        'audio',
        'b',
        'big',
        'blockquote',
        'br',
        'button',
        'canvas',
        'caption',
        'center',
        'cite',
        'code',
        'col',
        'colgroup',
        'command',
        'datagrid',
        'datalist',
        'dd',
        'del',
        'details',
        'dfn',
        'dialog',
        'dir',
        'div',
        'dl',
        'dt',
        'em',
        'event-source',
        'fieldset',
        'figcaption',
        'figure',
        'font',
        'footer',
        'form',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'header',
        'hr',
        'i',
        'img',
        'input',
        'ins',
        'kbd',
        'keygen',
        'label',
        'legend',
        'li',
        'm',
        'map',
        'menu',
        'meter',
        'multicol',
        'nav',
        'nextid',
        'noscript',
        'ol',
        'optgroup',
        'option',
        'output',
        'p',
        'pre',
        'progress',
        'q',
        's',
        'samp',
        'section',
        'select',
        'small',
        'sound',
        'source',
        'spacer',
        'span',
        'strike',
        'strong',
        'sub',
        'sup',
        'table',
        'tbody',
        'td',
        'textarea',
        'tfoot',
        'th',
        'thead',
        'time',
        'tr',
        'tt',
        'u',
        'ul',
        'var',
        'video',
    }

    acceptable_attributes = {
        'abbr',
        'accept',
        'accept-charset',
        'accesskey',
        'action',
        'align',
        'alt',
        'autocomplete',
        'autofocus',
        'axis',
        'background',
        'balance',
        'bgcolor',
        'bgproperties',
        'border',
        'bordercolor',
        'bordercolordark',
        'bordercolorlight',
        'bottompadding',
        'cellpadding',
        'cellspacing',
        'ch',
        'challenge',
        'char',
        'charoff',
        'charset',
        'checked',
        'choff',
        'cite',
        'class',
        'clear',
        'color',
        'cols',
        'colspan',
        'compact',
        'contenteditable',
        'controls',
        'coords',
        'data',
        'datafld',
        'datapagesize',
        'datasrc',
        'datetime',
        'default',
        'delay',
        'dir',
        'disabled',
        'draggable',
        'dynsrc',
        'enctype',
        'end',
        'face',
        'for',
        'form',
        'frame',
        'galleryimg',
        'gutter',
        'headers',
        'height',
        'hidden',
        'hidefocus',
        'high',
        'href',
        'hreflang',
        'hspace',
        'icon',
        'id',
        'inputmode',
        'ismap',
        'keytype',
        'label',
        'lang',
        'leftspacing',
        'list',
        'longdesc',
        'loop',
        'loopcount',
        'loopend',
        'loopstart',
        'low',
        'lowsrc',
        'max',
        'maxlength',
        'media',
        'method',
        'min',
        'multiple',
        'name',
        'nohref',
        'noshade',
        'nowrap',
        'open',
        'optimum',
        'pattern',
        'ping',
        'point-size',
        'poster',
        'pqg',
        'preload',
        'prompt',
        'radiogroup',
        'readonly',
        'rel',
        'repeat-max',
        'repeat-min',
        'replace',
        'required',
        'rev',
        'rightspacing',
        'rows',
        'rowspan',
        'rules',
        'scope',
        'selected',
        'shape',
        'size',
        'span',
        'src',
        'start',
        'step',
        'style',
        'summary',
        'suppress',
        'tabindex',
        'target',
        'template',
        'title',
        'toppadding',
        'type',
        'unselectable',
        'urn',
        'usemap',
        'valign',
        'value',
        'variable',
        'volume',
        'vrml',
        'vspace',
        'width',
        'wrap',
        'xml:lang',
    }

    # Elements whose text content is discarded along with the tags.
    unacceptable_elements_with_end_tag = {
        'applet',
        'script',
        'style',
    }

    acceptable_css_properties = {
        'azimuth',
        'background-color',
        'border-bottom-color',
        'border-collapse',
        'border-color',
        'border-left-color',
        'border-right-color',
        'border-top-color',
        'clear',
        'color',
        'cursor',
        'direction',
        'display',
        'elevation',
        'float',
        'font',
        'font-family',
        'font-size',
        'font-style',
        'font-variant',
        'font-weight',
        'height',
        'letter-spacing',
        'line-height',
        'overflow',
        'pause',
        'pause-after',
        'pause-before',
        'pitch',
        'pitch-range',
        'richness',
        'speak',
        'speak-header',
        'speak-numeral',
        'speak-punctuation',
        'speech-rate',
        'stress',
        'text-align',
        'text-decoration',
        'text-indent',
        'unicode-bidi',
        'vertical-align',
        'voice-family',
        'volume',
        'white-space',
        'width',
    }

    # survey of common keywords found in feeds
    acceptable_css_keywords = {
        '!important',
        'aqua',
        'auto',
        'black',
        'block',
        'blue',
        'bold',
        'both',
        'bottom',
        'brown',
        'center',
        'collapse',
        'dashed',
        'dotted',
        'fuchsia',
        'gray',
        'green',
        'italic',
        'left',
        'lime',
        'maroon',
        'medium',
        'navy',
        'none',
        'normal',
        'nowrap',
        'olive',
        'pointer',
        'purple',
        'red',
        'right',
        'silver',
        'solid',
        'teal',
        'top',
        'transparent',
        'underline',
        'white',
        'yellow',
    }

    valid_css_values = re.compile(
        r'^('
        r'#[0-9a-f]+'  # Hex values
        r'|rgb\(\d+%?,\d*%?,?\d*%?\)?'  # RGB values
        r'|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?'  # Sizes/widths
        r')$'
    )

    mathml_elements = {
        'annotation',
        'annotation-xml',
        'maction',
        'maligngroup',
        'malignmark',
        'math',
        'menclose',
        'merror',
        'mfenced',
        'mfrac',
        'mglyph',
        'mi',
        'mlabeledtr',
        'mlongdiv',
        'mmultiscripts',
        'mn',
        'mo',
        'mover',
        'mpadded',
        'mphantom',
        'mprescripts',
        'mroot',
        'mrow',
        'ms',
        'mscarries',
        'mscarry',
        'msgroup',
        'msline',
        'mspace',
        'msqrt',
        'msrow',
        'mstack',
        'mstyle',
        'msub',
        'msubsup',
        'msup',
        'mtable',
        'mtd',
        'mtext',
        'mtr',
        'munder',
        'munderover',
        'none',
        'semantics',
    }

    mathml_attributes = {
        'accent',
        'accentunder',
        'actiontype',
        'align',
        'alignmentscope',
        'altimg',
        'altimg-height',
        'altimg-valign',
        'altimg-width',
        'alttext',
        'bevelled',
        'charalign',
        'close',
        'columnalign',
        'columnlines',
        'columnspacing',
        'columnspan',
        'columnwidth',
        'crossout',
        'decimalpoint',
        'denomalign',
        'depth',
        'dir',
        'display',
        'displaystyle',
        'edge',
        'encoding',
        'equalcolumns',
        'equalrows',
        'fence',
        'fontstyle',
        'fontweight',
        'form',
        'frame',
        'framespacing',
        'groupalign',
        'height',
        'href',
        'id',
        'indentalign',
        'indentalignfirst',
        'indentalignlast',
        'indentshift',
        'indentshiftfirst',
        'indentshiftlast',
        'indenttarget',
        'infixlinebreakstyle',
        'largeop',
        'length',
        'linebreak',
        'linebreakmultchar',
        'linebreakstyle',
        'lineleading',
        'linethickness',
        'location',
        'longdivstyle',
        'lquote',
        'lspace',
        'mathbackground',
        'mathcolor',
        'mathsize',
        'mathvariant',
        'maxsize',
        'minlabelspacing',
        'minsize',
        'movablelimits',
        'notation',
        'numalign',
        'open',
        'other',
        'overflow',
        'position',
        'rowalign',
        'rowlines',
        'rowspacing',
        'rowspan',
        'rquote',
        'rspace',
        'scriptlevel',
        'scriptminsize',
        'scriptsizemultiplier',
        'selection',
        'separator',
        'separators',
        'shift',
        'side',
        'src',
        'stackalign',
        'stretchy',
        'subscriptshift',
        'superscriptshift',
        'symmetric',
        'voffset',
        'width',
        'xlink:href',
        'xlink:show',
        'xlink:type',
        'xmlns',
        'xmlns:xlink',
    }

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = {
        'a',
        'animate',
        'animateColor',
        'animateMotion',
        'animateTransform',
        'circle',
        'defs',
        'desc',
        'ellipse',
        'font-face',
        'font-face-name',
        'font-face-src',
        'foreignObject',
        'g',
        'glyph',
        'hkern',
        'line',
        'linearGradient',
        'marker',
        'metadata',
        'missing-glyph',
        'mpath',
        'path',
        'polygon',
        'polyline',
        'radialGradient',
        'rect',
        'set',
        'stop',
        'svg',
        'switch',
        'text',
        'title',
        'tspan',
        'use',
    }

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = {
        'accent-height',
        'accumulate',
        'additive',
        'alphabetic',
        'arabic-form',
        'ascent',
        'attributeName',
        'attributeType',
        'baseProfile',
        'bbox',
        'begin',
        'by',
        'calcMode',
        'cap-height',
        'class',
        'color',
        'color-rendering',
        'content',
        'cx',
        'cy',
        'd',
        'descent',
        'display',
        'dur',
        'dx',
        'dy',
        'end',
        'fill',
        'fill-opacity',
        'fill-rule',
        'font-family',
        'font-size',
        'font-stretch',
        'font-style',
        'font-variant',
        'font-weight',
        'from',
        'fx',
        'fy',
        'g1',
        'g2',
        'glyph-name',
        'gradientUnits',
        'hanging',
        'height',
        'horiz-adv-x',
        'horiz-origin-x',
        'id',
        'ideographic',
        'k',
        'keyPoints',
        'keySplines',
        'keyTimes',
        'lang',
        'marker-end',
        'marker-mid',
        'marker-start',
        'markerHeight',
        'markerUnits',
        'markerWidth',
        'mathematical',
        'max',
        'min',
        'name',
        'offset',
        'opacity',
        'orient',
        'origin',
        'overline-position',
        'overline-thickness',
        'panose-1',
        'path',
        'pathLength',
        'points',
        'preserveAspectRatio',
        'r',
        'refX',
        'refY',
        'repeatCount',
        'repeatDur',
        'requiredExtensions',
        'requiredFeatures',
        'restart',
        'rotate',
        'rx',
        'ry',
        'slope',
        'stemh',
        'stemv',
        'stop-color',
        'stop-opacity',
        'strikethrough-position',
        'strikethrough-thickness',
        'stroke',
        'stroke-dasharray',
        'stroke-dashoffset',
        'stroke-linecap',
        'stroke-linejoin',
        'stroke-miterlimit',
        'stroke-opacity',
        'stroke-width',
        'systemLanguage',
        'target',
        'text-anchor',
        'to',
        'transform',
        'type',
        'u1',
        'u2',
        'underline-position',
        'underline-thickness',
        'unicode',
        'unicode-range',
        'units-per-em',
        'values',
        'version',
        'viewBox',
        'visibility',
        'width',
        'widths',
        'x',
        'x-height',
        'x1',
        'x2',
        'xlink:actuate',
        'xlink:arcrole',
        'xlink:href',
        'xlink:role',
        'xlink:show',
        'xlink:title',
        'xlink:type',
        'xml:base',
        'xml:lang',
        'xml:space',
        'xmlns',
        'xmlns:xlink',
        'y',
        'y1',
        'y2',
        'zoomAndPan',
    }

    # Lowercase -> camelCase lookup tables, built lazily per instance the
    # first time an SVG element is processed.
    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = {
        'fill',
        'fill-opacity',
        'fill-rule',
        'stroke',
        'stroke-linecap',
        'stroke-linejoin',
        'stroke-opacity',
        'stroke-width',
    }

    # Patterns used by sanitize_style() and parse_comment(), compiled once.
    _sanitizer_css_url = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*')
    _sanitizer_css_gauntlet = re.compile(
        r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$"""
    )
    _sanitizer_css_declaration = re.compile(r"\s*[-\w]+\s*:\s*[^:;]*;?")
    _sanitizer_css_prop_value = re.compile(r"([-\w]+)\s*:\s*([^:;]*)")
    _sanitizer_comment_end = re.compile(r'--[^>]*>')

    # Shorthand property families whose values are checked keyword by keyword.
    _sanitizer_css_shorthand_families = ('background', 'border', 'margin', 'padding')

    def __init__(self, encoding=None, _type='application/xhtml+xml'):
        """Initialise the base processor and the sanitizer's nesting counters."""
        super().__init__(encoding, _type)
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def reset(self):
        """Reset the base processor and zero the nesting counters."""
        super().reset()
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def _build_svg_case_maps(self):
        """Replace the SVG tables on this instance with lowercase sets.

        Also fills ``svg_attr_map`` / ``svg_elem_map`` with the mapping from
        each lowercased name back to its camelCase spelling.  Does nothing
        when the maps have already been built.
        """
        if self.svg_attr_map:
            return
        lower_attrs = {name.lower() for name in self.svg_attributes}
        self.svg_attr_map = {
            name.lower(): name
            for name in self.svg_attributes
            if name not in lower_attrs
        }
        self.svg_attributes = lower_attrs

        lower_elems = {name.lower() for name in self.svg_elements}
        self.svg_elem_map = {
            name.lower(): name
            for name in self.svg_elements
            if name not in lower_elems
        }
        self.svg_elements = lower_elems

    def unknown_starttag(self, tag, attrs):
        """Forward an acceptable start tag with only its acceptable attributes.

        :param tag: element name as delivered by the parser
        :param attrs: sequence of ``(name, value)`` attribute pairs
        """
        # Work on a copy so the caller's attribute list is never mutated.
        attrs = list(attrs)
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if tag not in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith('html') and not dict(attrs).get('xmlns'):
                if tag == 'svg':
                    attrs.append(('xmlns', 'http://www.w3.org/2000/svg'))
                if tag == 'math':
                    attrs.append(('xmlns', 'http://www.w3.org/1998/Math/MathML'))

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if tag == 'math' and ('xmlns', 'http://www.w3.org/1998/Math/MathML') in attrs:
                self.mathmlOK += 1
            if tag == 'svg' and ('xmlns', 'http://www.w3.org/2000/svg') in attrs:
                self.svgOK += 1

            # choose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # For most vocabularies, lowercasing is a good idea. Many
                # svg elements, however, are camel case.
                self._build_svg_case_maps()
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif tag not in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            xlink_ns = ('xmlns:xlink', 'http://www.w3.org/1999/xlink')
            uses_xlink = any(attr[0].startswith('xlink:') for attr in attrs)
            if uses_xlink and xlink_ns not in attrs:
                attrs.append(xlink_ns)

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key == 'style' and 'style' in acceptable_attributes:
                clean_value = self.sanitize_style(value)
                if clean_value:
                    clean_attrs.append((key, clean_value))
            elif key in acceptable_attributes:
                # restore the camelCase spelling of SVG attributes
                key = keymap.get(key, key)
                # make sure the uri uses an acceptable uri scheme
                if key == 'href':
                    value = make_safe_absolute_uri(value)
                clean_attrs.append((key, value))
        super().unknown_starttag(tag, clean_attrs)

    def unknown_endtag(self, tag):
        """Forward an acceptable end tag and unwind the nesting counters.

        :param tag: element name as delivered by the parser
        """
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                # A stray end tag must not push the counter below zero:
                # handle_data() treats any non-zero value as "suppress
                # text", and a later start tag would bring a negative
                # counter back to zero while still inside that element.
                if self.unacceptablestack > 0:
                    self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == 'math':
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == 'svg':
                    self.svgOK -= 1
            else:
                return
        super().unknown_endtag(tag)

    def handle_pi(self, text):
        """Discard processing instructions."""
        pass

    def handle_decl(self, text):
        """Discard declarations."""
        pass

    def handle_data(self, text):
        """Forward text unless it sits inside an unacceptable element."""
        if not self.unacceptablestack:
            super().handle_data(text)

    def sanitize_style(self, style):
        """Return the acceptable declarations of a ``style`` attribute value.

        :param style: raw CSS text of the attribute
        :returns: the kept ``property: value;`` declarations joined by single
            spaces, or ``''`` when the text fails the character or
            structure checks
        """
        # disallow urls
        style = self._sanitizer_css_url.sub(' ', style)

        # gauntlet
        if not self._sanitizer_css_gauntlet.match(style):
            return ''
        # Anything left over after removing well-formed declarations means
        # the text is not a plain declaration list.  This replaced a regexp
        # that used re.match and was prone to pathological back-tracking.
        if self._sanitizer_css_declaration.sub('', style).strip():
            return ''

        clean = []
        for prop, value in self._sanitizer_css_prop_value.findall(style):
            if not value:
                continue
            declaration = prop + ': ' + value + ';'
            if prop.lower() in self.acceptable_css_properties:
                clean.append(declaration)
            elif prop.split('-')[0].lower() in self._sanitizer_css_shorthand_families:
                # keep the declaration only if every keyword is acceptable
                if all(
                    keyword in self.acceptable_css_keywords
                    or self.valid_css_values.match(keyword)
                    for keyword in value.split()
                ):
                    clean.append(declaration)
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(declaration)

        return ' '.join(clean)

    def parse_comment(self, i, report=1):
        """Parse a comment at offset *i*, tolerating malformed terminators.

        :returns: the offset just past the comment; for an unclosed comment
            the length of ``rawdata`` so the remainder is swallowed
        """
        ret = super().parse_comment(i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
        # sanitization, or a page-destroying unclosed comment
        match = self._sanitizer_comment_end.search(self.rawdata, i + 4)
        if match:
            return match.end()
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)
def _sanitize_html(html_source, encoding, _type):
    """Run *html_source* through ``_HTMLSanitizer`` and return its output.

    :param html_source: markup text to sanitize
    :param encoding: passed through to ``_HTMLSanitizer``
    :param _type: content type, passed through to ``_HTMLSanitizer``
    :returns: the sanitizer output, stripped of surrounding whitespace and
        with ``\\r\\n`` line endings converted to ``\\n``
    """
    sanitizer = _HTMLSanitizer(encoding, _type)
    # Escape the opening of CDATA sections so the marker reaches the parser
    # as text rather than as a marked-section declaration.
    html_source = html_source.replace('<![CDATA[', '&lt;![CDATA[')
    sanitizer.feed(html_source)
    data = sanitizer.output()
    return data.strip().replace('\r\n', '\n')
# Match XML entity declarations.
# Example: <!ENTITY copyright "(C)">
RE_ENTITY_PATTERN = re.compile(br'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)

# Match XML DOCTYPE declarations.
# Example: <!DOCTYPE feed [ ]>
RE_DOCTYPE_PATTERN = re.compile(br'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE)

# Match safe entity declarations.
# This will allow character references through,
# as well as text, but not arbitrary nested entities.
# Example: cubed "&#179;"
# Example: copyright "(C)"
# Forbidden: explode1 "&explode2;&explode2;"
RE_SAFE_ENTITY_PATTERN = re.compile(br'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')


def replace_doctype(data):
    """Strip and replace the DOCTYPE of an XML document.

    :param data: the document as ``bytes``
    :returns: a 3-tuple ``(rss_version, stripped_data, safe_entities)``

        * ``rss_version`` is ``'rss091n'`` when the DOCTYPE mentions
          "netscape" (case-insensitively), otherwise ``None``
        * ``stripped_data`` is the same document with its DOCTYPE removed,
          or replaced by one that declares only the safe entities
        * ``safe_entities`` maps the name of each retained entity to its
          value, both decoded as UTF-8
    """
    # Divide the document into two groups by finding the location
    # of the first element that doesn't begin with '<?' or '<!'.
    first_element = re.search(br'<\w', data)
    split_at = first_element.start() + 1 if first_element else 0
    head, data = data[:split_at], data[split_at:]

    # Save and then remove all of the ENTITY declarations.
    entity_results = RE_ENTITY_PATTERN.findall(head)
    head = RE_ENTITY_PATTERN.sub(b'', head)

    # Find the DOCTYPE declaration and check the feed type.
    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
    doctype = doctype_results[0] if doctype_results else b''
    version = 'rss091n' if b'netscape' in doctype.lower() else None

    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
    replacement = b''
    if len(doctype_results) == 1 and entity_results:
        safe_declarations = [
            entity
            for entity in entity_results
            if RE_SAFE_ENTITY_PATTERN.match(entity)
        ]
        if safe_declarations:
            replacement = (
                b'<!DOCTYPE feed [\n<!ENTITY'
                + b'>\n<!ENTITY '.join(safe_declarations)
                + b'>\n]>'
            )

    # Use a callable so the replacement is inserted literally: it contains
    # document-supplied text, and a backslash in it must not be interpreted
    # as a regex template escape or group reference.
    data = RE_DOCTYPE_PATTERN.sub(lambda _match: replacement, head) + data

    # Precompute the safe entities for the loose parser.
    safe_entities = {
        name.decode('utf-8'): value.decode('utf-8')
        for name, value in RE_SAFE_ENTITY_PATTERN.findall(replacement)
    }
    return version, data, safe_entities
|