grammar.js 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
  1. 'use strict';
  /**
   * Probes the runtime for the unicode features the grammar relies on in regular expressions.
   * If the runtime does not accept the `u` flag, or does not treat an astral character
   * as one code point, character classes without unicode handling are used instead.
   *
   * @param {typeof RegExp} [RegExpImpl=RegExp]
   * For testing: the RegExp class. Any value that is not a function is replaced by `RegExp`.
   * @returns {boolean}
   * `true` only if the probe expression matched both UTF-16 code units of U+1D306.
   * @see https://node.green/#ES2015-syntax-RegExp--y--and--u--flags
   */
  function detectUnicodeSupport(RegExpImpl) {
    try {
      var Impl = typeof RegExpImpl === 'function' ? RegExpImpl : RegExp;
      // eslint-disable-next-line es5/no-unicode-regex,es5/no-unicode-code-point-escape
      var found = new Impl('\u{1d306}', 'u').exec('𝌆');
      if (found && found[0].length === 2) {
        return true;
      }
    } catch (error) {
      // Deliberately ignored: a constructor or `exec` that throws means "not supported".
    }
    return false;
  }

  /**
   * Result of the feature detection for the current runtime, evaluated once at load time.
   * Decides which flags `reg` and `chars_without` pass to `RegExp`.
   *
   * @type {boolean}
   */
  var UNICODE_SUPPORT = detectUnicodeSupport();
  /**
   * Returns the content of a character class expression: everything between the leading `[`
   * and the last `]` of the source, which also drops any quantifier behind the class.
   *
   * @param {RegExp} regexp
   * An expression whose source starts with `[`.
   * @returns {string}
   * The source between the brackets, e.g. `a-z` for `/[a-z]+/`.
   * @throws {Error}
   * If the source does not start with `[`.
   */
  function chars(regexp) {
    var isCharacterClass = regexp.source[0] === '[';
    if (!isCharacterClass) {
      throw new Error(regexp + ' can not be used with chars');
    }
    var closingBracket = regexp.source.lastIndexOf(']');
    return regexp.source.slice(1, closingBracket);
  }
  /**
   * Creates a new character class expression
   * by removing the first occurrence of `search` from the source of `regexp`.
   *
   * The result only carries the `u` flag (when the runtime supports it),
   * flags of `regexp` are not copied.
   *
   * @param {RegExp} regexp
   * An expression whose source starts with `[`.
   * @param {string} search
   * The character(s) to remove. Has to be a non-empty string that is part of the source.
   * `-` can only be removed when it is the first character inside the brackets,
   * since a `-` at any other position is part of a range.
   * @returns {RegExp}
   * @throws {Error}
   * If `regexp` is not a character class, `search` is not a non-empty string,
   * `search` is not part of the source or `-` is not at the first position.
   */
  function chars_without(regexp, search) {
    var source = regexp.source;
    if (source[0] !== '[') {
      throw new Error('/' + source + '/ can not be used with chars_without');
    }
    if (!search || typeof search !== 'string') {
      throw new Error(JSON.stringify(search) + ' is not a valid search');
    }
    var position = source.indexOf(search);
    if (position === -1) {
      throw new Error('"' + search + '" is not in /' + source + '/');
    }
    // index 0 is the `[`, so a literal (non-range) `-` has to sit at index 1
    if (search === '-' && position !== 1) {
      throw new Error('"' + search + '" is not at the first position of /' + source + '/');
    }
    return new RegExp(source.replace(search, ''), UNICODE_SUPPORT ? 'u' : '');
  }
  /**
   * Combines strings and regular expressions into one regular expression
   * by concatenating the strings and the `source` of each expression.
   * The result has the flag `m`, plus `u` when the runtime supports it.
   *
   * A bare `|` argument is rejected when the function is called without a `this` value,
   * because an alternation that is not wrapped in a group would swallow its neighbours.
   * `regg` calls this function with itself as `this` to lift that restriction.
   *
   * @param {...(RegExp | string)[]} args
   * @returns {RegExp}
   * @throws {Error}
   * If called without a `this` value and one of the arguments is the string `|`.
   */
  function reg(args) {
    var calledWithoutThis = this === undefined;
    var sources = [];
    for (var index = 0; index < arguments.length; index++) {
      var part = arguments[index];
      if (typeof part !== 'string') {
        sources.push(part.source);
        continue;
      }
      if (calledWithoutThis && part === '|') {
        throw new Error('use regg instead of reg to wrap expressions with `|`!');
      }
      sources.push(part);
    }
    return new RegExp(sources.join(''), UNICODE_SUPPORT ? 'mu' : 'm');
  }
  /**
   * Like `reg`, but wraps the combined expression in `(?:` and `)`,
   * which makes it a non-capturing group, so `|` is allowed as an argument.
   *
   * @param {...(RegExp | string)[]} args
   * @returns {RegExp}
   * @throws {Error}
   * If no argument is provided.
   */
  function regg(args) {
    if (arguments.length === 0) {
      throw new Error('no parameters provided');
    }
    var parts = Array.prototype.slice.call(arguments);
    parts.unshift('(?:');
    parts.push(')');
    // passing `regg` as `this` tells `reg` that `|` is wrapped in a group
    return reg.apply(regg, parts);
  }
  93. // /**
  94. // * Append ^ to the beginning of the expression.
  95. // * @param {...(RegExp | string)[]} args
  96. // * @returns {RegExp}
  97. // */
  98. // function reg_start(args) {
  99. // if (arguments.length === 0) {
  100. // throw new Error('no parameters provided');
  101. // }
  102. // return reg.apply(reg_start, ['^'].concat(Array.prototype.slice.call(arguments)));
  103. // }
  // https://www.w3.org/TR/xml/#document
  // `[1] document ::= prolog element Misc*`
  // https://www.w3.org/TR/xml11/#NT-document
  // `[1] document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* )`

  /**
   * The unicode replacement character (U+FFFD),
   * a character usually appearing in wrongly converted strings.
   *
   * @type {string}
   * @see https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character
   * @see https://nodejs.dev/en/api/v18/buffer/#buffers-and-character-encodings
   * @see https://www.unicode.org/faq/utf_bom.html#BOM
   * @readonly
   */
  var UNICODE_REPLACEMENT_CHARACTER = '\uFFFD';

  // https://www.w3.org/TR/xml/#NT-Char
  // any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
  // `[2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]`
  // https://www.w3.org/TR/xml11/#NT-Char
  // `[2] Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]`
  // https://www.w3.org/TR/xml11/#NT-RestrictedChar
  // `[2a] RestrictedChar ::= [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]`
  // https://www.w3.org/TR/xml11/#charsets
  //
  // `-` (\x2D) is listed as the first character instead of being part of a range,
  // so that it can be removed again using `chars_without(Char, '-')`.
  var Char = /[-\x09\x0A\x0D\x20-\x2C\x2E-\uD7FF\uE000-\uFFFD]/; // without \u{10000}-\u{10FFFF}
  if (UNICODE_SUPPORT) {
    // the astral range can only be expressed when the `u` flag is available
    // eslint-disable-next-line es5/no-unicode-code-point-escape
    Char = reg('[', chars(Char), '\\u{10000}-\\u{10FFFF}', ']');
  }
  // the four white space characters of production [3], as a character class
  var _SChar = /[\x20\x09\x0D\x0A]/;
  // the same characters without the brackets, for embedding into other character classes
  var SChar_s = chars(_SChar);

  // https://www.w3.org/TR/xml11/#NT-S
  // `[3] S ::= (#x20 | #x9 | #xD | #xA)+`
  var S = reg(_SChar, '+');

  // Optional white space, described as `S?` in the grammar.
  // Simplified to 0-n occurrences of the character class
  // instead of 0-1 occurrences of a non-capturing group around S.
  var S_OPT = reg(_SChar, '*');
  // https://www.w3.org/TR/xml11/#NT-NameStartChar
  // `[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]`
  var NameStartChar =
    /[:_a-zA-Z\xC0-\xD6\xD8-\xF6\xF8-\u02FF\u0370-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/; // without \u{10000}-\u{EFFFF}
  if (UNICODE_SUPPORT) {
    // The production ends at #xEFFFF: unlike `Char`, which goes up to #x10FFFF,
    // the planes 15 and 16 (#xF0000-#x10FFFF) are not allowed in names.
    // eslint-disable-next-line es5/no-unicode-code-point-escape
    NameStartChar = reg('[', chars(NameStartChar), '\\u{10000}-\\u{EFFFF}', ']');
  }
  // the content of the character class, for embedding into `NameChar`
  var NameStartChar_s = chars(NameStartChar);

  // https://www.w3.org/TR/xml11/#NT-NameChar
  // `[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]`
  var NameChar = reg('[', NameStartChar_s, chars(/[-.0-9\xB7]/), chars(/[\u0300-\u036F\u203F-\u2040]/), ']');

  // https://www.w3.org/TR/xml11/#NT-Name
  // `[5] Name ::= NameStartChar (NameChar)*`
  var Name = reg(NameStartChar, NameChar, '*');

  // Not implemented:
  // https://www.w3.org/TR/xml11/#NT-Names
  // `[6] Names ::= Name (#x20 Name)*`

  // https://www.w3.org/TR/xml11/#NT-Nmtoken
  // `[7] Nmtoken ::= (NameChar)+`
  var Nmtoken = reg(NameChar, '+');

  // Not implemented:
  // https://www.w3.org/TR/xml11/#NT-Nmtokens
  // `[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*`
  // var Nmtokens = reg(Nmtoken, regg(/\x20/, Nmtoken), '*');
  // https://www.w3.org/TR/xml11/#NT-EntityRef
  // `[68] EntityRef ::= '&' Name ';'` [WFC: Entity Declared] [VC: Entity Declared] [WFC: Parsed Entity] [WFC: No Recursion]
  var EntityRef = reg('&', Name, ';');

  // https://www.w3.org/TR/xml11/#NT-CharRef
  // `[66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'` [WFC: Legal Character]
  var CharRef = regg(/&#[0-9]+;|&#x[0-9a-fA-F]+;/);

  // https://www.w3.org/TR/xml11/#NT-Reference
  // - `[67] Reference ::= EntityRef | CharRef`
  // - `[66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'` [WFC: Legal Character]
  // - `[68] EntityRef ::= '&' Name ';'` [WFC: Entity Declared] [VC: Entity Declared] [WFC: Parsed Entity] [WFC: No Recursion]
  var Reference = regg(EntityRef, '|', CharRef);

  // https://www.w3.org/TR/xml11/#NT-PEReference
  // `[69] PEReference ::= '%' Name ';'`
  // [VC: Entity Declared] [WFC: No Recursion] [WFC: In DTD]
  var PEReference = reg('%', Name, ';');

  // https://www.w3.org/TR/xml11/#NT-EntityValue
  // `[9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'"`
  // one alternative per quote character
  var EntityValue = regg(
    reg('"', regg(/[^%&"]/, '|', PEReference, '|', Reference), '*', '"'),
    '|',
    reg("'", regg(/[^%&']/, '|', PEReference, '|', Reference), '*', "'")
  );

  // https://www.w3.org/TR/xml11/#NT-AttValue
  // `[10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"`
  var AttValue = regg('"', regg(/[^<&"]/, '|', Reference), '*', '"', '|', "'", regg(/[^<&']/, '|', Reference), '*', "'");
  // https://www.w3.org/TR/xml-names/#ns-decl
  // https://www.w3.org/TR/xml-names/#ns-qualnames
  // NameStartChar without ":"
  var NCNameStartChar = chars_without(NameStartChar, ':');

  // https://www.w3.org/TR/xml-names/#orphans
  // `[5] NCNameChar ::= NameChar - ':'`
  // An XML NameChar, minus the ":"
  var NCNameChar = chars_without(NameChar, ':');

  // https://www.w3.org/TR/xml-names/#NT-NCName
  // `[4] NCName ::= Name - (Char* ':' Char*)`
  // An XML Name, minus the ":"
  var NCName = reg(NCNameStartChar, NCNameChar, '*');

  // https://www.w3.org/TR/xml-names/#ns-qualnames
  //
  // [7]  QName          ::= PrefixedName | UnprefixedName
  //                     === (NCName ':' NCName) | NCName
  //                     === NCName (':' NCName)?
  // [8]  PrefixedName   ::= Prefix ':' LocalPart
  //                     === NCName ':' NCName
  // [9]  UnprefixedName ::= LocalPart
  //                     === NCName
  // [10] Prefix         ::= NCName
  // [11] LocalPart      ::= NCName
  var QName = reg(NCName, regg(':', NCName), '?');
  // QName anchored with `^` and `$`
  var QName_exact = reg('^', QName, '$');
  // QName wrapped into a capturing group
  var QName_group = reg('(', QName, ')');
  // https://www.w3.org/TR/xml11/#NT-SystemLiteral
  // `[11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")`
  var SystemLiteral = regg(/"[^"]*"|'[^']*'/);

  // https://www.w3.org/TR/xml11/#NT-PI
  //
  // [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
  // [16] PI       ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
  //
  // The target /xml/i is not excluded!
  // The expression is anchored at the start of the input
  // and has one capturing group for the target and one for the content after the white space.
  var PI = reg(/^<\?/, '(', Name, ')', regg(S, '(', Char, '*?)'), '?', /\?>/);

  // https://www.w3.org/TR/xml11/#NT-PubidChar
  // `[13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]`
  var PubidChar = /[\x20\x0D\x0Aa-zA-Z0-9-'()+,./:=?;!*#@$_%]/;

  // https://www.w3.org/TR/xml11/#NT-PubidLiteral
  // `[12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"`
  var PubidLiteral = regg('"', PubidChar, '*"', '|', "'", chars_without(PubidChar, "'"), "*'");
  // Not implemented:
  // https://www.w3.org/TR/xml11/#NT-CharData
  // `[14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)`

  var COMMENT_START = '<!--';
  var COMMENT_END = '-->';

  // https://www.w3.org/TR/xml11/#NT-Comment
  // `[15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'`
  // Either a character that is not `-`, or a single `-` followed by such a character.
  var Comment = reg(COMMENT_START, regg(chars_without(Char, '-'), '|', reg('-', chars_without(Char, '-'))), '*', COMMENT_END);
  var PCDATA = '#PCDATA';

  // https://www.w3.org/TR/xml11/#NT-Mixed
  // `[51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')'`
  // https://www.w3.org/TR/xml-names/#NT-Mixed
  // `[51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? QName)* S? ')*' | '(' S? '#PCDATA' S? ')'`
  // [VC: Proper Group/PE Nesting] [VC: No Duplicate Types]
  var Mixed = regg(
    reg(/\(/, S_OPT, PCDATA, regg(S_OPT, /\|/, S_OPT, QName), '*', S_OPT, /\)\*/),
    '|',
    reg(/\(/, S_OPT, PCDATA, S_OPT, /\)/)
  );

  // the optional quantifier that can follow a content particle
  var _children_quantity = /[?*+]?/;

  // `[49] choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'` [VC: Proper Group/PE Nesting]
  // `[50] seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'` [VC: Proper Group/PE Nesting]
  // simplification to solve circular referencing, but doesn't check validity constraint "Proper Group/PE Nesting"
  // var _choice_or_seq = reg('[', NameChar_s, SChar_s, chars(_children_quantity), '()|,]*');
  //
  // [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
  //         === (Name | '(' S? cp ( S? '|' S? cp )+ S? ')' | '(' S? cp ( S? ',' S? cp )* S? ')') ('?' | '*' | '+')?
  //         !== (Name | [_choice_or_seq]*) ('?' | '*' | '+')?
  //
  // simplification to solve circular referencing, but doesn't check validity constraint "Proper Group/PE Nesting"
  // var cp = reg(regg(Name, '|', _choice_or_seq), _children_quantity);
  //
  // The following two expressions are not used, they were reported as
  // "Inefficient regular expression (High)":
  //
  // This part of the regular expression may cause exponential backtracking
  // on strings starting with '(|' and containing many repetitions of '|'.
  // https://github.com/xmldom/xmldom/security/code-scanning/91
  // var choice = regg(/\(/, S_OPT, cp, regg(S_OPT, /\|/, S_OPT, cp), '+', S_OPT, /\)/);
  //
  // This part of the regular expression may cause exponential backtracking
  // on strings starting with '(,' and containing many repetitions of ','.
  // https://github.com/xmldom/xmldom/security/code-scanning/92
  // var seq = regg(/\(/, S_OPT, cp, regg(S_OPT, /,/, S_OPT, cp), '*', S_OPT, /\)/);

  // `[47] children ::= (choice | seq) ('?' | '*' | '+')?`
  // simplification to solve circular referencing, but doesn't check validity constraint "Proper Group/PE Nesting"
  var children = reg(/\([^>]+\)/, _children_quantity /*regg(choice, '|', seq), _children_quantity*/);

  // https://www.w3.org/TR/xml11/#NT-contentspec
  // `[46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children`
  var contentspec = regg('EMPTY', '|', 'ANY', '|', Mixed, '|', children);

  var ELEMENTDECL_START = '<!ELEMENT';

  // https://www.w3.org/TR/xml11/#NT-elementdecl
  // `[45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'`
  // https://www.w3.org/TR/xml-names/#NT-elementdecl
  // `[17] elementdecl ::= '<!ELEMENT' S QName S contentspec S? '>'`
  // because of https://www.w3.org/TR/xml11/#NT-PEReference
  // since xmldom is not supporting replacements of PEReferences in the DTD
  // this also supports PEReference in the possible places
  var elementdecl = reg(ELEMENTDECL_START, S, regg(QName, '|', PEReference), S, regg(contentspec, '|', PEReference), S_OPT, '>');
  // https://www.w3.org/TR/xml11/#NT-NotationType
  // `[58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'`
  // [VC: Notation Attributes] [VC: One Notation Per Element Type] [VC: No Notation on Empty Element] [VC: No Duplicate Tokens]
  var NotationType = reg('NOTATION', S, /\(/, S_OPT, Name, regg(S_OPT, /\|/, S_OPT, Name), '*', S_OPT, /\)/);

  // https://www.w3.org/TR/xml11/#NT-Enumeration
  // `[59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'`
  // [VC: Enumeration] [VC: No Duplicate Tokens]
  var Enumeration = reg(/\(/, S_OPT, Nmtoken, regg(S_OPT, /\|/, S_OPT, Nmtoken), '*', S_OPT, /\)/);

  // https://www.w3.org/TR/xml11/#NT-EnumeratedType
  // `[57] EnumeratedType ::= NotationType | Enumeration`
  var EnumeratedType = regg(NotationType, '|', Enumeration);

  // [55] StringType    ::= 'CDATA'
  // [56] TokenizedType ::= 'ID'       [VC: ID] [VC: One ID per Element Type] [VC: ID Attribute Default]
  //                      | 'IDREF'    [VC: IDREF]
  //                      | 'IDREFS'   [VC: IDREF]
  //                      | 'ENTITY'   [VC: Entity Name]
  //                      | 'ENTITIES' [VC: Entity Name]
  //                      | 'NMTOKEN'  [VC: Name Token]
  //                      | 'NMTOKENS' [VC: Name Token]
  // [54] AttType       ::= StringType | TokenizedType | EnumeratedType
  var AttType = regg(/CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS/, '|', EnumeratedType);

  // `[60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)`
  // [WFC: No < in Attribute Values] [WFC: No External Entity References]
  // [VC: Fixed Attribute Default] [VC: Required Attribute] [VC: Attribute Default Value Syntactically Correct]
  var DefaultDecl = regg(/#REQUIRED|#IMPLIED/, '|', regg(regg('#FIXED', S), '?', AttValue));

  // https://www.w3.org/TR/xml11/#NT-AttDef
  // [53] AttDef ::= S Name S AttType S DefaultDecl
  // https://www.w3.org/TR/xml-names/#NT-AttDef
  // [1] NSAttName ::= PrefixedAttName | DefaultAttName
  // [2] PrefixedAttName ::= 'xmlns:' NCName [NSC: Reserved Prefixes and Namespace Names]
  // [3] DefaultAttName ::= 'xmlns'
  // [21] AttDef ::= S (QName | NSAttName) S AttType S DefaultDecl
  //             === S Name S AttType S DefaultDecl
  // xmldom is not distinguishing between QName and NSAttName on this level
  // to support XML without namespaces in DTD we can not restrict it to QName
  var AttDef = regg(S, Name, S, AttType, S, DefaultDecl);

  var ATTLIST_DECL_START = '<!ATTLIST';

  // https://www.w3.org/TR/xml11/#NT-AttlistDecl
  // `[52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'`
  // https://www.w3.org/TR/xml-names/#NT-AttlistDecl
  // `[20] AttlistDecl ::= '<!ATTLIST' S QName AttDef* S? '>'`
  // to support XML without namespaces in DTD we can not restrict it to QName
  var AttlistDecl = reg(ATTLIST_DECL_START, S, Name, AttDef, '*', S_OPT, '>');
  // https://html.spec.whatwg.org/multipage/urls-and-fetching.html#about:legacy-compat
  var ABOUT_LEGACY_COMPAT = 'about:legacy-compat';
  // the value wrapped in double or in single quotes
  var ABOUT_LEGACY_COMPAT_SystemLiteral = regg('"' + ABOUT_LEGACY_COMPAT + '"', '|', "'" + ABOUT_LEGACY_COMPAT + "'");

  var SYSTEM = 'SYSTEM';
  var PUBLIC = 'PUBLIC';

  // https://www.w3.org/TR/xml11/#NT-ExternalID
  // `[75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral`
  var ExternalID = regg(regg(SYSTEM, S, SystemLiteral), '|', regg(PUBLIC, S, PubidLiteral, S, SystemLiteral));

  // Same structure as `ExternalID`, but anchored at the start of the input
  // and with the literals wrapped into the named capturing groups
  // `SystemLiteralOnly` (SYSTEM) or `PubidLiteral` and `SystemLiteral` (PUBLIC).
  var ExternalID_match = reg(
    '^',
    regg(
      regg(SYSTEM, S, '(?<SystemLiteralOnly>', SystemLiteral, ')'),
      '|',
      regg(PUBLIC, S, '(?<PubidLiteral>', PubidLiteral, ')', S, '(?<SystemLiteral>', SystemLiteral, ')')
    )
  );
  // https://www.w3.org/TR/xml11/#NT-NDataDecl
  // `[76] NDataDecl ::= S 'NDATA' S Name` [VC: Notation Declared]
  var NDataDecl = regg(S, 'NDATA', S, Name);

  // https://www.w3.org/TR/xml11/#NT-EntityDef
  // `[73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)`
  var EntityDef = regg(EntityValue, '|', regg(ExternalID, NDataDecl, '?'));

  var ENTITY_DECL_START = '<!ENTITY';

  // https://www.w3.org/TR/xml11/#NT-GEDecl
  // `[71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'`
  var GEDecl = reg(ENTITY_DECL_START, S, Name, S, EntityDef, S_OPT, '>');

  // https://www.w3.org/TR/xml11/#NT-PEDef
  // `[74] PEDef ::= EntityValue | ExternalID`
  var PEDef = regg(EntityValue, '|', ExternalID);

  // https://www.w3.org/TR/xml11/#NT-PEDecl
  // `[72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'`
  var PEDecl = reg(ENTITY_DECL_START, S, '%', S, Name, S, PEDef, S_OPT, '>');

  // https://www.w3.org/TR/xml11/#NT-EntityDecl
  // `[70] EntityDecl ::= GEDecl | PEDecl`
  var EntityDecl = regg(GEDecl, '|', PEDecl);

  // https://www.w3.org/TR/xml11/#NT-PublicID
  // `[83] PublicID ::= 'PUBLIC' S PubidLiteral`
  var PublicID = reg(PUBLIC, S, PubidLiteral);

  // https://www.w3.org/TR/xml11/#NT-NotationDecl
  // `[82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'` [VC: Unique Notation Name]
  var NotationDecl = reg('<!NOTATION', S, Name, S, regg(ExternalID, '|', PublicID), S_OPT, '>');
  // https://www.w3.org/TR/xml11/#NT-Eq
  // `[25] Eq ::= S? '=' S?`
  var Eq = reg(S_OPT, '=', S_OPT);

  // https://www.w3.org/TR/xml/#NT-VersionNum
  // `[26] VersionNum ::= '1.' [0-9]+`
  // https://www.w3.org/TR/xml11/#NT-VersionNum
  // `[26] VersionNum ::= '1.1'`
  // the expression follows the XML 1.0 production, which includes `1.1`
  var VersionNum = /1[.]\d+/;

  // https://www.w3.org/TR/xml11/#NT-VersionInfo
  // `[24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')`
  var VersionInfo = reg(S, 'version', Eq, regg("'", VersionNum, "'", '|', '"', VersionNum, '"'));

  // https://www.w3.org/TR/xml11/#NT-EncName
  // `[81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*`
  var EncName = /[A-Za-z][-A-Za-z0-9._]*/;

  // https://www.w3.org/TR/xml11/#NT-EncDecl
  // `[80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )`
  var EncodingDecl = regg(S, 'encoding', Eq, regg('"', EncName, '"', '|', "'", EncName, "'"));

  // https://www.w3.org/TR/xml11/#NT-SDDecl
  // `[32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))`
  var SDDecl = regg(S, 'standalone', Eq, regg("'", regg('yes', '|', 'no'), "'", '|', '"', regg('yes', '|', 'no'), '"'));

  // https://www.w3.org/TR/xml11/#NT-XMLDecl
  // [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
  // anchored at the start of the input
  var XMLDecl = reg(/^<\?xml/, VersionInfo, EncodingDecl, '?', SDDecl, '?', S_OPT, /\?>/);
  // Not implemented as an expression:
  // https://www.w3.org/TR/xml/#NT-markupdecl
  // https://www.w3.org/TR/xml11/#NT-markupdecl
  // `[29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment`
  // var markupdecl = regg(elementdecl, '|', AttlistDecl, '|', EntityDecl, '|', NotationDecl, '|', PI_unsafe, '|', Comment);

  // Not implemented as an expression:
  // https://www.w3.org/TR/xml-names/#NT-doctypedecl
  // `[28a] DeclSep ::= PEReference | S`
  // https://www.w3.org/TR/xml11/#NT-intSubset
  //
  // [28b] intSubset ::= (markupdecl | DeclSep)*
  //                 === (markupdecl | PEReference | S)*
  //
  // [WFC: PE Between Declarations]
  // var intSubset = reg(regg(markupdecl, '|', PEReference, '|', S), '*');

  var DOCTYPE_DECL_START = '<!DOCTYPE';

  // Not implemented as an expression:
  // https://www.w3.org/TR/xml11/#NT-doctypedecl
  // `[28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'`
  // https://www.w3.org/TR/xml-names/#NT-doctypedecl
  // `[16] doctypedecl ::= '<!DOCTYPE' S QName (S ExternalID)? S? ('[' (markupdecl | PEReference | S)* ']' S?)? '>'`
  // var doctypedecl = reg('<!DOCTYPE', S, Name, regg(S, ExternalID), '?', S_OPT, regg(/\[/, intSubset, /]/, S_OPT), '?', '>');

  var CDATA_START = '<![CDATA[';
  var CDATA_END = ']]>';
  var CDStart = /<!\[CDATA\[/;
  var CDEnd = /\]\]>/;
  // the content is matched non-greedy and the expression includes the closing `]]>`
  var CData = reg(Char, '*?', CDEnd);

  // https://www.w3.org/TR/xml/#dt-cdsection
  // `[18] CDSect ::= CDStart CData CDEnd`
  // `[19] CDStart ::= '<![CDATA['`
  // `[20] CData ::= (Char* - (Char* ']]>' Char*))`
  // `[21] CDEnd ::= ']]>'`
  // `CDEnd` is not repeated here since it is already part of `CData`
  var CDSect = reg(CDStart, CData);
  // helper functions (unit tested)
  exports.chars = chars;
  exports.chars_without = chars_without;
  exports.detectUnicodeSupport = detectUnicodeSupport;
  exports.reg = reg;
  exports.regg = regg;

  // constants and expressions of the grammar
  exports.ABOUT_LEGACY_COMPAT = ABOUT_LEGACY_COMPAT;
  exports.ABOUT_LEGACY_COMPAT_SystemLiteral = ABOUT_LEGACY_COMPAT_SystemLiteral;
  exports.AttlistDecl = AttlistDecl;
  exports.CDATA_START = CDATA_START;
  exports.CDATA_END = CDATA_END;
  exports.CDSect = CDSect;
  exports.Char = Char;
  exports.Comment = Comment;
  exports.COMMENT_START = COMMENT_START;
  exports.COMMENT_END = COMMENT_END;
  exports.DOCTYPE_DECL_START = DOCTYPE_DECL_START;
  exports.elementdecl = elementdecl;
  exports.EntityDecl = EntityDecl;
  exports.EntityValue = EntityValue;
  exports.ExternalID = ExternalID;
  exports.ExternalID_match = ExternalID_match;
  exports.Name = Name;
  exports.NotationDecl = NotationDecl;
  exports.Reference = Reference;
  exports.PEReference = PEReference;
  exports.PI = PI;
  exports.PUBLIC = PUBLIC;
  exports.PubidLiteral = PubidLiteral;
  exports.QName = QName;
  exports.QName_exact = QName_exact;
  exports.QName_group = QName_group;
  exports.S = S;
  exports.SChar_s = SChar_s;
  exports.S_OPT = S_OPT;
  exports.SYSTEM = SYSTEM;
  exports.SystemLiteral = SystemLiteral;
  exports.UNICODE_REPLACEMENT_CHARACTER = UNICODE_REPLACEMENT_CHARACTER;
  exports.UNICODE_SUPPORT = UNICODE_SUPPORT;
  exports.XMLDecl = XMLDecl;