sax.js 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929
  1. 'use strict';
  2. var conventions = require('./conventions');
  3. var g = require('./grammar');
  4. var errors = require('./errors');
  5. var isHTMLEscapableRawTextElement = conventions.isHTMLEscapableRawTextElement;
  6. var isHTMLMimeType = conventions.isHTMLMimeType;
  7. var isHTMLRawTextElement = conventions.isHTMLRawTextElement;
  8. var hasOwn = conventions.hasOwn;
  9. var NAMESPACE = conventions.NAMESPACE;
  10. var ParseError = errors.ParseError;
  11. var DOMException = errors.DOMException;
  12. //var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',')
  13. //S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE
  14. //S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE
  // States of the start-tag scanner in parseElementStartPart.
  var S_TAG = 0; // reading the tag name
  var S_ATTR = 1; // reading an attribute name
  var S_ATTR_SPACE = 2; // attribute name finished, whitespace seen after it
  var S_EQ = 3; // "=" seen (whitespace may follow)
  var S_ATTR_NOQUOT_VALUE = 4; // reading an attribute value that has no quotes
  var S_ATTR_END = 5; // closing quote of an attribute value seen, no whitespace yet
  var S_TAG_SPACE = 6; // whitespace after the tag name or after an attribute value
  var S_TAG_CLOSE = 7; // "/" of a self-closing element seen (<el />)
  /**
   * SAX-style reader. `parse` reads `this.domBuilder` and `this.errorHandler`;
   * the constructor itself sets neither of them.
   */
  function XMLReader() {}
  XMLReader.prototype = {
    /**
     * Parses `source`, wrapping the run in `startDocument` / `endDocument` calls on the builder.
     *
     * @param {string} source
     * @param {Object} defaultNSMap
     * Initial prefix -> namespace map; its own properties are copied into a fresh
     * prototype-less object so the caller's map is never handed to the parse loop.
     * @param {Object} entityMap
     */
    parse: function (source, defaultNSMap, entityMap) {
      var builder = this.domBuilder;
      builder.startDocument();
      var nsMapCopy = Object.create(null);
      _copy(defaultNSMap, nsMapCopy);
      parse(source, nsMapCopy, entityMap, builder, this.errorHandler);
      builder.endDocument();
    },
  };
  /**
   * Matches every candidate character or entity reference: `&`, an optional `#`,
   * one or more word characters and an optional terminating `;`.
   * The `;` is optional on purpose because HTML tolerates references without it.
   * Matching is deliberately loose; the `entityReplacer` callback validates each hit,
   * replaces it where possible and reports the rest to the errorHandler,
   * depending on the context.
   */
  var ENTITY_REG = /&#?\w+;?/g;
  /**
   * Core parse loop: walks `source` from left to right, locates each `<` and dispatches on the
   * character that follows it (`/` end tag, `?` processing instruction, `!` doctype/comment/CDATA,
   * anything else a start tag). Text found between markup is passed to `domBuilder.characters`
   * after entity replacement.
   *
   * @param {string} source
   * The complete markup.
   * @param {Object} defaultNSMapCopy
   * Namespace map used as `currentNSMap` of the bottom entry of the parse stack.
   * @param {Object} entityMap
   * Entity name -> replacement text, consulted for references found in text and attributes.
   * @param {Object} domBuilder
   * Receives the SAX-style callbacks; `mimeType`, `locator`, `doc` and `currentElement` are read from it.
   * @param {Object} errorHandler
   * Provides `warning`, `error` and `fatalError`.
   * @throws {ParseError}
   * Rethrows any `ParseError`; a `DOMException` raised while parsing is wrapped into a `ParseError`.
   */
  function parse(source, defaultNSMapCopy, entityMap, domBuilder, errorHandler) {
    var isHTML = isHTMLMimeType(domBuilder.mimeType);
    if (source.indexOf(g.UNICODE_REPLACEMENT_CHARACTER) >= 0) {
      errorHandler.warning('Unicode replacement character detected, source encoding issues?');
    }

    /**
     * Converts a code point into a string. `String.fromCharCode` only takes UTF-16 code units,
     * so values above 0xFFFF are split into a surrogate pair by hand.
     */
    function fixedFromCharCode(code) {
      if (code > 0xffff) {
        code -= 0x10000;
        var surrogate1 = 0xd800 + (code >> 10),
          surrogate2 = 0xdc00 + (code & 0x3ff);
        return String.fromCharCode(surrogate1, surrogate2);
      } else {
        return String.fromCharCode(code);
      }
    }

    /**
     * Replacement callback for `ENTITY_REG`: returns the replacement text for the reference `a`,
     * or `a` itself (after reporting an error) when it cannot be resolved.
     */
    function entityReplacer(a) {
      // outside of HTML a reference must be terminated by `;`
      var complete = a[a.length - 1] === ';' ? a : a + ';';
      if (!isHTML && complete !== a) {
        errorHandler.error('EntityRef: expecting ;');
        return a;
      }
      var match = g.Reference.exec(complete);
      if (!match || match[0].length !== complete.length) {
        errorHandler.error('entity not matching Reference production: ' + a);
        return a;
      }
      var k = complete.slice(1, -1);
      if (hasOwn(entityMap, k)) {
        return entityMap[k];
      } else if (k.charAt(0) === '#') {
        // `#x41` becomes `0x41`, which parseInt reads as hexadecimal; `#65` is read as decimal
        return fixedFromCharCode(parseInt(k.substring(1).replace('x', '0x')));
      } else {
        errorHandler.error('entity not found:' + a);
        return a;
      }
    }

    /** Emits the text between `start` and `end` (entities replaced) and moves `start` to `end`. */
    function appendText(end) {
      if (end > start) {
        var xt = source.substring(start, end).replace(ENTITY_REG, entityReplacer);
        locator && position(start);
        // The length has to describe the replaced text, not the source span:
        // a replacement that is longer than its reference would otherwise be cut off.
        domBuilder.characters(xt, 0, xt.length);
        start = end;
      }
    }

    var lineStart = 0;
    var lineEnd = 0;
    var linePattern = /\r\n?|\n|$/g;
    var locator = domBuilder.locator;

    /** Advances the locator's line number up to offset `p` and sets its (1-based) column number. */
    function position(p, m) {
      // linePattern is global, so each exec continues after the previous line end
      while (p >= lineEnd && (m = linePattern.exec(source))) {
        lineStart = lineEnd;
        lineEnd = m.index + m[0].length;
        locator.lineNumber++;
      }
      locator.columnNumber = p - lineStart + 1;
    }

    var parseStack = [{ currentNSMap: defaultNSMapCopy }];
    var unclosedTags = [];
    var start = 0;
    while (true) {
      try {
        var tagStart = source.indexOf('<', start);
        if (tagStart < 0) {
          // no more markup: validate and handle whatever is left, then stop
          if (!isHTML && unclosedTags.length > 0) {
            return errorHandler.fatalError('unclosed xml tag(s): ' + unclosedTags.join(', '));
          }
          if (!source.substring(start).match(/^\s*$/)) {
            var doc = domBuilder.doc;
            var text = doc.createTextNode(source.substring(start));
            if (doc.documentElement) {
              return errorHandler.error('Extra content at the end of the document');
            }
            doc.appendChild(text);
            domBuilder.currentElement = text;
          }
          return;
        }
        if (tagStart > start) {
          var fromSource = source.substring(start, tagStart);
          if (!isHTML && unclosedTags.length === 0) {
            // outside of the root element only whitespace is acceptable
            fromSource = fromSource.replace(new RegExp(g.S_OPT.source, 'g'), '');
            fromSource && errorHandler.error("Unexpected content outside root element: '" + fromSource + "'");
          }
          appendText(tagStart);
        }
        switch (source.charAt(tagStart + 1)) {
          case '/':
            var end = source.indexOf('>', tagStart + 2);
            var tagNameRaw = source.substring(tagStart + 2, end > 0 ? end : undefined);
            if (!tagNameRaw) {
              return errorHandler.fatalError('end tag name missing');
            }
            var tagNameMatch = end > 0 && g.reg('^', g.QName_group, g.S_OPT, '$').exec(tagNameRaw);
            if (!tagNameMatch) {
              return errorHandler.fatalError('end tag name contains invalid characters: "' + tagNameRaw + '"');
            }
            if (!domBuilder.currentElement && !domBuilder.doc.documentElement) {
              // not enough information to provide a helpful error message,
              // but parsing will throw since there is no root element
              return;
            }
            var currentTagName =
              unclosedTags[unclosedTags.length - 1] ||
              domBuilder.currentElement.tagName ||
              domBuilder.doc.documentElement.tagName ||
              '';
            if (currentTagName !== tagNameMatch[1]) {
              // in HTML the names only have to match case-insensitively
              var tagNameLower = tagNameMatch[1].toLowerCase();
              if (!isHTML || currentTagName.toLowerCase() !== tagNameLower) {
                return errorHandler.fatalError('Opening and ending tag mismatch: "' + currentTagName + '" != "' + tagNameRaw + '"');
              }
            }
            var config = parseStack.pop();
            unclosedTags.pop();
            var localNSMap = config.localNSMap;
            domBuilder.endElement(config.uri, config.localName, currentTagName);
            if (localNSMap) {
              for (var prefix in localNSMap) {
                if (hasOwn(localNSMap, prefix)) {
                  domBuilder.endPrefixMapping(prefix);
                }
              }
            }
            end++;
            break;
          // end element
          case '?': // <?...?>
            locator && position(tagStart);
            end = parseProcessingInstruction(source, tagStart, domBuilder, errorHandler);
            break;
          case '!': // <!doctype,<![CDATA,<!--
            locator && position(tagStart);
            end = parseDoctypeCommentOrCData(source, tagStart, domBuilder, errorHandler, isHTML);
            break;
          default:
            locator && position(tagStart);
            var el = new ElementAttributes();
            var currentNSMap = parseStack[parseStack.length - 1].currentNSMap;
            //elStartEnd
            var end = parseElementStartPart(source, tagStart, el, currentNSMap, entityReplacer, errorHandler, isHTML);
            var len = el.length;
            if (!el.closed) {
              if (isHTML && conventions.isHTMLVoidElement(el.tagName)) {
                el.closed = true;
              } else {
                unclosedTags.push(el.tagName);
              }
            }
            if (locator && len) {
              // remember the element position, then record a locator per attribute
              var locator2 = copyLocator(locator, {});
              for (var i = 0; i < len; i++) {
                var a = el[i];
                position(a.offset);
                a.locator = copyLocator(locator, {});
              }
              // the builder sees the element position while the element is appended
              domBuilder.locator = locator2;
              if (appendElement(el, domBuilder, currentNSMap)) {
                parseStack.push(el);
              }
              domBuilder.locator = locator;
            } else {
              if (appendElement(el, domBuilder, currentNSMap)) {
                parseStack.push(el);
              }
            }
            if (isHTML && !el.closed) {
              end = parseHtmlSpecialContent(source, end, el.tagName, entityReplacer, domBuilder);
            } else {
              end++;
            }
        }
      } catch (e) {
        if (e instanceof ParseError) {
          throw e;
        } else if (e instanceof DOMException) {
          throw new ParseError(e.name + ': ' + e.message, domBuilder.locator, e);
        }
        errorHandler.error('element parse error: ' + e);
        end = -1;
      }
      if (end > start) {
        start = end;
      } else {
        //Possible sax fallback here, risk of positional error
        appendText(Math.max(tagStart, start) + 1);
      }
    }
  }
  /**
   * Copies the position (`lineNumber`, `columnNumber`) from locator `f` onto `t`.
   *
   * @param {{lineNumber: number, columnNumber: number}} f the locator to read from
   * @param {Object} t the object to write to
   * @returns {Object} `t`
   */
  function copyLocator(f, t) {
    var fields = ['lineNumber', 'columnNumber'];
    for (var i = 0; i < fields.length; i++) {
      t[fields[i]] = f[fields[i]];
    }
    return t;
  }
  /**
   * Scans a start tag character by character with a small state machine (`S_*` states),
   * setting the tag name on `el` and adding every attribute it finds.
   *
   * @param {string} source
   * @param {number} start
   * Index of the `<` that opens the tag.
   * @param {ElementAttributes} el
   * Receives the tag name (`setTagName`), the attributes (`addValue`) and the `closed` flag.
   * @param {Object} currentNSMap
   * Not read by this function.
   * @param {function(string): string} entityReplacer
   * Applied to references found in attribute values.
   * @param {Object} errorHandler
   * @param {boolean} isHTML
   * @returns {number}
   * The index of the `>` ending the start tag (also for self-closing elements),
   * or the index at which the input ended prematurely.
   * @throws {Error}
   * When the tag is malformed in a way the state machine does not tolerate.
   * @see {@link #appendElement}
   */
  function parseElementStartPart(source, start, el, currentNSMap, entityReplacer, errorHandler, isHTML) {
    /**
     * @param {string} qname
     * @param {string} value
     * @param {number} startIndex
     */
    function addAttribute(qname, value, startIndex) {
      if (hasOwn(el.attributeNames, qname)) {
        return errorHandler.fatalError('Attribute ' + qname + ' redefined');
      }
      if (!isHTML && value.indexOf('<') >= 0) {
        return errorHandler.fatalError("Unescaped '<' not allowed in attributes values");
      }
      el.addValue(
        qname,
        // @see https://www.w3.org/TR/xml/#AVNormalize
        // since the xmldom sax parser does not "interpret" DTD the following is not implemented:
        // - recursive replacement of (DTD) entity references
        // - trimming and collapsing multiple spaces into a single one for attributes that are not of type CDATA
        value.replace(/[\t\n\r]/g, ' ').replace(ENTITY_REG, entityReplacer),
        startIndex
      );
    }

    var attrName;
    var value;
    var p = ++start; // skip the '<'
    var s = S_TAG; // current state
    while (true) {
      var c = source.charAt(p);
      switch (c) {
        case '=':
          if (s === S_ATTR) {
            attrName = source.slice(start, p);
            s = S_EQ;
          } else if (s === S_ATTR_SPACE) {
            s = S_EQ;
          } else {
            //fatalError: equal must after attrName or space after attrName
            throw new Error('attribute equal must after attrName'); // No known test case
          }
          break;
        case "'":
        case '"':
          if (
            s === S_EQ ||
            s === S_ATTR //|| s == S_ATTR_SPACE
          ) {
            if (s === S_ATTR) {
              // a quote directly after the name: tolerate the missing "="
              errorHandler.warning('attribute value must after "="');
              attrName = source.slice(start, p);
            }
            start = p + 1;
            // jump straight to the matching closing quote
            p = source.indexOf(c, start);
            if (p > 0) {
              value = source.slice(start, p);
              addAttribute(attrName, value, start - 1);
              s = S_ATTR_END;
            } else {
              //fatalError: no end quot match
              throw new Error("attribute value no end '" + c + "' match");
            }
          } else if (s === S_ATTR_NOQUOT_VALUE) {
            // a quote ends a value that was started without one
            value = source.slice(start, p);
            addAttribute(attrName, value, start);
            errorHandler.warning('attribute "' + attrName + '" missed start quot(' + c + ')!!');
            start = p + 1;
            s = S_ATTR_END;
          } else {
            //fatalError: no equal before
            throw new Error('attribute value must after "="'); // No known test case
          }
          break;
        case '/':
          switch (s) {
            case S_TAG:
              el.setTagName(source.slice(start, p));
            // falls through
            case S_ATTR_END:
            case S_TAG_SPACE:
            case S_TAG_CLOSE:
              s = S_TAG_CLOSE;
              el.closed = true;
            // falls through
            case S_ATTR_NOQUOT_VALUE:
            case S_ATTR:
              // inside a name or an unquoted value the '/' is just another character
              break;
            case S_ATTR_SPACE:
              el.closed = true;
              break;
            //case S_EQ:
            default:
              throw new Error("attribute invalid close char('/')"); // No known test case
          }
          break;
        case '': //end document
          errorHandler.error('unexpected end of input');
          if (s === S_TAG) {
            el.setTagName(source.slice(start, p));
          }
          return p;
        case '>':
          switch (s) {
            case S_TAG:
              el.setTagName(source.slice(start, p));
            // falls through
            case S_ATTR_END:
            case S_TAG_SPACE:
            case S_TAG_CLOSE:
              break; //normal
            case S_ATTR_NOQUOT_VALUE: //Compatible state
            case S_ATTR:
              value = source.slice(start, p);
              if (value.slice(-1) === '/') {
                // the '/' of '/>' was swallowed by the name or unquoted value
                el.closed = true;
                value = value.slice(0, -1);
              }
            // falls through
            case S_ATTR_SPACE:
              if (s === S_ATTR_SPACE) {
                value = attrName;
              }
              if (s === S_ATTR_NOQUOT_VALUE) {
                errorHandler.warning('attribute "' + value + '" missed quot(")!');
                addAttribute(attrName, value, start);
              } else {
                // attribute without a value: its name is used as the value
                if (!isHTML) {
                  errorHandler.warning('attribute "' + value + '" missed value!! "' + value + '" instead!!');
                }
                addAttribute(value, value, start);
              }
              break;
            case S_EQ:
              if (!isHTML) {
                return errorHandler.fatalError('AttValue: \' or " expected');
              }
          }
          return p;
        /*xml space '\x20' | #x9 | #xD | #xA; */
        case '\u0080':
          c = ' ';
        // falls through
        default:
          if (c <= ' ') {
            //space
            switch (s) {
              case S_TAG:
                el.setTagName(source.slice(start, p)); //tagName
                s = S_TAG_SPACE;
                break;
              case S_ATTR:
                attrName = source.slice(start, p);
                s = S_ATTR_SPACE;
                break;
              case S_ATTR_NOQUOT_VALUE:
                value = source.slice(start, p);
                errorHandler.warning('attribute "' + value + '" missed quot(")!!');
                addAttribute(attrName, value, start);
              // falls through
              case S_ATTR_END:
                s = S_TAG_SPACE;
                break;
              // S_TAG_SPACE, S_EQ, S_ATTR_SPACE and S_TAG_CLOSE ignore whitespace
            }
          } else {
            //not space
            switch (s) {
              // S_TAG, S_ATTR and S_ATTR_NOQUOT_VALUE simply keep consuming characters
              case S_ATTR_SPACE:
                // a new name starts, so the previous attribute had no value
                if (!isHTML) {
                  errorHandler.warning('attribute "' + attrName + '" missed value!! "' + attrName + '" instead2!!');
                }
                addAttribute(attrName, attrName, start);
                start = p;
                s = S_ATTR;
                break;
              case S_ATTR_END:
                errorHandler.warning('attribute space is required"' + attrName + '"!!');
              // falls through
              case S_TAG_SPACE:
                s = S_ATTR;
                start = p;
                break;
              case S_EQ:
                s = S_ATTR_NOQUOT_VALUE;
                start = p;
                break;
              case S_TAG_CLOSE:
                throw new Error("elements closed character '/' and '>' must be connected to");
            }
          }
      } //end outer switch
      p++;
    }
  }
  /**
   * Resolves the namespaces of an element and its attributes and reports the element
   * to the builder (`startPrefixMapping`, `startElement` and, for closed elements,
   * `endElement` / `endPrefixMapping`).
   *
   * @param {ElementAttributes} el
   * The parsed start tag; `localName`, `prefix` and `uri` are written to it and to its attributes.
   * @param {Object} domBuilder
   * @param {Object} currentNSMap
   * The prefix -> namespace map in scope. It is never modified: as soon as the element
   * declares a namespace, a copy is extended instead.
   * @returns {true | undefined}
   * `true` if the element is not closed, in which case `el.currentNSMap` and `el.localNSMap`
   * have been set so the caller can keep it as the new scope; `undefined` for closed elements.
   */
  function appendElement(el, domBuilder, currentNSMap) {
    var tagName = el.tagName;
    var localNSMap = null;
    var a, nsp, prefix, localName, nsPrefix;

    // First pass: split the attribute names and collect namespace declarations.
    var i = el.length;
    while (i--) {
      a = el[i];
      var qName = a.qName;
      var value = a.value;
      nsp = qName.indexOf(':');
      if (nsp > 0) {
        prefix = a.prefix = qName.slice(0, nsp);
        localName = qName.slice(nsp + 1);
        // `xmlns:foo` declares the prefix `foo`
        nsPrefix = prefix === 'xmlns' && localName;
      } else {
        localName = qName;
        prefix = null;
        // a plain `xmlns` declares the default namespace, stored under ''
        nsPrefix = qName === 'xmlns' && '';
      }
      //can not set prefix,because prefix !== ''
      a.localName = localName;
      //prefix == null for no ns prefix attribute
      if (nsPrefix !== false) {
        if (localNSMap == null) {
          localNSMap = Object.create(null);
          // copy on first declaration so the enclosing scope stays untouched
          _copy(currentNSMap, (currentNSMap = Object.create(null)));
        }
        currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value;
        a.uri = NAMESPACE.XMLNS;
        domBuilder.startPrefixMapping(nsPrefix, value);
      }
    }

    // Second pass: now that all declarations are known, resolve the prefixed attributes.
    i = el.length;
    while (i--) {
      a = el[i];
      if (a.prefix) {
        //no prefix attribute has no namespace
        if (a.prefix === 'xml') {
          // the `xml` prefix is bound by definition; it must not be looked up in the map,
          // which would yield undefined when the map carries no entry for it
          a.uri = NAMESPACE.XML;
        } else if (a.prefix !== 'xmlns') {
          a.uri = currentNSMap[a.prefix];
        }
      }
    }

    nsp = tagName.indexOf(':');
    if (nsp > 0) {
      prefix = el.prefix = tagName.slice(0, nsp);
      localName = el.localName = tagName.slice(nsp + 1);
    } else {
      prefix = null; //important!!
      localName = el.localName = tagName;
    }
    //no prefix element has default namespace
    var ns = (el.uri = currentNSMap[prefix || '']);
    domBuilder.startElement(ns, localName, tagName, el);
    //endPrefixMapping and startPrefixMapping have not any help for dom builder
    if (el.closed) {
      domBuilder.endElement(ns, localName, tagName);
      if (localNSMap) {
        for (prefix in localNSMap) {
          if (hasOwn(localNSMap, prefix)) {
            domBuilder.endPrefixMapping(prefix);
          }
        }
      }
    } else {
      el.currentNSMap = currentNSMap;
      el.localNSMap = localNSMap;
      return true;
    }
  }
  /**
   * Handles the content of HTML raw text and escapable raw text elements, which is
   * reported as a single run of characters instead of being parsed as markup.
   * Entity references are only replaced for escapable raw text elements.
   *
   * https://html.spec.whatwg.org/#raw-text-elements
   * https://html.spec.whatwg.org/#escapable-raw-text-elements
   * https://html.spec.whatwg.org/#cdata-rcdata-restrictions:raw-text-elements
   * TODO: https://html.spec.whatwg.org/#cdata-rcdata-restrictions
   *
   * @param {string} source
   * @param {number} elStartEnd
   * Index of the `>` that ends the start tag.
   * @param {string} tagName
   * @param {function(string): string} entityReplacer
   * @param {Object} domBuilder
   * @returns {number}
   * For (escapable) raw text elements the index of the `</tagName>` end tag,
   * or `source.length` when there is no such end tag; otherwise `elStartEnd + 1`.
   */
  function parseHtmlSpecialContent(source, elStartEnd, tagName, entityReplacer, domBuilder) {
    var isEscapableRaw = isHTMLEscapableRawTextElement(tagName);
    if (!isEscapableRaw && !isHTMLRawTextElement(tagName)) {
      return elStartEnd + 1;
    }
    var elEndStart = source.indexOf('</' + tagName + '>', elStartEnd);
    if (elEndStart < 0) {
      // No end tag: everything up to the end of the input is content.
      // Passing -1 on to substring would select the START of the document instead,
      // and returning -1 would send the caller back to re-scan the start tag as text.
      elEndStart = source.length;
    }
    var text = source.substring(elStartEnd + 1, elEndStart);
    if (isEscapableRaw) {
      text = text.replace(ENTITY_REG, entityReplacer);
    }
    domBuilder.characters(text, 0, text.length);
    return elEndStart;
  }
  /**
   * Shallow-copies the own enumerable properties of `source` onto `target`.
   * Inherited properties are skipped; a `source` that cannot be enumerated
   * (e.g. `undefined`) simply copies nothing.
   *
   * @param {Object} [source]
   * @param {Object} target
   */
  function _copy(source, target) {
    for (var key in source) {
      if (!hasOwn(source, key)) {
        continue;
      }
      target[key] = source[key];
    }
  }
  /**
   * @typedef ParseUtils
   * @property {function(relativeIndex: number?): string} char
   * Provides look ahead access to a single character relative to the current index
   * (an empty string when that position is outside of `source`).
   * @property {function(): number} getIndex
   * Provides read-only access to the current index.
   * @property {function(reg: RegExp): string | null} getMatch
   * Applies the provided regular expression enforcing that it starts at the current index and
   * returns the complete matching string,
   * and moves the current index by the length of the matching string.
   * Returns `null` (without moving) when there is no match.
   * @property {function(): string} getSource
   * Provides read-only access to the complete source.
   * @property {function(places: number?): void} skip
   * moves the current index by places (defaults to 1 when omitted; 0 does not move)
   * @property {function(): number} skipBlanks
   * Moves the current index by the amount of white space that directly follows the current index
   * and returns the amount of whitespace chars skipped (0..n),
   * or -1 if the end of the source was reached.
   * @property {function(): string} substringFromIndex
   * creates a substring from the current index to the end of `source`
   * @property {function(compareWith: string): boolean} substringStartsWith
   * Checks if `source` contains `compareWith`, starting from the current index.
   * @property {function(compareWith: string): boolean} substringStartsWithCaseInsensitive
   * Checks if `source` contains `compareWith`, starting from the current index,
   * comparing the upper case of both sides.
   * @see {@link parseUtils}
   */
  /**
   * A temporary scope for parsing and look ahead operations in `source`,
   * starting from index `start`.
   *
   * Some operations move the current index by a number of positions,
   * after which `getIndex` returns the new index.
   *
   * @param {string} source
   * @param {number} start
   * @returns {ParseUtils}
   */
  function parseUtils(source, start) {
    var index = start;

    function char(n) {
      n = n || 0;
      return source.charAt(index + n);
    }

    function skip(n) {
      // Only a missing argument means "one place"; an explicit 0 (for example the length
      // of an empty match) must leave the index where it is.
      index += n == null ? 1 : n;
    }

    function skipBlanks() {
      var blanks = 0;
      while (index < source.length) {
        var c = char();
        if (c !== ' ' && c !== '\n' && c !== '\t' && c !== '\r') {
          return blanks;
        }
        blanks++;
        skip();
      }
      // ran into the end of the source, regardless of how many blanks were skipped
      return -1;
    }

    function substringFromIndex() {
      return source.substring(index);
    }

    function substringStartsWith(text) {
      return source.substring(index, index + text.length) === text;
    }

    function substringStartsWithCaseInsensitive(text) {
      return source.substring(index, index + text.length).toUpperCase() === text.toUpperCase();
    }

    function getMatch(args) {
      // anchor the expression so it can only match at the current index
      var expr = g.reg('^', args);
      var match = expr.exec(substringFromIndex());
      if (match) {
        skip(match[0].length);
        return match[0];
      }
      return null;
    }

    return {
      char: char,
      getIndex: function () {
        return index;
      },
      getMatch: getMatch,
      getSource: function () {
        return source;
      },
      skip: skip,
      skipBlanks: skipBlanks,
      substringFromIndex: substringFromIndex,
      substringStartsWith: substringStartsWith,
      substringStartsWithCaseInsensitive: substringStartsWithCaseInsensitive,
    };
  }
  /**
   * Consumes the internal subset of a doctype declaration (`[ ... ]`), if the current
   * character of `p` is `[`. Each markup declaration, comment, processing instruction or
   * parameter entity reference inside is matched against the grammar, but not interpreted.
   *
   * @param {ParseUtils} p
   * Positioned where the internal subset may start; it is moved behind the closing `]`.
   * @param {DOMHandler} errorHandler
   * @returns {string | undefined}
   * The text between `[` and `]`, or `undefined` when there is no internal subset.
   * Whatever `errorHandler.fatalError` returns is passed through when the subset is malformed.
   */
  function parseDoctypeInternalSubset(p, errorHandler) {
    /**
     * Consumes a processing instruction; the xml declaration is rejected in this position.
     *
     * @returns {string}
     */
    function readPI() {
      var match = g.PI.exec(p.substringFromIndex());
      if (!match) {
        return errorHandler.fatalError('processing instruction is not well-formed at position ' + p.getIndex());
      }
      if (match[1].toLowerCase() === 'xml') {
        return errorHandler.fatalError(
          'xml declaration is only allowed at the start of the document, but found at position ' + p.getIndex()
        );
      }
      p.skip(match[0].length);
      return match[0];
    }

    /**
     * Consumes the construct starting with `<!`, selected by the characters that follow.
     *
     * @returns {string | null} the matched text, `null` when nothing known matched
     */
    function readDeclaration() {
      var third = p.char(2);
      if (third === 'E') {
        // ELEMENT | ENTITY
        var fourth = p.char(3);
        if (fourth === 'L') {
          return p.getMatch(g.elementdecl);
        }
        if (fourth === 'N') {
          return p.getMatch(g.EntityDecl);
        }
        return null;
      }
      if (third === 'A') {
        // ATTLIST
        return p.getMatch(g.AttlistDecl);
      }
      if (third === 'N') {
        // NOTATION
        return p.getMatch(g.NotationDecl);
      }
      if (third === '-') {
        // comment
        return p.getMatch(g.Comment);
      }
      return null;
    }

    var source = p.getSource();
    if (p.char() !== '[') {
      return;
    }
    p.skip(1);
    var subsetStart = p.getIndex();
    while (p.getIndex() < source.length) {
      p.skipBlanks();
      var head = p.char();
      if (head === ']') {
        var internalSubset = source.substring(subsetStart, p.getIndex());
        p.skip(1);
        return internalSubset;
      }
      var current;
      // conditional sections (`<![`) only exist in the external subset, so they are not handled
      if (head === '<' && p.char(1) === '!') {
        current = readDeclaration();
      } else if (head === '<' && p.char(1) === '?') {
        current = readPI();
      } else if (head === '%') {
        current = p.getMatch(g.PEReference);
      } else {
        return errorHandler.fatalError('Error detected in Markup declaration');
      }
      if (!current) {
        return errorHandler.fatalError('Error in internal subset at position ' + p.getIndex());
      }
    }
    return errorHandler.fatalError('doctype internal subset is not well-formed, missing ]');
  }
  /**
   * Called when the parser encounters an element starting with '<!'.
   * The third character decides what is expected: `-` a comment, `[` a CDATA section,
   * `D` a doctype declaration (in HTML the character is compared case-insensitively).
   *
   * @param {string} source
   * The xml.
   * @param {number} start
   * the start index of the '<!'
   * @param {DOMHandler} domBuilder
   * @param {DOMHandler} errorHandler
   * @param {boolean} isHTML
   * @returns {number | never}
   * The end index of the element.
   * @throws {ParseError}
   * In case the element is not well-formed.
   */
  function parseDoctypeCommentOrCData(source, start, domBuilder, errorHandler, isHTML) {
    var p = parseUtils(source, start);
    var marker = p.char(2);
    if (isHTML) {
      marker = marker.toUpperCase();
    }

    if (marker === '-') {
      // should be a comment
      var comment = p.getMatch(g.Comment);
      if (!comment) {
        return errorHandler.fatalError('comment is not well-formed at position ' + p.getIndex());
      }
      // only the part between the comment delimiters is the comment's data
      domBuilder.comment(comment, g.COMMENT_START.length, comment.length - g.COMMENT_START.length - g.COMMENT_END.length);
      return p.getIndex();
    }

    if (marker === '[') {
      // should be CDATA
      var cdata = p.getMatch(g.CDSect);
      if (!cdata) {
        return errorHandler.fatalError('Invalid CDATA starting at position ' + start);
      }
      if (!isHTML && !domBuilder.currentElement) {
        return errorHandler.fatalError('CDATA outside of element');
      }
      domBuilder.startCDATA();
      domBuilder.characters(cdata, g.CDATA_START.length, cdata.length - g.CDATA_START.length - g.CDATA_END.length);
      domBuilder.endCDATA();
      return p.getIndex();
    }

    if (marker !== 'D') {
      return errorHandler.fatalError('Not well-formed XML starting with "<!" at position ' + start);
    }

    // should be DOCTYPE
    if (domBuilder.doc && domBuilder.doc.documentElement) {
      return errorHandler.fatalError('Doctype not allowed inside or after documentElement at position ' + p.getIndex());
    }
    var hasDeclStart = isHTML
      ? p.substringStartsWithCaseInsensitive(g.DOCTYPE_DECL_START)
      : p.substringStartsWith(g.DOCTYPE_DECL_START);
    if (!hasDeclStart) {
      return errorHandler.fatalError('Expected ' + g.DOCTYPE_DECL_START + ' at position ' + p.getIndex());
    }
    p.skip(g.DOCTYPE_DECL_START.length);
    if (p.skipBlanks() < 1) {
      return errorHandler.fatalError('Expected whitespace after ' + g.DOCTYPE_DECL_START + ' at position ' + p.getIndex());
    }

    var publicId;
    var systemId;
    var internalSubset;

    // Parse the DOCTYPE name
    var name = p.getMatch(g.Name);
    if (!name) {
      return errorHandler.fatalError('doctype name missing or contains unexpected characters at position ' + p.getIndex());
    }
    if (isHTML && name.toLowerCase() !== 'html') {
      errorHandler.warning('Unexpected DOCTYPE in HTML document at position ' + p.getIndex());
    }
    p.skipBlanks();

    // Check for ExternalID
    if (p.substringStartsWith(g.PUBLIC) || p.substringStartsWith(g.SYSTEM)) {
      var match = g.ExternalID_match.exec(p.substringFromIndex());
      if (!match) {
        return errorHandler.fatalError('doctype external id is not well-formed at position ' + p.getIndex());
      }
      if (match.groups.SystemLiteralOnly !== undefined) {
        systemId = match.groups.SystemLiteralOnly;
      } else {
        systemId = match.groups.SystemLiteral;
        publicId = match.groups.PubidLiteral;
      }
      p.skip(match[0].length);
    } else if (isHTML && p.substringStartsWithCaseInsensitive(g.SYSTEM)) {
      // https://html.spec.whatwg.org/multipage/syntax.html#doctype-legacy-string
      p.skip(g.SYSTEM.length);
      if (p.skipBlanks() < 1) {
        return errorHandler.fatalError('Expected whitespace after ' + g.SYSTEM + ' at position ' + p.getIndex());
      }
      systemId = p.getMatch(g.ABOUT_LEGACY_COMPAT_SystemLiteral);
      if (!systemId) {
        return errorHandler.fatalError(
          'Expected ' + g.ABOUT_LEGACY_COMPAT + ' in single or double quotes after ' + g.SYSTEM + ' at position ' + p.getIndex()
        );
      }
    }
    if (isHTML && systemId && !g.ABOUT_LEGACY_COMPAT_SystemLiteral.test(systemId)) {
      errorHandler.warning('Unexpected doctype.systemId in HTML document at position ' + p.getIndex());
    }

    // the internal subset is only looked for outside of HTML
    if (!isHTML) {
      p.skipBlanks();
      internalSubset = parseDoctypeInternalSubset(p, errorHandler);
    }
    p.skipBlanks();
    if (p.char() !== '>') {
      return errorHandler.fatalError('doctype not terminated with > at position ' + p.getIndex());
    }
    p.skip(1);
    domBuilder.startDTD(name, publicId, systemId, internalSubset);
    domBuilder.endDTD();
    return p.getIndex();
  }
  /**
   * Parses the processing instruction starting at `start` and reports its target and data
   * to `domBuilder.processingInstruction`. An xml declaration is only accepted at index 0
   * and has to match the `XMLDecl` grammar.
   *
   * @param {string} source
   * @param {number} start
   * the index of the `<?`
   * @param {DOMHandler} domBuilder
   * @param {DOMHandler} errorHandler
   * @returns {number}
   * The index directly behind the processing instruction.
   */
  function parseProcessingInstruction(source, start, domBuilder, errorHandler) {
    var rest = source.substring(start);
    var match = rest.match(g.PI);
    if (!match) {
      return errorHandler.fatalError('Invalid processing instruction starting at position ' + start);
    }
    var target = match[1];
    if (target.toLowerCase() === 'xml') {
      if (start > 0) {
        return errorHandler.fatalError(
          'processing instruction at position ' + start + ' is an xml declaration which is only at the start of the document'
        );
      }
      if (!g.XMLDecl.test(rest)) {
        return errorHandler.fatalError('xml declaration is not well-formed');
      }
    }
    domBuilder.processingInstruction(target, match[2]);
    return start + match[0].length;
  }
  /**
   * Array-like collection of the attributes of one start tag.
   * Each attribute is stored under its index as `{ qName, value, offset }`;
   * `attributeNames` maps every qualified name to that index.
   */
  function ElementAttributes() {
    // prototype-less, so attribute names can never collide with Object.prototype members
    this.attributeNames = Object.create(null);
  }
  ElementAttributes.prototype = {
    /**
     * @param {string} tagName
     * @throws {Error} if `tagName` is not a qualified name
     */
    setTagName: function (tagName) {
      if (g.QName_exact.test(tagName)) {
        this.tagName = tagName;
        return;
      }
      throw new Error('invalid tagName:' + tagName);
    },
    /**
     * Appends an attribute.
     *
     * @param {string} qName
     * @param {string} value
     * @param {number} offset
     * @throws {Error} if `qName` is not a qualified name
     */
    addValue: function (qName, value, offset) {
      if (!g.QName_exact.test(qName)) {
        throw new Error('invalid attribute:' + qName);
      }
      var index = this.length;
      this.attributeNames[qName] = index;
      this[index] = { qName: qName, value: value, offset: offset };
      this.length = index + 1;
    },
    // shared initial value; the first addValue creates an own `length` on the instance
    length: 0,
    getLocalName: function (i) {
      var attribute = this[i];
      return attribute.localName;
    },
    getLocator: function (i) {
      var attribute = this[i];
      return attribute.locator;
    },
    getQName: function (i) {
      var attribute = this[i];
      return attribute.qName;
    },
    getURI: function (i) {
      var attribute = this[i];
      return attribute.uri;
    },
    getValue: function (i) {
      var attribute = this[i];
      return attribute.value;
    },
  };
  886. exports.XMLReader = XMLReader;
  887. exports.parseUtils = parseUtils;
  888. exports.parseDoctypeCommentOrCData = parseDoctypeCommentOrCData;