dom-parser.js 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586
  1. 'use strict';
  2. var conventions = require('./conventions');
  3. var dom = require('./dom');
  4. var errors = require('./errors');
  5. var entities = require('./entities');
  6. var sax = require('./sax');
  7. var DOMImplementation = dom.DOMImplementation;
  8. var hasDefaultHTMLNamespace = conventions.hasDefaultHTMLNamespace;
  9. var isHTMLMimeType = conventions.isHTMLMimeType;
  10. var isValidMimeType = conventions.isValidMimeType;
  11. var MIME_TYPE = conventions.MIME_TYPE;
  12. var NAMESPACE = conventions.NAMESPACE;
  13. var ParseError = errors.ParseError;
  14. var XMLReader = sax.XMLReader;
  /**
   * Replaces every line break in `input` with a single LINE FEED (#xA), following
   * <https://www.w3.org/TR/xml11/#sec-line-ends> (which also covers some Unicode "newline"
   * characters):
   *
   * > To simplify the tasks of applications, the XML processor must behave
   * > as if it normalized all line breaks in external parsed entities (including the document entity)
   * > on input, before parsing, by translating the following to a single #xA character:
   * >
   * > 1. the two-character sequence #xD #xA,
   * > 2. the two-character sequence #xD #x85,
   * > 3. the single character #x85,
   * > 4. the single character #x2028,
   * > 5. the single character #x2029,
   * > 6. any #xD character that is not immediately followed by #xA or #x85.
   *
   * @param {string} input
   * The text to normalize.
   * @returns {string}
   * A copy of `input` in which each of the sequences listed above is replaced by `\n`.
   * @prettierignore
   */
  function normalizeLineEndings(input) {
    // Rules 1 and 2: collapse the two-character sequences first, so that their
    // second character is not turned into an additional line feed afterwards.
    var withoutPairs = input.replace(/\r[\n\u0085]/g, '\n');
    // Rules 3 to 6: every remaining single line-break character becomes a line feed.
    var normalized = withoutPairs.replace(/[\r\u0085\u2028\u2029]/g, '\n');
    return normalized;
  }
  42. /**
  43. * @typedef Locator
  44. * @property {number} [columnNumber]
  45. * @property {number} [lineNumber]
  46. */
  47. /**
  48. * @typedef DOMParserOptions
  49. * @property {typeof assign} [assign]
  50. * The method to use instead of `conventions.assign`, which is used to copy values from
  51. * `options` before they are used for parsing.
  52. * @property {typeof DOMHandler} [domHandler]
  53. * For internal testing: The class for creating an instance for handling events from the SAX
  54. * parser.
  55. * *****Warning: By configuring a faulty implementation, the specified behavior can completely
  56. * be broken.*****.
  57. * @property {Function} [errorHandler]
  58. * DEPRECATED! use `onError` instead.
  59. * @property {function(level:ErrorLevel, message:string, context: DOMHandler):void}
  60. * [onError]
  61. * A function invoked for every error that occurs during parsing.
  62. *
  63. * If it is not provided, all errors are reported to `console.error`
  64. * and only `fatalError`s are thrown as a `ParseError`,
  65. * which prevents any further processing.
  66. * If the provided method throws, a `ParseError` is thrown,
  67. * which prevents any further processing.
  68. *
  69. * Be aware that many `warning`s are considered an error that prevents further processing in
  70. * most implementations.
  71. * @property {boolean} [locator=true]
  72. * Configures if the nodes created during parsing will have a `lineNumber` and a `columnNumber`
  73. * attribute describing their location in the XML string.
  74. * Default is true.
  75. * @property {(string) => string} [normalizeLineEndings]
  76. * used to replace line endings before parsing, defaults to exported `normalizeLineEndings`,
  77. * which normalizes line endings according to <https://www.w3.org/TR/xml11/#sec-line-ends>,
  78. * including some Unicode "newline" characters.
  79. * @property {Object} [xmlns]
  80. * The XML namespaces that should be assumed when parsing.
  81. * The default namespace can be provided by the key that is the empty string.
  82. * When the `mimeType` for HTML, XHTML or SVG are passed to `parseFromString`,
  83. * the default namespace that will be used,
  84. * will be overridden according to the specification.
  85. * @see {@link normalizeLineEndings}
  86. */
  /**
   * The DOMParser interface provides the ability to parse XML or HTML source code from a string
   * into a DOM `Document`.
   *
   * ***xmldom is different from the spec in that it allows an `options` parameter,
   * to control the behavior***.
   *
   * The passed `options` object is only read, never modified.
   *
   * @class
   * @param {DOMParserOptions} [options]
   * @throws {TypeError}
   * If `options.errorHandler` is set to something that is not a function.
   * @see https://developer.mozilla.org/en-US/docs/Web/API/DOMParser
   * @see https://html.spec.whatwg.org/multipage/dynamic-markup-insertion.html#dom-parsing-and-serialization
   */
  function DOMParser(options) {
    options = options || {};

    /**
     * The method to use instead of `conventions.assign`, which is used to copy values from
     * `options` before they are used for parsing.
     *
     * @type {conventions.assign}
     * @private
     * @see {@link conventions.assign}
     * @readonly
     */
    this.assign = options.assign || conventions.assign;

    /**
     * For internal testing: The class for creating an instance for handling events from the SAX
     * parser.
     * *****Warning: By configuring a faulty implementation, the specified behavior can completely
     * be broken*****.
     *
     * @type {typeof DOMHandler}
     * @private
     * @readonly
     */
    this.domHandler = options.domHandler || DOMHandler;

    /**
     * A function that is invoked for every error that occurs during parsing.
     *
     * If it is not provided, all errors are reported to `console.error`
     * and only `fatalError`s are thrown as a `ParseError`,
     * which prevents any further processing.
     * If the provided method throws, a `ParseError` is thrown,
     * which prevents any further processing.
     *
     * Be aware that many `warning`s are considered an error that prevents further processing in
     * most implementations.
     *
     * @type {function(level:ErrorLevel, message:string, context: DOMHandler):void}
     * @see {@link onErrorStopParsing}
     * @see {@link onWarningStopParsing}
     */
    this.onError = options.onError || options.errorHandler;
    if (options.errorHandler && typeof options.errorHandler !== 'function') {
      throw new TypeError('errorHandler object is no longer supported, switch to onError!');
    } else if (options.errorHandler) {
      // The deprecation notice is delivered through the deprecated handler itself.
      options.errorHandler('warning', 'The `errorHandler` option has been deprecated, use `onError` instead!', this);
    }

    /**
     * used to replace line endings before parsing, defaults to `normalizeLineEndings`
     *
     * @type {(string) => string}
     * @readonly
     */
    this.normalizeLineEndings = options.normalizeLineEndings || normalizeLineEndings;

    /**
     * Configures if the nodes created during parsing will have a `lineNumber` and a
     * `columnNumber` attribute describing their location in the XML string.
     * Default is true.
     *
     * @type {boolean}
     * @readonly
     */
    // The default is resolved locally instead of being written back to `options`,
    // so a caller-owned (possibly frozen or shared) options object stays untouched.
    this.locator = options.locator === undefined ? true : !!options.locator;

    /**
     * The default namespace can be provided by the key that is the empty string.
     * When the `mimeType` for HTML, XHTML or SVG are passed to `parseFromString`,
     * the default namespace that will be used,
     * will be overridden according to the specification.
     *
     * @type {Readonly<Object>}
     * @readonly
     */
    this.xmlns = this.assign(Object.create(null), options.xmlns);
  }
  /**
   * Parses `source` using the options in the way configured by the `DOMParserOptions` of `this`
   * `DOMParser`. If `mimeType` is `text/html` an HTML `Document` is created,
   * otherwise an XML `Document` is created.
   *
   * __It behaves differently from the description in the living standard__:
   * - Uses the `options` passed to the `DOMParser` constructor to modify the behavior.
   * - Any unexpected input is reported to `onError` with either a `warning`,
   * `error` or `fatalError` level.
   * - Any `fatalError` throws a `ParseError` which prevents further processing.
   * - Any error thrown by `onError` is converted to a `ParseError` which prevents further
   * processing.
   * - If no `Document` was created during parsing, or the document has no root element,
   * it is reported as a `fatalError`.
   *
   * *****Warning: By configuring a faulty DOMHandler implementation,
   * the specified behavior can completely be broken*****.
   *
   * @param {string} source
   * The XML mime type only allows string input!
   * @param {string} [mimeType='application/xml']
   * the mimeType or contentType of the document to be created determines the `type` of document
   * created (XML or HTML)
   * @returns {Document}
   * The `Document` node.
   * @throws {ParseError}
   * for any `fatalError` or anything that is thrown by `onError`
   * @throws {TypeError}
   * for any invalid `mimeType`
   * @see https://developer.mozilla.org/en-US/docs/Web/API/DOMParser/parseFromString
   * @see https://html.spec.whatwg.org/#dom-domparser-parsefromstring-dev
   */
  DOMParser.prototype.parseFromString = function (source, mimeType) {
    if (!isValidMimeType(mimeType)) {
      throw new TypeError('DOMParser.parseFromString: the provided mimeType "' + mimeType + '" is not valid.');
    }

    // Work on a copy so that the namespaces configured on the parser are not modified.
    var defaultNSMap = this.assign(Object.create(null), this.xmlns);
    var entityMap = entities.XML_ENTITIES;
    var defaultNamespace = defaultNSMap[''] || null;
    if (hasDefaultHTMLNamespace(mimeType)) {
      entityMap = entities.HTML_ENTITIES;
      defaultNamespace = NAMESPACE.HTML;
    } else if (mimeType === MIME_TYPE.XML_SVG_IMAGE) {
      defaultNamespace = NAMESPACE.SVG;
    }
    defaultNSMap[''] = defaultNamespace;
    defaultNSMap.xml = defaultNSMap.xml || NAMESPACE.XML;

    var domBuilder = new this.domHandler({
      mimeType: mimeType,
      defaultNamespace: defaultNamespace,
      onError: this.onError,
    });
    if (this.locator) {
      // The handler keeps this object; see `setDocumentLocator`.
      domBuilder.setDocumentLocator({});
    }

    // Named `reader` so that the module-level `sax` import is not shadowed.
    var reader = new XMLReader();
    reader.errorHandler = domBuilder;
    reader.domBuilder = domBuilder;

    var isXml = !isHTMLMimeType(mimeType);
    if (isXml && typeof source !== 'string') {
      reader.errorHandler.fatalError('source is not a string');
    }
    reader.parse(this.normalizeLineEndings(String(source)), defaultNSMap, entityMap);

    // A handler that never created a document must end up in `fatalError` as well,
    // instead of failing with a TypeError on `undefined.documentElement`.
    if (!domBuilder.doc || !domBuilder.doc.documentElement) {
      reader.errorHandler.fatalError('missing root element');
    }
    return domBuilder.doc;
  };
  242. /**
  243. * @typedef DOMHandlerOptions
  244. * @property {string} [mimeType=MIME_TYPE.XML_APPLICATION]
  245. * @property {string | null} [defaultNamespace=null]
  246. */
  /**
   * Receives the events emitted by the SAX parser and builds the corresponding DOM nodes.
   *
   * Some methods are only implemented as an empty function,
   * since they are (at least currently) not relevant for xmldom.
   *
   * @class
   * @param {DOMHandlerOptions} [options]
   * When omitted, an empty options object is assumed.
   * @see http://www.saxproject.org/apidoc/org/xml/sax/ext/DefaultHandler2.html
   */
  function DOMHandler(options) {
    var settings = options || {};

    /**
     * Decides whether `startDocument` creates an XML or an HTML document.
     * Falls back to `MIME_TYPE.XML_APPLICATION` when no mime type is configured.
     *
     * @type {string}
     * @see {@link MIME_TYPE}
     * @readonly
     */
    this.mimeType = settings.mimeType || MIME_TYPE.XML_APPLICATION;

    /**
     * The namespace passed to `DOMImplementation.createDocument` when an XML document is created.
     * It has to be provided up front for these reasons:
     * - The SAX API for `startDocument` doesn't offer any way to pass a namespace,
     * since at that point there is no way for the parser to know what the default namespace from
     * the document will be.
     * - When creating using `DOMImplementation.createDocument` it is required to pass a
     * namespace, to determine the correct `Document.contentType`, which should match
     * `this.mimeType`.
     * - When parsing an XML document with the `application/xhtml+xml` mimeType,
     * the HTML namespace needs to be the default namespace.
     *
     * @type {string | null}
     * @private
     * @readonly
     */
    this.defaultNamespace = settings.defaultNamespace || null;

    /**
     * `true` between `startCDATA` and `endCDATA`; read by `characters`.
     *
     * @type {boolean}
     * @private
     */
    this.cdata = false;

    /**
     * The last `Element` that was created by `startElement`.
     * `endElement` sets it to the `currentElement.parentNode`.
     *
     * Note: The sax parser currently sets it to white space text nodes between tags.
     *
     * @type {Element | Node | undefined}
     * @private
     */
    this.currentElement = undefined;

    /**
     * The Document that is created as part of `startDocument`,
     * and returned by `DOMParser.parseFromString`.
     *
     * @type {Document | undefined}
     * @readonly
     */
    this.doc = undefined;

    /**
     * Stored by `setDocumentLocator`.
     * It is controlled and mutated by the SAX parser to store the current parsing position,
     * and used by DOMHandler to set `columnNumber` and `lineNumber` on the DOM nodes.
     *
     * @type {Readonly<Locator> | undefined}
     * @private
     * @readonly (the sax parser currently sometimes sets it)
     */
    this.locator = undefined;

    /**
     * The callback that `reportError` forwards every reported problem to, if it is a function.
     *
     * @type {function (level:ErrorLevel ,message:string, context:DOMHandler):void}
     * @readonly
     */
    this.onError = settings.onError;
  }
  /**
   * Copies the current parsing position from `locator` onto `node`.
   *
   * @param {Locator} locator
   * The object to read `lineNumber` and `columnNumber` from.
   * @param {Node} node
   * The node that receives both values as own properties.
   */
  function position(locator, node) {
    var line = locator.lineNumber;
    var column = locator.columnNumber;
    node.lineNumber = line;
    node.columnNumber = column;
  }
  DOMHandler.prototype = {
    /**
     * Creates the document and stores it under `this.doc`:
     * an HTML document if `this.mimeType` is an HTML mime type,
     * otherwise an XML document created with `this.defaultNamespace` and an empty qualified name.
     *
     * @see http://www.saxproject.org/apidoc/org/xml/sax/ContentHandler.html
     */
    startDocument: function () {
      var impl = new DOMImplementation();
      if (isHTMLMimeType(this.mimeType)) {
        this.doc = impl.createHTMLDocument(false);
      } else {
        this.doc = impl.createDocument(this.defaultNamespace, '');
      }
    },
    /**
     * Creates an element (named `qName`, or `localName` if `qName` is empty),
     * appends it to the current element or the document, makes it the current element
     * and attaches one attribute node per entry of `attrs`.
     */
    startElement: function (namespaceURI, localName, qName, attrs) {
      var doc = this.doc;
      var element = doc.createElementNS(namespaceURI, qName || localName);
      var attrCount = attrs.length;
      appendElement(this, element);
      this.currentElement = element;
      if (this.locator) {
        position(this.locator, element);
      }
      for (var index = 0; index < attrCount; index++) {
        var attrURI = attrs.getURI(index);
        var attrValue = attrs.getValue(index);
        var attrQName = attrs.getQName(index);
        var attr = doc.createAttributeNS(attrURI, attrQName);
        if (this.locator) {
          // Attributes carry their own position, provided by `attrs`.
          position(attrs.getLocator(index), attr);
        }
        attr.nodeValue = attrValue;
        attr.value = attrValue;
        element.setAttributeNode(attr);
      }
    },
    /**
     * Moves `currentElement` up to its parent node.
     */
    endElement: function (namespaceURI, localName, qName) {
      this.currentElement = this.currentElement.parentNode;
    },
    startPrefixMapping: function (prefix, uri) {},
    endPrefixMapping: function (prefix) {},
    /**
     * Creates a processing instruction and appends it to the current element or the document.
     */
    processingInstruction: function (target, data) {
      var instruction = this.doc.createProcessingInstruction(target, data);
      if (this.locator) {
        position(this.locator, instruction);
      }
      appendElement(this, instruction);
    },
    ignorableWhitespace: function (ch, start, length) {},
    /**
     * Creates a CDATA section (between `startCDATA` and `endCDATA`) or a text node
     * for non-empty character data.
     * Inside an element the node is appended to it; outside of any element
     * it is only appended to the document if it consists of whitespace.
     */
    characters: function (chars, start, length) {
      var text = _toString.apply(this, arguments);
      if (!text) {
        return;
      }
      var charNode = this.cdata ? this.doc.createCDATASection(text) : this.doc.createTextNode(text);
      if (this.currentElement) {
        this.currentElement.appendChild(charNode);
      } else if (/^\s*$/.test(text)) {
        this.doc.appendChild(charNode);
      }
      if (this.locator) {
        position(this.locator, charNode);
      }
    },
    skippedEntity: function (name) {},
    /**
     * Normalizes the created document.
     */
    endDocument: function () {
      this.doc.normalize();
    },
    /**
     * Stores the locator to be able to set the `columnNumber` and `lineNumber`
     * on the created DOM nodes. A provided locator gets its `lineNumber` reset to `0`.
     *
     * @param {Locator} locator
     */
    setDocumentLocator: function (locator) {
      if (locator) {
        locator.lineNumber = 0;
      }
      this.locator = locator;
    },
    // LexicalHandler
    /**
     * Creates a comment node and appends it to the current element or the document.
     */
    comment: function (chars, start, length) {
      var text = _toString.apply(this, arguments);
      var commentNode = this.doc.createComment(text);
      if (this.locator) {
        position(this.locator, commentNode);
      }
      appendElement(this, commentNode);
    },
    /**
     * Marks the start of a CDATA section; evaluated by `characters`.
     */
    startCDATA: function () {
      this.cdata = true;
    },
    /**
     * Marks the end of a CDATA section.
     */
    endCDATA: function () {
      this.cdata = false;
    },
    /**
     * Creates the document type, appends it and stores it as `doc.doctype`,
     * provided the document's implementation offers `createDocumentType`.
     */
    startDTD: function (name, publicId, systemId, internalSubset) {
      var impl = this.doc.implementation;
      if (!impl || !impl.createDocumentType) {
        return;
      }
      var doctype = impl.createDocumentType(name, publicId, systemId, internalSubset);
      if (this.locator) {
        position(this.locator, doctype);
      }
      appendElement(this, doctype);
      this.doc.doctype = doctype;
    },
    /**
     * Forwards a problem to `this.onError` if that is a function,
     * otherwise writes it to `console.error`.
     *
     * @param {string} level
     * @param {string} message
     * @throws {ParseError}
     * If `this.onError` throws.
     */
    reportError: function (level, message) {
      if (typeof this.onError !== 'function') {
        console.error('[xmldom ' + level + ']\t' + message, _locator(this.locator));
        return;
      }
      try {
        this.onError(level, message, this);
      } catch (e) {
        throw new ParseError('Reporting ' + level + ' "' + message + '" caused ' + e, this.locator);
      }
    },
    /**
     * @see http://www.saxproject.org/apidoc/org/xml/sax/ErrorHandler.html
     */
    warning: function (message) {
      this.reportError('warning', message);
    },
    error: function (message) {
      this.reportError('error', message);
    },
    /**
     * Reports a fatal error and then throws a ParseError.
     *
     * @param {string} message
     * - The message to be used for reporting and throwing the error.
     * @returns {never}
     * This function always throws an error and never returns a value.
     * @throws {ParseError}
     * Always throws a ParseError with the provided message.
     */
    fatalError: function (message) {
      this.reportError('fatalError', message);
      throw new ParseError(message, this.locator);
    },
  };
  /**
   * Formats a locator for the message written to `console.error`.
   *
   * @param {Locator} [l]
   * @returns {string | undefined}
   * The formatted line and column, or `undefined` if no locator was passed.
   */
  function _locator(l) {
    if (!l) {
      return undefined;
    }
    var line = l.lineNumber;
    var column = l.columnNumber;
    return '\n@#[line:' + line + ',col:' + column + ']';
  }
  /**
   * Extracts the text described by `start` and `length` from `chars`.
   *
   * @param {string | Object} chars
   * A string, or a non-string character buffer (the Java SAX bridge on Rhino).
   * @param {number} [start]
   * @param {number} [length]
   * @returns {string | Object}
   * For a string the result of `chars.substr(start, length)`; for other values either a
   * string created by `java.lang.String`, or `chars` itself.
   */
  function _toString(chars, start, length) {
    if (typeof chars === 'string') {
      return chars.substr(start, length);
    }
    // Java SAX connected with xmldom on Rhino
    // (what about: "? && !(chars instanceof String)")
    var needsConversion = chars.length >= start + length || start;
    if (needsConversion) {
      return new java.lang.String(chars, start, length) + '';
    }
    return chars;
  }
  481. /*
  482. * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/LexicalHandler.html
  483. * used method of org.xml.sax.ext.LexicalHandler:
  484. * #comment(chars, start, length)
  485. * #startCDATA()
  486. * #endCDATA()
  487. * #startDTD(name, publicId, systemId)
  488. *
  489. *
  490. * IGNORED method of org.xml.sax.ext.LexicalHandler:
  491. * #endDTD()
  492. * #startEntity(name)
  493. * #endEntity(name)
  494. *
  495. *
  496. * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/DeclHandler.html
  497. * IGNORED method of org.xml.sax.ext.DeclHandler
  498. * #attributeDecl(eName, aName, type, mode, value)
  499. * #elementDecl(name, model)
  500. * #externalEntityDecl(name, publicId, systemId)
  501. * #internalEntityDecl(name, value)
  502. * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/EntityResolver2.html
  503. * IGNORED method of org.xml.sax.EntityResolver2
  504. * #resolveEntity(String name,String publicId,String baseURI,String systemId)
  505. * #resolveEntity(publicId, systemId)
  506. * #getExternalSubset(name, baseURI)
  507. * @link http://www.saxproject.org/apidoc/org/xml/sax/DTDHandler.html
  508. * IGNORED method of org.xml.sax.DTDHandler
  509. * #notationDecl(name, publicId, systemId) {};
  510. * #unparsedEntityDecl(name, publicId, systemId, notationName) {};
  511. */
  // Every handler method listed here is ignored: it is added to `DOMHandler.prototype`
  // as a function that does nothing but return `null`.
  'endDTD,startEntity,endEntity,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,resolveEntity,getExternalSubset,notationDecl,unparsedEntityDecl'
    .split(',')
    .forEach(function (methodName) {
      DOMHandler.prototype[methodName] = function () {
        return null;
      };
    });
  520. /* Private static helpers treated below as private instance methods, so don't need to add these to the public API; we might use a Relator to also get rid of non-standard public properties */
  /**
   * Appends `node` to the handler's current element,
   * or to its document if there is no current element.
   *
   * @param {DOMHandler} handler
   * @param {Node} node
   */
  function appendElement(handler, node) {
    var parent = handler.currentElement ? handler.currentElement : handler.doc;
    parent.appendChild(node);
  }
  /**
   * An `onError` implementation that stops parsing as soon as something
   * is reported with the level `error`; every other level is ignored.
   *
   * @param {string} level
   * The level of the reported problem.
   * @throws {string}
   * The string `'onErrorStopParsing'` if `level` is `'error'`.
   * @see {@link DOMParserOptions.onError}
   * @see {@link onWarningStopParsing}
   */
  function onErrorStopParsing(level) {
    if (level !== 'error') {
      return;
    }
    throw 'onErrorStopParsing';
  }
  /**
   * An `onError` implementation that stops parsing as soon as anything is reported,
   * regardless of its level.
   *
   * @throws {string}
   * Always throws the string `'onWarningStopParsing'`.
   * @see {@link DOMParserOptions.onError}
   * @see {@link onErrorStopParsing}
   */
  function onWarningStopParsing() {
    var reason = 'onWarningStopParsing';
    throw reason;
  }
  // The handler class is exported under a double-underscore name
  // (presumably for internal use and testing only - confirm before relying on it).
  exports.__DOMHandler = DOMHandler;

  // Parser and the default line ending normalization.
  exports.DOMParser = DOMParser;
  exports.normalizeLineEndings = normalizeLineEndings;

  // Ready-made `onError` callbacks.
  exports.onErrorStopParsing = onErrorStopParsing;
  exports.onWarningStopParsing = onWarningStopParsing;