'use strict'; var conventions = require('./conventions'); var g = require('./grammar'); var errors = require('./errors'); var isHTMLEscapableRawTextElement = conventions.isHTMLEscapableRawTextElement; var isHTMLMimeType = conventions.isHTMLMimeType; var isHTMLRawTextElement = conventions.isHTMLRawTextElement; var hasOwn = conventions.hasOwn; var NAMESPACE = conventions.NAMESPACE; var ParseError = errors.ParseError; var DOMException = errors.DOMException; //var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',') //S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE //S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE var S_TAG = 0; //tag name offerring var S_ATTR = 1; //attr name offerring var S_ATTR_SPACE = 2; //attr name end and space offer var S_EQ = 3; //=space? var S_ATTR_NOQUOT_VALUE = 4; //attr value(no quot value only) var S_ATTR_END = 5; //attr value end and no space(quot end) var S_TAG_SPACE = 6; //(attr value end || tag end ) && (space offer) var S_TAG_CLOSE = 7; //closed el function XMLReader() {} XMLReader.prototype = { parse: function (source, defaultNSMap, entityMap) { var domBuilder = this.domBuilder; domBuilder.startDocument(); _copy(defaultNSMap, (defaultNSMap = Object.create(null))); parse(source, defaultNSMap, entityMap, domBuilder, this.errorHandler); domBuilder.endDocument(); }, }; /** * Detecting everything that might be a reference, * including those without ending `;`, since those are allowed in HTML. * The entityReplacer takes care of verifying and transforming each occurrence, * and reports to the errorHandler on those that are not OK, * depending on the context. 
*/
// Pattern: `&`, an optional `#`, one or more `\w` characters and an optional `;`.
var ENTITY_REG = /&#?\w+;?/g;

/**
 * Main scanning loop: walks `source` from one `<` to the next, reports text
 * and markup to `domBuilder`, and reports problems to `errorHandler`.
 *
 * NOTE(review): only the first part of this function is intact in this copy
 * of the file; see the note in front of `case '?'` below.
 *
 * @param {string} source
 * @param {Object} defaultNSMapCopy
 * Becomes `currentNSMap` of the bottom entry of the parse stack.
 * @param {Object} entityMap
 * Own properties are looked up by entity name (the text between `&` and `;`).
 * @param domBuilder
 * Receives the parse events; `mimeType`, `locator`, `doc` and `currentElement` are read from it.
 * @param errorHandler
 * Receives `warning`, `error` and `fatalError` calls.
 */
function parse(source, defaultNSMapCopy, entityMap, domBuilder, errorHandler) {
	var isHTML = isHTMLMimeType(domBuilder.mimeType);
	if (source.indexOf(g.UNICODE_REPLACEMENT_CHARACTER) >= 0) {
		errorHandler.warning('Unicode replacement character detected, source encoding issues?');
	}

	// Converts a code point to a string, emitting a surrogate pair above 0xffff.
	function fixedFromCharCode(code) {
		// String.fromCharCode does not support
		// code points above 0xffff directly
		if (code > 0xffff) {
			code -= 0x10000;
			var surrogate1 = 0xd800 + (code >> 10),
				surrogate2 = 0xdc00 + (code & 0x3ff);

			return String.fromCharCode(surrogate1, surrogate2);
		} else {
			return String.fromCharCode(code);
		}
	}

	// Replacement callback for ENTITY_REG: returns the replacement text for the
	// candidate `a`, or `a` itself (after reporting an error) when it is rejected.
	function entityReplacer(a) {
		var complete = a[a.length - 1] === ';' ? a : a + ';';
		// a missing `;` is only tolerated in HTML
		if (!isHTML && complete !== a) {
			errorHandler.error('EntityRef: expecting ;');
			return a;
		}
		var match = g.Reference.exec(complete);
		// the whole candidate has to match, not just a part of it
		if (!match || match[0].length !== complete.length) {
			errorHandler.error('entity not matching Reference production: ' + a);
			return a;
		}
		// name without the leading `&` and the trailing `;`
		var k = complete.slice(1, -1);
		if (hasOwn(entityMap, k)) {
			return entityMap[k];
		} else if (k.charAt(0) === '#') {
			// numeric reference: an `x` is turned into `0x` so that parseInt
			// (called without a radix) reads the digits as hexadecimal
			return fixedFromCharCode(parseInt(k.substring(1).replace('x', '0x')));
		} else {
			errorHandler.error('entity not found:' + a);
			return a;
		}
	}

	// Reports the text between `start` and `end` (entity references replaced)
	// and moves `start` to `end`. Does nothing when `end <= start`.
	function appendText(end) {
		//has some bugs
		if (end > start) {
			var xt = source.substring(start, end).replace(ENTITY_REG, entityReplacer);
			locator && position(start);
			// the length passed is that of the raw slice, not of the replaced text `xt`
			domBuilder.characters(xt, 0, end - start);
			start = end;
		}
	}

	var lineStart = 0;
	var lineEnd = 0;
	// global flag: `lastIndex` carries over between the `exec` calls in `position`
	var linePattern = /\r\n?|\n|$/g;
	var locator = domBuilder.locator;

	// Advances `locator` to source index `p`: consumes line endings until the
	// line containing `p` is reached, then sets the 1-based column.
	// `m` is only used as a local variable.
	function position(p, m) {
		while (p >= lineEnd && (m = linePattern.exec(source))) {
			lineStart = lineEnd;
			lineEnd = m.index + m[0].length;
			locator.lineNumber++;
		}
		locator.columnNumber = p - lineStart + 1;
	}

	var parseStack = [{ currentNSMap: defaultNSMapCopy }];
	var unclosedTags = [];
	var start = 0;
	while (true) {
		try {
			var tagStart = source.indexOf('<', start);
			if (tagStart < 0) {
				// no more markup: finish up
				if (!isHTML && unclosedTags.length > 0) {
					return errorHandler.fatalError('unclosed xml tag(s): ' + unclosedTags.join(', '));
				}
				// anything but whitespace after the last markup
				if (!source.substring(start).match(/^\s*$/)) {
					var doc = domBuilder.doc;
					var text = doc.createTextNode(source.substring(start));
					if (doc.documentElement) {
						return errorHandler.error('Extra content at the end of the document');
					}
					doc.appendChild(text);
					domBuilder.currentElement = text;
				}
				return;
			}
			if (tagStart > start) {
				// text in front of the next `<`
				var fromSource = source.substring(start, tagStart);
				if (!isHTML && unclosedTags.length === 0) {
					// outside of any open element only `g.S_OPT` matches may appear in XML
					fromSource = fromSource.replace(new RegExp(g.S_OPT.source, 'g'), '');
					fromSource && errorHandler.error("Unexpected content outside root element: '" + fromSource + "'");
				}
				appendText(tagStart);
			}
			switch (source.charAt(tagStart + 1)) {
				case '/':
					// end tag
					var end = source.indexOf('>', tagStart + 2);
					// without a closing `>` the rest of the source is taken as the raw name
					var tagNameRaw = source.substring(tagStart + 2, end > 0 ? end : undefined);
					if (!tagNameRaw) {
						return errorHandler.fatalError('end tag name missing');
					}
					var tagNameMatch = end > 0 && g.reg('^', g.QName_group, g.S_OPT, '$').exec(tagNameRaw);
					if (!tagNameMatch) {
						return errorHandler.fatalError('end tag name contains invalid characters: "' + tagNameRaw + '"');
					}
					if (!domBuilder.currentElement && !domBuilder.doc.documentElement) {
						// not enough information to provide a helpful error message,
						// but parsing will throw since there is no root element
						return;
					}
					var currentTagName =
						unclosedTags[unclosedTags.length - 1] ||
						domBuilder.currentElement.tagName ||
						domBuilder.doc.documentElement.tagName ||
						'';
					if (currentTagName !== tagNameMatch[1]) {
						// HTML accepts an end tag that only differs in case
						var tagNameLower = tagNameMatch[1].toLowerCase();
						if (!isHTML || currentTagName.toLowerCase() !== tagNameLower) {
							return errorHandler.fatalError('Opening and ending tag mismatch: "' + currentTagName + '" != "' + tagNameRaw + '"');
						}
					}
					var config = parseStack.pop();
					unclosedTags.pop();
					var localNSMap = config.localNSMap;
					domBuilder.endElement(config.uri, config.localName, currentTagName);
					if (localNSMap) {
						for (var prefix in localNSMap) {
							if (hasOwn(localNSMap, prefix)) {
								domBuilder.endPrefixMapping(prefix);
							}
						}
					}

					end++;
					break;
				// end element
				// NOTE(review): from here to the end of this function the text is damaged
				// in this copy of the file and is reproduced exactly as found, unformatted,
				// on the next line. After `case '!': //` it continues with `start) {`, so
				// the body of that case, any `default` case, the end of the `switch` and
				// the handler belonging to the `try` above are not present. Where the
				// comment after `case '?':` ends cannot be determined either.
				// Restore this part from a trusted copy of the file.
				case '?': // locator && position(tagStart); end = parseProcessingInstruction(source, tagStart, domBuilder, errorHandler); break; case '!': // start) { start = end; } else { //Possible sax fallback here, risk of positional error appendText(Math.max(tagStart, start) + 1); } } }

/**
 * Copies `lineNumber` and `columnNumber` from `f` to `t`.
 *
 * @returns `t`
 */
function copyLocator(f, t) {
	t.lineNumber = f.lineNumber;
	t.columnNumber = f.columnNumber;
	return t;
}

/**
 * Scans a start tag character by character and fills `el` with the tag name
 * and the attributes found.
 *
 * Scanning begins at `start + 1`. `currentNSMap` is not used by this function.
 * Conditions that the state machine cannot handle are thrown as plain `Error`s;
 * all other problems are reported to `errorHandler`.
 *
 * @returns
 * end of the elementStartPart(end of elementEndPart for selfClosed el):
 * the index of the terminating `>`, or the index at which the input ended.
 * In XML, when `>` directly follows a `=`, the result of
 * `errorHandler.fatalError` is returned instead.
 * @see {@link #appendElement}
 */
function parseElementStartPart(source, start, el, currentNSMap, entityReplacer, errorHandler, isHTML) {
	/**
	 * Adds one attribute to `el` after normalizing its value.
	 * A repeated name is a fatal error, and so is a `<` inside the value in XML.
	 *
	 * @param {string} qname
	 * @param {string} value
	 * @param {number} startIndex
	 */
	function addAttribute(qname, value, startIndex) {
		if (hasOwn(el.attributeNames, qname)) {
			return errorHandler.fatalError('Attribute ' + qname + ' redefined');
		}
		if (!isHTML && value.indexOf('<') >= 0) {
			return errorHandler.fatalError("Unescaped '<' not allowed in attributes values");
		}
		el.addValue(
			qname,
			// @see https://www.w3.org/TR/xml/#AVNormalize
			// since the xmldom sax parser does not "interpret" DTD the following is not implemented:
			// - recursive replacement of (DTD) entity references
			// - trimming and collapsing multiple spaces into a single one for attributes that are not of type CDATA
			value.replace(/[\t\n\r]/g, ' ').replace(ENTITY_REG, entityReplacer),
			startIndex
		);
	}

	var attrName;
	var value;
	// `p` is the scan position; `start` marks where the current name or value began
	var p = ++start;
	var s = S_TAG; //status
	while (true) {
		var c = source.charAt(p);
		switch (c) {
			case '=':
				if (s === S_ATTR) {
					//attrName
					attrName = source.slice(start, p);
					s = S_EQ;
				} else if (s === S_ATTR_SPACE) {
					s = S_EQ;
				} else {
					//fatalError: equal must after attrName or space after attrName
					throw new Error('attribute equal must after attrName'); // No known test case
				}
				break;
			case "'":
			case '"':
				if (
					s === S_EQ ||
					s === S_ATTR //|| s == S_ATTR_SPACE
				) {
					//equal
					if (s === S_ATTR) {
						// a quote directly after the name, without `=`
						errorHandler.warning('attribute value must after "="');
						attrName = source.slice(start, p);
					}
					// jump to the matching closing quote
					start = p + 1;
					p = source.indexOf(c, start);
					if (p > 0) {
						value = source.slice(start, p);
						addAttribute(attrName, value, start - 1);
						s = S_ATTR_END;
					} else {
						//fatalError: no end quot match
						throw new Error("attribute value no end '" + c + "' match");
					}
				} else if (s == S_ATTR_NOQUOT_VALUE) {
					// a quote terminates a value that was started without one
					value = source.slice(start, p);
					addAttribute(attrName, value, start);
					errorHandler.warning('attribute "' + attrName + '" missed start quot(' + c + ')!!');
					start = p + 1;
					s = S_ATTR_END;
				} else {
					//fatalError: no equal before
					throw new Error('attribute value must after "="'); // No known test case
				}
				break;
			case '/':
				// the cases below fall through on purpose
				switch (s) {
					case S_TAG:
						el.setTagName(source.slice(start, p));
					case S_ATTR_END:
					case S_TAG_SPACE:
					case S_TAG_CLOSE:
						s = S_TAG_CLOSE;
						el.closed = true;
					case S_ATTR_NOQUOT_VALUE:
					case S_ATTR:
						// inside a name or an unquoted value the `/` is left for `case '>'` to handle
						break;
					case S_ATTR_SPACE:
						el.closed = true;
						break;
					//case S_EQ:
					default:
						throw new Error("attribute invalid close char('/')"); // No known test case
				}
				break;
			case '': //end document
				// `charAt` returned an empty string: `p` is past the end of `source`
				errorHandler.error('unexpected end of input');
				if (s == S_TAG) {
					el.setTagName(source.slice(start, p));
				}
				return p;
			case '>':
				// the cases below fall through on purpose
				switch (s) {
					case S_TAG:
						el.setTagName(source.slice(start, p));
					case S_ATTR_END:
					case S_TAG_SPACE:
					case S_TAG_CLOSE:
						break; //normal
					case S_ATTR_NOQUOT_VALUE: //Compatible state
					case S_ATTR:
						value = source.slice(start, p);
						// a trailing `/` belongs to the tag (self-closing), not to the name or value
						if (value.slice(-1) === '/') {
							el.closed = true;
							value = value.slice(0, -1);
						}
					case S_ATTR_SPACE:
						if (s === S_ATTR_SPACE) {
							value = attrName;
						}
						if (s == S_ATTR_NOQUOT_VALUE) {
							errorHandler.warning('attribute "' + value + '" missed quot(")!');
							addAttribute(attrName, value, start);
						} else {
							// attribute without a value: the name is used as the value
							if (!isHTML) {
								errorHandler.warning('attribute "' + value + '" missed value!! "' + value + '" instead!!');
							}
							addAttribute(value, value, start);
						}
						break;
					case S_EQ:
						if (!isHTML) {
							return errorHandler.fatalError('AttValue: \' or " expected');
						}
				}
				return p;
			/*xml space '\x20' | #x9 | #xD | #xA; */
			case '\u0080':
				// handled like a space: falls through into the default case
				c = ' ';
			default:
				if (c <= ' ') {
					//space
					switch (s) {
						case S_TAG:
							el.setTagName(source.slice(start, p)); //tagName
							s = S_TAG_SPACE;
							break;
						case S_ATTR:
							attrName = source.slice(start, p);
							s = S_ATTR_SPACE;
							break;
						case S_ATTR_NOQUOT_VALUE:
							// whitespace ends an unquoted value; falls through to S_ATTR_END
							var value = source.slice(start, p);
							errorHandler.warning('attribute "' + value + '" missed quot(")!!');
							addAttribute(attrName, value, start);
						case S_ATTR_END:
							s = S_TAG_SPACE;
							break;
						//case S_TAG_SPACE:
						//case S_EQ:
						//case S_ATTR_SPACE:
						//	void();break;
						//case S_TAG_CLOSE:
						//ignore warning
					}
				} else {
					//not space
					//S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE
					//S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE
					switch (s) {
						//case S_TAG:void();break;
						//case S_ATTR:void();break;
						//case S_ATTR_NOQUOT_VALUE:void();break;
						case S_ATTR_SPACE:
							// a new name starts although the previous one got no value:
							// the previous name is used as its own value
							if (!isHTML) {
								errorHandler.warning('attribute "' + attrName + '" missed value!! "' + attrName + '" instead2!!');
							}
							addAttribute(attrName, attrName, start);
							start = p;
							s = S_ATTR;
							break;
						case S_ATTR_END:
							// no whitespace between a closing quote and the next name; falls through
							errorHandler.warning('attribute space is required"' + attrName + '"!!');
						case S_TAG_SPACE:
							s = S_ATTR;
							start = p;
							break;
						case S_EQ:
							s = S_ATTR_NOQUOT_VALUE;
							start = p;
							break;
						case S_TAG_CLOSE:
							throw new Error("elements closed character '/' and '>' must be connected to");
					}
				}
		} //end outer switch
		p++;
	}
}

/**
 * @returns
 * `true` when the element is left open (`el.closed` is falsy); nothing otherwise.
*/ function appendElement(el, domBuilder, currentNSMap) { var tagName = el.tagName; var localNSMap = null; var i = el.length; while (i--) { var a = el[i]; var qName = a.qName; var value = a.value; var nsp = qName.indexOf(':'); if (nsp > 0) { var prefix = (a.prefix = qName.slice(0, nsp)); var localName = qName.slice(nsp + 1); var nsPrefix = prefix === 'xmlns' && localName; } else { localName = qName; prefix = null; nsPrefix = qName === 'xmlns' && ''; } //can not set prefix,because prefix !== '' a.localName = localName; //prefix == null for no ns prefix attribute if (nsPrefix !== false) { //hack!! if (localNSMap == null) { localNSMap = Object.create(null); _copy(currentNSMap, (currentNSMap = Object.create(null))); } currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value; a.uri = NAMESPACE.XMLNS; domBuilder.startPrefixMapping(nsPrefix, value); } } var i = el.length; while (i--) { a = el[i]; if (a.prefix) { //no prefix attribute has no namespace if (a.prefix === 'xml') { a.uri = NAMESPACE.XML; } if (a.prefix !== 'xmlns') { a.uri = currentNSMap[a.prefix]; } } } var nsp = tagName.indexOf(':'); if (nsp > 0) { prefix = el.prefix = tagName.slice(0, nsp); localName = el.localName = tagName.slice(nsp + 1); } else { prefix = null; //important!! 
localName = el.localName = tagName; } //no prefix element has default namespace var ns = (el.uri = currentNSMap[prefix || '']); domBuilder.startElement(ns, localName, tagName, el); //endPrefixMapping and startPrefixMapping have not any help for dom builder //localNSMap = null if (el.closed) { domBuilder.endElement(ns, localName, tagName); if (localNSMap) { for (prefix in localNSMap) { if (hasOwn(localNSMap, prefix)) { domBuilder.endPrefixMapping(prefix); } } } } else { el.currentNSMap = currentNSMap; el.localNSMap = localNSMap; //parseStack.push(el); return true; } } function parseHtmlSpecialContent(source, elStartEnd, tagName, entityReplacer, domBuilder) { // https://html.spec.whatwg.org/#raw-text-elements // https://html.spec.whatwg.org/#escapable-raw-text-elements // https://html.spec.whatwg.org/#cdata-rcdata-restrictions:raw-text-elements // TODO: https://html.spec.whatwg.org/#cdata-rcdata-restrictions var isEscapableRaw = isHTMLEscapableRawTextElement(tagName); if (isEscapableRaw || isHTMLRawTextElement(tagName)) { var elEndStart = source.indexOf('', elStartEnd); var text = source.substring(elStartEnd + 1, elEndStart); if (isEscapableRaw) { text = text.replace(ENTITY_REG, entityReplacer); } domBuilder.characters(text, 0, text.length); return elEndStart; } return elStartEnd + 1; } function _copy(source, target) { for (var n in source) { if (hasOwn(source, n)) { target[n] = source[n]; } } } /** * @typedef ParseUtils * @property {function(relativeIndex: number?): string | undefined} char * Provides look ahead access to a singe character relative to the current index. * @property {function(): number} getIndex * Provides read-only access to the current index. * @property {function(reg: RegExp): string | null} getMatch * Applies the provided regular expression enforcing that it starts at the current index and * returns the complete matching string, * and moves the current index by the length of the matching string. 
 * @property {function(): string} getSource
 * Provides read-only access to the complete source.
 * @property {function(places: number?): void} skip
 * Moves the current index by places (defaults to 1).
 * @property {function(): number} skipBlanks
 * Moves the current index by the amount of white space that directly follows the current index
 * and returns the amount of whitespace chars skipped (0..n),
 * or -1 if the end of the source was reached.
 * @property {function(): string} substringFromIndex
 * Creates a substring from the current index to the end of `source`.
 * @property {function(compareWith: string): boolean} substringStartsWith
 * Checks whether the part of `source` that begins at the current index starts with `compareWith`.
 * @property {function(compareWith: string): boolean} substringStartsWithCaseInsensitive
 * Checks whether the part of `source` that begins at the current index starts with `compareWith`,
 * comparing the upper case of both sides.
 * @see {@link parseUtils}
 */

/**
 * A temporary scope for parsing and look ahead operations in `source`,
 * starting from index `start`.
 *
 * Some operations move the current index by a number of positions,
 * after which `getIndex` returns the new index.
*
 * @param {string} source
 * @param {number} start
 * @returns {ParseUtils}
 */
function parseUtils(source, start) {
	// the current index, shared by all helpers below
	var index = start;

	// Character at the current index plus `n` (default 0);
	// an empty string outside of the source.
	function char(n) {
		n = n || 0;
		return source.charAt(index + n);
	}

	// Moves the current index by `n`. Because of `n || 1`,
	// `skip(0)` also moves it by one.
	function skip(n) {
		n = n || 1;
		index += n;
	}

	// Skips spaces, tabs, line feeds and carriage returns.
	// Returns the number of skipped characters, or -1 when the end of the
	// source is reached before any other character is found.
	function skipBlanks() {
		var blanks = 0;
		while (index < source.length) {
			var c = char();
			if (c !== ' ' && c !== '\n' && c !== '\t' && c !== '\r') {
				return blanks;
			}
			blanks++;
			skip();
		}
		return -1;
	}

	function substringFromIndex() {
		return source.substring(index);
	}

	// Exact comparison of `text` with the source at the current index.
	function substringStartsWith(text) {
		return source.substring(index, index + text.length) === text;
	}

	// Same comparison after converting both sides to upper case.
	function substringStartsWithCaseInsensitive(text) {
		return source.substring(index, index + text.length).toUpperCase() === text.toUpperCase();
	}

	// Matches `args` (wrapped by `g.reg` with a leading '^') against the rest of
	// the source. On success the match is consumed and returned, otherwise `null`.
	// NOTE(review): an empty match would still move the index by one, see `skip`.
	function getMatch(args) {
		var expr = g.reg('^', args);
		var match = expr.exec(substringFromIndex());
		if (match) {
			skip(match[0].length);
			return match[0];
		}
		return null;
	}

	return {
		char: char,
		getIndex: function () {
			return index;
		},
		getMatch: getMatch,
		getSource: function () {
			return source;
		},
		skip: skip,
		skipBlanks: skipBlanks,
		substringFromIndex: substringFromIndex,
		substringStartsWith: substringStartsWith,
		substringStartsWithCaseInsensitive: substringStartsWithCaseInsensitive,
	};
}

/**
 * Consumes a doctype internal subset (`[` ... `]`) at the current index of `p`.
 *
 * Returns the text between the brackets once the closing `]` was consumed.
 * Nothing is consumed or returned when the current character is not `[`.
 * Every problem is reported through `errorHandler.fatalError`, whose result is
 * returned.
 *
 * @param {ParseUtils} p
 * @param {DOMHandler} errorHandler
 * @returns {string}
 */
function parseDoctypeInternalSubset(p, errorHandler) {
	/**
	 * Consumes a processing instruction at the current index and returns its text.
	 * A target of `xml` (in any case) is rejected.
	 *
	 * NOTE(review): assumes `g.PI` only matches at the start of the string it
	 * is given; confirm in the grammar module.
	 *
	 * @param {ParseUtils} p
	 * @param {DOMHandler} errorHandler
	 * @returns {string}
	 */
	function parsePI(p, errorHandler) {
		var match = g.PI.exec(p.substringFromIndex());
		if (!match) {
			return errorHandler.fatalError('processing instruction is not well-formed at position ' + p.getIndex());
		}
		if (match[1].toLowerCase() === 'xml') {
			return errorHandler.fatalError(
				'xml declaration is only allowed at the start of the document, but found at position ' + p.getIndex()
			);
		}
		p.skip(match[0].length);
		return match[0];
	}
	// Parse internal subset
	var source = p.getSource();
	if (p.char() === '[') {
		p.skip(1);
		var intSubsetStart = p.getIndex();
		while (p.getIndex() < source.length) {
			p.skipBlanks();
			if (p.char() === ']') {
				var internalSubset = source.substring(intSubsetStart, p.getIndex());
				p.skip(1);
				return internalSubset;
			}
			// text of the declaration consumed in this iteration
			var current = null;
			// Only in external subset
			// if (char() === '<' && char(1) === '!' && char(2) === '[') {
			//   parseConditionalSections(p, errorHandler);
			// } else
			if (p.char() === '<' && p.char(1) === '!') {
				// the characters after the `!` select the kind of declaration
				switch (p.char(2)) {
					case 'E': // ELEMENT | ENTITY
						if (p.char(3) === 'L') {
							current = p.getMatch(g.elementdecl);
						} else if (p.char(3) === 'N') {
							current = p.getMatch(g.EntityDecl);
						}
						break;
					case 'A': // ATTRIBUTE
						current = p.getMatch(g.AttlistDecl);
						break;
					case 'N': // NOTATION
						current = p.getMatch(g.NotationDecl);
						break;
					case '-': // COMMENT
						current = p.getMatch(g.Comment);
						break;
				}
			} else if (p.char() === '<' && p.char(1) === '?') {
				current = parsePI(p, errorHandler);
			} else if (p.char() === '%') {
				current = p.getMatch(g.PEReference);
			} else {
				return errorHandler.fatalError('Error detected in Markup declaration');
			}
			// nothing matched, or the match was empty
			if (!current) {
				return errorHandler.fatalError('Error in internal subset at position ' + p.getIndex());
			}
		}
		return errorHandler.fatalError('doctype internal subset is not well-formed, missing ]');
	}
}

// NOTE(review): the text on the next line is damaged in this copy of the file and
// is reproduced exactly as found, unformatted. It holds what is left of two
// definitions: the one the export `parseDoctypeCommentOrCData` at the end of the
// file refers to, and `parseProcessingInstruction` (called from `parse`).
// In two places text is missing: after `starting with '` and after
// `'Not well-formed XML starting with "`. The opening doc comment is also never
// closed. Restore both functions from a trusted copy of the file.
/** * Called when the parser encounters an element starting with '') { return errorHandler.fatalError('doctype not terminated with > at position ' + p.getIndex()); } p.skip(1); domBuilder.startDTD(doctype.name, doctype.publicId, doctype.systemId, doctype.internalSubset); domBuilder.endDTD(); return p.getIndex(); } default: return errorHandler.fatalError('Not well-formed XML starting with " 0) { return errorHandler.fatalError( 'processing instruction at position ' + start + ' is an xml declaration which is only at the start of the document' ); } if (!g.XMLDecl.test(source.substring(start))) { return errorHandler.fatalError('xml declaration is not well-formed'); } } domBuilder.processingInstruction(match[1], match[2]); return start + match[0].length; }

/**
 * Array-like collection of the attributes of one start tag.
 *
 * The attributes are stored under the indexes `0..length-1`;
 * `attributeNames` maps each qualified name to its index.
 */
function ElementAttributes() {
	// prototype-less, so attribute names can not collide with inherited properties
	this.attributeNames = Object.create(null);
}
ElementAttributes.prototype = {
	// Stores the tag name; throws an `Error` when it fails `g.QName_exact`.
	setTagName: function (tagName) {
		if (!g.QName_exact.test(tagName)) {
			throw new Error('invalid tagName:' + tagName);
		}
		this.tagName = tagName;
	},
	// Appends an attribute; throws an `Error` when `qName` fails `g.QName_exact`.
	// Duplicate names are not checked here.
	addValue: function (qName, value, offset) {
		if (!g.QName_exact.test(qName)) {
			throw new Error('invalid attribute:' + qName);
		}
		this.attributeNames[qName] = this.length;
		this[this.length++] = { qName: qName, value: value, offset: offset };
	},
	// initial value on the prototype; the first `addValue` creates the own property
	length: 0,
	// The accessors below read one field of the attribute at index `i`.
	// `localName`, `locator` and `uri` are not set by `addValue`.
	getLocalName: function (i) {
		return this[i].localName;
	},
	getLocator: function (i) {
		return this[i].locator;
	},
	getQName: function (i) {
		return this[i].qName;
	},
	getURI: function (i) {
		return this[i].uri;
	},
	getValue: function (i) {
		return this[i].value;
	},
	//	,getIndex:function(uri, localName)){
	//		if(localName){
	//
	//		}else{
	//			var qName = uri
	//		}
	//	},
	//	getValue:function(){return this.getValue(this.getIndex.apply(this,arguments))},
	//	getType:function(uri,localName){}
	//	getType:function(i){},
};

exports.XMLReader = XMLReader;
exports.parseUtils = parseUtils;
// NOTE(review): no intact definition of this function exists in this copy, see the note above
exports.parseDoctypeCommentOrCData = parseDoctypeCommentOrCData;