"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.QuoteType = void 0; var decode_js_1 = require("entities/lib/decode.js"); var CharCodes; (function (CharCodes) { CharCodes[CharCodes["Tab"] = 9] = "Tab"; CharCodes[CharCodes["NewLine"] = 10] = "NewLine"; CharCodes[CharCodes["FormFeed"] = 12] = "FormFeed"; CharCodes[CharCodes["CarriageReturn"] = 13] = "CarriageReturn"; CharCodes[CharCodes["Space"] = 32] = "Space"; CharCodes[CharCodes["ExclamationMark"] = 33] = "ExclamationMark"; CharCodes[CharCodes["Number"] = 35] = "Number"; CharCodes[CharCodes["Amp"] = 38] = "Amp"; CharCodes[CharCodes["SingleQuote"] = 39] = "SingleQuote"; CharCodes[CharCodes["DoubleQuote"] = 34] = "DoubleQuote"; CharCodes[CharCodes["Dash"] = 45] = "Dash"; CharCodes[CharCodes["Slash"] = 47] = "Slash"; CharCodes[CharCodes["Zero"] = 48] = "Zero"; CharCodes[CharCodes["Nine"] = 57] = "Nine"; CharCodes[CharCodes["Semi"] = 59] = "Semi"; CharCodes[CharCodes["Lt"] = 60] = "Lt"; CharCodes[CharCodes["Eq"] = 61] = "Eq"; CharCodes[CharCodes["Gt"] = 62] = "Gt"; CharCodes[CharCodes["Questionmark"] = 63] = "Questionmark"; CharCodes[CharCodes["UpperA"] = 65] = "UpperA"; CharCodes[CharCodes["LowerA"] = 97] = "LowerA"; CharCodes[CharCodes["UpperF"] = 70] = "UpperF"; CharCodes[CharCodes["LowerF"] = 102] = "LowerF"; CharCodes[CharCodes["UpperZ"] = 90] = "UpperZ"; CharCodes[CharCodes["LowerZ"] = 122] = "LowerZ"; CharCodes[CharCodes["LowerX"] = 120] = "LowerX"; CharCodes[CharCodes["OpeningSquareBracket"] = 91] = "OpeningSquareBracket"; })(CharCodes || (CharCodes = {})); /** All the states the tokenizer can be in. */ var State; (function (State) { State[State["Text"] = 1] = "Text"; State[State["BeforeTagName"] = 2] = "BeforeTagName"; State[State["InTagName"] = 3] = "InTagName"; State[State["InSelfClosingTag"] = 4] = "InSelfClosingTag"; State[State["BeforeClosingTagName"] = 5] = "BeforeClosingTagName"; State[State["InClosingTagName"] = 6] = "InClosingTagName"; State[State["AfterClosingTagName"] = 7] = "AfterClosingTagName"; // Attributes State[State["BeforeAttributeName"] = 8] = "BeforeAttributeName"; State[State["InAttributeName"] = 9] = "InAttributeName"; State[State["AfterAttributeName"] = 10] = "AfterAttributeName"; State[State["BeforeAttributeValue"] = 11] = "BeforeAttributeValue"; State[State["InAttributeValueDq"] = 12] = "InAttributeValueDq"; State[State["InAttributeValueSq"] = 13] = "InAttributeValueSq"; State[State["InAttributeValueNq"] = 14] = "InAttributeValueNq"; // Declarations State[State["BeforeDeclaration"] = 15] = "BeforeDeclaration"; State[State["InDeclaration"] = 16] = "InDeclaration"; // Processing instructions State[State["InProcessingInstruction"] = 17] = "InProcessingInstruction"; // Comments & CDATA State[State["BeforeComment"] = 18] = "BeforeComment"; State[State["CDATASequence"] = 19] = "CDATASequence"; State[State["InSpecialComment"] = 20] = "InSpecialComment"; State[State["InCommentLike"] = 21] = "InCommentLike"; // Special tags State[State["BeforeSpecialS"] = 22] = "BeforeSpecialS"; State[State["SpecialStartSequence"] = 23] = "SpecialStartSequence"; State[State["InSpecialTag"] = 24] = "InSpecialTag"; State[State["BeforeEntity"] = 25] = "BeforeEntity"; State[State["BeforeNumericEntity"] = 26] = "BeforeNumericEntity"; State[State["InNamedEntity"] = 27] = "InNamedEntity"; State[State["InNumericEntity"] = 28] = "InNumericEntity"; State[State["InHexEntity"] = 29] = "InHexEntity"; })(State || (State = {})); function isWhitespace(c) { return (c === CharCodes.Space || c === CharCodes.NewLine || c === CharCodes.Tab || c === CharCodes.FormFeed || c === CharCodes.CarriageReturn); } function isEndOfTagSection(c) { return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c); } function isNumber(c) { return c >= CharCodes.Zero && c <= CharCodes.Nine; } function isASCIIAlpha(c) { return ((c >= CharCodes.LowerA && c <= CharCodes.LowerZ) || (c >= CharCodes.UpperA && c <= CharCodes.UpperZ)); } function isHexDigit(c) { return ((c >= CharCodes.UpperA && c <= CharCodes.UpperF) || (c >= CharCodes.LowerA && c <= CharCodes.LowerF)); } var QuoteType; (function (QuoteType) { QuoteType[QuoteType["NoValue"] = 0] = "NoValue"; QuoteType[QuoteType["Unquoted"] = 1] = "Unquoted"; QuoteType[QuoteType["Single"] = 2] = "Single"; QuoteType[QuoteType["Double"] = 3] = "Double"; })(QuoteType = exports.QuoteType || (exports.QuoteType = {})); /** * Sequences used to match longer strings. * * We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End * sequences with an increased offset. */ var Sequences = { Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]), CommentEnd: new Uint8Array([0x2d, 0x2d, 0x3e]), ScriptEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74]), StyleEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]), TitleEnd: new Uint8Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]), // ` this.sectionStart) { this.cbs.ontext(this.sectionStart, this.index); } this.state = State.BeforeTagName; this.sectionStart = this.index; } else if (this.decodeEntities && c === CharCodes.Amp) { this.state = State.BeforeEntity; } }; Tokenizer.prototype.stateSpecialStartSequence = function (c) { var isEnd = this.sequenceIndex === this.currentSequence.length; var isMatch = isEnd ? // If we are at the end of the sequence, make sure the tag name has ended isEndOfTagSection(c) : // Otherwise, do a case-insensitive comparison (c | 0x20) === this.currentSequence[this.sequenceIndex]; if (!isMatch) { this.isSpecial = false; } else if (!isEnd) { this.sequenceIndex++; return; } this.sequenceIndex = 0; this.state = State.InTagName; this.stateInTagName(c); }; /** Look for an end tag. For tags, also decode entities. */ Tokenizer.prototype.stateInSpecialTag = function (c) { if (this.sequenceIndex === this.currentSequence.length) { if (c === CharCodes.Gt || isWhitespace(c)) { var endOfText = this.index - this.currentSequence.length; if (this.sectionStart < endOfText) { // Spoof the index so that reported locations match up. var actualIndex = this.index; this.index = endOfText; this.cbs.ontext(this.sectionStart, endOfText); this.index = actualIndex; } this.isSpecial = false; this.sectionStart = endOfText + 2; // Skip over the `</` this.stateInClosingTagName(c); return; // We are done; skip the rest of the function. } this.sequenceIndex = 0; } if ((c | 0x20) === this.currentSequence[this.sequenceIndex]) { this.sequenceIndex += 1; } else if (this.sequenceIndex === 0) { if (this.currentSequence === Sequences.TitleEnd) { // We have to parse entities in <title> tags. if (this.decodeEntities && c === CharCodes.Amp) { this.state = State.BeforeEntity; } } else if (this.fastForwardTo(CharCodes.Lt)) { // Outside of <title> tags, we can fast-forward. this.sequenceIndex = 1; } } else { // If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`. this.sequenceIndex = Number(c === CharCodes.Lt); } }; Tokenizer.prototype.stateCDATASequence = function (c) { if (c === Sequences.Cdata[this.sequenceIndex]) { if (++this.sequenceIndex === Sequences.Cdata.length) { this.state = State.InCommentLike; this.currentSequence = Sequences.CdataEnd; this.sequenceIndex = 0; this.sectionStart = this.index + 1; } } else { this.sequenceIndex = 0; this.state = State.InDeclaration; this.stateInDeclaration(c); // Reconsume the character } }; /** * When we wait for one specific character, we can speed things up * by skipping through the buffer until we find it. * * @returns Whether the character was found. */ Tokenizer.prototype.fastForwardTo = function (c) { while (++this.index < this.buffer.length + this.offset) { if (this.buffer.charCodeAt(this.index - this.offset) === c) { return true; } } /* * We increment the index at the end of the `parse` loop, * so set it to `buffer.length - 1` here. * * TODO: Refactor `parse` to increment index before calling states. */ this.index = this.buffer.length + this.offset - 1; return false; }; /** * Comments and CDATA end with `-->` and `]]>`. * * Their common qualities are: * - Their end sequences have a distinct character they start with. * - That character is then repeated, so we have to check multiple repeats. * - All characters but the start character of the sequence can be skipped. */ Tokenizer.prototype.stateInCommentLike = function (c) { if (c === this.currentSequence[this.sequenceIndex]) { if (++this.sequenceIndex === this.currentSequence.length) { if (this.currentSequence === Sequences.CdataEnd) { this.cbs.oncdata(this.sectionStart, this.index, 2); } else { this.cbs.oncomment(this.sectionStart, this.index, 2); } this.sequenceIndex = 0; this.sectionStart = this.index + 1; this.state = State.Text; } } else if (this.sequenceIndex === 0) { // Fast-forward to the first character of the sequence if (this.fastForwardTo(this.currentSequence[0])) { this.sequenceIndex = 1; } } else if (c !== this.currentSequence[this.sequenceIndex - 1]) { // Allow long sequences, eg. --->, ]]]> this.sequenceIndex = 0; } }; /** * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name. * * XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar). * We allow anything that wouldn't end the tag. */ Tokenizer.prototype.isTagStartChar = function (c) { return this.xmlMode ? !isEndOfTagSection(c) : isASCIIAlpha(c); }; Tokenizer.prototype.startSpecial = function (sequence, offset) { this.isSpecial = true; this.currentSequence = sequence; this.sequenceIndex = offset; this.state = State.SpecialStartSequence; }; Tokenizer.prototype.stateBeforeTagName = function (c) { if (c === CharCodes.ExclamationMark) { this.state = State.BeforeDeclaration; this.sectionStart = this.index + 1; } else if (c === CharCodes.Questionmark) { this.state = State.InProcessingInstruction; this.sectionStart = this.index + 1; } else if (this.isTagStartChar(c)) { var lower = c | 0x20; this.sectionStart = this.index; if (!this.xmlM