"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.getEncoding = exports.Sniffer = exports.STRINGS = exports.ResultType = void 0; var whatwg_encoding_1 = require("whatwg-encoding"); // https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding var State; (function (State) { // Before anything starts; can be any of BOM, UTF-16 XML declarations or meta tags State[State["Begin"] = 0] = "Begin"; // Inside of a BOM State[State["BOM16BE"] = 1] = "BOM16BE"; State[State["BOM16LE"] = 2] = "BOM16LE"; State[State["BOM8"] = 3] = "BOM8"; // XML prefix State[State["UTF16LE_XML_PREFIX"] = 4] = "UTF16LE_XML_PREFIX"; State[State["BeginLT"] = 5] = "BeginLT"; State[State["UTF16BE_XML_PREFIX"] = 6] = "UTF16BE_XML_PREFIX"; // Waiting for opening `<` State[State["BeforeTag"] = 7] = "BeforeTag"; // After the opening `<` State[State["BeforeTagName"] = 8] = "BeforeTagName"; // After `"), }; function isAsciiAlpha(c) { return ((c >= Chars.UpperA && c <= Chars.UpperZ) || (c >= Chars.LowerA && c <= Chars.LowerZ)); } function isQuote(c) { return c === Chars.DQUOTE || c === Chars.SQUOTE; } var Sniffer = /** @class */ (function () { function Sniffer(_a) { var _b = _a === void 0 ? {} : _a, _c = _b.maxBytes, maxBytes = _c === void 0 ? 1024 : _c, userEncoding = _b.userEncoding, transportLayerEncodingLabel = _b.transportLayerEncodingLabel, defaultEncoding = _b.defaultEncoding; /** The offset of the previous buffers. */ this.offset = 0; this.state = State.Begin; this.sectionIndex = 0; this.attribType = AttribType.None; /** * Indicates if the `http-equiv` is `content-type`. * * Initially `null`, a boolean when a value is found. */ this.gotPragma = null; this.needsPragma = null; this.inMetaTag = false; this.encoding = "windows-1252"; this.resultType = ResultType.DEFAULT; this.quoteCharacter = 0; this.attributeValue = []; this.maxBytes = maxBytes; if (userEncoding) { this.setResult(userEncoding, ResultType.PASSED); } if (transportLayerEncodingLabel) { this.setResult(transportLayerEncodingLabel, ResultType.PASSED); } if (defaultEncoding) { this.setResult(defaultEncoding, ResultType.DEFAULT); } } Sniffer.prototype.setResult = function (label, type) { if (this.resultType === ResultType.DEFAULT || this.resultType > type) { var encoding = (0, whatwg_encoding_1.labelToName)(label); if (encoding) { this.encoding = // Check if we are in a meta tag and the encoding is `x-user-defined` type === ResultType.META_TAG && encoding === "x-user-defined" ? "windows-1252" : // Check if we are in a meta tag or xml declaration, and the encoding is UTF-16 (type === ResultType.META_TAG || type === ResultType.XML_ENCODING) && (encoding === "UTF-16LE" || encoding === "UTF-16BE") ? "UTF-8" : encoding; this.resultType = type; } } }; Sniffer.prototype.stateBegin = function (c) { switch (c) { case exports.STRINGS.UTF16BE_BOM[0]: { this.state = State.BOM16BE; break; } case exports.STRINGS.UTF16LE_BOM[0]: { this.state = State.BOM16LE; break; } case exports.STRINGS.UTF8_BOM[0]: { this.sectionIndex = 1; this.state = State.BOM8; break; } case Chars.NIL: { this.state = State.UTF16BE_XML_PREFIX; this.sectionIndex = 1; break; } case Chars.LT: { this.state = State.BeginLT; break; } default: { this.state = State.BeforeTag; } } }; Sniffer.prototype.stateBeginLT = function (c) { if (c === Chars.NIL) { this.state = State.UTF16LE_XML_PREFIX; this.sectionIndex = 2; } else if (c === Chars.QUESTION) { this.state = State.XMLDeclaration; this.sectionIndex = 2; } else { this.state = State.BeforeTagName; this.stateBeforeTagName(c); } }; Sniffer.prototype.stateUTF16BE_XML_PREFIX = function (c) { // Advance position in the section if (this.advanceSection(exports.STRINGS.UTF16BE_XML_PREFIX, c)) { if (this.sectionIndex === exports.STRINGS.UTF16BE_XML_PREFIX.length) { // We have the whole prefix this.setResult("utf-16be", ResultType.XML_PREFIX); } } else { this.state = State.BeforeTag; this.stateBeforeTag(c); } }; Sniffer.prototype.stateUTF16LE_XML_PREFIX = function (c) { // Advance position in the section if (this.advanceSection(exports.STRINGS.UTF16LE_XML_PREFIX, c)) { if (this.sectionIndex === exports.STRINGS.UTF16LE_XML_PREFIX.length) { // We have the whole prefix this.setResult("utf-16le", ResultType.XML_PREFIX); } } else { this.state = State.BeforeTag; this.stateBeforeTag(c); } }; Sniffer.prototype.stateBOM16LE = function (c) { if (c === exports.STRINGS.UTF16LE_BOM[1]) { this.setResult("utf-16le", ResultType.BOM); } else { this.state = State.BeforeTag; this.stateBeforeTag(c); } }; Sniffer.prototype.stateBOM16BE = function (c) { if (c === exports.STRINGS.UTF16BE_BOM[1]) { this.setResult("utf-16be", ResultType.BOM); } else { this.state = State.BeforeTag; this.stateBeforeTag(c); } }; Sniffer.prototype.stateBOM8 = function (c) { if (this.advanceSection(exports.STRINGS.UTF8_BOM, c) && this.sectionIndex === exports.STRINGS.UTF8_BOM.length) { this.setResult("utf-8", ResultType.BOM); } }; Sniffer.prototype.stateBeforeTag = function (c) { if (c === Chars.LT) { this.state = State.BeforeTagName; this.inMetaTag = false; } }; /** * We have seen a `<`, and now have to figure out what to do. * * Options: * - `` above. * Set this to 2, to support many dashes before the closing `>`. */ this.sectionIndex = 2; } }; /** * Any section starting with ``. this.stateTagNameOther(c); }; Sniffer.prototype.stateTagNameOther = function (c) { if (SPACE_CHARACTERS.has(c)) { this.state = State.BeforeAttribute; } else if (c === Chars.GT) { this.state = State.BeforeTag; } }; Sniffer.prototype.stateBeforeAttribute = function (c) { if (SPACE_CHARACTERS.has(c)) return; if (this.inMetaTag) { var lower = c | 0x20; if (lower === exports.STRINGS.HTTP_EQUIV[0]) { this.sectionIndex = 1; this.state = State.MetaAttribHttpEquiv; return; } else if (lower === exports.STRINGS.CHARSET[0]) { this.sectionIndex = 1; this.state = State.MetaAttribC; return; } } this.state = c === Chars.SLASH || c === Chars.GT ? State.BeforeTag : State.AnyAttribName; }; Sniffer.prototype.handleMetaAttrib = function (c, section, type) { if (this.advanceSectionIC(section, c)) { if (this.sectionIndex === section.length) { this.attribType = type; this.state = State.MetaAttribAfterName; } } else { this.state = State.AnyAttribName; this.stateAnyAttribName(c); } }; Sniffer.prototype.stateMetaAttribHttpEquiv = function (c) { this.handleMetaAttrib(c, exports.STRINGS.HTTP_EQUIV, AttribType.HttpEquiv); }; Sniffer.prototype.stateMetaAttribC = function (c) { var lower = c | 0x20; if (lower === exports.STRINGS.CHARSET[1]) { this.sectionIndex = 2; this.state = State.MetaAttribCharset; } else if (lower === exports.STRINGS.CONTENT[1]) { this.sectionIndex = 2; this.state = State.MetaAttribContent; } else { this.state = State.AnyAttribName; this.stateAnyAttribName(c); } }; Sniffer.prototype.stateMetaAttribCharset = function (c) { this.handleMetaAttrib(c, exports.STRINGS.CHARSET, AttribType.Charset); }; Sniffer.prototype.stateMetaAttribContent = function (c) { this.handleMetaAttrib(c, exports.STRINGS.CONTENT, AttribType.Content); }; Sniffer.prototype.stateMetaAttribAfterName = function (c) { if (SPACE_CHARACTERS.has(c) || c === Chars.EQUALS) { this.state = State.AfterAttributeName; this.stateAfterAttributeName(c); } else { this.state = State.AnyAttribName; this.stateAnyAttribName(c); } }; Sniffer.prototype.stateAnyAttribName = function (c) { if (SPACE_CHARACTERS.has(c)) { this.attribType = AttribType.None; this.state = State.AfterAttributeName; } else if (c === Chars.SLASH || c === Chars.GT) { this.state = State.BeforeTag; } else if (c === Chars.EQUALS) { this.state = State.BeforeAttributeValue; } }; Sniffer.prototype.stateAfterAttributeName = function (c) { if (SPACE_CHARACTERS.has(c)) return; if (c === Chars.EQUALS) { this.state = State.BeforeAttributeValue; } else { this.state = State.BeforeAttribute; this.stateBeforeAttribute(c); } }; Sniffer.prototype.stateBeforeAttributeValue = function (c) { if (SPACE_CHARACTERS.has(c)) return; this.attributeValue.length = 0; this.sectionIndex = 0; if (isQuote(c)) { this.quoteCharacter = c; this.state = this.attribType === AttribType.Content ? State.MetaContentValueQuotedBeforeEncoding : this.attribType === AttribType.HttpEquiv ? State.MetaAttribHttpEquivValue : State.AttributeValueQuoted; } else if (this.attribType === AttribType.Content) { this.state = State.MetaContentValueUnquotedBeforeEncoding; this.stateMetaContentValueUnquotedBeforeEncoding(c); } else if (this.attribType === AttribType.HttpEquiv) { // We use `quoteCharacter = 0` to signify that the value is unquoted. this.quoteCharacter = 0; this.sectionIndex = 0; this.state = State.MetaAttribHttpEquivValue; this.stateMetaAttribHttpEquivValue(c); } else { this.state = State.AttributeValueUnquoted; this.stateAttributeValueUnquoted(c); } }; // The value has to be `content-type` Sniffer.prototype.stateMetaAttribHttpEquivValue = function (c) { if (this.sectionIndex === exports.STRINGS.CONTENT_TYPE.length) { if (this.quoteCharacter === 0 ? END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c) : c === this.quoteCharacter) { if (this.needsPragma !== null) { this.setResult(this.needsPragma, ResultType.META_TAG); } else if (this.gotPragma === null) { this.gotPragma = true; } this.state = State.BeforeAttribute; return; } } else if (this.advanceSectionIC(exports.STRINGS.CONTENT_TYPE, c)) { return; } this.gotPragma = false; if (this.quoteCharacter === 0) { this.state = State.AttributeValueUnquoted; this.stateAttributeValueUnquoted(c); } else { this.state = State.AttributeValueQuoted; this.stateAttributeValueQuoted(c); } }; Sniffer.prototype.handleMetaContentValue = function () { if (this.attributeValue.length === 0) return; var encoding = String.fromCharCode.apply(String, this.attributeValue); if (this.gotPragma) { this.setResult(encoding, ResultType.META_TAG); } else if (this.needsPragma === null) { // Don't override a previous result. this.needsPragma = encoding; } this.attributeValue.length = 0; }; Sniffer.prototype.handleAttributeValue = function () { if (this.attribType === AttribType.Charset) { this.setResult(String.fromCharCode.apply(String, this.attributeValue), ResultType.META_TAG); } }; Sniffer.prototype.stateAttributeValueUnquoted = function (c) { if (SPACE_CHARACTERS.has(c)) { this.handleAttributeValue(); this.state = State.BeforeAttribute; } else if (c === Chars.SLASH || c === Chars.GT) { this.handleAttributeValue(); this.state = State.BeforeTag; } else if (this.attribType === AttribType.Charset) { this.attributeValue.push(c | 0x20); } }; Sniffer.prototype.findMetaContentEncoding = function (c) { if (this.advanceSectionIC(exports.STRINGS.CHARSET, c)) { if (this.sectionIndex === exports.STRINGS.CHARSET.length) { return true; } } else { // If we encountered another `c`, assume we started over. this.sectionIndex = Number(c === exports.STRINGS.CHARSET[0]); } return false; }; Sniffer.prototype.stateMetaContentValueUnquotedBeforeEncoding = function (c) { if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c)) { this.stateAttributeValueUnquoted(c); } else if (this.sectionIndex === exports.STRINGS.CHARSET.length) { if (c === Chars.EQUALS) { this.state = State.MetaContentValueUnquotedBeforeValue; } } else { this.findMetaContentEncoding(c); } }; Sniffer.prototype.stateMetaContentValueUnquotedBeforeValue = function (c) { if (isQuote(c)) { this.quoteCharacter = c; this.state = State.MetaContentValueUnquotedValueQuoted; } else if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c)) { // Can't have spaces here, as it would no longer be part of the attribute value. this.stateAttributeValueUnquoted(c); } else { this.state = State.MetaContentValueUnquotedValueUnquoted; this.stateMetaContentValueUnquotedValueUnquoted(c); } }; Sniffer.prototype.stateMetaContentValueUnquotedValueQuoted = function (c) { if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c)) { // Quotes weren't matched, so we're done. this.stateAttributeValueUnquoted(c); } else if (c === this.quoteCharacter) { this.handleMetaContentValue(); this.state = State.AttributeValueUnquoted; } else { this.attributeValue.push(c | 0x20); } }; Sniffer.prototype.stateMetaContentValueUnquotedValueUnquoted = function (c) { if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c) || c === Chars.SEMICOLON) { this.handleMetaContentValue(); this.state = State.AttributeValueUnquoted; this.stateAttributeValueUnquoted(c); } else { this.attributeValue.push(c | 0x20); } }; Sniffer.prototype.stateMetaContentValueQuotedValueUnquoted = function (c) { if (isQuote(c) || SPACE_CHARACTERS.has(c) || c === Chars.SEMICOLON) { this.handleMetaContentValue(); // We are done with the value, but might not be at the end of the attribute this.state = State.AttributeValueQuoted; this.stateAttributeValueQuoted(c); } else { this.attributeValue.push(c | 0x20); } }; Sniffer.prototype.stateMetaContentValueQuotedValueQuoted = function (c) { if (isQuote(c)) { // We have reached the end of our value. if (c !== this.quoteCharacter) { // Only handle the value if inner quotes were matched. this.handleMetaContentValue(); } this.state = State.AttributeValueQuoted; this.stateAttributeValueQuoted(c); } else { this.attributeValue.push(c | 0x20); } }; Sniffer.prototype.stateMetaContentValueQuotedBeforeEncoding = function (c) { if (c === this.quoteCharacter) { this.stateAttributeValueQuoted(c); } else if (this.findMetaContentEncoding(c)) { this.state = State.MetaContentValueQuotedAfterEncoding; } }; Sniffer.prototype.stateMetaContentValueQuotedAfterEncoding = function (c) { if (c === Chars.EQUALS) { this.state = State.MetaContentValueQuotedBeforeValue; } else if (!SPACE_CHARACTERS.has(c)) { // Look for the next encoding this.state = State.MetaContentValueQuotedBeforeEncoding; this.stateMetaContentValueQuotedBeforeEncoding(c); } }; Sniffer.prototype.stateMetaContentValueQuotedBeforeValue = function (c) { if (c === this.quoteCharacter) { this.stateAttributeValueQuoted(c); } else if (isQuote(c)) { this.state = State.MetaContentValueQuotedValueQuoted; } else if (!SPACE_CHARACTERS.has(c)) { this.state = State.MetaContentValueQuotedValueUnquoted; this.stateMetaContentValueQuotedValueUnquoted(c); } }; Sniffer.prototype.stateAttributeValueQuoted = function (c) { if (c === this.quoteCharacter) { this.handleAttributeValue(); this.state = State.BeforeAttribute; } else if (this.attribType === AttribType.Charset) { this.attributeValue.push(c | 0x20); } }; // Read STRINGS.XML_DECLARATION Sniffer.prototype.stateXMLDeclaration = function (c) { if (this.advanceSection(exports.STRINGS.XML_DECLARATION, c)) { if (this.sectionIndex === exports.STRINGS.XML_DECLARATION.length) { this.sectionIndex = 0; this.state = State.XMLDeclarationBeforeEncoding; } } else { this.state = State.WeirdTag; } }; Sniffer.prototype.stateXMLDeclarationBeforeEncoding = function (c) { if (this.advanceSection(exports.STRINGS.ENCODING, c)) { if (this.sectionIndex === exports.STRINGS.ENCODING.length) { this.state = State.XMLDeclarationAfterEncoding; } } else if (c === Chars.GT) { this.state = State.BeforeTag; } else { // If we encountered another `c`, assume we started over. this.sectionIndex = Number(c === exports.STRINGS.ENCODING[0]); } }; Sniffer.prototype.stateXMLDeclarationAfterEncoding = function (c) { if (c === Chars.EQUALS) { this.state = State.XMLDeclarationBeforeValue; } else if (c > Chars.SPACE) { this.state = State.WeirdTag; this.stateWeirdTag(c); } }; Sniffer.prototype.stateXMLDeclarationBeforeValue = function (c) { if (isQuote(c)) { this.attributeValue.length = 0; this.state = State.XMLDeclarationValue; } else if (c > Chars.SPACE) { this.state = State.WeirdTag; this.stateWeirdTag(c); } }; Sniffer.prototype.stateXMLDeclarationValue = function (c) { if (isQuote(c)) { this.setResult(String.fromCharCode.apply(String, this.attributeValue), ResultType.XML_ENCODING); this.state = State.WeirdTag; } else if (c === Chars.GT) { this.state = State.BeforeTag; } else if (c <= Chars.SPACE) { this.state = State.WeirdTag; } else { this.attributeValue.push(c | 0x20); } }; Sniffer.prototype.write = function (buffer) { var index = 0; for (; index < buffer.length && this.offset + index < this.maxBytes; index++) { var c = buffer[index]; switch (this.state) { case State.Begin: { this.stateBegin(c); break; } case State.BOM16BE: { this.stateBOM16BE(c); break; } case State.BOM16LE: { this.stateBOM16LE(c); break; } case State.BOM8: { this.stateBOM8(c); break; } case State.UTF16LE_XML_PREFIX: { this.stateUTF16LE_XML_PREFIX(c); break; } case State.BeginLT: { this.stateBeginLT(c); break; } case State.UTF16BE_XML_PREFIX: { this.stateUTF16BE_XML_PREFIX(c); break; } case State.BeforeTag: { // Optimization: Skip all characters until we find a `<` var idx = buffer.indexOf(Chars.LT, index); if (idx < 0) { // We are done with this buffer. Stay in the state and try on the next one. index = buffer.length; } else { index = idx; this.stateBeforeTag(Chars.LT); } break; } case State.BeforeTagName: { this.stateBeforeTagName(c); break; } case State.BeforeCloseTagName: { this.stateBeforeCloseTagName(c); break; } case State.CommentStart: { this.stateCommentStart(c); break; } case State.CommentEnd: { this.stateCommentEnd(c); break; } case State.TagNameMeta: { this.stateTagNameMeta(c); break; } case State.TagNameOther: { this.stateTagNameOther(c); break; } case State.XMLDeclaration: { this.stateXMLDeclaration(c); break; } case State.XMLDeclarationBeforeEncoding: { this.stateXMLDeclarationBeforeEncoding(c); break; } case State.XMLDeclarationAfterEncoding: { this.stateXMLDeclarationAfterEncoding(c); break; } case State.XMLDeclarationBeforeValue: { this.stateXMLDeclarationBeforeValue(c); break; } case State.XMLDeclarationValue: { this.stateXMLDeclarationValue(c); break; } case State.WeirdTag: { this.stateWeirdTag(c); break; } case State.BeforeAttribute: { this.stateBeforeAttribute(c); break; } case State.MetaAttribHttpEquiv: { this.stateMetaAttribHttpEquiv(c); break; } case State.MetaAttribHttpEquivValue: { this.stateMetaAttribHttpEquivValue(c); break; } case State.MetaAttribC: { this.stateMetaAttribC(c); break; } case State.MetaAttribContent: { this.stateMetaAttribContent(c); break; } case State.MetaAttribCharset: { this.stateMetaAttribCharset(c); break; } case State.MetaAttribAfterName: { this.stateMetaAttribAfterName(c); break; } case State.MetaContentValueQuotedBeforeEncoding: { this.stateMetaContentValueQuotedBeforeEncoding(c); break; } case State.MetaContentValueQuotedAfterEncoding: { this.stateMetaContentValueQuotedAfterEncoding(c); break; } case State.MetaContentValueQuotedBeforeValue: { this.stateMetaContentValueQuotedBeforeValue(c); break; } case State.MetaContentValueQuotedValueQuoted: { this.stateMetaContentValueQuotedValueQuoted(c); break; } case State.MetaContentValueQuotedValueUnquoted: { this.stateMetaContentValueQuotedValueUnquoted(c); break; } case State.MetaContentValueUnquotedBeforeEncoding: { this.stateMetaContentValueUnquotedBeforeEncoding(c); break; } case State.MetaContentValueUnquotedBeforeValue: { this.stateMetaContentValueUnquotedBeforeValue(c); break; } case State.MetaContentValueUnquotedValueQuoted: { this.stateMetaContentValueUnquotedValueQuoted(c); break; } case State.MetaContentValueUnquotedValueUnquoted: { this.stateMetaContentValueUnquotedValueUnquoted(c); break; } case State.AnyAttribName: { this.stateAnyAttribName(c); break; } case State.AfterAttributeName: { this.stateAfterAttributeName(c); break; } case State.BeforeAttributeValue: { this.stateBeforeAttributeValue(c); break; } case State.AttributeValueQuoted: { this.stateAttributeValueQuoted(c); break; } default: { // (State.AttributeValueUnquoted) this.stateAttributeValueUnquoted(c); } } } this.offset += index; }; return Sniffer; }()); exports.Sniffer = Sniffer; /** Get the encoding for the passed buffer. */ function getEncoding(buffer, options) { var sniffer = new Sniffer(options); sniffer.write(buffer); return sniffer.encoding; } exports.getEncoding = getEncoding; //# sourceMappingURL=sniffer.js.map