// sniffer.js (37 KB)
  1. import { labelToName } from "whatwg-encoding";
  2. // https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
  /**
   * States of the byte-level pre-scan state machine.
   *
   * Built as a two-way map: `State.Name` yields the number and
   * `State[number]` yields the name.
   *
   * See https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
   */
  var State;
  (function (State) {
    // Before anything starts; can be any of BOM, UTF-16 XML declarations or meta tags
    State[(State["Begin"] = 0)] = "Begin";
    // Inside of a BOM
    State[(State["BOM16BE"] = 1)] = "BOM16BE";
    State[(State["BOM16LE"] = 2)] = "BOM16LE";
    State[(State["BOM8"] = 3)] = "BOM8";
    // XML prefix
    State[(State["UTF16LE_XML_PREFIX"] = 4)] = "UTF16LE_XML_PREFIX";
    State[(State["BeginLT"] = 5)] = "BeginLT";
    State[(State["UTF16BE_XML_PREFIX"] = 6)] = "UTF16BE_XML_PREFIX";
    // Waiting for opening `<`
    State[(State["BeforeTag"] = 7)] = "BeforeTag";
    // After the opening `<`
    State[(State["BeforeTagName"] = 8)] = "BeforeTagName";
    // After `</`
    State[(State["BeforeCloseTagName"] = 9)] = "BeforeCloseTagName";
    // Beginning of a comment
    State[(State["CommentStart"] = 10)] = "CommentStart";
    // End of a comment
    State[(State["CommentEnd"] = 11)] = "CommentEnd";
    // A tag name that could be `meta`
    State[(State["TagNameMeta"] = 12)] = "TagNameMeta";
    // A tag name that is not `meta`
    State[(State["TagNameOther"] = 13)] = "TagNameOther";
    // XML declaration
    State[(State["XMLDeclaration"] = 14)] = "XMLDeclaration";
    State[(State["XMLDeclarationBeforeEncoding"] = 15)] = "XMLDeclarationBeforeEncoding";
    State[(State["XMLDeclarationAfterEncoding"] = 16)] = "XMLDeclarationAfterEncoding";
    State[(State["XMLDeclarationBeforeValue"] = 17)] = "XMLDeclarationBeforeValue";
    State[(State["XMLDeclarationValue"] = 18)] = "XMLDeclarationValue";
    // Anything that looks like a tag, but doesn't fit in the above categories
    State[(State["WeirdTag"] = 19)] = "WeirdTag";
    State[(State["BeforeAttribute"] = 20)] = "BeforeAttribute";
    /*
     * Attributes in meta tag — we compare them to our set here, and back out.
     * We care about three attributes: http-equiv, content and charset
     * (the value of http-equiv has to be `content-type`).
     */
    State[(State["MetaAttribHttpEquiv"] = 21)] = "MetaAttribHttpEquiv";
    // The value has to be `content-type`
    State[(State["MetaAttribHttpEquivValue"] = 22)] = "MetaAttribHttpEquivValue";
    State[(State["MetaAttribC"] = 23)] = "MetaAttribC";
    State[(State["MetaAttribContent"] = 24)] = "MetaAttribContent";
    State[(State["MetaAttribCharset"] = 25)] = "MetaAttribCharset";
    // Waiting for whitespace
    State[(State["MetaAttribAfterName"] = 26)] = "MetaAttribAfterName";
    State[(State["MetaContentValueQuotedBeforeEncoding"] = 27)] = "MetaContentValueQuotedBeforeEncoding";
    State[(State["MetaContentValueQuotedAfterEncoding"] = 28)] = "MetaContentValueQuotedAfterEncoding";
    State[(State["MetaContentValueQuotedBeforeValue"] = 29)] = "MetaContentValueQuotedBeforeValue";
    State[(State["MetaContentValueQuotedValueQuoted"] = 30)] = "MetaContentValueQuotedValueQuoted";
    State[(State["MetaContentValueQuotedValueUnquoted"] = 31)] = "MetaContentValueQuotedValueUnquoted";
    State[(State["MetaContentValueUnquotedBeforeEncoding"] = 32)] = "MetaContentValueUnquotedBeforeEncoding";
    State[(State["MetaContentValueUnquotedBeforeValue"] = 33)] = "MetaContentValueUnquotedBeforeValue";
    State[(State["MetaContentValueUnquotedValueQuoted"] = 34)] = "MetaContentValueUnquotedValueQuoted";
    State[(State["MetaContentValueUnquotedValueUnquoted"] = 35)] = "MetaContentValueUnquotedValueUnquoted";
    State[(State["AnyAttribName"] = 36)] = "AnyAttribName";
    // After the name of an attribute, before the equals sign
    State[(State["AfterAttributeName"] = 37)] = "AfterAttributeName";
    // After `=`
    State[(State["BeforeAttributeValue"] = 38)] = "BeforeAttributeValue";
    State[(State["AttributeValueQuoted"] = 39)] = "AttributeValueQuoted";
    State[(State["AttributeValueUnquoted"] = 40)] = "AttributeValueUnquoted";
  })(State || (State = {}));
  /**
   * Where a detected encoding came from.
   *
   * `Sniffer.prototype.setResult` compares these numerically: a result is
   * only replaced while it is `DEFAULT` or when the new type is a smaller number.
   */
  export var ResultType;
  (function (ResultType) {
    // Byte order mark
    ResultType[(ResultType["BOM"] = 0)] = "BOM";
    // User- or transport layer-defined
    ResultType[(ResultType["PASSED"] = 1)] = "PASSED";
    // XML prefixes
    ResultType[(ResultType["XML_PREFIX"] = 2)] = "XML_PREFIX";
    // Meta tag
    ResultType[(ResultType["META_TAG"] = 3)] = "META_TAG";
    // XML encoding
    ResultType[(ResultType["XML_ENCODING"] = 4)] = "XML_ENCODING";
    // Default
    ResultType[(ResultType["DEFAULT"] = 5)] = "DEFAULT";
  })(ResultType || (ResultType = {}));
  /**
   * Kind of attribute whose value is being read; `None` is used for any
   * attribute that is not one of the three tracked meta attributes.
   */
  var AttribType;
  (function (AttribType) {
    AttribType[(AttribType["None"] = 0)] = "None";
    AttribType[(AttribType["HttpEquiv"] = 1)] = "HttpEquiv";
    AttribType[(AttribType["Content"] = 2)] = "Content";
    AttribType[(AttribType["Charset"] = 3)] = "Charset";
  })(AttribType || (AttribType = {}));
  /** Byte values the state machine compares against. */
  var Chars;
  (function (Chars) {
    Chars[(Chars["NIL"] = 0)] = "NIL";
    Chars[(Chars["TAB"] = 9)] = "TAB";
    Chars[(Chars["LF"] = 10)] = "LF";
    // Form feed; the HTML pre-scan algorithm lists 0x0C among its whitespace bytes.
    Chars[(Chars["FF"] = 12)] = "FF";
    Chars[(Chars["CR"] = 13)] = "CR";
    Chars[(Chars["SPACE"] = 32)] = "SPACE";
    Chars[(Chars["EXCLAMATION"] = 33)] = "EXCLAMATION";
    Chars[(Chars["DQUOTE"] = 34)] = "DQUOTE";
    Chars[(Chars["SQUOTE"] = 39)] = "SQUOTE";
    Chars[(Chars["DASH"] = 45)] = "DASH";
    Chars[(Chars["SLASH"] = 47)] = "SLASH";
    Chars[(Chars["SEMICOLON"] = 59)] = "SEMICOLON";
    Chars[(Chars["LT"] = 60)] = "LT";
    Chars[(Chars["EQUALS"] = 61)] = "EQUALS";
    Chars[(Chars["GT"] = 62)] = "GT";
    Chars[(Chars["QUESTION"] = 63)] = "QUESTION";
    Chars[(Chars["UpperA"] = 65)] = "UpperA";
    Chars[(Chars["UpperZ"] = 90)] = "UpperZ";
    Chars[(Chars["LowerA"] = 97)] = "LowerA";
    Chars[(Chars["LowerZ"] = 122)] = "LowerZ";
  })(Chars || (Chars = {}));
  /** Bytes treated as whitespace between tag names, attributes and values. */
  var SPACE_CHARACTERS = new Set([
    Chars.SPACE,
    Chars.LF,
    Chars.FF,
    Chars.CR,
    Chars.TAB,
  ]);
  /** Bytes that terminate an unquoted attribute value: whitespace or `>`. */
  var END_OF_UNQUOTED_ATTRIBUTE_VALUE = new Set([
    Chars.SPACE,
    Chars.LF,
    Chars.FF,
    Chars.CR,
    Chars.TAB,
    Chars.GT,
  ]);
  /**
   * Converts a string to a byte array, one element per UTF-16 code unit.
   *
   * Each element is `str.charCodeAt(i)` stored into a `Uint8Array`, so only
   * the low 8 bits of a code unit survive.
   *
   * @param {string} str Source string.
   * @returns {Uint8Array} Array of `str.length` bytes.
   */
  function toUint8Array(str) {
    return Uint8Array.from({ length: str.length }, function (_, i) {
      return str.charCodeAt(i);
    });
  }
  /**
   * Byte sequences the sniffer matches against: BOMs, UTF-16 XML prefixes,
   * and lower-case ASCII keywords.
   */
  export var STRINGS = {
    UTF8_BOM: new Uint8Array([0xef, 0xbb, 0xbf]),
    UTF16LE_BOM: new Uint8Array([0xff, 0xfe]),
    UTF16BE_BOM: new Uint8Array([0xfe, 0xff]),
    // `<?x` encoded as UTF-16LE / UTF-16BE
    UTF16LE_XML_PREFIX: new Uint8Array([0x3c, 0x0, 0x3f, 0x0, 0x78, 0x0]),
    UTF16BE_XML_PREFIX: new Uint8Array([0x0, 0x3c, 0x0, 0x3f, 0x0, 0x78]),
    XML_DECLARATION: toUint8Array("<?xml"),
    ENCODING: toUint8Array("encoding"),
    META: toUint8Array("meta"),
    HTTP_EQUIV: toUint8Array("http-equiv"),
    CONTENT: toUint8Array("content"),
    CONTENT_TYPE: toUint8Array("content-type"),
    CHARSET: toUint8Array("charset"),
    COMMENT_START: toUint8Array("<!--"),
    COMMENT_END: toUint8Array("-->"),
  };
  /**
   * Tests whether a byte is an ASCII letter.
   *
   * @param {number} c Byte value.
   * @returns {boolean} `true` for `A`–`Z` and `a`–`z`.
   */
  function isAsciiAlpha(c) {
    var isUpper = c >= Chars.UpperA && c <= Chars.UpperZ;
    var isLower = c >= Chars.LowerA && c <= Chars.LowerZ;
    return isUpper || isLower;
  }
  /**
   * Tests whether a byte is a single or double quote.
   *
   * @param {number} c Byte value.
   * @returns {boolean} `true` for `"` and `'`.
   */
  function isQuote(c) {
    return c === Chars.DQUOTE || c === Chars.SQUOTE;
  }
  149. var Sniffer = /** @class */ (function () {
  /**
   * Creates a sniffer in its initial state.
   *
   * The result starts as `windows-1252` / `ResultType.DEFAULT`. Encodings
   * passed in are applied through `setResult` in this order: `userEncoding`,
   * `transportLayerEncodingLabel` (both as `PASSED`), then `defaultEncoding`
   * (as `DEFAULT`).
   *
   * @param {object} [_a] Options.
   * @param {number} [_a.maxBytes=1024] Stored as `this.maxBytes`.
   * @param {string} [_a.userEncoding] Encoding label supplied by the user.
   * @param {string} [_a.transportLayerEncodingLabel] Encoding label from the transport layer.
   * @param {string} [_a.defaultEncoding] Fallback encoding label.
   */
  function Sniffer(_a) {
    var options = _a === void 0 ? {} : _a;
    var maxBytes = options.maxBytes === void 0 ? 1024 : options.maxBytes;
    var userEncoding = options.userEncoding;
    var transportLayerEncodingLabel = options.transportLayerEncodingLabel;
    var defaultEncoding = options.defaultEncoding;
    /** The offset of the previous buffers. */
    this.offset = 0;
    this.state = State.Begin;
    this.sectionIndex = 0;
    this.attribType = AttribType.None;
    /**
     * Indicates if the `http-equiv` is `content-type`.
     *
     * Initially `null`, a boolean when a value is found.
     */
    this.gotPragma = null;
    // Encoding label from a `content` attribute, held until the pragma is seen.
    this.needsPragma = null;
    this.inMetaTag = false;
    this.encoding = "windows-1252";
    this.resultType = ResultType.DEFAULT;
    this.quoteCharacter = 0;
    this.attributeValue = [];
    this.maxBytes = maxBytes;
    if (userEncoding) {
      this.setResult(userEncoding, ResultType.PASSED);
    }
    if (transportLayerEncodingLabel) {
      this.setResult(transportLayerEncodingLabel, ResultType.PASSED);
    }
    if (defaultEncoding) {
      this.setResult(defaultEncoding, ResultType.DEFAULT);
    }
  }
  /**
   * Records a detected encoding unless an equal- or higher-priority result
   * is already stored.
   *
   * The stored result is replaced only while it is `DEFAULT`, or when `type`
   * is numerically smaller than the stored type. Labels that `labelToName`
   * does not resolve are ignored.
   *
   * @param {string} label Encoding label.
   * @param {number} type One of `ResultType`.
   */
  Sniffer.prototype.setResult = function (label, type) {
    var replaceable =
      this.resultType === ResultType.DEFAULT || this.resultType > type;
    if (!replaceable) return;
    var encoding = labelToName(label);
    if (!encoding) return;
    var fromMeta = type === ResultType.META_TAG;
    var fromDocument = fromMeta || type === ResultType.XML_ENCODING;
    if (fromMeta && encoding === "x-user-defined") {
      // `x-user-defined` in a meta tag is mapped to windows-1252.
      this.encoding = "windows-1252";
    } else if (
      fromDocument &&
      (encoding === "UTF-16LE" || encoding === "UTF-16BE")
    ) {
      // A UTF-16 label in a meta tag or XML declaration is mapped to UTF-8.
      this.encoding = "UTF-8";
    } else {
      this.encoding = encoding;
    }
    this.resultType = type;
  };
  /**
   * Handles the very first byte: the start of a BOM, a UTF-16BE XML prefix
   * (leading NUL), a `<`, or anything else.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateBegin = function (c) {
    if (c === STRINGS.UTF16BE_BOM[0]) {
      this.state = State.BOM16BE;
    } else if (c === STRINGS.UTF16LE_BOM[0]) {
      this.state = State.BOM16LE;
    } else if (c === STRINGS.UTF8_BOM[0]) {
      // First BOM byte is consumed; continue matching at index 1.
      this.sectionIndex = 1;
      this.state = State.BOM8;
    } else if (c === Chars.NIL) {
      this.state = State.UTF16BE_XML_PREFIX;
      this.sectionIndex = 1;
    } else if (c === Chars.LT) {
      this.state = State.BeginLT;
    } else {
      this.state = State.BeforeTag;
    }
  };
  /**
   * Handles the byte after a leading `<`: a NUL continues a UTF-16LE XML
   * prefix, `?` continues an XML declaration, anything else is treated as
   * the start of a tag name.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateBeginLT = function (c) {
    switch (c) {
      case Chars.NIL: {
        this.state = State.UTF16LE_XML_PREFIX;
        // `<` and NUL are already matched.
        this.sectionIndex = 2;
        break;
      }
      case Chars.QUESTION: {
        this.state = State.XMLDeclaration;
        // `<?` is already matched.
        this.sectionIndex = 2;
        break;
      }
      default: {
        this.state = State.BeforeTagName;
        this.stateBeforeTagName(c);
      }
    }
  };
  /**
   * Matches the remainder of the UTF-16BE encoded `<?x` prefix. Reports
   * `utf-16be` once the whole prefix is seen; on a mismatch, falls back to
   * `BeforeTag` and reconsumes the byte.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateUTF16BE_XML_PREFIX = function (c) {
    var prefix = STRINGS.UTF16BE_XML_PREFIX;
    if (!this.advanceSection(prefix, c)) {
      this.state = State.BeforeTag;
      this.stateBeforeTag(c);
      return;
    }
    if (this.sectionIndex === prefix.length) {
      // We have the whole prefix
      this.setResult("utf-16be", ResultType.XML_PREFIX);
    }
  };
  /**
   * Matches the remainder of the UTF-16LE encoded `<?x` prefix. Reports
   * `utf-16le` once the whole prefix is seen; on a mismatch, falls back to
   * `BeforeTag` and reconsumes the byte.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateUTF16LE_XML_PREFIX = function (c) {
    var prefix = STRINGS.UTF16LE_XML_PREFIX;
    if (!this.advanceSection(prefix, c)) {
      this.state = State.BeforeTag;
      this.stateBeforeTag(c);
      return;
    }
    if (this.sectionIndex === prefix.length) {
      // We have the whole prefix
      this.setResult("utf-16le", ResultType.XML_PREFIX);
    }
  };
  /**
   * Checks the second byte of a UTF-16LE BOM. Reports `utf-16le` on a
   * match; otherwise falls back to `BeforeTag` and reconsumes the byte.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateBOM16LE = function (c) {
    if (c !== STRINGS.UTF16LE_BOM[1]) {
      this.state = State.BeforeTag;
      this.stateBeforeTag(c);
      return;
    }
    this.setResult("utf-16le", ResultType.BOM);
  };
  /**
   * Checks the second byte of a UTF-16BE BOM. Reports `utf-16be` on a
   * match; otherwise falls back to `BeforeTag` and reconsumes the byte.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateBOM16BE = function (c) {
    if (c !== STRINGS.UTF16BE_BOM[1]) {
      this.state = State.BeforeTag;
      this.stateBeforeTag(c);
      return;
    }
    this.setResult("utf-16be", ResultType.BOM);
  };
  /**
   * Matches the remaining bytes of a UTF-8 BOM and reports `utf-8` once the
   * whole BOM is seen.
   *
   * On a mismatch the machine falls back to `BeforeTag` and reconsumes the
   * byte, like the other BOM/prefix states. Without the fallback the machine
   * would stay in `BOM8` for the rest of the input, never scanning tags and
   * accepting a BOM byte sequence found later in the stream.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateBOM8 = function (c) {
    if (this.advanceSection(STRINGS.UTF8_BOM, c)) {
      if (this.sectionIndex === STRINGS.UTF8_BOM.length) {
        this.setResult("utf-8", ResultType.BOM);
      }
    } else {
      this.state = State.BeforeTag;
      this.stateBeforeTag(c);
    }
  };
  /**
   * Waits for a `<`. When one is seen, moves to `BeforeTagName` and clears
   * the meta-tag flag.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateBeforeTag = function (c) {
    if (c !== Chars.LT) return;
    this.state = State.BeforeTagName;
    this.inMetaTag = false;
  };
  /**
   * We have seen a `<`, and now have to figure out what to do.
   *
   * Options:
   *  - `<meta` (a letter matching `m` case-insensitively starts the match)
   *  - Any other tag
   *  - A closing tag (`/`)
   *  - `<!--` (`!`)
   *  - An XML declaration or similar (`?`), skipped as a weird tag
   *
   * Any other byte returns to `BeforeTag` and is reconsumed there.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateBeforeTagName = function (c) {
    if (isAsciiAlpha(c)) {
      if ((c | 0x20) === STRINGS.META[0]) {
        this.sectionIndex = 1;
        this.state = State.TagNameMeta;
      } else {
        this.state = State.TagNameOther;
      }
      return;
    }
    if (c === Chars.SLASH) {
      this.state = State.BeforeCloseTagName;
    } else if (c === Chars.EXCLAMATION) {
      this.state = State.CommentStart;
      // `<!` of `<!--` is already matched.
      this.sectionIndex = 2;
    } else if (c === Chars.QUESTION) {
      this.state = State.WeirdTag;
    } else {
      this.state = State.BeforeTag;
      this.stateBeforeTag(c);
    }
  };
  /**
   * Handles the byte after `</`.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateBeforeCloseTagName = function (c) {
    if (isAsciiAlpha(c)) {
      // Switch to `TagNameOther`; the HTML spec allows attributes here as well.
      this.state = State.TagNameOther;
    } else {
      this.state = State.WeirdTag;
    }
  };
  /**
   * Matches the rest of `<!--`. Anything else after `<!` is treated as a
   * weird tag and the byte is reconsumed there.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateCommentStart = function (c) {
    if (!this.advanceSection(STRINGS.COMMENT_START, c)) {
      this.state = State.WeirdTag;
      this.stateWeirdTag(c);
      return;
    }
    if (this.sectionIndex === STRINGS.COMMENT_START.length) {
      this.state = State.CommentEnd;
      // The -- of the comment start can be part of the end.
      this.sectionIndex = 2;
    }
  };
  /**
   * Scans a comment body for `-->` and returns to `BeforeTag` once found.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateCommentEnd = function (c) {
    if (this.advanceSection(STRINGS.COMMENT_END, c)) {
      if (this.sectionIndex === STRINGS.COMMENT_END.length) {
        this.state = State.BeforeTag;
      }
      return;
    }
    if (c === Chars.DASH) {
      /*
       * If we are here, we know we expected a `>` above.
       * Set this to 2, to support many dashes before the closing `>`.
       */
      this.sectionIndex = 2;
    }
  };
  /**
   * Any section starting with `<!`, `<?`, `</`, without being a closing tag or comment.
   *
   * Skips bytes until `>`, then returns to `BeforeTag`.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateWeirdTag = function (c) {
    if (c !== Chars.GT) return;
    this.state = State.BeforeTag;
  };
  /**
   * Advances the section, ignoring upper/lower case.
   *
   * The byte is OR-ed with 0x20 before comparison, so `section` is expected
   * to hold lower-case ASCII.
   *
   * Make sure the section has left-over characters before calling.
   *
   * @param {Uint8Array} section Sequence being matched.
   * @param {number} c Current byte.
   * @returns `false` if we did not match the section.
   */
  Sniffer.prototype.advanceSectionIC = function (section, c) {
    var lowered = c | 0x20;
    return this.advanceSection(section, lowered);
  };
  /**
   * Advances the section.
   *
   * Compares `c` with `section[this.sectionIndex]`; on a match the index is
   * incremented, otherwise it is reset to 0.
   *
   * Make sure the section has left-over characters before calling.
   *
   * @param {Uint8Array} section Sequence being matched.
   * @param {number} c Current byte.
   * @returns `false` if we did not match the section.
   */
  Sniffer.prototype.advanceSection = function (section, c) {
    var matched = section[this.sectionIndex] === c;
    this.sectionIndex = matched ? this.sectionIndex + 1 : 0;
    return matched;
  };
  /**
   * Matches the rest of a tag name that started with `m`.
   *
   * Once `meta` is complete, a following whitespace byte or `/` marks the
   * tag as a meta tag (the pre-scan algorithm accepts 0x2F after `meta` as
   * well as whitespace) and resets the pragma bookkeeping. Any other
   * continuation is handled as an ordinary tag name.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateTagNameMeta = function (c) {
    if (this.sectionIndex < STRINGS.META.length) {
      if (this.advanceSectionIC(STRINGS.META, c)) {
        return;
      }
    } else if (SPACE_CHARACTERS.has(c) || c === Chars.SLASH) {
      this.inMetaTag = true;
      this.gotPragma = null;
      this.needsPragma = null;
      this.state = State.BeforeAttribute;
      return;
    }
    this.state = State.TagNameOther;
    // Reconsume in case there is a `>`.
    this.stateTagNameOther(c);
  };
  /**
   * Skips a non-meta tag name: whitespace moves on to attributes, `>` ends
   * the tag, everything else is ignored.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateTagNameOther = function (c) {
    if (SPACE_CHARACTERS.has(c)) {
      this.state = State.BeforeAttribute;
      return;
    }
    if (c === Chars.GT) {
      this.state = State.BeforeTag;
    }
  };
  /**
   * Handles the first byte of an attribute name.
   *
   * Inside a meta tag, `h` starts matching `http-equiv` and `c` starts
   * matching `charset`/`content` (both case-insensitively). `/` and `>`
   * leave the tag; anything else is an attribute we do not care about.
   *
   * The attribute type is reset here so that a type recorded for an earlier
   * attribute cannot be applied to the value of this one (or of an
   * attribute in a later, non-meta tag).
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateBeforeAttribute = function (c) {
    if (SPACE_CHARACTERS.has(c)) return;
    // A new attribute (or the end of the tag) starts here.
    this.attribType = AttribType.None;
    if (this.inMetaTag) {
      var lower = c | 0x20;
      if (lower === STRINGS.HTTP_EQUIV[0]) {
        this.sectionIndex = 1;
        this.state = State.MetaAttribHttpEquiv;
        return;
      }
      if (lower === STRINGS.CHARSET[0]) {
        // `charset` and `content` share their first byte.
        this.sectionIndex = 1;
        this.state = State.MetaAttribC;
        return;
      }
    }
    this.state =
      c === Chars.SLASH || c === Chars.GT
        ? State.BeforeTag
        : State.AnyAttribName;
  };
  /**
   * Shared matcher for the tracked meta attribute names.
   *
   * While `c` keeps matching `section` (case-insensitively) the match
   * advances; once the whole name is matched, the attribute type is recorded
   * and the machine waits for the end of the name. On a mismatch the name is
   * handled as an arbitrary attribute and the byte is reconsumed.
   *
   * @param {number} c Current byte.
   * @param {Uint8Array} section Attribute name being matched.
   * @param {number} type `AttribType` to record on a full match.
   */
  Sniffer.prototype.handleMetaAttrib = function (c, section, type) {
    if (!this.advanceSectionIC(section, c)) {
      this.state = State.AnyAttribName;
      this.stateAnyAttribName(c);
      return;
    }
    if (this.sectionIndex === section.length) {
      this.attribType = type;
      this.state = State.MetaAttribAfterName;
    }
  };
  /**
   * Matches the attribute name `http-equiv`.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaAttribHttpEquiv = function (c) {
    this.handleMetaAttrib(c, STRINGS.HTTP_EQUIV, AttribType.HttpEquiv);
  };
  /**
   * Disambiguates an attribute name starting with `c` by its second byte:
   * `h` continues with `charset`, `o` with `content`; anything else is an
   * arbitrary attribute and the byte is reconsumed.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaAttribC = function (c) {
    var lower = c | 0x20;
    var next;
    if (lower === STRINGS.CHARSET[1]) {
      next = State.MetaAttribCharset;
    } else if (lower === STRINGS.CONTENT[1]) {
      next = State.MetaAttribContent;
    } else {
      this.state = State.AnyAttribName;
      this.stateAnyAttribName(c);
      return;
    }
    // Two bytes of the name are matched.
    this.sectionIndex = 2;
    this.state = next;
  };
  /**
   * Matches the attribute name `charset`.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaAttribCharset = function (c) {
    this.handleMetaAttrib(c, STRINGS.CHARSET, AttribType.Charset);
  };
  /**
   * Matches the attribute name `content`.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaAttribContent = function (c) {
    this.handleMetaAttrib(c, STRINGS.CONTENT, AttribType.Content);
  };
  /**
   * Handles the byte right after a fully matched meta attribute name.
   *
   * Whitespace or `=` confirms the name. Any other byte means the name is
   * longer than the tracked one (e.g. `charsetx`), so the recorded attribute
   * type is cleared before the rest is skipped as an arbitrary attribute;
   * otherwise its value would be interpreted as if it belonged to the
   * tracked attribute.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaAttribAfterName = function (c) {
    if (SPACE_CHARACTERS.has(c) || c === Chars.EQUALS) {
      this.state = State.AfterAttributeName;
      this.stateAfterAttributeName(c);
    } else {
      this.attribType = AttribType.None;
      this.state = State.AnyAttribName;
      this.stateAnyAttribName(c);
    }
  };
  /**
   * Skips the name of an attribute we do not track.
   *
   * Whitespace ends the name (and clears the attribute type), `/` or `>`
   * leaves the tag, `=` starts the value; other bytes are ignored.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateAnyAttribName = function (c) {
    if (SPACE_CHARACTERS.has(c)) {
      this.attribType = AttribType.None;
      this.state = State.AfterAttributeName;
      return;
    }
    switch (c) {
      case Chars.SLASH:
      case Chars.GT: {
        this.state = State.BeforeTag;
        break;
      }
      case Chars.EQUALS: {
        this.state = State.BeforeAttributeValue;
        break;
      }
      default:
      // Still inside the attribute name.
    }
  };
  /**
   * After an attribute name: skips whitespace, then either starts the value
   * on `=` or treats the byte as the start of the next attribute.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateAfterAttributeName = function (c) {
    if (SPACE_CHARACTERS.has(c)) return;
    if (c !== Chars.EQUALS) {
      // The attribute had no value; reconsume as the next attribute.
      this.state = State.BeforeAttribute;
      this.stateBeforeAttribute(c);
      return;
    }
    this.state = State.BeforeAttributeValue;
  };
  /**
   * After `=`: skips whitespace, resets the value buffer and section index,
   * then picks the value state from the attribute type and from whether the
   * value is quoted. For unquoted values the first byte is reconsumed by the
   * chosen state.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateBeforeAttributeValue = function (c) {
    if (SPACE_CHARACTERS.has(c)) return;
    this.attributeValue.length = 0;
    this.sectionIndex = 0;
    var type = this.attribType;
    if (isQuote(c)) {
      this.quoteCharacter = c;
      if (type === AttribType.Content) {
        this.state = State.MetaContentValueQuotedBeforeEncoding;
      } else if (type === AttribType.HttpEquiv) {
        this.state = State.MetaAttribHttpEquivValue;
      } else {
        this.state = State.AttributeValueQuoted;
      }
      return;
    }
    if (type === AttribType.Content) {
      this.state = State.MetaContentValueUnquotedBeforeEncoding;
      this.stateMetaContentValueUnquotedBeforeEncoding(c);
    } else if (type === AttribType.HttpEquiv) {
      // We use `quoteCharacter = 0` to signify that the value is unquoted.
      this.quoteCharacter = 0;
      this.sectionIndex = 0;
      this.state = State.MetaAttribHttpEquivValue;
      this.stateMetaAttribHttpEquivValue(c);
    } else {
      this.state = State.AttributeValueUnquoted;
      this.stateAttributeValueUnquoted(c);
    }
  };
  /**
   * Matches the value of `http-equiv`, which has to be `content-type`
   * (case-insensitive).
   *
   * When the value is complete and properly terminated, a charset held back
   * from an earlier `content` attribute (`needsPragma`) is reported; if there
   * is none, `gotPragma` is set (unless a value was already recorded). Any
   * other value records `gotPragma = false` and is skipped as an ordinary
   * attribute value.
   *
   * An unquoted value terminated by `>` also ends the tag, so the machine
   * returns to `BeforeTag` in that case instead of waiting for more
   * attributes.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaAttribHttpEquivValue = function (c) {
    var unquoted = this.quoteCharacter === 0;
    if (this.sectionIndex === STRINGS.CONTENT_TYPE.length) {
      var terminated = unquoted
        ? END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c)
        : c === this.quoteCharacter;
      if (terminated) {
        if (this.needsPragma !== null) {
          this.setResult(this.needsPragma, ResultType.META_TAG);
        } else if (this.gotPragma === null) {
          this.gotPragma = true;
        }
        this.state =
          unquoted && c === Chars.GT
            ? State.BeforeTag
            : State.BeforeAttribute;
        return;
      }
    } else if (this.advanceSectionIC(STRINGS.CONTENT_TYPE, c)) {
      return;
    }
    // Not `content-type`: skip the rest of the value.
    this.gotPragma = false;
    if (unquoted) {
      this.state = State.AttributeValueUnquoted;
      this.stateAttributeValueUnquoted(c);
    } else {
      this.state = State.AttributeValueQuoted;
      this.stateAttributeValueQuoted(c);
    }
  };
  /**
   * Consumes the charset collected from a `content` attribute.
   *
   * If the `content-type` pragma was already seen, the label is reported
   * right away; otherwise the first such label is held in `needsPragma`
   * until the pragma shows up. The value buffer is cleared afterwards.
   * Nothing happens for an empty buffer.
   */
  Sniffer.prototype.handleMetaContentValue = function () {
    var codes = this.attributeValue;
    if (codes.length === 0) return;
    var label = String.fromCharCode.apply(String, codes);
    if (this.gotPragma) {
      this.setResult(label, ResultType.META_TAG);
    } else if (this.needsPragma === null) {
      // Don't override a previous result.
      this.needsPragma = label;
    }
    codes.length = 0;
  };
  /**
   * Reports the collected attribute value as the encoding when the current
   * attribute is `charset`; does nothing for other attributes.
   */
  Sniffer.prototype.handleAttributeValue = function () {
    if (this.attribType !== AttribType.Charset) return;
    var label = String.fromCharCode.apply(String, this.attributeValue);
    this.setResult(label, ResultType.META_TAG);
  };
  /**
   * Reads an unquoted attribute value.
   *
   * Whitespace ends the value, `/` or `>` ends the value and leaves the tag.
   * For a `charset` attribute the bytes are collected with `A`–`Z` folded to
   * lower case. Only letters are folded: OR-ing every byte with 0x20 would
   * turn `_` (0x5F) into 0x7F and break labels such as `shift_jis`.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateAttributeValueUnquoted = function (c) {
    if (SPACE_CHARACTERS.has(c)) {
      this.handleAttributeValue();
      this.state = State.BeforeAttribute;
    } else if (c === Chars.SLASH || c === Chars.GT) {
      this.handleAttributeValue();
      this.state = State.BeforeTag;
    } else if (this.attribType === AttribType.Charset) {
      this.attributeValue.push(
        c >= Chars.UpperA && c <= Chars.UpperZ ? c | 0x20 : c
      );
    }
  };
  /**
   * Searches a `content` value for the word `charset` (case-insensitive).
   *
   * After a mismatch the search restarts; if the mismatching byte is itself
   * a `c` (in either case, consistent with the case-insensitive matching) it
   * counts as the first byte of a new attempt.
   *
   * @param {number} c Current byte.
   * @returns {boolean} `true` once the whole word has just been matched.
   */
  Sniffer.prototype.findMetaContentEncoding = function (c) {
    if (this.advanceSectionIC(STRINGS.CHARSET, c)) {
      return this.sectionIndex === STRINGS.CHARSET.length;
    }
    // If we encountered another `c`, assume we started over.
    this.sectionIndex = Number((c | 0x20) === STRINGS.CHARSET[0]);
    return false;
  };
  /**
   * Unquoted `content` value, before `charset=` has been found.
   *
   * A terminator ends the attribute value. Once `charset` has been matched,
   * an `=` moves on to the encoding value; before that, bytes are fed to the
   * `charset` search.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaContentValueUnquotedBeforeEncoding = function (c) {
    if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c)) {
      this.stateAttributeValueUnquoted(c);
      return;
    }
    if (this.sectionIndex !== STRINGS.CHARSET.length) {
      this.findMetaContentEncoding(c);
      return;
    }
    if (c === Chars.EQUALS) {
      this.state = State.MetaContentValueUnquotedBeforeValue;
    }
  };
  /**
   * Unquoted `content` value, right after `charset=`: decides whether the
   * encoding label itself is quoted.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaContentValueUnquotedBeforeValue = function (c) {
    if (isQuote(c)) {
      this.quoteCharacter = c;
      this.state = State.MetaContentValueUnquotedValueQuoted;
      return;
    }
    if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c)) {
      // Can't have spaces here, as it would no longer be part of the attribute value.
      this.stateAttributeValueUnquoted(c);
      return;
    }
    this.state = State.MetaContentValueUnquotedValueUnquoted;
    this.stateMetaContentValueUnquotedValueUnquoted(c);
  };
  /**
   * Unquoted `content` value, inside a quoted encoding label.
   *
   * The label is collected with `A`–`Z` folded to lower case (other bytes
   * such as `_` are kept as-is) until the matching quote. A terminator
   * before the closing quote abandons the label.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaContentValueUnquotedValueQuoted = function (c) {
    if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c)) {
      // Quotes weren't matched, so we're done.
      this.stateAttributeValueUnquoted(c);
    } else if (c === this.quoteCharacter) {
      this.handleMetaContentValue();
      this.state = State.AttributeValueUnquoted;
    } else {
      this.attributeValue.push(
        c >= Chars.UpperA && c <= Chars.UpperZ ? c | 0x20 : c
      );
    }
  };
  /**
   * Unquoted `content` value, inside an unquoted encoding label.
   *
   * A terminator or `;` ends the label; the byte is then reconsumed as part
   * of the remaining attribute value. Other bytes are collected with `A`–`Z`
   * folded to lower case (non-letters such as `_` are kept as-is).
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaContentValueUnquotedValueUnquoted = function (c) {
    if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c) || c === Chars.SEMICOLON) {
      this.handleMetaContentValue();
      this.state = State.AttributeValueUnquoted;
      this.stateAttributeValueUnquoted(c);
    } else {
      this.attributeValue.push(
        c >= Chars.UpperA && c <= Chars.UpperZ ? c | 0x20 : c
      );
    }
  };
  /**
   * Quoted `content` value, inside an unquoted encoding label.
   *
   * A quote, whitespace or `;` ends the label; the byte is then reconsumed
   * as part of the surrounding quoted value. Other bytes are collected with
   * `A`–`Z` folded to lower case (non-letters such as `_` are kept as-is).
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaContentValueQuotedValueUnquoted = function (c) {
    if (isQuote(c) || SPACE_CHARACTERS.has(c) || c === Chars.SEMICOLON) {
      this.handleMetaContentValue();
      // We are done with the value, but might not be at the end of the attribute
      this.state = State.AttributeValueQuoted;
      this.stateAttributeValueQuoted(c);
    } else {
      this.attributeValue.push(
        c >= Chars.UpperA && c <= Chars.UpperZ ? c | 0x20 : c
      );
    }
  };
  /**
   * Quoted `content` value, inside a quoted encoding label.
   *
   * Any quote ends the label. The label is only used when that quote differs
   * from the outer quote character, i.e. the inner quotes were closed before
   * the attribute value ended. The quote is then reconsumed as part of the
   * surrounding quoted value. Other bytes are collected with `A`–`Z` folded
   * to lower case (non-letters such as `_` are kept as-is).
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaContentValueQuotedValueQuoted = function (c) {
    if (!isQuote(c)) {
      this.attributeValue.push(
        c >= Chars.UpperA && c <= Chars.UpperZ ? c | 0x20 : c
      );
      return;
    }
    // We have reached the end of our value.
    if (c !== this.quoteCharacter) {
      // Only handle the value if inner quotes were matched.
      this.handleMetaContentValue();
    }
    this.state = State.AttributeValueQuoted;
    this.stateAttributeValueQuoted(c);
  };
  /**
   * Quoted `content` value, before `charset` has been found. The closing
   * quote ends the attribute value; other bytes are fed to the `charset`
   * search.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaContentValueQuotedBeforeEncoding = function (c) {
    if (c === this.quoteCharacter) {
      this.stateAttributeValueQuoted(c);
      return;
    }
    if (this.findMetaContentEncoding(c)) {
      this.state = State.MetaContentValueQuotedAfterEncoding;
    }
  };
  /**
   * Quoted `content` value, after the word `charset`: whitespace is skipped,
   * `=` moves on to the label, anything else restarts the search for
   * `charset` with the current byte.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaContentValueQuotedAfterEncoding = function (c) {
    if (c === Chars.EQUALS) {
      this.state = State.MetaContentValueQuotedBeforeValue;
      return;
    }
    if (SPACE_CHARACTERS.has(c)) return;
    // Look for the next encoding
    this.state = State.MetaContentValueQuotedBeforeEncoding;
    this.stateMetaContentValueQuotedBeforeEncoding(c);
  };
  /**
   * Quoted `content` value, after `charset=`: the outer quote ends the
   * attribute value, the other quote kind opens a quoted label, whitespace
   * is skipped, and any other byte starts an unquoted label.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateMetaContentValueQuotedBeforeValue = function (c) {
    if (c === this.quoteCharacter) {
      this.stateAttributeValueQuoted(c);
      return;
    }
    if (isQuote(c)) {
      this.state = State.MetaContentValueQuotedValueQuoted;
      return;
    }
    if (SPACE_CHARACTERS.has(c)) return;
    this.state = State.MetaContentValueQuotedValueUnquoted;
    this.stateMetaContentValueQuotedValueUnquoted(c);
  };
  /**
   * Reads a quoted attribute value until the matching quote.
   *
   * For a `charset` attribute the bytes are collected with `A`–`Z` folded to
   * lower case. Only letters are folded: OR-ing every byte with 0x20 would
   * turn `_` (0x5F) into 0x7F and break labels such as `shift_jis`.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateAttributeValueQuoted = function (c) {
    if (c === this.quoteCharacter) {
      this.handleAttributeValue();
      this.state = State.BeforeAttribute;
    } else if (this.attribType === AttribType.Charset) {
      this.attributeValue.push(
        c >= Chars.UpperA && c <= Chars.UpperZ ? c | 0x20 : c
      );
    }
  };
  /**
   * Matches the rest of `<?xml` (see `STRINGS.XML_DECLARATION`).
   *
   * On a mismatch the construct is skipped as a weird tag. The mismatching
   * byte is reconsumed there so that a `>` at this position (e.g. `<?x>`)
   * ends the construct instead of being dropped, which would swallow the
   * following tag.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateXMLDeclaration = function (c) {
    if (this.advanceSection(STRINGS.XML_DECLARATION, c)) {
      if (this.sectionIndex === STRINGS.XML_DECLARATION.length) {
        // Start looking for `encoding` from scratch.
        this.sectionIndex = 0;
        this.state = State.XMLDeclarationBeforeEncoding;
      }
    } else {
      this.state = State.WeirdTag;
      this.stateWeirdTag(c);
    }
  };
  /**
   * Inside an XML declaration, searching for the word `encoding`
   * (case-sensitive). A `>` that is not part of a match ends the
   * declaration.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateXMLDeclarationBeforeEncoding = function (c) {
    if (this.advanceSection(STRINGS.ENCODING, c)) {
      if (this.sectionIndex === STRINGS.ENCODING.length) {
        this.state = State.XMLDeclarationAfterEncoding;
      }
      return;
    }
    if (c === Chars.GT) {
      this.state = State.BeforeTag;
      return;
    }
    // If we encountered another `e`, assume we started over.
    this.sectionIndex = Number(c === STRINGS.ENCODING[0]);
  };
  /**
   * After the word `encoding`: `=` moves on to the value, bytes up to and
   * including the space character (0x20) are skipped, and anything else
   * abandons the declaration as a weird tag (reconsuming the byte).
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateXMLDeclarationAfterEncoding = function (c) {
    if (c === Chars.EQUALS) {
      this.state = State.XMLDeclarationBeforeValue;
      return;
    }
    if (c > Chars.SPACE) {
      this.state = State.WeirdTag;
      this.stateWeirdTag(c);
    }
  };
  /**
   * After `encoding=`: a quote starts the value (and clears the value
   * buffer), bytes up to and including the space character (0x20) are
   * skipped, and anything else abandons the declaration as a weird tag
   * (reconsuming the byte).
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateXMLDeclarationBeforeValue = function (c) {
    if (isQuote(c)) {
      this.attributeValue.length = 0;
      this.state = State.XMLDeclarationValue;
      return;
    }
    if (c > Chars.SPACE) {
      this.state = State.WeirdTag;
      this.stateWeirdTag(c);
    }
  };
  /**
   * Reads the quoted value of `encoding` in an XML declaration.
   *
   * Any quote ends the value and reports it as `XML_ENCODING`. A `>` ends
   * the declaration and a byte at or below the space character (0x20)
   * abandons the value; neither reports a result. Other bytes are collected
   * with `A`–`Z` folded to lower case. Only letters are folded: OR-ing every
   * byte with 0x20 would turn `_` (0x5F) into 0x7F and break labels such as
   * `shift_jis`.
   *
   * @param {number} c Current byte.
   */
  Sniffer.prototype.stateXMLDeclarationValue = function (c) {
    if (isQuote(c)) {
      this.setResult(
        String.fromCharCode.apply(String, this.attributeValue),
        ResultType.XML_ENCODING
      );
      this.state = State.WeirdTag;
    } else if (c === Chars.GT) {
      this.state = State.BeforeTag;
    } else if (c <= Chars.SPACE) {
      this.state = State.WeirdTag;
    } else {
      this.attributeValue.push(
        c >= Chars.UpperA && c <= Chars.UpperZ ? c | 0x20 : c
      );
    }
  };
  /**
   * Feeds a chunk of bytes through the state machine.
   *
   * Bytes are dispatched one at a time to the handler for the current state.
   * Processing stops at the end of the chunk, or as soon as the total number
   * of bytes consumed across all calls (`offset`) would reach `maxBytes`.
   * Afterwards `offset` is advanced by the number of bytes consumed from this
   * chunk.
   *
   * @param {Uint8Array} buffer Chunk to inspect; it is indexed by position
   *     and searched with `indexOf`.
   */
  Sniffer.prototype.write = function (buffer) {
      var index = 0;
      for (; index < buffer.length && this.offset + index < this.maxBytes; index++) {
          var c = buffer[index];
          switch (this.state) {
              case State.Begin:
                  this.stateBegin(c);
                  break;
              case State.BOM16BE:
                  this.stateBOM16BE(c);
                  break;
              case State.BOM16LE:
                  this.stateBOM16LE(c);
                  break;
              case State.BOM8:
                  this.stateBOM8(c);
                  break;
              case State.UTF16LE_XML_PREFIX:
                  this.stateUTF16LE_XML_PREFIX(c);
                  break;
              case State.BeginLT:
                  this.stateBeginLT(c);
                  break;
              case State.UTF16BE_XML_PREFIX:
                  this.stateUTF16BE_XML_PREFIX(c);
                  break;
              case State.BeforeTag: {
                  // Optimization: skip ahead to the next `<` instead of
                  // dispatching every byte. The skip is clamped to the same
                  // bound the loop uses, so `offset` never counts bytes that
                  // are not there and a `<` beyond `maxBytes` is not consumed.
                  var end = Math.min(buffer.length, this.maxBytes - this.offset);
                  var idx = buffer.indexOf(Chars.LT, index);
                  if (idx < 0 || idx >= end) {
                      // Nothing to find in the allowed range. Stay in this
                      // state; the loop increment moves `index` to `end`.
                      index = end - 1;
                  }
                  else {
                      index = idx;
                      this.stateBeforeTag(Chars.LT);
                  }
                  break;
              }
              case State.BeforeTagName:
                  this.stateBeforeTagName(c);
                  break;
              case State.BeforeCloseTagName:
                  this.stateBeforeCloseTagName(c);
                  break;
              case State.CommentStart:
                  this.stateCommentStart(c);
                  break;
              case State.CommentEnd:
                  this.stateCommentEnd(c);
                  break;
              case State.TagNameMeta:
                  this.stateTagNameMeta(c);
                  break;
              case State.TagNameOther:
                  this.stateTagNameOther(c);
                  break;
              case State.XMLDeclaration:
                  this.stateXMLDeclaration(c);
                  break;
              case State.XMLDeclarationBeforeEncoding:
                  this.stateXMLDeclarationBeforeEncoding(c);
                  break;
              case State.XMLDeclarationAfterEncoding:
                  this.stateXMLDeclarationAfterEncoding(c);
                  break;
              case State.XMLDeclarationBeforeValue:
                  this.stateXMLDeclarationBeforeValue(c);
                  break;
              case State.XMLDeclarationValue:
                  this.stateXMLDeclarationValue(c);
                  break;
              case State.WeirdTag:
                  this.stateWeirdTag(c);
                  break;
              case State.BeforeAttribute:
                  this.stateBeforeAttribute(c);
                  break;
              case State.MetaAttribHttpEquiv:
                  this.stateMetaAttribHttpEquiv(c);
                  break;
              case State.MetaAttribHttpEquivValue:
                  this.stateMetaAttribHttpEquivValue(c);
                  break;
              case State.MetaAttribC:
                  this.stateMetaAttribC(c);
                  break;
              case State.MetaAttribContent:
                  this.stateMetaAttribContent(c);
                  break;
              case State.MetaAttribCharset:
                  this.stateMetaAttribCharset(c);
                  break;
              case State.MetaAttribAfterName:
                  this.stateMetaAttribAfterName(c);
                  break;
              case State.MetaContentValueQuotedBeforeEncoding:
                  this.stateMetaContentValueQuotedBeforeEncoding(c);
                  break;
              case State.MetaContentValueQuotedAfterEncoding:
                  this.stateMetaContentValueQuotedAfterEncoding(c);
                  break;
              case State.MetaContentValueQuotedBeforeValue:
                  this.stateMetaContentValueQuotedBeforeValue(c);
                  break;
              case State.MetaContentValueQuotedValueQuoted:
                  this.stateMetaContentValueQuotedValueQuoted(c);
                  break;
              case State.MetaContentValueQuotedValueUnquoted:
                  this.stateMetaContentValueQuotedValueUnquoted(c);
                  break;
              case State.MetaContentValueUnquotedBeforeEncoding:
                  this.stateMetaContentValueUnquotedBeforeEncoding(c);
                  break;
              case State.MetaContentValueUnquotedBeforeValue:
                  this.stateMetaContentValueUnquotedBeforeValue(c);
                  break;
              case State.MetaContentValueUnquotedValueQuoted:
                  this.stateMetaContentValueUnquotedValueQuoted(c);
                  break;
              case State.MetaContentValueUnquotedValueUnquoted:
                  this.stateMetaContentValueUnquotedValueUnquoted(c);
                  break;
              case State.AnyAttribName:
                  this.stateAnyAttribName(c);
                  break;
              case State.AfterAttributeName:
                  this.stateAfterAttributeName(c);
                  break;
              case State.BeforeAttributeValue:
                  this.stateBeforeAttributeValue(c);
                  break;
              case State.AttributeValueQuoted:
                  this.stateAttributeValueQuoted(c);
                  break;
              default:
                  // (State.AttributeValueUnquoted)
                  this.stateAttributeValueUnquoted(c);
          }
      }
      this.offset += index;
  };
  979. return Sniffer;
  980. }());
  981. export { Sniffer };
  /**
   * Get the encoding for the passed buffer.
   *
   * Creates a fresh `Sniffer` with the given options, writes the buffer to it
   * once, and returns the sniffer's `encoding` property.
   *
   * @param buffer Bytes to inspect; passed to `Sniffer.prototype.write`.
   * @param options Passed through unchanged to the `Sniffer` constructor.
   * @returns The value of the sniffer's `encoding` property after the write.
   */
  export function getEncoding(buffer, options) {
      var detector = new Sniffer(options);
      detector.write(buffer);
      var encoding = detector.encoding;
      return encoding;
  }
  988. //# sourceMappingURL=sniffer.js.map