lite.cjs 8.6 KB


  1. 'use strict';
  2. var base64 = require('base64-js');
  /**
   * Normalizes a required module so its default export is reachable as `.default`.
   *
   * @param {*} e - The value returned by `require`.
   * @returns {*} `e` itself when it is truthy and flagged with `__esModule`;
   *   otherwise a fresh wrapper object `{ default: e }`.
   */
  function _interopDefault(e) {
    if (e && e.__esModule) {
      return e;
    }
    return { default: e };
  }
  4. var base64__default = /*#__PURE__*/_interopDefault(base64);
  // Field-definition helpers used to attach members to objects and classes.
  var __defProp = Object.defineProperty;

  /**
   * Sets `obj[key]` to `value`.
   * When `key` is already visible on `obj` (own or inherited), the value is
   * installed as an own enumerable/configurable/writable data property, so an
   * inherited accessor or read-only property cannot intercept it; otherwise a
   * plain assignment is used.
   */
  var __defNormalProp = (obj, key, value) => {
    if (key in obj) {
      return __defProp(obj, key, {
        enumerable: true,
        configurable: true,
        writable: true,
        value
      });
    }
    return obj[key] = value;
  };

  /**
   * Defines a public field on `obj` and returns the assigned value.
   * Non-symbol keys are coerced to strings before being used.
   */
  var __publicField = (obj, key, value) => {
    const normalizedKey = typeof key === "symbol" ? key : key + "";
    __defNormalProp(obj, normalizedKey, value);
    return value;
  };
  11. // src/core.ts
  /**
   * Greedily merges adjacent spans of `piece` in order of lowest rank.
   *
   * Starts with one span per element. On every pass it looks up the rank of
   * each pair of neighbouring spans (keyed by the covered elements joined with
   * ","), merges the pair with the smallest rank (the leftmost wins ties), and
   * repeats until a single span remains or no neighbouring pair has a rank.
   *
   * @param {ArrayLike<number>} piece - Sequence supporting `slice`, e.g. a Uint8Array.
   * @param {Map<string, number>} ranks - Rank lookup keyed by comma-joined elements.
   * @returns {{start: number, end: number}[]} Spans over `piece`; `end` is exclusive.
   */
  function bytePairMerge(piece, ranks) {
    const spans = Array.from({ length: piece.length }, (_unused, offset) => {
      return { start: offset, end: offset + 1 };
    });

    while (spans.length > 1) {
      let bestIndex = -1;
      let bestRank;

      for (let left = 0; left < spans.length - 1; left++) {
        const key = piece.slice(spans[left].start, spans[left + 1].end).join(",");
        const candidate = ranks.get(key);
        if (candidate == null) {
          continue;
        }
        if (bestIndex === -1 || candidate < bestRank) {
          bestRank = candidate;
          bestIndex = left;
        }
      }

      if (bestIndex === -1) {
        // No neighbouring pair is mergeable any more.
        break;
      }

      // Replace the two neighbours with one span covering both.
      spans.splice(bestIndex, 2, {
        start: spans[bestIndex].start,
        end: spans[bestIndex + 1].end
      });
    }

    return spans;
  }
  /**
   * Encodes a byte sequence into token ranks using byte-pair merging.
   *
   * A one-element piece is looked up directly; longer pieces are split by
   * `bytePairMerge` and each resulting span is looked up by its comma-joined
   * elements. Spans without a rank are omitted from the result. The
   * one-element fast path applies the same rule, so `undefined` is never
   * emitted as a token.
   *
   * @param {ArrayLike<number>} piece - Sequence supporting `slice`/`join`, e.g. a Uint8Array.
   * @param {Map<string, number>} ranks - Rank lookup keyed by comma-joined elements.
   * @returns {number[]} Ranks of the merged spans, in order.
   */
  function bytePairEncode(piece, ranks) {
    if (piece.length === 1) {
      const rank = ranks.get(piece.join(","));
      // Keep the fast path consistent with the filtered multi-byte path below.
      return rank == null ? [] : [rank];
    }
    return bytePairMerge(piece, ranks)
      .map((span) => ranks.get(piece.slice(span.start, span.end).join(",")))
      .filter((rank) => rank != null);
  }
  /**
   * Escapes regular-expression metacharacters so `str` can be embedded in a
   * pattern and matched literally.
   *
   * @param {string} str - Text to escape.
   * @returns {string} `str` with each of `\ ^ $ * + ? . ( ) | [ ] { }` prefixed by a backslash.
   */
  function escapeRegex(str) {
    const metacharacters = /[\\^$*+?.()|[\]{}]/g;
    const escaped = str.replace(metacharacters, "\\$&");
    return escaped;
  }
  /**
   * Byte-pair-encoding tokenizer built from a rank table, a split pattern and
   * a set of special tokens.
   */
  var _Tiktoken = class {
    /** @internal Special-token text -> rank. */
    specialTokens;
    /** @internal Special-token rank -> UTF-8 bytes of its text. */
    inverseSpecialTokens;
    /** @internal Source of the regex used to split text into pieces. */
    patStr;
    /** @internal */
    textEncoder = new TextEncoder();
    /** @internal */
    textDecoder = new TextDecoder("utf-8");
    /** @internal Comma-joined bytes -> rank. */
    rankMap = /* @__PURE__ */ new Map();
    /** @internal Rank -> bytes. */
    textMap = /* @__PURE__ */ new Map();

    /**
     * @param {{pat_str: string, bpe_ranks: string, special_tokens: Object}} ranks
     *   `bpe_ranks` is newline-separated; each non-empty line is split on
     *   spaces into an ignored first field, a decimal starting rank, and
     *   base64 tokens that receive consecutive ranks from that start.
     * @param {Object} [extendedSpecialTokens] - Extra special tokens; these
     *   override same-named entries of `ranks.special_tokens`.
     */
    constructor(ranks, extendedSpecialTokens) {
      this.patStr = ranks.pat_str;

      // Expand the compressed rank listing into a base64-token -> rank table.
      const rankByEncodedToken = {};
      for (const line of ranks.bpe_ranks.split("\n")) {
        if (!line) {
          continue;
        }
        const [, offsetText, ...encodedTokens] = line.split(" ");
        const firstRank = Number.parseInt(offsetText, 10);
        encodedTokens.forEach((encoded, position) => {
          rankByEncodedToken[encoded] = firstRank + position;
        });
      }

      for (const [encoded, rank] of Object.entries(rankByEncodedToken)) {
        const bytes = base64__default.default.toByteArray(encoded);
        this.rankMap.set(bytes.join(","), rank);
        this.textMap.set(rank, bytes);
      }

      this.specialTokens = { ...ranks.special_tokens, ...extendedSpecialTokens };

      const bytesBySpecialRank = {};
      for (const [text, rank] of Object.entries(this.specialTokens)) {
        bytesBySpecialRank[rank] = this.textEncoder.encode(text);
      }
      this.inverseSpecialTokens = bytesBySpecialRank;
    }

    /**
     * Converts text into a list of token ranks.
     *
     * @param {string} text - Text to tokenize.
     * @param {string[]|"all"} [allowedSpecial=[]] - Special tokens to emit as
     *   their own rank; `"all"` allows every known special token.
     * @param {string[]|"all"} [disallowedSpecial="all"] - Special tokens whose
     *   presence in `text` is an error; `"all"` means every known special
     *   token that is not allowed.
     * @returns {number[]} Token ranks in text order.
     * @throws {Error} When `text` contains a disallowed special token.
     */
    encode(text, allowedSpecial = [], disallowedSpecial = "all") {
      const piecePattern = new RegExp(this.patStr, "ug");
      const specialPattern = _Tiktoken.specialTokenRegex(
        Object.keys(this.specialTokens)
      );
      const output = [];

      const allowed = new Set(
        allowedSpecial === "all" ? Object.keys(this.specialTokens) : allowedSpecial
      );
      const disallowed = new Set(
        disallowedSpecial === "all"
          ? Object.keys(this.specialTokens).filter((name) => !allowed.has(name))
          : disallowedSpecial
      );

      if (disallowed.size > 0) {
        const disallowedPattern = _Tiktoken.specialTokenRegex([...disallowed]);
        const offending = text.match(disallowedPattern);
        if (offending != null) {
          throw new Error(
            `The text contains a special token that is not allowed: ${offending[0]}`
          );
        }
      }

      let cursor = 0;
      for (;;) {
        // Find the next allowed special token at or after `cursor`; special
        // tokens that are not allowed are stepped over and tokenized as text.
        let special = null;
        let searchFrom = cursor;
        for (;;) {
          specialPattern.lastIndex = searchFrom;
          special = specialPattern.exec(text);
          if (special == null || allowed.has(special[0])) {
            break;
          }
          searchFrom = special.index + 1;
        }

        // Tokenize the ordinary text that precedes the special token.
        const stop = special == null ? text.length : special.index;
        for (const [fragment] of text.substring(cursor, stop).matchAll(piecePattern)) {
          const bytes = this.textEncoder.encode(fragment);
          const direct = this.rankMap.get(bytes.join(","));
          if (direct == null) {
            output.push(...bytePairEncode(bytes, this.rankMap));
          } else {
            output.push(direct);
          }
        }

        if (special == null) {
          return output;
        }

        output.push(this.specialTokens[special[0]]);
        cursor = special.index + special[0].length;
      }
    }

    /**
     * Converts token ranks back into text.
     *
     * @param {ArrayLike<number>} tokens - Ranks to decode; ranks found in
     *   neither the rank table nor the special tokens are skipped.
     * @returns {string} The concatenated token bytes decoded as UTF-8.
     */
    decode(tokens) {
      const chunks = [];
      let totalLength = 0;
      for (let index = 0; index < tokens.length; ++index) {
        const rank = tokens[index];
        const bytes = this.textMap.get(rank) ?? this.inverseSpecialTokens[rank];
        if (bytes == null) {
          continue;
        }
        chunks.push(bytes);
        totalLength += bytes.length;
      }

      const merged = new Uint8Array(totalLength);
      chunks.reduce((offset, chunk) => {
        merged.set(chunk, offset);
        return offset + chunk.length;
      }, 0);
      return this.textDecoder.decode(merged);
    }
  };
  var Tiktoken = _Tiktoken;
  /**
   * Static `Tiktoken.specialTokenRegex(tokens)`: builds a global regex that
   * matches any of the given special-token strings literally.
   *
   * Empty strings are ignored. When no usable token remains, a regex that can
   * never match is returned: an empty alternation would compile to the empty
   * pattern, which matches at every index of any input.
   *
   * @param {string[]} tokens - Special-token texts.
   * @returns {RegExp} A regex with the `g` flag.
   */
  __publicField(Tiktoken, "specialTokenRegex", (tokens) => {
    const alternatives = tokens
      .filter((token) => token.length > 0)
      .map((token) => escapeRegex(token));
    if (alternatives.length === 0) {
      // `(?!)` is an empty negative lookahead: it fails at every position.
      return new RegExp("(?!)", "g");
    }
    return new RegExp(alternatives.join("|"), "g");
  });
  // Encoding name paired with the model identifiers that use it.
  const MODELS_BY_ENCODING = [
    ["gpt2", ["gpt2"]],
    [
      "p50k_base",
      [
        "code-cushman-001",
        "code-cushman-002",
        "code-davinci-001",
        "code-davinci-002",
        "cushman-codex",
        "davinci-codex",
        "davinci-002",
        "text-davinci-002",
        "text-davinci-003"
      ]
    ],
    ["p50k_edit", ["code-davinci-edit-001", "text-davinci-edit-001"]],
    [
      "r50k_base",
      [
        "ada",
        "babbage",
        "babbage-002",
        "code-search-ada-code-001",
        "code-search-babbage-code-001",
        "curie",
        "davinci",
        "text-ada-001",
        "text-babbage-001",
        "text-curie-001",
        "text-davinci-001",
        "text-search-ada-doc-001",
        "text-search-babbage-doc-001",
        "text-search-curie-doc-001",
        "text-search-davinci-doc-001",
        "text-similarity-ada-001",
        "text-similarity-babbage-001",
        "text-similarity-curie-001",
        "text-similarity-davinci-001"
      ]
    ],
    [
      "cl100k_base",
      [
        "gpt-3.5-turbo-instruct-0914",
        "gpt-3.5-turbo-instruct",
        "gpt-3.5-turbo-16k-0613",
        "gpt-3.5-turbo-16k",
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-0301",
        "gpt-3.5-turbo",
        "gpt-4-32k-0613",
        "gpt-4-32k-0314",
        "gpt-4-32k",
        "gpt-4-0613",
        "gpt-4-0314",
        "gpt-4",
        "gpt-3.5-turbo-1106",
        "gpt-35-turbo",
        "gpt-4-1106-preview",
        "gpt-4-vision-preview",
        "gpt-3.5-turbo-0125",
        "gpt-4-turbo",
        "gpt-4-turbo-2024-04-09",
        "gpt-4-turbo-preview",
        "gpt-4-0125-preview",
        "text-embedding-ada-002",
        "text-embedding-3-small",
        "text-embedding-3-large"
      ]
    ],
    [
      "o200k_base",
      [
        "gpt-4o",
        "gpt-4o-2024-05-13",
        "gpt-4o-2024-08-06",
        "gpt-4o-2024-11-20",
        "gpt-4o-mini-2024-07-18",
        "gpt-4o-mini",
        "gpt-4o-search-preview",
        "gpt-4o-search-preview-2025-03-11",
        "gpt-4o-mini-search-preview",
        "gpt-4o-mini-search-preview-2025-03-11",
        "gpt-4o-audio-preview",
        "gpt-4o-audio-preview-2024-12-17",
        "gpt-4o-audio-preview-2024-10-01",
        "gpt-4o-mini-audio-preview",
        "gpt-4o-mini-audio-preview-2024-12-17",
        "o1",
        "o1-2024-12-17",
        "o1-mini",
        "o1-mini-2024-09-12",
        "o1-preview",
        "o1-preview-2024-09-12",
        "o1-pro",
        "o1-pro-2025-03-19",
        "o3",
        "o3-2025-04-16",
        "o3-mini",
        "o3-mini-2025-01-31",
        "o4-mini",
        "o4-mini-2025-04-16",
        "chatgpt-4o-latest",
        "gpt-4o-realtime",
        "gpt-4o-realtime-preview-2024-10-01",
        "gpt-4o-realtime-preview-2024-12-17",
        "gpt-4o-mini-realtime-preview",
        "gpt-4o-mini-realtime-preview-2024-12-17",
        "gpt-4.1",
        "gpt-4.1-2025-04-14",
        "gpt-4.1-mini",
        "gpt-4.1-mini-2025-04-14",
        "gpt-4.1-nano",
        "gpt-4.1-nano-2025-04-14",
        "gpt-4.5-preview",
        "gpt-4.5-preview-2025-02-27"
      ]
    ]
  ];

  // Flattened model -> encoding index derived from the table above.
  const ENCODING_BY_MODEL = new Map(
    MODELS_BY_ENCODING.flatMap(([encoding, models]) =>
      models.map((model) => [model, encoding])
    )
  );

  /**
   * Returns the name of the encoding used by a model.
   *
   * @param {string} model - Exact model identifier.
   * @returns {string} One of "gpt2", "p50k_base", "p50k_edit", "r50k_base",
   *   "cl100k_base" or "o200k_base".
   * @throws {Error} "Unknown model" when the identifier is not in the table.
   */
  function getEncodingNameForModel(model) {
    const encoding = ENCODING_BY_MODEL.get(model);
    if (encoding === undefined) {
      throw new Error("Unknown model");
    }
    return encoding;
  }
  275. exports.Tiktoken = Tiktoken;
  276. exports.getEncodingNameForModel = getEncodingNameForModel;