chunk-ZDNLBERF.js 8.4 KB


  1. import base64 from 'base64-js';
  // Bundler-emitted helpers used to attach a field to an object after the
  // fact (see the `specialTokenRegex` attachment further down in this file).
  var __defProp = Object.defineProperty;

  /**
   * Sets `obj[key]` to `value`.
   *
   * When `key` is already reachable on `obj` (own or inherited) the property is
   * (re)defined as an own, enumerable, configurable, writable data property;
   * otherwise a plain assignment is performed.
   *
   * @returns `obj` when the property was defined, `value` when it was assigned.
   */
  var __defNormalProp = (obj, key, value) => {
    if (key in obj) {
      return __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value });
    }
    return obj[key] = value;
  };

  /**
   * Stores `value` under `key` on `obj` and hands `value` back.
   * Non-symbol keys are coerced to strings before being used.
   */
  var __publicField = (obj, key, value) => {
    const normalizedKey = typeof key === "symbol" ? key : key + "";
    __defNormalProp(obj, normalizedKey, value);
    return value;
  };
  8. // src/utils.ts
  /**
   * No-op: accepts a single argument, ignores it, and returns `undefined`.
   *
   * NOTE(review): presumably the runtime remnant of a compile-time
   * exhaustiveness check in the TypeScript source - not confirmable from
   * this file alone.
   */
  function never(_) {
    return undefined;
  }
  /**
   * Greedily merges adjacent byte ranges of `piece`, lowest rank first.
   *
   * Starts with one `{ start, end }` range per element of `piece`. On every
   * pass the adjacent pair whose combined bytes (looked up in `ranks` by their
   * comma-joined values) has the smallest rank is fused into a single range.
   * Ties keep the left-most pair. Stops when no adjacent pair is ranked or a
   * single range remains.
   *
   * @param piece Indexable byte sequence supporting `length`, `slice` and `join`.
   * @param ranks Map from comma-joined byte values to rank.
   * @returns Array of half-open `{ start, end }` ranges covering `piece`.
   */
  function bytePairMerge(piece, ranks) {
    const segments = [];
    for (let idx = 0; idx < piece.length; idx += 1) {
      segments.push({ start: idx, end: idx + 1 });
    }

    while (segments.length > 1) {
      let bestRank = null;
      let bestIndex = -1;

      for (let idx = 0; idx + 1 < segments.length; idx += 1) {
        const key = piece.slice(segments[idx].start, segments[idx + 1].end).join(",");
        const rank = ranks.get(key);
        if (rank != null && (bestRank == null || rank < bestRank)) {
          bestRank = rank;
          bestIndex = idx;
        }
      }

      // Nothing mergeable is left.
      if (bestIndex === -1) {
        break;
      }

      // Drop the right-hand neighbour and extend the left one over it.
      const [absorbed] = segments.splice(bestIndex + 1, 1);
      segments[bestIndex] = { start: segments[bestIndex].start, end: absorbed.end };
    }

    return segments;
  }
  /**
   * Converts a byte sequence into the ranks of its merged segments.
   *
   * A single-byte piece is looked up directly and its result is returned as a
   * one-element array (the element is `undefined` when the byte is unranked).
   * Longer pieces are partitioned with `bytePairMerge`; each resulting segment
   * is looked up in `ranks` and segments without a rank are left out.
   *
   * @param piece Indexable byte sequence supporting `length`, `slice` and `join`.
   * @param ranks Map from comma-joined byte values to rank.
   * @returns Array of ranks in segment order.
   */
  function bytePairEncode(piece, ranks) {
    const rankOf = (bytes) => ranks.get(bytes.join(","));

    if (piece.length === 1) {
      return [rankOf(piece)];
    }

    const encoded = [];
    for (const segment of bytePairMerge(piece, ranks)) {
      const rank = rankOf(piece.slice(segment.start, segment.end));
      if (rank != null) {
        encoded.push(rank);
      }
    }
    return encoded;
  }
  /**
   * Backslash-escapes the regular-expression metacharacters
   * `\ ^ $ * + ? . ( ) | [ ] { }` in `str` so the result matches `str`
   * literally when embedded in a pattern.
   *
   * @param str Text to escape.
   * @returns The escaped text.
   */
  function escapeRegex(str) {
    const metaCharacters = /[\\^$*+?.()|[\]{}]/g;
    const escaped = str.replace(metaCharacters, "\\$&");
    return escaped;
  }
  /**
   * Byte-pair tokenizer built from a rank table, a split pattern and a set of
   * special tokens.
   *
   * The static `specialTokenRegex` used by `encode` is attached to the class
   * outside of this class body.
   */
  var _Tiktoken = class {
    /** @internal Special-token text -> rank (built-in entries merged with caller extensions). */
    specialTokens;
    /** @internal Rank -> UTF-8 bytes of the special-token text. */
    inverseSpecialTokens;
    /** @internal Source of the regular expression that splits text into pieces. */
    patStr;
    /** @internal */
    textEncoder = new TextEncoder();
    /** @internal */
    textDecoder = new TextDecoder("utf-8");
    /** @internal Comma-joined byte values -> rank. */
    rankMap = /* @__PURE__ */ new Map();
    /** @internal Rank -> byte array. */
    textMap = /* @__PURE__ */ new Map();

    /**
     * @param ranks Object with `pat_str`, `bpe_ranks` and `special_tokens`.
     *   `bpe_ranks` is newline-separated; on every non-empty line the first
     *   space-separated field is ignored, the second is a base-10 starting
     *   rank, and each following field is a base64 token that receives the
     *   next consecutive rank.
     * @param extendedSpecialTokens Extra special tokens; they override
     *   same-named entries of `ranks.special_tokens`.
     */
    constructor(ranks, extendedSpecialTokens) {
      this.patStr = ranks.pat_str;

      // Expand the compact line format into a base64-token -> rank table.
      const rankByToken = {};
      for (const line of ranks.bpe_ranks.split("\n")) {
        if (!line) {
          continue;
        }
        const [, offsetText, ...encodedTokens] = line.split(" ");
        const firstRank = Number.parseInt(offsetText, 10);
        encodedTokens.forEach((encoded, position) => {
          rankByToken[encoded] = firstRank + position;
        });
      }

      for (const [encoded, rank] of Object.entries(rankByToken)) {
        const bytes = base64.toByteArray(encoded);
        this.rankMap.set(bytes.join(","), rank);
        this.textMap.set(rank, bytes);
      }

      this.specialTokens = { ...ranks.special_tokens, ...extendedSpecialTokens };

      const bytesByRank = {};
      for (const [text, rank] of Object.entries(this.specialTokens)) {
        bytesByRank[rank] = this.textEncoder.encode(text);
      }
      this.inverseSpecialTokens = bytesByRank;
    }

    /**
     * Tokenizes `text`.
     *
     * @param text String to encode.
     * @param allowedSpecial Special-token texts that are emitted as their own
     *   rank when found in `text`, or `"all"` for every known special token.
     * @param disallowedSpecial Special-token texts whose presence in `text` is
     *   an error, or `"all"` for every known special token not allowed above.
     * @returns Array of ranks.
     * @throws {Error} When `text` contains a disallowed special token.
     */
    encode(text, allowedSpecial = [], disallowedSpecial = "all") {
      const piecePattern = new RegExp(this.patStr, "ug");
      const specialRegex = _Tiktoken.specialTokenRegex(
        Object.keys(this.specialTokens)
      );
      const output = [];

      let allowedSource = allowedSpecial;
      if (allowedSpecial === "all") {
        allowedSource = Object.keys(this.specialTokens);
      }
      const allowedSpecialSet = new Set(allowedSource);

      let disallowedSource = disallowedSpecial;
      if (disallowedSpecial === "all") {
        disallowedSource = Object.keys(this.specialTokens).filter(
          (name) => !allowedSpecialSet.has(name)
        );
      }
      const disallowedSpecialSet = new Set(disallowedSource);

      if (disallowedSpecialSet.size > 0) {
        const forbidden = _Tiktoken.specialTokenRegex([...disallowedSpecialSet]);
        const offending = text.match(forbidden);
        if (offending != null) {
          throw new Error(
            `The text contains a special token that is not allowed: ${offending[0]}`
          );
        }
      }

      // Finds the first allowed special token at or after `from`. Matches that
      // are not allowed are skipped by restarting one character further on.
      const findAllowedSpecial = (from) => {
        let searchFrom = from;
        for (;;) {
          specialRegex.lastIndex = searchFrom;
          const hit = specialRegex.exec(text);
          if (hit == null || allowedSpecialSet.has(hit[0])) {
            return hit;
          }
          searchFrom = hit.index + 1;
        }
      };

      let cursor = 0;
      for (;;) {
        const special = findAllowedSpecial(cursor);
        const stop = special == null ? text.length : special.index;

        // Ordinary text up to the next special token (or the end of input).
        for (const [chunk] of text.substring(cursor, stop).matchAll(piecePattern)) {
          const bytes = this.textEncoder.encode(chunk);
          const direct = this.rankMap.get(bytes.join(","));
          if (direct == null) {
            output.push(...bytePairEncode(bytes, this.rankMap));
          } else {
            output.push(direct);
          }
        }

        if (special == null) {
          return output;
        }

        output.push(this.specialTokens[special[0]]);
        cursor = special.index + special[0].length;
      }
    }

    /**
     * Turns ranks back into text.
     *
     * Each rank is resolved through the regular rank table first and the
     * special-token table second; ranks found in neither are skipped. The
     * collected bytes are concatenated and decoded as UTF-8.
     *
     * @param tokens Indexable sequence of ranks.
     * @returns The decoded string.
     */
    decode(tokens) {
      const chunks = [];
      let byteCount = 0;
      for (let idx = 0; idx < tokens.length; idx += 1) {
        const rank = tokens[idx];
        const chunk = this.textMap.get(rank) ?? this.inverseSpecialTokens[rank];
        if (chunk == null) {
          continue;
        }
        chunks.push(chunk);
        byteCount += chunk.length;
      }

      const joined = new Uint8Array(byteCount);
      let offset = 0;
      chunks.forEach((chunk) => {
        joined.set(chunk, offset);
        offset += chunk.length;
      });
      return this.textDecoder.decode(joined);
    }
  };
  var Tiktoken = _Tiktoken;

  // Static helper: builds a global regular expression that matches any of the
  // given special-token texts literally (each one is escaped, then the
  // alternatives are joined with "|").
  __publicField(Tiktoken, "specialTokenRegex", (tokens) => {
    const alternatives = tokens.map((token) => escapeRegex(token));
    return new RegExp(alternatives.join("|"), "g");
  });
  /**
   * Returns the name of the token encoding used by a given model.
   *
   * @param {string} model Model identifier, e.g. `"gpt-4o"` or `"text-davinci-003"`.
   * @returns {"gpt2" | "p50k_base" | "p50k_edit" | "r50k_base" | "cl100k_base" | "o200k_base"}
   *   The encoding name for `model`.
   * @throws {Error} With message `"Unknown model"` when `model` is not listed.
   */
  function getEncodingNameForModel(model) {
    switch (model) {
      case "gpt2": {
        return "gpt2";
      }
      case "code-cushman-001":
      case "code-cushman-002":
      case "code-davinci-001":
      case "code-davinci-002":
      case "cushman-codex":
      case "davinci-codex":
      case "text-davinci-002":
      case "text-davinci-003": {
        return "p50k_base";
      }
      case "code-davinci-edit-001":
      case "text-davinci-edit-001": {
        return "p50k_edit";
      }
      case "ada":
      case "babbage":
      case "code-search-ada-code-001":
      case "code-search-babbage-code-001":
      case "curie":
      case "davinci":
      case "text-ada-001":
      case "text-babbage-001":
      case "text-curie-001":
      case "text-davinci-001":
      case "text-search-ada-doc-001":
      case "text-search-babbage-doc-001":
      case "text-search-curie-doc-001":
      case "text-search-davinci-doc-001":
      case "text-similarity-ada-001":
      case "text-similarity-babbage-001":
      case "text-similarity-curie-001":
      case "text-similarity-davinci-001": {
        return "r50k_base";
      }
      // "babbage-002" and "davinci-002" are the newer base models; despite the
      // legacy-looking names, the reference tiktoken model table maps both to
      // cl100k_base rather than r50k_base / p50k_base.
      case "babbage-002":
      case "davinci-002":
      case "gpt-3.5-turbo-instruct-0914":
      case "gpt-3.5-turbo-instruct":
      case "gpt-3.5-turbo-16k-0613":
      case "gpt-3.5-turbo-16k":
      case "gpt-3.5-turbo-0613":
      case "gpt-3.5-turbo-0301":
      case "gpt-3.5-turbo":
      case "gpt-4-32k-0613":
      case "gpt-4-32k-0314":
      case "gpt-4-32k":
      case "gpt-4-0613":
      case "gpt-4-0314":
      case "gpt-4":
      case "gpt-3.5-turbo-1106":
      case "gpt-35-turbo":
      case "gpt-4-1106-preview":
      case "gpt-4-vision-preview":
      case "gpt-3.5-turbo-0125":
      case "gpt-4-turbo":
      case "gpt-4-turbo-2024-04-09":
      case "gpt-4-turbo-preview":
      case "gpt-4-0125-preview":
      case "text-embedding-ada-002":
      case "text-embedding-3-small":
      case "text-embedding-3-large": {
        return "cl100k_base";
      }
      case "gpt-4o":
      case "gpt-4o-2024-05-13":
      case "gpt-4o-2024-08-06":
      case "gpt-4o-2024-11-20":
      case "gpt-4o-mini-2024-07-18":
      case "gpt-4o-mini":
      case "gpt-4o-search-preview":
      case "gpt-4o-search-preview-2025-03-11":
      case "gpt-4o-mini-search-preview":
      case "gpt-4o-mini-search-preview-2025-03-11":
      case "gpt-4o-audio-preview":
      case "gpt-4o-audio-preview-2024-12-17":
      case "gpt-4o-audio-preview-2024-10-01":
      case "gpt-4o-mini-audio-preview":
      case "gpt-4o-mini-audio-preview-2024-12-17":
      case "o1":
      case "o1-2024-12-17":
      case "o1-mini":
      case "o1-mini-2024-09-12":
      case "o1-preview":
      case "o1-preview-2024-09-12":
      case "o1-pro":
      case "o1-pro-2025-03-19":
      case "o3":
      case "o3-2025-04-16":
      case "o3-mini":
      case "o3-mini-2025-01-31":
      case "o4-mini":
      case "o4-mini-2025-04-16":
      case "chatgpt-4o-latest":
      case "gpt-4o-realtime":
      case "gpt-4o-realtime-preview-2024-10-01":
      case "gpt-4o-realtime-preview-2024-12-17":
      case "gpt-4o-mini-realtime-preview":
      case "gpt-4o-mini-realtime-preview-2024-12-17":
      case "gpt-4.1":
      case "gpt-4.1-2025-04-14":
      case "gpt-4.1-mini":
      case "gpt-4.1-mini-2025-04-14":
      case "gpt-4.1-nano":
      case "gpt-4.1-nano-2025-04-14":
      case "gpt-4.5-preview":
      case "gpt-4.5-preview-2025-02-27": {
        return "o200k_base";
      }
      default:
        throw new Error("Unknown model");
    }
  }
  274. export { Tiktoken, getEncodingNameForModel, never };