123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282 |
- 'use strict';
- var base64 = require('base64-js');
/**
 * Normalizes a required module so that it can always be read through `.default`.
 *
 * @param {*} e - The value returned by `require`.
 * @returns {*} `e` itself when it is truthy and flagged `__esModule`,
 *   otherwise a fresh `{ default: e }` wrapper.
 */
function _interopDefault(e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
- var base64__default = /*#__PURE__*/_interopDefault(base64);
// Local alias for Object.defineProperty.
var __defProp = Object.defineProperty;

/**
 * Sets `obj[key]` to `value`. When the key is already reachable on the object
 * (own or inherited) the property is defined explicitly as an enumerable,
 * configurable, writable data property; otherwise a plain assignment is used.
 *
 * @returns {*} The result of `Object.defineProperty` (the object) in the first
 *   case, or the assigned value in the second.
 */
var __defNormalProp = (obj, key, value) => {
  if (key in obj) {
    return __defProp(obj, key, {
      enumerable: true,
      configurable: true,
      writable: true,
      value
    });
  }
  return obj[key] = value;
};

/**
 * Assigns a public field on `obj` and returns the assigned value.
 * Non-symbol keys are coerced to strings before the property is set.
 */
var __publicField = (obj, key, value) => {
  const propertyKey = typeof key === "symbol" ? key : key + "";
  __defNormalProp(obj, propertyKey, value);
  return value;
};
- // src/core.ts
/**
 * Greedily merges adjacent byte ranges of `piece` in rank order.
 *
 * Starts with one `{ start, end }` span per element of `piece`. On every pass
 * the adjacent pair whose concatenated bytes (joined with ",") have the lowest
 * rank in `ranks` is fused into one span; ties go to the leftmost pair. Stops
 * when a single span remains or no adjacent pair has a rank.
 *
 * @param {ArrayLike<number>} piece - Byte sequence supporting `slice` and `join`.
 * @param {Map<string, number>} ranks - Rank keyed by comma-joined byte values.
 * @returns {{ start: number, end: number }[]} Half-open spans covering `piece`.
 */
function bytePairMerge(piece, ranks) {
  const spans = [];
  for (let offset = 0; offset < piece.length; offset += 1) {
    spans.push({ start: offset, end: offset + 1 });
  }

  // Rank of the byte run that starts at `left` and finishes at the end of `right`.
  const rankOfPair = (left, right) =>
    ranks.get(piece.slice(left.start, right.end).join(","));

  for (;;) {
    if (spans.length <= 1) {
      return spans;
    }

    let bestIndex = -1;
    let bestRank;
    for (let index = 0; index + 1 < spans.length; index += 1) {
      const candidate = rankOfPair(spans[index], spans[index + 1]);
      if (candidate == null) {
        continue;
      }
      // Strict comparison keeps the leftmost pair when ranks are equal.
      if (bestIndex === -1 || candidate < bestRank) {
        bestRank = candidate;
        bestIndex = index;
      }
    }

    if (bestIndex === -1) {
      return spans;
    }

    const fused = {
      start: spans[bestIndex].start,
      end: spans[bestIndex + 1].end
    };
    spans.splice(bestIndex, 2, fused);
  }
}
/**
 * Encodes a byte sequence into token ranks using byte-pair merging.
 *
 * A single-byte piece is looked up directly; longer pieces are split by
 * `bytePairMerge` and each resulting span is looked up. Spans with no entry in
 * `ranks` are omitted on both paths, so the result never contains `undefined`.
 *
 * @param {ArrayLike<number>} piece - Byte sequence supporting `slice` and `join`.
 * @param {Map<string, number>} ranks - Rank keyed by comma-joined byte values.
 * @returns {number[]} Ranks of the merged spans, in order.
 */
function bytePairEncode(piece, ranks) {
  const rankOf = (bytes) => ranks.get(bytes.join(","));
  const candidates =
    piece.length === 1
      ? [rankOf(piece)]
      : bytePairMerge(piece, ranks).map(({ start, end }) =>
          rankOf(piece.slice(start, end))
        );
  // Apply the same null filter to the single-byte fast path so an unranked
  // byte yields no token instead of an `undefined` entry.
  return candidates.filter((rank) => rank != null);
}
/**
 * Backslash-escapes regular-expression metacharacters in `str` so the result
 * can be embedded in a pattern and matched literally.
 *
 * @param {string} str - Text to escape.
 * @returns {string} `str` with each of \ ^ $ * + ? . ( ) | [ ] { } prefixed by a backslash.
 */
function escapeRegex(str) {
  const metaCharacters = /[\\^$*+?.()|[\]{}]/g;
  const escapedMatch = "\\$&";
  return str.replace(metaCharacters, escapedMatch);
}
/**
 * Byte-pair-encoding tokenizer built from a rank table, a split pattern and a
 * set of special tokens.
 *
 * `encode` turns text into token ranks and `decode` turns ranks back into text.
 */
var _Tiktoken = class {
  /**
   * Builds a global regex that matches any of the given literal strings.
   *
   * @param {string[]} tokens - Literal token texts; regex metacharacters are escaped.
   * @returns {RegExp} Alternation of the escaped tokens with the `g` flag.
   */
  static specialTokenRegex = (tokens) => {
    // An empty alternation would compile to /(?:)/g, which matches the empty
    // string at every index and makes callers step through the text one
    // character at a time. Return a pattern that can never match instead.
    if (tokens.length === 0) {
      return new RegExp("(?!)", "g");
    }
    return new RegExp(tokens.map((i) => escapeRegex(i)).join("|"), "g");
  };
  /** @internal Special-token text -> rank. */
  specialTokens;
  /** @internal Special-token rank -> UTF-8 bytes of its text. */
  inverseSpecialTokens;
  /** @internal Source of the regex used to split ordinary text into pieces. */
  patStr;
  /** @internal */
  textEncoder = new TextEncoder();
  /** @internal */
  textDecoder = new TextDecoder("utf-8");
  /** @internal Comma-joined byte values -> rank. */
  rankMap = /* @__PURE__ */ new Map();
  /** @internal Rank -> bytes. */
  textMap = /* @__PURE__ */ new Map();

  /**
   * @param {{ pat_str: string, bpe_ranks: string, special_tokens: Object<string, number> }} ranks
   *   `bpe_ranks` is newline-separated; each line is space-separated as
   *   `<ignored> <offset> <base64 token>...`, where the n-th token on a line
   *   receives rank `offset + n`.
   * @param {Object<string, number>} [extendedSpecialTokens] - Extra special
   *   tokens; these override entries of `ranks.special_tokens` with the same text.
   */
  constructor(ranks, extendedSpecialTokens) {
    this.patStr = ranks.pat_str;

    // Collect into a plain object first so a token listed twice keeps only its
    // last rank before the maps are filled.
    const uncompressed = {};
    for (const line of ranks.bpe_ranks.split("\n")) {
      if (!line) {
        continue;
      }
      const [, offsetStr, ...tokens] = line.split(" ");
      const offset = Number.parseInt(offsetStr, 10);
      tokens.forEach((token, i) => {
        uncompressed[token] = offset + i;
      });
    }

    for (const [token, rank] of Object.entries(uncompressed)) {
      const bytes = base64__default.default.toByteArray(token);
      this.rankMap.set(bytes.join(","), rank);
      this.textMap.set(rank, bytes);
    }

    this.specialTokens = { ...ranks.special_tokens, ...extendedSpecialTokens };
    this.inverseSpecialTokens = {};
    for (const [text, rank] of Object.entries(this.specialTokens)) {
      this.inverseSpecialTokens[rank] = this.textEncoder.encode(text);
    }
  }

  /**
   * Encodes `text` into token ranks.
   *
   * @param {string} text - Text to tokenize.
   * @param {string[] | "all"} [allowedSpecial=[]] - Special tokens that are
   *   emitted as their special rank when found in `text`; `"all"` allows every
   *   known special token.
   * @param {string[] | "all"} [disallowedSpecial="all"] - Special tokens whose
   *   presence in `text` is an error; `"all"` means every known special token
   *   that is not allowed.
   * @returns {number[]} Token ranks in text order.
   * @throws {Error} If `text` contains a disallowed special token.
   */
  encode(text, allowedSpecial = [], disallowedSpecial = "all") {
    const pieceRegex = new RegExp(this.patStr, "ug");
    const specialRegex = _Tiktoken.specialTokenRegex(
      Object.keys(this.specialTokens)
    );
    const ret = [];
    const allowedSpecialSet = new Set(
      allowedSpecial === "all" ? Object.keys(this.specialTokens) : allowedSpecial
    );
    const disallowedSpecialSet = new Set(
      disallowedSpecial === "all"
        ? Object.keys(this.specialTokens).filter(
            (x) => !allowedSpecialSet.has(x)
          )
        : disallowedSpecial
    );

    if (disallowedSpecialSet.size > 0) {
      const disallowedSpecialRegex = _Tiktoken.specialTokenRegex([
        ...disallowedSpecialSet
      ]);
      const specialMatch = text.match(disallowedSpecialRegex);
      if (specialMatch != null) {
        throw new Error(
          `The text contains a special token that is not allowed: ${specialMatch[0]}`
        );
      }
    }

    let start = 0;
    while (true) {
      // Find the next *allowed* special token at or after `start`. Matches of
      // special tokens that are not allowed are skipped one character at a
      // time so they end up tokenized as ordinary text.
      let nextSpecial = null;
      let startFind = start;
      while (true) {
        specialRegex.lastIndex = startFind;
        nextSpecial = specialRegex.exec(text);
        if (nextSpecial == null || allowedSpecialSet.has(nextSpecial[0])) {
          break;
        }
        startFind = nextSpecial.index + 1;
      }

      // Tokenize the ordinary text up to the special token (or end of input).
      const end = nextSpecial?.index ?? text.length;
      for (const match of text.substring(start, end).matchAll(pieceRegex)) {
        const piece = this.textEncoder.encode(match[0]);
        const direct = this.rankMap.get(piece.join(","));
        if (direct != null) {
          ret.push(direct);
          continue;
        }
        ret.push(...bytePairEncode(piece, this.rankMap));
      }

      if (nextSpecial == null) {
        break;
      }
      const token = this.specialTokens[nextSpecial[0]];
      ret.push(token);
      start = nextSpecial.index + nextSpecial[0].length;
    }
    return ret;
  }

  /**
   * Decodes token ranks back into text.
   *
   * Ranks found in neither the rank table nor the special tokens are skipped.
   *
   * @param {number[]} tokens - Token ranks.
   * @returns {string} UTF-8 decoding of the concatenated token bytes.
   */
  decode(tokens) {
    const res = [];
    let length = 0;
    for (let i2 = 0; i2 < tokens.length; ++i2) {
      const token = tokens[i2];
      const bytes = this.textMap.get(token) ?? this.inverseSpecialTokens[token];
      if (bytes != null) {
        res.push(bytes);
        length += bytes.length;
      }
    }

    // Concatenate before decoding so multi-byte characters split across
    // tokens are decoded as a whole.
    const mergedArray = new Uint8Array(length);
    let i = 0;
    for (const bytes of res) {
      mergedArray.set(bytes, i);
      i += bytes.length;
    }
    return this.textDecoder.decode(mergedArray);
  }
};
var Tiktoken = _Tiktoken;
// Model names grouped by the encoding they use.
// `davinci-002` and `babbage-002` are listed under `cl100k_base`, matching the
// reference tiktoken model table, rather than with the older `davinci` /
// `babbage` model families.
const MODEL_NAMES_BY_ENCODING = {
  gpt2: ["gpt2"],
  p50k_base: [
    "code-cushman-001",
    "code-cushman-002",
    "code-davinci-001",
    "code-davinci-002",
    "cushman-codex",
    "davinci-codex",
    "text-davinci-002",
    "text-davinci-003"
  ],
  p50k_edit: ["code-davinci-edit-001", "text-davinci-edit-001"],
  r50k_base: [
    "ada",
    "babbage",
    "code-search-ada-code-001",
    "code-search-babbage-code-001",
    "curie",
    "davinci",
    "text-ada-001",
    "text-babbage-001",
    "text-curie-001",
    "text-davinci-001",
    "text-search-ada-doc-001",
    "text-search-babbage-doc-001",
    "text-search-curie-doc-001",
    "text-search-davinci-doc-001",
    "text-similarity-ada-001",
    "text-similarity-babbage-001",
    "text-similarity-curie-001",
    "text-similarity-davinci-001"
  ],
  cl100k_base: [
    "davinci-002",
    "babbage-002",
    "gpt-3.5-turbo-instruct-0914",
    "gpt-3.5-turbo-instruct",
    "gpt-3.5-turbo-16k-0613",
    "gpt-3.5-turbo-16k",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-0301",
    "gpt-3.5-turbo",
    "gpt-4-32k-0613",
    "gpt-4-32k-0314",
    "gpt-4-32k",
    "gpt-4-0613",
    "gpt-4-0314",
    "gpt-4",
    "gpt-3.5-turbo-1106",
    "gpt-35-turbo",
    "gpt-4-1106-preview",
    "gpt-4-vision-preview",
    "gpt-3.5-turbo-0125",
    "gpt-4-turbo",
    "gpt-4-turbo-2024-04-09",
    "gpt-4-turbo-preview",
    "gpt-4-0125-preview",
    "text-embedding-ada-002",
    "text-embedding-3-small",
    "text-embedding-3-large"
  ],
  o200k_base: [
    "gpt-4o",
    "gpt-4o-2024-05-13",
    "gpt-4o-2024-08-06",
    "gpt-4o-2024-11-20",
    "gpt-4o-mini-2024-07-18",
    "gpt-4o-mini",
    "gpt-4o-search-preview",
    "gpt-4o-search-preview-2025-03-11",
    "gpt-4o-mini-search-preview",
    "gpt-4o-mini-search-preview-2025-03-11",
    "gpt-4o-audio-preview",
    "gpt-4o-audio-preview-2024-12-17",
    "gpt-4o-audio-preview-2024-10-01",
    "gpt-4o-mini-audio-preview",
    "gpt-4o-mini-audio-preview-2024-12-17",
    "o1",
    "o1-2024-12-17",
    "o1-mini",
    "o1-mini-2024-09-12",
    "o1-preview",
    "o1-preview-2024-09-12",
    "o1-pro",
    "o1-pro-2025-03-19",
    "o3",
    "o3-2025-04-16",
    "o3-mini",
    "o3-mini-2025-01-31",
    "o4-mini",
    "o4-mini-2025-04-16",
    "chatgpt-4o-latest",
    "gpt-4o-realtime",
    "gpt-4o-realtime-preview-2024-10-01",
    "gpt-4o-realtime-preview-2024-12-17",
    "gpt-4o-mini-realtime-preview",
    "gpt-4o-mini-realtime-preview-2024-12-17",
    "gpt-4.1",
    "gpt-4.1-2025-04-14",
    "gpt-4.1-mini",
    "gpt-4.1-mini-2025-04-14",
    "gpt-4.1-nano",
    "gpt-4.1-nano-2025-04-14",
    "gpt-4.5-preview",
    "gpt-4.5-preview-2025-02-27"
  ]
};

// Flattened once at load time: model name -> encoding name.
const ENCODING_NAME_BY_MODEL = new Map(
  Object.entries(MODEL_NAMES_BY_ENCODING).flatMap(([encoding, models]) =>
    models.map((model) => [model, encoding])
  )
);

/**
 * Returns the name of the encoding used by a model.
 *
 * Only exact, known model names are recognised; there is no prefix matching.
 *
 * @param {string} model - Model name, e.g. "gpt-4o".
 * @returns {string} One of "gpt2", "p50k_base", "p50k_edit", "r50k_base",
 *   "cl100k_base" or "o200k_base".
 * @throws {Error} "Unknown model" when `model` is not in the table.
 */
function getEncodingNameForModel(model) {
  const encoding = ENCODING_NAME_BY_MODEL.get(model);
  if (encoding === undefined) {
    throw new Error("Unknown model");
  }
  return encoding;
}
- exports.Tiktoken = Tiktoken;
- exports.getEncodingNameForModel = getEncodingNameForModel;
|