
# coding=utf-8
# Copyright 2024 The ggml.ai team and The HuggingFace Inc. team. and pygguf author (github.com/99991)
# https://github.com/99991/pygguf
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  16. """
  17. Integration with GGML / The file is copied and adapted from https://github.com/99991/pygguf
  18. with extra methods beings exposed
  19. """
from array import array

import numpy as np
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
from tokenizers.models import BPE

from .. import AddedToken
from ..convert_slow_tokenizer import GPT2Converter, LlamaConverter, Qwen2Converter
from ..utils import logging
from ..utils.logging import tqdm


logger = logging.get_logger(__name__)

GGUF_TENSOR_MAPPING = {
    "llama": {
        "token_embd": "model.embed_tokens",
        "blk": "model.layers",
        "ffn_up": "mlp.up_proj",
        "ffn_down": "mlp.down_proj",
        "ffn_gate": "mlp.gate_proj",
        "ffn_norm": "post_attention_layernorm",
        "attn_norm": "input_layernorm",
        "attn_q": "self_attn.q_proj",
        "attn_v": "self_attn.v_proj",
        "attn_k": "self_attn.k_proj",
        "attn_output": "self_attn.o_proj",
        "output.weight": "lm_head.weight",
        "output_norm": "model.norm",
    },
    "mistral": {
        "token_embd": "model.embed_tokens",
        "blk": "model.layers",
        "ffn_up": "mlp.up_proj",
        "ffn_down": "mlp.down_proj",
        "ffn_gate": "mlp.gate_proj",
        "ffn_norm": "post_attention_layernorm",
        "attn_norm": "input_layernorm",
        "attn_q": "self_attn.q_proj",
        "attn_v": "self_attn.v_proj",
        "attn_k": "self_attn.k_proj",
        "attn_output": "self_attn.o_proj",
        "output.weight": "lm_head.weight",
        "output_norm": "model.norm",
    },
    "qwen2": {
        "token_embd": "model.embed_tokens",
        "blk": "model.layers",
        "ffn_up": "mlp.up_proj",
        "ffn_down": "mlp.down_proj",
        "ffn_gate": "mlp.gate_proj",
        "ffn_norm": "post_attention_layernorm",
        "attn_norm": "input_layernorm",
        "attn_q": "self_attn.q_proj",
        "attn_v": "self_attn.v_proj",
        "attn_k": "self_attn.k_proj",
        "attn_output": "self_attn.o_proj",
        "output.weight": "lm_head.weight",
        "output_norm": "model.norm",
    },
    "qwen2moe": {
        "token_embd": "model.embed_tokens",
        "blk": "model.layers",
        "ffn_up_exps": "mlp.experts",
        "ffn_up_shexp": "mlp.shared_expert.up_proj",
        "ffn_down_exps": "mlp.experts",
        "ffn_down_shexp": "mlp.shared_expert.down_proj",
        "ffn_norm": "post_attention_layernorm",
        "ffn_gate_inp.weight": "mlp.gate.weight",
        "ffn_gate_exps": "mlp.experts",
        "ffn_gate_shexp": "mlp.shared_expert.gate_proj",
        "ffn_gate_inp_shexp": "mlp.shared_expert_gate",
        "attn_norm": "input_layernorm",
        "attn_q": "self_attn.q_proj",
        "attn_v": "self_attn.v_proj",
        "attn_k": "self_attn.k_proj",
        "attn_output": "self_attn.o_proj",
        "output.weight": "lm_head.weight",
        "output_norm": "model.norm",
    },
    "phi3": {
        "token_embd": "model.embed_tokens",
        "blk": "model.layers",
        "ffn_up": "mlp.gate_up_proj",
        "ffn_down": "mlp.down_proj",
        "ffn_gate": "mlp.gate_up_proj",
        "ffn_norm": "post_attention_layernorm",
        "attn_norm": "input_layernorm",
        "attn_qkv": "self_attn.qkv_proj",
        "attn_output": "self_attn.o_proj",
        "output.weight": "lm_head.weight",
        "output_norm": "model.norm",
    },
    "bloom": {
        "token_embd.weight": "transformer.word_embeddings.weight",
        "token_embd_norm": "transformer.word_embeddings_layernorm",
        "blk": "transformer.h",
        "ffn_up": "mlp.dense_h_to_4h",
        "ffn_down": "mlp.dense_4h_to_h",
        "ffn_norm": "post_attention_layernorm",
        "attn_norm": "input_layernorm",
        "attn_qkv": "self_attention.query_key_value",
        "attn_output": "self_attention.dense",
        "output.weight": "lm_head.weight",
        "output_norm": "transformer.ln_f",
    },
    "falcon7b": {
        "token_embd": "word_embeddings",
        "blk": "h",
        "ffn_up": "mlp.dense_h_to_4h",
        "ffn_down": "mlp.dense_4h_to_h",
        "attn_norm": "input_layernorm",
        "attn_qkv": "self_attention.query_key_value",
        "attn_output": "self_attention.dense",
        ".output.": ".lm_head.",
        "output_norm": "ln_f",
    },
    "falcon40b": {
        "token_embd": "word_embeddings",
        "blk": "h",
        "ffn_up": "mlp.dense_h_to_4h",
        "ffn_down": "mlp.dense_4h_to_h",
        ".attn_norm.": ".ln_mlp.",
        "attn_norm_2": "ln_attn",
        "attn_qkv": "self_attention.query_key_value",
        "attn_output": "self_attention.dense",
        ".output.": ".lm_head.",
        "output_norm": "ln_f",
    },
    "stablelm": {
        "token_embd": "model.embed_tokens",
        "blk": "model.layers",
        "ffn_up": "mlp.up_proj",
        "ffn_down": "mlp.down_proj",
        "ffn_gate": "mlp.gate_proj",
        "ffn_norm": "post_attention_layernorm",
        "attn_norm": "input_layernorm",
        "attn_q": "self_attn.q_proj",
        "attn_v": "self_attn.v_proj",
        "attn_k": "self_attn.k_proj",
        "attn_output": "self_attn.o_proj",
        "output.weight": "lm_head.weight",
        "output_norm": "model.norm",
    },
    "gpt2": {
        "token_embd": "transformer.wte",
        "blk": "transformer.h",
        "position_embd": "transformer.wpe",
        "output_norm": "transformer.ln_f",
        "attn_norm": "ln_1",
        "attn_qkv": "attn.c_attn",
        "attn_output.weight": "attn.c_proj.weight",
        "attn_output.bias": "attn.c_proj.bias",
        "ffn_norm": "ln_2",
        "ffn_up": "mlp.c_fc",
        "ffn_down": "mlp.c_proj",
    },
    "starcoder2": {
        "token_embd": "model.embed_tokens",
        "blk": "model.layers",
        "ffn_up": "mlp.c_fc",
        "ffn_down": "mlp.c_proj",
        "ffn_norm": "post_attention_layernorm",
        "attn_norm": "input_layernorm",
        "attn_q": "self_attn.q_proj",
        "attn_v": "self_attn.v_proj",
        "attn_k": "self_attn.k_proj",
        "attn_output": "self_attn.o_proj",
        "output.weight": "lm_head.weight",
        "output_norm": "model.norm",
    },
}
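
# Illustrative note (hypothetical tensor name, for exposition only): renaming substitutes
# the mapped fragments inside a GGUF tensor name, so under the "llama" mapping a name
# such as "blk.0.attn_q.weight" becomes "model.layers.0.self_attn.q_proj.weight".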

GGUF_CONFIG_MAPPING = {
    "general": {
        "architecture": "model_type",
        "name": "_model_name_or_path",
    },
    "llama": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        # NOTE: rope.dimension_count==head_dim only suitable for llama/mistral
        "rope.dimension_count": "head_dim",
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "mistral": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        # NOTE: rope.dimension_count==head_dim only suitable for llama/mistral
        "rope.dimension_count": "head_dim",
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "qwen2": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "qwen2moe": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
        "expert_count": "num_experts",
        "expert_used_count": "num_experts_per_tok",
    },
    "falcon": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "tokenizer": {
        "ggml.bos_token_id": "bos_token_id",
        "ggml.eos_token_id": "eos_token_id",
        "ggml.unknown_token_id": "unk_token_id",
        "ggml.padding_token_id": "pad_token_id",
    },
    "phi3": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "bloom": {
        "block_count": "n_layer",
        "embedding_length": "hidden_size",
        "attention.head_count": "n_head",
        "vocab_size": "vocab_size",
        "attention.layer_norm_epsilon": "layer_norm_epsilon",
    },
    "stablelm": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_epsilon": "layer_norm_eps",
        "vocab_size": "vocab_size",
    },
    "gpt2": {
        "block_count": "n_layer",
        "context_length": "n_ctx",
        "embedding_length": "n_embd",
        "feed_forward_length": "feed_forward_length",
        "attention.head_count": "n_head",
        "attention.layer_norm_epsilon": "layer_norm_epsilon",
    },
    "starcoder2": {
        "block_count": "num_hidden_layers",
        "context_length": "max_position_embeddings",
        "embedding_length": "hidden_size",
        "feed_forward_length": "intermediate_size",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_epsilon": "norm_epsilon",
    },
}
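
# Illustrative note: these keys are read from architecture-prefixed GGUF metadata,
# e.g. the file key "llama.context_length" is written to the transformers config as
# "max_position_embeddings".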

GGUF_TOKENIZER_MAPPING = {
    "tokenizer": {
        "ggml.model": "tokenizer_type",
        "ggml.tokens": "tokens",
        "ggml.scores": "scores",
        "ggml.token_type": "token_type",
        "ggml.merges": "merges",
        "ggml.bos_token_id": "bos_token_id",
        "ggml.eos_token_id": "eos_token_id",
        "ggml.unknown_token_id": "unk_token_id",
        "ggml.padding_token_id": "pad_token_id",
        "ggml.add_space_prefix": "add_prefix_space",
    },
    "tokenizer_config": {
        "chat_template": "chat_template",
        "ggml.model": "model_type",
        "ggml.bos_token_id": "bos_token_id",
        "ggml.eos_token_id": "eos_token_id",
        "ggml.unknown_token_id": "unk_token_id",
        "ggml.padding_token_id": "pad_token_id",
    },
}
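
# Illustrative note: GGUF stores these fields under a "tokenizer." prefix (e.g. the key
# "tokenizer.ggml.model"); the first sub-map feeds the tokenizer itself, the second the
# tokenizer_config-style kwargs such as the chat template.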


def _gguf_parse_value(_value, data_type):
    # GGUF metadata value types (per the GGUF spec): 0-5 and 10-11 are the (u)int8/16/32/64
    # variants, 6/12 are float32/float64, 7 is bool, 8 is a string and 9 is an array of a
    # nested element type.
    if not isinstance(data_type, list):
        data_type = [data_type]
    if len(data_type) == 1:
        data_type = data_type[0]
        array_data_type = None
    else:
        if data_type[0] != 9:
            raise ValueError("Received multiple types, therefore expected the first type to indicate an array.")
        data_type, array_data_type = data_type

    if data_type in [0, 1, 2, 3, 4, 5, 10, 11]:
        _value = int(_value[0])
    elif data_type in [6, 12]:
        _value = float(_value[0])
    elif data_type in [7]:
        _value = bool(_value[0])
    elif data_type in [8]:
        _value = array("B", list(_value)).tobytes().decode()
    elif data_type in [9]:
        _value = _gguf_parse_value(_value, array_data_type)
    return _value
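
# Illustrative usage (the literal values below are assumptions for the example, not read
# from a real file): a UINT32 field arrives as a one-element buffer and is collapsed to
# an int, while a string arrives as raw bytes:
#     _gguf_parse_value([32000], 4)              # -> 32000
#     _gguf_parse_value(list(b"llama"), 8)       # -> "llama"
#     _gguf_parse_value(list(b"llama"), [9, 8])  # array (9) of strings (8) -> "llama"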


class GGUFTokenizerSkeleton:
    def __init__(self, dict_):
        for k, v in dict_.items():
            setattr(self, k, v)

        if not hasattr(self, "merges"):
            if not hasattr(self, "tokens") or not hasattr(self, "scores"):
                raise ValueError(
                    "tokens and scores need to be passed for a LLaMa tokenizer without merges to be instantiated."
                )
            tokens = self.tokens
            scores = self.scores
            vocab = {t: scores[i] for i, t in enumerate(tokens)}

            logger.warning("Merges were not in checkpoint, building merges on the fly.")
            merges = []
            for merge, piece_score in tqdm(vocab.items()):
                local = []
                # Try every split of the token into two pieces that both exist in the
                # vocab; each valid split is a candidate merge producing this token.
                for index in range(1, len(merge)):
                    piece_l, piece_r = merge[:index], merge[index:]
                    if piece_l in tokens and piece_r in tokens:
                        local.append((piece_l, piece_r, piece_score))
                local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]), reverse=True)
                merges.extend(local)
            # Higher-scoring merges come first so the BPE model applies them first.
            merges = sorted(merges, key=lambda val: val[2], reverse=True)
            merges = [(val[0], val[1]) for val in merges]
            self.merges = merges
        else:
            self.merges = [tuple(merge.split(" ")) for merge in self.merges]
            if not hasattr(self, "scores"):
                self.scores = [None for _ in range(len(self.tokens))]

        if not hasattr(self, "added_tokens"):
            self.added_tokens = []

        if not hasattr(self, "unk_token_id"):
            self.unk_token_id = None

        # Llama2 uses the field `unknown_token_id`
        if hasattr(self, "unknown_token_id") and self.unk_token_id is None:
            self.unk_token_id = self.unknown_token_id
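
# Illustrative note (assumed contents for the example): GGUFTokenizerSkeleton(
#     {"tokens": [...], "scores": [...], "merges": [...]}
# ) yields an attribute bag with .tokens, .scores and .merges, reconstructing the merges
# from the scores when the checkpoint does not ship them.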


class GGUFLlamaConverter(LlamaConverter):
    def __init__(self, tokenizer_dict):
        self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
        self.original_tokenizer = self.proto
        self.additional_kwargs = {}
        self.is_llama_3_tokenizer = getattr(self.proto, "tokenizer_type", "llama") != "llama"

    def vocab(self, proto):
        return list(zip(proto.tokens, proto.scores))

    def merges(self, proto):
        return proto.merges

    def tokenizer(self, proto):
        vocab_scores = self.vocab(self.proto)
        merges = self.merges(self.proto)
        bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}

        unk_token = proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None
        bos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "bos_token_id", None) is not None else None
        eos_token = proto.tokens[proto.eos_token_id] if getattr(proto, "eos_token_id", None) is not None else None

        tokenizer = Tokenizer(
            BPE(
                bpe_vocab,
                merges,
                unk_token=unk_token,
                fuse_unk=True,
                byte_fallback=True,
            )
        )

        special_tokens = []

        if not hasattr(self.proto, "token_type"):
            if unk_token is not None:
                special_tokens.append(AddedToken(unk_token, normalized=False, special=True))
            if bos_token is not None:
                special_tokens.append(AddedToken(bos_token, normalized=False, special=True))
            if eos_token is not None:
                special_tokens.append(AddedToken(eos_token, normalized=False, special=True))
        else:
            # 3 stands for special tokens
            special_tokens_idx = np.where(np.array(self.proto.token_type) == 3)[0]
            for idx in special_tokens_idx:
                special_tokens.append(AddedToken(self.proto.tokens[idx], normalized=False, special=True))

        if len(special_tokens) != 0:
            tokenizer.add_special_tokens(special_tokens)

        if len(self.proto.added_tokens) != 0:
            tokenizer.add_tokens(
                [AddedToken(added_token, normalized=False, special=False) for added_token in self.proto.added_tokens]
            )

        self.additional_kwargs["unk_token"] = unk_token
        self.additional_kwargs["eos_token"] = eos_token
        self.additional_kwargs["bos_token"] = bos_token

        if self.is_llama_3_tokenizer:
            self.additional_kwargs["add_prefix_space"] = None
            self.additional_kwargs["clean_up_tokenization_spaces"] = True
            self.additional_kwargs["legacy"] = False
            self.original_tokenizer.legacy = False

        return tokenizer

    def decoder(self, replacement, add_prefix_space):
        sequence = [
            decoders.ByteFallback(),
            decoders.Fuse(),
            decoders.Replace("▁", " "),
        ]
        if self.is_llama_3_tokenizer:
            sequence += [decoders.ByteLevel(add_prefix_space=False, trim_offsets=False, use_regex=True)]
        if add_prefix_space:
            sequence += [decoders.Strip(content=" ", left=1)]
        return decoders.Sequence(sequence)

    def converted(self):
        # Copied partly from converted method in SpmConverter class
        tokenizer = self.tokenizer(self.proto)

        # Tokenizer assemble
        normalizer = self.normalizer(self.proto)
        if normalizer is not None:
            tokenizer.normalizer = normalizer

        replacement = "▁"
        add_prefix_space = True
        if hasattr(self.original_tokenizer, "add_prefix_space"):
            add_prefix_space = self.original_tokenizer.add_prefix_space

        pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
        if pre_tokenizer is not None:
            tokenizer.pre_tokenizer = pre_tokenizer

        tokenizer.decoder = self.decoder(replacement, add_prefix_space)
        post_processor = self.post_processor()
        if post_processor:
            tokenizer.post_processor = post_processor

        # HACK: patch the llama-3 tokenizer to use the corresponding pre-tokenizer
        # and normalizer
        if self.is_llama_3_tokenizer:
            tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
                add_prefix_space=False, trim_offsets=False, use_regex=True
            )
            # This is tricky as the additional kwargs are passed after legacy is force-set in LlamaTokenizer's
            # init.
            tokenizer.normalizer = normalizers.Sequence([])

        return tokenizer


class GGUFQwen2Converter(Qwen2Converter):
    def __init__(self, tokenizer_dict):
        self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)
        self.additional_kwargs = {}

    def converted(self) -> Tokenizer:
        vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)}
        merges = self.original_tokenizer.merges
        tokenizer = super().converted(vocab, merges)

        tokenizer.add_special_tokens(
            [
                AddedToken("<|endoftext|>", normalized=False, special=True),
                AddedToken("<|im_start|>", normalized=False, special=True),
                AddedToken("<|im_end|>", normalized=False, special=True),
            ]
        )
        return tokenizer


class GGUFPhi3Converter(LlamaConverter):
    def __init__(self, tokenizer_dict):
        self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
        self.original_tokenizer = self.proto
        self.additional_kwargs = {}

    def vocab(self, proto):
        return list(zip(proto.tokens, proto.scores))

    def merges(self, proto):
        return proto.merges

    def tokenizer(self, proto):
        vocab_scores = self.vocab(self.proto)
        merges = self.merges(self.proto)
        bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}

        tokenizer = Tokenizer(BPE(bpe_vocab, merges))
        # add the special tokens from phi3 tokenizer config
        tokenizer.add_special_tokens(
            [
                AddedToken("</s>", rstrip=True, lstrip=False, normalized=False, special=True),
                AddedToken("<|endoftext|>", normalized=False, special=True),
                AddedToken("<|assistant|>", rstrip=True, normalized=False, special=True),
                AddedToken("<|placeholder1|>", rstrip=True, normalized=False, special=True),
                AddedToken("<|placeholder2|>", rstrip=True, normalized=False, special=True),
                AddedToken("<|placeholder3|>", rstrip=True, normalized=False, special=True),
                AddedToken("<|placeholder4|>", rstrip=True, normalized=False, special=True),
                AddedToken("<|system|>", rstrip=True, normalized=False, special=True),
                AddedToken("<|end|>", rstrip=True, normalized=False, special=True),
                AddedToken("<|placeholder5|>", rstrip=True, normalized=False, special=True),
                AddedToken("<|placeholder6|>", rstrip=True, normalized=False, special=True),
                AddedToken("<|user|>", rstrip=True, normalized=False, special=True),
            ]
        )

        self.additional_kwargs["unk_token"] = (
            proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None
        )
        self.additional_kwargs["eos_token"] = (
            proto.tokens[proto.eos_token_id] if proto.eos_token_id is not None else None
        )
        self.additional_kwargs["bos_token"] = (
            proto.tokens[proto.bos_token_id] if proto.bos_token_id is not None else None
        )
        self.additional_kwargs["pad_token"] = (
            proto.tokens[proto.pad_token_id] if proto.pad_token_id is not None else None
        )

        return tokenizer

    def decoder(self, replacement, add_prefix_space):
        sequence = [
            decoders.ByteFallback(),
            decoders.Fuse(),
            decoders.Replace(replacement, " "),
        ]
        if add_prefix_space:
            sequence += [decoders.Strip(content=" ", left=1)]
        return decoders.Sequence(sequence)

    def converted(self) -> Tokenizer:
        tokenizer = self.tokenizer(self.proto)

        replacement = "▁"
        add_prefix_space = True
        if hasattr(self.original_tokenizer, "add_prefix_space"):
            add_prefix_space = self.original_tokenizer.add_prefix_space

        tokenizer.decoder = self.decoder(replacement, add_prefix_space)

        return tokenizer


class GGUFGPTConverter(GPT2Converter):
    def __init__(self, tokenizer_dict):
        self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)
        self.additional_kwargs = {}

    def converted(self) -> Tokenizer:
        vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)}
        merges = self.original_tokenizer.merges
        tokenizer = super().converted(vocab, merges)
        return tokenizer


GGUF_TO_FAST_CONVERTERS = {
    "llama": GGUFLlamaConverter,
    "qwen2": GGUFQwen2Converter,
    "qwen2_moe": GGUFQwen2Converter,
    "phi3": GGUFPhi3Converter,
    "bloom": GGUFGPTConverter,
    "falcon": GGUFGPTConverter,
    "stablelm": GGUFGPTConverter,
    "gpt2": GGUFGPTConverter,
    "starcoder2": GGUFGPTConverter,
}
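
# Note: bloom, falcon, stablelm, gpt2 and starcoder2 all reuse the GPT-2 byte-level BPE
# conversion path, hence the shared GGUFGPTConverter.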


def convert_gguf_tokenizer(architecture, tokenizer_dict) -> "tuple[Tokenizer, dict]":
    """
    Utility to convert the tokenizer fields extracted from a GGUF file into a fast tokenizer instance.

    Args:
        architecture (`str`): The model architecture derived from the GGUF file.
        tokenizer_dict (`dict`):
            The tokenizer parameters extracted from the GGUF file, used to build the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].

    Return:
        A tuple of an instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`], and a dict of additional kwargs for its init.
    """
    tokenizer_class_name = architecture
    converter = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name](tokenizer_dict)
    fast_tokenizer = converter.converted()
    return fast_tokenizer, converter.additional_kwargs
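
# Illustrative usage (module path assumed to be transformers.integrations.ggml, and
# `tokenizer_dict` assumed to hold the renamed GGUF tokenizer fields):
#
#     fast_tokenizer, extra_kwargs = convert_gguf_tokenizer("llama", tokenizer_dict)
#     # `fast_tokenizer` backs a PreTrainedTokenizerFast; `extra_kwargs` carries the
#     # bos/eos/unk tokens and llama-3 specific flags recovered during conversion.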