tokenization_auto.py

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Auto Tokenizer class."""

import importlib
import json
import os
import warnings
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union

from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
from ...utils import (
    cached_file,
    extract_commit_hash,
    is_g2p_en_available,
    is_sentencepiece_available,
    is_tokenizers_available,
    logging,
)
from ..encoder_decoder import EncoderDecoderConfig
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
    CONFIG_MAPPING_NAMES,
    AutoConfig,
    config_class_to_model_type,
    model_type_to_module_name,
    replace_list_option_in_docstrings,
)


if is_tokenizers_available():
    from ...tokenization_utils_fast import PreTrainedTokenizerFast
else:
    PreTrainedTokenizerFast = None


logger = logging.get_logger(__name__)

if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    TOKENIZER_MAPPING_NAMES = OrderedDict(
        [
            ("albert", ("AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None)),
            ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            ("barthez", ("BarthezTokenizer" if is_sentencepiece_available() else None, "BarthezTokenizerFast" if is_tokenizers_available() else None)),
            ("bartpho", ("BartphoTokenizer", None)),
            ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
            ("bert-japanese", ("BertJapaneseTokenizer", None)),
            ("bertweet", ("BertweetTokenizer", None)),
            ("big_bird", ("BigBirdTokenizer" if is_sentencepiece_available() else None, "BigBirdTokenizerFast" if is_tokenizers_available() else None)),
            ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("biogpt", ("BioGptTokenizer", None)),
            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
            ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("blip-2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)),
            ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("bros", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("byt5", ("ByT5Tokenizer", None)),
            ("camembert", ("CamembertTokenizer" if is_sentencepiece_available() else None, "CamembertTokenizerFast" if is_tokenizers_available() else None)),
            ("canine", ("CanineTokenizer", None)),
            ("chameleon", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("clap", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("clip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("clipseg", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("clvp", ("ClvpTokenizer", None)),
            ("code_llama", ("CodeLlamaTokenizer" if is_sentencepiece_available() else None, "CodeLlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
            ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
            ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
            ("cpm", ("CpmTokenizer" if is_sentencepiece_available() else None, "CpmTokenizerFast" if is_tokenizers_available() else None)),
            ("cpmant", ("CpmAntTokenizer", None)),
            ("ctrl", ("CTRLTokenizer", None)),
            ("data2vec-audio", ("Wav2Vec2CTCTokenizer", None)),
            ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("dbrx", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
            ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, "DebertaV2TokenizerFast" if is_tokenizers_available() else None)),
            ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
            ("dpr", ("DPRQuestionEncoderTokenizer", "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None)),
            ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
            ("esm", ("EsmTokenizer", None)),
            ("falcon", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("falcon_mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("fastspeech2_conformer", ("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None)),
            ("flaubert", ("FlaubertTokenizer", None)),
            ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
            ("fsmt", ("FSMTTokenizer", None)),
            ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
            ("gemma", ("GemmaTokenizer" if is_sentencepiece_available() else None, "GemmaTokenizerFast" if is_tokenizers_available() else None)),
            ("gemma2", ("GemmaTokenizer" if is_sentencepiece_available() else None, "GemmaTokenizerFast" if is_tokenizers_available() else None)),
            ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
            ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
            ("grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
            ("hubert", ("Wav2Vec2CTCTokenizer", None)),
            ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("idefics3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("jamba", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("jetmoe", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("jukebox", ("JukeboxTokenizer", None)),
            ("kosmos-2", ("XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
            ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
            ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("llama", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("llava_onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
            ("longt5", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("luke", ("LukeTokenizer", None)),
            ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
            ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
            ("mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("mamba2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
            ("mbart", ("MBartTokenizer" if is_sentencepiece_available() else None, "MBartTokenizerFast" if is_tokenizers_available() else None)),
            ("mbart50", ("MBart50Tokenizer" if is_sentencepiece_available() else None, "MBart50TokenizerFast" if is_tokenizers_available() else None)),
            ("mega", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("mgp-str", ("MgpstrTokenizer", None)),
            ("mistral", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("mixtral", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("mllama", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
            ("moshi", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
            ("mpt", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("mra", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("mt5", ("MT5Tokenizer" if is_sentencepiece_available() else None, "MT5TokenizerFast" if is_tokenizers_available() else None)),
            ("musicgen", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("musicgen_melody", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
            ("myt5", ("MyT5Tokenizer", None)),
            ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("nllb", ("NllbTokenizer" if is_sentencepiece_available() else None, "NllbTokenizerFast" if is_tokenizers_available() else None)),
            ("nllb-moe", ("NllbTokenizer" if is_sentencepiece_available() else None, "NllbTokenizerFast" if is_tokenizers_available() else None)),
            ("nystromformer", ("AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None)),
            ("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("omdet-turbo", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
            ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("paligemma", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("pegasus", ("PegasusTokenizer" if is_sentencepiece_available() else None, "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("pegasus_x", ("PegasusTokenizer" if is_sentencepiece_available() else None, "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("perceiver", ("PerceiverTokenizer", None)),
            ("persimmon", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
            ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("phobert", ("PhobertTokenizer", None)),
            ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("pixtral", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
            ("prophetnet", ("ProphetNetTokenizer", None)),
            ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("qwen2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("qwen2_audio", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("qwen2_moe", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("qwen2_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("rag", ("RagTokenizer", None)),
            ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
            ("recurrent_gemma", ("GemmaTokenizer" if is_sentencepiece_available() else None, "GemmaTokenizerFast" if is_tokenizers_available() else None)),
            ("reformer", ("ReformerTokenizer" if is_sentencepiece_available() else None, "ReformerTokenizerFast" if is_tokenizers_available() else None)),
            ("rembert", ("RemBertTokenizer" if is_sentencepiece_available() else None, "RemBertTokenizerFast" if is_tokenizers_available() else None)),
            ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta-prelayernorm", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("roc_bert", ("RoCBertTokenizer", None)),
            ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
            ("rwkv", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("seamless_m4t", ("SeamlessM4TTokenizer" if is_sentencepiece_available() else None, "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None)),
            ("seamless_m4t_v2", ("SeamlessM4TTokenizer" if is_sentencepiece_available() else None, "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None)),
            ("siglip", ("SiglipTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
            ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
            ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
            ("squeezebert", ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None)),
            ("stablelm", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("starcoder2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("switch_transformers", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("t5", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("tapas", ("TapasTokenizer", None)),
            ("tapex", ("TapexTokenizer", None)),
            ("transfo-xl", ("TransfoXLTokenizer", None)),
            ("tvp", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("udop", ("UdopTokenizer" if is_sentencepiece_available() else None, "UdopTokenizerFast" if is_tokenizers_available() else None)),
            ("umt5", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("video_llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("vipllava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("vits", ("VitsTokenizer", None)),
            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2-bert", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
            ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)),
            ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("xglm", ("XGLMTokenizer" if is_sentencepiece_available() else None, "XGLMTokenizerFast" if is_tokenizers_available() else None)),
            ("xlm", ("XLMTokenizer", None)),
            ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
            ("xlm-roberta", ("XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("xlm-roberta-xl", ("XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("xlnet", ("XLNetTokenizer" if is_sentencepiece_available() else None, "XLNetTokenizerFast" if is_tokenizers_available() else None)),
            ("xmod", ("XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("yoso", ("AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None)),
            ("zamba", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ]
    )
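
# `TOKENIZER_MAPPING` lazily resolves config classes to their (slow tokenizer, fast tokenizer) pair, importing a
# model's tokenization module only when it is first looked up. `CONFIG_TO_TYPE` inverts `CONFIG_MAPPING_NAMES`
# so that a config class name can be mapped back to its model type string.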

TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}


def tokenizer_class_from_name(class_name: str):
    if class_name == "PreTrainedTokenizerFast":
        return PreTrainedTokenizerFast

    for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
        if class_name in tokenizers:
            module_name = model_type_to_module_name(module_name)

            module = importlib.import_module(f".{module_name}", "transformers.models")
            try:
                return getattr(module, class_name)
            except AttributeError:
                continue

    for config, tokenizers in TOKENIZER_MAPPING._extra_content.items():
        for tokenizer in tokenizers:
            if getattr(tokenizer, "__name__", None) == class_name:
                return tokenizer

    # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
    # init and we return the proper dummy to get an appropriate error message.
    main_module = importlib.import_module("transformers")
    if hasattr(main_module, class_name):
        return getattr(main_module, class_name)

    return None
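
# Resolution example for the helper above: `tokenizer_class_from_name("BertTokenizerFast")` should import
# `transformers.models.bert` and return its `BertTokenizerFast` class when the `tokenizers` dependency is
# installed, while a class name that appears in no mapping yields `None`.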


def get_tokenizer_config(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[Dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    **kwargs,
):
    """
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force the (re-)download of the configuration files, overriding the cached versions if
            they exist.
        resume_download:
            Deprecated and ignored. All downloads are now resumed by default when possible.
            Will be removed in v5 of Transformers.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        token (`str` or `bool`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```"""
    use_auth_token = kwargs.pop("use_auth_token", None)
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        if token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        token = use_auth_token

    commit_hash = kwargs.get("_commit_hash", None)
    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
        TOKENIZER_CONFIG_FILE,
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
        proxies=proxies,
        token=token,
        revision=revision,
        local_files_only=local_files_only,
        subfolder=subfolder,
        _raise_exceptions_for_gated_repo=False,
        _raise_exceptions_for_missing_entries=False,
        _raise_exceptions_for_connection_errors=False,
        _commit_hash=commit_hash,
    )
    if resolved_config_file is None:
        logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
        return {}
    commit_hash = extract_commit_hash(resolved_config_file, commit_hash)

    with open(resolved_config_file, encoding="utf-8") as reader:
        result = json.load(reader)
    result["_commit_hash"] = commit_hash
    return result


class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    """

    def __init__(self):
        raise EnvironmentError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )

    @classmethod
    @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                  using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                  single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                  applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*):
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
                a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
                is returned instead.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)
        ```"""
        use_auth_token = kwargs.pop("use_auth_token", None)
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if kwargs.get("token", None) is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            kwargs["token"] = use_auth_token

        config = kwargs.pop("config", None)
        kwargs["_from_auto"] = True

        use_fast = kwargs.pop("use_fast", True)
        tokenizer_type = kwargs.pop("tokenizer_type", None)
        trust_remote_code = kwargs.pop("trust_remote_code", None)
        gguf_file = kwargs.get("gguf_file", None)

        # First, let's see whether the tokenizer_type is passed so that we can leverage it
        if tokenizer_type is not None:
            tokenizer_class = None
            tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)

            if tokenizer_class_tuple is None:
                raise ValueError(
                    f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}."
                )

            tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple

            if use_fast:
                if tokenizer_fast_class_name is not None:
                    tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)
                else:
                    logger.warning(
                        "`use_fast` is set to `True` but the tokenizer class does not have a fast version. "
                        "Falling back to the slow version."
                    )
            if tokenizer_class is None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)

            if tokenizer_class is None:
                raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")

            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Next, let's try to use the tokenizer_config file to get the tokenizer class.
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        if "_commit_hash" in tokenizer_config:
            kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
        tokenizer_auto_map = None
        if "auto_map" in tokenizer_config:
            if isinstance(tokenizer_config["auto_map"], (tuple, list)):
                # Legacy format for dynamic tokenizers
                tokenizer_auto_map = tokenizer_config["auto_map"]
            else:
                tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)

        # If that did not work, let's try to use the config.
        if config_tokenizer_class is None:
            if not isinstance(config, PretrainedConfig):
                if gguf_file:
                    gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **kwargs)
                    config_dict = load_gguf_checkpoint(gguf_path, return_tensors=False)["config"]
                    config = AutoConfig.for_model(**config_dict)
                else:
                    config = AutoConfig.from_pretrained(
                        pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                    )
            config_tokenizer_class = config.tokenizer_class
            if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
                tokenizer_auto_map = config.auto_map["AutoTokenizer"]

        has_remote_code = tokenizer_auto_map is not None
        has_local_code = type(config) in TOKENIZER_MAPPING or (
            config_tokenizer_class is not None
            and (
                tokenizer_class_from_name(config_tokenizer_class) is not None
                or tokenizer_class_from_name(config_tokenizer_class + "Fast") is not None
            )
        )
        trust_remote_code = resolve_trust_remote_code(
            trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
        )

        if has_remote_code and trust_remote_code:
            if use_fast and tokenizer_auto_map[1] is not None:
                class_ref = tokenizer_auto_map[1]
            else:
                class_ref = tokenizer_auto_map[0]
            tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
            _ = kwargs.pop("code_revision", None)
            if os.path.isdir(pretrained_model_name_or_path):
                tokenizer_class.register_for_auto_class()
            return tokenizer_class.from_pretrained(
                pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs
            )
        elif config_tokenizer_class is not None:
            tokenizer_class = None
            if use_fast and not config_tokenizer_class.endswith("Fast"):
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                tokenizer_class_candidate = config_tokenizer_class
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                raise ValueError(
                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                )
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Otherwise we have to be creative.
        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
                )
            config = config.encoder

        model_type = config_class_to_model_type(type(config).__name__)
        if model_type is not None:
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]

            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )

        raise ValueError(
            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
        )

    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False):
        """
        Register a new tokenizer in this mapping.

        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PreTrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class`.")
        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")

        if (
            slow_tokenizer_class is not None
            and fast_tokenizer_class is not None
            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
        ):
            raise ValueError(
                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
                "consistent with the slow tokenizer class you passed (fast tokenizer has "
                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}). Fix one of those "
                "so they match!"
            )

        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
        if config_class in TOKENIZER_MAPPING._extra_content:
            existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
            if slow_tokenizer_class is None:
                slow_tokenizer_class = existing_slow
            if fast_tokenizer_class is None:
                fast_tokenizer_class = existing_fast

        TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class), exist_ok=exist_ok)
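

# A minimal usage sketch for the registration API above. The config and tokenizer classes named here are
# hypothetical placeholders; any `PretrainedConfig` subclass paired with matching slow/fast tokenizer
# classes would follow the same pattern:
#
#     from transformers import AutoConfig, AutoTokenizer
#
#     AutoConfig.register("my-new-model", MyNewModelConfig)
#     AutoTokenizer.register(
#         MyNewModelConfig,
#         slow_tokenizer_class=MyNewModelTokenizer,
#         fast_tokenizer_class=MyNewModelTokenizerFast,
#     )
#     tokenizer = AutoTokenizer.from_pretrained("path/to/my-new-model")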