# Generated content DO NOT EDIT
class Model:
    """
    Base class for all models

    The model represents the actual tokenization algorithm. This is the part that
    will contain and manage the learned vocabulary.

    This class cannot be constructed directly. Please use one of the concrete models.
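
    Example:
        A model is typically wrapped in a :class:`~tokenizers.Tokenizer`. A minimal
        sketch, assuming a BPE model (the :obj:`unk_token` value is illustrative)::

            from tokenizers import Tokenizer
            from tokenizers.models import BPE

            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            model = tokenizer.model  # the concrete Model instance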
  8. """
  9. def get_trainer(self):
  10. """
  11. Get the associated :class:`~tokenizers.trainers.Trainer`
  12. Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
  13. :class:`~tokenizers.models.Model`.
  14. Returns:
  15. :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
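
        Example:
            A sketch of training a wrapped model with its default trainer (the
            file name is illustrative)::

                trainer = tokenizer.model.get_trainer()
                tokenizer.train(["corpus.txt"], trainer=trainer)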
  16. """
  17. pass
  18. def id_to_token(self, id):
  19. """
  20. Get the token associated to an ID
  21. Args:
  22. id (:obj:`int`):
  23. An ID to convert to a token
  24. Returns:
  25. :obj:`str`: The token associated to the ID
  26. """
  27. pass
  28. def save(self, folder, prefix):
  29. """
  30. Save the current model
  31. Save the current model in the given folder, using the given prefix for the various
  32. files that will get created.
  33. Any file with the same name that already exists in this folder will be overwritten.
  34. Args:
  35. folder (:obj:`str`):
  36. The path to the target folder in which to save the various files
  37. prefix (:obj:`str`, `optional`):
  38. An optional prefix, used to prefix each file name
  39. Returns:
  40. :obj:`List[str]`: The list of saved files
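
        Example:
            A sketch of saving a trained BPE model into an existing folder (paths
            and prefix are illustrative)::

                files = tokenizer.model.save("models", prefix="my-bpe")
                # e.g. ["models/my-bpe-vocab.json", "models/my-bpe-merges.txt"]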
  41. """
  42. pass
  43. def token_to_id(self, tokens):
  44. """
  45. Get the ID associated to a token
  46. Args:
  47. token (:obj:`str`):
  48. A token to convert to an ID
  49. Returns:
  50. :obj:`int`: The ID associated to the token
  51. """
  52. pass
  53. def tokenize(self, sequence):
  54. """
  55. Tokenize a sequence
  56. Args:
  57. sequence (:obj:`str`):
  58. A sequence to tokenize
  59. Returns:
  60. A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
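
        Example:
            A sketch of tokenizing a single word with a trained model; each
            returned :class:`~tokenizers.Token` exposes an id, a value and
            offsets::

                for token in tokenizer.model.tokenize("unbelievable"):
                    print(token.id, token.value, token.offsets)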
  61. """
  62. pass
class BPE(Model):
    """
    An implementation of the BPE (Byte-Pair Encoding) algorithm

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        merges (:obj:`List[Tuple[str, str]]`, `optional`):
            A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`

        cache_capacity (:obj:`int`, `optional`):
            The number of words that the BPE cache can contain. The cache speeds up
            the process by keeping the result of the merge operations for a number
            of words.

        dropout (:obj:`float`, `optional`):
            A float between 0 and 1 that represents the BPE dropout to use.

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.

        continuing_subword_prefix (:obj:`str`, `optional`):
            The prefix to attach to subword units that don't represent a beginning of word.

        end_of_word_suffix (:obj:`str`, `optional`):
            The suffix to attach to subword units that represent an end of word.

        fuse_unk (:obj:`bool`, `optional`):
            Whether to fuse any subsequent unknown tokens into a single one

        byte_fallback (:obj:`bool`, `optional`):
            Whether to use the SentencePiece byte-fallback trick (defaults to :obj:`False`)

        ignore_merges (:obj:`bool`, `optional`):
            Whether or not to match tokens with the vocab before using merges.
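
    Example:
        A minimal sketch with a hand-built toy vocabulary (all values are
        illustrative)::

            from tokenizers.models import BPE

            vocab = {"a": 0, "b": 1, "ab": 2}
            merges = [("a", "b")]
            bpe = BPE(vocab=vocab, merges=merges)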
  89. """
  90. def __init__(
  91. self,
  92. vocab=None,
  93. merges=None,
  94. cache_capacity=None,
  95. dropout=None,
  96. unk_token=None,
  97. continuing_subword_prefix=None,
  98. end_of_word_suffix=None,
  99. fuse_unk=None,
  100. byte_fallback=False,
  101. ignore_merges=False,
  102. ):
  103. pass
    @staticmethod
    def from_file(vocab, merges, **kwargs):
        """
        Instantiate a BPE model from the given files.

        This method is roughly equivalent to doing::

            vocab, merges = BPE.read_file(vocab_filename, merges_filename)
            bpe = BPE(vocab, merges)

        If you don't need to keep the :obj:`vocab, merges` values lying around,
        this method is more optimized than manually calling
        :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

            merges (:obj:`str`):
                The path to a :obj:`merges.txt` file

        Returns:
            :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
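
        Example:
            A sketch of loading from the standard files; any extra keyword
            arguments are forwarded to the :class:`~tokenizers.models.BPE`
            constructor (file names are illustrative)::

                bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="<unk>")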
  121. """
  122. pass
    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass
    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass
    @staticmethod
    def read_file(vocab, merges):
        """
        Read a :obj:`vocab.json` and a :obj:`merges.txt` file

        This method provides a way to read and parse the content of these files,
        returning the relevant data structures. If you want to instantiate some BPE models
        from memory, this method gives you the expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

            merges (:obj:`str`):
                The path to a :obj:`merges.txt` file

        Returns:
            A :obj:`Tuple` with the vocab and the merges:
                The vocabulary and merges loaded into memory
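
        Example:
            A sketch of reading the standard files, patching the vocabulary in
            memory, then building the model (the added token is illustrative)::

                vocab, merges = BPE.read_file("vocab.json", "merges.txt")
                vocab["<mask>"] = len(vocab)
                bpe = BPE(vocab, merges)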
  157. """
  158. pass
    def save(self, folder, prefix):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass
    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
class Unigram(Model):
    """
    An implementation of the Unigram algorithm

    Args:
        vocab (:obj:`List[Tuple[str, float]]`, `optional`):
            A list of vocabulary items and their relative score :obj:`[("am", -0.2442),...]`

        unk_id (:obj:`int`, `optional`):
            The id of the unknown token in the vocabulary

        byte_fallback (:obj:`bool`, `optional`):
            Whether to use the byte-fallback trick (defaults to :obj:`False`)
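
    Example:
        A minimal sketch with toy scores (all values are illustrative)::

            from tokenizers.models import Unigram

            vocab = [("<unk>", 0.0), ("am", -0.2442), ("a", -1.5)]
            unigram = Unigram(vocab, unk_id=0, byte_fallback=False)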
  200. """
  201. def __init__(self, vocab, unk_id, byte_fallback):
  202. pass
    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass
    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass
    def save(self, folder, prefix):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass
    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
class WordLevel(Model):
    """
    An implementation of the WordLevel algorithm

    The simplest tokenizer model, based on mapping tokens to their corresponding id.

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.
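
    Example:
        A minimal sketch with a toy vocabulary (all values are illustrative)::

            from tokenizers.models import WordLevel

            vocab = {"[UNK]": 0, "i": 1, "am": 2}
            wordlevel = WordLevel(vocab, unk_token="[UNK]")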
  266. """
  267. def __init__(self, vocab, unk_token):
  268. pass
  269. @staticmethod
  270. def from_file(vocab, unk_token):
  271. """
  272. Instantiate a WordLevel model from the given file
  273. This method is roughly equivalent to doing::
  274. vocab = WordLevel.read_file(vocab_filename)
  275. wordlevel = WordLevel(vocab)
  276. If you don't need to keep the :obj:`vocab` values lying around, this method is
  277. more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
  278. initialize a :class:`~tokenizers.models.WordLevel`
  279. Args:
  280. vocab (:obj:`str`):
  281. The path to a :obj:`vocab.json` file
  282. Returns:
  283. :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
  284. """
  285. pass
    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass
    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass
    @staticmethod
    def read_file(vocab):
        """
        Read a :obj:`vocab.json`

        This method provides a way to read and parse the content of a vocabulary file,
        returning the relevant data structures. If you want to instantiate some WordLevel models
        from memory, this method gives you the expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

        Returns:
            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
        """
        pass
    def save(self, folder, prefix):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass
    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
class WordPiece(Model):
    """
    An implementation of the WordPiece algorithm

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.

        max_input_chars_per_word (:obj:`int`, `optional`):
            The maximum number of characters to allow in a single word.
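
    Example:
        A minimal sketch with a toy vocabulary; continuation pieces use the
        conventional ``##`` prefix (all values are illustrative)::

            from tokenizers.models import WordPiece

            vocab = {"[UNK]": 0, "un": 1, "##believable": 2}
            wordpiece = WordPiece(vocab, unk_token="[UNK]", max_input_chars_per_word=100)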
  364. """
  365. def __init__(self, vocab, unk_token, max_input_chars_per_word):
  366. pass
  367. @staticmethod
  368. def from_file(vocab, **kwargs):
  369. """
  370. Instantiate a WordPiece model from the given file
  371. This method is roughly equivalent to doing::
  372. vocab = WordPiece.read_file(vocab_filename)
  373. wordpiece = WordPiece(vocab)
  374. If you don't need to keep the :obj:`vocab` values lying around, this method is
  375. more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
  376. initialize a :class:`~tokenizers.models.WordPiece`
  377. Args:
  378. vocab (:obj:`str`):
  379. The path to a :obj:`vocab.txt` file
  380. Returns:
  381. :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
  382. """
  383. pass
    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass
    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass
    @staticmethod
    def read_file(vocab):
        """
        Read a :obj:`vocab.txt` file

        This method provides a way to read and parse the content of a standard :obj:`vocab.txt`
        file as used by the WordPiece Model, returning the relevant data structures. If you
        want to instantiate some WordPiece models from memory, this method gives you the
        expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.txt` file

        Returns:
            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
        """
        pass
    def save(self, folder, prefix):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass
    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass