__init__.pyi

# Generated content DO NOT EDIT
class AddedToken:
    """
    Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
    It can have special options that define the way it should behave.

    Args:
        content (:obj:`str`): The content of the token

        single_word (:obj:`bool`, defaults to :obj:`False`):
            Defines whether this token should only match single words. If :obj:`True`, this
            token will never match inside of a word. For example the token ``ing`` would match
            on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
            The notion of "`inside of a word`" is defined by the word boundaries pattern in
            regular expressions (ie. the token should start and end with word boundaries).

        lstrip (:obj:`bool`, defaults to :obj:`False`):
            Defines whether this token should strip all potential whitespaces on its left side.
            If :obj:`True`, this token will greedily match any whitespace on its left. For
            example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
            ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).

        rstrip (:obj:`bool`, defaults to :obj:`False`):
            Defines whether this token should strip all potential whitespaces on its right
            side. If :obj:`True`, this token will greedily match any whitespace on its right.
            It works just like :obj:`lstrip` but on the right.

        normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
            Defines whether this token should match against the normalized version of the input
            text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
            lowercasing the text, the token could be extracted from the input ``"I saw a lion
            Yesterday"``.

        special (:obj:`bool`, defaults to :obj:`False` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
            Defines whether this token should be skipped when decoding.
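
    Example:
        A minimal illustrative sketch (``tokenizer`` is assumed to be any existing
        :class:`~tokenizers.Tokenizer` instance)::

            from tokenizers import AddedToken

            # greedily absorb the space on the left, and skip this token when decoding
            mask = AddedToken("[MASK]", lstrip=True, special=True)
            tokenizer.add_special_tokens([mask])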
    """

    def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
        pass
    @property
    def content(self):
        """
        Get the content of this :obj:`AddedToken`
        """
        pass
    @property
    def lstrip(self):
        """
        Get the value of the :obj:`lstrip` option
        """
        pass
    @property
    def normalized(self):
        """
        Get the value of the :obj:`normalized` option
        """
        pass
    @property
    def rstrip(self):
        """
        Get the value of the :obj:`rstrip` option
        """
        pass
    @property
    def single_word(self):
        """
        Get the value of the :obj:`single_word` option
        """
        pass
    @property
    def special(self):
        """
        Get the value of the :obj:`special` option
        """
        pass

class Encoding:
    """
    The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
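
    Example:
        An illustrative sketch of inspecting an encoding (``tokenizer`` is assumed
        to be any loaded :class:`~tokenizers.Tokenizer`)::

            # encode a single raw sentence, then inspect the parallel lists
            encoding = tokenizer.encode("Hello world")
            print(encoding.tokens)          # string form of each token
            print(encoding.ids)             # token indices fed to the model
            print(encoding.offsets)         # (start, end) span in the input text
            print(encoding.attention_mask)  # 1 for real tokens, 0 for padding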
    """

    @property
    def attention_mask(self):
        """
        The attention mask

        This indicates to the LM which tokens should be attended to, and which should not.
        This is especially important when batching sequences, where we need to apply
        padding.

        Returns:
            :obj:`List[int]`: The attention mask
        """
        pass
    def char_to_token(self, char_pos, sequence_index=0):
        """
        Get the token that contains the char at the given position in the input sequence.

        Args:
            char_pos (:obj:`int`):
                The position of a char in the input string
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target char

        Returns:
            :obj:`int`: The index of the token that contains this char in the encoded sequence
        """
        pass
    def char_to_word(self, char_pos, sequence_index=0):
        """
        Get the word that contains the char at the given position in the input sequence.

        Args:
            char_pos (:obj:`int`):
                The position of a char in the input string
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target char

        Returns:
            :obj:`int`: The index of the word that contains this char in the input sequence
        """
        pass
    @property
    def ids(self):
        """
        The generated IDs

        The IDs are the main input to a Language Model. They are the token indices,
        the numerical representations that a LM understands.

        Returns:
            :obj:`List[int]`: The list of IDs
        """
        pass
    @staticmethod
    def merge(encodings, growing_offsets=True):
        """
        Merge the list of encodings into one final :class:`~tokenizers.Encoding`

        Args:
            encodings (A :obj:`List` of :class:`~tokenizers.Encoding`):
                The list of encodings that should be merged into one
            growing_offsets (:obj:`bool`, defaults to :obj:`True`):
                Whether the offsets should accumulate while merging

        Returns:
            :class:`~tokenizers.Encoding`: The resulting Encoding
        """
        pass
    @property
    def n_sequences(self):
        """
        The number of sequences represented

        Returns:
            :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
        """
        pass
    @property
    def offsets(self):
        """
        The offsets associated to each token

        These offsets let you slice the input string, and thus retrieve the original
        part that led to producing the corresponding token.

        Returns:
            A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
        """
        pass
    @property
    def overflowing(self):
        """
        A :obj:`List` of overflowing :class:`~tokenizers.Encoding`

        When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
        the output into as many pieces as required to match the specified maximum length.
        This field lets you retrieve all the subsequent pieces.

        When you use pairs of sequences, the overflowing pieces will contain enough
        variations to cover all the possible combinations, while respecting the provided
        maximum length.
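
        Example:
            A sketch of retrieving the overflowing pieces (``tokenizer`` is assumed
            to be any trained :class:`~tokenizers.Tokenizer`)::

                # keep at most 8 tokens per piece, repeating 2 tokens of context
                tokenizer.enable_truncation(max_length=8, stride=2)
                encoding = tokenizer.encode("a fairly long sentence that will not fit")
                for piece in encoding.overflowing:
                    print(piece.tokens)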
        """
        pass
    def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
        """
        Pad the :class:`~tokenizers.Encoding` at the given length

        Args:
            length (:obj:`int`):
                The desired length
            direction (:obj:`str`, defaults to :obj:`right`):
                The expected padding direction. Can be either :obj:`right` or :obj:`left`
            pad_id (:obj:`int`, defaults to :obj:`0`):
                The ID corresponding to the padding token
            pad_type_id (:obj:`int`, defaults to :obj:`0`):
                The type ID corresponding to the padding token
            pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
                The pad token to use
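
        Example:
            A minimal sketch (``encoding`` is assumed to be an
            :class:`~tokenizers.Encoding` produced elsewhere)::

                # right-pad in place up to 128 tokens with the default [PAD] token;
                # pad_id should match the id of pad_token in your vocabulary
                encoding.pad(128)
                print(len(encoding.ids))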
        """
        pass
    @property
    def sequence_ids(self):
        """
        The generated sequence indices.

        They represent the index of the input sequence associated to each token.
        The sequence id can be None if the token is not related to any input sequence,
        like for example with special tokens.

        Returns:
            A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
        """
        pass
    def set_sequence_id(self, sequence_id):
        """
        Set the given sequence index

        Set the given sequence index for the whole range of tokens contained in this
        :class:`~tokenizers.Encoding`.
        """
        pass
    @property
    def special_tokens_mask(self):
        """
        The special token mask

        This indicates which tokens are special tokens, and which are not.

        Returns:
            :obj:`List[int]`: The special tokens mask
        """
        pass
    def token_to_chars(self, token_index):
        """
        Get the offsets of the token at the given index.

        The returned offsets are related to the input sequence that contains the
        token. In order to determine in which input sequence it belongs, you
        must call :meth:`~tokenizers.Encoding.token_to_sequence()`.

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
            :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
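
        Example:
            A sketch of recovering the original text behind a token (``text`` and
            ``tokenizer`` are assumed to exist)::

                # encode without special tokens so token 0 maps back to the input
                encoding = tokenizer.encode(text, add_special_tokens=False)
                start, end = encoding.token_to_chars(0)
                print(text[start:end])  # the substring that produced the first token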
        """
        pass
    def token_to_sequence(self, token_index):
        """
        Get the index of the sequence represented by the given token.

        In the general use case, this method returns :obj:`0` for a single sequence or
        the first sequence of a pair, and :obj:`1` for the second sequence of a pair

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
            :obj:`int`: The sequence id of the given token
        """
        pass
    def token_to_word(self, token_index):
        """
        Get the index of the word that contains the token in one of the input sequences.

        The returned word index is related to the input sequence that contains
        the token. In order to determine in which input sequence it belongs, you
        must call :meth:`~tokenizers.Encoding.token_to_sequence()`.

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
            :obj:`int`: The index of the word in the relevant input sequence.
        """
        pass
    @property
    def tokens(self):
        """
        The generated tokens

        They are the string representation of the IDs.

        Returns:
            :obj:`List[str]`: The list of tokens
        """
        pass
    def truncate(self, max_length, stride=0, direction="right"):
        """
        Truncate the :class:`~tokenizers.Encoding` at the given length

        If this :class:`~tokenizers.Encoding` represents multiple sequences, this
        information is lost when truncating. It will be considered as representing
        a single sequence.

        Args:
            max_length (:obj:`int`):
                The desired length
            stride (:obj:`int`, defaults to :obj:`0`):
                The length of previous content to be included in each overflowing piece
            direction (:obj:`str`, defaults to :obj:`right`):
                Truncate direction
        """
        pass
    @property
    def type_ids(self):
        """
        The generated type IDs

        Generally used for tasks like sequence classification or question answering,
        these tokens let the LM know which input sequence corresponds to each token.

        Returns:
            :obj:`List[int]`: The list of type ids
        """
        pass
    @property
    def word_ids(self):
        """
        The generated word indices.

        They represent the index of the word associated to each token.
        When the input is pre-tokenized, they correspond to the ID of the given input label,
        otherwise they correspond to the word indices as defined by the
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.

        For special tokens and such (any token that was generated from something that was
        not part of the input), the output is :obj:`None`

        Returns:
            A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
        """
        pass
    def word_to_chars(self, word_index, sequence_index=0):
        """
        Get the offsets of the word at the given index in one of the input sequences.

        Args:
            word_index (:obj:`int`):
                The index of a word in one of the input sequences.
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target word

        Returns:
            :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
        """
        pass
    def word_to_tokens(self, word_index, sequence_index=0):
        """
        Get the encoded tokens corresponding to the word at the given index
        in one of the input sequences.

        Args:
            word_index (:obj:`int`):
                The index of a word in one of the input sequences.
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target word

        Returns:
            :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
        """
        pass
    @property
    def words(self):
        """
        The generated word indices.

        .. warning::
            This is deprecated and will be removed in a future version.
            Please use :obj:`~tokenizers.Encoding.word_ids` instead.

        They represent the index of the word associated to each token.
        When the input is pre-tokenized, they correspond to the ID of the given input label,
        otherwise they correspond to the word indices as defined by the
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.

        For special tokens and such (any token that was generated from something that was
        not part of the input), the output is :obj:`None`

        Returns:
            A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
        """
        pass

class NormalizedString:
    """
    NormalizedString

    A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
    While making all the requested modifications, it keeps track of the alignment information
    between the two versions of the string.

    Args:
        sequence: str:
            The string sequence used to initialize this NormalizedString
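
    Example:
        An illustrative sketch of chaining in-place normalizations, as one might do
        inside a custom normalizer (the input string is arbitrary)::

            from tokenizers import NormalizedString

            normalized = NormalizedString("  Héllo  ")
            normalized.nfd()        # decompose accented characters
            normalized.lowercase()
            normalized.strip()
            print(normalized.normalized)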
    """

    def append(self, s):
        """
        Append the given sequence to the string
        """
        pass
    def clear(self):
        """
        Clears the string
        """
        pass
    def filter(self, func):
        """
        Filter each character of the string using the given func
        """
        pass
    def for_each(self, func):
        """
        Calls the given function for each character of the string
        """
        pass
    def lowercase(self):
        """
        Lowercase the string
        """
        pass
    def lstrip(self):
        """
        Strip the left of the string
        """
        pass
    def map(self, func):
        """
        Calls the given function for each character of the string

        Replaces each character of the string using the returned value. Each
        returned value **must** be a str of length 1 (ie a character).
        """
        pass
    def nfc(self):
        """
        Runs the NFC normalization
        """
        pass
    def nfd(self):
        """
        Runs the NFD normalization
        """
        pass
    def nfkc(self):
        """
        Runs the NFKC normalization
        """
        pass
    def nfkd(self):
        """
        Runs the NFKD normalization
        """
        pass
    @property
    def normalized(self):
        """
        The normalized part of the string
        """
        pass
    def prepend(self, s):
        """
        Prepend the given sequence to the string
        """
        pass
    def replace(self, pattern, content):
        """
        Replace the content of the given pattern with the provided content

        Args:
            pattern: Pattern:
                A pattern used to match the string. Usually a string or a Regex
            content: str:
                The content to be used as replacement
        """
        pass
    def rstrip(self):
        """
        Strip the right of the string
        """
        pass
    def slice(self, range):
        """
        Slice the string using the given range
        """
        pass
    def split(self, pattern, behavior):
        """
        Split the NormalizedString using the given pattern and the specified behavior

        Args:
            pattern: Pattern:
                A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
            behavior: SplitDelimiterBehavior:
                The behavior to use when splitting.
                Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
                "contiguous"

        Returns:
            A list of NormalizedString, representing each split
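
        Example:
            A sketch of splitting on whitespace while keeping the delimiter attached
            to the preceding piece::

                from tokenizers import NormalizedString

                normalized = NormalizedString("hello world")
                pieces = normalized.split(" ", "merged_with_previous")
                print([p.normalized for p in pieces])  # ["hello ", "world"]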
        """
        pass
    def strip(self):
        """
        Strip both ends of the string
        """
        pass
    def uppercase(self):
        """
        Uppercase the string
        """
        pass

class PreTokenizedString:
    """
    PreTokenizedString

    Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
    underlying string, while keeping track of the alignment information (offsets).

    The PreTokenizedString manages what we call `splits`. Each split represents a substring
    which is a subpart of the original string, with the relevant offsets and tokens.

    When calling one of the methods used to modify the PreTokenizedString (namely one of
    `split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
    tokens will get modified.

    Args:
        sequence: str:
            The string sequence used to initialize this PreTokenizedString
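
    Example:
        A rough sketch of the intended workflow (the whitespace pattern is
        illustrative, and the 3-tuple shape of each split is assumed from the
        defaults of :meth:`get_splits`)::

            from tokenizers import PreTokenizedString

            pretok = PreTokenizedString("Hello world")
            # split every underlying piece on whitespace, dropping the delimiter
            pretok.split(lambda i, ns: ns.split(" ", "removed"))
            for piece, offsets, tokens in pretok.get_splits():
                print(piece, offsets)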
    """

    def __init__(self, sequence):
        pass
    def get_splits(self, offset_referential="original", offset_type="char"):
        """
        Get the splits currently managed by the PreTokenizedString

        Args:
            offset_referential: :obj:`str`
                Whether the returned splits should have offsets expressed relative
                to the original string, or the normalized one. choices: "original", "normalized".
            offset_type: :obj:`str`
                Whether the returned splits should have offsets expressed in bytes or chars.
                When slicing a str, we usually want to use chars, which is the default value.
                Now in some cases it might be interesting to get these offsets expressed in bytes,
                so it is possible to change this here.
                choices: "char", "bytes"

        Returns:
            A list of splits
        """
        pass
    def normalize(self, func):
        """
        Normalize each split of the `PreTokenizedString` using the given `func`

        Args:
            func: Callable[[NormalizedString], None]:
                The function used to normalize each underlying split. This function
                does not need to return anything, just calling the methods on the provided
                NormalizedString allows its modification.
        """
        pass
    def split(self, func):
        """
        Split the PreTokenizedString using the given `func`

        Args:
            func: Callable[[index, NormalizedString], List[NormalizedString]]:
                The function used to split each underlying split.
                It is expected to return a list of `NormalizedString`, that represent the new
                splits. If the given `NormalizedString` does not need any splitting, we can
                just return it directly.
                In order for the offsets to be tracked accurately, any returned `NormalizedString`
                should come from calling either `.split` or `.slice` on the received one.
        """
        pass
    def to_encoding(self, type_id=0, word_idx=None):
        """
        Return an Encoding generated from this PreTokenizedString

        Args:
            type_id: int = 0:
                The type_id to be used on the generated Encoding.
            word_idx: Optional[int] = None:
                An optional word index to be used for each token of this Encoding. If provided,
                all the word indices in the generated Encoding will use this value, instead
                of the one automatically tracked during pre-tokenization.

        Returns:
            An Encoding
        """
        pass
    def tokenize(self, func):
        """
        Tokenize each split of the `PreTokenizedString` using the given `func`

        Args:
            func: Callable[[str], List[Token]]:
                The function used to tokenize each underlying split. This function must return
                a list of Token generated from the input str.
        """
        pass

class Regex:
    """
    Instantiate a new Regex with the given pattern
    """

    def __init__(self, pattern):
        pass

class Token:
    pass

class Tokenizer:
    """
    A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
    and outputs an :class:`~tokenizers.Encoding`.

    Args:
        model (:class:`~tokenizers.models.Model`):
            The core algorithm that this :obj:`Tokenizer` should be using.
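
    Example:
        A minimal end-to-end sketch (the tiny in-memory corpus is illustrative
        only)::

            from tokenizers import Tokenizer
            from tokenizers.models import BPE
            from tokenizers.pre_tokenizers import Whitespace
            from tokenizers.trainers import BpeTrainer

            # build the pipeline around a BPE model, then train and encode
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            tokenizer.pre_tokenizer = Whitespace()
            trainer = BpeTrainer(special_tokens=["[UNK]"])
            tokenizer.train_from_iterator(["Hello world", "Hello there"], trainer)
            print(tokenizer.encode("Hello world").tokens)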
    """

    def __init__(self, model):
        pass
    def add_special_tokens(self, tokens):
        """
        Add the given special tokens to the Tokenizer.

        If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
        them. If they don't exist, the Tokenizer creates them, giving them a new id.

        These special tokens will never be processed by the model (ie won't be split into
        multiple tokens), and they can be removed from the output when decoding.

        Args:
            tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
                The list of special tokens we want to add to the vocabulary. Each token can either
                be a string or an instance of :class:`~tokenizers.AddedToken` for more
                customization.

        Returns:
            :obj:`int`: The number of tokens that were created in the vocabulary
        """
        pass
    def add_tokens(self, tokens):
        """
        Add the given tokens to the vocabulary

        The given tokens are added only if they don't already exist in the vocabulary.
        Each token then gets a newly attributed id.

        Args:
            tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
                The list of tokens we want to add to the vocabulary. Each token can be either a
                string or an instance of :class:`~tokenizers.AddedToken` for more customization.

        Returns:
            :obj:`int`: The number of tokens that were created in the vocabulary
        """
        pass
    def decode(self, ids, skip_special_tokens=True):
        """
        Decode the given list of ids back to a string

        This is used to decode anything coming back from a Language Model

        Args:
            ids (A :obj:`List/Tuple` of :obj:`int`):
                The list of ids that we want to decode
            skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether the special tokens should be removed from the decoded string

        Returns:
            :obj:`str`: The decoded string
        """
        pass
    def decode_batch(self, sequences, skip_special_tokens=True):
        """
        Decode a batch of ids back to their corresponding string

        Args:
            sequences (:obj:`List` of :obj:`List[int]`):
                The batch of sequences we want to decode
            skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether the special tokens should be removed from the decoded strings

        Returns:
            :obj:`List[str]`: A list of decoded strings
        """
        pass
    @property
    def decoder(self):
        """
        The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
        """
        pass
    def enable_padding(
        self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
    ):
        """
        Enable the padding

        Args:
            direction (:obj:`str`, `optional`, defaults to :obj:`right`):
                The direction in which to pad. Can be either ``right`` or ``left``
            pad_to_multiple_of (:obj:`int`, `optional`):
                If specified, the padding length should always snap to the next multiple of the
                given value. For example if we were going to pad with a length of 250 but
                ``pad_to_multiple_of=8`` then we will pad to 256.
            pad_id (:obj:`int`, defaults to 0):
                The id to be used when padding
            pad_type_id (:obj:`int`, defaults to 0):
                The type id to be used when padding
            pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
                The pad token to be used when padding
            length (:obj:`int`, `optional`):
                If specified, the length at which to pad. If not specified we pad using the size of
                the longest sequence in a batch.
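
        Example:
            A sketch of batch padding (``tokenizer`` is assumed to be trained, with
            ``[PAD]`` in its vocabulary)::

                # pad every encoding in a batch to the longest member, in
                # multiples of 8 (often friendlier to accelerator kernels);
                # pad_id should match the id of pad_token in your vocabulary
                tokenizer.enable_padding(pad_token="[PAD]", pad_to_multiple_of=8)
                encodings = tokenizer.encode_batch(["short", "a much longer sentence"])
                print([len(e.ids) for e in encodings])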
        """
        pass
    def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
        """
        Enable truncation

        Args:
            max_length (:obj:`int`):
                The max length at which to truncate
            stride (:obj:`int`, `optional`):
                The length of the previous first sequence to be included in the overflowing
                sequence
            strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
                The strategy used for truncation. Can be one of ``longest_first``, ``only_first`` or
                ``only_second``.
            direction (:obj:`str`, defaults to :obj:`right`):
                Truncate direction
        """
        pass
    def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
        """
        Encode the given sequence and pair. This method can process raw text sequences
        as well as already pre-tokenized sequences.

        Example:
            Here are some examples of the inputs that are accepted::

                encode("A single sequence")
                encode("A sequence", "And its pair")
                encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
                encode(
                    [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
                    is_pretokenized=True
                )

        Args:
            sequence (:obj:`~tokenizers.InputSequence`):
                The main input sequence we want to encode. This sequence can be either raw
                text or pre-tokenized, according to the ``is_pretokenized`` argument:

                - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`

            pair (:obj:`~tokenizers.InputSequence`, `optional`):
                An optional input sequence. The expected format is the same as for ``sequence``.
            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
                Whether the input is already pre-tokenized
            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether to add the special tokens

        Returns:
            :class:`~tokenizers.Encoding`: The encoded result
        """
        pass
    def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
        """
        Encode the given batch of inputs. This method accepts both raw text sequences
        and already pre-tokenized sequences. The reason we use `PySequence` is that it
        allows type checking with zero cost (according to PyO3), as we don't have to
        convert to check.

        Example:
            Here are some examples of the inputs that are accepted::

                encode_batch([
                    "A single sequence",
                    ("A tuple with a sequence", "And its pair"),
                    [ "A", "pre", "tokenized", "sequence" ],
                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
                ])

        Args:
            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
                A list of single sequences or pair sequences to encode. Each sequence
                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
                argument:

                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`

            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
                Whether the input is already pre-tokenized
            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether to add the special tokens

        Returns:
            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
        """
        pass
    def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
        """
        Encode the given batch of inputs. This method is faster than `encode_batch`
        because it doesn't keep track of offsets; they will all be zeros.

        Example:
            Here are some examples of the inputs that are accepted::

                encode_batch_fast([
                    "A single sequence",
                    ("A tuple with a sequence", "And its pair"),
                    [ "A", "pre", "tokenized", "sequence" ],
                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
                ])

        Args:
            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
                A list of single sequences or pair sequences to encode. Each sequence
                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
                argument:

                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`

            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
                Whether the input is already pre-tokenized
            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether to add the special tokens

        Returns:
            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
        """
        pass
    @property
    def encode_special_tokens(self):
        """
        Modifies the tokenizer in order to use or not the special tokens
        during encoding.

        Args:
            value (:obj:`bool`):
                Whether to use the special tokens or not
        """
        pass
    @staticmethod
    def from_buffer(buffer):
        """
        Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.

        Args:
            buffer (:obj:`bytes`):
                A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`

        Returns:
            :class:`~tokenizers.Tokenizer`: The new tokenizer
        """
        pass
    @staticmethod
    def from_file(path):
        """
        Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.

        Args:
            path (:obj:`str`):
                A path to a local JSON file representing a previously serialized
                :class:`~tokenizers.Tokenizer`

        Returns:
            :class:`~tokenizers.Tokenizer`: The new tokenizer
        """
        pass
    @staticmethod
    def from_pretrained(identifier, revision="main", token=None):
        """
        Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
        Hugging Face Hub.

        Args:
            identifier (:obj:`str`):
                The identifier of a Model on the Hugging Face Hub, that contains
                a tokenizer.json file
            revision (:obj:`str`, defaults to `main`):
                A branch or commit id
            token (:obj:`str`, `optional`, defaults to `None`):
                An optional auth token used to access private repositories on the
                Hugging Face Hub

        Returns:
            :class:`~tokenizers.Tokenizer`: The new tokenizer
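
        Example:
            A sketch of loading a public tokenizer from the Hub (requires network
            access; the model identifier is just an example)::

                from tokenizers import Tokenizer

                tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
                print(tokenizer.encode("Hello world").tokens)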
        """
        pass
    @staticmethod
    def from_str(json):
        """
        Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.

        Args:
            json (:obj:`str`):
                A valid JSON string representing a previously serialized
                :class:`~tokenizers.Tokenizer`

        Returns:
            :class:`~tokenizers.Tokenizer`: The new tokenizer
        """
        pass
    def get_added_tokens_decoder(self):
        """
        Get the map of ids to their added tokens

        Returns:
            :obj:`Dict[int, AddedToken]`: The added tokens, indexed by id
        """
        pass
    def get_vocab(self, with_added_tokens=True):
        """
        Get the underlying vocabulary

        Args:
            with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether to include the added tokens

        Returns:
            :obj:`Dict[str, int]`: The vocabulary
        """
        pass
    def get_vocab_size(self, with_added_tokens=True):
        """
        Get the size of the underlying vocabulary

        Args:
            with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether to include the added tokens

        Returns:
            :obj:`int`: The size of the vocabulary
        """
        pass
    def id_to_token(self, id):
        """
        Convert the given id to its corresponding token if it exists

        Args:
            id (:obj:`int`):
                The id to convert

        Returns:
            :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
        """
        pass
    @property
    def model(self):
        """
        The :class:`~tokenizers.models.Model` in use by the Tokenizer
        """
        pass
    def no_padding(self):
        """
        Disable padding
        """
        pass
    def no_truncation(self):
        """
        Disable truncation
        """
        pass
    @property
    def normalizer(self):
        """
        The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
        """
        pass
    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.

        Args:
            is_pair (:obj:`bool`):
                Whether the input would be a pair of sequences rather than a single sentence

        Returns:
            :obj:`int`: The number of special tokens that would be added
        """
        pass
    @property
    def padding(self):
        """
        Get the current padding parameters

        `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`

        Returns:
            (:obj:`dict`, `optional`):
                A dict with the current padding parameters if padding is enabled
        """
        pass
    def post_process(self, encoding, pair=None, add_special_tokens=True):
        """
        Apply all the post-processing steps to the given encodings.

        The various steps are:

        1. Truncate according to the set truncation params (provided with
           :meth:`~tokenizers.Tokenizer.enable_truncation`)
        2. Apply the :class:`~tokenizers.processors.PostProcessor`
        3. Pad according to the set padding params (provided with
           :meth:`~tokenizers.Tokenizer.enable_padding`)

        Args:
            encoding (:class:`~tokenizers.Encoding`):
                The :class:`~tokenizers.Encoding` corresponding to the main sequence.
            pair (:class:`~tokenizers.Encoding`, `optional`):
                An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
            add_special_tokens (:obj:`bool`):
                Whether to add the special tokens

        Returns:
            :class:`~tokenizers.Encoding`: The final post-processed encoding
        """
        pass
    @property
    def post_processor(self):
        """
        The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
        """
        pass
    @property
    def pre_tokenizer(self):
        """
        The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
        """
        pass
    def save(self, path, pretty=True):
        """
        Save the :class:`~tokenizers.Tokenizer` to the file at the given path.

        Args:
            path (:obj:`str`):
                A path to a file in which to save the serialized tokenizer.
            pretty (:obj:`bool`, defaults to :obj:`True`):
                Whether the JSON file should be pretty formatted.
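
        Example:
            A round-trip sketch (``tokenizer`` is assumed to exist; the file name is
            arbitrary)::

                tokenizer.save("tokenizer.json")
                restored = Tokenizer.from_file("tokenizer.json")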
        """
        pass
    def to_str(self, pretty=False):
        """
        Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.

        Args:
            pretty (:obj:`bool`, defaults to :obj:`False`):
                Whether the JSON string should be pretty formatted.

        Returns:
            :obj:`str`: A string representing the serialized Tokenizer
        """
        pass
    def token_to_id(self, token):
        """
        Convert the given token to its corresponding id if it exists

        Args:
            token (:obj:`str`):
                The token to convert

        Returns:
            :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
        """
        pass
    def train(self, files, trainer=None):
        """
        Train the Tokenizer using the given files.

        Reads the files line by line, while keeping all the whitespace, even new lines.
        If you want to train from data stored in memory, you can check
        :meth:`~tokenizers.Tokenizer.train_from_iterator`

        Args:
            files (:obj:`List[str]`):
                A list of paths to the files that we should use for training
            trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
                An optional trainer that should be used to train our Model
        """
        pass
    def train_from_iterator(self, iterator, trainer=None, length=None):
        """
        Train the Tokenizer using the provided iterator.

        You can provide anything that is a Python Iterator:

        * A list of sequences :obj:`List[str]`
        * A generator that yields :obj:`str` or :obj:`List[str]`
        * A Numpy array of strings
        * ...

        Args:
            iterator (:obj:`Iterator`):
                Any iterator over strings or list of strings
            trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
                An optional trainer that should be used to train our Model
            length (:obj:`int`, `optional`):
                The total number of sequences in the iterator. This is used to
                provide meaningful progress tracking
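
        Example:
            A sketch of streaming training from a generator, assuming ``tokenizer``
            wraps a BPE model (the corpus path is illustrative)::

                from tokenizers.trainers import BpeTrainer

                def lines():
                    with open("corpus.txt", encoding="utf-8") as f:
                        for line in f:
                            yield line

                tokenizer.train_from_iterator(lines(), trainer=BpeTrainer())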
        """
        pass
    @property
    def truncation(self):
        """
        Get the currently set truncation parameters

        `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`

        Returns:
            (:obj:`dict`, `optional`):
                A dict with the current truncation parameters if truncation is enabled
        """
        pass