__init__.pyi
# Generated content DO NOT EDIT
class PostProcessor:
    """
    Base class for all post-processors

    This class is not supposed to be instantiated directly. Instead, any implementation of
    a PostProcessor will return an instance of this class when instantiated.
    """

    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.

        Args:
            is_pair (:obj:`bool`):
                Whether the input would be a pair of sequences

        Returns:
            :obj:`int`: The number of tokens to add
        """
        pass

    def process(self, encoding, pair=None, add_special_tokens=True):
        """
        Post-process the given encodings, generating the final one

        Args:
            encoding (:class:`~tokenizers.Encoding`):
                The encoding for the first sequence

            pair (:class:`~tokenizers.Encoding`, `optional`):
                The encoding for the pair sequence

            add_special_tokens (:obj:`bool`):
                Whether to add the special tokens

        Returns:
            :class:`~tokenizers.Encoding`: The final encoding
        """
        pass
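
# Usage sketch (not part of the generated stub), assuming the `tokenizers` package is
# installed: PostProcessor itself is never built directly, but every concrete
# implementation exposes this same interface. The ("[SEP]", 102) / ("[CLS]", 101)
# pairs are the usual bert-base-uncased values, used here purely for illustration.
from tokenizers.processors import BertProcessing

example_proc = BertProcessing(("[SEP]", 102), ("[CLS]", 101))
assert example_proc.num_special_tokens_to_add(False) == 2  # [CLS] ... [SEP]
assert example_proc.num_special_tokens_to_add(True) == 3   # [CLS] A [SEP] B [SEP]
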
class BertProcessing(PostProcessor):
    """
    This post-processor takes care of adding the special tokens needed by
    a Bert model:

    - a SEP token
    - a CLS token

    Args:
        sep (:obj:`Tuple[str, int]`):
            A tuple with the string representation of the SEP token, and its id

        cls (:obj:`Tuple[str, int]`):
            A tuple with the string representation of the CLS token, and its id
    """

    def __init__(self, sep, cls):
        pass

    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.

        Args:
            is_pair (:obj:`bool`):
                Whether the input would be a pair of sequences

        Returns:
            :obj:`int`: The number of tokens to add
        """
        pass

    def process(self, encoding, pair=None, add_special_tokens=True):
        """
        Post-process the given encodings, generating the final one

        Args:
            encoding (:class:`~tokenizers.Encoding`):
                The encoding for the first sequence

            pair (:class:`~tokenizers.Encoding`, `optional`):
                The encoding for the pair sequence

            add_special_tokens (:obj:`bool`):
                Whether to add the special tokens

        Returns:
            :class:`~tokenizers.Encoding`: The final encoding
        """
        pass
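
# Hedged sketch: wiring BertProcessing into a WordPiece tokenizer. The ids 101/102
# match bert-base-uncased's [CLS]/[SEP] and are assumptions; substitute the ids from
# your own vocabulary.
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.processors import BertProcessing

bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
bert_tokenizer.post_processor = BertProcessing(("[SEP]", 102), ("[CLS]", 101))
# encode() now yields "[CLS] ... [SEP]" for single inputs and
# "[CLS] ... [SEP] ... [SEP]" for pairs.
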
class ByteLevel(PostProcessor):
    """
    This post-processor takes care of trimming the offsets.

    By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
    want the offsets to include these whitespaces, then this PostProcessor must be used.

    Args:
        trim_offsets (:obj:`bool`):
            Whether to trim the whitespaces from the produced offsets.
    """

    def __init__(self, trim_offsets=True):
        pass

    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.

        Args:
            is_pair (:obj:`bool`):
                Whether the input would be a pair of sequences

        Returns:
            :obj:`int`: The number of tokens to add
        """
        pass

    def process(self, encoding, pair=None, add_special_tokens=True):
        """
        Post-process the given encodings, generating the final one

        Args:
            encoding (:class:`~tokenizers.Encoding`):
                The encoding for the first sequence

            pair (:class:`~tokenizers.Encoding`, `optional`):
                The encoding for the pair sequence

            add_special_tokens (:obj:`bool`):
                Whether to add the special tokens

        Returns:
            :class:`~tokenizers.Encoding`: The final encoding
        """
        pass
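
# Hedged sketch: pairing the ByteLevel post-processor with a ByteLevel pre-tokenizer,
# so the offsets reported in encodings exclude the leading whitespace that byte-level
# BPE folds into tokens. BPE() starts from an empty vocabulary here; a real setup
# would load or train one.
from tokenizers import Tokenizer, pre_tokenizers, processors
from tokenizers.models import BPE

bpe_tokenizer = Tokenizer(BPE())
bpe_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
bpe_tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
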
class RobertaProcessing(PostProcessor):
    """
    This post-processor takes care of adding the special tokens needed by
    a Roberta model:

    - a SEP token
    - a CLS token

    It also takes care of trimming the offsets.
    By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
    want the offsets to include these whitespaces, then this PostProcessor should be initialized
    with :obj:`trim_offsets=True`.

    Args:
        sep (:obj:`Tuple[str, int]`):
            A tuple with the string representation of the SEP token, and its id

        cls (:obj:`Tuple[str, int]`):
            A tuple with the string representation of the CLS token, and its id

        trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to trim the whitespaces from the produced offsets.

        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether the add_prefix_space option was enabled during pre-tokenization. This
            is relevant because it defines the way the offsets are trimmed out.
    """

    def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
        pass

    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.

        Args:
            is_pair (:obj:`bool`):
                Whether the input would be a pair of sequences

        Returns:
            :obj:`int`: The number of tokens to add
        """
        pass

    def process(self, encoding, pair=None, add_special_tokens=True):
        """
        Post-process the given encodings, generating the final one

        Args:
            encoding (:class:`~tokenizers.Encoding`):
                The encoding for the first sequence

            pair (:class:`~tokenizers.Encoding`, `optional`):
                The encoding for the pair sequence

            add_special_tokens (:obj:`bool`):
                Whether to add the special tokens

        Returns:
            :class:`~tokenizers.Encoding`: The final encoding
        """
        pass
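
# Hedged sketch: the ("</s>", 2) and ("<s>", 0) pairs below are the roberta-base
# defaults and are assumptions for illustration.
from tokenizers.processors import RobertaProcessing

roberta_proc = RobertaProcessing(("</s>", 2), ("<s>", 0), trim_offsets=True, add_prefix_space=True)
# Single sequences become "<s> ... </s>"; pairs become "<s> A </s></s> B </s>".
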
class Sequence(PostProcessor):
    """
    Sequence Processor

    Args:
        processors (:obj:`List[PostProcessor]`):
            The processors that need to be chained
    """

    def __init__(self, processors):
        pass

    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.

        Args:
            is_pair (:obj:`bool`):
                Whether the input would be a pair of sequences

        Returns:
            :obj:`int`: The number of tokens to add
        """
        pass

    def process(self, encoding, pair=None, add_special_tokens=True):
        """
        Post-process the given encodings, generating the final one

        Args:
            encoding (:class:`~tokenizers.Encoding`):
                The encoding for the first sequence

            pair (:class:`~tokenizers.Encoding`, `optional`):
                The encoding for the pair sequence

            add_special_tokens (:obj:`bool`):
                Whether to add the special tokens

        Returns:
            :class:`~tokenizers.Encoding`: The final encoding
        """
        pass
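
# Hedged sketch: chaining processors with Sequence, assuming a tokenizers build that
# ships processors.Sequence (as this stub documents). A RoBERTa-style setup often
# trims byte-level offsets first, then adds the special tokens; the token ids are
# the roberta-base values again, purely for illustration.
from tokenizers import processors

chained = processors.Sequence(
    [
        processors.ByteLevel(trim_offsets=True),
        processors.RobertaProcessing(("</s>", 2), ("<s>", 0)),
    ]
)
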
class TemplateProcessing(PostProcessor):
    """
    Provides a way to specify templates in order to add the special tokens to each
    input sequence as relevant.

    Let's take :obj:`BERT` tokenizer as an example. It uses two special tokens, used to
    delimit each sequence. :obj:`[CLS]` is always used at the beginning of the first
    sequence, and :obj:`[SEP]` is added at the end of both the first, and the pair
    sequences. The final result looks like this:

    - Single sequence: :obj:`[CLS] Hello there [SEP]`
    - Pair sequences: :obj:`[CLS] My name is Anthony [SEP] What is my name? [SEP]`

    With the type ids as follows::

        [CLS]   ...   [SEP]   ...   [SEP]
          0      0      0      1      1

    You can achieve such behavior using a TemplateProcessing::

        TemplateProcessing(
            single="[CLS] $0 [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
        )

    In this example, each input sequence is identified using a ``$`` construct. This identifier
    lets us specify each input sequence, and the type_id to use. When nothing is specified,
    it uses the default values. Here are the different ways to specify it:

    - Specifying the sequence, with default ``type_id == 0``: ``$A`` or ``$B``
    - Specifying the `type_id` with default ``sequence == A``: ``$0``, ``$1``, ``$2``, ...
    - Specifying both: ``$A:0``, ``$B:1``, ...

    The same construct is used for special tokens: ``<identifier>(:<type_id>)?``.

    **Warning**: You must ensure that you are giving the correct tokens/ids as these
    will be added to the Encoding without any further check. If the given ids correspond
    to something totally different in a `Tokenizer` using this `PostProcessor`, it
    might lead to unexpected results.

    Args:
        single (:obj:`Template`):
            The template used for single sequences

        pair (:obj:`Template`):
            The template used when both sequences are specified

        special_tokens (:obj:`Tokens`):
            The list of special tokens used in each sequence

    Types:
        Template (:obj:`str` or :obj:`List`):
            - If a :obj:`str` is provided, the whitespace is used as delimiter between tokens
            - If a :obj:`List[str]` is provided, a list of tokens

        Tokens (:obj:`List[Union[Tuple[int, str], Tuple[str, int], dict]]`):
            - A :obj:`Tuple` with both a token and its associated ID, in any order
            - A :obj:`dict` with the following keys:

                - "id": :obj:`str` => The special token id, as specified in the Template
                - "ids": :obj:`List[int]` => The associated IDs
                - "tokens": :obj:`List[str]` => The associated tokens

            The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
            the same length.
    """

    def __init__(self, single, pair, special_tokens):
        pass

    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.

        Args:
            is_pair (:obj:`bool`):
                Whether the input would be a pair of sequences

        Returns:
            :obj:`int`: The number of tokens to add
        """
        pass

    def process(self, encoding, pair=None, add_special_tokens=True):
        """
        Post-process the given encodings, generating the final one

        Args:
            encoding (:class:`~tokenizers.Encoding`):
                The encoding for the first sequence

            pair (:class:`~tokenizers.Encoding`, `optional`):
                The encoding for the pair sequence

            add_special_tokens (:obj:`bool`):
                Whether to add the special tokens

        Returns:
            :class:`~tokenizers.Encoding`: The final encoding
        """
        pass
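
# Hedged end-to-end sketch: reproducing the BERT layout described above with a toy
# WordLevel vocabulary. All tokens and ids here are made up for the example.
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "Hello": 3, "there": 4}
toy_tokenizer = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
toy_tokenizer.pre_tokenizer = Whitespace()
toy_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

encoding = toy_tokenizer.encode("Hello there")
assert encoding.tokens == ["[CLS]", "Hello", "there", "[SEP]"]
assert toy_tokenizer.post_processor.num_special_tokens_to_add(True) == 3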