- # Generated content DO NOT EDIT
- class AddedToken:
- """
- Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
- It can have special options that define the way it should behave.
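- Example:
- A construction sketch (the token content and options are illustrative)::
- from tokenizers import AddedToken
- mask = AddedToken("[MASK]", lstrip=True, special=True)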
- Args:
- content (:obj:`str`): The content of the token
- single_word (:obj:`bool`, defaults to :obj:`False`):
- Defines whether this token should only match single words. If :obj:`True`, this
- token will never match inside of a word. For example the token ``ing`` would match
- on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
- The notion of "`inside of a word`" is defined by the word boundaries pattern in
- regular expressions (i.e. the token should start and end with word boundaries).
- lstrip (:obj:`bool`, defaults to :obj:`False`):
- Defines whether this token should strip all potential whitespaces on its left side.
- If :obj:`True`, this token will greedily match any whitespace on its left. For
- example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
- ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
- rstrip (:obj:`bool`, defaults to :obj:`False`):
- Defines whether this token should strip all potential whitespaces on its right
- side. If :obj:`True`, this token will greedily match any whitespace on its right.
- It works just like :obj:`lstrip` but on the right.
- normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
- Defines whether this token should match against the normalized version of the input
- text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
- lowercasing the text, the token could be extracted from the input ``"I saw a lion
- Yesterday"``.
- special (:obj:`bool`, defaults to :obj:`False` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`True` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
- Defines whether this token should be skipped when decoding.
- """
- def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
- pass
- @property
- def content(self):
- """
- Get the content of this :obj:`AddedToken`
- """
- pass
- @property
- def lstrip(self):
- """
- Get the value of the :obj:`lstrip` option
- """
- pass
- @property
- def normalized(self):
- """
- Get the value of the :obj:`normalized` option
- """
- pass
- @property
- def rstrip(self):
- """
- Get the value of the :obj:`rstrip` option
- """
- pass
- @property
- def single_word(self):
- """
- Get the value of the :obj:`single_word` option
- """
- pass
- @property
- def special(self):
- """
- Get the value of the :obj:`special` option
- """
- pass
- class Encoding:
- """
- The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
- """
- @property
- def attention_mask(self):
- """
- The attention mask
- This indicates to the LM which tokens should be attended to, and which should not.
- This is especially important when batching sequences, where we need to apply
- padding.
- Returns:
- :obj:`List[int]`: The attention mask
- """
- pass
- def char_to_token(self, char_pos, sequence_index=0):
- """
- Get the token that contains the char at the given position in the input sequence.
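- Example:
- A usage sketch (``tokenizer`` is assumed to be an existing
- :class:`~tokenizers.Tokenizer`; the exact index returned depends on it)::
- encoding = tokenizer.encode("Hello world")
- encoding.char_to_token(6)  # index of the token containing the "w" of "world"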
- Args:
- char_pos (:obj:`int`):
- The position of a char in the input string
- sequence_index (:obj:`int`, defaults to :obj:`0`):
- The index of the sequence that contains the target char
- Returns:
- :obj:`int`: The index of the token that contains this char in the encoded sequence
- """
- pass
- def char_to_word(self, char_pos, sequence_index=0):
- """
- Get the word that contains the char at the given position in the input sequence.
- Args:
- char_pos (:obj:`int`):
- The position of a char in the input string
- sequence_index (:obj:`int`, defaults to :obj:`0`):
- The index of the sequence that contains the target char
- Returns:
- :obj:`int`: The index of the word that contains this char in the input sequence
- """
- pass
- @property
- def ids(self):
- """
- The generated IDs
- The IDs are the main input to a Language Model. They are the token indices,
- the numerical representations that a LM understands.
- Returns:
- :obj:`List[int]`: The list of IDs
- """
- pass
- @staticmethod
- def merge(encodings, growing_offsets=True):
- """
- Merge the list of encodings into one final :class:`~tokenizers.Encoding`
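- Example:
- A merging sketch (``first`` and ``second`` are assumed to be existing
- :class:`~tokenizers.Encoding` instances)::
- from tokenizers import Encoding
- merged = Encoding.merge([first, second], growing_offsets=True)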
- Args:
- encodings (A :obj:`List` of :class:`~tokenizers.Encoding`):
- The list of encodings that should be merged in one
- growing_offsets (:obj:`bool`, defaults to :obj:`True`):
- Whether the offsets should accumulate while merging
- Returns:
- :class:`~tokenizers.Encoding`: The resulting Encoding
- """
- pass
- @property
- def n_sequences(self):
- """
- The number of sequences represented
- Returns:
- :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
- """
- pass
- @property
- def offsets(self):
- """
- The offsets associated to each token
- These offsets let you slice the input string, and thus retrieve the original
- part that led to producing the corresponding token.
- Returns:
- A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
- """
- pass
- @property
- def overflowing(self):
- """
- A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
- When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
- the output into as many pieces as required to match the specified maximum length.
- This field lets you retrieve all the subsequent pieces.
- When you use pairs of sequences, the overflowing pieces will contain enough
- variations to cover all the possible combinations, while respecting the provided
- maximum length.
- """
- pass
- def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
- """
- Pad the :class:`~tokenizers.Encoding` at the given length
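- Example:
- A padding sketch (the target length and pad values are illustrative)::
- encoding.pad(8, direction="right", pad_id=0, pad_token="[PAD]")
- # len(encoding.ids) is now 8, assuming it was shorter before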
- Args:
- length (:obj:`int`):
- The desired length
- direction (:obj:`str`, defaults to :obj:`right`):
- The expected padding direction. Can be either :obj:`right` or :obj:`left`
- pad_id (:obj:`int`, defaults to :obj:`0`):
- The ID corresponding to the padding token
- pad_type_id (:obj:`int`, defaults to :obj:`0`):
- The type ID corresponding to the padding token
- pad_token (:obj:`str`, defaults to `[PAD]`):
- The pad token to use
- """
- pass
- @property
- def sequence_ids(self):
- """
- The generated sequence indices.
- They represent the index of the input sequence associated with each token.
- The sequence id can be None if the token is not related to any input sequence,
- like for example with special tokens.
- Returns:
- A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
- """
- pass
- def set_sequence_id(self, sequence_id):
- """
- Set the given sequence index
- Set the given sequence index for the whole range of tokens contained in this
- :class:`~tokenizers.Encoding`.
- """
- pass
- @property
- def special_tokens_mask(self):
- """
- The special token mask
- This indicates which tokens are special tokens, and which are not.
- Returns:
- :obj:`List[int]`: The special tokens mask
- """
- pass
- def token_to_chars(self, token_index):
- """
- Get the offsets of the token at the given index.
- The returned offsets are related to the input sequence that contains the
- token. In order to determine to which input sequence it belongs, you
- must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
- Args:
- token_index (:obj:`int`):
- The index of a token in the encoded sequence.
- Returns:
- :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
- """
- pass
- def token_to_sequence(self, token_index):
- """
- Get the index of the sequence represented by the given token.
- In the general use case, this method returns :obj:`0` for a single sequence or
- the first sequence of a pair, and :obj:`1` for the second sequence of a pair
- Args:
- token_index (:obj:`int`):
- The index of a token in the encoded sequence.
- Returns:
- :obj:`int`: The sequence id of the given token
- """
- pass
- def token_to_word(self, token_index):
- """
- Get the index of the word that contains the token in one of the input sequences.
- The returned word index is related to the input sequence that contains
- the token. In order to determine to which input sequence it belongs, you
- must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
- Args:
- token_index (:obj:`int`):
- The index of a token in the encoded sequence.
- Returns:
- :obj:`int`: The index of the word in the relevant input sequence.
- """
- pass
- @property
- def tokens(self):
- """
- The generated tokens
- They are the string representation of the IDs.
- Returns:
- :obj:`List[str]`: The list of tokens
- """
- pass
- def truncate(self, max_length, stride=0, direction="right"):
- """
- Truncate the :class:`~tokenizers.Encoding` at the given length
- If this :class:`~tokenizers.Encoding` represents multiple sequences, when truncating
- this information is lost. It will be considered as representing a single sequence.
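- Example:
- A truncation sketch (the lengths are illustrative)::
- encoding.truncate(max_length=128, stride=16)
- # the removed content should then be available through encoding.overflowing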
- Args:
- max_length (:obj:`int`):
- The desired length
- stride (:obj:`int`, defaults to :obj:`0`):
- The length of previous content to be included in each overflowing piece
- direction (:obj:`str`, defaults to :obj:`right`):
- Truncate direction
- """
- pass
- @property
- def type_ids(self):
- """
- The generated type IDs
- Generally used for tasks like sequence classification or question answering,
- these tokens let the LM know which input sequence corresponds to each token.
- Returns:
- :obj:`List[int]`: The list of type ids
- """
- pass
- @property
- def word_ids(self):
- """
- The generated word indices.
- They represent the index of the word associated with each token.
- When the input is pre-tokenized, they correspond to the ID of the given input label,
- otherwise they correspond to the word indices as defined by the
- :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
- For special tokens and such (any token that was generated from something that was
- not part of the input), the output is :obj:`None`
- Returns:
- A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
- """
- pass
- def word_to_chars(self, word_index, sequence_index=0):
- """
- Get the offsets of the word at the given index in one of the input sequences.
- Args:
- word_index (:obj:`int`):
- The index of a word in one of the input sequences.
- sequence_index (:obj:`int`, defaults to :obj:`0`):
- The index of the sequence that contains the target word
- Returns:
- :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
- """
- pass
- def word_to_tokens(self, word_index, sequence_index=0):
- """
- Get the encoded tokens corresponding to the word at the given index
- in one of the input sequences.
- Args:
- word_index (:obj:`int`):
- The index of a word in one of the input sequences.
- sequence_index (:obj:`int`, defaults to :obj:`0`):
- The index of the sequence that contains the target word
- Returns:
- :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
- """
- pass
- @property
- def words(self):
- """
- The generated word indices.
- .. warning::
- This is deprecated and will be removed in a future version.
- Please use :obj:`~tokenizers.Encoding.word_ids` instead.
- They represent the index of the word associated with each token.
- When the input is pre-tokenized, they correspond to the ID of the given input label,
- otherwise they correspond to the word indices as defined by the
- :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
- For special tokens and such (any token that was generated from something that was
- not part of the input), the output is :obj:`None`
- Returns:
- A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
- """
- pass
- class NormalizedString:
- """
- NormalizedString
- A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
- While making all the requested modifications, it keeps track of the alignment information
- between the two versions of the string.
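- Example:
- A normalization sketch::
- from tokenizers import NormalizedString
- normalized = NormalizedString("  Héllo  ")
- normalized.nfd()
- normalized.lowercase()
- normalized.strip()
- normalized.normalized  # "héllo", in NFD form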
- Args:
- sequence: str:
- The string sequence used to initialize this NormalizedString
- """
- def append(self, s):
- """
- Append the given sequence to the string
- """
- pass
- def clear(self):
- """
- Clears the string
- """
- pass
- def filter(self, func):
- """
- Filter each character of the string using the given func
- """
- pass
- def for_each(self, func):
- """
- Calls the given function for each character of the string
- """
- pass
- def lowercase(self):
- """
- Lowercase the string
- """
- pass
- def lstrip(self):
- """
- Strip the left of the string
- """
- pass
- def map(self, func):
- """
- Calls the given function for each character of the string
- Replaces each character of the string using the returned value. Each
- returned value **must** be a str of length 1 (i.e. a character).
- """
- pass
- def nfc(self):
- """
- Runs the NFC normalization
- """
- pass
- def nfd(self):
- """
- Runs the NFD normalization
- """
- pass
- def nfkc(self):
- """
- Runs the NFKC normalization
- """
- pass
- def nfkd(self):
- """
- Runs the NFKD normalization
- """
- pass
- @property
- def normalized(self):
- """
- The normalized part of the string
- """
- pass
- def prepend(self, s):
- """
- Prepend the given sequence to the string
- """
- pass
- def replace(self, pattern, content):
- """
- Replace the content of the given pattern with the provided content
- Args:
- pattern: Pattern:
- A pattern used to match the string. Usually a string or a Regex
- content: str:
- The content to be used as replacement
- """
- pass
- def rstrip(self):
- """
- Strip the right of the string
- """
- pass
- def slice(self, range):
- """
- Slice the string using the given range
- """
- pass
- def split(self, pattern, behavior):
- """
- Split the NormalizedString using the given pattern and the specified behavior
- Args:
- pattern: Pattern:
- A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
- behavior: SplitDelimiterBehavior:
- The behavior to use when splitting.
- Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
- "contiguous"
- Returns:
- A list of NormalizedString, representing each split
- """
- pass
- def strip(self):
- """
- Strip both ends of the string
- """
- pass
- def uppercase(self):
- """
- Uppercase the string
- """
- pass
- class PreTokenizedString:
- """
- PreTokenizedString
- Wrapper over a string, that provides a way to normalize, pre-tokenize, and tokenize the
- underlying string, while keeping track of the alignment information (offsets).
- The PreTokenizedString manages what we call `splits`. Each split represents a substring
- which is a subpart of the original string, with the relevant offsets and tokens.
- When calling one of the methods used to modify the PreTokenizedString (namely one of
- `split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
- tokens will get modified.
- Args:
- sequence: str:
- The string sequence used to initialize this PreTokenizedString
- """
- def __init__(self, sequence):
- pass
- def get_splits(self, offset_referential="original", offset_type="char"):
- """
- Get the splits currently managed by the PreTokenizedString
- Args:
- offset_referential: :obj:`str`
- Whether the returned splits should have offsets expressed relative
- to the original string, or the normalized one. choices: "original", "normalized".
- offset_type: :obj:`str`
- Whether the returned splits should have offsets expressed in bytes or chars.
- When slicing a str, we usually want to use chars, which is the default value.
- In some cases it might be interesting to get these offsets expressed in bytes,
- so it is possible to change this here.
- choices: "char", "bytes"
- Returns:
- A list of splits
- """
- pass
- def normalize(self, func):
- """
- Normalize each split of the `PreTokenizedString` using the given `func`
- Args:
- func: Callable[[NormalizedString], None]:
- The function used to normalize each underlying split. This function
- does not need to return anything; simply calling the methods on the provided
- NormalizedString modifies it in place.
- """
- pass
- def split(self, func):
- """
- Split the PreTokenizedString using the given `func`
- Args:
- func: Callable[[index, NormalizedString], List[NormalizedString]]:
- The function used to split each underlying split.
- It is expected to return a list of `NormalizedString`, that represent the new
- splits. If the given `NormalizedString` does not need any splitting, we can
- just return it directly.
- In order for the offsets to be tracked accurately, any returned `NormalizedString`
- should come from calling either `.split` or `.slice` on the received one.
- """
- pass
- def to_encoding(self, type_id=0, word_idx=None):
- """
- Return an Encoding generated from this PreTokenizedString
- Args:
- type_id: int = 0:
- The type_id to be used on the generated Encoding.
- word_idx: Optional[int] = None:
- An optional word index to be used for each token of this Encoding. If provided,
- all the word indices in the generated Encoding will use this value, instead
- of the one automatically tracked during pre-tokenization.
- Returns:
- An Encoding
- """
- pass
- def tokenize(self, func):
- """
- Tokenize each split of the `PreTokenizedString` using the given `func`
- Args:
- func: Callable[[str], List[Token]]:
- The function used to tokenize each underlying split. This function must return
- a list of Token generated from the input str.
- """
- pass
- class Regex:
- """
- Instantiate a new Regex with the given pattern
- """
- def __init__(self, pattern):
- pass
- class Token:
- pass
- class Tokenizer:
- """
- A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
- and outputs an :class:`~tokenizers.Encoding`.
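- Example:
- A minimal pipeline sketch (the model choice is illustrative)::
- from tokenizers import Tokenizer
- from tokenizers.models import BPE
- tokenizer = Tokenizer(BPE(unk_token="[UNK]"))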
- Args:
- model (:class:`~tokenizers.models.Model`):
- The core algorithm that this :obj:`Tokenizer` should be using.
- """
- def __init__(self, model):
- pass
- def add_special_tokens(self, tokens):
- """
- Add the given special tokens to the Tokenizer.
- If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
- them. If they don't exist, the Tokenizer creates them, giving them a new id.
- These special tokens will never be processed by the model (i.e. won't be split into
- multiple tokens), and they can be removed from the output when decoding.
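- Example:
- A sketch mixing plain strings and :class:`~tokenizers.AddedToken`::
- from tokenizers import AddedToken
- tokenizer.add_special_tokens(["[CLS]", "[SEP]", AddedToken("[MASK]", lstrip=True)])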
- Args:
- tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
- The list of special tokens we want to add to the vocabulary. Each token can either
- be a string or an instance of :class:`~tokenizers.AddedToken` for more
- customization.
- Returns:
- :obj:`int`: The number of tokens that were created in the vocabulary
- """
- pass
- def add_tokens(self, tokens):
- """
- Add the given tokens to the vocabulary
- The given tokens are added only if they don't already exist in the vocabulary.
- Each added token is then assigned a new id.
- Args:
- tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
- The list of tokens we want to add to the vocabulary. Each token can be either a
- string or an instance of :class:`~tokenizers.AddedToken` for more customization.
- Returns:
- :obj:`int`: The number of tokens that were created in the vocabulary
- """
- pass
- def decode(self, ids, skip_special_tokens=True):
- """
- Decode the given list of ids back to a string
- This is used to decode anything coming back from a Language Model
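- Example:
- A round-trip sketch (``tokenizer`` is assumed to be an existing
- :class:`~tokenizers.Tokenizer`)::
- output = tokenizer.encode("Hello world")
- tokenizer.decode(output.ids)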
- Args:
- ids (A :obj:`List/Tuple` of :obj:`int`):
- The list of ids that we want to decode
- skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether the special tokens should be removed from the decoded string
- Returns:
- :obj:`str`: The decoded string
- """
- pass
- def decode_batch(self, sequences, skip_special_tokens=True):
- """
- Decode a batch of ids back to their corresponding string
- Args:
- sequences (:obj:`List` of :obj:`List[int]`):
- The batch of sequences we want to decode
- skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether the special tokens should be removed from the decoded strings
- Returns:
- :obj:`List[str]`: A list of decoded strings
- """
- pass
- @property
- def decoder(self):
- """
- The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
- """
- pass
- def enable_padding(
- self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
- ):
- """
- Enable the padding
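- Example:
- A batch-padding sketch (``pad_id`` must match the id of ``pad_token``
- in your vocabulary; the values here are illustrative)::
- tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
- encodings = tokenizer.encode_batch(["short", "a slightly longer sequence"])
- # both encodings now share the length of the longest one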
- Args:
- direction (:obj:`str`, `optional`, defaults to :obj:`right`):
- The direction in which to pad. Can be either ``right`` or ``left``
- pad_to_multiple_of (:obj:`int`, `optional`):
- If specified, the padding length should always snap to the next multiple of the
- given value. For example, if we were going to pad with a length of 250 but
- ``pad_to_multiple_of=8``, then we will pad to 256.
- pad_id (:obj:`int`, defaults to 0):
- The id to be used when padding
- pad_type_id (:obj:`int`, defaults to 0):
- The type id to be used when padding
- pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
- The pad token to be used when padding
- length (:obj:`int`, `optional`):
- If specified, the length at which to pad. If not specified we pad using the size of
- the longest sequence in a batch.
- """
- pass
- def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
- """
- Enable truncation
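- Example:
- A truncation sketch (the lengths are illustrative)::
- tokenizer.enable_truncation(max_length=512, stride=32)
- # overflowing pieces then appear on each Encoding's overflowing attribute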
- Args:
- max_length (:obj:`int`):
- The max length at which to truncate
- stride (:obj:`int`, `optional`):
- The length of the previous content to be included in each overflowing
- sequence
- strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
- The strategy used for truncation. Can be one of ``longest_first``, ``only_first`` or
- ``only_second``.
- direction (:obj:`str`, defaults to :obj:`right`):
- Truncate direction
- """
- pass
- def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
- """
- Encode the given sequence and pair. This method can process raw text sequences
- as well as already pre-tokenized sequences.
- Example:
- Here are some examples of the inputs that are accepted::
- encode("A single sequence")`
- encode("A sequence", "And its pair")`
- encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)`
- encode(
- [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
- is_pretokenized=True
- )
- Args:
- sequence (:obj:`~tokenizers.InputSequence`):
- The main input sequence we want to encode. This sequence can be either raw
- text or pre-tokenized, according to the ``is_pretokenized`` argument:
- - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
- - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
- pair (:obj:`~tokenizers.InputSequence`, `optional`):
- An optional input sequence. The expected format is the same as for ``sequence``.
- is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
- Whether the input is already pre-tokenized
- add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to add the special tokens
- Returns:
- :class:`~tokenizers.Encoding`: The encoded result
- """
- pass
- def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
- """
- Encode the given batch of inputs. This method accepts both raw text sequences
- and already pre-tokenized sequences. The reason we use `PySequence` is
- that it allows zero-cost type checking (according to PyO3), as we don't
- have to convert to check.
- Example:
- Here are some examples of the inputs that are accepted::
- encode_batch([
- "A single sequence",
- ("A tuple with a sequence", "And its pair"),
- [ "A", "pre", "tokenized", "sequence" ],
- ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
- ])
- Args:
- input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
- A list of single sequences or pair sequences to encode. Each sequence
- can be either raw text or pre-tokenized, according to the ``is_pretokenized``
- argument:
- - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
- is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
- Whether the input is already pre-tokenized
- add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to add the special tokens
- Returns:
- A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
- """
- pass
- def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
- """
- Encode the given batch of inputs. This method is faster than `encode_batch`
- because it doesn't keep track of offsets: they will all be zeros.
- Example:
- Here are some examples of the inputs that are accepted::
- encode_batch_fast([
- "A single sequence",
- ("A tuple with a sequence", "And its pair"),
- [ "A", "pre", "tokenized", "sequence" ],
- ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
- ])
- Args:
- input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
- A list of single sequences or pair sequences to encode. Each sequence
- can be either raw text or pre-tokenized, according to the ``is_pretokenized``
- argument:
- - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
- is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
- Whether the input is already pre-tokenized
- add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to add the special tokens
- Returns:
- A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
- """
- pass
- @property
- def encode_special_tokens(self):
- """
- Modifies whether the tokenizer should use the special tokens
- during encoding.
- Args:
- value (:obj:`bool`):
- Whether to use the special tokens or not
- """
- pass
- @staticmethod
- def from_buffer(buffer):
- """
- Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
- Args:
- buffer (:obj:`bytes`):
- A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
- Returns:
- :class:`~tokenizers.Tokenizer`: The new tokenizer
- """
- pass
- @staticmethod
- def from_file(path):
- """
- Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
- Args:
- path (:obj:`str`):
- A path to a local JSON file representing a previously serialized
- :class:`~tokenizers.Tokenizer`
- Returns:
- :class:`~tokenizers.Tokenizer`: The new tokenizer
- """
- pass
- @staticmethod
- def from_pretrained(identifier, revision="main", token=None):
- """
- Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
- Hugging Face Hub.
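- Example:
- A loading sketch (the identifier is illustrative; any repository containing
- a tokenizer.json file works)::
- tokenizer = Tokenizer.from_pretrained("bert-base-uncased")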
- Args:
- identifier (:obj:`str`):
- The identifier of a Model on the Hugging Face Hub, that contains
- a tokenizer.json file
- revision (:obj:`str`, defaults to `main`):
- A branch or commit id
- token (:obj:`str`, `optional`, defaults to `None`):
- An optional auth token used to access private repositories on the
- Hugging Face Hub
- Returns:
- :class:`~tokenizers.Tokenizer`: The new tokenizer
- """
- pass
- @staticmethod
- def from_str(json):
- """
- Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
- Args:
- json (:obj:`str`):
- A valid JSON string representing a previously serialized
- :class:`~tokenizers.Tokenizer`
- Returns:
- :class:`~tokenizers.Tokenizer`: The new tokenizer
- """
- pass
- def get_added_tokens_decoder(self):
- """
- Get the added tokens, as a mapping from id to :class:`~tokenizers.AddedToken`
- Returns:
- :obj:`Dict[int, AddedToken]`: The added tokens, keyed by id
- """
- pass
- def get_vocab(self, with_added_tokens=True):
- """
- Get the underlying vocabulary
- Args:
- with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to include the added tokens
- Returns:
- :obj:`Dict[str, int]`: The vocabulary
- """
- pass
- def get_vocab_size(self, with_added_tokens=True):
- """
- Get the size of the underlying vocabulary
- Args:
- with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to include the added tokens
- Returns:
- :obj:`int`: The size of the vocabulary
- """
- pass
- def id_to_token(self, id):
- """
- Convert the given id to its corresponding token if it exists
- Args:
- id (:obj:`int`):
- The id to convert
- Returns:
- :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
- """
- pass
- @property
- def model(self):
- """
- The :class:`~tokenizers.models.Model` in use by the Tokenizer
- """
- pass
- def no_padding(self):
- """
- Disable padding
- """
- pass
- def no_truncation(self):
- """
- Disable truncation
- """
- pass
- @property
- def normalizer(self):
- """
- The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
- """
- pass
- def num_special_tokens_to_add(self, is_pair):
- """
- Return the number of special tokens that would be added for single/pair sentences.
- Args:
- is_pair (:obj:`bool`):
- Whether the input would be a pair of sequences or a single sentence
- Returns:
- :obj:`int`: The number of special tokens that would be added
- """
- pass
- @property
- def padding(self):
- """
- Get the current padding parameters
- `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
- Returns:
- (:obj:`dict`, `optional`):
- A dict with the current padding parameters if padding is enabled
- """
- pass
- def post_process(self, encoding, pair=None, add_special_tokens=True):
- """
- Apply all the post-processing steps to the given encodings.
- The various steps are:
- 1. Truncate according to the set truncation params (provided with
- :meth:`~tokenizers.Tokenizer.enable_truncation`)
- 2. Apply the :class:`~tokenizers.processors.PostProcessor`
- 3. Pad according to the set padding params (provided with
- :meth:`~tokenizers.Tokenizer.enable_padding`)
- Args:
- encoding (:class:`~tokenizers.Encoding`):
- The :class:`~tokenizers.Encoding` corresponding to the main sequence.
- pair (:class:`~tokenizers.Encoding`, `optional`):
- An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
- add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to add the special tokens
- Returns:
- :class:`~tokenizers.Encoding`: The final post-processed encoding
- """
- pass
- @property
- def post_processor(self):
- """
- The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
- """
- pass
- @property
- def pre_tokenizer(self):
- """
- The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
- """
- pass
- def save(self, path, pretty=True):
- """
- Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
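- Example:
- A save/load round-trip sketch (the path is illustrative)::
- tokenizer.save("tokenizer.json")
- tokenizer = Tokenizer.from_file("tokenizer.json")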
- Args:
- path (:obj:`str`):
- A path to a file in which to save the serialized tokenizer.
- pretty (:obj:`bool`, defaults to :obj:`True`):
- Whether the JSON file should be pretty formatted.
- """
- pass
- def to_str(self, pretty=False):
- """
- Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
- Args:
- pretty (:obj:`bool`, defaults to :obj:`False`):
- Whether the JSON string should be pretty formatted.
- Returns:
- :obj:`str`: A string representing the serialized Tokenizer
- """
- pass
- def token_to_id(self, token):
- """
- Convert the given token to its corresponding id if it exists
- Args:
- token (:obj:`str`):
- The token to convert
- Returns:
- :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
- """
- pass
- def train(self, files, trainer=None):
- """
- Train the Tokenizer using the given files.
- Reads the files line by line, while keeping all the whitespace, even new lines.
- If you want to train from data stored in memory, you can check
- :meth:`~tokenizers.Tokenizer.train_from_iterator`
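- Example:
- A training sketch (the file paths and trainer choice are illustrative)::
- from tokenizers.trainers import BpeTrainer
- trainer = BpeTrainer(special_tokens=["[UNK]", "[PAD]"])
- tokenizer.train(["corpus-a.txt", "corpus-b.txt"], trainer=trainer)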
- Args:
- files (:obj:`List[str]`):
- A list of paths to the files that we should use for training
- trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
- An optional trainer that should be used to train our Model
- """
- pass
- def train_from_iterator(self, iterator, trainer=None, length=None):
- """
- Train the Tokenizer using the provided iterator.
- You can provide anything that is a Python Iterator
- * A list of sequences :obj:`List[str]`
- * A generator that yields :obj:`str` or :obj:`List[str]`
- * A Numpy array of strings
- * ...
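- Example:
- An in-memory training sketch (the data is illustrative; ``trainer`` is
- assumed to be an existing :class:`~tokenizers.trainers.Trainer`)::
- data = ["first sequence", "second sequence", "third sequence"]
- tokenizer.train_from_iterator(data, trainer=trainer, length=len(data))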
- Args:
- iterator (:obj:`Iterator`):
- Any iterator over strings or list of strings
- trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
- An optional trainer that should be used to train our Model
- length (:obj:`int`, `optional`):
- The total number of sequences in the iterator. This is used to
- provide meaningful progress tracking
- """
- pass
- @property
- def truncation(self):
- """
- Get the currently set truncation parameters
- `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
- Returns:
- (:obj:`dict`, `optional`):
- A dict with the current truncation parameters if truncation is enabled
- """
- pass
|