- # coding=utf-8
- # Copyright 2020 The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
- tokenization_utils_fast.py
- """
- import bisect
- import itertools
- import re
- import unicodedata
- from collections import OrderedDict
- from typing import Any, Dict, List, Optional, Tuple, Union, overload
- from .tokenization_utils_base import (
- ENCODE_KWARGS_DOCSTRING,
- ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
- INIT_TOKENIZER_DOCSTRING,
- AddedToken,
- BatchEncoding,
- EncodedInput,
- EncodedInputPair,
- PreTokenizedInput,
- PreTokenizedInputPair,
- PreTrainedTokenizerBase,
- TextInput,
- TextInputPair,
- TruncationStrategy,
- )
- from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging
- logger = logging.get_logger(__name__)
- # Slow tokenizers are saved in a vocabulary plus three separate files
- SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
- ADDED_TOKENS_FILE = "added_tokens.json"
- TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
- class Trie:
- """
- Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass.
- Loose reference: https://en.wikipedia.org/wiki/Trie
- """
- def __init__(self, *args):
- self.data = {}
- self._tokens = set()
- self._termination_char = ""
- self.update(*args)
- def update(self, *args):
- """
- Updates the Trie with new tokens provided as arguments.
- Args:
- *args: Variable number of words to be added to the Trie.
- """
- for token in tuple(*args):
- self.add(token)
- def add(self, word: str):
- """
- Passes over every char (utf-8 char) in word and recursively adds it to the internal `data` trie representation.
- The special key `""`, stored in `self._termination_char`, is used to represent termination.
- This function is idempotent: adding the same word twice leaves the trie unchanged.
- Example:
- ```python
- >>> trie = Trie()
- >>> trie.add("Hello 友達")
- >>> trie.data
- {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}
- >>> trie.add("Hello")
- >>> trie.data
- {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
- ```
- """
- if not word:
- # Prevent empty string
- return
- self._tokens.add(word)
- ref = self.data
- for char in word:
- ref[char] = ref.setdefault(char, {})
- ref = ref[char]
- ref[self._termination_char] = 1
- def split(self, text: str) -> List[str]:
- """
- Will look for the words added to the trie within `text`. Output is the original string split along the
- boundaries of the words found.
- This trie will match the longest possible word first!
- Example:
- ```python
- >>> trie = Trie()
- >>> trie.split("[CLS] This is a extra_id_100")
- ["[CLS] This is a extra_id_100"]
- >>> trie.add("[CLS]")
- >>> trie.add("extra_id_1")
- >>> trie.add("extra_id_100")
- >>> trie.split("[CLS] This is a extra_id_100")
- ["[CLS]", " This is a ", "extra_id_100"]
- ```
- """
- # indexes are counted left of the chars index.
- # "hello", index 0, is left of h, index 1 is between h and e.
- # index 5 is right of the "o".
- # States are going to capture every possible start (indexes as above)
- # as keys, and have as values, a pointer to the position in the trie
- # where we're at. This is a partial match for now.
- # This lets us keep track of multiple matches while we're iterating
- # the string.
- # If the trie contains, "blowing", and "lower" and we encounter the
- # string "blower", we need to split into ["b", "lower"].
- # This is where we need to keep track of multiple possible starts.
- states = OrderedDict()
- # This will contain every indices where we need
- # to cut.
- # We force to cut at offset 0 and len(text) (added later)
- offsets = [0]
- # This is used by the lookahead which needs to skip over
- # some text where the full match exceeded the place in the initial
- # for loop
- skip = 0
- # Main loop, giving this algorithm O(n) complexity
- for current, current_char in enumerate(text):
- if skip and current < skip:
- # Prevents the lookahead for matching twice
- # like extra_id_100 and id_100
- continue
- # This will track every state
- # that stops matching; we need to stop tracking them.
- # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then
- # fail on "b", so we need to remove 0 from the valid states.
- to_remove = set()
- # Whenever we find a match, we need to drop everything:
- # this is a greedy algorithm, it will match on the first token found.
- reset = False
- # In this case, we already have partial matches (But unfinished)
- for start, trie_pointer in states.items():
- if "" in trie_pointer:
- # This is a final match, we need to reset and
- # store the results in `offsets`.
- # Lookahead to match longest first
- # Important in case of extra_id_1 vs extra_id_100
- # Here we are also actively looking for other earlier partial
- # matches.
- # With "[CLS]" and "L" both in the trie, we need to match "[CLS]" even if "L" also matches.
- for lookstart, looktrie_pointer in states.items():
- if lookstart > start:
- # This partial match is later, we can stop looking
- break
- elif lookstart < start:
- # This partial match is earlier, the trie pointer
- # was already updated, so index is + 1
- lookahead_index = current + 1
- end = current + 1
- else:
- # Here lookstart == start and
- # looktrie_pointer == trie_pointer
- # It wasn't updated yet so indices are current ones
- lookahead_index = current
- end = current
- next_char = text[lookahead_index] if lookahead_index < len(text) else None
- if "" in looktrie_pointer:
- start = lookstart
- end = lookahead_index
- skip = lookahead_index
- while next_char in looktrie_pointer:
- looktrie_pointer = looktrie_pointer[next_char]
- lookahead_index += 1
- if "" in looktrie_pointer:
- start = lookstart
- end = lookahead_index
- skip = lookahead_index
- if lookahead_index == len(text):
- # End of string
- break
- next_char = text[lookahead_index]
- # End lookahead
- # Storing and resetting
- offsets.append(start)
- offsets.append(end)
- reset = True
- break
- elif current_char in trie_pointer:
- # The current character being looked at has a match within the trie
- # update the pointer (it will be stored back into states later).
- trie_pointer = trie_pointer[current_char]
- # Storing back the new pointer into the states.
- # Partial matches got longer by one.
- states[start] = trie_pointer
- else:
- # The new character has no match in the trie, we need
- # to stop keeping track of this partial match.
- # We can't do it directly within the loop because of how
- # python iteration works
- to_remove.add(start)
- # Either clearing the full start (we found a real match)
- # Or clearing only the partial matches that didn't work.
- if reset:
- states = {}
- else:
- for start in to_remove:
- del states[start]
- # If this character is a starting character within the trie
- # start keeping track of this partial match.
- if current >= skip and current_char in self.data:
- states[current] = self.data[current_char]
- # We have a cut at the end with states.
- for start, trie_pointer in states.items():
- if "" in trie_pointer:
- # This is a final match, we need to reset and
- # store the results in `offsets`.
- end = len(text)
- offsets.append(start)
- offsets.append(end)
- # The longest cut is always the one with the lowest start, i.e. the first
- # item, so we can break.
- break
- return self.cut_text(text, offsets)
- def cut_text(self, text, offsets):
- # We have all the offsets now, we just need to do the actual splitting.
- # The cut at 0 was already forced; appending len(text) ensures the last
- # part of the string is included.
- offsets.append(len(text))
- tokens = []
- start = 0
- for end in offsets:
- if start > end:
- logger.error(
- "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
- " anyway."
- )
- continue
- elif start == end:
- # This might happen if there's a match at index 0
- # we're also preventing zero-width cuts in case of two
- # consecutive matches
- continue
- tokens.append(text[start:end])
- start = end
- return tokens
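- # For illustration: with a trie containing only "[CLS]", `split("[CLS] hello")`
- # collects the offsets [0, 0, 5]; `cut_text` then appends len(text) and slices,
- # skipping the zero-width cut:
- # >>> trie = Trie()
- # >>> trie.add("[CLS]")
- # >>> trie.split("[CLS] hello")
- # ['[CLS]', ' hello']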
- class ExtensionsTrie(Trie):
- def __init__(self, *args):
- super().__init__(*args)
- def extensions(self, prefix: str):
- """
- Generates all extensions of a given prefix token in the Trie.
- Example:
- ```python
- >>> trie = ExtensionsTrie()
- >>> trie.add("apple")
- >>> trie.add("app")
- >>> trie.add("application")
- >>> trie.extensions("app")
- ['app', 'apple', 'application']
- ```
- """
- prefix_node = self._get_node(prefix)
- ret = self._collect_tokens(prefix_node)
- return [prefix + token for token in ret]
- def _get_node(self, token: str) -> dict:
- """
- Retrieves the node corresponding to the given token in the Trie.
- Args:
- token (str): The token for which the corresponding node needs to be retrieved.
- Returns:
- dict: The node in the Trie corresponding to the given token.
- """
- node = self.data
- for char in token:
- if char not in node:
- break
- node = node[char]
- return node
- def _collect_tokens(self, node: dict) -> list:
- """
- Generates all tokens in the Trie starting from a given node.
- Args:
- node (dict): The node in the Trie from which tokens need to be generated.
- Returns:
- list: List of tokens generated from the given node.
- """
- tokens = [self._termination_char] if self._termination_char in node else []
- for token, subtrie_head in node.items():
- if token != self._termination_char:
- subtokens = self._collect_tokens(subtrie_head)
- tokens.extend([token + subtoken for subtoken in subtokens])
- return tokens
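- # For illustration: `extensions("app")` in the docstring example above walks the
- # trie to the "app" node via `_get_node`, `_collect_tokens` then returns the
- # suffixes ["", "le", "lication"], and the prefix is prepended again to give
- # ["app", "apple", "application"].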
- def _is_whitespace(char):
- """Checks whether `char` is a whitespace character."""
- # \t, \n, and \r are technically control characters but we treat them
- # as whitespace since they are generally considered as such.
- if char == " " or char == "\t" or char == "\n" or char == "\r":
- return True
- cat = unicodedata.category(char)
- if cat == "Zs":
- return True
- return False
- def _is_control(char):
- """Checks whether `char` is a control character."""
- # These are technically control characters but we count them as whitespace
- # characters.
- if char == "\t" or char == "\n" or char == "\r":
- return False
- cat = unicodedata.category(char)
- if cat.startswith("C"):
- return True
- return False
- def _is_punctuation(char):
- """Checks whether `char` is a punctuation character."""
- cp = ord(char)
- # We treat all non-letter/number ASCII as punctuation.
- # Characters such as "^", "$", and "`" are not in the Unicode
- # Punctuation class but we treat them as punctuation anyways, for
- # consistency.
- if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
- return True
- cat = unicodedata.category(char)
- if cat.startswith("P"):
- return True
- return False
- def _is_end_of_word(text):
- """Checks whether the last character in text is one of a punctuation, control or whitespace character."""
- last_char = text[-1]
- return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))
- def _is_start_of_word(text):
- """Checks whether the first character in text is one of a punctuation, control or whitespace character."""
- first_char = text[0]
- return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))
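- # For illustration, assuming standard `unicodedata` categories:
- # >>> _is_whitespace("\u00a0")  # no-break space, category "Zs"
- # True
- # >>> _is_control("\t"), _is_control("\x00")  # tab counts as whitespace, NUL is "Cc"
- # (False, True)
- # >>> _is_punctuation("$"), _is_punctuation("A")  # non-letter ASCII vs. letter
- # (True, False)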
- def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str):
- """
- Inserts one token into an ordered list if it is not already present. Note: token_list must be sorted.
- """
- insertion_idx = bisect.bisect_left(token_list, new_token)
- # Checks if new_token is already in the ordered token_list
- if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:
- # new_token is in token_list, don't add
- return
- else:
- token_list.insert(insertion_idx, new_token)
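- # For illustration (hypothetical token values): the list stays sorted and
- # duplicates are skipped.
- # >>> toks = ["[CLS]", "[SEP]"]
- # >>> _insert_one_token_to_ordered_list(toks, "[MASK]")
- # >>> _insert_one_token_to_ordered_list(toks, "[MASK]")
- # >>> toks
- # ['[CLS]', '[MASK]', '[SEP]']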
- @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
- class PreTrainedTokenizer(PreTrainedTokenizerBase):
- """
- Base class for all slow tokenizers.
- Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].
- Handles all the shared methods for tokenization and special tokens, as well as methods for
- downloading/caching/loading pretrained tokenizers and adding tokens to the vocabulary.
- This class also contains the added tokens in a unified way on top of all tokenizers, so we don't have to handle the
- specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
- """
- def __init__(self, **kwargs):
- # 1. Init the tokens trie used to split on added tokens
- self.tokens_trie = Trie()
- # 2. init `_added_tokens_decoder` if child class did not
- if not hasattr(self, "_added_tokens_decoder"):
- self._added_tokens_decoder: Dict[int, AddedToken] = {}
- # 3. If an `added_tokens_decoder` is passed, we are loading from a saved tokenizer, so we overwrite
- self._added_tokens_decoder.update(kwargs.pop("added_tokens_decoder", {}))
- self._added_tokens_encoder: Dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()}
- # 4. Init the parent class
- super().__init__(**kwargs)
- # 5. If some of the special tokens are not part of the vocab, we add them at the end.
- # The order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES, following `tokenizers`
- self._add_tokens(
- [token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder],
- special_tokens=True,
- )
- self._decode_use_source_tokenizer = False
- @property
- def is_fast(self) -> bool:
- return False
- @property
- def vocab_size(self) -> int:
- """
- `int`: Size of the base vocabulary (without the added tokens).
- """
- raise NotImplementedError
- @property
- def added_tokens_encoder(self) -> Dict[str, int]:
- """
- Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
- optimisation in `self._added_tokens_encoder` for the slow tokenizers.
- """
- return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])}
- @property
- def added_tokens_decoder(self) -> Dict[int, AddedToken]:
- """
- Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.
- Returns:
- `Dict[int, AddedToken]`: The added tokens.
- """
- return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0]))
- @added_tokens_decoder.setter
- def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict[int, AddedToken]:
- # Always raise an error if string because users should define the behavior
- for index, token in value.items():
- if not isinstance(token, (str, AddedToken)) or not isinstance(index, int):
- raise TypeError(
- f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__}, should be a dict of {int, Union[AddedToken, str]}"
- )
- self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token
- self._added_tokens_encoder[str(token)] = index
- self._update_total_vocab_size()
- def get_added_vocab(self) -> Dict[str, int]:
- """
- Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from
- the fast call because for now we always add the tokens even if they are already in the vocabulary. This is
- something we should change.
- Returns:
- `Dict[str, int]`: The added tokens.
- """
- return self._added_tokens_encoder
- def __len__(self):
- """
- Size of the full vocabulary with the added tokens.
- """
- return self.total_vocab_size
- def _update_total_vocab_size(self):
- """
- Update the size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because
- otherwise if there is a hole in the vocab, we will add tokens at a wrong index. This operation is slow and is
- only performed when adding tokens.
- """
- self.total_vocab_size = len(self.get_vocab())
- def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
- """
- Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
- it with indices starting from the length of the current vocabulary. Special tokens are sometimes already in the
- vocab, which is why they have to be handled specifically.
- Args:
- new_tokens (`List[str]`or `List[tokenizers.AddedToken]`):
- Token(s) to add to the vocabulary. A token is counted as added if it's not already in the vocabulary
- (tested by checking if the tokenizer assigns the index of the `unk_token` to it). If a token is part
- of the vocabulary then we simply mark this token as an `AddedToken`, which allows controlling the
- stripping and normalization of this token. This is NOT possible in `tokenizers`.
- special_tokens (`bool`, *optional*, defaults to `False`):
- Whether or not the tokens should be added as special tokens.
- Returns:
- `int`: The number of tokens actually added to the vocabulary.
- Examples:
- ```python
- # Let's see how to increase the vocabulary of Bert model and tokenizer
- tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
- model = BertModel.from_pretrained("google-bert/bert-base-uncased")
- num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
- print("We have added", num_added_toks, "tokens")
- # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
- model.resize_token_embeddings(len(tokenizer))
- ```"""
- added_tokens = 0
- if new_tokens is None:
- return added_tokens
- # TODO: this is fairly slow and should be improved!
- current_vocab = self.get_vocab().copy()
- new_idx = len(current_vocab) # only call this once, len gives the last index + 1
- for token in new_tokens:
- if not isinstance(token, (str, AddedToken)):
- raise TypeError(f"Token {token} is not a string but a {type(token)}.")
- if str(token) == "":
- continue
- if isinstance(token, str):
- if token in self._added_tokens_encoder:
- continue
- else:
- # very important for fast and slow equivalence!
- is_special = token in self.all_special_tokens or special_tokens
- token = AddedToken(
- token, rstrip=False, lstrip=False, normalized=not is_special, special=is_special
- )
- elif special_tokens:
- # doing token.special=True changes the normalization! will fix in rust
- # this is important and the only reason why the AddedTokens in each class are normalized by default
- token.__setstate__({"special": True, "normalized": token.normalized})
- if token in self._added_tokens_decoder:
- continue
- if not token.special and token.normalized and getattr(self, "do_lower_case", False):
- # Normalize if requested
- token.content = token.content.lower()
- if token.content not in current_vocab:
- token_index = new_idx + added_tokens
- current_vocab[token.content] = token_index
- added_tokens += 1
- else:
- token_index = current_vocab[token.content]
- if token.special and str(token) not in self.all_special_tokens:
- self._additional_special_tokens.append(token)
- # the setter automatically updates the reverse map
- self._added_tokens_decoder[token_index] = token
- self._added_tokens_encoder[token.content] = token_index
- if self.verbose:
- logger.info(f"Adding {token} to the vocabulary")
- self._update_trie()
- self._update_total_vocab_size()
- return added_tokens
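- # For illustration (hypothetical tokens on a concrete subclass instance): strings
- # passed to `add_tokens` are wrapped into `AddedToken` objects by the loop above,
- # so stripping/normalization can also be controlled explicitly, e.g.
- # tokenizer.add_tokens([AddedToken("<ent>", lstrip=True, rstrip=False)])
- # tokenizer.add_tokens(["<extra_0>"], special_tokens=True)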
- def _update_trie(self, unique_no_split_tokens: Optional[List[str]] = None):
- for token in self._added_tokens_decoder.values():
- if token not in self.tokens_trie._tokens:
- self.tokens_trie.add(token.content)
- for token in unique_no_split_tokens or []:
- if token not in self.tokens_trie._tokens:
- self.tokens_trie.add(token)
- def num_special_tokens_to_add(self, pair: bool = False) -> int:
- """
- Returns the number of added tokens when encoding a sequence with special tokens.
- <Tip>
- This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
- this inside your training loop.
- </Tip>
- Args:
- pair (`bool`, *optional*, defaults to `False`):
- Whether the number of added tokens should be computed in the case of a sequence pair or a single
- sequence.
- Returns:
- `int`: Number of special tokens added to sequences.
- """
- token_ids_0 = []
- token_ids_1 = []
- return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
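- # For illustration: a BERT-like subclass whose `build_inputs_with_special_tokens`
- # adds [CLS] ... [SEP] (and an extra [SEP] for pairs) would return 2 for a single
- # sequence and 3 for a pair; the exact values depend on the subclass.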
- def tokenize(self, text: TextInput, **kwargs) -> List[str]:
- """
- Converts a string into a sequence of tokens, using the tokenizer.
- Splits into words for word-based vocabularies or into sub-words for sub-word-based vocabularies
- (BPE/SentencePiece/WordPiece). Takes care of added tokens.
- Args:
- text (`str`):
- The sequence to be encoded.
- **kwargs (additional keyword arguments):
- Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
- Returns:
- `List[str]`: The list of tokens.
- """
- split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
- text, kwargs = self.prepare_for_tokenization(text, **kwargs)
- if kwargs:
- logger.warning(f"Keyword arguments {kwargs} not recognized.")
- if hasattr(self, "do_lower_case") and self.do_lower_case:
- # convert non-special tokens to lowercase. Might be super slow as well?
- escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
- escaped_special_toks += [
- re.escape(s_tok.content)
- for s_tok in (self._added_tokens_decoder.values())
- if not s_tok.special and s_tok.normalized
- ]
- pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
- text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
- if split_special_tokens:
- no_split_token = []
- tokens = [text]
- else:
- no_split_token = self._added_tokens_encoder.keys() # don't split on any of the added tokens
- # "This is something<special_token_1> else"
- tokens = self.tokens_trie.split(text)
- # ["This is something", "<special_token_1>", " else"]
- for i, token in enumerate(tokens):
- if token in no_split_token:
- tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None)
- left = tokens[i - 1] if i > 0 else None
- right = tokens[i + 1] if i < len(tokens) - 1 else None
- if isinstance(tok_extended, AddedToken):
- if tok_extended.rstrip and right:
- # A bit counter-intuitive but we strip the left of the string
- # since tok_extended.rstrip means the special token is eating all white spaces on its right
- tokens[i + 1] = right.lstrip()
- # Strip white spaces on the left
- if tok_extended.lstrip and left:
- tokens[i - 1] = left.rstrip() # Opposite here
- if tok_extended.single_word and left and left[-1] != " ":
- tokens[i - 1] += token
- tokens[i] = ""
- elif tok_extended.single_word and right and right[0] != " ":
- tokens[i + 1] = token + tokens[i + 1]
- tokens[i] = ""
- else:
- raise ValueError(
- f"{tok_extended} cannot be tokenized because it was not properly added"
- f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
- )
- # ["This is something", "<special_token_1>", "else"]
- tokenized_text = []
- for token in tokens:
- # Need to skip any empty (fully stripped) tokens
- if not token:
- continue
- if token in no_split_token:
- tokenized_text.append(token)
- else:
- tokenized_text.extend(self._tokenize(token))
- # ["This", " is", " something", "<special_token_1>", "else"]
- return tokenized_text
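- # For illustration, with a hypothetical added token "<special_token_1>" that has
- # rstrip=True, the steps above roughly give:
- #   "This is something<special_token_1> else"
- #   -> ["This is something", "<special_token_1>", " else"]   (trie split)
- #   -> ["This is something", "<special_token_1>", "else"]    (rstrip eats the right space)
- #   -> ["This", " is", " something", "<special_token_1>", "else"]  (subclass `_tokenize`)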
- def _tokenize(self, text, **kwargs):
- """
- Converts a string into a sequence of tokens (strings), using the tokenizer. Splits into words for word-based
- vocabularies or into sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece).
- Does NOT take care of added tokens.
- """
- raise NotImplementedError
- def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
- """
- Converts a token string (or a sequence of tokens) to a single integer id (or a sequence of ids), using the
- vocabulary.
- Args:
- tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
- Returns:
- `int` or `List[int]`: The token id or list of token ids.
- """
- if tokens is None:
- return None
- if isinstance(tokens, str):
- return self._convert_token_to_id_with_added_voc(tokens)
- ids = []
- for token in tokens:
- ids.append(self._convert_token_to_id_with_added_voc(token))
- return ids
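- # For illustration (hypothetical ids): added tokens are resolved through
- # `_added_tokens_encoder` first, anything else falls back to the subclass
- # `_convert_token_to_id`, e.g. convert_tokens_to_ids(["hello", "<extra_0>"])
- # might return something like [7592, 32000].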
- def _convert_token_to_id_with_added_voc(self, token):
- if token is None:
- return None
- if token in self._added_tokens_encoder:
- return self._added_tokens_encoder[token]
- return self._convert_token_to_id(token)
- def _convert_token_to_id(self, token):
- raise NotImplementedError
- def _encode_plus(
- self,
- text: Union[TextInput, PreTokenizedInput, EncodedInput],
- text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
- add_special_tokens: bool = True,
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
- truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
- max_length: Optional[int] = None,
- stride: int = 0,
- is_split_into_words: bool = False,
- pad_to_multiple_of: Optional[int] = None,
- padding_side: Optional[str] = None,
- return_tensors: Optional[Union[str, TensorType]] = None,
- return_token_type_ids: Optional[bool] = None,
- return_attention_mask: Optional[bool] = None,
- return_overflowing_tokens: bool = False,
- return_special_tokens_mask: bool = False,
- return_offsets_mapping: bool = False,
- return_length: bool = False,
- verbose: bool = True,
- **kwargs,
- ) -> BatchEncoding:
- def get_input_ids(text):
- if isinstance(text, str):
- tokens = self.tokenize(text, **kwargs)
- return self.convert_tokens_to_ids(tokens)
- elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
- if is_split_into_words:
- tokens = list(
- itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
- )
- return self.convert_tokens_to_ids(tokens)
- else:
- return self.convert_tokens_to_ids(text)
- elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
- return text
- else:
- if is_split_into_words:
- raise ValueError(
- f"Input {text} is not valid. Should be a string or a list/tuple of strings when"
- " `is_split_into_words=True`."
- )
- else:
- raise ValueError(
- f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of"
- " integers."
- )
- if return_offsets_mapping:
- raise NotImplementedError(
- "return_offset_mapping is not available when using Python tokenizers. "
- "To use this feature, change your tokenizer to one deriving from "
- "transformers.PreTrainedTokenizerFast. "
- "More information on available tokenizers at "
- "https://github.com/huggingface/transformers/pull/2674"
- )
- first_ids = get_input_ids(text)
- second_ids = get_input_ids(text_pair) if text_pair is not None else None
- return self.prepare_for_model(
- first_ids,
- pair_ids=second_ids,
- add_special_tokens=add_special_tokens,
- padding=padding_strategy.value,
- truncation=truncation_strategy.value,
- max_length=max_length,
- stride=stride,
- pad_to_multiple_of=pad_to_multiple_of,
- padding_side=padding_side,
- return_tensors=return_tensors,
- prepend_batch_axis=True,
- return_attention_mask=return_attention_mask,
- return_token_type_ids=return_token_type_ids,
- return_overflowing_tokens=return_overflowing_tokens,
- return_special_tokens_mask=return_special_tokens_mask,
- return_length=return_length,
- verbose=verbose,
- )
- def _batch_encode_plus(
- self,
- batch_text_or_text_pairs: Union[
- List[TextInput],
- List[TextInputPair],
- List[PreTokenizedInput],
- List[PreTokenizedInputPair],
- List[EncodedInput],
- List[EncodedInputPair],
- ],
- add_special_tokens: bool = True,
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
- truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
- max_length: Optional[int] = None,
- stride: int = 0,
- is_split_into_words: bool = False,
- pad_to_multiple_of: Optional[int] = None,
- padding_side: Optional[str] = None,
- return_tensors: Optional[Union[str, TensorType]] = None,
- return_token_type_ids: Optional[bool] = None,
- return_attention_mask: Optional[bool] = None,
- return_overflowing_tokens: bool = False,
- return_special_tokens_mask: bool = False,
- return_offsets_mapping: bool = False,
- return_length: bool = False,
- verbose: bool = True,
- split_special_tokens: bool = False,
- **kwargs,
- ) -> BatchEncoding:
- def get_input_ids(text):
- if isinstance(text, str):
- tokens = self.tokenize(text, **kwargs)
- return self.convert_tokens_to_ids(tokens)
- elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
- if is_split_into_words:
- tokens = list(
- itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
- )
- return self.convert_tokens_to_ids(tokens)
- else:
- return self.convert_tokens_to_ids(text)
- elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
- return text
- else:
- raise ValueError(
- "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
- )
- if return_offsets_mapping:
- raise NotImplementedError(
- "return_offset_mapping is not available when using Python tokenizers. "
- "To use this feature, change your tokenizer to one deriving from "
- "transformers.PreTrainedTokenizerFast."
- )
- input_ids = []
- for ids_or_pair_ids in batch_text_or_text_pairs:
- if not isinstance(ids_or_pair_ids, (list, tuple)):
- ids, pair_ids = ids_or_pair_ids, None
- elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
- ids, pair_ids = ids_or_pair_ids, None
- else:
- ids, pair_ids = ids_or_pair_ids
- first_ids = get_input_ids(ids)
- second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
- input_ids.append((first_ids, second_ids))
- batch_outputs = self._batch_prepare_for_model(
- input_ids,
- add_special_tokens=add_special_tokens,
- padding_strategy=padding_strategy,
- truncation_strategy=truncation_strategy,
- max_length=max_length,
- stride=stride,
- pad_to_multiple_of=pad_to_multiple_of,
- padding_side=padding_side,
- return_attention_mask=return_attention_mask,
- return_token_type_ids=return_token_type_ids,
- return_overflowing_tokens=return_overflowing_tokens,
- return_special_tokens_mask=return_special_tokens_mask,
- return_length=return_length,
- return_tensors=return_tensors,
- verbose=verbose,
- split_special_tokens=split_special_tokens,
- )
- return BatchEncoding(batch_outputs)
- @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
- def _batch_prepare_for_model(
- self,
- batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
- add_special_tokens: bool = True,
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
- truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
- max_length: Optional[int] = None,
- stride: int = 0,
- pad_to_multiple_of: Optional[int] = None,
- padding_side: Optional[str] = None,
- return_tensors: Optional[str] = None,
- return_token_type_ids: Optional[bool] = None,
- return_attention_mask: Optional[bool] = None,
- return_overflowing_tokens: bool = False,
- return_special_tokens_mask: bool = False,
- return_length: bool = False,
- verbose: bool = True,
- split_special_tokens: bool = False,
- ) -> BatchEncoding:
- """
- Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It
- adds special tokens, truncates sequences if overflowing while taking into account the special tokens, and
- manages a moving window (with user-defined stride) for overflowing tokens.
- Args:
- batch_ids_pairs: list of tokenized input ids or input ids pairs
- """
- batch_outputs = {}
- for first_ids, second_ids in batch_ids_pairs:
- outputs = self.prepare_for_model(
- first_ids,
- second_ids,
- add_special_tokens=add_special_tokens,
- padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward
- truncation=truncation_strategy.value,
- max_length=max_length,
- stride=stride,
- pad_to_multiple_of=None, # we pad in batch afterward
- padding_side=None, # we pad in batch afterward
- return_attention_mask=False, # we pad in batch afterward
- return_token_type_ids=return_token_type_ids,
- return_overflowing_tokens=return_overflowing_tokens,
- return_special_tokens_mask=return_special_tokens_mask,
- return_length=return_length,
- return_tensors=None, # We convert the whole batch to tensors at the end
- prepend_batch_axis=False,
- verbose=verbose,
- split_special_tokens=split_special_tokens,
- )
- for key, value in outputs.items():
- if key not in batch_outputs:
- batch_outputs[key] = []
- batch_outputs[key].append(value)
- batch_outputs = self.pad(
- batch_outputs,
- padding=padding_strategy.value,
- max_length=max_length,
- pad_to_multiple_of=pad_to_multiple_of,
- padding_side=padding_side,
- return_attention_mask=return_attention_mask,
- )
- batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
- return batch_outputs
- def prepare_for_tokenization(
- self, text: str, is_split_into_words: bool = False, **kwargs
- ) -> Tuple[str, Dict[str, Any]]:
- """
- Performs any necessary transformations before tokenization.
- This method should pop the arguments it uses from `kwargs` and return the remaining `kwargs` as well. We test the
- `kwargs` at the end of the encoding process to be sure all the arguments have been used.
- Args:
- text (`str`):
- The text to prepare.
- is_split_into_words (`bool`, *optional*, defaults to `False`):
- Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
- tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
- which it will tokenize. This is useful for NER or token classification.
- kwargs (`Dict[str, Any]`, *optional*):
- Keyword arguments to use for the tokenization.
- Returns:
- `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
- """
- return (text, kwargs)
- def get_special_tokens_mask(
- self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
- ) -> List[int]:
- """
- Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
- Args:
- token_ids_0 (`List[int]`):
- List of ids of the first sequence.
- token_ids_1 (`List[int]`, *optional*):
- List of ids of the second sequence.
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
- Whether or not the token list is already formatted with special tokens for the model.
- Returns:
- A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
- """
- if already_has_special_tokens:
- if token_ids_1 is not None:
- raise ValueError(
- "You should not supply a second sequence if the provided sequence of "
- "ids is already formatted with special tokens for the model."
- )
- return super().get_special_tokens_mask(
- token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
- )
- return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
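- # For illustration: with this default implementation and
- # already_has_special_tokens=False, the mask is all zeros, e.g.
- # get_special_tokens_mask([5, 6], [7]) -> [0, 0, 0]; subclasses that insert
- # special tokens in `build_inputs_with_special_tokens` override this accordingly.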
- @overload
- def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
- @overload
- def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]: ...
- def convert_ids_to_tokens(
- self, ids: Union[int, List[int]], skip_special_tokens: bool = False
- ) -> Union[str, List[str]]:
- """
- Converts a single index or a sequence of indices to a token or a sequence of tokens, using the vocabulary and
- added tokens.
- Args:
- ids (`int` or `List[int]`):
- The token id (or token ids) to convert to tokens.
- skip_special_tokens (`bool`, *optional*, defaults to `False`):
- Whether or not to remove special tokens in the decoding.
- Returns:
- `str` or `List[str]`: The decoded token(s).
- """
- if isinstance(ids, int):
- if ids in self._added_tokens_decoder:
- return self._added_tokens_decoder[ids].content
- else:
- return self._convert_id_to_token(ids)
- tokens = []
- for index in ids:
- index = int(index)
- if skip_special_tokens and index in self.all_special_ids:
- continue
- if index in self._added_tokens_decoder:
- tokens.append(self._added_tokens_decoder[index].content)
- else:
- tokens.append(self._convert_id_to_token(index))
- return tokens
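- # For illustration (hypothetical vocabulary): ids present in
- # `_added_tokens_decoder` map back to the stored `AddedToken.content`, other ids
- # go through the subclass `_convert_id_to_token`, and with
- # skip_special_tokens=True any id in `all_special_ids` is dropped from the output.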
- def _convert_id_to_token(self, index: int) -> str:
- raise NotImplementedError
- def convert_tokens_to_string(self, tokens: List[str]) -> str:
- return " ".join(tokens)
- def _decode(
- self,
- token_ids: Union[int, List[int]],
- skip_special_tokens: bool = False,
- clean_up_tokenization_spaces: Optional[bool] = None,
- spaces_between_special_tokens: bool = True,
- **kwargs,
- ) -> str:
- self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
- filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
- # If a single id was given, prevent splitting the string in the upcoming loop
- if isinstance(filtered_tokens, str):
- filtered_tokens = [filtered_tokens]
- legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
- token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
- }
- # To avoid mixing byte-level and unicode for byte-level BPE
- # we need to build string separately for added tokens and byte-level tokens
- # cf. https://github.com/huggingface/transformers/issues/1133
- sub_texts = []
- current_sub_text = []
- # TODO @ArthurZ in version 5, special tokens should be handled in convert_tokens_to_string, while _convert_tokens_to_string
- for token in filtered_tokens:
- if skip_special_tokens and token in self.all_special_tokens:
- continue
- if token in legacy_added_tokens:
- if current_sub_text:
- string = self.convert_tokens_to_string(current_sub_text)
- if len(string) > 0:
- sub_texts.append(string)
- current_sub_text = []
- sub_texts.append(token)
- else:
- current_sub_text.append(token)
- if current_sub_text:
- sub_texts.append(self.convert_tokens_to_string(current_sub_text))
- if spaces_between_special_tokens:
- text = " ".join(sub_texts)
- else:
- text = "".join(sub_texts)
- clean_up_tokenization_spaces = (
- clean_up_tokenization_spaces
- if clean_up_tokenization_spaces is not None
- else self.clean_up_tokenization_spaces
- )
- if clean_up_tokenization_spaces:
- clean_text = self.clean_up_tokenization(text)
- return clean_text
- else:
- return text
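- # For illustration (hypothetical tokens, using the default
- # `convert_tokens_to_string` above): decoding ids for ["hello", "world", "<extra_0>"]
- # where "<extra_0>" is an added (non-special) token gives
- # "hello world <extra_0>" with spaces_between_special_tokens=True and
- # "hello world<extra_0>" with spaces_between_special_tokens=False.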