| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591 |
# Generated content DO NOT EDIT
class Model:
    """
    Base class for all models

    The model represents the actual tokenization algorithm. This is the part that
    will contain and manage the learned vocabulary.

    This class cannot be constructed directly. Please use one of the concrete models.
    """

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    def save(self, folder, prefix=None):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
class BPE(Model):
    """
    An implementation of the BPE (Byte-Pair Encoding) algorithm

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        merges (:obj:`List[Tuple[str, str]]`, `optional`):
            A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`

        cache_capacity (:obj:`int`, `optional`):
            The number of words that the BPE cache can contain. The cache allows
            to speed-up the process by keeping the result of the merge operations
            for a number of words.

        dropout (:obj:`float`, `optional`):
            A float between 0 and 1 that represents the BPE dropout to use.

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.

        continuing_subword_prefix (:obj:`str`, `optional`):
            The prefix to attach to subword units that don't represent a beginning of word.

        end_of_word_suffix (:obj:`str`, `optional`):
            The suffix to attach to subword units that represent an end of word.

        fuse_unk (:obj:`bool`, `optional`):
            Whether to fuse any subsequent unknown tokens into a single one

        byte_fallback (:obj:`bool`, `optional`):
            Whether to use spm byte-fallback trick (defaults to False)

        ignore_merges (:obj:`bool`, `optional`):
            Whether or not to match tokens with the vocab before using merges.
    """

    def __init__(
        self,
        vocab=None,
        merges=None,
        cache_capacity=None,
        dropout=None,
        unk_token=None,
        continuing_subword_prefix=None,
        end_of_word_suffix=None,
        fuse_unk=None,
        byte_fallback=False,
        ignore_merges=False,
    ):
        pass

    @staticmethod
    def from_file(vocab, merges, **kwargs):
        """
        Instantiate a BPE model from the given files.

        This method is roughly equivalent to doing::

           vocab, merges = BPE.read_file(vocab_filename, merges_filename)
           bpe = BPE(vocab, merges)

        If you don't need to keep the :obj:`vocab, merges` values lying around,
        this method is more optimized than manually calling
        :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

            merges (:obj:`str`):
                The path to a :obj:`merges.txt` file

        Returns:
            :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
        """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    @staticmethod
    def read_file(vocab, merges):
        """
        Read a :obj:`vocab.json` and a :obj:`merges.txt` files

        This method provides a way to read and parse the content of these files,
        returning the relevant data structures. If you want to instantiate some BPE models
        from memory, this method gives you the expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

            merges (:obj:`str`):
                The path to a :obj:`merges.txt` file

        Returns:
            A :obj:`Tuple` with the vocab and the merges:
                The vocabulary and merges loaded into memory
        """
        pass

    def save(self, folder, prefix=None):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
class Unigram(Model):
    """
    An implementation of the Unigram algorithm

    Args:
        vocab (:obj:`List[Tuple[str, float]]`, `optional`):
            A list of vocabulary items and their relative score [("am", -0.2442),...]

        unk_id (:obj:`int`, `optional`):
            The id of the unknown token in the vocabulary.

        byte_fallback (:obj:`bool`, `optional`):
            Whether to use spm byte-fallback trick (defaults to False)
    """

    def __init__(self, vocab=None, unk_id=None, byte_fallback=False):
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    def save(self, folder, prefix=None):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
class WordLevel(Model):
    """
    An implementation of the WordLevel algorithm

    Most simple tokenizer model based on mapping tokens to their corresponding id.

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.
    """

    def __init__(self, vocab=None, unk_token=None):
        pass

    @staticmethod
    def from_file(vocab, unk_token):
        """
        Instantiate a WordLevel model from the given file

        This method is roughly equivalent to doing::

            vocab = WordLevel.read_file(vocab_filename)
            wordlevel = WordLevel(vocab)

        If you don't need to keep the :obj:`vocab` values lying around, this method is
        more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
        initialize a :class:`~tokenizers.models.WordLevel`

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

        Returns:
            :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
        """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    @staticmethod
    def read_file(vocab):
        """
        Read a :obj:`vocab.json`

        This method provides a way to read and parse the content of a vocabulary file,
        returning the relevant data structures. If you want to instantiate some WordLevel models
        from memory, this method gives you the expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

        Returns:
            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
        """
        pass

    def save(self, folder, prefix=None):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
class WordPiece(Model):
    """
    An implementation of the WordPiece algorithm

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.

        max_input_chars_per_word (:obj:`int`, `optional`):
            The maximum number of characters to authorize in a single word.
    """

    def __init__(self, vocab=None, unk_token=None, max_input_chars_per_word=None):
        pass

    @staticmethod
    def from_file(vocab, **kwargs):
        """
        Instantiate a WordPiece model from the given file

        This method is roughly equivalent to doing::

            vocab = WordPiece.read_file(vocab_filename)
            wordpiece = WordPiece(vocab)

        If you don't need to keep the :obj:`vocab` values lying around, this method is
        more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
        initialize a :class:`~tokenizers.models.WordPiece`

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.txt` file

        Returns:
            :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
        """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    @staticmethod
    def read_file(vocab):
        """
        Read a :obj:`vocab.txt` file

        This method provides a way to read and parse the content of a standard `vocab.txt`
        file as used by the WordPiece Model, returning the relevant data structures. If you
        want to instantiate some WordPiece models from memory, this method gives you the
        expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.txt` file

        Returns:
            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
        """
        pass

    def save(self, folder, prefix=None):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
|