from enum import Enum
from typing import List, Tuple, Union


Offsets = Tuple[int, int]

TextInputSequence = str
"""A :obj:`str` that represents an input sequence"""

PreTokenizedInputSequence = Union[List[str], Tuple[str, ...]]
"""A pre-tokenized input sequence. Can be one of:

    - A :obj:`List` of :obj:`str`
    - A :obj:`Tuple` of :obj:`str`
"""

TextEncodeInput = Union[
    TextInputSequence,
    Tuple[TextInputSequence, TextInputSequence],
    List[TextInputSequence],
]
"""Represents a textual input for encoding. Can be either:

    - A single sequence: :data:`~tokenizers.TextInputSequence`
    - A pair of sequences:

      - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
      - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2

See the illustrative sketch below for the three accepted shapes.
"""

PreTokenizedEncodeInput = Union[
    PreTokenizedInputSequence,
    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
    List[PreTokenizedInputSequence],
]
"""Represents a pre-tokenized input for encoding. Can be either:

    - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
    - A pair of sequences:

      - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
      - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2

See the illustrative sketch below for the matching shapes.
"""

InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
"""Represents all the possible types of input sequences for encoding. Can be:

    - When ``is_pretokenized=False``: :data:`~tokenizers.TextInputSequence`
    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedInputSequence`
"""

EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
"""Represents all the possible types of input for encoding. Can be:

    - When ``is_pretokenized=False``: :data:`~tokenizers.TextEncodeInput`
    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedEncodeInput`
"""

class OffsetReferential(Enum):
    ORIGINAL = "original"
    NORMALIZED = "normalized"


class OffsetType(Enum):
    BYTE = "byte"
    CHAR = "char"

class SplitDelimiterBehavior(Enum):
    REMOVED = "removed"
    ISOLATED = "isolated"
    MERGED_WITH_PREVIOUS = "merged_with_previous"
    MERGED_WITH_NEXT = "merged_with_next"
    CONTIGUOUS = "contiguous"
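
# Illustrative sketch only, not part of the tokenizers API: what each behavior
# produces when splitting "the-final--countdown" on "-" (the example used by
# the upstream Rust documentation). ``pre_tokenizers`` is imported further down
# and resolved at call time; I assume here that ``Split`` accepts the enum's
# string values for ``behavior``.
def _example_split_delimiter_behavior() -> None:
    # REMOVED:              ["the", "final", "countdown"]
    # ISOLATED:             ["the", "-", "final", "-", "-", "countdown"]
    # MERGED_WITH_PREVIOUS: ["the-", "final-", "-", "countdown"]
    # MERGED_WITH_NEXT:     ["the", "-final", "-", "-countdown"]
    # CONTIGUOUS:           ["the", "-", "final", "--", "countdown"]
    splitter = pre_tokenizers.Split(pattern="-", behavior="isolated")
    pieces = splitter.pre_tokenize_str("the-final--countdown")
    assert [piece for piece, _offsets in pieces] == [
        "the", "-", "final", "-", "-", "countdown"
    ]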

from .tokenizers import (
    AddedToken,
    Encoding,
    NormalizedString,
    PreTokenizedString,
    Regex,
    Token,
    Tokenizer,
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    __version__,
)
from .implementations import (
    BertWordPieceTokenizer,
    ByteLevelBPETokenizer,
    CharBPETokenizer,
    SentencePieceBPETokenizer,
    SentencePieceUnigramTokenizer,
)
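
# Illustrative sketch only, not part of the tokenizers API: training one of the
# bundled implementations from scratch and encoding with it. The corpus path
# "data.txt" and the hyperparameter values are hypothetical.
def _example_train_byte_level_bpe() -> None:
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=["data.txt"], vocab_size=30_000, min_frequency=2)
    encoding = tokenizer.encode("Hello, world!")
    print(encoding.tokens, encoding.ids)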