# tokenizers/__init__.py
  1. from enum import Enum
  2. from typing import List, Tuple, Union
  3. Offsets = Tuple[int, int]
  4. TextInputSequence = str
  5. """A :obj:`str` that represents an input sequence """
  6. PreTokenizedInputSequence = Union[List[str], Tuple[str]]
  7. """A pre-tokenized input sequence. Can be one of:
  8. - A :obj:`List` of :obj:`str`
  9. - A :obj:`Tuple` of :obj:`str`
  10. """
  11. TextEncodeInput = Union[
  12. TextInputSequence,
  13. Tuple[TextInputSequence, TextInputSequence],
  14. List[TextInputSequence],
  15. ]
  16. """Represents a textual input for encoding. Can be either:
  17. - A single sequence: :data:`~tokenizers.TextInputSequence`
  18. - A pair of sequences:
  19. - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
  20. - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
  21. """
  22. PreTokenizedEncodeInput = Union[
  23. PreTokenizedInputSequence,
  24. Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
  25. List[PreTokenizedInputSequence],
  26. ]
  27. """Represents a pre-tokenized input for encoding. Can be either:
  28. - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
  29. - A pair of sequences:
  30. - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
  31. - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
  32. """
  33. InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
  34. """Represents all the possible types of input sequences for encoding. Can be:
  35. - When ``is_pretokenized=False``: :data:`~TextInputSequence`
  36. - When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
  37. """
  38. EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
  39. """Represents all the possible types of input for encoding. Can be:
  40. - When ``is_pretokenized=False``: :data:`~TextEncodeInput`
  41. - When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
  42. """
  43. class OffsetReferential(Enum):
  44. ORIGINAL = "original"
  45. NORMALIZED = "normalized"
  46. class OffsetType(Enum):
  47. BYTE = "byte"
  48. CHAR = "char"
  49. class SplitDelimiterBehavior(Enum):
  50. REMOVED = "removed"
  51. ISOLATED = "isolated"
  52. MERGED_WITH_PREVIOUS = "merged_with_previous"
  53. MERGED_WITH_NEXT = "merged_with_next"
  54. CONTIGUOUS = "contiguous"
  55. from .tokenizers import (
  56. AddedToken,
  57. Encoding,
  58. NormalizedString,
  59. PreTokenizedString,
  60. Regex,
  61. Token,
  62. Tokenizer,
  63. decoders,
  64. models,
  65. normalizers,
  66. pre_tokenizers,
  67. processors,
  68. trainers,
  69. __version__,
  70. )
  71. from .implementations import (
  72. BertWordPieceTokenizer,
  73. ByteLevelBPETokenizer,
  74. CharBPETokenizer,
  75. SentencePieceBPETokenizer,
  76. SentencePieceUnigramTokenizer,
  77. )