char_level_bpe.py

from typing import Dict, Iterator, List, Optional, Tuple, Union

from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from ..models import BPE
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str

from .base_tokenizer import BaseTokenizer

class CharBPETokenizer(BaseTokenizer):
    """Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)

    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
    Sennrich subword-nmt implementation by the following options, which you can deactivate:
        - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
            * removing any control characters and replacing all whitespace with the classic space.
            * handling Chinese characters by putting spaces around them.
            * stripping all accents.
        - splitting on punctuation in addition to whitespace (deactivate it with
          `split_on_whitespace_only=True`)
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        suffix: str = "</w>",
        dropout: Optional[float] = None,
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
        bert_normalizer: bool = True,
        split_on_whitespace_only: bool = False,
    ):
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    unk_token=str(unk_token),
                    end_of_word_suffix=suffix,
                )
            )
        else:
            tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if bert_normalizer:
            normalizers += [BertNormalizer(lowercase=False)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        if split_on_whitespace_only:
            tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
            "bert_normalizer": bert_normalizer,
            "split_on_whitespace_only": split_on_whitespace_only,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return CharBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
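

# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the upstream module): shows
# how CharBPETokenizer is typically driven end to end when imported from the
# installed `tokenizers` package. The corpus path "corpus.txt", the vocab size,
# and the output filename below are assumptions made for this example only.
#
#     from tokenizers import CharBPETokenizer
#
#     tokenizer = CharBPETokenizer(lowercase=True)
#     tokenizer.train(["corpus.txt"], vocab_size=5000, min_frequency=2)
#
#     encoding = tokenizer.encode("Hello, world!")
#     print(encoding.tokens)  # word-final tokens carry the "</w>" suffix
#
#     tokenizer.save("char_bpe.json")  # serialize model plus pipeline settings
#
# To rebuild a tokenizer from separately stored vocab/merges files, use
# CharBPETokenizer.from_file("vocab.json", "merges.txt") instead.
# ---------------------------------------------------------------------------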