# tokenization_openai_fast.py
  1. # coding=utf-8
  2. # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Fast Tokenization classes for OpenAI GPT."""
  16. from typing import Optional, Tuple
  17. from ...tokenization_utils_fast import PreTrainedTokenizerFast
  18. from ...utils import logging
  19. from .tokenization_openai import OpenAIGPTTokenizer
  20. logger = logging.get_logger(__name__)
  21. VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
  22. class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
  23. """
  24. Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
  25. the following peculiarities:
  26. - lower case all inputs
  27. - uses BERT's BasicTokenizer for pre-BPE tokenization
  28. This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
  29. refer to this superclass for more information regarding those methods.
  30. Args:
  31. vocab_file (`str`):
  32. Path to the vocabulary file.
  33. merges_file (`str`):
  34. Path to the merges file.
  35. unk_token (`str`, *optional*, defaults to `"<unk>"`):
  36. The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
  37. token instead.
  38. """
  39. vocab_files_names = VOCAB_FILES_NAMES
  40. model_input_names = ["input_ids", "attention_mask"]
  41. slow_tokenizer_class = OpenAIGPTTokenizer
  42. def __init__(self, vocab_file=None, merges_file=None, tokenizer_file=None, unk_token="<unk>", **kwargs):
  43. super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs)
  44. @property
  45. def do_lower_case(self):
  46. return True
  47. def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
  48. files = self._tokenizer.model.save(save_directory, name=filename_prefix)
  49. return tuple(files)