- # coding=utf-8
- # Copyright 2022 The Open AI Team Authors and The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Tokenization classes for OpenAI Jukebox."""
- import json
- import os
- import re
- import unicodedata
- from json.encoder import INFINITY
- from typing import Any, Dict, List, Optional, Tuple, Union
- import numpy as np
- import regex
- from ....tokenization_utils import AddedToken, PreTrainedTokenizer
- from ....tokenization_utils_base import BatchEncoding
- from ....utils import TensorType, is_flax_available, is_tf_available, is_torch_available, logging
- from ....utils.generic import _is_jax, _is_numpy
- logger = logging.get_logger(__name__)
- VOCAB_FILES_NAMES = {
- "artists_file": "artists.json",
- "lyrics_file": "lyrics.json",
- "genres_file": "genres.json",
- }
- class JukeboxTokenizer(PreTrainedTokenizer):
- """
- Constructs a Jukebox tokenizer. Jukebox can be conditioned on 3 different inputs:
- - Artists: a unique id is associated to each artist from the provided dictionary.
- - Genres: a unique id is associated to each genre from the provided dictionary.
- - Lyrics: character-based tokenization. The tokenizer must be initialized with the list of characters that
- are inside the vocabulary.
- This tokenizer does not require training. It should be able to process a different number of inputs, since the
- model can be conditioned on any combination of the three queries. If `None` is provided, default values are
- used. The number of genres the model is conditioned on is controlled by `n_genres`.
- ```python
- >>> from transformers import JukeboxTokenizer
- >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
- >>> tokenizer("Alan Jackson", "Country Rock", "old town road")["input_ids"]
- [tensor([[ 0, 0, 0, 6785, 546, 41, 38, 30, 76, 46, 41, 49,
- 40, 76, 44, 41, 27, 30]]), tensor([[ 0, 0, 0, 145, 0]]), tensor([[ 0, 0, 0, 145, 0]])]
- ```
- <Tip>
- If nothing is provided, the genres and the artist will either be selected randomly or set to None. However,
- the code does not currently support random selection and only supports composing from the provided genres.
- </Tip>
- This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
- refer to this superclass for more information regarding those methods.
- Args:
- artists_file (`str`):
- Path to the vocabulary file which contains a mapping between artists and ids. The default file supports
- both "v2" and "v3"
- genres_file (`str`):
- Path to the vocabulary file which contain a mapping between genres and ids.
- lyrics_file (`str`):
- Path to the vocabulary file which contains the accepted characters for the lyrics tokenization.
- version (`List[str]`, *optional*, defaults to `["v3", "v2", "v2"]`):
- List of the tokenizer versions. The `5b-lyrics` top-level prior model was trained using `v3` instead of
- `v2`.
- n_genres (`int`, *optional*, defaults to 5):
- Maximum number of genres to use for composition.
- max_n_lyric_tokens (`int`, `optional`, defaults to 512):
- Maximum number of lyric tokens to keep.
- unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
- The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
- token instead.
- """
- vocab_files_names = VOCAB_FILES_NAMES
- model_input_names = ["input_ids", "attention_mask"]
- def __init__(
- self,
- artists_file,
- genres_file,
- lyrics_file,
- version=["v3", "v2", "v2"],
- max_n_lyric_tokens=512,
- n_genres=5,
- unk_token="<|endoftext|>",
- **kwargs,
- ):
- unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
- # avoid a mutable default argument: fall back to ["v3", "v2", "v2"] when no version list is given
- version = ["v3", "v2", "v2"] if version is None else version
- self.version = version
- self.max_n_lyric_tokens = max_n_lyric_tokens
- self.n_genres = n_genres
- self._added_tokens_decoder = {0: unk_token}
- with open(artists_file, encoding="utf-8") as vocab_handle:
- self.artists_encoder = json.load(vocab_handle)
- with open(genres_file, encoding="utf-8") as vocab_handle:
- self.genres_encoder = json.load(vocab_handle)
- with open(lyrics_file, encoding="utf-8") as vocab_handle:
- self.lyrics_encoder = json.load(vocab_handle)
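- # Regex matching characters that are out of vocabulary for the lyrics; every match is stripped from the lyrics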
- oov = r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+"
- # In v2, the vocabulary had n_vocab=80 characters; in v3 the "+" character is missing, so n_vocab=79.
- if len(self.lyrics_encoder) == 79:
- oov = oov.replace(r"\-'", r"\-+'")
- self.out_of_vocab = regex.compile(oov)
- self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
- self.genres_decoder = {v: k for k, v in self.genres_encoder.items()}
- self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()}
- super().__init__(
- unk_token=unk_token,
- n_genres=n_genres,
- version=version,
- max_n_lyric_tokens=max_n_lyric_tokens,
- **kwargs,
- )
- @property
- def vocab_size(self):
- return len(self.artists_encoder) + len(self.genres_encoder) + len(self.lyrics_encoder)
- def get_vocab(self):
- return {
- "artists_encoder": self.artists_encoder,
- "genres_encoder": self.genres_encoder,
- "lyrics_encoder": self.lyrics_encoder,
- }
- def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
- """Converts the artist, genre and lyrics tokens to their index using the vocabulary.
- The total_length, offset and duration have to be provided in order to select relevant lyrics and add padding to
- the lyrics token sequence.
- """
- artists_id = [self.artists_encoder.get(artist, 0) for artist in list_artists]
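- # Pad each genre list with -1 so that every sample carries exactly `n_genres` genre ids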
- for genres_idx in range(len(list_genres)):
- list_genres[genres_idx] = [self.genres_encoder.get(genre, 0) for genre in list_genres[genres_idx]]
- list_genres[genres_idx] = list_genres[genres_idx] + [-1] * (self.n_genres - len(list_genres[genres_idx]))
- lyric_ids = [[self.lyrics_encoder.get(character, 0) for character in list_lyrics[0]], [], []]
- return artists_id, list_genres, lyric_ids
- def _tokenize(self, lyrics):
- """
- Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
- vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
- Do NOT take care of added tokens. Only the lyrics are split into characters for the character-based vocabulary.
- """
- # the lyrics are the only input that needs tokenizing; the character-based vocabulary makes this a simple split
- return list(lyrics)
- def tokenize(self, artist, genre, lyrics, **kwargs):
- """
- Converts three strings into three sequences of tokens using the tokenizer. `artist` and `genre` are expected
- to be lists with one entry per tokenizer version (see `__call__`).
- """
- artist, genre, lyrics = self.prepare_for_tokenization(artist, genre, lyrics)
- lyrics = self._tokenize(lyrics)
- return artist, genre, lyrics
- def prepare_for_tokenization(
- self, artists: str, genres: str, lyrics: str, is_split_into_words: bool = False
- ) -> Tuple[str, str, str]:
- """
- Performs any necessary transformations before tokenization.
- Args:
- artists (`str`):
- The artist name to prepare. This will mostly lowercase the string.
- genres (`str`):
- The genre name to prepare. This will mostly lowercase the string.
- lyrics (`str`):
- The lyrics to prepare.
- is_split_into_words (`bool`, *optional*, defaults to `False`):
- Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
- tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
- which it will tokenize. This is useful for NER or token classification.
- """
- for idx in range(len(self.version)):
- if self.version[idx] == "v3":
- artists[idx] = artists[idx].lower()
- genres[idx] = [genres[idx].lower()]
- else:
- artists[idx] = self._normalize(artists[idx]) + ".v2"
- genres[idx] = [
- self._normalize(genre) + ".v2" for genre in genres[idx].split("_")
- ] # split is for the full dictionary with combined genres
- if self.version[0] == "v2":
- self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
- vocab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'\"()[] \t\n"
- self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
- self.vocab["<unk>"] = 0
- self.n_vocab = len(vocab) + 1
- self.lyrics_encoder = self.vocab
- self.lyrics_decoder = {v: k for k, v in self.vocab.items()}
- self.lyrics_decoder[0] = ""
- else:
- self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")
- lyrics = self._run_strip_accents(lyrics)
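- # Replace backslashes with newlines (line breaks appear to be stored as backslashes in the raw lyrics)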
- lyrics = lyrics.replace("\\", "\n")
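- # Package the cleaned lyrics as (lyrics, [], []): only the first level will receive lyric tokens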
- lyrics = self.out_of_vocab.sub("", lyrics), [], []
- return artists, genres, lyrics
- def _run_strip_accents(self, text):
- """Strips accents from a piece of text."""
- text = unicodedata.normalize("NFD", text)
- output = []
- for char in text:
- cat = unicodedata.category(char)
- if cat == "Mn":
- continue
- output.append(char)
- return "".join(output)
- def _normalize(self, text: str) -> str:
- """
- Normalizes the input text. This process is used for the genres and the artist names.
- Args:
- text (`str`):
- Artist or Genre string to normalize
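- Example (illustrative; `tokenizer` is any `JukeboxTokenizer` instance):
- ```python
- >>> tokenizer._normalize("Alan Jackson!")
- 'alan_jackson'
- ```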
- """
- accepted = (
- [chr(i) for i in range(ord("a"), ord("z") + 1)]
- + [chr(i) for i in range(ord("A"), ord("Z") + 1)]
- + [chr(i) for i in range(ord("0"), ord("9") + 1)]
- + ["."]
- )
- accepted = frozenset(accepted)
- pattern = re.compile(r"_+")
- text = "".join([c if c in accepted else "_" for c in text.lower()])
- text = pattern.sub("_", text).strip("_")
- return text
- def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
- return " ".join(lyrics)
- def convert_to_tensors(
- self, inputs, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
- ):
- """
- Convert the inner content to tensors.
- Args:
- tensor_type (`str` or [`~utils.TensorType`], *optional*):
- The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`];
- `"np"` returns plain NumPy arrays.
- prepend_batch_axis (`bool`, *optional*, defaults to `False`):
- Whether or not to add the batch dimension during the conversion.
- """
- # Convert to TensorType
- if not isinstance(tensor_type, TensorType):
- tensor_type = TensorType(tensor_type)
- # Get a function reference for the correct framework
- if tensor_type == TensorType.TENSORFLOW:
- if not is_tf_available():
- raise ImportError(
- "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
- )
- import tensorflow as tf
- as_tensor = tf.constant
- is_tensor = tf.is_tensor
- elif tensor_type == TensorType.PYTORCH:
- if not is_torch_available():
- raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
- import torch
- as_tensor = torch.tensor
- is_tensor = torch.is_tensor
- elif tensor_type == TensorType.JAX:
- if not is_flax_available():
- raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
- import jax.numpy as jnp # noqa: F811
- as_tensor = jnp.array
- is_tensor = _is_jax
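- # Default backend: plain NumPy arrays (used for `"np"` and any type not handled above)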
- else:
- as_tensor = np.asarray
- is_tensor = _is_numpy
- # Do the tensor conversion in batch
- try:
- if prepend_batch_axis:
- inputs = [inputs]
- if not is_tensor(inputs):
- inputs = as_tensor(inputs)
- except Exception:
- raise ValueError(
- "Unable to create tensor, you should probably activate truncation and/or padding "
- "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
- )
- return inputs
- def __call__(self, artist, genres, lyrics="", return_tensors="pt") -> BatchEncoding:
- """Convert the raw string to a list of token ids
- Args:
- artist (`str`):
- Name of the artist.
- genres (`str`):
- Genres that will be mixed to condition the audio. Multiple genres can be combined in a single string
- using underscores (e.g. `"country_rock"`).
- lyrics (`str`, *optional*, defaults to `""`):
- Lyrics used to condition the generation.
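- Example (a minimal sketch; the exact ids depend on the loaded vocabularies):
- ```python
- >>> output = tokenizer("Alan Jackson", "Country Rock", lyrics="old town road")
- >>> len(output["input_ids"]) # one tensor per prior level
- 3
- ```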
- """
- input_ids = [0, 0, 0]
- artist = [artist] * len(self.version)
- genres = [genres] * len(self.version)
- artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
- artists_id, genres_ids, full_tokens = self._convert_token_to_id(artists_tokens, genres_tokens, lyrics_tokens)
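- # The attention mask only covers the lyric tokens (full_tokens[-1]) and is filled with -inf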
- attention_masks = [-INFINITY] * len(full_tokens[-1])
- input_ids = [
- self.convert_to_tensors(
- [input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i]], tensor_type=return_tensors
- )
- for i in range(len(self.version))
- ]
- return BatchEncoding({"input_ids": input_ids, "attention_masks": attention_masks})
- def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
- """
- Saves the tokenizer's vocabulary dictionary to the provided save_directory.
- Args:
- save_directory (`str`):
- A path to the directory where the vocabulary will be saved. The directory must already exist.
- filename_prefix (`Optional[str]`, *optional*):
- A prefix to add to the names of the files saved by the tokenizer.
- """
- if not os.path.isdir(save_directory):
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
- return
- artists_file = os.path.join(
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["artists_file"]
- )
- with open(artists_file, "w", encoding="utf-8") as f:
- f.write(json.dumps(self.artists_encoder, ensure_ascii=False))
- genres_file = os.path.join(
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["genres_file"]
- )
- with open(genres_file, "w", encoding="utf-8") as f:
- f.write(json.dumps(self.genres_encoder, ensure_ascii=False))
- lyrics_file = os.path.join(
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["lyrics_file"]
- )
- with open(lyrics_file, "w", encoding="utf-8") as f:
- f.write(json.dumps(self.lyrics_encoder, ensure_ascii=False))
- return (artists_file, genres_file, lyrics_file)
- def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
- """
- Converts an index (integer) into a token (str) using the vocab.
- Args:
- artists_index (`int`):
- Index of the artist in its corresponding dictionary.
- genres_index (`Union[List[int], int]`):
- Index of the genre in its corresponding dictionary.
- lyric_index (`List[int]`):
- List of character indices, which each correspond to a character.
- """
- artist = self.artists_decoder.get(artists_index)
- genres = [self.genres_decoder.get(genre) for genre in genres_index]
- lyrics = [self.lyrics_decoder.get(character) for character in lyric_index]
- return artist, genres, lyrics
|