# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
- """
- Benchmarking the library on inference and training in PyTorch.
- """
import random
import timeit
from functools import wraps
from typing import Callable, Optional, Tuple

from ..configuration_utils import PretrainedConfig
from ..models.auto.modeling_tf_auto import TF_MODEL_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING
from ..utils import is_py3nvml_available, is_tf_available, logging
from .benchmark_utils import (
    Benchmark,
    Memory,
    MemorySummary,
    measure_peak_memory_cpu,
    start_memory_tracing,
    stop_memory_tracing,
)
if is_tf_available():
    import tensorflow as tf
    from tensorflow.python.framework.errors_impl import ResourceExhaustedError

    from .benchmark_args_tf import TensorFlowBenchmarkArguments

if is_py3nvml_available():
    import py3nvml.py3nvml as nvml

logger = logging.get_logger(__name__)
def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
    """Decorator factory: run the wrapped benchmark closure either eagerly or as an (optionally XLA-compiled) `tf.function`."""

    def run_func(func):
        @wraps(func)
        def run_in_eager_mode(*args, **kwargs):
            return func(*args, **kwargs)

        @wraps(func)
        @tf.function(experimental_compile=use_xla)
        def run_in_graph_mode(*args, **kwargs):
            return func(*args, **kwargs)

        if do_eager_mode is True:
            if use_xla is not False:
                raise ValueError(
                    "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
                )
            return run_in_eager_mode
        else:
            return run_in_graph_mode

    return run_func
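# The decorator above is applied to the forward/backward closures built in
# `_prepare_inference_func` / `_prepare_train_func` below, e.g.:
#
#     @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
#     def encoder_forward():
#         return model(input_ids, training=False)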
def random_input_ids(batch_size: int, sequence_length: int, vocab_size: int) -> "tf.Tensor":
    """Build an int32 tensor of shape (batch_size, sequence_length) filled with random token ids in [0, vocab_size)."""
    rng = random.Random()
    values = [rng.randint(0, vocab_size - 1) for i in range(batch_size * sequence_length)]
    return tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
class TensorFlowBenchmark(Benchmark):
    args: TensorFlowBenchmarkArguments
    configs: PretrainedConfig
    framework: str = "TensorFlow"

    @property
    def framework_version(self):
        return tf.__version__
    def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
        # initialize GPU on separate process
        strategy = self.args.strategy
        if strategy is None:
            raise ValueError("A device strategy has to be initialized before using TensorFlow.")
        _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
        return self._measure_speed(_inference)

    def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
        strategy = self.args.strategy
        if strategy is None:
            raise ValueError("A device strategy has to be initialized before using TensorFlow.")
        _train = self._prepare_train_func(model_name, batch_size, sequence_length)
        return self._measure_speed(_train)
    def _inference_memory(
        self, model_name: str, batch_size: int, sequence_length: int
    ) -> Tuple[Memory, Optional[MemorySummary]]:
        # initialize GPU on separate process
        if self.args.is_gpu:
            tf.config.experimental.set_memory_growth(self.args.gpu_list[self.args.device_idx], True)
        strategy = self.args.strategy
        if strategy is None:
            raise ValueError("A device strategy has to be initialized before using TensorFlow.")
        _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
        return self._measure_memory(_inference)

    def _train_memory(
        self, model_name: str, batch_size: int, sequence_length: int
    ) -> Tuple[Memory, Optional[MemorySummary]]:
        if self.args.is_gpu:
            tf.config.experimental.set_memory_growth(self.args.gpu_list[self.args.device_idx], True)
        strategy = self.args.strategy
        if strategy is None:
            raise ValueError("A device strategy has to be initialized before using TensorFlow.")
        _train = self._prepare_train_func(model_name, batch_size, sequence_length)
        return self._measure_memory(_train)
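    # Note: the two `_prepare_*` helpers below resolve the concrete TF model class from
    # `config.architectures` (prefixing "TF") when available, and otherwise fall back to the
    # `TF_MODEL_MAPPING` / `TF_MODEL_WITH_LM_HEAD_MAPPING` auto-mapping for the config class.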
    def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
        config = self.config_dict[model_name]

        if self.args.fp16:
            raise NotImplementedError("Mixed precision is currently not supported.")

        has_model_class_in_config = (
            hasattr(config, "architectures")
            and isinstance(config.architectures, list)
            and len(config.architectures) > 0
        )
        if not self.args.only_pretrain_model and has_model_class_in_config:
            try:
                model_class = "TF" + config.architectures[0]  # prepend 'TF' for tensorflow model
                transformers_module = __import__("transformers", fromlist=[model_class])
                model_cls = getattr(transformers_module, model_class)
                model = model_cls(config)
            except ImportError:
                raise ImportError(
                    f"{model_class} does not exist. If you just want to test the pretrained model, you might want to"
                    " set `--only_pretrain_model` or `args.only_pretrain_model=True`."
                )
        else:
            model = TF_MODEL_MAPPING[config.__class__](config)

        # encoder-decoder has vocab size saved differently
        vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
        input_ids = random_input_ids(batch_size, sequence_length, vocab_size)

        @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
        def encoder_decoder_forward():
            return model(input_ids, decoder_input_ids=input_ids, training=False)

        @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
        def encoder_forward():
            return model(input_ids, training=False)

        _inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward

        return _inference
    def _prepare_train_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
        config = self.config_dict[model_name]

        if self.args.eager_mode is not False:
            raise ValueError("Training cannot be done in eager mode. Please make sure that `args.eager_mode = False`.")

        if self.args.fp16:
            raise NotImplementedError("Mixed precision is currently not supported.")

        has_model_class_in_config = (
            hasattr(config, "architectures")
            and isinstance(config.architectures, list)
            and len(config.architectures) > 0
        )
        if not self.args.only_pretrain_model and has_model_class_in_config:
            try:
                model_class = "TF" + config.architectures[0]  # prepend 'TF' for tensorflow model
                transformers_module = __import__("transformers", fromlist=[model_class])
                model_cls = getattr(transformers_module, model_class)
                model = model_cls(config)
            except ImportError:
                raise ImportError(
                    f"{model_class} does not exist. If you just want to test the pretrained model, you might want to"
                    " set `--only_pretrain_model` or `args.only_pretrain_model=True`."
                )
        else:
            model = TF_MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)

        # encoder-decoder has vocab size saved differently
        vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
        input_ids = random_input_ids(batch_size, sequence_length, vocab_size)

        @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
        def encoder_decoder_train():
            loss = model(input_ids, decoder_input_ids=input_ids, labels=input_ids, training=True)[0]
            gradients = tf.gradients(loss, model.trainable_variables)
            return gradients

        @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
        def encoder_train():
            loss = model(input_ids, labels=input_ids, training=True)[0]
            gradients = tf.gradients(loss, model.trainable_variables)
            return gradients

        _train = encoder_decoder_train if config.is_encoder_decoder else encoder_train

        return _train
    def _measure_speed(self, func) -> float:
        with self.args.strategy.scope():
            try:
                if self.args.is_tpu or self.args.use_xla:
                    # run additional 5 times to stabilize compilation for tpu
                    logger.info("Do inference on TPU. Running model 5 times to stabilize compilation")
                    timeit.repeat(func, repeat=1, number=5)

                # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
                runtimes = timeit.repeat(
                    func,
                    repeat=self.args.repeat,
                    number=10,
                )

                return min(runtimes) / 10.0
            except ResourceExhaustedError as e:
                self.print_fn(f"Doesn't fit on GPU. {e}")
    def _measure_memory(self, func: Callable[[], None]) -> Tuple[Memory, MemorySummary]:
        logger.info(
            "Note that TensorFlow allocates more memory than "
            "it might need to speed up computation. "
            "The memory reported here corresponds to the memory "
            "reported by `nvidia-smi`, which can vary depending "
            "on total available memory on the GPU that is used."
        )
        with self.args.strategy.scope():
            try:
                if self.args.trace_memory_line_by_line:
                    if not self.args.eager_mode:
                        raise ValueError(
                            "`args.eager_mode` is set to `False`. Make sure to run model in eager mode to measure memory"
                            " consumption line by line."
                        )
                    trace = start_memory_tracing("transformers")

                if self.args.is_tpu:
                    # tpu
                    raise NotImplementedError(
                        "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking"
                        " with `args.memory=False`"
                    )
                elif self.args.is_gpu:
                    # gpu
                    if not is_py3nvml_available():
                        logger.warning(
                            "py3nvml not installed, we won't log GPU memory usage. "
                            "Install py3nvml (pip install py3nvml) to log information about GPU."
                        )
                        memory = "N/A"
                    else:
                        logger.info(
                            "Measuring total GPU usage on GPU device. Make sure to not have additional processes"
                            " running on the same GPU."
                        )
                        # init nvml
                        nvml.nvmlInit()
                        func()
                        handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                        meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
                        max_bytes_in_use = meminfo.used
                        memory = Memory(max_bytes_in_use)
                        # shutdown nvml
                        nvml.nvmlShutdown()
                else:
                    # cpu
                    if self.args.trace_memory_line_by_line:
                        logger.info(
                            "When enabling line by line tracing, the max peak memory for CPU is inaccurate in"
                            " TensorFlow."
                        )
                        memory = None
                    else:
                        memory_bytes = measure_peak_memory_cpu(func)
                        memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes

                if self.args.trace_memory_line_by_line:
                    summary = stop_memory_tracing(trace)
                    if memory is None:
                        memory = summary.total
                else:
                    summary = None

                return memory, summary
            except ResourceExhaustedError as e:
                self.print_fn(f"Doesn't fit on GPU. {e}")
                return "N/A", None