flax_utils.py 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022
  1. # coding=utf-8
  2. # Copyright 2021 The Google AI Flax Team Authors, and The HuggingFace Inc. team.
  3. # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. import copy
  17. import inspect
  18. import warnings
  19. from functools import partial
  20. from typing import Any, Dict, Optional, Union
  21. import flax
  22. import jax
  23. import jax.numpy as jnp
  24. import numpy as np
  25. from jax import lax
  26. from ..models.auto import (
  27. FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
  28. FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
  29. FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING,
  30. )
  31. from ..utils import ModelOutput, logging
  32. from .configuration_utils import GenerationConfig
  33. from .flax_logits_process import (
  34. FlaxForcedBOSTokenLogitsProcessor,
  35. FlaxForcedEOSTokenLogitsProcessor,
  36. FlaxForceTokensLogitsProcessor,
  37. FlaxLogitsProcessorList,
  38. FlaxMinLengthLogitsProcessor,
  39. FlaxNoRepeatNGramLogitsProcessor,
  40. FlaxSuppressTokensAtBeginLogitsProcessor,
  41. FlaxSuppressTokensLogitsProcessor,
  42. FlaxTemperatureLogitsWarper,
  43. FlaxTopKLogitsWarper,
  44. FlaxTopPLogitsWarper,
  45. )
  46. logger = logging.get_logger(__name__)
@flax.struct.dataclass
class FlaxGreedySearchOutput(ModelOutput):
    """
    Flax Base class for outputs of decoder-only generation models using greedy search.

    Args:
        sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
            The generated sequences.
    """

    # `FlaxGenerationMixin._greedy_search` returns its final `(batch_size, max_length)` token buffer here.
    sequences: jnp.ndarray = None
@flax.struct.dataclass
class FlaxSampleOutput(ModelOutput):
    """
    Flax Base class for outputs of decoder-only generation models using sampling.

    Args:
        sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
            The generated sequences.
    """

    # Token ids of the generated sequences; defaults to `None` when not provided.
    sequences: jnp.ndarray = None
@flax.struct.dataclass
class FlaxBeamSearchOutput(ModelOutput):
    """
    Flax Base class for outputs of decoder-only generation models using beam search.

    Args:
        sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
            The generated sequences.
        scores (`jnp.ndarray` of shape `(batch_size,)`):
            The scores (log probabilities) of the generated sequences.
    """

    # Token ids of the generated sequences; defaults to `None` when not provided.
    sequences: jnp.ndarray = None
    # One score per returned sequence; defaults to `None` when not provided.
    scores: jnp.ndarray = None
@flax.struct.dataclass
class GreedyState:
    """Loop-carried state threaded through the decoding loop of `FlaxGenerationMixin._greedy_search`."""

    # Array holding the current sequence length: starts at the prompt length and grows by one per step.
    cur_len: jnp.ndarray
    # `(batch_size, max_length)` int32 buffer, pre-filled with the pad token id and seeded with the prompt.
    sequences: jnp.ndarray
    # Tokens fed to the model on the next step: the whole prompt at first, then the `(batch_size, 1)` token
    # chosen in the previous step.
    running_token: jnp.ndarray
    # `(batch_size,)` boolean flags; a row is flagged once it has produced the EOS token.
    is_sent_finished: jnp.ndarray
    # Model inputs built by `prepare_inputs_for_generation` and refreshed by `update_inputs_for_generation`.
    model_kwargs: Dict[str, jnp.ndarray]
@flax.struct.dataclass
class SampleState:
    """Loop-carried state used by `FlaxGenerationMixin._sample`."""

    # Array holding the current sequence length; `_sample` initialises it to the prompt length.
    cur_len: jnp.ndarray
    # `(batch_size, max_length)` int32 buffer, pre-filled with the pad token id and seeded with the prompt.
    sequences: jnp.ndarray
    # NOTE(review): presumably the tokens fed to the model on the next step, as in `GreedyState` - confirm
    # against the `_sample` loop body.
    running_token: jnp.ndarray
    # `(batch_size,)` boolean flags, initialised to all-False by `_sample`.
    is_sent_finished: jnp.ndarray
    # JAX PRNG key; `_sample` falls back to `jax.random.PRNGKey(0)` when the caller passes none.
    prng_key: jnp.ndarray
    # Model inputs built by `prepare_inputs_for_generation`.
    model_kwargs: Dict[str, jnp.ndarray]
@flax.struct.dataclass
class BeamSearchState:
    """
    State container for beam-search decoding.

    NOTE(review): the code that populates these fields is not visible alongside this definition; the field
    comments below are inferred from the names and should be confirmed against `_beam_search`.
    """

    # Current sequence length.
    cur_len: jnp.ndarray
    # presumably the beams that are still being extended - TODO confirm
    running_sequences: jnp.ndarray
    # presumably the scores of `running_sequences` - TODO confirm
    running_scores: jnp.ndarray
    # presumably the best finished sequences found so far - TODO confirm
    sequences: jnp.ndarray
    # presumably the scores of `sequences` - TODO confirm
    scores: jnp.ndarray
    # presumably per-beam "finished" flags - TODO confirm
    is_sent_finished: jnp.ndarray
    # Keyword arguments passed to the model.
    model_kwargs: Dict[str, jnp.ndarray]
  101. class FlaxGenerationMixin:
  102. """
  103. A class containing all functions for auto-regressive text generation, to be used as a mixin in
  104. [`FlaxPreTrainedModel`].
  105. The class exposes [`~generation.FlaxGenerationMixin.generate`], which can be used for:
  106. - *greedy decoding* by calling [`~generation.FlaxGenerationMixin._greedy_search`] if `num_beams=1` and
  107. `do_sample=False`
  108. - *multinomial sampling* by calling [`~generation.FlaxGenerationMixin._sample`] if `num_beams=1` and
  109. `do_sample=True`
  110. - *beam-search decoding* by calling [`~generation.FlaxGenerationMixin._beam_search`] if `num_beams>1` and
  111. `do_sample=False`
  112. You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To
  113. learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
  114. """
  115. def prepare_inputs_for_generation(self, *args, **kwargs):
  116. raise NotImplementedError(
  117. "A model class needs to define a `prepare_inputs_for_generation` method in order to use `generate`."
  118. )
  119. @staticmethod
  120. def _run_loop_in_debug(cond_fn, body_fn, init_state):
  121. """
  122. Run generation in untraced mode. This should only be used for debugging purposes.
  123. """
  124. state = init_state
  125. while cond_fn(state):
  126. state = body_fn(state)
  127. return state
  128. def _prepare_encoder_decoder_kwargs_for_generation(self, input_ids, params, model_kwargs):
  129. encoder_kwargs = {
  130. argument: value
  131. for argument, value in model_kwargs.items()
  132. if not (argument.startswith("decoder_") or argument.startswith("cross_attn"))
  133. }
  134. model_kwargs["encoder_outputs"] = self.encode(input_ids, params=params, return_dict=True, **encoder_kwargs)
  135. return model_kwargs
  136. def _prepare_decoder_input_ids_for_generation(
  137. self,
  138. batch_size: int,
  139. decoder_start_token_id: int = None,
  140. bos_token_id: int = None,
  141. model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
  142. ) -> jnp.ndarray:
  143. if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
  144. # Only use this arg if not None, otherwise just remove from model_kwargs
  145. decoder_input_ids = model_kwargs.pop("decoder_input_ids")
  146. if decoder_input_ids is not None:
  147. return decoder_input_ids
  148. decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
  149. return jnp.array(decoder_start_token_id, dtype="i4").reshape(1, -1).repeat(batch_size, axis=0)
  150. def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int:
  151. # retrieve decoder_start_token_id for encoder-decoder models
  152. # fall back to bos_token_id if necessary
  153. decoder_start_token_id = (
  154. decoder_start_token_id
  155. if decoder_start_token_id is not None
  156. else self.generation_config.decoder_start_token_id
  157. )
  158. bos_token_id = bos_token_id if bos_token_id is not None else self.generation_config.bos_token_id
  159. if decoder_start_token_id is not None:
  160. return decoder_start_token_id
  161. elif (
  162. hasattr(self.config, "decoder")
  163. and hasattr(self.config.decoder, "decoder_start_token_id")
  164. and self.config.decoder.decoder_start_token_id is not None
  165. ):
  166. return self.config.decoder.decoder_start_token_id
  167. elif bos_token_id is not None:
  168. return bos_token_id
  169. elif (
  170. hasattr(self.config, "decoder")
  171. and hasattr(self.config.decoder, "bos_token_id")
  172. and self.config.decoder.bos_token_id is not None
  173. ):
  174. return self.config.decoder.bos_token_id
  175. raise ValueError(
  176. "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
  177. )
  178. @staticmethod
  179. def _expand_to_num_beams(tensor, num_beams):
  180. return jnp.broadcast_to(tensor[:, None], (tensor.shape[0], num_beams) + tensor.shape[1:])
    def _adapt_logits_for_beam_search(self, logits):
        """
        This function can be overwritten in the specific modeling_flax_<model-name>.py classes to allow for custom beam
        search behavior. Note that the only model that overwrites this method is [`~transformers.FlaxMarianMTModel`].

        The default implementation is the identity: `logits` is returned unchanged.
        """
        return logits
  187. def _validate_model_class(self):
  188. """
  189. Confirms that the model class is compatible with generation. If not, raises an exception that points to the
  190. right class to use.
  191. """
  192. if not self.can_generate():
  193. generate_compatible_mappings = [
  194. FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
  195. FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING,
  196. FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
  197. ]
  198. generate_compatible_classes = set()
  199. for model_mapping in generate_compatible_mappings:
  200. supported_models = model_mapping.get(type(self.config), default=None)
  201. if supported_models is not None:
  202. generate_compatible_classes.add(supported_models.__name__)
  203. exception_message = (
  204. f"The current model class ({self.__class__.__name__}) is not compatible with `.generate()`, as "
  205. "it doesn't have a language model head."
  206. )
  207. if generate_compatible_classes:
  208. exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
  209. raise TypeError(exception_message)
  210. def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
  211. """Validates model kwargs for generation. Generate argument typos will also be caught here."""
  212. unused_model_args = []
  213. model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
  214. # `kwargs`/`model_kwargs` is often used to handle optional forward pass inputs like `attention_mask`. If
  215. # `prepare_inputs_for_generation` doesn't accept them, then a stricter check can be made ;)
  216. if "kwargs" in model_args or "model_kwargs" in model_args:
  217. model_args |= set(inspect.signature(self.__call__).parameters)
  218. for key, value in model_kwargs.items():
  219. if value is not None and key not in model_args:
  220. unused_model_args.append(key)
  221. if unused_model_args:
  222. raise ValueError(
  223. f"The following `model_kwargs` are not used by the model: {unused_model_args} (note: typos in the"
  224. " generate arguments will also show up in this list)"
  225. )
    def generate(
        self,
        input_ids: jnp.ndarray,
        generation_config: Optional[GenerationConfig] = None,
        prng_key: Optional[jnp.ndarray] = None,
        trace: bool = True,
        params: Optional[Dict[str, jnp.ndarray]] = None,
        logits_processor: Optional[FlaxLogitsProcessorList] = None,
        **kwargs,
    ):
        r"""
        Generates sequences of token ids for models with a language modeling head.

        Parameters:
            input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            prng_key (`jnp.ndarray`, *optional*):
                PRNG key forwarded to `_sample` when sampling is selected. Defaults to `jax.random.PRNGKey(0)`.
            trace (`bool`, *optional*, defaults to `True`):
                Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to a
                considerably slower runtime.
            params (`Dict[str, jnp.ndarray]`, *optional*):
                Optionally the model parameters can be passed. Can be useful for parallelized generation.
            logits_processor (`FlaxLogitsProcessorList `, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and
                generation config. If a logit processor is passed that is already created with the arguments or a
                generation config an error is thrown. This feature is intended for advanced users.
            kwargs (`Dict[str, Any]`, *optional*):
                Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

        Return:
            [`~utils.ModelOutput`]: whatever the selected strategy (`_greedy_search`, `_sample` or `_beam_search`)
            returns.

        Raises:
            TypeError: if the model class is not compatible with generation (see `_validate_model_class`).
            ValueError: if some `kwargs` are consumed neither by the generation config nor by the model, if an
                encoder-decoder model has no `decoder_start_token_id`, or if `min_length` exceeds `max_length`.
            NotImplementedError: for any strategy other than greedy search, sampling with one beam, or beam
                search without sampling (e.g. sampling with `num_beams > 1`).
        """
        # Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
        self._validate_model_class()

        # priority: `generation_config` argument > `model.generation_config` (the default generation config)
        if generation_config is None:
            # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
            # two conditions must be met
            # 1) the generation config must have been created from the model config (`_from_model_config` field);
            # 2) the generation config must have seen no modification since its creation (the hash is the same).
            if self.generation_config._from_model_config and self.generation_config._original_object_hash == hash(
                self.generation_config
            ):
                new_generation_config = GenerationConfig.from_model_config(self.config)
                if new_generation_config != self.generation_config:
                    warnings.warn(
                        "You have modified the pretrained model configuration to control generation. This is a"
                        " deprecated strategy to control generation and will be removed soon, in a future version."
                        " Please use and modify the model generation configuration (see"
                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )"
                    )
                    self.generation_config = new_generation_config
            generation_config = self.generation_config

        # Work on a private copy so that per-call overrides never leak into the stored configuration.
        generation_config = copy.deepcopy(generation_config)
        model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
        self._validate_model_kwargs(model_kwargs.copy())

        logits_processor = logits_processor if logits_processor is not None else FlaxLogitsProcessorList()

        # set init values
        prng_key = prng_key if prng_key is not None else jax.random.PRNGKey(0)

        # Without a pad token, fall back to the (first) EOS token id for padding.
        if generation_config.pad_token_id is None and generation_config.eos_token_id is not None:
            if model_kwargs.get("attention_mask") is None:
                logger.warning(
                    "The attention mask and the pad token id were not set. As a consequence, you may observe "
                    "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
                )
            eos_token_id = generation_config.eos_token_id
            if isinstance(eos_token_id, list):
                eos_token_id = eos_token_id[0]
            logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.")
            generation_config.pad_token_id = eos_token_id

        if generation_config.decoder_start_token_id is None and self.config.is_encoder_decoder:
            raise ValueError("`decoder_start_token_id` has to be defined for encoder-decoder generation.")

        # decoder-only models should use left-padding for generation (can't be checked with `trace=True`)
        if not self.config.is_encoder_decoder and not trace:
            if (
                generation_config.pad_token_id is not None
                and jnp.sum(input_ids[:, -1] == generation_config.pad_token_id) > 0
            ):
                logger.warning(
                    "A decoder-only architecture is being used, but right-padding was detected! For correct "
                    "generation results, please set `padding_side='left'` when initializing the tokenizer."
                )

        batch_size = input_ids.shape[0]

        if self.config.is_encoder_decoder:
            # add encoder_outputs to model_kwargs
            if model_kwargs.get("encoder_outputs") is None:
                model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(input_ids, params, model_kwargs)
            # prepare decoder_input_ids for generation
            # (from here on `input_ids` refers to the decoder inputs, not the encoder prompt)
            input_ids = self._prepare_decoder_input_ids_for_generation(
                batch_size,
                decoder_start_token_id=generation_config.decoder_start_token_id,
                bos_token_id=generation_config.bos_token_id,
                model_kwargs=model_kwargs,
            )

        # Prepare `max_length` depending on other stopping criteria.
        input_ids_seq_length = input_ids.shape[-1]
        # True when the caller did not pass `max_length` explicitly, i.e. the value comes from the config.
        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
        if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
            # 20 is the default max_length of the generation config
            warnings.warn(
                f"Using the model-agnostic default `max_length` (={generation_config.max_length}) "
                "to control the generation length.  recommend setting `max_new_tokens` to control the maximum length of the generation.",
                UserWarning,
            )
        elif generation_config.max_new_tokens is not None:
            if not has_default_max_length and generation_config.max_length is not None:
                logger.warning(
                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
                    "Please refer to the documentation for more information. "
                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
                )
            # `max_new_tokens` counts generated tokens only, so the prompt length is added on top.
            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length

        if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
            raise ValueError(
                f"Unfeasable length constraints: the minimum length ({generation_config.min_length}) is larger than"
                f" the maximum length ({generation_config.max_length})"
            )
        if input_ids_seq_length >= generation_config.max_length:
            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
            logger.warning(
                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
                " increasing`max_new_tokens`."
            )

        # Merge the processors implied by the config with the user-supplied ones.
        logits_processor = self._get_logits_processor(
            generation_config=generation_config,
            input_ids_seq_length=input_ids_seq_length,
            logits_processor=logits_processor,
        )

        # Dispatch on (`do_sample`, `num_beams`): greedy search, sampling or beam search.
        if not generation_config.do_sample and generation_config.num_beams == 1:
            return self._greedy_search(
                input_ids,
                generation_config.max_length,
                generation_config.pad_token_id,
                generation_config.eos_token_id,
                logits_processor=logits_processor,
                trace=trace,
                params=params,
                model_kwargs=model_kwargs,
            )
        elif generation_config.do_sample and generation_config.num_beams == 1:
            logits_warper = self._get_logits_warper(generation_config=generation_config)
            return self._sample(
                input_ids,
                generation_config.max_length,
                generation_config.pad_token_id,
                generation_config.eos_token_id,
                prng_key,
                logits_warper=logits_warper,
                logits_processor=logits_processor,
                trace=trace,
                params=params,
                model_kwargs=model_kwargs,
            )
        elif not generation_config.do_sample and generation_config.num_beams > 1:
            # broadcast input_ids & encoder_outputs
            input_ids = self._expand_to_num_beams(input_ids, num_beams=generation_config.num_beams)

            if "encoder_outputs" in model_kwargs:
                model_kwargs["encoder_outputs"]["last_hidden_state"] = self._expand_to_num_beams(
                    model_kwargs["encoder_outputs"]["last_hidden_state"], num_beams=generation_config.num_beams
                )

            # The attention masks need the same beam axis as the inputs they describe.
            for kwarg in ["attention_mask", "decoder_attention_mask"]:
                if kwarg in model_kwargs:
                    model_kwargs[kwarg] = self._expand_to_num_beams(
                        model_kwargs[kwarg], num_beams=generation_config.num_beams
                    )

            return self._beam_search(
                input_ids,
                generation_config.max_length,
                generation_config.pad_token_id,
                generation_config.eos_token_id,
                length_penalty=generation_config.length_penalty,
                early_stopping=generation_config.early_stopping,
                logits_processor=logits_processor,
                trace=trace,
                params=params,
                num_return_sequences=generation_config.num_return_sequences,
                model_kwargs=model_kwargs,
            )
        else:
            raise NotImplementedError("`Beam sampling is currently not implemented.")
  414. def _get_logits_warper(self, generation_config: GenerationConfig) -> FlaxLogitsProcessorList:
  415. """
  416. This class returns a [`FlaxLogitsProcessorList`] list object that contains all relevant [`FlaxLogitsWarper`]
  417. instances used for multinomial sampling.
  418. """
  419. warpers = FlaxLogitsProcessorList()
  420. if generation_config.temperature is not None and generation_config.temperature != 1.0:
  421. warpers.append(FlaxTemperatureLogitsWarper(generation_config.temperature))
  422. if generation_config.top_k is not None and generation_config.top_k != 0:
  423. warpers.append(FlaxTopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=1))
  424. if generation_config.top_p is not None and generation_config.top_p < 1.0:
  425. warpers.append(FlaxTopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=1))
  426. return warpers
  427. def _get_logits_processor(
  428. self,
  429. generation_config: GenerationConfig,
  430. input_ids_seq_length: int,
  431. logits_processor: Optional[FlaxLogitsProcessorList],
  432. ) -> FlaxLogitsProcessorList:
  433. """
  434. This class returns a [`FlaxLogitsProcessorList`] list object that contains all relevant [`FlaxLogitsProcessor`]
  435. instances used to modify the scores of the language model head.
  436. """
  437. processors = FlaxLogitsProcessorList()
  438. if (
  439. generation_config.min_length is not None
  440. and generation_config.eos_token_id is not None
  441. and generation_config.min_length > -1
  442. ):
  443. processors.append(
  444. FlaxMinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id)
  445. )
  446. if generation_config.forced_bos_token_id is not None:
  447. processors.append(FlaxForcedBOSTokenLogitsProcessor(generation_config.forced_bos_token_id))
  448. if generation_config.forced_eos_token_id is not None:
  449. processors.append(
  450. FlaxForcedEOSTokenLogitsProcessor(generation_config.max_length, generation_config.forced_eos_token_id)
  451. )
  452. if generation_config.suppress_tokens is not None:
  453. processors.append(FlaxSuppressTokensLogitsProcessor(generation_config.suppress_tokens))
  454. if generation_config.begin_suppress_tokens is not None:
  455. begin_index = input_ids_seq_length
  456. begin_index = (
  457. begin_index
  458. if (input_ids_seq_length > 1 or generation_config.forced_bos_token_id is None)
  459. else begin_index + 1
  460. )
  461. if generation_config.forced_decoder_ids is not None and len(generation_config.forced_decoder_ids) > 0:
  462. # generation starts after the last token that is forced
  463. begin_index += generation_config.forced_decoder_ids[-1][0]
  464. processors.append(
  465. FlaxSuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, begin_index)
  466. )
  467. if generation_config.forced_decoder_ids is not None:
  468. forced_decoder_ids = [
  469. [input_ids_seq_length + i[0] - 1, i[1]] for i in generation_config.forced_decoder_ids
  470. ]
  471. processors.append(FlaxForceTokensLogitsProcessor(forced_decoder_ids))
  472. if generation_config.no_repeat_ngram_size is not None and generation_config.no_repeat_ngram_size > 0:
  473. processors.append(FlaxNoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size))
  474. processors = self._merge_criteria_processor_list(processors, logits_processor)
  475. return processors
  476. def _merge_criteria_processor_list(
  477. self,
  478. default_list: FlaxLogitsProcessorList,
  479. custom_list: FlaxLogitsProcessorList,
  480. ) -> FlaxLogitsProcessorList:
  481. if len(custom_list) == 0:
  482. return default_list
  483. for default in default_list:
  484. for custom in custom_list:
  485. if type(custom) is type(default):
  486. object_type = "logits processor"
  487. raise ValueError(
  488. f"A custom {object_type} of type {type(custom)} with values {custom} has been passed to"
  489. f" `generate`, but it has already been created with the values {default}. {default} has been"
  490. " created by passing the corresponding arguments to generate or by the model's config default"
  491. f" values. If you just want to change the default values of {object_type} consider passing"
  492. f" them as arguments to `generate` instead of using a custom {object_type}."
  493. )
  494. default_list.extend(custom_list)
  495. return default_list
    def _greedy_search(
        self,
        input_ids: jnp.ndarray,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        logits_processor: Optional[FlaxLogitsProcessorList] = None,
        trace: bool = True,
        params: Optional[Dict[str, jnp.ndarray]] = None,
        model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
    ):
        """
        Greedy decoding: at every step pick the arg-max token of the processed last-position logits.

        Args:
            input_ids: 2-D prompt of shape `(batch_size, prompt_length)`.
            max_length: total length of the output buffer; defaults to `self.generation_config.max_length`.
            pad_token_id: fills the output buffer and is emitted for rows that already finished; defaults to
                `self.generation_config.pad_token_id`.
            eos_token_id: a row is finished once it produces this token; defaults to
                `self.generation_config.eos_token_id`.
            logits_processor: called as `logits_processor(sequences, logits, cur_len)` at every step.
            trace: when `False`, the loop runs eagerly via `_run_loop_in_debug` instead of `lax.while_loop`.
            params: forwarded to the model call as `params=`.
            model_kwargs: extra keyword arguments handed to `prepare_inputs_for_generation`.

        Returns:
            `FlaxGreedySearchOutput` whose `sequences` is the `(batch_size, max_length)` token buffer.
        """
        # init values
        max_length = max_length if max_length is not None else self.generation_config.max_length
        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id

        batch_size, cur_len = input_ids.shape

        # Convert the scalars to arrays so they can be captured by the loop functions below.
        eos_token_id = jnp.array(eos_token_id, dtype=jnp.int32 if eos_token_id is not None else None)
        pad_token_id = jnp.array(pad_token_id, dtype=jnp.int32)
        cur_len = jnp.array(cur_len)

        # per batch-item holding current token in loop.
        # (output buffer: all pad tokens, with the prompt written at the start of each row)
        sequences = jnp.full((batch_size, max_length), pad_token_id, dtype=jnp.int32)
        sequences = lax.dynamic_update_slice(sequences, input_ids, (0, 0))

        # per batch-item state bit indicating if sentence has finished.
        is_sent_finished = jnp.zeros((batch_size,), dtype=jnp.bool_)

        # For Seq2Seq generation, we only need to use the decoder instead of the whole model in generation loop
        # and pass it the `encoder_outputs`, which are part of the `model_kwargs`.
        model = self.decode if self.config.is_encoder_decoder else self
        # initialize model specific kwargs
        model_kwargs = self.prepare_inputs_for_generation(input_ids, max_length, **model_kwargs)

        # initialize state
        state = GreedyState(
            cur_len=cur_len,
            sequences=sequences,
            running_token=input_ids,
            is_sent_finished=is_sent_finished,
            model_kwargs=model_kwargs,
        )

        def greedy_search_cond_fn(state):
            """state termination condition fn."""
            # Keep looping until the buffer is full or every row has finished.
            has_reached_max_length = state.cur_len == max_length
            all_sequence_finished = jnp.all(state.is_sent_finished)
            finish_generation = jnp.logical_or(has_reached_max_length, all_sequence_finished)
            return ~finish_generation

        def greedy_search_body_fn(state):
            """state update fn."""
            model_outputs = model(state.running_token, params=params, **state.model_kwargs)
            # Only the logits of the last position are used to choose the next token.
            logits = model_outputs.logits[:, -1]

            # apply min_length, ...
            logits = logits_processor(state.sequences, logits, state.cur_len)

            next_token = jnp.argmax(logits, axis=-1)

            # Rows that already finished keep emitting the pad token instead of the arg-max token.
            next_token = next_token * ~state.is_sent_finished + pad_token_id * state.is_sent_finished
            next_is_sent_finished = state.is_sent_finished | (next_token == eos_token_id)
            next_token = next_token[:, None]

            # Write the new token into column `cur_len` of the output buffer.
            next_sequences = lax.dynamic_update_slice(state.sequences, next_token, (0, state.cur_len))
            next_model_kwargs = self.update_inputs_for_generation(model_outputs, state.model_kwargs)
            return GreedyState(
                cur_len=state.cur_len + 1,
                sequences=next_sequences,
                running_token=next_token,
                is_sent_finished=next_is_sent_finished,
                model_kwargs=next_model_kwargs,
            )

        # The very first prompt often has sequence length > 1, so run outside of `lax.while_loop` to comply with TPU
        if input_ids.shape[1] > 1:
            state = greedy_search_body_fn(state)

        if not trace:
            state = self._run_loop_in_debug(greedy_search_cond_fn, greedy_search_body_fn, state)
        else:
            state = lax.while_loop(greedy_search_cond_fn, greedy_search_body_fn, state)

        return FlaxGreedySearchOutput(sequences=state.sequences)
  566. def _sample(
  567. self,
  568. input_ids: None,
  569. max_length: Optional[int] = None,
  570. pad_token_id: Optional[int] = None,
  571. eos_token_id: Optional[int] = None,
  572. prng_key: Optional[jnp.ndarray] = None,
  573. logits_processor: Optional[FlaxLogitsProcessorList] = None,
  574. logits_warper: Optional[FlaxLogitsProcessorList] = None,
  575. trace: bool = True,
  576. params: Optional[Dict[str, jnp.ndarray]] = None,
  577. model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
  578. ):
  579. # init values
  580. max_length = max_length if max_length is not None else self.generation_config.max_length
  581. pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
  582. eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
  583. prng_key = prng_key if prng_key is not None else jax.random.PRNGKey(0)
  584. batch_size, cur_len = input_ids.shape
  585. eos_token_id = jnp.array(eos_token_id, dtype=jnp.int32 if eos_token_id is not None else None)
  586. pad_token_id = jnp.array(pad_token_id, dtype=jnp.int32)
  587. cur_len = jnp.array(cur_len)
  588. # per batch-item holding current token in loop.
  589. sequences = jnp.full((batch_size, max_length), pad_token_id, dtype=jnp.int32)
  590. sequences = lax.dynamic_update_slice(sequences, input_ids, (0, 0))
  591. # per batch-item state bit indicating if sentence has finished.
  592. is_sent_finished = jnp.zeros((batch_size,), dtype=jnp.bool_)
  593. # For Seq2Seq generation, we only need to use the decoder instead of the whole model in generation loop
  594. # and pass it the `encoder_outputs`, which are part of the `model_kwargs`.
  595. model = self.decode if self.config.is_encoder_decoder else self
  596. # initialize model specific kwargs
  597. model_kwargs = self.prepare_inputs_for_generation(input_ids, max_length, **model_kwargs)
  598. # initialize state
  599. state = SampleState(
  600. cur_len=cur_len,
  601. sequences=sequences,
  602. running_token=input_ids,
  603. is_sent_finished=is_sent_finished,
  604. prng_key=prng_key,
  605. model_kwargs=model_kwargs,
  606. )
  607. def sample_search_cond_fn(state):
  608. """state termination condition fn."""
  609. has_reached_max_length = state.cur_len == max_length
  610. all_sequence_finished = jnp.all(state.is_sent_finished)
  611. finish_generation = jnp.logical_or(has_reached_max_length, all_sequence_finished)
  612. return ~finish_generation
  613. def sample_search_body_fn(state):
  614. """state update fn."""
  615. prng_key, prng_key_next = jax.random.split(state.prng_key)
  616. model_outputs = model(state.running_token, params=params, **state.model_kwargs)
  617. logits = model_outputs.logits[:, -1]
  618. # apply min_length, ...
  619. logits = logits_processor(state.sequences, logits, state.cur_len)
  620. # apply top_p, top_k, temperature
  621. logits = logits_warper(logits, logits, state.cur_len)
  622. next_token = jax.random.categorical(prng_key, logits, axis=-1)
  623. next_token = next_token * ~state.is_sent_finished + pad_token_id * state.is_sent_finished
  624. next_is_sent_finished = state.is_sent_finished | (next_token == eos_token_id)
  625. next_token = next_token[:, None]
  626. next_sequences = lax.dynamic_update_slice(state.sequences, next_token, (0, state.cur_len))
  627. next_model_kwargs = self.update_inputs_for_generation(model_outputs, state.model_kwargs)
  628. return SampleState(
  629. cur_len=state.cur_len + 1,
  630. sequences=next_sequences,
  631. running_token=next_token,
  632. is_sent_finished=next_is_sent_finished,
  633. model_kwargs=next_model_kwargs,
  634. prng_key=prng_key_next,
  635. )
  636. # The very first prompt often has sequence length > 1, so run outside of `lax.while_loop` to comply with TPU
  637. if input_ids.shape[1] > 1:
  638. state = sample_search_body_fn(state)
  639. if not trace:
  640. state = self._run_loop_in_debug(sample_search_cond_fn, sample_search_body_fn, state)
  641. else:
  642. state = lax.while_loop(sample_search_cond_fn, sample_search_body_fn, state)
  643. return FlaxSampleOutput(sequences=state.sequences)
  644. def _beam_search(
  645. self,
  646. input_ids: None,
  647. max_length: Optional[int] = None,
  648. pad_token_id: Optional[int] = None,
  649. eos_token_id: Optional[int] = None,
  650. length_penalty: Optional[float] = None,
  651. early_stopping: Optional[Union[bool, str]] = None,
  652. logits_processor: Optional[FlaxLogitsProcessorList] = None,
  653. trace: bool = True,
  654. params: Optional[Dict[str, jnp.ndarray]] = None,
  655. num_return_sequences: Optional[int] = None,
  656. model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
  657. ):
  658. """
  659. This beam search function is heavily inspired by Flax's official example:
  660. https://github.com/google/flax/blob/main/examples/wmt/decode.py
  661. """
  662. def flatten_beam_dim(tensor):
  663. """Flattens the first two dimensions of a non-scalar array."""
  664. # ignore scalars (e.g. cache index)
  665. if tensor.ndim == 0:
  666. return tensor
  667. return tensor.reshape((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:])
  668. def unflatten_beam_dim(tensor, batch_size, num_beams):
  669. """Unflattens the first, flat batch*beam dimension of a non-scalar array."""
  670. # ignore scalars (e.g. cache index)
  671. if tensor.ndim == 0:
  672. return tensor
  673. return tensor.reshape((batch_size, num_beams) + tensor.shape[1:])
  674. def gather_beams(nested, beam_indices, batch_size, new_num_beams):
  675. """
  676. Gathers the beam slices indexed by beam_indices into new beam array.
  677. """
  678. batch_indices = jnp.reshape(
  679. jnp.arange(batch_size * new_num_beams) // new_num_beams, (batch_size, new_num_beams)
  680. )
  681. def gather_fn(tensor):
  682. # ignore scalars (e.g. cache index)
  683. if tensor.ndim == 0:
  684. return tensor
  685. else:
  686. return tensor[batch_indices, beam_indices]
  687. return jax.tree_util.tree_map(gather_fn, nested)
  688. # init values
  689. max_length = max_length if max_length is not None else self.generation_config.max_length
  690. pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
  691. eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
  692. length_penalty = length_penalty if length_penalty is not None else self.generation_config.length_penalty
  693. early_stopping = early_stopping if early_stopping is not None else self.generation_config.early_stopping
  694. num_return_sequences = (
  695. num_return_sequences if num_return_sequences is not None else self.generation_config.num_return_sequences
  696. )
  697. batch_size, num_beams, cur_len = input_ids.shape
  698. eos_token_id = jnp.array(eos_token_id, dtype=jnp.int32 if eos_token_id is not None else None)
  699. pad_token_id = jnp.array(pad_token_id, dtype=jnp.int32)
  700. cur_len = jnp.array(cur_len)
  701. # record the prompt length of decoder
  702. decoder_prompt_len = input_ids.shape[-1]
  703. # per batch,beam-item holding current token in loop.
  704. sequences = jnp.full((batch_size, num_beams, max_length), pad_token_id, dtype=jnp.int32)
  705. running_sequences = jnp.full((batch_size, num_beams, max_length), pad_token_id, dtype=jnp.int32)
  706. running_sequences = lax.dynamic_update_slice(sequences, input_ids, (0, 0, 0))
  707. # per batch,beam-item state bit indicating if sentence has finished.
  708. is_sent_finished = jnp.zeros((batch_size, num_beams), dtype=jnp.bool_)
  709. # per batch,beam-item score, logprobs
  710. running_scores = jnp.tile(jnp.array([0.0] + [np.array(-1.0e7)] * (num_beams - 1)), [batch_size, 1])
  711. scores = jnp.ones((batch_size, num_beams)) * np.array(-1.0e7)
  712. # For Seq2Seq generation, we only need to use the decoder instead of the whole model in generation loop
  713. # and pass it the `encoder_outputs`, which are part of the `model_kwargs`.
  714. model = self.decode if self.config.is_encoder_decoder else self
  715. # flatten beam dim
  716. if "encoder_outputs" in model_kwargs:
  717. model_kwargs["encoder_outputs"]["last_hidden_state"] = flatten_beam_dim(
  718. model_kwargs["encoder_outputs"]["last_hidden_state"]
  719. )
  720. for kwarg in ["attention_mask", "decoder_attention_mask"]:
  721. if kwarg in model_kwargs:
  722. model_kwargs[kwarg] = flatten_beam_dim(model_kwargs[kwarg])
  723. # initialize model specific kwargs
  724. model_kwargs = self.prepare_inputs_for_generation(flatten_beam_dim(input_ids), max_length, **model_kwargs)
  725. # initialize state
  726. state = BeamSearchState(
  727. cur_len=cur_len,
  728. running_sequences=running_sequences,
  729. running_scores=running_scores,
  730. sequences=sequences,
  731. scores=scores,
  732. is_sent_finished=is_sent_finished,
  733. model_kwargs=model_kwargs,
  734. )
  735. def beam_search_cond_fn(state):
  736. """beam search state termination condition fn."""
  737. # 1. is less than max length?
  738. not_max_length_yet = state.cur_len < max_length
  739. # 2. can the new beams still improve?
  740. # early_stopping == False -> apply heuristic = always get the best score from `cur_len`. See the discussion
  741. # below for more details.
  742. # https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565
  743. # early_stopping == "never" -> compute the best score from max_length or cur_len, depending on the sign of
  744. # length_penalty. Positive length_penalty favors longer sequences, thus we use max_length there.
  745. if early_stopping == "never" and length_penalty > 0.0:
  746. best_running_score = state.running_scores[:, :1] / (
  747. (max_length - decoder_prompt_len) ** length_penalty
  748. )
  749. else:
  750. best_running_score = state.running_scores[:, :1] / (
  751. (state.cur_len - decoder_prompt_len) ** length_penalty
  752. )
  753. worst_finished_score = jnp.where(
  754. state.is_sent_finished, jnp.min(state.scores, axis=1, keepdims=True), np.array(-1.0e7)
  755. )
  756. improvement_still_possible = jnp.any(best_running_score > worst_finished_score)
  757. # 3. is there still a beam that has not finished?
  758. still_open_beam = ~(jnp.all(state.is_sent_finished) & (early_stopping is True))
  759. return not_max_length_yet & still_open_beam & improvement_still_possible
  760. def beam_search_body_fn(state, input_ids_length=1):
  761. """beam search state update fn."""
  762. # 1. Forward current tokens
  763. # Collect the current position slice along length to feed the fast
  764. # autoregressive decoder model. Flatten the beam dimension into batch
  765. # dimension for feeding into the model.
  766. # unflatten beam dimension
  767. # Unflatten beam dimension in attention cache arrays
  768. input_token = flatten_beam_dim(
  769. lax.dynamic_slice(
  770. state.running_sequences,
  771. (0, 0, state.cur_len - input_ids_length),
  772. (batch_size, num_beams, input_ids_length),
  773. )
  774. )
  775. model_outputs = model(input_token, params=params, **state.model_kwargs)
  776. logits = unflatten_beam_dim(model_outputs.logits[:, -1], batch_size, num_beams)
  777. cache = jax.tree_util.tree_map(
  778. lambda tensor: unflatten_beam_dim(tensor, batch_size, num_beams), model_outputs.past_key_values
  779. )
  780. # adapt logits for FlaxMarianMTModel
  781. logits = self._adapt_logits_for_beam_search(logits)
  782. # 2. Compute log probs
  783. # get log probabilities from logits,
  784. # process logits with processors (*e.g.* min_length, ...), and
  785. # add new logprobs to existing running logprobs scores.
  786. log_probs = jax.nn.log_softmax(logits)
  787. log_probs = logits_processor(
  788. flatten_beam_dim(state.running_sequences), flatten_beam_dim(log_probs), state.cur_len
  789. )
  790. log_probs = unflatten_beam_dim(log_probs, batch_size, num_beams)
  791. log_probs = log_probs + jnp.expand_dims(state.running_scores, axis=2)
  792. vocab_size = log_probs.shape[2]
  793. log_probs = log_probs.reshape((batch_size, num_beams * vocab_size))
  794. # 3. Retrieve top-K
  795. # Each item in batch has num_beams * vocab_size candidate sequences.
  796. # For each item, get the top 2*k candidates with the highest log-
  797. # probabilities. We gather the top 2*K beams here so that even if the best
  798. # K sequences reach EOS simultaneously, we have another K sequences
  799. # remaining to continue the live beam search.
  800. # Gather the top 2*K scores from _all_ beams.
  801. # Gather 2*k top beams.
  802. # Recover the beam index by floor division.
  803. # Recover token id by modulo division and expand Id array for broadcasting.
  804. # Update sequences for the 2*K top-k new sequences.
  805. beams_to_keep = 2 * num_beams
  806. topk_log_probs, topk_indices = lax.top_k(log_probs, k=beams_to_keep)
  807. topk_beam_indices = topk_indices // vocab_size
  808. topk_running_sequences = gather_beams(
  809. state.running_sequences, topk_beam_indices, batch_size, beams_to_keep
  810. )
  811. topk_ids = jnp.expand_dims(topk_indices % vocab_size, axis=2)
  812. topk_sequences = lax.dynamic_update_slice(topk_running_sequences, topk_ids, (0, 0, state.cur_len))
  813. # 4. Check which sequences have ended
  814. # Update current sequences:
  815. # Did any of these sequences reach an end marker?
  816. # To prevent these just finished sequences from being added to the current sequences
  817. # set of active beam search sequences, set their log probs to a very large
  818. # negative value.
  819. did_topk_just_finished = topk_sequences[:, :, state.cur_len] == eos_token_id
  820. running_topk_log_probs = topk_log_probs + did_topk_just_finished * np.array(-1.0e7)
  821. # 5. Get running sequences scores for next
  822. # Determine the top k beam indices (from top 2*k beams) from log probs
  823. # and gather top k beams (from top 2*k beams).
  824. next_topk_indices = lax.top_k(running_topk_log_probs, k=num_beams)[1]
  825. next_running_sequences, next_running_scores = gather_beams(
  826. [topk_sequences, running_topk_log_probs], next_topk_indices, batch_size, num_beams
  827. )
  828. # 6. Process topk logits
  829. # Further process log probs:
  830. # - add length penalty
  831. # - make sure no scores can be added anymore if beam is full
  832. # - make sure still running sequences cannot be chosen as finalized beam
  833. topk_log_probs = topk_log_probs / ((state.cur_len + 1 - decoder_prompt_len) ** length_penalty)
  834. beams_in_batch_are_full = jnp.broadcast_to(
  835. state.is_sent_finished.all(axis=-1, keepdims=True), did_topk_just_finished.shape
  836. ) & (early_stopping is True)
  837. add_penalty = ~did_topk_just_finished | beams_in_batch_are_full
  838. topk_log_probs += add_penalty * np.array(-1.0e7)
  839. # 7. Get scores, sequences, is sentence finished for next.
  840. # Combine sequences, scores, and flags along the beam dimension and compare
  841. # new finished sequence scores to existing finished scores and select the
  842. # best from the new set of beams
  843. merged_sequences = jnp.concatenate([state.sequences, topk_sequences], axis=1)
  844. merged_scores = jnp.concatenate([state.scores, topk_log_probs], axis=1)
  845. merged_is_sent_finished = jnp.concatenate([state.is_sent_finished, did_topk_just_finished], axis=1)
  846. topk_merged_indices = lax.top_k(merged_scores, k=num_beams)[1]
  847. next_sequences, next_scores, next_is_sent_finished = gather_beams(
  848. [merged_sequences, merged_scores, merged_is_sent_finished], topk_merged_indices, batch_size, num_beams
  849. )
  850. # 8. Update model kwargs.
  851. # Determine the top k beam indices from the original set of all beams.
  852. # With these, gather the top k beam-associated caches.
  853. next_running_indices = gather_beams(topk_beam_indices, next_topk_indices, batch_size, num_beams)
  854. next_cache = gather_beams(cache, next_running_indices, batch_size, num_beams)
  855. model_outputs["past_key_values"] = jax.tree_util.tree_map(lambda x: flatten_beam_dim(x), next_cache)
  856. next_model_kwargs = self.update_inputs_for_generation(model_outputs, state.model_kwargs)
  857. return BeamSearchState(
  858. cur_len=state.cur_len + 1,
  859. running_scores=next_running_scores,
  860. running_sequences=next_running_sequences,
  861. scores=next_scores,
  862. sequences=next_sequences,
  863. is_sent_finished=next_is_sent_finished,
  864. model_kwargs=next_model_kwargs,
  865. )
  866. # Always run first iteration outside of `lax.while_loop` to avoid calling `beam_search_cond_fn`
  867. # when `state.cur_len` equals `decoder_prompt_len`. This also helps to comply with TPU when
  868. # the very first prompt has sequence length > 1.
  869. state = partial(beam_search_body_fn, input_ids_length=input_ids.shape[-1])(state)
  870. if not trace:
  871. state = self._run_loop_in_debug(beam_search_cond_fn, beam_search_body_fn, state)
  872. else:
  873. state = lax.while_loop(beam_search_cond_fn, beam_search_body_fn, state)
  874. # Account for the edge-case where there are no finished sequences for a
  875. # particular batch item. If so, return running sequences for that batch item.
  876. none_finished = jnp.any(state.is_sent_finished, axis=1)
  877. sequences = jnp.where(none_finished[:, None, None], state.sequences, state.running_sequences)
  878. scores = jnp.where(none_finished[:, None], state.scores, state.running_scores)
  879. # Take best beams for each batch (the score is sorted in descending order)
  880. sequences = flatten_beam_dim(sequences[:, :num_return_sequences, :])
  881. scores = flatten_beam_dim(scores[:, :num_return_sequences])
  882. return FlaxBeamSearchOutput(sequences=sequences, scores=scores)