processing_utils.py 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168
  1. # coding=utf-8
  2. # Copyright 2022 The HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """
  16. Processing saving/loading class for common processors.
  17. """
  18. import copy
  19. import inspect
  20. import json
  21. import os
  22. import sys
  23. import typing
  24. import warnings
  25. from pathlib import Path
  26. from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union
  27. import numpy as np
  28. import typing_extensions
  29. from .dynamic_module_utils import custom_object_save
  30. from .image_utils import ChannelDimension, is_valid_image, is_vision_available
  31. if is_vision_available():
  32. from .image_utils import PILImageResampling
  33. from .tokenization_utils_base import (
  34. PaddingStrategy,
  35. PreTokenizedInput,
  36. PreTrainedTokenizerBase,
  37. TextInput,
  38. TruncationStrategy,
  39. )
  40. from .utils import (
  41. CHAT_TEMPLATE_NAME,
  42. PROCESSOR_NAME,
  43. PushToHubMixin,
  44. TensorType,
  45. add_model_info_to_auto_map,
  46. add_model_info_to_custom_pipelines,
  47. cached_file,
  48. copy_func,
  49. direct_transformers_import,
  50. download_url,
  51. is_offline_mode,
  52. is_remote_url,
  53. logging,
  54. )
# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Dynamically import the Transformers module to grab the attribute classes of the processor from their names.
transformers_module = direct_transformers_import(Path(__file__).parent)

# Maps an `AutoXxx` class name to the name of the base class used for `isinstance` checks in
# `ProcessorMixin.__init__` (nothing is ever an instance of an `AutoXxx` class).
AUTO_TO_BASE_CLASS_MAPPING = {
    "AutoTokenizer": "PreTrainedTokenizerBase",
    "AutoFeatureExtractor": "FeatureExtractionMixin",
    "AutoImageProcessor": "ImageProcessingMixin",
}

# `Unpack` is taken from the standard `typing` module on Python >= 3.11 and from `typing_extensions` otherwise.
if sys.version_info >= (3, 11):
    Unpack = typing.Unpack
else:
    Unpack = typing_extensions.Unpack
class TextKwargs(TypedDict, total=False):
    """
    Keyword arguments for text processing. For extended documentation, check out tokenization_utils_base methods and
    docstrings associated.

    All keys are optional (`total=False`).

    Attributes:
        text_pair (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*):
            Presumably the second sequence (or batch of sequences) of a pair -- see tokenization_utils_base.
        text_target (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*):
            Presumably the sequence (or batch of sequences) to encode as targets -- see tokenization_utils_base.
        text_pair_target (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*):
            Presumably the second sequence (or batch) of a target pair -- see tokenization_utils_base.
        add_special_tokens (`bool`, *optional*):
            Whether or not to add special tokens when encoding the sequences.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*):
            Activates and controls padding.
        truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*):
            Activates and controls truncation.
        max_length (`int`, *optional*):
            Controls the maximum length to use by one of the truncation/padding parameters.
        stride (`int`, *optional*):
            If set, the overflowing tokens will contain some tokens from the end of the truncated sequence.
        is_split_into_words (`bool`, *optional*):
            Whether or not the input is already pre-tokenized.
        pad_to_multiple_of (`int`, *optional*):
            If set, will pad the sequence to a multiple of the provided value.
        return_token_type_ids (`bool`, *optional*):
            Whether to return token type IDs.
        return_attention_mask (`bool`, *optional*):
            Whether to return the attention mask.
        return_overflowing_tokens (`bool`, *optional*):
            Whether or not to return overflowing token sequences.
        return_special_tokens_mask (`bool`, *optional*):
            Whether or not to return special tokens mask information.
        return_offsets_mapping (`bool`, *optional*):
            Whether or not to return `(char_start, char_end)` for each token.
        return_length (`bool`, *optional*):
            Whether or not to return the lengths of the encoded inputs.
        verbose (`bool`, *optional*):
            Whether or not to print more information and warnings.
        padding_side (`str`, *optional*):
            The side on which padding will be applied.
    """

    text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]]
    text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
    text_pair_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]]
    add_special_tokens: Optional[bool]
    padding: Union[bool, str, PaddingStrategy]
    truncation: Union[bool, str, TruncationStrategy]
    max_length: Optional[int]
    stride: Optional[int]
    is_split_into_words: Optional[bool]
    pad_to_multiple_of: Optional[int]
    return_token_type_ids: Optional[bool]
    return_attention_mask: Optional[bool]
    return_overflowing_tokens: Optional[bool]
    return_special_tokens_mask: Optional[bool]
    return_offsets_mapping: Optional[bool]
    return_length: Optional[bool]
    verbose: Optional[bool]
    padding_side: Optional[str]
class ImagesKwargs(TypedDict, total=False):
    """
    Keyword arguments for image processing. For extended documentation, check the appropriate ImageProcessor
    class methods and docstrings.

    All keys are optional (`total=False`).

    Attributes:
        do_resize (`bool`, *optional*):
            Whether to resize the image.
        size (`Dict[str, int]`, *optional*):
            Resize the shorter side of the input to `size["shortest_edge"]`.
        size_divisor (`int`, *optional*):
            The size by which to make sure both the height and width can be divided.
        crop_size (`Dict[str, int]`, *optional*):
            Desired output size when applying center-cropping.
        resample (`PILImageResampling` or `int`, *optional*):
            Resampling filter to use if resizing the image.
        do_rescale (`bool`, *optional*):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*):
            Mean to use if normalizing the image.
        image_std (`float` or `List[float]`, *optional*):
            Standard deviation to use if normalizing the image.
        do_pad (`bool`, *optional*):
            Whether to pad the image to the `(max_height, max_width)` of the images in the batch.
        pad_size (`Dict[str, int]`, *optional*):
            The size `{"height": int, "width": int}` to pad the images to.
        do_center_crop (`bool`, *optional*):
            Whether to center crop the image.
        data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the output image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image.
    """

    do_resize: Optional[bool]
    size: Optional[Dict[str, int]]
    size_divisor: Optional[int]
    crop_size: Optional[Dict[str, int]]
    resample: Optional[Union["PILImageResampling", int]]
    do_rescale: Optional[bool]
    rescale_factor: Optional[float]
    do_normalize: Optional[bool]
    image_mean: Optional[Union[float, List[float]]]
    image_std: Optional[Union[float, List[float]]]
    do_pad: Optional[bool]
    pad_size: Optional[Dict[str, int]]
    do_center_crop: Optional[bool]
    data_format: Optional[ChannelDimension]
    input_data_format: Optional[Union[str, ChannelDimension]]
class VideosKwargs(TypedDict, total=False):
    """
    Keyword arguments for video processing.

    All keys are optional (`total=False`).

    Attributes:
        do_resize (`bool`, *optional*):
            Whether to resize the image.
        size (`Dict[str, int]`, *optional*):
            Resize the shorter side of the input to `size["shortest_edge"]`.
        size_divisor (`int`, *optional*):
            The size by which to make sure both the height and width can be divided.
        resample (`PILImageResampling`, *optional*):
            Resampling filter to use if resizing the image.
        do_rescale (`bool`, *optional*):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*):
            Mean to use if normalizing the image.
        image_std (`float` or `List[float]`, *optional*):
            Standard deviation to use if normalizing the image.
        do_pad (`bool`, *optional*):
            Whether to pad the image to the `(max_height, max_width)` of the images in the batch.
        do_center_crop (`bool`, *optional*):
            Whether to center crop the image.
        data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the output image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image.
    """

    do_resize: Optional[bool]
    size: Optional[Dict[str, int]]
    size_divisor: Optional[int]
    resample: Optional["PILImageResampling"]
    do_rescale: Optional[bool]
    rescale_factor: Optional[float]
    do_normalize: Optional[bool]
    image_mean: Optional[Union[float, List[float]]]
    image_std: Optional[Union[float, List[float]]]
    do_pad: Optional[bool]
    do_center_crop: Optional[bool]
    data_format: Optional[ChannelDimension]
    input_data_format: Optional[Union[str, ChannelDimension]]
class AudioKwargs(TypedDict, total=False):
    """
    Keyword arguments for audio processing.

    All keys are optional (`total=False`).

    Attributes:
        sampling_rate (`int`, *optional*):
            The sampling rate at which the `raw_speech` input was sampled.
        raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
            The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
            values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
            stereo, i.e. single float per timestep.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding
            index) among:

            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        truncation (`bool`, *optional*):
            Activates truncation to cut input sequences longer than *max_length* to *max_length*.
        pad_to_multiple_of (`int`, *optional*):
            If set, will pad the sequence to a multiple of the provided value.
        return_attention_mask (`bool`, *optional*):
            Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
    """

    sampling_rate: Optional[int]
    raw_speech: Optional[Union["np.ndarray", List[float], List["np.ndarray"], List[List[float]]]]
    padding: Optional[Union[bool, str, PaddingStrategy]]
    max_length: Optional[int]
    truncation: Optional[bool]
    pad_to_multiple_of: Optional[int]
    return_attention_mask: Optional[bool]
class CommonKwargs(TypedDict, total=False):
    """
    Keyword arguments that are not tied to a single modality.

    Attributes:
        return_tensors (`str` or [`~utils.TensorType`], *optional*):
            Presumably selects the tensor type of the returned outputs -- confirm against the processor
            `__call__` that consumes it.
    """

    return_tensors: Optional[Union[str, TensorType]]
class ProcessingKwargs(TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, total=False):
    """
    Base class for kwargs passing to processors.
    A model should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
        1) Additional typed keys that this model requires to process inputs.
        2) Default values for existing keys under a `_defaults` attribute.
    New keys have to be defined as follows to ensure type hinting is done correctly.

    ```python
    # adding a new image kwarg for this model
    class ModelImagesKwargs(ImagesKwargs, total=False):
        new_image_kwarg: Optional[bool]

    class ModelProcessorKwargs(ProcessingKwargs, total=False):
        images_kwargs: ModelImagesKwargs
        _defaults = {
            "images_kwargs": {
                "new_image_kwarg": False,
            },
            "text_kwargs": {
                "padding": "max_length",
            },
        }
    ```

    For Python 3.8 compatibility, when inheriting from this class and overriding one of the kwargs,
    you need to manually update the __annotations__ dictionary. This can be done as follows:

    ```python
    class CustomProcessorKwargs(ProcessingKwargs, total=False):
        images_kwargs: CustomImagesKwargs

    CustomProcessorKwargs.__annotations__["images_kwargs"] = CustomImagesKwargs  # python 3.8 compatibility
    ```
    """

    # Each group below is initialised with a copy of the `__annotations__` mapping (key name -> declared type)
    # of the corresponding TypedDict.
    common_kwargs: CommonKwargs = {
        **CommonKwargs.__annotations__,
    }
    text_kwargs: TextKwargs = {
        **TextKwargs.__annotations__,
    }
    images_kwargs: ImagesKwargs = {
        **ImagesKwargs.__annotations__,
    }
    videos_kwargs: VideosKwargs = {
        **VideosKwargs.__annotations__,
    }
    audio_kwargs: AudioKwargs = {
        **AudioKwargs.__annotations__,
    }
class ProcessorMixin(PushToHubMixin):
    """
    This is a mixin used to provide saving/loading functionality for all processor classes.
    """

    # Names of the sub-components a processor wraps; `__init__` requires exactly one value for each of them.
    attributes = ["feature_extractor", "tokenizer"]
    # Attributes that `__init__` pops from its keyword arguments (defaulting to `None`).
    optional_attributes = ["chat_template"]
    optional_call_args: List[str] = []
    # Names need to be attr_class for attr in attributes: `__init__` reads `{attr}_class` to get the name (or
    # tuple of names) of the class each attribute must be an instance of.
    feature_extractor_class = None
    tokenizer_class = None
    # When not `None`, `save_pretrained` also saves the custom code via `custom_object_save`.
    _auto_class = None
    valid_kwargs: List[str] = []

    # args have to match the attributes class attribute
  310. def __init__(self, *args, **kwargs):
  311. # First, extract optional attributes from kwargs if present
  312. # Optional attributes can never be positional arguments
  313. for optional_attribute in self.optional_attributes:
  314. setattr(self, optional_attribute, kwargs.pop(optional_attribute, None))
  315. # Sanitize args and kwargs
  316. for key in kwargs:
  317. if key not in self.attributes:
  318. raise TypeError(f"Unexpected keyword argument {key}.")
  319. for arg, attribute_name in zip(args, self.attributes):
  320. if attribute_name in kwargs:
  321. raise TypeError(f"Got multiple values for argument {attribute_name}.")
  322. else:
  323. kwargs[attribute_name] = arg
  324. if len(kwargs) != len(self.attributes):
  325. raise ValueError(
  326. f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got "
  327. f"{len(args)} arguments instead."
  328. )
  329. # Check each arg is of the proper class (this will also catch a user initializing in the wrong order)
  330. for attribute_name, arg in kwargs.items():
  331. class_name = getattr(self, f"{attribute_name}_class")
  332. # Nothing is ever going to be an instance of "AutoXxx", in that case we check the base class.
  333. class_name = AUTO_TO_BASE_CLASS_MAPPING.get(class_name, class_name)
  334. if isinstance(class_name, tuple):
  335. proper_class = tuple(getattr(transformers_module, n) for n in class_name if n is not None)
  336. else:
  337. proper_class = getattr(transformers_module, class_name)
  338. if not isinstance(arg, proper_class):
  339. raise TypeError(
  340. f"Received a {type(arg).__name__} for argument {attribute_name}, but a {class_name} was expected."
  341. )
  342. setattr(self, attribute_name, arg)
  343. def to_dict(self) -> Dict[str, Any]:
  344. """
  345. Serializes this instance to a Python dictionary.
  346. Returns:
  347. `Dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
  348. """
  349. output = copy.deepcopy(self.__dict__)
  350. # Get the kwargs in `__init__`.
  351. sig = inspect.signature(self.__init__)
  352. # Only save the attributes that are presented in the kwargs of `__init__`.
  353. attrs_to_save = sig.parameters
  354. # Don't save attributes like `tokenizer`, `image processor` etc.
  355. attrs_to_save = [x for x in attrs_to_save if x not in self.__class__.attributes]
  356. # extra attributes to be kept
  357. attrs_to_save += ["auto_map"]
  358. output = {k: v for k, v in output.items() if k in attrs_to_save}
  359. output["processor_class"] = self.__class__.__name__
  360. if "tokenizer" in output:
  361. del output["tokenizer"]
  362. if "image_processor" in output:
  363. del output["image_processor"]
  364. if "feature_extractor" in output:
  365. del output["feature_extractor"]
  366. if "chat_template" in output:
  367. del output["chat_template"]
  368. # Some attributes have different names but containing objects that are not simple strings
  369. output = {
  370. k: v
  371. for k, v in output.items()
  372. if not (isinstance(v, PushToHubMixin) or v.__class__.__name__ == "BeamSearchDecoderCTC")
  373. }
  374. return output
  375. def to_json_string(self) -> str:
  376. """
  377. Serializes this instance to a JSON string.
  378. Returns:
  379. `str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
  380. """
  381. dictionary = self.to_dict()
  382. return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
  383. def to_json_file(self, json_file_path: Union[str, os.PathLike]):
  384. """
  385. Save this instance to a JSON file.
  386. Args:
  387. json_file_path (`str` or `os.PathLike`):
  388. Path to the JSON file in which this processor instance's parameters will be saved.
  389. """
  390. with open(json_file_path, "w", encoding="utf-8") as writer:
  391. writer.write(self.to_json_string())
  392. def __repr__(self):
  393. attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
  394. attributes_repr = "\n".join(attributes_repr)
  395. return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}"
  396. def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
  397. """
  398. Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
  399. can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
  400. <Tip>
  401. This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
  402. [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`]. Please refer to the docstrings of the
  403. methods above for more information.
  404. </Tip>
  405. Args:
  406. save_directory (`str` or `os.PathLike`):
  407. Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
  408. be created if it does not exist).
  409. push_to_hub (`bool`, *optional*, defaults to `False`):
  410. Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
  411. repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
  412. namespace).
  413. kwargs (`Dict[str, Any]`, *optional*):
  414. Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
  415. """
  416. use_auth_token = kwargs.pop("use_auth_token", None)
  417. if use_auth_token is not None:
  418. warnings.warn(
  419. "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
  420. FutureWarning,
  421. )
  422. if kwargs.get("token", None) is not None:
  423. raise ValueError(
  424. "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
  425. )
  426. kwargs["token"] = use_auth_token
  427. os.makedirs(save_directory, exist_ok=True)
  428. if push_to_hub:
  429. commit_message = kwargs.pop("commit_message", None)
  430. repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
  431. repo_id = self._create_repo(repo_id, **kwargs)
  432. files_timestamps = self._get_files_timestamps(save_directory)
  433. # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
  434. # loaded from the Hub.
  435. if self._auto_class is not None:
  436. attrs = [getattr(self, attribute_name) for attribute_name in self.attributes]
  437. configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs]
  438. configs.append(self)
  439. custom_object_save(self, save_directory, config=configs)
  440. for attribute_name in self.attributes:
  441. attribute = getattr(self, attribute_name)
  442. # Include the processor class in the attribute config so this processor can then be reloaded with the
  443. # `AutoProcessor` API.
  444. if hasattr(attribute, "_set_processor_class"):
  445. attribute._set_processor_class(self.__class__.__name__)
  446. attribute.save_pretrained(save_directory)
  447. if self._auto_class is not None:
  448. # We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up.
  449. for attribute_name in self.attributes:
  450. attribute = getattr(self, attribute_name)
  451. if isinstance(attribute, PreTrainedTokenizerBase):
  452. del attribute.init_kwargs["auto_map"]
  453. # If we save using the predefined names, we can load using `from_pretrained`
  454. # plus we save chat_template in its own file
  455. output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
  456. output_chat_template_file = os.path.join(save_directory, CHAT_TEMPLATE_NAME)
  457. processor_dict = self.to_dict()
  458. # Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict`
  459. # to avoid serializing chat template in json config file. So let's get it from `self` directly
  460. if self.chat_template is not None:
  461. chat_template_json_string = (
  462. json.dumps({"chat_template": self.chat_template}, indent=2, sort_keys=True) + "\n"
  463. )
  464. with open(output_chat_template_file, "w", encoding="utf-8") as writer:
  465. writer.write(chat_template_json_string)
  466. logger.info(f"chat template saved in {output_chat_template_file}")
  467. # For now, let's not save to `processor_config.json` if the processor doesn't have extra attributes and
  468. # `auto_map` is not specified.
  469. if set(processor_dict.keys()) != {"processor_class"}:
  470. self.to_json_file(output_processor_file)
  471. logger.info(f"processor saved in {output_processor_file}")
  472. if push_to_hub:
  473. self._upload_modified_files(
  474. save_directory,
  475. repo_id,
  476. files_timestamps,
  477. commit_message=commit_message,
  478. token=kwargs.get("token"),
  479. )
  480. if set(processor_dict.keys()) == {"processor_class"}:
  481. return []
  482. return [output_processor_file]
  483. @classmethod
  484. def get_processor_dict(
  485. cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
  486. ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
  487. """
  488. From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
  489. processor of type [`~processing_utils.ProcessingMixin`] using `from_args_and_dict`.
  490. Parameters:
  491. pretrained_model_name_or_path (`str` or `os.PathLike`):
  492. The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
  493. subfolder (`str`, *optional*, defaults to `""`):
  494. In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
  495. specify the folder name here.
  496. Returns:
  497. `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the processor object.
  498. """
  499. cache_dir = kwargs.pop("cache_dir", None)
  500. force_download = kwargs.pop("force_download", False)
  501. resume_download = kwargs.pop("resume_download", None)
  502. proxies = kwargs.pop("proxies", None)
  503. token = kwargs.pop("token", None)
  504. local_files_only = kwargs.pop("local_files_only", False)
  505. revision = kwargs.pop("revision", None)
  506. subfolder = kwargs.pop("subfolder", "")
  507. from_pipeline = kwargs.pop("_from_pipeline", None)
  508. from_auto_class = kwargs.pop("_from_auto", False)
  509. user_agent = {"file_type": "processor", "from_auto_class": from_auto_class}
  510. if from_pipeline is not None:
  511. user_agent["using_pipeline"] = from_pipeline
  512. if is_offline_mode() and not local_files_only:
  513. logger.info("Offline mode: forcing local_files_only=True")
  514. local_files_only = True
  515. pretrained_model_name_or_path = str(pretrained_model_name_or_path)
  516. is_local = os.path.isdir(pretrained_model_name_or_path)
  517. if os.path.isdir(pretrained_model_name_or_path):
  518. processor_file = os.path.join(pretrained_model_name_or_path, PROCESSOR_NAME)
  519. chat_template_file = os.path.join(pretrained_model_name_or_path, "chat_template.json")
  520. if os.path.isfile(pretrained_model_name_or_path):
  521. resolved_processor_file = pretrained_model_name_or_path
  522. # cant't load chat-template when given a file as pretrained_model_name_or_path
  523. resolved_chat_template_file = None
  524. is_local = True
  525. elif is_remote_url(pretrained_model_name_or_path):
  526. processor_file = pretrained_model_name_or_path
  527. resolved_processor_file = download_url(pretrained_model_name_or_path)
  528. # can't load chat-template when given a file url as pretrained_model_name_or_path
  529. resolved_chat_template_file = None
  530. else:
  531. processor_file = PROCESSOR_NAME
  532. chat_template_file = CHAT_TEMPLATE_NAME
  533. try:
  534. # Load from local folder or from cache or download from model Hub and cache
  535. resolved_processor_file = cached_file(
  536. pretrained_model_name_or_path,
  537. processor_file,
  538. cache_dir=cache_dir,
  539. force_download=force_download,
  540. proxies=proxies,
  541. resume_download=resume_download,
  542. local_files_only=local_files_only,
  543. token=token,
  544. user_agent=user_agent,
  545. revision=revision,
  546. subfolder=subfolder,
  547. _raise_exceptions_for_missing_entries=False,
  548. )
  549. # Load chat template from a separate json if exists
  550. # because making it part of processor-config break BC.
  551. # Processors in older version do not accept any kwargs
  552. resolved_chat_template_file = cached_file(
  553. pretrained_model_name_or_path,
  554. chat_template_file,
  555. cache_dir=cache_dir,
  556. force_download=force_download,
  557. proxies=proxies,
  558. resume_download=resume_download,
  559. local_files_only=local_files_only,
  560. token=token,
  561. user_agent=user_agent,
  562. revision=revision,
  563. subfolder=subfolder,
  564. _raise_exceptions_for_missing_entries=False,
  565. )
  566. except EnvironmentError:
  567. # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
  568. # the original exception.
  569. raise
  570. except Exception:
  571. # For any other exception, we throw a generic error.
  572. raise EnvironmentError(
  573. f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load"
  574. " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
  575. f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
  576. f" directory containing a {PROCESSOR_NAME} file"
  577. )
  578. # Add chat template as kwarg before returning because most models don't have processor config
  579. chat_template = None
  580. if resolved_chat_template_file is not None:
  581. with open(resolved_chat_template_file, "r", encoding="utf-8") as reader:
  582. text = reader.read()
  583. chat_template = json.loads(text)["chat_template"]
  584. kwargs["chat_template"] = chat_template
  585. # Existing processors on the Hub created before #27761 being merged don't have `processor_config.json` (if not
  586. # updated afterward), and we need to keep `from_pretrained` work. So here it fallbacks to the empty dict.
  587. # (`cached_file` called using `_raise_exceptions_for_missing_entries=False` to avoid exception)
  588. # However, for models added in the future, we won't get the expected error if this file is missing.
  589. if resolved_processor_file is None:
  590. return {}, kwargs
  591. try:
  592. # Load processor dict
  593. with open(resolved_processor_file, "r", encoding="utf-8") as reader:
  594. text = reader.read()
  595. processor_dict = json.loads(text)
  596. except json.JSONDecodeError:
  597. raise EnvironmentError(
  598. f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file."
  599. )
  600. if is_local:
  601. logger.info(f"loading configuration file {resolved_processor_file}")
  602. else:
  603. logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}")
  604. if "chat_template" in processor_dict and processor_dict["chat_template"] is not None:
  605. logger.warning_once(
  606. "Chat templates should be in a 'chat_template.json' file but found key='chat_template' "
  607. "in the processor's config. Make sure to move your template to its own file."
  608. )
  609. if not is_local:
  610. if "auto_map" in processor_dict:
  611. processor_dict["auto_map"] = add_model_info_to_auto_map(
  612. processor_dict["auto_map"], pretrained_model_name_or_path
  613. )
  614. if "custom_pipelines" in processor_dict:
  615. processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines(
  616. processor_dict["custom_pipelines"], pretrained_model_name_or_path
  617. )
  618. return processor_dict, kwargs
  619. @classmethod
  620. def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs):
  621. """
  622. Instantiates a type of [`~processing_utils.ProcessingMixin`] from a Python dictionary of parameters.
  623. Args:
  624. processor_dict (`Dict[str, Any]`):
  625. Dictionary that will be used to instantiate the processor object. Such a dictionary can be
  626. retrieved from a pretrained checkpoint by leveraging the
  627. [`~processing_utils.ProcessingMixin.to_dict`] method.
  628. kwargs (`Dict[str, Any]`):
  629. Additional parameters from which to initialize the processor object.
  630. Returns:
  631. [`~processing_utils.ProcessingMixin`]: The processor object instantiated from those
  632. parameters.
  633. """
  634. processor_dict = processor_dict.copy()
  635. return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
  636. chat_template = kwargs.pop("chat_template", None)
  637. # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs
  638. # If we don't pop, some specific kwargs will raise a warning
  639. if "processor_class" in processor_dict:
  640. del processor_dict["processor_class"]
  641. if "auto_map" in processor_dict:
  642. del processor_dict["auto_map"]
  643. unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs)
  644. processor = cls(*args, **processor_dict)
  645. if chat_template is not None:
  646. setattr(processor, "chat_template", chat_template)
  647. # Update processor with kwargs if needed
  648. for key in set(kwargs.keys()):
  649. if hasattr(processor, key):
  650. setattr(processor, key, kwargs.pop(key))
  651. kwargs.update(unused_kwargs)
  652. logger.info(f"Processor {processor}")
  653. if return_unused_kwargs:
  654. return processor, kwargs
  655. else:
  656. return processor
  657. def _merge_kwargs(
  658. self,
  659. ModelProcessorKwargs: ProcessingKwargs,
  660. tokenizer_init_kwargs: Optional[Dict] = None,
  661. **kwargs,
  662. ) -> Dict[str, Dict]:
  663. """
  664. Method to merge dictionaries of kwargs cleanly separated by modality within a Processor instance.
  665. The order of operations is as follows:
  666. 1) kwargs passed as before have highest priority to preserve BC.
  667. ```python
  668. high_priority_kwargs = {"crop_size" = {"height": 222, "width": 222}, "padding" = "max_length"}
  669. processor(..., **high_priority_kwargs)
  670. ```
  671. 2) kwargs passed as modality-specific kwargs have second priority. This is the recommended API.
  672. ```python
  673. processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": {"height": 222, "width": 222}}})
  674. ```
  675. 3) kwargs passed during instantiation of a modality processor have fourth priority.
  676. ```python
  677. tokenizer = tokenizer_class(..., {"padding": "max_length"})
  678. image_processor = image_processor_class(...)
  679. processor(tokenizer, image_processor) # will pass max_length unless overriden by kwargs at call
  680. ```
  681. 4) defaults kwargs specified at processor level have lowest priority.
  682. ```python
  683. class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False):
  684. _defaults = {
  685. "text_kwargs": {
  686. "padding": "max_length",
  687. "max_length": 64,
  688. },
  689. }
  690. ```
  691. Args:
  692. ModelProcessorKwargs (`ProcessingKwargs`):
  693. Typed dictionary of kwargs specifically required by the model passed.
  694. tokenizer_init_kwargs (`Dict`, *optional*):
  695. Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over defaults.
  696. Returns:
  697. output_kwargs (`Dict`):
  698. Dictionary of per-modality kwargs to be passed to each modality-specific processor.
  699. """
  700. # Initialize dictionaries
  701. output_kwargs = {
  702. "text_kwargs": {},
  703. "images_kwargs": {},
  704. "audio_kwargs": {},
  705. "videos_kwargs": {},
  706. "common_kwargs": {},
  707. }
  708. default_kwargs = {
  709. "text_kwargs": {},
  710. "images_kwargs": {},
  711. "audio_kwargs": {},
  712. "videos_kwargs": {},
  713. "common_kwargs": {},
  714. }
  715. used_keys = set()
  716. # get defaults from set model processor kwargs if they exist
  717. for modality in default_kwargs:
  718. default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy()
  719. # update defaults with arguments from tokenizer init
  720. for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys():
  721. # init with tokenizer init kwargs if necessary
  722. if modality_key in tokenizer_init_kwargs:
  723. value = (
  724. getattr(self.tokenizer, modality_key)
  725. if hasattr(self.tokenizer, modality_key)
  726. else tokenizer_init_kwargs[modality_key]
  727. )
  728. default_kwargs[modality][modality_key] = value
  729. # now defaults kwargs are updated with the tokenizers defaults.
  730. # pass defaults to output dictionary
  731. output_kwargs.update(default_kwargs)
  732. # update modality kwargs with passed kwargs
  733. non_modality_kwargs = set(kwargs) - set(output_kwargs)
  734. for modality in output_kwargs:
  735. for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys():
  736. # check if we received a structured kwarg dict or not to handle it correctly
  737. if modality in kwargs:
  738. kwarg_value = kwargs[modality].pop(modality_key, "__empty__")
  739. # check if this key was passed as a flat kwarg.
  740. if kwarg_value != "__empty__" and modality_key in non_modality_kwargs:
  741. raise ValueError(
  742. f"Keyword argument {modality_key} was passed two times:\n"
  743. f"in a dictionary for {modality} and as a **kwarg."
  744. )
  745. elif modality_key in kwargs:
  746. # we get a modality_key instead of popping it because modality-specific processors
  747. # can have overlapping kwargs
  748. kwarg_value = kwargs.get(modality_key, "__empty__")
  749. else:
  750. kwarg_value = "__empty__"
  751. if kwarg_value != "__empty__":
  752. output_kwargs[modality][modality_key] = kwarg_value
  753. used_keys.add(modality_key)
  754. # Determine if kwargs is a flat dictionary or contains nested dictionaries
  755. if any(key in default_kwargs for key in kwargs):
  756. # kwargs is dictionary-based, and some keys match modality names
  757. for modality, subdict in kwargs.items():
  758. if modality in default_kwargs:
  759. for subkey, subvalue in subdict.items():
  760. if subkey not in used_keys:
  761. output_kwargs[modality][subkey] = subvalue
  762. used_keys.add(subkey)
  763. else:
  764. # kwargs is a flat dictionary
  765. for key in kwargs:
  766. if key not in used_keys:
  767. output_kwargs["common_kwargs"][key] = kwargs[key]
  768. # all modality-specific kwargs are updated with common kwargs
  769. for modality in output_kwargs:
  770. output_kwargs[modality].update(output_kwargs["common_kwargs"])
  771. return output_kwargs
  772. @classmethod
  773. def from_pretrained(
  774. cls,
  775. pretrained_model_name_or_path: Union[str, os.PathLike],
  776. cache_dir: Optional[Union[str, os.PathLike]] = None,
  777. force_download: bool = False,
  778. local_files_only: bool = False,
  779. token: Optional[Union[str, bool]] = None,
  780. revision: str = "main",
  781. **kwargs,
  782. ):
  783. r"""
  784. Instantiate a processor associated with a pretrained model.
  785. <Tip>
  786. This class method is simply calling the feature extractor
  787. [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], image processor
  788. [`~image_processing_utils.ImageProcessingMixin`] and the tokenizer
  789. [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
  790. methods above for more information.
  791. </Tip>
  792. Args:
  793. pretrained_model_name_or_path (`str` or `os.PathLike`):
  794. This can be either:
  795. - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
  796. huggingface.co.
  797. - a path to a *directory* containing a feature extractor file saved using the
  798. [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
  799. - a path or url to a saved feature extractor JSON *file*, e.g.,
  800. `./my_model_directory/preprocessor_config.json`.
  801. **kwargs
  802. Additional keyword arguments passed along to both
  803. [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
  804. [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
  805. """
  806. kwargs["cache_dir"] = cache_dir
  807. kwargs["force_download"] = force_download
  808. kwargs["local_files_only"] = local_files_only
  809. kwargs["revision"] = revision
  810. use_auth_token = kwargs.pop("use_auth_token", None)
  811. if use_auth_token is not None:
  812. warnings.warn(
  813. "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
  814. FutureWarning,
  815. )
  816. if token is not None:
  817. raise ValueError(
  818. "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
  819. )
  820. token = use_auth_token
  821. if token is not None:
  822. kwargs["token"] = token
  823. args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
  824. processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
  825. return cls.from_args_and_dict(args, processor_dict, **kwargs)
  826. @classmethod
  827. def register_for_auto_class(cls, auto_class="AutoProcessor"):
  828. """
  829. Register this class with a given auto class. This should only be used for custom feature extractors as the ones
  830. in the library are already mapped with `AutoProcessor`.
  831. <Tip warning={true}>
  832. This API is experimental and may have some slight breaking changes in the next releases.
  833. </Tip>
  834. Args:
  835. auto_class (`str` or `type`, *optional*, defaults to `"AutoProcessor"`):
  836. The auto class to register this new feature extractor with.
  837. """
  838. if not isinstance(auto_class, str):
  839. auto_class = auto_class.__name__
  840. import transformers.models.auto as auto_module
  841. if not hasattr(auto_module, auto_class):
  842. raise ValueError(f"{auto_class} is not a valid auto class.")
  843. cls._auto_class = auto_class
  844. @classmethod
  845. def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
  846. args = []
  847. for attribute_name in cls.attributes:
  848. class_name = getattr(cls, f"{attribute_name}_class")
  849. if isinstance(class_name, tuple):
  850. classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
  851. use_fast = kwargs.get("use_fast", True)
  852. if use_fast and classes[1] is not None:
  853. attribute_class = classes[1]
  854. else:
  855. attribute_class = classes[0]
  856. else:
  857. attribute_class = getattr(transformers_module, class_name)
  858. args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
  859. return args
  860. @property
  861. def model_input_names(self):
  862. first_attribute = getattr(self, self.attributes[0])
  863. return getattr(first_attribute, "model_input_names", None)
  864. @staticmethod
  865. def validate_init_kwargs(processor_config, valid_kwargs):
  866. kwargs_from_config = processor_config.keys()
  867. unused_kwargs = {}
  868. unused_keys = set(kwargs_from_config) - set(valid_kwargs)
  869. if unused_keys:
  870. unused_key_str = ", ".join(unused_keys)
  871. logger.warning(
  872. f"Some kwargs in processor config are unused and will not have any effect: {unused_key_str}. "
  873. )
  874. unused_kwargs = {k: processor_config[k] for k in unused_keys}
  875. return unused_kwargs
  876. def prepare_and_validate_optional_call_args(self, *args):
  877. """
  878. Matches optional positional arguments to their corresponding names in `optional_call_args`
  879. in the processor class in the order they are passed to the processor call.
  880. Note that this should only be used in the `__call__` method of the processors with special
  881. arguments. Special arguments are arguments that aren't `text`, `images`, `audio`, nor `videos`
  882. but also aren't passed to the tokenizer, image processor, etc. Examples of such processors are:
  883. - `CLIPSegProcessor`
  884. - `LayoutLMv2Processor`
  885. - `OwlViTProcessor`
  886. Also note that passing by position to the processor call is now deprecated and will be disallowed
  887. in future versions. We only have this for backward compatibility.
  888. Example:
  889. Suppose that the processor class has `optional_call_args = ["arg_name_1", "arg_name_2"]`.
  890. And we define the call method as:
  891. ```python
  892. def __call__(
  893. self,
  894. text: str,
  895. images: Optional[ImageInput] = None,
  896. *arg,
  897. audio=None,
  898. videos=None,
  899. )
  900. ```
  901. Then, if we call the processor as:
  902. ```python
  903. images = [...]
  904. processor("What is common in these images?", images, arg_value_1, arg_value_2)
  905. ```
  906. Then, this method will return:
  907. ```python
  908. {
  909. "arg_name_1": arg_value_1,
  910. "arg_name_2": arg_value_2,
  911. }
  912. ```
  913. which we could then pass as kwargs to `self._merge_kwargs`
  914. """
  915. if len(args):
  916. warnings.warn(
  917. "Passing positional arguments to the processor call is now deprecated and will be disallowed in v4.47. "
  918. "Please pass all arguments as keyword arguments."
  919. )
  920. if len(args) > len(self.optional_call_args):
  921. raise ValueError(
  922. f"Expected *at most* {len(self.optional_call_args)} optional positional arguments in processor call"
  923. f"which will be matched with {' '.join(self.optional_call_args)} in the order they are passed."
  924. f"However, got {len(args)} positional arguments instead."
  925. "Please pass all arguments as keyword arguments instead (e.g. `processor(arg_name_1=..., arg_name_2=...))`."
  926. )
  927. return {arg_name: arg_value for arg_value, arg_name in zip(args, self.optional_call_args)}
  928. def apply_chat_template(
  929. self,
  930. conversation: Union[List[Dict[str, str]]],
  931. chat_template: Optional[str] = None,
  932. tokenize: bool = False,
  933. **kwargs,
  934. ) -> str:
  935. """
  936. Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
  937. conversations to turn them into a single tokenizable string.
  938. Args:
  939. conversation (`List[Dict, str, str]`):
  940. The conversation to format.
  941. chat_template (`Optional[str]`, *optional*):
  942. The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
  943. chat template is used.
  944. tokenize (`bool`, *optional*, defaults to `False`):
  945. Whether to tokenize the output or not.
  946. **kwargs:
  947. Additional keyword arguments
  948. """
  949. if chat_template is None:
  950. if self.chat_template is not None:
  951. chat_template = self.chat_template
  952. else:
  953. raise ValueError(
  954. "No chat template is set for this processor. Please either set the `chat_template` attribute, "
  955. "or provide a chat template as an argument. See "
  956. "https://huggingface.co/docs/transformers/main/en/chat_templating for more information."
  957. )
  958. return self.tokenizer.apply_chat_template(
  959. conversation, chat_template=chat_template, tokenize=tokenize, **kwargs
  960. )
  961. def _validate_images_text_input_order(images, text):
  962. """
  963. For backward compatibility: reverse the order of `images` and `text` inputs if they are swapped.
  964. This method should only be called for processors where `images` and `text` have been swapped for uniformization purposes.
  965. Note that this method assumes that two `None` inputs are valid inputs. If this is not the case, it should be handled
  966. in the processor's `__call__` method before calling this method.
  967. """
  968. def is_url(val) -> bool:
  969. return isinstance(val, str) and val.startswith("http")
  970. def _is_valid_images_input_for_processor(imgs):
  971. # If we have an list of images, make sure every image is valid
  972. if isinstance(imgs, (list, tuple)):
  973. for img in imgs:
  974. if not _is_valid_images_input_for_processor(img):
  975. return False
  976. # If not a list or tuple, we have been given a single image or batched tensor of images
  977. elif not (is_valid_image(imgs) or is_url(imgs)):
  978. return False
  979. return True
  980. def _is_valid_text_input_for_processor(t):
  981. if isinstance(t, str):
  982. # Strings are fine
  983. return True
  984. elif isinstance(t, (list, tuple)):
  985. # List are fine as long as they are...
  986. if len(t) == 0:
  987. # ... not empty
  988. return False
  989. for t_s in t:
  990. return _is_valid_text_input_for_processor(t_s)
  991. return False
  992. def _is_valid(input, validator):
  993. return validator(input) or input is None
  994. images_is_valid = _is_valid(images, _is_valid_images_input_for_processor)
  995. images_is_text = _is_valid_text_input_for_processor(images)
  996. text_is_valid = _is_valid(text, _is_valid_text_input_for_processor)
  997. text_is_images = _is_valid_images_input_for_processor(text)
  998. # Handle cases where both inputs are valid
  999. if images_is_valid and text_is_valid:
  1000. return images, text
  1001. # Handle cases where inputs need to and can be swapped
  1002. if (images is None and text_is_images) or (text is None and images_is_text) or (images_is_text and text_is_images):
  1003. logger.warning_once(
  1004. "You may have used the wrong order for inputs. `images` should be passed before `text`. "
  1005. "The `images` and `text` inputs will be swapped. This behavior will be deprecated in transformers v4.47."
  1006. )
  1007. return text, images
  1008. raise ValueError("Invalid input type. Check that `images` and/or `text` are valid inputs.")
# Rebind `push_to_hub` on `ProcessorMixin` to the object returned by `copy_func` before touching its docstring.
# NOTE(review): presumably this gives the class its own copy so the docstring edit below does not affect the
# implementation it was taken from — confirm against `copy_func`.
ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub)
# Fill in the `object`, `object_class` and `object_files` format fields of the docstring with
# processor-specific wording; skipped when there is no docstring to format.
if ProcessorMixin.push_to_hub.__doc__ is not None:
    ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format(
        object="processor", object_class="AutoProcessor", object_files="processor files"
    )