configuration_align.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383
  1. # coding=utf-8
  2. # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """ALIGN model configuration"""
  16. import os
  17. from typing import TYPE_CHECKING, List, Union
  18. if TYPE_CHECKING:
  19. pass
  20. from ...configuration_utils import PretrainedConfig
  21. from ...utils import logging
# Module-level logger shared by the configuration classes in this file (model-type mismatch warnings and
# default-initialization notices).
logger = logging.get_logger(__name__)
  23. class AlignTextConfig(PretrainedConfig):
  24. r"""
  25. This is the configuration class to store the configuration of a [`AlignTextModel`]. It is used to instantiate a
  26. ALIGN text encoder according to the specified arguments, defining the model architecture. Instantiating a
  27. configuration with the defaults will yield a similar configuration to that of the text encoder of the ALIGN
  28. [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture. The default values here are
  29. copied from BERT.
  30. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  31. documentation from [`PretrainedConfig`] for more information.
  32. Args:
  33. vocab_size (`int`, *optional*, defaults to 30522):
  34. Vocabulary size of the Align Text model. Defines the number of different tokens that can be represented by
  35. the `inputs_ids` passed when calling [`AlignTextModel`].
  36. hidden_size (`int`, *optional*, defaults to 768):
  37. Dimensionality of the encoder layers and the pooler layer.
  38. num_hidden_layers (`int`, *optional*, defaults to 12):
  39. Number of hidden layers in the Transformer encoder.
  40. num_attention_heads (`int`, *optional*, defaults to 12):
  41. Number of attention heads for each attention layer in the Transformer encoder.
  42. intermediate_size (`int`, *optional*, defaults to 3072):
  43. Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
  44. hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
  45. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  46. `"relu"`, `"silu"` and `"gelu_new"` are supported.
  47. hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
  48. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
  49. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
  50. The dropout ratio for the attention probabilities.
  51. max_position_embeddings (`int`, *optional*, defaults to 512):
  52. The maximum sequence length that this model might ever be used with. Typically set this to something large
  53. just in case (e.g., 512 or 1024 or 2048).
  54. type_vocab_size (`int`, *optional*, defaults to 2):
  55. The vocabulary size of the `token_type_ids` passed when calling [`AlignTextModel`].
  56. initializer_range (`float`, *optional*, defaults to 0.02):
  57. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  58. layer_norm_eps (`float`, *optional*, defaults to 1e-12):
  59. The epsilon used by the layer normalization layers.
  60. pad_token_id (`int`, *optional*, defaults to 0):
  61. Padding token id.
  62. position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
  63. Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
  64. positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
  65. [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
  66. For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
  67. with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
  68. use_cache (`bool`, *optional*, defaults to `True`):
  69. Whether or not the model should return the last key/values attentions (not used by all models). Only
  70. relevant if `config.is_decoder=True`.
  71. Example:
  72. ```python
  73. >>> from transformers import AlignTextConfig, AlignTextModel
  74. >>> # Initializing a AlignTextConfig with kakaobrain/align-base style configuration
  75. >>> configuration = AlignTextConfig()
  76. >>> # Initializing a AlignTextModel (with random weights) from the kakaobrain/align-base style configuration
  77. >>> model = AlignTextModel(configuration)
  78. >>> # Accessing the model configuration
  79. >>> configuration = model.config
  80. ```"""
  81. model_type = "align_text_model"
  82. def __init__(
  83. self,
  84. vocab_size=30522,
  85. hidden_size=768,
  86. num_hidden_layers=12,
  87. num_attention_heads=12,
  88. intermediate_size=3072,
  89. hidden_act="gelu",
  90. hidden_dropout_prob=0.1,
  91. attention_probs_dropout_prob=0.1,
  92. max_position_embeddings=512,
  93. type_vocab_size=2,
  94. initializer_range=0.02,
  95. layer_norm_eps=1e-12,
  96. pad_token_id=0,
  97. position_embedding_type="absolute",
  98. use_cache=True,
  99. **kwargs,
  100. ):
  101. super().__init__(**kwargs)
  102. self.vocab_size = vocab_size
  103. self.hidden_size = hidden_size
  104. self.num_hidden_layers = num_hidden_layers
  105. self.num_attention_heads = num_attention_heads
  106. self.hidden_act = hidden_act
  107. self.intermediate_size = intermediate_size
  108. self.hidden_dropout_prob = hidden_dropout_prob
  109. self.attention_probs_dropout_prob = attention_probs_dropout_prob
  110. self.max_position_embeddings = max_position_embeddings
  111. self.type_vocab_size = type_vocab_size
  112. self.initializer_range = initializer_range
  113. self.layer_norm_eps = layer_norm_eps
  114. self.position_embedding_type = position_embedding_type
  115. self.use_cache = use_cache
  116. self.pad_token_id = pad_token_id
  117. @classmethod
  118. def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
  119. cls._set_token_in_kwargs(kwargs)
  120. config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
  121. # get the text config dict if we are loading from AlignConfig
  122. if config_dict.get("model_type") == "align":
  123. config_dict = config_dict["text_config"]
  124. if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
  125. logger.warning(
  126. f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
  127. f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
  128. )
  129. return cls.from_dict(config_dict, **kwargs)
  130. class AlignVisionConfig(PretrainedConfig):
  131. r"""
  132. This is the configuration class to store the configuration of a [`AlignVisionModel`]. It is used to instantiate a
  133. ALIGN vision encoder according to the specified arguments, defining the model architecture. Instantiating a
  134. configuration with the defaults will yield a similar configuration to that of the vision encoder of the ALIGN
  135. [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture. The default values are copied
  136. from EfficientNet (efficientnet-b7)
  137. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  138. documentation from [`PretrainedConfig`] for more information.
  139. Args:
  140. num_channels (`int`, *optional*, defaults to 3):
  141. The number of input channels.
  142. image_size (`int`, *optional*, defaults to 600):
  143. The input image size.
  144. width_coefficient (`float`, *optional*, defaults to 2.0):
  145. Scaling coefficient for network width at each stage.
  146. depth_coefficient (`float`, *optional*, defaults to 3.1):
  147. Scaling coefficient for network depth at each stage.
  148. depth_divisor `int`, *optional*, defaults to 8):
  149. A unit of network width.
  150. kernel_sizes (`List[int]`, *optional*, defaults to `[3, 3, 5, 3, 5, 5, 3]`):
  151. List of kernel sizes to be used in each block.
  152. in_channels (`List[int]`, *optional*, defaults to `[32, 16, 24, 40, 80, 112, 192]`):
  153. List of input channel sizes to be used in each block for convolutional layers.
  154. out_channels (`List[int]`, *optional*, defaults to `[16, 24, 40, 80, 112, 192, 320]`):
  155. List of output channel sizes to be used in each block for convolutional layers.
  156. depthwise_padding (`List[int]`, *optional*, defaults to `[]`):
  157. List of block indices with square padding.
  158. strides (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
  159. List of stride sizes to be used in each block for convolutional layers.
  160. num_block_repeats (`List[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`):
  161. List of the number of times each block is to repeated.
  162. expand_ratios (`List[int]`, *optional*, defaults to `[1, 6, 6, 6, 6, 6, 6]`):
  163. List of scaling coefficient of each block.
  164. squeeze_expansion_ratio (`float`, *optional*, defaults to 0.25):
  165. Squeeze expansion ratio.
  166. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
  167. The non-linear activation function (function or string) in each block. If string, `"gelu"`, `"relu"`,
  168. `"selu", `"gelu_new"`, `"silu"` and `"mish"` are supported.
  169. hidden_dim (`int`, *optional*, defaults to 1280):
  170. The hidden dimension of the layer before the classification head.
  171. pooling_type (`str` or `function`, *optional*, defaults to `"mean"`):
  172. Type of final pooling to be applied before the dense classification head. Available options are [`"mean"`,
  173. `"max"`]
  174. initializer_range (`float`, *optional*, defaults to 0.02):
  175. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  176. batch_norm_eps (`float`, *optional*, defaults to 1e-3):
  177. The epsilon used by the batch normalization layers.
  178. batch_norm_momentum (`float`, *optional*, defaults to 0.99):
  179. The momentum used by the batch normalization layers.
  180. drop_connect_rate (`float`, *optional*, defaults to 0.2):
  181. The drop rate for skip connections.
  182. Example:
  183. ```python
  184. >>> from transformers import AlignVisionConfig, AlignVisionModel
  185. >>> # Initializing a AlignVisionConfig with kakaobrain/align-base style configuration
  186. >>> configuration = AlignVisionConfig()
  187. >>> # Initializing a AlignVisionModel (with random weights) from the kakaobrain/align-base style configuration
  188. >>> model = AlignVisionModel(configuration)
  189. >>> # Accessing the model configuration
  190. >>> configuration = model.config
  191. ```"""
  192. model_type = "align_vision_model"
  193. def __init__(
  194. self,
  195. num_channels: int = 3,
  196. image_size: int = 600,
  197. width_coefficient: float = 2.0,
  198. depth_coefficient: float = 3.1,
  199. depth_divisor: int = 8,
  200. kernel_sizes: List[int] = [3, 3, 5, 3, 5, 5, 3],
  201. in_channels: List[int] = [32, 16, 24, 40, 80, 112, 192],
  202. out_channels: List[int] = [16, 24, 40, 80, 112, 192, 320],
  203. depthwise_padding: List[int] = [],
  204. strides: List[int] = [1, 2, 2, 2, 1, 2, 1],
  205. num_block_repeats: List[int] = [1, 2, 2, 3, 3, 4, 1],
  206. expand_ratios: List[int] = [1, 6, 6, 6, 6, 6, 6],
  207. squeeze_expansion_ratio: float = 0.25,
  208. hidden_act: str = "swish",
  209. hidden_dim: int = 2560,
  210. pooling_type: str = "mean",
  211. initializer_range: float = 0.02,
  212. batch_norm_eps: float = 0.001,
  213. batch_norm_momentum: float = 0.99,
  214. drop_connect_rate: float = 0.2,
  215. **kwargs,
  216. ):
  217. super().__init__(**kwargs)
  218. self.num_channels = num_channels
  219. self.image_size = image_size
  220. self.width_coefficient = width_coefficient
  221. self.depth_coefficient = depth_coefficient
  222. self.depth_divisor = depth_divisor
  223. self.kernel_sizes = kernel_sizes
  224. self.in_channels = in_channels
  225. self.out_channels = out_channels
  226. self.depthwise_padding = depthwise_padding
  227. self.strides = strides
  228. self.num_block_repeats = num_block_repeats
  229. self.expand_ratios = expand_ratios
  230. self.squeeze_expansion_ratio = squeeze_expansion_ratio
  231. self.hidden_act = hidden_act
  232. self.hidden_dim = hidden_dim
  233. self.pooling_type = pooling_type
  234. self.initializer_range = initializer_range
  235. self.batch_norm_eps = batch_norm_eps
  236. self.batch_norm_momentum = batch_norm_momentum
  237. self.drop_connect_rate = drop_connect_rate
  238. self.num_hidden_layers = sum(num_block_repeats) * 4
  239. @classmethod
  240. def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
  241. cls._set_token_in_kwargs(kwargs)
  242. config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
  243. # get the vision config dict if we are loading from AlignConfig
  244. if config_dict.get("model_type") == "align":
  245. config_dict = config_dict["vision_config"]
  246. if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
  247. logger.warning(
  248. f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
  249. f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
  250. )
  251. return cls.from_dict(config_dict, **kwargs)
  252. class AlignConfig(PretrainedConfig):
  253. r"""
  254. [`AlignConfig`] is the configuration class to store the configuration of a [`AlignModel`]. It is used to
  255. instantiate a ALIGN model according to the specified arguments, defining the text model and vision model configs.
  256. Instantiating a configuration with the defaults will yield a similar configuration to that of the ALIGN
  257. [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture.
  258. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  259. documentation from [`PretrainedConfig`] for more information.
  260. Args:
  261. text_config (`dict`, *optional*):
  262. Dictionary of configuration options used to initialize [`AlignTextConfig`].
  263. vision_config (`dict`, *optional*):
  264. Dictionary of configuration options used to initialize [`AlignVisionConfig`].
  265. projection_dim (`int`, *optional*, defaults to 640):
  266. Dimensionality of text and vision projection layers.
  267. temperature_init_value (`float`, *optional*, defaults to 1.0):
  268. The initial value of the *temperature* parameter. Default is used as per the original ALIGN implementation.
  269. initializer_range (`float`, *optional*, defaults to 0.02):
  270. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  271. kwargs (*optional*):
  272. Dictionary of keyword arguments.
  273. Example:
  274. ```python
  275. >>> from transformers import AlignConfig, AlignModel
  276. >>> # Initializing a AlignConfig with kakaobrain/align-base style configuration
  277. >>> configuration = AlignConfig()
  278. >>> # Initializing a AlignModel (with random weights) from the kakaobrain/align-base style configuration
  279. >>> model = AlignModel(configuration)
  280. >>> # Accessing the model configuration
  281. >>> configuration = model.config
  282. >>> # We can also initialize a AlignConfig from a AlignTextConfig and a AlignVisionConfig
  283. >>> from transformers import AlignTextConfig, AlignVisionConfig
  284. >>> # Initializing ALIGN Text and Vision configurations
  285. >>> config_text = AlignTextConfig()
  286. >>> config_vision = AlignVisionConfig()
  287. >>> config = AlignConfig.from_text_vision_configs(config_text, config_vision)
  288. ```"""
  289. model_type = "align"
  290. def __init__(
  291. self,
  292. text_config=None,
  293. vision_config=None,
  294. projection_dim=640,
  295. temperature_init_value=1.0,
  296. initializer_range=0.02,
  297. **kwargs,
  298. ):
  299. super().__init__(**kwargs)
  300. if text_config is None:
  301. text_config = {}
  302. logger.info("text_config is None. Initializing the AlignTextConfig with default values.")
  303. if vision_config is None:
  304. vision_config = {}
  305. logger.info("vision_config is None. Initializing the AlignVisionConfig with default values.")
  306. self.text_config = AlignTextConfig(**text_config)
  307. self.vision_config = AlignVisionConfig(**vision_config)
  308. self.projection_dim = projection_dim
  309. self.temperature_init_value = temperature_init_value
  310. self.initializer_range = initializer_range
  311. @classmethod
  312. def from_text_vision_configs(cls, text_config: AlignTextConfig, vision_config: AlignVisionConfig, **kwargs):
  313. r"""
  314. Instantiate a [`AlignConfig`] (or a derived class) from align text model configuration and align vision model
  315. configuration.
  316. Returns:
  317. [`AlignConfig`]: An instance of a configuration object
  318. """
  319. return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
# Names exported by a star-import of this module: the three ALIGN configuration classes.
__all__ = ["AlignTextConfig", "AlignVisionConfig", "AlignConfig"]