configuration_dpt.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. # coding=utf-8
  2. # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """DPT model configuration"""
  16. import copy
  17. from ...configuration_utils import PretrainedConfig
  18. from ...utils import logging
  19. from ...utils.backbone_utils import verify_backbone_config_arguments
  20. from ..auto.configuration_auto import CONFIG_MAPPING
  21. from ..bit import BitConfig
# Module-level logger, namespaced to this module (standard transformers pattern).
logger = logging.get_logger(__name__)
  23. class DPTConfig(PretrainedConfig):
  24. r"""
  25. This is the configuration class to store the configuration of a [`DPTModel`]. It is used to instantiate an DPT
  26. model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
  27. defaults will yield a similar configuration to that of the DPT
  28. [Intel/dpt-large](https://huggingface.co/Intel/dpt-large) architecture.
  29. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  30. documentation from [`PretrainedConfig`] for more information.
  31. Args:
  32. hidden_size (`int`, *optional*, defaults to 768):
  33. Dimensionality of the encoder layers and the pooler layer.
  34. num_hidden_layers (`int`, *optional*, defaults to 12):
  35. Number of hidden layers in the Transformer encoder.
  36. num_attention_heads (`int`, *optional*, defaults to 12):
  37. Number of attention heads for each attention layer in the Transformer encoder.
  38. intermediate_size (`int`, *optional*, defaults to 3072):
  39. Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
  40. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
  41. The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
  42. `"relu"`, `"selu"` and `"gelu_new"` are supported.
  43. hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
  44. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
  45. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
  46. The dropout ratio for the attention probabilities.
  47. initializer_range (`float`, *optional*, defaults to 0.02):
  48. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  49. layer_norm_eps (`float`, *optional*, defaults to 1e-12):
  50. The epsilon used by the layer normalization layers.
  51. image_size (`int`, *optional*, defaults to 384):
  52. The size (resolution) of each image.
  53. patch_size (`int`, *optional*, defaults to 16):
  54. The size (resolution) of each patch.
  55. num_channels (`int`, *optional*, defaults to 3):
  56. The number of input channels.
  57. is_hybrid (`bool`, *optional*, defaults to `False`):
  58. Whether to use a hybrid backbone. Useful in the context of loading DPT-Hybrid models.
  59. qkv_bias (`bool`, *optional*, defaults to `True`):
  60. Whether to add a bias to the queries, keys and values.
  61. backbone_out_indices (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
  62. Indices of the intermediate hidden states to use from backbone.
  63. readout_type (`str`, *optional*, defaults to `"project"`):
  64. The readout type to use when processing the readout token (CLS token) of the intermediate hidden states of
  65. the ViT backbone. Can be one of [`"ignore"`, `"add"`, `"project"`].
  66. - "ignore" simply ignores the CLS token.
  67. - "add" passes the information from the CLS token to all other tokens by adding the representations.
  68. - "project" passes information to the other tokens by concatenating the readout to all other tokens before
  69. projecting the
  70. representation to the original feature dimension D using a linear layer followed by a GELU non-linearity.
  71. reassemble_factors (`List[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
  72. The up/downsampling factors of the reassemble layers.
  73. neck_hidden_sizes (`List[str]`, *optional*, defaults to `[96, 192, 384, 768]`):
  74. The hidden sizes to project to for the feature maps of the backbone.
  75. fusion_hidden_size (`int`, *optional*, defaults to 256):
  76. The number of channels before fusion.
  77. head_in_index (`int`, *optional*, defaults to -1):
  78. The index of the features to use in the heads.
  79. use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`):
  80. Whether to use batch normalization in the pre-activate residual units of the fusion blocks.
  81. use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`):
  82. Whether to use bias in the pre-activate residual units of the fusion blocks.
  83. add_projection (`bool`, *optional*, defaults to `False`):
  84. Whether to add a projection layer before the depth estimation head.
  85. use_auxiliary_head (`bool`, *optional*, defaults to `True`):
  86. Whether to use an auxiliary head during training.
  87. auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
  88. Weight of the cross-entropy loss of the auxiliary head.
  89. semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
  90. The index that is ignored by the loss function of the semantic segmentation model.
  91. semantic_classifier_dropout (`float`, *optional*, defaults to 0.1):
  92. The dropout ratio for the semantic classification head.
  93. backbone_featmap_shape (`List[int]`, *optional*, defaults to `[1, 1024, 24, 24]`):
  94. Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone.
  95. neck_ignore_stages (`List[int]`, *optional*, defaults to `[0, 1]`):
  96. Used only for the `hybrid` embedding type. The stages of the readout layers to ignore.
  97. backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*):
  98. The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to
  99. leverage the [`AutoBackbone`] API.
  100. backbone (`str`, *optional*):
  101. Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
  102. will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
  103. is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
  104. use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
  105. Whether to use pretrained weights for the backbone.
  106. use_timm_backbone (`bool`, *optional*, defaults to `False`):
  107. Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
  108. library.
  109. backbone_kwargs (`dict`, *optional*):
  110. Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
  111. e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
  112. Example:
  113. ```python
  114. >>> from transformers import DPTModel, DPTConfig
  115. >>> # Initializing a DPT dpt-large style configuration
  116. >>> configuration = DPTConfig()
  117. >>> # Initializing a model from the dpt-large style configuration
  118. >>> model = DPTModel(configuration)
  119. >>> # Accessing the model configuration
  120. >>> configuration = model.config
  121. ```"""
  122. model_type = "dpt"
  123. def __init__(
  124. self,
  125. hidden_size=768,
  126. num_hidden_layers=12,
  127. num_attention_heads=12,
  128. intermediate_size=3072,
  129. hidden_act="gelu",
  130. hidden_dropout_prob=0.0,
  131. attention_probs_dropout_prob=0.0,
  132. initializer_range=0.02,
  133. layer_norm_eps=1e-12,
  134. image_size=384,
  135. patch_size=16,
  136. num_channels=3,
  137. is_hybrid=False,
  138. qkv_bias=True,
  139. backbone_out_indices=[2, 5, 8, 11],
  140. readout_type="project",
  141. reassemble_factors=[4, 2, 1, 0.5],
  142. neck_hidden_sizes=[96, 192, 384, 768],
  143. fusion_hidden_size=256,
  144. head_in_index=-1,
  145. use_batch_norm_in_fusion_residual=False,
  146. use_bias_in_fusion_residual=None,
  147. add_projection=False,
  148. use_auxiliary_head=True,
  149. auxiliary_loss_weight=0.4,
  150. semantic_loss_ignore_index=255,
  151. semantic_classifier_dropout=0.1,
  152. backbone_featmap_shape=[1, 1024, 24, 24],
  153. neck_ignore_stages=[0, 1],
  154. backbone_config=None,
  155. backbone=None,
  156. use_pretrained_backbone=False,
  157. use_timm_backbone=False,
  158. backbone_kwargs=None,
  159. **kwargs,
  160. ):
  161. super().__init__(**kwargs)
  162. self.hidden_size = hidden_size
  163. self.is_hybrid = is_hybrid
  164. use_autobackbone = False
  165. if self.is_hybrid:
  166. if backbone_config is None:
  167. backbone_config = {
  168. "global_padding": "same",
  169. "layer_type": "bottleneck",
  170. "depths": [3, 4, 9],
  171. "out_features": ["stage1", "stage2", "stage3"],
  172. "embedding_dynamic_padding": True,
  173. }
  174. if isinstance(backbone_config, dict):
  175. logger.info("Initializing the config with a `BiT` backbone.")
  176. backbone_config = BitConfig(**backbone_config)
  177. elif isinstance(backbone_config, PretrainedConfig):
  178. backbone_config = backbone_config
  179. else:
  180. raise ValueError(
  181. f"backbone_config must be a dictionary or a `PretrainedConfig`, got {backbone_config.__class__}."
  182. )
  183. self.backbone_config = backbone_config
  184. self.backbone_featmap_shape = backbone_featmap_shape
  185. self.neck_ignore_stages = neck_ignore_stages
  186. if readout_type != "project":
  187. raise ValueError("Readout type must be 'project' when using `DPT-hybrid` mode.")
  188. elif backbone is not None or backbone_config is not None:
  189. use_autobackbone = True
  190. if isinstance(backbone_config, dict):
  191. backbone_model_type = backbone_config.get("model_type")
  192. config_class = CONFIG_MAPPING[backbone_model_type]
  193. backbone_config = config_class.from_dict(backbone_config)
  194. self.backbone_config = backbone_config
  195. self.backbone_featmap_shape = None
  196. self.neck_ignore_stages = []
  197. # We only use load_backbone when config.is_hydrid is False
  198. verify_backbone_config_arguments(
  199. use_timm_backbone=use_timm_backbone,
  200. use_pretrained_backbone=use_pretrained_backbone,
  201. backbone=backbone,
  202. backbone_config=backbone_config,
  203. backbone_kwargs=backbone_kwargs,
  204. )
  205. else:
  206. self.backbone_config = None
  207. self.backbone_featmap_shape = None
  208. self.neck_ignore_stages = []
  209. self.backbone = backbone
  210. self.use_pretrained_backbone = use_pretrained_backbone
  211. self.use_timm_backbone = use_timm_backbone
  212. self.backbone_kwargs = backbone_kwargs
  213. # ViT parameters used if not using a hybrid backbone
  214. self.num_hidden_layers = num_hidden_layers
  215. self.num_attention_heads = num_attention_heads
  216. self.intermediate_size = intermediate_size
  217. self.hidden_dropout_prob = hidden_dropout_prob
  218. self.attention_probs_dropout_prob = attention_probs_dropout_prob
  219. self.layer_norm_eps = layer_norm_eps
  220. self.image_size = image_size
  221. self.patch_size = patch_size
  222. self.num_channels = num_channels
  223. self.qkv_bias = qkv_bias
  224. self.use_autobackbone = use_autobackbone
  225. self.backbone_out_indices = None if use_autobackbone else backbone_out_indices
  226. if readout_type not in ["ignore", "add", "project"]:
  227. raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']")
  228. self.hidden_act = hidden_act
  229. self.initializer_range = initializer_range
  230. self.readout_type = readout_type
  231. self.reassemble_factors = reassemble_factors
  232. self.neck_hidden_sizes = neck_hidden_sizes
  233. self.fusion_hidden_size = fusion_hidden_size
  234. self.head_in_index = head_in_index
  235. self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual
  236. self.use_bias_in_fusion_residual = use_bias_in_fusion_residual
  237. self.add_projection = add_projection
  238. # auxiliary head attributes (semantic segmentation)
  239. self.use_auxiliary_head = use_auxiliary_head
  240. self.auxiliary_loss_weight = auxiliary_loss_weight
  241. self.semantic_loss_ignore_index = semantic_loss_ignore_index
  242. self.semantic_classifier_dropout = semantic_classifier_dropout
  243. def to_dict(self):
  244. """
  245. Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns:
  246. `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
  247. """
  248. output = copy.deepcopy(self.__dict__)
  249. if output["backbone_config"] is not None:
  250. output["backbone_config"] = self.backbone_config.to_dict()
  251. output["model_type"] = self.__class__.model_type
  252. return output