modeling_blip.py

# coding=utf-8
# Copyright 2022 The Salesforce Team Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BLIP model."""

import warnings
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn.functional import normalize

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
    torch_int,
)
from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig
from .modeling_blip_text import BlipTextLMHeadModel, BlipTextModel


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base"


# Copied from transformers.models.clip.modeling_clip.contrastive_loss
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->blip
def blip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
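

# Illustrative sketch (not in the upstream file): `blip_loss` expects a square image-text
# similarity matrix in which row i and column i form the matching pair, so each direction
# reduces to a cross-entropy against the diagonal. Mirroring how `BlipModel.forward` builds
# `logits_per_text` from hypothetical unit-normalized embeddings of shape
# `(batch_size, projection_dim)`:
#
#     similarity = torch.matmul(text_embeds, image_embeds.t()) * logit_scale  # (batch, batch)
#     loss = blip_loss(similarity)  # mean of the text->image and image->text cross-entropies
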
@dataclass
class BlipForConditionalGenerationModelOutput(ModelOutput):
    """
    Adapted from the base class for vision model outputs that also contains image embeddings obtained by pooling the
    last hidden states. This class also adds the loss term from the text decoder.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss from the text decoder.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
            Prediction scores of the language modeling head of the text decoder model.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
            The image embeddings obtained after applying the Vision Transformer model to the input image.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

    @property
    def decoder_logits(self):
        warnings.warn(
            "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers."
            " Please use the `logits` attribute to retrieve the final output instead.",
            FutureWarning,
        )
        return self.logits

@dataclass
class BlipTextVisionModelOutput(ModelOutput):
    """
    Adapted from the base class for vision model outputs that also contains image embeddings obtained by pooling the
    last hidden states. This class also adds the loss term from the text decoder.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss from the text decoder.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

@dataclass
class BlipImageTextMatchingModelOutput(ModelOutput):
    """
    Adapted from the base class for vision model outputs that also contains image embeddings obtained by pooling the
    last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity
    scores.

    Args:
        itm_score (`torch.FloatTensor`):
            The image-text similarity scores.
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss from the text decoder.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
            Last layer hidden-state of the vision-only branch of the model.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        question_embeds (`torch.FloatTensor`):
            The question embeddings obtained by the text projection layer.
    """

    itm_score: Optional[torch.FloatTensor] = None
    loss: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    vision_pooler_output: Optional[torch.FloatTensor] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    question_embeds: Optional[Tuple[torch.FloatTensor]] = None

@dataclass
class BlipOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`BlipTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`BlipVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )

class BlipVisionEmbeddings(nn.Module):
    def __init__(self, config: BlipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding

        class_pos_embed = self.position_embedding[:, :1]
        patch_pos_embed = self.position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            position_embedding = self.interpolate_pos_encoding(embeddings, height, width)
        else:
            position_embedding = self.position_embedding
        embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype)
        return embeddings
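

# Shape sketch for the embeddings above (illustrative, not in the upstream file), assuming a
# hypothetical 384x384 input with patch_size=16 and hidden_size=768:
#   pixel_values: (batch, 3, 384, 384) -> patch_embedding -> (batch, 768, 24, 24)
#   flatten(2).transpose(1, 2) -> (batch, 576, 768); prepend class token -> (batch, 577, 768)
# `interpolate_pos_encoding` reshapes the 576 patch position embeddings into a 24x24 grid,
# bicubically resizes it to the new (height // patch_size, width // patch_size) grid, and
# re-concatenates the class-token position embedding, so larger images can reuse the
# pre-trained positions.
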
  238. # Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Blip
  239. class BlipTextEmbeddings(nn.Module):
  240. def __init__(self, config: BlipTextConfig):
  241. super().__init__()
  242. embed_dim = config.hidden_size
  243. self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
  244. self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
  245. # position_ids (1, len position emb) is contiguous in memory and exported when serialized
  246. self.register_buffer(
  247. "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
  248. )
  249. def forward(
  250. self,
  251. input_ids: Optional[torch.LongTensor] = None,
  252. position_ids: Optional[torch.LongTensor] = None,
  253. inputs_embeds: Optional[torch.FloatTensor] = None,
  254. ) -> torch.Tensor:
  255. seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
  256. if position_ids is None:
  257. position_ids = self.position_ids[:, :seq_length]
  258. if inputs_embeds is None:
  259. inputs_embeds = self.token_embedding(input_ids)
  260. position_embeddings = self.position_embedding(position_ids)
  261. embeddings = inputs_embeds + position_embeddings
  262. return embeddings
  263. class BlipAttention(nn.Module):
  264. """Multi-headed attention from 'Attention Is All You Need' paper"""
  265. def __init__(self, config):
  266. super().__init__()
  267. self.config = config
  268. self.embed_dim = config.hidden_size
  269. self.num_heads = config.num_attention_heads
  270. self.head_dim = self.embed_dim // self.num_heads
  271. if self.head_dim * self.num_heads != self.embed_dim:
  272. raise ValueError(
  273. f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
  274. f" {self.num_heads})."
  275. )
  276. self.scale = self.head_dim**-0.5
  277. self.dropout = nn.Dropout(config.attention_dropout)
  278. self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim)
  279. self.projection = nn.Linear(self.embed_dim, self.embed_dim)
  280. def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
  281. return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
  282. def forward(
  283. self,
  284. hidden_states: torch.Tensor,
  285. head_mask: Optional[torch.Tensor] = None,
  286. output_attentions: Optional[bool] = False,
  287. ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
  288. """Input shape: Batch x Time x Channel"""
  289. bsz, tgt_len, embed_dim = hidden_states.size()
  290. mixed_qkv = (
  291. self.qkv(hidden_states)
  292. .reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads)
  293. .permute(2, 0, 3, 1, 4)
  294. )
  295. query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
  296. # Take the dot product between "query" and "key" to get the raw attention scores.
  297. attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
  298. attention_scores = attention_scores * self.scale
  299. # Normalize the attention scores to probabilities.
  300. attention_probs = nn.functional.softmax(attention_scores, dim=-1)
  301. # This is actually dropping out entire tokens to attend to, which might
  302. # seem a bit unusual, but is taken from the original Transformer paper.
  303. attention_probs = self.dropout(attention_probs)
  304. # Mask heads if we want to
  305. if head_mask is not None:
  306. attention_probs = attention_probs * head_mask
  307. context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)
  308. new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
  309. context_layer = context_layer.reshape(new_context_layer_shape)
  310. output = self.projection(context_layer)
  311. outputs = (output, attention_probs) if output_attentions else (output, None)
  312. return outputs
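
# Shape sketch for the fused qkv projection above (illustrative, not in the upstream file):
#   qkv(hidden_states):    (bsz, tgt_len, 3 * embed_dim)
#   reshape:               (bsz, tgt_len, 3, num_heads, head_dim)
#   permute(2, 0, 3, 1, 4): (3, bsz, num_heads, tgt_len, head_dim), then split into q, k, v
# The attention probabilities are (bsz, num_heads, tgt_len, tgt_len); the context is permuted
# back to (bsz, tgt_len, num_heads, head_dim) and reshaped to (bsz, tgt_len, embed_dim).
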
  313. # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Blip
  314. class BlipMLP(nn.Module):
  315. def __init__(self, config):
  316. super().__init__()
  317. self.config = config
  318. self.activation_fn = ACT2FN[config.hidden_act]
  319. self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
  320. self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
  321. def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  322. hidden_states = self.fc1(hidden_states)
  323. hidden_states = self.activation_fn(hidden_states)
  324. hidden_states = self.fc2(hidden_states)
  325. return hidden_states
  326. class BlipEncoderLayer(nn.Module):
  327. def __init__(self, config: BlipConfig):
  328. super().__init__()
  329. self.embed_dim = config.hidden_size
  330. self.self_attn = BlipAttention(config)
  331. self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
  332. self.mlp = BlipMLP(config)
  333. self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            head_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + residual

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = hidden_states + residual

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
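

# The layer above uses a pre-LayerNorm residual structure (illustrative summary, not in the
# upstream file):
#   x = x + SelfAttention(LayerNorm1(x))
#   x = x + MLP(LayerNorm2(x))
# Normalizing before each sub-block rather than after tends to keep deep ViT-style encoders
# stable to train, which is why each residual branch adds the unnormalized input back in.
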
  366. class BlipPreTrainedModel(PreTrainedModel):
  367. """
  368. An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
  369. models.
  370. """
  371. config_class = BlipConfig
  372. base_model_prefix = "blip"
  373. supports_gradient_checkpointing = True
  374. def _init_weights(self, module):
  375. """Initialize the weights"""
  376. factor = self.config.initializer_range
  377. if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
  378. module.weight.data.normal_(mean=0.0, std=factor)
  379. if hasattr(module, "bias") and module.bias is not None:
  380. module.bias.data.zero_()
  381. if isinstance(module, BlipVisionEmbeddings):
  382. if hasattr(self.config, "vision_config"):
  383. factor = self.config.vision_config.initializer_range
  384. nn.init.trunc_normal_(
  385. module.position_embedding,
  386. mean=0.0,
  387. std=factor,
  388. )
  389. nn.init.trunc_normal_(
  390. module.class_embedding,
  391. mean=0.0,
  392. std=factor,
  393. )
  394. elif isinstance(module, nn.LayerNorm):
  395. module.bias.data.zero_()
  396. module.weight.data.fill_(1.0)
  397. elif isinstance(module, nn.Linear) and module.bias is not None:
  398. module.bias.data.zero_()
  399. BLIP_START_DOCSTRING = r"""
  400. This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
  401. library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
  402. etc.)
  403. This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
  404. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
  405. and behavior.
  406. Parameters:
  407. config ([`BlipConfig`]): Model configuration class with all the parameters of the model.
  408. Initializing with a config file does not load the weights associated with the model, only the
  409. configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
  410. """
  411. BLIP_TEXT_INPUTS_DOCSTRING = r"""
  412. Args:
  413. input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
  414. Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
  415. it.
  416. Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.
  417. [What are input IDs?](../glossary#input-ids)
  418. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
  419. Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
  420. - 1 for tokens that are **not masked**,
  421. - 0 for tokens that are **masked**.
  422. [What are attention masks?](../glossary#attention-mask)
  423. position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  424. Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
  425. config.max_position_embeddings - 1]`.
  426. [What are position IDs?](../glossary#position-ids)
  427. output_attentions (`bool`, *optional*):
  428. Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
  429. tensors for more detail.
  430. output_hidden_states (`bool`, *optional*):
  431. Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
  432. more detail.
  433. return_dict (`bool`, *optional*):
  434. Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
  435. """
  436. BLIP_VISION_INPUTS_DOCSTRING = r"""
  437. Args:
  438. pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
  439. Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
  440. [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
  441. output_attentions (`bool`, *optional*):
  442. Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
  443. tensors for more detail.
  444. output_hidden_states (`bool`, *optional*):
  445. Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
  446. more detail.
  447. return_dict (`bool`, *optional*):
  448. Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
  449. interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
  450. Whether to interpolate the pre-trained position encodings.
  451. """
  452. BLIP_INPUTS_DOCSTRING = r"""
  453. Args:
  454. input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
  455. Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
  456. it.
  457. Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.
  458. [What are input IDs?](../glossary#input-ids)
  459. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
  460. Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
  461. - 1 for tokens that are **not masked**,
  462. - 0 for tokens that are **masked**.
  463. [What are attention masks?](../glossary#attention-mask)
  464. position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  465. Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
  466. config.max_position_embeddings - 1]`.
  467. [What are position IDs?](../glossary#position-ids)
  468. pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
  469. Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
  470. [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
  471. return_loss (`bool`, *optional*):
  472. Whether or not to return the contrastive loss.
  473. output_attentions (`bool`, *optional*):
  474. Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
  475. tensors for more detail.
  476. output_hidden_states (`bool`, *optional*):
  477. Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
  478. more detail.
  479. return_dict (`bool`, *optional*):
  480. Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
  481. interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
  482. Whether to interpolate the pre-trained position encodings.
  483. """
  484. class BlipEncoder(nn.Module):
  485. """
  486. Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
  487. [`BlipEncoderLayer`].
  488. Args:
  489. config (`BlipConfig`):
  490. The corresponding vision configuration for the `BlipEncoder`.
  491. """
  492. def __init__(self, config: BlipConfig):
  493. super().__init__()
  494. self.config = config
  495. self.layers = nn.ModuleList([BlipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
  496. self.gradient_checkpointing = False
  497. def forward(
  498. self,
  499. inputs_embeds,
  500. attention_mask: Optional[torch.Tensor] = None,
  501. output_attentions: Optional[bool] = None,
  502. output_hidden_states: Optional[bool] = None,
  503. return_dict: Optional[bool] = None,
  504. ) -> Union[Tuple, BaseModelOutput]:
  505. r"""
  506. Args:
  507. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  508. Embedded representation of the inputs. Should be float, not int tokens.
  509. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
  510. Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
  511. - 1 for tokens that are **not masked**,
  512. - 0 for tokens that are **masked**.
  513. [What are attention masks?](../glossary#attention-mask)
  514. output_attentions (`bool`, *optional*):
  515. Whether or not to return the attentions tensors of all attention layers. See `attentions` under
  516. returned tensors for more detail.
  517. output_hidden_states (`bool`, *optional*):
  518. Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
  519. for more detail.
  520. return_dict (`bool`, *optional*):
  521. Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
  522. """
  523. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  524. output_hidden_states = (
  525. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  526. )
  527. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  528. encoder_states = () if output_hidden_states else None
  529. all_attentions = () if output_attentions else None
  530. hidden_states = inputs_embeds
  531. for idx, encoder_layer in enumerate(self.layers):
  532. if output_hidden_states:
  533. encoder_states = encoder_states + (hidden_states,)
  534. if self.gradient_checkpointing and self.training:
  535. layer_outputs = self._gradient_checkpointing_func(
  536. encoder_layer.__call__,
  537. hidden_states,
  538. attention_mask,
  539. output_attentions,
  540. )
  541. else:
  542. layer_outputs = encoder_layer(
  543. hidden_states,
  544. attention_mask,
  545. output_attentions=output_attentions,
  546. )
  547. hidden_states = layer_outputs[0]
  548. if output_attentions:
  549. all_attentions = all_attentions + (layer_outputs[1],)
  550. if output_hidden_states:
  551. encoder_states = encoder_states + (hidden_states,)
  552. if not return_dict:
  553. return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
  554. return BaseModelOutput(
  555. last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
  556. )
  557. class BlipVisionModel(BlipPreTrainedModel):
  558. main_input_name = "pixel_values"
  559. config_class = BlipVisionConfig
  560. def __init__(self, config: BlipVisionConfig):
  561. super().__init__(config)
  562. self.config = config
  563. embed_dim = config.hidden_size
  564. self.embeddings = BlipVisionEmbeddings(config)
  565. self.encoder = BlipEncoder(config)
  566. self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
  567. self.post_init()
  568. @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
  569. @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=BlipVisionConfig)
  570. def forward(
  571. self,
  572. pixel_values: Optional[torch.FloatTensor] = None,
  573. output_attentions: Optional[bool] = None,
  574. output_hidden_states: Optional[bool] = None,
  575. return_dict: Optional[bool] = None,
  576. interpolate_pos_encoding: bool = False,
  577. ) -> Union[Tuple, BaseModelOutputWithPooling]:
  578. r"""
  579. Returns:
  580. """
  581. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  582. output_hidden_states = (
  583. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  584. )
  585. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  586. if pixel_values is None:
  587. raise ValueError("You have to specify pixel_values")
  588. hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
  589. encoder_outputs = self.encoder(
  590. inputs_embeds=hidden_states,
  591. output_attentions=output_attentions,
  592. output_hidden_states=output_hidden_states,
  593. return_dict=return_dict,
  594. )
  595. last_hidden_state = encoder_outputs[0]
  596. last_hidden_state = self.post_layernorm(last_hidden_state)
  597. pooled_output = last_hidden_state[:, 0, :]
  598. pooled_output = self.post_layernorm(pooled_output)
  599. if not return_dict:
  600. return (last_hidden_state, pooled_output) + encoder_outputs[1:]
  601. return BaseModelOutputWithPooling(
  602. last_hidden_state=last_hidden_state,
  603. pooler_output=pooled_output,
  604. hidden_states=encoder_outputs.hidden_states,
  605. attentions=encoder_outputs.attentions,
  606. )
  607. def get_input_embeddings(self):
  608. return self.embeddings
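
# Minimal usage sketch for running the vision tower on its own (illustrative, not in the
# upstream file); here the tower is pulled out of a full BLIP checkpoint via its parent model:
#
#     from transformers import AutoProcessor, BlipForQuestionAnswering
#
#     processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
#     model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
#     inputs = processor(images=image, return_tensors="pt")  # `image` is a PIL.Image
#     vision_outputs = model.vision_model(pixel_values=inputs.pixel_values)
#     pooled = vision_outputs.pooler_output  # (batch_size, hidden_size) [CLS] representation
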
@add_start_docstrings(
    """
    This model is going to be deprecated in future versions. Please use `BlipForConditionalGeneration`,
    `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your use case.
    """,
    BLIP_START_DOCSTRING,
)
  615. class BlipModel(BlipPreTrainedModel):
  616. config_class = BlipConfig
  617. def __init__(self, config: BlipConfig):
  618. super().__init__(config)
  619. if not isinstance(config.text_config, BlipTextConfig):
  620. raise TypeError(
  621. "config.text_config is expected to be of type BlipTextConfig but is of type"
  622. f" {type(config.text_config)}."
  623. )
  624. if not isinstance(config.vision_config, BlipVisionConfig):
  625. raise TypeError(
  626. "config.vision_config is expected to be of type BlipVisionConfig but is of type"
  627. f" {type(config.vision_config)}."
  628. )
  629. text_config = config.text_config
  630. vision_config = config.vision_config
  631. self.projection_dim = config.projection_dim
  632. self.text_embed_dim = text_config.hidden_size
  633. self.vision_embed_dim = vision_config.hidden_size
  634. self.text_model = BlipTextModel(text_config)
  635. self.vision_model = BlipVisionModel(vision_config)
  636. self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
  637. self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
  638. self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
        logger.warning(
            "`BlipModel` is going to be deprecated in a future release. Please use `BlipForConditionalGeneration`,"
            " `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your use case."
        )
  642. # Initialize weights and apply final processing
  643. self.post_init()
  644. @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
  645. def get_text_features(
  646. self,
  647. input_ids: Optional[torch.Tensor] = None,
  648. attention_mask: Optional[torch.Tensor] = None,
  649. position_ids: Optional[torch.Tensor] = None,
  650. return_dict: Optional[bool] = None,
  651. ) -> torch.FloatTensor:
  652. r"""
  653. Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
                applying the projection layer to the pooled output of [`BlipTextModel`].
  656. Examples:
  657. ```python
  658. >>> from transformers import AutoProcessor, BlipModel
  659. >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
  660. >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
  661. >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
  662. >>> text_features = model.get_text_features(**inputs)
  663. ```"""
  664. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  665. text_outputs = self.text_model(
  666. input_ids=input_ids,
  667. attention_mask=attention_mask,
  668. position_ids=position_ids,
  669. return_dict=return_dict,
  670. )
  671. pooled_output = text_outputs[1]
  672. text_features = self.text_projection(pooled_output)
  673. return text_features
  674. @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
  675. def get_image_features(
  676. self,
  677. pixel_values: Optional[torch.FloatTensor] = None,
  678. return_dict: Optional[bool] = None,
  679. interpolate_pos_encoding: bool = False,
  680. ) -> torch.FloatTensor:
  681. r"""
  682. Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
                applying the projection layer to the pooled output of [`BlipVisionModel`].
  685. Examples:
  686. ```python
  687. >>> from PIL import Image
  688. >>> import requests
  689. >>> from transformers import AutoProcessor, BlipModel
  690. >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
  691. >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
  692. >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
  693. >>> image = Image.open(requests.get(url, stream=True).raw)
  694. >>> inputs = processor(images=image, return_tensors="pt")
  695. >>> image_features = model.get_image_features(**inputs)
  696. ```"""
  697. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  698. vision_outputs = self.vision_model(
  699. pixel_values=pixel_values,
  700. return_dict=return_dict,
  701. interpolate_pos_encoding=interpolate_pos_encoding,
  702. )
  703. pooled_output = vision_outputs[1] # pooled_output
  704. image_features = self.visual_projection(pooled_output)
  705. return image_features
  706. @add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING)
  707. def get_multimodal_features(
  708. self,
  709. input_ids: Optional[torch.LongTensor] = None,
  710. pixel_values: Optional[torch.FloatTensor] = None,
  711. attention_mask: Optional[torch.Tensor] = None,
  712. return_dict: Optional[bool] = None,
  713. interpolate_pos_encoding: bool = False,
  714. ) -> torch.FloatTensor:
  715. r"""
  716. Returns:
            multimodal_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The multimodal embeddings
                obtained by feeding the image embeddings to the text encoder through its cross-attention layers.
  719. Examples:
  720. ```python
  721. >>> from PIL import Image
  722. >>> import requests
  723. >>> from transformers import AutoProcessor, BlipModel
  724. >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
  725. >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
  726. >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
  727. >>> image = Image.open(requests.get(url, stream=True).raw)
  728. >>> texts = ["a photo of a cat", "a photo of a dog"]
  729. >>> inputs = processor(images=image, text=texts, padding=True, return_tensors="pt")
  730. >>> multimodal_features = model.get_multimodal_features(**inputs)
  731. ```"""
  732. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  733. vision_outputs = self.vision_model(
  734. pixel_values=pixel_values,
  735. output_attentions=True,
  736. output_hidden_states=True,
  737. return_dict=return_dict,
  738. interpolate_pos_encoding=interpolate_pos_encoding,
  739. )
  740. image_embeds = vision_outputs[0]
  741. image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long)
  742. text_outputs = self.text_model(
  743. input_ids=input_ids,
  744. attention_mask=attention_mask,
  745. encoder_hidden_states=image_embeds,
  746. encoder_attention_mask=image_atts,
  747. return_dict=return_dict,
  748. )
  749. pooled_output = text_outputs[1] # pooled_output
  750. multimodal_features = self.text_projection(pooled_output)
  751. return multimodal_features
  752. @add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING)
  753. @replace_return_docstrings(output_type=BlipOutput, config_class=BlipConfig)
  754. def forward(
  755. self,
  756. input_ids: Optional[torch.LongTensor] = None,
  757. pixel_values: Optional[torch.FloatTensor] = None,
  758. attention_mask: Optional[torch.Tensor] = None,
  759. position_ids: Optional[torch.LongTensor] = None,
  760. return_loss: Optional[bool] = None,
  761. output_attentions: Optional[bool] = None,
  762. output_hidden_states: Optional[bool] = None,
  763. return_dict: Optional[bool] = None,
  764. interpolate_pos_encoding: bool = False,
  765. ) -> Union[Tuple, BlipOutput]:
  766. r"""
  767. Returns:
  768. Examples:
  769. ```python
  770. >>> from PIL import Image
  771. >>> import requests
  772. >>> from transformers import AutoProcessor, BlipModel
  773. >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
  774. >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
  775. >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
  776. >>> image = Image.open(requests.get(url, stream=True).raw)
  777. >>> inputs = processor(
  778. ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
  779. ... )
  780. >>> outputs = model(**inputs)
  781. >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
  782. >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
  783. ```"""
  784. # Use BLIP model's config for some fields (if specified) instead of those of vision & text components.
  785. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  786. output_hidden_states = (
  787. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  788. )
  789. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  790. vision_outputs = self.vision_model(
  791. pixel_values=pixel_values,
  792. output_attentions=output_attentions,
  793. output_hidden_states=output_hidden_states,
  794. return_dict=return_dict,
  795. interpolate_pos_encoding=interpolate_pos_encoding,
  796. )
  797. text_outputs = self.text_model(
  798. input_ids=input_ids,
  799. attention_mask=attention_mask,
  800. position_ids=position_ids,
  801. output_attentions=output_attentions,
  802. output_hidden_states=output_hidden_states,
  803. return_dict=return_dict,
  804. )
  805. image_embeds = vision_outputs[1]
  806. image_embeds = self.visual_projection(image_embeds)
  807. text_embeds = text_outputs[1]
  808. text_embeds = self.text_projection(text_embeds)
  809. # normalized features
  810. image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
  811. text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
  812. # cosine similarity as logits
  813. logit_scale = self.logit_scale.exp()
  814. logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
  815. logits_per_image = logits_per_text.t()
  816. loss = None
  817. if return_loss:
  818. loss = blip_loss(logits_per_text)
  819. if not return_dict:
  820. output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
  821. return ((loss,) + output) if loss is not None else output
  822. return BlipOutput(
  823. loss=loss,
  824. logits_per_image=logits_per_image,
  825. logits_per_text=logits_per_text,
  826. text_embeds=text_embeds,
  827. image_embeds=image_embeds,
  828. text_model_output=text_outputs,
  829. vision_model_output=vision_outputs,
  830. )
@add_start_docstrings(
    """
    BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally
    pass `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt.
    Otherwise, the decoder starts generating the caption from the [BOS] (beginning-of-sequence) token.
    """,
    BLIP_START_DOCSTRING,
)
  840. class BlipForConditionalGeneration(BlipPreTrainedModel, GenerationMixin):
  841. config_class = BlipConfig
  842. _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]
  843. main_input_name = "pixel_values"
  844. def __init__(self, config: BlipConfig):
  845. super().__init__(config)
  846. self.vision_model = BlipVisionModel(config.vision_config)
  847. self.text_decoder = BlipTextLMHeadModel(config.text_config)
  848. self.decoder_input_ids = config.text_config.bos_token_id
  849. self.decoder_pad_token_id = config.text_config.pad_token_id
  850. # Initialize weights and apply final processing
  851. self.post_init()
  852. def get_input_embeddings(self) -> nn.Module:
  853. return self.vision_model.embeddings.patch_embedding
  854. @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
  855. @replace_return_docstrings(output_type=BlipForConditionalGenerationModelOutput, config_class=BlipVisionConfig)
  856. def forward(
  857. self,
  858. pixel_values: torch.FloatTensor,
  859. input_ids: Optional[torch.LongTensor] = None,
  860. attention_mask: Optional[torch.LongTensor] = None,
  861. output_attentions: Optional[bool] = None,
  862. output_hidden_states: Optional[bool] = None,
  863. labels: Optional[torch.LongTensor] = None,
  864. return_dict: Optional[bool] = None,
  865. interpolate_pos_encoding: bool = False,
  866. ) -> Union[Tuple, BlipForConditionalGenerationModelOutput]:
  867. r"""
  868. Returns:
  869. Examples:
  870. ```python
  871. >>> from PIL import Image
  872. >>> import requests
  873. >>> from transformers import AutoProcessor, BlipForConditionalGeneration
  874. >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
  875. >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
  876. >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
  877. >>> image = Image.open(requests.get(url, stream=True).raw)
  878. >>> text = "A picture of"
  879. >>> inputs = processor(images=image, text=text, return_tensors="pt")
  880. >>> outputs = model(**inputs)
  881. ```"""
  882. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  883. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  884. output_hidden_states = (
  885. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  886. )
  887. vision_outputs = self.vision_model(
  888. pixel_values=pixel_values,
  889. output_attentions=output_attentions,
  890. output_hidden_states=output_hidden_states,
  891. return_dict=return_dict,
  892. interpolate_pos_encoding=interpolate_pos_encoding,
  893. )
  894. image_embeds = vision_outputs[0]
  895. outputs = self.text_decoder(
  896. input_ids=input_ids,
  897. attention_mask=attention_mask,
  898. encoder_hidden_states=image_embeds,
  899. labels=labels,
  900. return_dict=return_dict,
  901. reduction="mean",
  902. )
  903. if not return_dict:
  904. outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:]
  905. return tuple(output for output in outputs if output is not None)
  906. return BlipForConditionalGenerationModelOutput(
  907. loss=outputs.loss,
  908. logits=outputs.logits,
  909. image_embeds=image_embeds,
  910. last_hidden_state=vision_outputs.last_hidden_state,
  911. hidden_states=vision_outputs.hidden_states,
  912. attentions=vision_outputs.attentions,
  913. )
  914. @torch.no_grad()
  915. def generate(
  916. self,
  917. pixel_values: torch.FloatTensor,
  918. input_ids: Optional[torch.LongTensor] = None,
  919. attention_mask: Optional[torch.LongTensor] = None,
  920. interpolate_pos_encoding: bool = False,
  921. **generate_kwargs,
  922. ) -> torch.LongTensor:
        r"""
        Overrides the `generate` method to be able to use the model as a conditional generator.

        Parameters:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_height, image_width)`):
                Input image to be processed.
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
  932. Examples:
  933. ```python
  934. >>> from PIL import Image
  935. >>> import requests
  936. >>> from transformers import AutoProcessor, BlipForConditionalGeneration
  937. >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
  938. >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
  939. >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
  940. >>> image = Image.open(requests.get(url, stream=True).raw)
  941. >>> inputs = processor(images=image, return_tensors="pt")
  942. >>> outputs = model.generate(**inputs)
  943. >>> print(processor.decode(outputs[0], skip_special_tokens=True))
  944. two cats sleeping on a couch
  945. ```
  946. """
  947. batch_size = pixel_values.shape[0]
  948. vision_outputs = self.vision_model(
  949. pixel_values=pixel_values,
  950. interpolate_pos_encoding=interpolate_pos_encoding,
  951. )
  952. image_embeds = vision_outputs[0]
  953. image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image_embeds.device)
  954. if isinstance(input_ids, list):
  955. input_ids = torch.LongTensor(input_ids)
  956. elif input_ids is None:
  957. input_ids = (
  958. torch.LongTensor([[self.decoder_input_ids, self.config.text_config.eos_token_id]])
  959. .repeat(batch_size, 1)
  960. .to(image_embeds.device)
  961. )
  962. input_ids[:, 0] = self.config.text_config.bos_token_id
  963. attention_mask = attention_mask[:, :-1] if attention_mask is not None else None
  964. outputs = self.text_decoder.generate(
  965. input_ids=input_ids[:, :-1],
  966. eos_token_id=self.config.text_config.sep_token_id,
  967. pad_token_id=self.config.text_config.pad_token_id,
  968. attention_mask=attention_mask,
  969. encoder_hidden_states=image_embeds,
  970. encoder_attention_mask=image_attention_mask,
  971. **generate_kwargs,
  972. )
  973. return outputs
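
# Note on the prompt handling above (descriptive comment, not in the upstream file): when no
# `input_ids` are supplied, a `[BOS, EOS]` pair is built per image and the trailing token is
# sliced off (`input_ids[:, :-1]`) before calling the text decoder, so generation always starts
# from the [BOS] token; a user-provided prompt is likewise passed without its final token, and
# `attention_mask` is trimmed to match.
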


@add_start_docstrings(
    """
    BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
    decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
    with the encoding of the image, and the text decoder will output the answer to the question.
    """,
    BLIP_START_DOCSTRING,
)
class BlipForQuestionAnswering(BlipPreTrainedModel):
    config_class = BlipConfig
    _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]

    def __init__(self, config: BlipConfig):
        super().__init__(config)

        self.vision_model = BlipVisionModel(config.vision_config)

        self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False)

        self.text_decoder = BlipTextLMHeadModel(config.text_config)

        self.decoder_pad_token_id = config.text_config.pad_token_id
        self.decoder_start_token_id = config.text_config.bos_token_id

        # Initialize weights and apply final processing
        self.post_init()
    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig)
    def forward(
        self,
        input_ids: torch.LongTensor,
        pixel_values: torch.FloatTensor,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> Union[Tuple, BlipTextVisionModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForQuestionAnswering

        >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # training
        >>> text = "How many cats are in the picture?"
        >>> label = "2"
        >>> inputs = processor(images=image, text=text, return_tensors="pt")
        >>> labels = processor(text=label, return_tensors="pt").input_ids

        >>> inputs["labels"] = labels
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
        >>> loss.backward()

        >>> # inference
        >>> text = "How many cats are in the picture?"
        >>> inputs = processor(images=image, text=text, return_tensors="pt")
        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        2
        ```"""
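        # Flow: encode the image, fuse the question with the image features through the text encoder's
        # cross-attention, then score (or train) the answer with the text decoder conditioned on the
        # fused question embeddings.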
        if labels is None and decoder_input_ids is None:
            raise ValueError(
                "Either `decoder_input_ids` or `labels` should be passed when calling `forward` with"
                " `BlipForQuestionAnswering`. If you are training the model, make sure that `labels` is passed; if you"
                " are using the model for inference, make sure that `decoder_input_ids` is passed or call `generate`."
            )

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        image_embeds = vision_outputs[0]
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long)

        question_embeds = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=return_dict,
        )

        if labels is not None and decoder_input_ids is None:
            # labels are already shifted right, see: https://github.com/huggingface/transformers/pull/23153
            decoder_input_ids = labels

        question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state

        answer_output = self.text_decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=question_embeds,
            encoder_attention_mask=attention_mask,
            labels=labels,
            return_dict=return_dict,
            reduction="mean",
        )

        if labels is not None:
            decoder_loss = answer_output.loss.mean() if return_dict else answer_output[0].mean()
        else:
            decoder_loss = None

        if not return_dict:
            outputs = (decoder_loss, image_embeds, vision_outputs[0]) + vision_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        return BlipTextVisionModelOutput(
            loss=decoder_loss,
            image_embeds=image_embeds,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
        )
    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.LongTensor,
        pixel_values: torch.FloatTensor,
        attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: bool = False,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
        Overrides the *generate* function to be able to use the model as a conditional generator.

        Parameters:
            input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*):
                The sequence used as a prompt for the generation.
            pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*):
                Input image to be processed.
            attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for
                tokens that are NOT MASKED, `0` for MASKED tokens.
            **generate_kwargs:
                Additional arguments passed to the *generate* function of the decoder.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForQuestionAnswering

        >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "How many cats are in the picture?"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        2
        ```
        """
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        image_embeds = vision_outputs[0]

        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image_embeds.device)

        if isinstance(input_ids, list):
            input_ids = torch.LongTensor(input_ids)

        question_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=False,
        )

        question_embeds = question_outputs[0]

        question_attention_mask = torch.ones(question_embeds.size()[:-1], dtype=torch.long).to(question_embeds.device)

        bos_ids = torch.full(
            (question_embeds.size(0), 1), fill_value=self.decoder_start_token_id, device=question_embeds.device
        )

        outputs = self.text_decoder.generate(
            input_ids=bos_ids,
            eos_token_id=self.config.text_config.sep_token_id,
            pad_token_id=self.config.text_config.pad_token_id,
            encoder_hidden_states=question_embeds,
            encoder_attention_mask=question_attention_mask,
            **generate_kwargs,
        )

        return outputs


@add_start_docstrings(
    """
    BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of
    image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to
    the image.
    """,
    BLIP_START_DOCSTRING,
)
class BlipForImageTextRetrieval(BlipPreTrainedModel):
    config_class = BlipConfig

    def __init__(self, config: BlipConfig):
        super().__init__(config)

        self.vision_model = BlipVisionModel(config.vision_config)

        self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False)

        # vision projection layer
        self.vision_proj = nn.Linear(config.vision_config.hidden_size, config.image_text_hidden_size)

        # text projection layer
        self.text_proj = nn.Linear(config.text_config.hidden_size, config.image_text_hidden_size)

        # image text matching head
        self.itm_head = nn.Linear(config.text_config.hidden_size, 2)

        self.decoder_pad_token_id = (
            config.text_config.pad_token_id
            if not hasattr(config, "decoder_pad_token_id")
            else config.decoder_pad_token_id
        )
        self.decoder_start_token_id = (
            config.text_config.bos_token_id
            if not hasattr(config, "decoder_start_token_id")
            else config.decoder_start_token_id
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding
    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig)
    def forward(
        self,
        input_ids: torch.LongTensor,
        pixel_values: torch.FloatTensor,
        use_itm_head: Optional[bool] = True,
        attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> Union[Tuple, BlipTextVisionModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForImageTextRetrieval

        >>> model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "an image of a cat"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        image_embeds = vision_outputs[0]
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long)

        if use_itm_head:
            question_embeds = self.text_encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=return_dict,
            )
            question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state

            output = self.itm_head(question_embeds[:, 0, :])
        else:
            question_embeds = self.text_encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                return_dict=return_dict,
            )
            question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state
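            # Project the [CLS] embeddings into the shared image-text space, L2-normalize them, and take
            # the dot product, i.e. the cosine similarity between every image and every text in the batch.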
            image_feat = normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)
            text_feat = normalize(self.text_proj(question_embeds[:, 0, :]), dim=-1)

            output = image_feat @ text_feat.t()

        if not return_dict:
            outputs = (output, vision_outputs[0]) + vision_outputs[2:] + (question_embeds,)
            return tuple(output for output in outputs if output is not None)

        return BlipImageTextMatchingModelOutput(
            itm_score=output,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
            question_embeds=question_embeds,
        )