perceiver.py

# This code was adapted from https://github.com/lucidrains/flamingo-pytorch licensed under the MIT License.
#
# MIT License
#
# Copyright (c) 2020 The Google AI Language Team Authors, The HuggingFace Inc. team and github/lonePatient
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch
"""
from typing import Optional, Tuple

import torch
import torch.nn as nn

from .configuration_idefics import IdeficsConfig


class IdeficsPerceiverResampler(nn.Module):
    def __init__(
        self, config: IdeficsConfig, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int
    ) -> None:
        """
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed set of `n_latents` latent
        queries, then returns a Tensor of shape [bsz, n_latents, embed_dim].

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`):
                Dimensionality of the embeddings being fed to the Perceiver Resampler (also the dimensionality of the
                latent embeddings *returned* by it). Could be e.g. the ViT embed_dim, the ResNet pool dim, and so on.
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        """
        super().__init__()
        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver

        # Create Latents for Perceiver
        self.latents = nn.Parameter(torch.randn(self.n_latents, self.embed_dim), requires_grad=True)
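        # These latents are learned query vectors, shared across examples and expanded to the
        # batch dimension inside `forward`.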
        self.intermediate_dim = (
            self.embed_dim * 4
            if not hasattr(config.vision_config, "embed_dim")
            else config.vision_config.embed_dim * 4
        )
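        # The 4x expansion follows the standard transformer FFN sizing convention; when the
        # vision config defines its own `embed_dim`, the MLP is sized from that instead, since
        # `IdeficsMLP` below operates in `config.vision_config.embed_dim` space.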
        # Create Transformer Blocks
        self.blocks = nn.ModuleList(
            [
                nn.ModuleList(
                    [
                        IdeficsPerceiverAttention(self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms),
                        IdeficsMLP(self.intermediate_dim, config),
                    ]
                )
                for _ in range(depth)
            ]
        )
        self.layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(self, context: torch.Tensor) -> torch.Tensor:
        """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
        # einsum.repeat(self.latents, "seq embed -> bsz seq embed", bsz=context.shape[0])
        latents = self.latents.repeat(context.shape[0], 1, 1)

        # Feed through Perceiver Attention blocks...
        for attn, ff in self.blocks:
            latents = attn(context, latents) + latents
            latents = ff(latents) + latents

        return self.layer_norm(latents)
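
# A minimal usage sketch (illustrative values, not from the original file; assumes the default
# `IdeficsConfig`, whose `vision_config.embed_dim` is 768, so `embed_dim` below matches the MLP's
# residual dimension):
# >>> config = IdeficsConfig()
# >>> resampler = IdeficsPerceiverResampler(config, embed_dim=768, depth=2, n_heads=16, head_dim=96, n_latents=64)
# >>> context = torch.randn(2, 257, 768)  # e.g. ViT patch embeddings: [bsz, seq, embed_dim]
# >>> resampler(context).shape            # -> torch.Size([2, 64, 768])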


class IdeficsPerceiverAttention(nn.Module):
    def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool) -> None:
        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
        super().__init__()
        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
        self.qk_layer_norms = qk_layer_norms

        # Normalization & Scaling
        self.context_layer_norm = nn.LayerNorm(self.embed_dim)
        self.latents_layer_norm = nn.LayerNorm(self.embed_dim)
        if self.qk_layer_norms:
            self.q_layer_norm = nn.LayerNorm(self.head_dim)
            self.k_layer_norm = nn.LayerNorm(self.head_dim)

        self.qk_scale = self.head_dim**-0.5
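        # Queries are scaled by 1/sqrt(head_dim), the standard attention temperature.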
        # Q, K, V Projections (no bias -- detail from Perceiver/Flamingo Papers).
        self.q_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
        self.output_proj = nn.Linear(self.n_heads * self.head_dim, embed_dim, bias=False)

    def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
        """
        Runs Perceiver cross-attention, with the latents appended to the context along the `seq` dimension for the
        keys/values!

        Args:
            context (`torch.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`torch.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing the latents after attending
            over the concatenated (context, latents) sequence.
        """
        context = self.context_layer_norm(context)
        latents = self.latents_layer_norm(latents)
        batch_size, seq_length, embed_dim = context.shape[:3]

        # Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn!
        # Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents`
        q = self.q_proj(latents)
        k = self.k_proj(torch.cat([context, latents], dim=-2))
        v = self.v_proj(torch.cat([context, latents], dim=-2))

        # Multiheaded attention w/ stable softmax (subtract per-row max -- `amax` -- before the softmax call)
        # =>> `attn` is a matrix of shape [n_latents x (seq + n_latents)] per head, per batch element
        # einsum.rearrange(x, "bsz seq (heads embed) -> bsz heads seq embed", heads=self.n_heads)
        q, k, v = [x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(1, 2) for x in (q, k, v)]
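        # After the rearrange: q is [bsz, n_heads, n_latents, head_dim], while k and v are
        # [bsz, n_heads, seq + n_latents, head_dim].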
        if self.qk_layer_norms:
            q = self.q_layer_norm(q)
            k = self.k_layer_norm(k)

        scores = torch.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
        stabilized_scores = scores - (scores.amax(dim=-1, keepdim=True).detach())
        attn = stabilized_scores.softmax(dim=-1)

        # Attend & project back to output...
        resampled = torch.einsum("... i j, ... j d -> ... i d", attn, v)
        # einsum.rearrange(resampled, "bsz heads seq embed -> bsz seq (heads embed)", heads=self.n_heads)
        return self.output_proj(resampled.transpose(1, 2).flatten(-2))
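
# Shape walk-through with illustrative values (bsz=2, seq=257, n_latents=64, n_heads=16,
# head_dim=96, embed_dim=768 -- hypothetical, not from the original file): queries come out as
# [2, 16, 64, 96], keys/values as [2, 16, 321, 96] (321 = 257 + 64), `attn` as [2, 16, 64, 321],
# and the flattened head outputs as [2, 64, 1536], which `output_proj` maps back to [2, 64, 768].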


class IdeficsMLP(nn.Module):
    def __init__(self, intermediate_size, config: IdeficsConfig):
        """Simple MLP block with intermediate_size and embedding size"""
        super().__init__()
        self.embed_dim = config.vision_config.embed_dim
        self.ln = nn.LayerNorm(self.embed_dim)
        self.fc = nn.Linear(self.embed_dim, intermediate_size, bias=False)
        self.act = nn.ReLU()
        self.c_proj = nn.Linear(intermediate_size, self.embed_dim, bias=False)

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        hidden_states = self.ln(hidden_states)
        hidden_states = self.fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        return hidden_states
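

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original file). Values are hypothetical and
    # assume the default `IdeficsConfig`, whose `vision_config.embed_dim` is 768, so `embed_dim`
    # matches the MLP's residual dimension. Because of the relative import above, run it as
    # `python -m transformers.models.idefics.perceiver` rather than as a standalone script.
    config = IdeficsConfig()
    resampler = IdeficsPerceiverResampler(config, embed_dim=768, depth=2, n_heads=16, head_dim=96, n_latents=64)
    context = torch.randn(2, 257, 768)  # e.g. ViT patch embeddings: [bsz, seq, embed_dim]
    out = resampler(context)
    assert out.shape == (2, 64, 768), f"unexpected output shape: {out.shape}"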