| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214 |
- from __future__ import annotations
- import math
- import os
- from pathlib import Path
- import numpy as np
- import torch
- from safetensors.torch import load_file as load_safetensors_file
- from safetensors.torch import save_file as save_safetensors_file
- from tokenizers import Tokenizer
- from torch import nn
- from transformers import PreTrainedTokenizerFast
- from sentence_transformers.util import get_device_name
class StaticEmbedding(nn.Module):
    """Bag-of-tokens sentence embedding module backed by an ``nn.EmbeddingBag``.

    Texts are tokenized with a fast (Rust-backed) tokenizer and every text is reduced to a
    single vector by the ``nn.EmbeddingBag`` layer (default reduction mode of ``nn.EmbeddingBag``).
    """

    def __init__(
        self,
        tokenizer: Tokenizer | PreTrainedTokenizerFast,
        embedding_weights: np.ndarray | torch.Tensor | None = None,
        embedding_dim: int | None = None,
        **kwargs,
    ) -> None:
        """
        Initializes the StaticEmbedding model given a tokenizer. The model is a simple embedding bag model that
        takes the mean of trained per-token embeddings to compute text embeddings.

        Args:
            tokenizer (Tokenizer | PreTrainedTokenizerFast): The tokenizer to be used. Must be a fast tokenizer
                from ``transformers`` or ``tokenizers``.
            embedding_weights (np.ndarray | torch.Tensor | None, optional): Pre-trained embedding weights.
                Defaults to None.
            embedding_dim (int | None, optional): Dimension of the embeddings. Required if embedding_weights
                is not provided. Defaults to None.
            **kwargs: Only ``base_model`` is read; it is stored on the instance for the model card.

        Example::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.models import StaticEmbedding
            from tokenizers import Tokenizer

            # Pre-distilled embeddings:
            static_embedding = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")
            # or distill your own embeddings:
            static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cuda")
            # or start with randomized embeddings:
            tokenizer = Tokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
            static_embedding = StaticEmbedding(tokenizer, embedding_dim=512)

            model = SentenceTransformer(modules=[static_embedding])

            embeddings = model.encode([
                "What are Pandas?",
                "The giant panda (Ailuropoda melanoleuca; Chinese: 大熊猫; pinyin: dàxióngmāo), also known "
                "as the panda bear or simply the panda, is a bear native to south central China.",
            ])
            similarity = model.similarity(embeddings[0], embeddings[1])
            # tensor([[0.9177]]) (If you use the distilled bge-base)

        Raises:
            ValueError: If the tokenizer is not a fast tokenizer.
            ValueError: If neither `embedding_weights` nor `embedding_dim` is provided.
        """
        super().__init__()

        # Unwrap the transformers wrapper so that only the raw `tokenizers.Tokenizer` is kept.
        if isinstance(tokenizer, PreTrainedTokenizerFast):
            tokenizer = tokenizer._tokenizer
        elif not isinstance(tokenizer, Tokenizer):
            raise ValueError(
                "The tokenizer must be fast (i.e. Rust-backed) to use this class. "
                "Use Tokenizer.from_pretrained() from `tokenizers` to load a fast tokenizer."
            )

        if embedding_weights is not None:
            if isinstance(embedding_weights, np.ndarray):
                embedding_weights = torch.from_numpy(embedding_weights)
            # freeze=False keeps the pre-trained weights trainable.
            self.embedding = nn.EmbeddingBag.from_pretrained(embedding_weights, freeze=False)
        elif embedding_dim is not None:
            self.embedding = nn.EmbeddingBag(tokenizer.get_vocab_size(), embedding_dim)
        else:
            raise ValueError("Either `embedding_weights` or `embedding_dim` must be provided.")

        self.num_embeddings = self.embedding.num_embeddings
        self.embedding_dim = self.embedding.embedding_dim

        self.tokenizer: Tokenizer = tokenizer
        # Padding is disabled on the tokenizer: tokenize() concatenates all ids into one flat
        # sequence with per-text offsets instead of a padded batch.
        self.tokenizer.no_padding()

        # For the model card
        self.base_model = kwargs.get("base_model")

    def tokenize(self, texts: list[str], **kwargs) -> dict[str, torch.Tensor]:
        """Tokenize ``texts`` into the flat ``input_ids`` / ``offsets`` pair used by ``forward``.

        Args:
            texts (list[str]): The texts to tokenize. Special tokens are not added.
            **kwargs: Accepted and ignored.

        Returns:
            dict[str, torch.Tensor]: ``input_ids`` holds the token ids of all texts concatenated
            into one 1-D long tensor; ``offsets`` holds the start index of each text in it.
        """
        encodings = self.tokenizer.encode_batch(texts, add_special_tokens=False)
        encodings_ids = [encoding.ids for encoding in encodings]

        # Start of text i is the total length of texts 0..i-1, hence the leading 0 and the
        # dropped last length. int64 is forced so the dtype matches `input_ids` even where the
        # default NumPy integer is 32-bit.
        preceding_lengths = [len(token_ids) for token_ids in encodings_ids[:-1]]
        offsets = torch.from_numpy(np.cumsum([0] + preceding_lengths, dtype=np.int64))
        input_ids = torch.tensor(
            [token_id for token_ids in encodings_ids for token_id in token_ids], dtype=torch.long
        )
        return {"input_ids": input_ids, "offsets": offsets}

    def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
        """Compute the sentence embeddings and store them in ``features``.

        Args:
            features (dict[str, torch.Tensor]): Must contain ``input_ids`` and ``offsets`` as
                produced by :meth:`tokenize`.
            **kwargs: Accepted and ignored.

        Returns:
            dict[str, torch.Tensor]: The same ``features`` dict (mutated in place) with the
            ``sentence_embedding`` key added.
        """
        features["sentence_embedding"] = self.embedding(features["input_ids"], features["offsets"])
        return features

    def get_config_dict(self) -> dict[str, float]:
        """Return the module configuration, which is always an empty dict for this module."""
        return {}

    @property
    def max_seq_length(self) -> int:
        """Maximum sequence length; reported as ``math.inf`` (a float) because no limit is applied."""
        return math.inf

    def get_sentence_embedding_dimension(self) -> int:
        """Return the dimensionality of the produced sentence embeddings."""
        return self.embedding_dim

    def save(self, save_dir: str, safe_serialization: bool = True, **kwargs) -> None:
        """Write the weights and the tokenizer into ``save_dir``.

        Args:
            save_dir (str): Target directory; it is not created by this method.
            safe_serialization (bool): If True, write ``model.safetensors``; otherwise write
                ``pytorch_model.bin`` via ``torch.save``. Defaults to True.
            **kwargs: Accepted and ignored.
        """
        if safe_serialization:
            save_safetensors_file(self.state_dict(), os.path.join(save_dir, "model.safetensors"))
        else:
            torch.save(self.state_dict(), os.path.join(save_dir, "pytorch_model.bin"))
        self.tokenizer.save(str(Path(save_dir) / "tokenizer.json"))

    @staticmethod
    def load(load_dir: str, **kwargs) -> StaticEmbedding:
        """Load a StaticEmbedding from a directory written by :meth:`save`.

        Declared as a static method so it can be called on the class as well as on an instance.

        Args:
            load_dir (str): Directory containing ``tokenizer.json`` and either
                ``model.safetensors`` (preferred when present) or ``pytorch_model.bin``.
            **kwargs: Accepted and ignored.

        Returns:
            StaticEmbedding: A new instance built from the stored tokenizer and weights.
        """
        tokenizer = Tokenizer.from_file(str(Path(load_dir) / "tokenizer.json"))

        safetensors_path = os.path.join(load_dir, "model.safetensors")
        if os.path.exists(safetensors_path):
            weights = load_safetensors_file(safetensors_path)
        else:
            weights = torch.load(
                os.path.join(load_dir, "pytorch_model.bin"), map_location=torch.device("cpu"), weights_only=True
            )
        # The state dict key is derived from the `embedding` attribute set in __init__.
        weights = weights["embedding.weight"]
        return StaticEmbedding(tokenizer, embedding_weights=weights)

    @classmethod
    def from_distillation(
        cls,
        model_name: str,
        vocabulary: list[str] | None = None,
        device: str | None = None,
        pca_dims: int | None = 256,
        apply_zipf: bool = True,
        use_subword: bool = True,
    ) -> StaticEmbedding:
        """
        Creates a StaticEmbedding instance from a distillation process using the `model2vec` package.

        Args:
            model_name (str): The name of the model to distill.
            vocabulary (list[str] | None, optional): A list of vocabulary words to use. Defaults to None.
            device (str): The device to run the distillation on (e.g., 'cpu', 'cuda'). If not specified,
                the strongest device is automatically detected. Defaults to None.
            pca_dims (int | None, optional): The number of dimensions for PCA reduction. Defaults to 256.
            apply_zipf (bool): Whether to apply Zipf's law during distillation. Defaults to True.
            use_subword (bool): Whether to use subword tokenization. Defaults to True.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the distilled model's
                tokenizer and embedding weights.

        Raises:
            ImportError: If the `model2vec` package is not installed.
        """
        try:
            from model2vec.distill import distill
        except ImportError as exc:
            raise ImportError(
                "To use this method, please install the `model2vec` package: `pip install model2vec[distill]`"
            ) from exc

        # Only auto-detect when the caller did not choose a device; an explicit choice is honoured.
        if device is None:
            device = get_device_name()

        static_model = distill(
            model_name,
            vocabulary=vocabulary,
            device=device,
            pca_dims=pca_dims,
            apply_zipf=apply_zipf,
            use_subword=use_subword,
        )
        # The distilled model's `embedding` is handled both as a raw NumPy array and as a
        # module exposing `.weight`.
        if isinstance(static_model.embedding, np.ndarray):
            embedding_weights = torch.from_numpy(static_model.embedding)
        else:
            embedding_weights = static_model.embedding.weight
        tokenizer: Tokenizer = static_model.tokenizer

        return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_name)

    @classmethod
    def from_model2vec(cls, model_id_or_path: str) -> StaticEmbedding:
        """
        Create a StaticEmbedding instance from a model2vec model. This method loads a pre-trained model2vec model
        and extracts the embedding weights and tokenizer to create a StaticEmbedding instance.

        Args:
            model_id_or_path (str): The identifier or path to the pre-trained model2vec model.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the tokenizer and embedding weights
                of the model2vec model.

        Raises:
            ImportError: If the `model2vec` package is not installed.
        """
        try:
            from model2vec import StaticModel
        except ImportError as exc:
            raise ImportError(
                "To use this method, please install the `model2vec` package: `pip install model2vec`"
            ) from exc

        static_model = StaticModel.from_pretrained(model_id_or_path)
        # The loaded model's `embedding` is handled both as a raw NumPy array and as a
        # module exposing `.weight`.
        if isinstance(static_model.embedding, np.ndarray):
            embedding_weights = torch.from_numpy(static_model.embedding)
        else:
            embedding_weights = static_model.embedding.weight
        tokenizer: Tokenizer = static_model.tokenizer

        return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_id_or_path)
|