Source code for denselinkage.embedding.sentence_transformer_embedder
"""``SentenceTransformerEmbedder`` — heavy adapter (extra:
``[sentence-transformers]``)."""
from collections.abc import Sequence
import numpy as np
from denselinkage._optional import require
from denselinkage.core.ports import Embedder, Vectors
[docs]
class SentenceTransformerEmbedder(Embedder):
    """Semantic embedder over a ``sentence-transformers`` checkpoint (extra:
    ``[sentence-transformers]``).

    Where the lexical ``HashedNGramEmbedder`` recovers typos and abbreviations,
    this captures *meaning*: it can link semantic renames (e.g. *Google* /
    *Alphabet*) that share no characters. Encodes with
    ``normalize_embeddings=True`` so the unit-vector inner product equals cosine —
    the similarity the numpy / FAISS indexes and ``similarity_threshold`` are
    defined against. The model loads eagerly at construction, so a bad
    ``model_name`` fails fast.
    """

    # Batch size passed to the model when the caller gives ``None`` (or ``0``).
    _DEFAULT_BATCH_SIZE = 32

    def __init__(self, model_name: str) -> None:
        """Load the ``sentence-transformers`` checkpoint named *model_name*.

        Args:
            model_name: Checkpoint identifier handed unchanged to
                ``SentenceTransformer``; also reported back by ``model_id``.
        """
        # Check the optional dependency first, then import it lazily so merely
        # importing this module does not pull in the heavy package.
        # NOTE(review): ``require`` presumably raises a helpful error when the
        # extra is not installed — confirm in ``denselinkage._optional``.
        require("sentence_transformers")
        from sentence_transformers import SentenceTransformer

        self._model_name = model_name
        self._model = SentenceTransformer(model_name)

    @property
    def model_id(self) -> str:
        """The ``model_name`` this embedder was constructed with."""
        return self._model_name

    @property
    def embedding_dim(self) -> int:
        """Embedding dimension reported by the loaded model, as an ``int``."""
        # ``get_sentence_embedding_dimension`` was renamed to
        # ``get_embedding_dimension``; support both across the ``>=3.0`` range
        # (the new name avoids a deprecation warning and survives the old name's
        # eventual removal).
        get_dim = getattr(self._model, "get_embedding_dimension", None)
        if get_dim is None:
            get_dim = self._model.get_sentence_embedding_dimension
        return int(get_dim())

    def encode(
        self,
        texts: Sequence[str],
        *,
        batch_size: int | None = None,
        show_progress: bool = False,
    ) -> Vectors:
        """Embed *texts* into unit-length ``float32`` vectors.

        Args:
            texts: Strings to embed; materialised into a list before encoding.
            batch_size: Number of texts per model batch. ``None`` (or ``0``)
                falls back to ``_DEFAULT_BATCH_SIZE``.
            show_progress: Forwarded to the model as ``show_progress_bar``.

        Returns:
            A ``float32`` array with one row per input text. For empty input
            the result has shape ``(0, embedding_dim)``.
        """
        batch = list(texts)
        if not batch:
            # With no inputs the library stacks an empty list into a 1-D
            # ``(0,)`` array; return a correctly shaped 2-D matrix instead so
            # downstream index code always sees ``(n, dim)``, and skip the
            # model call entirely.
            return np.empty((0, self.embedding_dim), dtype=np.float32)

        embeddings = self._model.encode(
            batch,
            batch_size=batch_size or self._DEFAULT_BATCH_SIZE,
            show_progress_bar=show_progress,
            normalize_embeddings=True,  # unit vectors -> inner product == cosine
            convert_to_numpy=True,
        )
        return np.asarray(embeddings, dtype=np.float32)