_fastembed.py
1.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import numpy as np
from typing import List
from fastembed import TextEmbedding
from bertopic.backend import BaseEmbedder
class FastEmbedBackend(BaseEmbedder):
    """FastEmbed embedding model.

    The FastEmbed embedding model used for generating sentence embeddings.
    The model is looked up by name and loaded through FastEmbed's
    `TextEmbedding` class.

    Arguments:
        embedding_model: The name of a supported FastEmbed model, as a string

    Raises:
        ValueError: If `embedding_model` is not a string or is not listed by
                    `TextEmbedding.list_supported_models()`

    Examples:
    To create a model, you can load in a string pointing to a supported
    FastEmbed model:

    ```python
    from bertopic.backend import FastEmbedBackend

    sentence_model = FastEmbedBackend("BAAI/bge-small-en-v1.5")
    ```
    """

    def __init__(self, embedding_model: str = "BAAI/bge-small-en-v1.5"):
        super().__init__()

        # Names of every model FastEmbed reports as loadable.
        supported_models = [m["model"] for m in TextEmbedding.list_supported_models()]

        # Reject anything that is not a known model name before trying to
        # load it, and tell the caller exactly what was rejected.
        if not isinstance(embedding_model, str) or embedding_model not in supported_models:
            raise ValueError(
                "Please select a correct FastEmbed model: \n"
                "the model must be a string and must be supported. \n"
                "The supported TextEmbedding model list is here: https://qdrant.github.io/fastembed/examples/Supported_Models/ \n"
                f"Received: {embedding_model!r}"
            )

        self.embedding_model = TextEmbedding(model_name=embedding_model)

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process; forwarded to the
                     model as `show_progress_bar`

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Collect the per-document results from the model into a list first,
        # then stack them into a single array.
        vectors = list(self.embedding_model.embed(documents, show_progress_bar=verbose))
        return np.array(vectors)