author     S. Solomon Darnell   2025-03-28 21:52:21 -0500
committer  S. Solomon Darnell   2025-03-28 21:52:21 -0500
commit     4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree       ee3dc5af3b6313e921cd920906356f5d4febc4ed /R2R/r2r/base/providers/embedding_provider.py
parent     cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download   gn-ai-4a52a71956a8d46fcb7294ac71734504bb09bcc2.tar.gz
two versions of R2R are here (HEAD, master)
Diffstat (limited to 'R2R/r2r/base/providers/embedding_provider.py')
-rwxr-xr-x  R2R/r2r/base/providers/embedding_provider.py  83
1 file changed, 83 insertions(+), 0 deletions(-)
diff --git a/R2R/r2r/base/providers/embedding_provider.py b/R2R/r2r/base/providers/embedding_provider.py
new file mode 100755
index 00000000..8f3af56f
--- /dev/null
+++ b/R2R/r2r/base/providers/embedding_provider.py
@@ -0,0 +1,83 @@
+import logging
+from abc import abstractmethod
+from enum import Enum
+from typing import Optional
+
+from ..abstractions.search import VectorSearchResult
+from .base_provider import Provider, ProviderConfig
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingConfig(ProviderConfig):
+    """A base embedding configuration class."""
+
+    provider: Optional[str] = None
+    base_model: Optional[str] = None
+    base_dimension: Optional[int] = None
+    rerank_model: Optional[str] = None
+    rerank_dimension: Optional[int] = None
+    rerank_transformer_type: Optional[str] = None
+    batch_size: int = 1
+
+    def validate(self) -> None:
+        if self.provider not in self.supported_providers:
+            raise ValueError(f"Provider '{self.provider}' is not supported.")
+
+    @property
+    def supported_providers(self) -> list[Optional[str]]:
+        return [None, "openai", "ollama", "sentence-transformers"]
+
+
+class EmbeddingProvider(Provider):
+    """An abstract class to provide a common interface for embedding providers."""
+
+    class PipeStage(Enum):
+        BASE = 1
+        RERANK = 2
+
+    def __init__(self, config: EmbeddingConfig):
+        if not isinstance(config, EmbeddingConfig):
+            raise ValueError(
+                "EmbeddingProvider must be initialized with an `EmbeddingConfig`."
+            )
+        logger.info(f"Initializing EmbeddingProvider with config {config}.")
+
+        super().__init__(config)
+
+    @abstractmethod
+    def get_embedding(self, text: str, stage: PipeStage = PipeStage.BASE):
+        pass
+
+    async def async_get_embedding(
+        self, text: str, stage: PipeStage = PipeStage.BASE
+    ):
+        return self.get_embedding(text, stage)
+
+    @abstractmethod
+    def get_embeddings(
+        self, texts: list[str], stage: PipeStage = PipeStage.BASE
+    ):
+        pass
+
+    async def async_get_embeddings(
+        self, texts: list[str], stage: PipeStage = PipeStage.BASE
+    ):
+        return self.get_embeddings(texts, stage)
+
+    @abstractmethod
+    def rerank(
+        self,
+        query: str,
+        results: list[VectorSearchResult],
+        stage: PipeStage = PipeStage.RERANK,
+        limit: int = 10,
+    ):
+        pass
+
+    @abstractmethod
+    def tokenize_string(
+        self, text: str, model: str, stage: PipeStage
+    ) -> list[int]:
+        """Tokenizes the input string."""
+        pass
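
For context, a minimal sketch of how a concrete subclass might satisfy this interface follows. The `DummyEmbeddingProvider` class, its fixed zero-vector embeddings, and the keyword-argument construction of `EmbeddingConfig` are illustrative assumptions, not code from this commit; the sketch also assumes `ProviderConfig` accepts keyword arguments and that `Provider.__init__` exposes the config as `self.config`.

# Hypothetical sketch: a trivial concrete provider showing which abstract
# methods a subclass of EmbeddingProvider must override.
class DummyEmbeddingProvider(EmbeddingProvider):
    def get_embedding(self, text, stage=EmbeddingProvider.PipeStage.BASE):
        # Constant zero vector of the configured base dimension
        # (assumes Provider.__init__ stores the config as self.config).
        return [0.0] * (self.config.base_dimension or 0)

    def get_embeddings(self, texts, stage=EmbeddingProvider.PipeStage.BASE):
        return [self.get_embedding(t, stage) for t in texts]

    def rerank(self, query, results, stage=EmbeddingProvider.PipeStage.RERANK, limit=10):
        # No real reranking: return the first `limit` results unchanged.
        return results[:limit]

    def tokenize_string(self, text, model, stage):
        # Stand-in tokenization: one pseudo token id per character.
        return [ord(ch) for ch in text]


# Assumed usage: EmbeddingConfig built from keyword arguments.
config = EmbeddingConfig(provider="openai", base_model="text-embedding-3-small", base_dimension=1536)
config.validate()  # passes, since "openai" is in supported_providers
provider = DummyEmbeddingProvider(config)
print(len(provider.get_embedding("hello world")))  # 1536

The async helpers (`async_get_embedding`, `async_get_embeddings`) fall back to the synchronous implementations by default, so a subclass only needs to override them when it has a genuinely asynchronous client.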