diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /R2R/r2r/providers/embeddings/openai | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to 'R2R/r2r/providers/embeddings/openai')
-rwxr-xr-x | R2R/r2r/providers/embeddings/openai/openai_base.py | 200 |
1 file changed, 200 insertions, 0 deletions
import logging
import os

from openai import AsyncOpenAI, AuthenticationError, OpenAI

from r2r.base import EmbeddingConfig, EmbeddingProvider, VectorSearchResult

logger = logging.getLogger(__name__)


class OpenAIEmbeddingProvider(EmbeddingProvider):
    """Embedding provider backed by the OpenAI embeddings API.

    Supports only the BASE pipeline stage (no separate reranking model).
    Requires the ``OPENAI_API_KEY`` environment variable; both a sync and an
    async OpenAI client are created at construction time.
    """

    # Maps each supported embedding model to its tiktoken encoding name.
    MODEL_TO_TOKENIZER = {
        "text-embedding-ada-002": "cl100k_base",
        "text-embedding-3-small": "cl100k_base",
        "text-embedding-3-large": "cl100k_base",
    }
    # Output dimensions each model supports, sorted ascending; the last
    # (largest) entry is used as the default when none is configured.
    MODEL_TO_DIMENSIONS = {
        "text-embedding-ada-002": [1536],
        "text-embedding-3-small": [512, 1536],
        "text-embedding-3-large": [256, 1024, 3072],
    }

    def __init__(self, config: EmbeddingConfig):
        """Validate *config* and build the OpenAI clients.

        Raises:
            ValueError: if the provider is not ``openai``, ``OPENAI_API_KEY``
                is unset, a rerank model is configured, the base model or
                dimension is missing, or the model/dimension combination is
                unsupported.
        """
        super().__init__(config)
        provider = config.provider
        if not provider:
            raise ValueError(
                "Must set provider in order to initialize OpenAIEmbeddingProvider."
            )
        if provider != "openai":
            raise ValueError(
                "OpenAIEmbeddingProvider must be initialized with provider `openai`."
            )
        if not os.getenv("OPENAI_API_KEY"):
            raise ValueError(
                "Must set OPENAI_API_KEY in order to initialize OpenAIEmbeddingProvider."
            )
        self.client = OpenAI()
        self.async_client = AsyncOpenAI()

        # Reranking is not supported by this provider (checked once; the
        # original duplicated this check at the end of __init__).
        if config.rerank_model:
            raise ValueError(
                "OpenAIEmbeddingProvider does not support separate reranking."
            )

        self.base_model = config.base_model
        self.base_dimension = config.base_dimension

        # Check presence first so missing config yields the clear "must set"
        # message rather than an unsupported-model error for a None model.
        if not self.base_model or not self.base_dimension:
            raise ValueError(
                "Must set base_model and base_dimension in order to initialize OpenAIEmbeddingProvider."
            )
        if self.base_model not in OpenAIEmbeddingProvider.MODEL_TO_TOKENIZER:
            raise ValueError(
                f"OpenAI embedding model {self.base_model} not supported."
            )
        if (
            self.base_dimension
            not in OpenAIEmbeddingProvider.MODEL_TO_DIMENSIONS[self.base_model]
        ):
            # BUG FIX: original interpolated non-existent ``self.dimension``,
            # which raised AttributeError instead of this ValueError.
            raise ValueError(
                f"Dimensions {self.base_dimension} for {self.base_model} are not supported"
            )

    def _dimensions(self) -> int:
        """Return the configured dimension, or the model's largest one."""
        return (
            self.base_dimension
            or OpenAIEmbeddingProvider.MODEL_TO_DIMENSIONS[self.base_model][-1]
        )

    def get_embedding(
        self,
        text: str,
        stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE,
    ) -> list[float]:
        """Synchronously embed a single *text* string.

        Raises:
            ValueError: for a non-BASE stage or an invalid API key.
        """
        if stage != EmbeddingProvider.PipeStage.BASE:
            raise ValueError(
                "OpenAIEmbeddingProvider only supports search stage."
            )
        try:
            response = self.client.embeddings.create(
                input=[text],
                model=self.base_model,
                dimensions=self._dimensions(),
            )
            return response.data[0].embedding
        except AuthenticationError as e:
            raise ValueError(
                "Invalid OpenAI API key provided. Please check your OPENAI_API_KEY environment variable."
            ) from e

    async def async_get_embedding(
        self,
        text: str,
        stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE,
    ) -> list[float]:
        """Asynchronously embed a single *text* string.

        Raises:
            ValueError: for a non-BASE stage or an invalid API key.
        """
        if stage != EmbeddingProvider.PipeStage.BASE:
            raise ValueError(
                "OpenAIEmbeddingProvider only supports search stage."
            )
        try:
            response = await self.async_client.embeddings.create(
                input=[text],
                model=self.base_model,
                dimensions=self._dimensions(),
            )
            return response.data[0].embedding
        except AuthenticationError as e:
            raise ValueError(
                "Invalid OpenAI API key provided. Please check your OPENAI_API_KEY environment variable."
            ) from e

    def get_embeddings(
        self,
        texts: list[str],
        stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE,
    ) -> list[list[float]]:
        """Synchronously embed a batch of *texts* in one API call.

        Raises:
            ValueError: for a non-BASE stage or an invalid API key.
        """
        if stage != EmbeddingProvider.PipeStage.BASE:
            raise ValueError(
                "OpenAIEmbeddingProvider only supports search stage."
            )
        try:
            response = self.client.embeddings.create(
                input=texts,
                model=self.base_model,
                dimensions=self._dimensions(),
            )
            return [ele.embedding for ele in response.data]
        except AuthenticationError as e:
            raise ValueError(
                "Invalid OpenAI API key provided. Please check your OPENAI_API_KEY environment variable."
            ) from e

    async def async_get_embeddings(
        self,
        texts: list[str],
        stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE,
    ) -> list[list[float]]:
        """Asynchronously embed a batch of *texts* in one API call.

        Raises:
            ValueError: for a non-BASE stage or an invalid API key.
        """
        if stage != EmbeddingProvider.PipeStage.BASE:
            raise ValueError(
                "OpenAIEmbeddingProvider only supports search stage."
            )
        try:
            response = await self.async_client.embeddings.create(
                input=texts,
                model=self.base_model,
                dimensions=self._dimensions(),
            )
            return [ele.embedding for ele in response.data]
        except AuthenticationError as e:
            raise ValueError(
                "Invalid OpenAI API key provided. Please check your OPENAI_API_KEY environment variable."
            ) from e

    def rerank(
        self,
        query: str,
        results: list[VectorSearchResult],
        stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.RERANK,
        limit: int = 10,
    ):
        """No-op rerank: truncate *results* to *limit* (no rerank model)."""
        return results[:limit]

    def tokenize_string(self, text: str, model: str) -> list[int]:
        """Tokenize *text* with the tiktoken encoding for *model*.

        Raises:
            ValueError: if tiktoken is not installed or *model* is unsupported.
        """
        try:
            import tiktoken
        except ImportError as e:
            # Chain the ImportError so the root cause is visible.
            raise ValueError(
                "Must download tiktoken library to run `tokenize_string`."
            ) from e
        # All supported models use the cl100k_base encoding (gpt-4,
        # gpt-3.5-turbo, text-embedding-ada-002/3-small/3-large).
        if model not in OpenAIEmbeddingProvider.MODEL_TO_TOKENIZER:
            raise ValueError(f"OpenAI embedding model {model} not supported.")
        encoding = tiktoken.get_encoding(
            OpenAIEmbeddingProvider.MODEL_TO_TOKENIZER[model]
        )
        return encoding.encode(text)