Diffstat (limited to 'R2R/r2r')
147 files changed, 18192 insertions, 0 deletions
diff --git a/R2R/r2r/__init__.py b/R2R/r2r/__init__.py new file mode 100755 index 00000000..492cc13a --- /dev/null +++ b/R2R/r2r/__init__.py @@ -0,0 +1,110 @@ +import logging + +# Keep '*' imports for enhanced development velocity +# corresponding flake8 error codes are F403, F405 +from .base import * +from .integrations import * +from .main import * +from .parsers import * +from .pipelines import * +from .pipes import * +from .prompts import * + +logger = logging.getLogger("r2r") +logger.setLevel(logging.INFO) + +# Create a console handler and set the level to info +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) + +# Create a formatter and set it for the handler +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) +ch.setFormatter(formatter) + +# Add the handler to the logger +logger.addHandler(ch) + +# Optional: Prevent propagation to the root logger +logger.propagate = False + +__all__ = [ + "R2RException", + "LoggingConfig", + "LocalKVLoggingProvider", + "PostgresLoggingConfig", + "PostgresKVLoggingProvider", + "RedisLoggingConfig", + "RedisKVLoggingProvider", + "KVLoggingSingleton", + "VectorEntry", + "VectorType", + "Vector", + "VectorSearchRequest", + "VectorSearchResult", + "AsyncPipe", + "PipeType", + "AsyncState", + "Prompt", + "DataType", + "DocumentType", + "Document", + "Extraction", + "ExtractionType", + "Fragment", + "FragmentType", + "SearchPipe", + # Parsers + "AsyncParser", + "CSVParser", + "DOCXParser", + "HTMLParser", + "JSONParser", + "MDParser", + "PDFParser", + "PPTParser", + "TextParser", + "XLSXParser", + "AsyncPipeline", + # Providers + "EmbeddingConfig", + "EmbeddingProvider", + "EvalConfig", + "EvalProvider", + "LLMEvalProvider", + "PromptConfig", + "PromptProvider", + "GenerationConfig", + "LLMChatCompletion", + "LLMChatCompletionChunk", + "LLMConfig", + "LLMProvider", + "VectorDBConfig", + "VectorDBProvider", + "R2RConfig", + "TextSplitter", + "RecursiveCharacterTextSplitter", + "generate_run_id", + "generate_id_from_label", + "R2REngine", + # Pipes + "EmbeddingPipe", + "EvalPipe", + "ParsingPipe", + "QueryTransformPipe", + "SearchRAGPipe", + "StreamingSearchRAGPipe", + "VectorSearchPipe", + "VectorStoragePipe", + "R2RPromptProvider", + "WebSearchPipe", + "R2RBuilder", + "R2R", + "KGAgentSearchPipe", + # Prebuilts + "MultiSearchPipe", + "R2RPipeFactoryWithMultiSearch", + # Integrations + "SerperClient", +] diff --git a/R2R/r2r/base/__init__.py b/R2R/r2r/base/__init__.py new file mode 100755 index 00000000..a6794a84 --- /dev/null +++ b/R2R/r2r/base/__init__.py @@ -0,0 +1,160 @@ +from .abstractions.base import AsyncSyncMeta, UserStats, syncable +from .abstractions.document import ( + DataType, + Document, + DocumentInfo, + DocumentType, + Entity, + Extraction, + ExtractionType, + Fragment, + FragmentType, + KGExtraction, + Triple, + extract_entities, + extract_triples, +) +from .abstractions.exception import R2RDocumentProcessingError, R2RException +from .abstractions.llama_abstractions import VectorStoreQuery +from .abstractions.llm import ( + GenerationConfig, + LLMChatCompletion, + LLMChatCompletionChunk, + RAGCompletion, +) +from .abstractions.prompt import Prompt +from .abstractions.search import ( + AggregateSearchResult, + KGSearchRequest, + KGSearchResult, + KGSearchSettings, + VectorSearchRequest, + VectorSearchResult, + VectorSearchSettings, +) +from .abstractions.vector import Vector, VectorEntry, VectorType +from .logging.kv_logger import ( + KVLoggingSingleton, + LocalKVLoggingProvider, + 
LoggingConfig, + PostgresKVLoggingProvider, + PostgresLoggingConfig, + RedisKVLoggingProvider, + RedisLoggingConfig, +) +from .logging.log_processor import ( + AnalysisTypes, + FilterCriteria, + LogAnalytics, + LogAnalyticsConfig, + LogProcessor, +) +from .logging.run_manager import RunManager, manage_run +from .parsers import AsyncParser +from .pipeline.base_pipeline import AsyncPipeline +from .pipes.base_pipe import AsyncPipe, AsyncState, PipeType +from .providers.embedding_provider import EmbeddingConfig, EmbeddingProvider +from .providers.eval_provider import EvalConfig, EvalProvider +from .providers.kg_provider import KGConfig, KGProvider, update_kg_prompt +from .providers.llm_provider import LLMConfig, LLMProvider +from .providers.prompt_provider import PromptConfig, PromptProvider +from .providers.vector_db_provider import VectorDBConfig, VectorDBProvider +from .utils import ( + EntityType, + RecursiveCharacterTextSplitter, + Relation, + TextSplitter, + format_entity_types, + format_relations, + generate_id_from_label, + generate_run_id, + increment_version, + run_pipeline, + to_async_generator, +) + +__all__ = [ + # Logging + "AsyncParser", + "AnalysisTypes", + "LogAnalytics", + "LogAnalyticsConfig", + "LogProcessor", + "LoggingConfig", + "LocalKVLoggingProvider", + "PostgresLoggingConfig", + "PostgresKVLoggingProvider", + "RedisLoggingConfig", + "AsyncSyncMeta", + "syncable", + "RedisKVLoggingProvider", + "KVLoggingSingleton", + "RunManager", + "manage_run", + # Abstractions + "VectorEntry", + "VectorType", + "Vector", + "VectorSearchRequest", + "VectorSearchResult", + "VectorSearchSettings", + "KGSearchRequest", + "KGSearchResult", + "KGSearchSettings", + "AggregateSearchResult", + "AsyncPipe", + "PipeType", + "AsyncState", + "AsyncPipe", + "Prompt", + "DataType", + "DocumentType", + "Document", + "DocumentInfo", + "Extraction", + "ExtractionType", + "Fragment", + "FragmentType", + "extract_entities", + "Entity", + "extract_triples", + "R2RException", + "R2RDocumentProcessingError", + "Triple", + "KGExtraction", + "UserStats", + # Pipelines + "AsyncPipeline", + # Providers + "EmbeddingConfig", + "EmbeddingProvider", + "EvalConfig", + "EvalProvider", + "PromptConfig", + "PromptProvider", + "GenerationConfig", + "RAGCompletion", + "VectorStoreQuery", + "LLMChatCompletion", + "LLMChatCompletionChunk", + "LLMConfig", + "LLMProvider", + "VectorDBConfig", + "VectorDBProvider", + "KGProvider", + "KGConfig", + "update_kg_prompt", + # Other + "FilterCriteria", + "TextSplitter", + "RecursiveCharacterTextSplitter", + "to_async_generator", + "EntityType", + "Relation", + "format_entity_types", + "format_relations", + "increment_version", + "run_pipeline", + "generate_run_id", + "generate_id_from_label", +] diff --git a/R2R/r2r/base/abstractions/__init__.py b/R2R/r2r/base/abstractions/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/base/abstractions/__init__.py diff --git a/R2R/r2r/base/abstractions/base.py b/R2R/r2r/base/abstractions/base.py new file mode 100755 index 00000000..7121f6ce --- /dev/null +++ b/R2R/r2r/base/abstractions/base.py @@ -0,0 +1,93 @@ +import asyncio +import uuid +from typing import List + +from pydantic import BaseModel + + +class UserStats(BaseModel): + user_id: uuid.UUID + num_files: int + total_size_in_bytes: int + document_ids: List[uuid.UUID] + + +class AsyncSyncMeta(type): + _event_loop = None # Class-level shared event loop + + @classmethod + def get_event_loop(cls): + if cls._event_loop is None or 
cls._event_loop.is_closed(): + cls._event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(cls._event_loop) + return cls._event_loop + + def __new__(cls, name, bases, dct): + new_cls = super().__new__(cls, name, bases, dct) + for attr_name, attr_value in dct.items(): + if asyncio.iscoroutinefunction(attr_value) and getattr( + attr_value, "_syncable", False + ): + sync_method_name = attr_name[ + 1: + ] # Remove leading 'a' for sync method + async_method = attr_value + + def make_sync_method(async_method): + def sync_wrapper(self, *args, **kwargs): + loop = cls.get_event_loop() + if not loop.is_running(): + # Setup to run the loop in a background thread if necessary + # to prevent blocking the main thread in a synchronous call environment + from threading import Thread + + result = None + exception = None + + def run(): + nonlocal result, exception + try: + asyncio.set_event_loop(loop) + result = loop.run_until_complete( + async_method(self, *args, **kwargs) + ) + except Exception as e: + exception = e + finally: + generation_config = kwargs.get( + "rag_generation_config", None + ) + if ( + not generation_config + or not generation_config.stream + ): + loop.run_until_complete( + loop.shutdown_asyncgens() + ) + loop.close() + + thread = Thread(target=run) + thread.start() + thread.join() + if exception: + raise exception + return result + else: + # If there's already a running loop, schedule and execute the coroutine + future = asyncio.run_coroutine_threadsafe( + async_method(self, *args, **kwargs), loop + ) + return future.result() + + return sync_wrapper + + setattr( + new_cls, sync_method_name, make_sync_method(async_method) + ) + return new_cls + + +def syncable(func): + """Decorator to mark methods for synchronous wrapper creation.""" + func._syncable = True + return func diff --git a/R2R/r2r/base/abstractions/document.py b/R2R/r2r/base/abstractions/document.py new file mode 100755 index 00000000..117db7b9 --- /dev/null +++ b/R2R/r2r/base/abstractions/document.py @@ -0,0 +1,242 @@ +"""Abstractions for documents and their extractions.""" + +import base64 +import json +import logging +import uuid +from datetime import datetime +from enum import Enum +from typing import Optional, Union + +from pydantic import BaseModel, Field + +logger = logging.getLogger(__name__) + +DataType = Union[str, bytes] + + +class DocumentType(str, Enum): + """Types of documents that can be stored.""" + + CSV = "csv" + DOCX = "docx" + HTML = "html" + JSON = "json" + MD = "md" + PDF = "pdf" + PPTX = "pptx" + TXT = "txt" + XLSX = "xlsx" + GIF = "gif" + PNG = "png" + JPG = "jpg" + JPEG = "jpeg" + SVG = "svg" + MP3 = "mp3" + MP4 = "mp4" + + +class Document(BaseModel): + id: uuid.UUID = Field(default_factory=uuid.uuid4) + type: DocumentType + data: Union[str, bytes] + metadata: dict + + def __init__(self, *args, **kwargs): + data = kwargs.get("data") + if data and isinstance(data, str): + try: + # Try to decode if it's already base64 encoded + kwargs["data"] = base64.b64decode(data) + except: + # If it's not base64, encode it to bytes + kwargs["data"] = data.encode("utf-8") + + doc_type = kwargs.get("type") + if isinstance(doc_type, str): + kwargs["type"] = DocumentType(doc_type) + + # Generate UUID based on the hash of the data + if "id" not in kwargs: + if isinstance(kwargs["data"], bytes): + data_hash = uuid.uuid5( + uuid.NAMESPACE_DNS, kwargs["data"].decode("utf-8") + ) + else: + data_hash = uuid.uuid5(uuid.NAMESPACE_DNS, kwargs["data"]) + + kwargs["id"] = data_hash # Set the id based on the data hash + + 
super().__init__(*args, **kwargs) + + class Config: + arbitrary_types_allowed = True + json_encoders = { + uuid.UUID: str, + bytes: lambda v: base64.b64encode(v).decode("utf-8"), + } + + +class DocumentStatus(str, Enum): + """Status of document processing.""" + + PROCESSING = "processing" + # TODO - Extend support for `partial-failure` + # PARTIAL_FAILURE = "partial-failure" + FAILURE = "failure" + SUCCESS = "success" + + +class DocumentInfo(BaseModel): + """Base class for document information handling.""" + + document_id: uuid.UUID + version: str + size_in_bytes: int + metadata: dict + status: DocumentStatus = DocumentStatus.PROCESSING + + user_id: Optional[uuid.UUID] = None + title: Optional[str] = None + created_at: Optional[datetime] = None + updated_at: Optional[datetime] = None + + def convert_to_db_entry(self): + """Prepare the document info for database entry, extracting certain fields from metadata.""" + now = datetime.now() + metadata = self.metadata + if "user_id" in metadata: + metadata["user_id"] = str(metadata["user_id"]) + + metadata["title"] = metadata.get("title", "N/A") + return { + "document_id": str(self.document_id), + "title": metadata.get("title", "N/A"), + "user_id": metadata.get("user_id", None), + "version": self.version, + "size_in_bytes": self.size_in_bytes, + "metadata": json.dumps(self.metadata), + "created_at": self.created_at or now, + "updated_at": self.updated_at or now, + "status": self.status, + } + + +class ExtractionType(Enum): + """Types of extractions that can be performed.""" + + TXT = "txt" + IMG = "img" + MOV = "mov" + + +class Extraction(BaseModel): + """An extraction from a document.""" + + id: uuid.UUID + type: ExtractionType = ExtractionType.TXT + data: DataType + metadata: dict + document_id: uuid.UUID + + +class FragmentType(Enum): + """A type of fragment that can be extracted from a document.""" + + TEXT = "text" + IMAGE = "image" + + +class Fragment(BaseModel): + """A fragment extracted from a document.""" + + id: uuid.UUID + type: FragmentType + data: DataType + metadata: dict + document_id: uuid.UUID + extraction_id: uuid.UUID + + +class Entity(BaseModel): + """An entity extracted from a document.""" + + category: str + subcategory: Optional[str] = None + value: str + + def __str__(self): + return ( + f"{self.category}:{self.subcategory}:{self.value}" + if self.subcategory + else f"{self.category}:{self.value}" + ) + + +class Triple(BaseModel): + """A triple extracted from a document.""" + + subject: str + predicate: str + object: str + + +def extract_entities(llm_payload: list[str]) -> dict[str, Entity]: + entities = {} + for entry in llm_payload: + try: + if "], " in entry: # Check if the entry is an entity + entry_val = entry.split("], ")[0] + "]" + entry = entry.split("], ")[1] + colon_count = entry.count(":") + + if colon_count == 1: + category, value = entry.split(":") + subcategory = None + elif colon_count >= 2: + parts = entry.split(":", 2) + category, subcategory, value = ( + parts[0], + parts[1], + parts[2], + ) + else: + raise ValueError("Unexpected entry format") + + entities[entry_val] = Entity( + category=category, subcategory=subcategory, value=value + ) + except Exception as e: + logger.error(f"Error processing entity {entry}: {e}") + continue + return entities + + +def extract_triples( + llm_payload: list[str], entities: dict[str, Entity] +) -> list[Triple]: + triples = [] + for entry in llm_payload: + try: + if "], " not in entry: # Check if the entry is an entity + elements = entry.split(" ") + subject = elements[0] + 
predicate = elements[1] + object = " ".join(elements[2:]) + subject = entities[subject].value # Use entity.value + if "[" in object and "]" in object: + object = entities[object].value # Use entity.value + triples.append( + Triple(subject=subject, predicate=predicate, object=object) + ) + except Exception as e: + logger.error(f"Error processing triplet {entry}: {e}") + continue + return triples + + +class KGExtraction(BaseModel): + """An extraction from a document that is part of a knowledge graph.""" + + entities: dict[str, Entity] + triples: list[Triple] diff --git a/R2R/r2r/base/abstractions/exception.py b/R2R/r2r/base/abstractions/exception.py new file mode 100755 index 00000000..c76625a3 --- /dev/null +++ b/R2R/r2r/base/abstractions/exception.py @@ -0,0 +1,16 @@ +from typing import Any, Optional + + +class R2RException(Exception): + def __init__( + self, message: str, status_code: int, detail: Optional[Any] = None + ): + self.message = message + self.status_code = status_code + super().__init__(self.message) + + +class R2RDocumentProcessingError(R2RException): + def __init__(self, error_message, document_id): + self.document_id = document_id + super().__init__(error_message, 400, {"document_id": document_id}) diff --git a/R2R/r2r/base/abstractions/llama_abstractions.py b/R2R/r2r/base/abstractions/llama_abstractions.py new file mode 100755 index 00000000..f6bc36e6 --- /dev/null +++ b/R2R/r2r/base/abstractions/llama_abstractions.py @@ -0,0 +1,439 @@ +# abstractions are taken from LlamaIndex +# https://github.com/run-llama/llama_index +from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, List, Optional, Tuple, Union + +from pydantic import BaseModel, Field, StrictFloat, StrictInt, StrictStr + + +class LabelledNode(BaseModel): + """An entity in a graph.""" + + label: str = Field(default="node", description="The label of the node.") + embedding: Optional[List[float]] = Field( + default=None, description="The embeddings of the node." + ) + properties: Dict[str, Any] = Field(default_factory=dict) + + @abstractmethod + def __str__(self) -> str: + """Return the string representation of the node.""" + ... + + @property + @abstractmethod + def id(self) -> str: + """Get the node id.""" + ... + + +class EntityNode(LabelledNode): + """An entity in a graph.""" + + name: str = Field(description="The name of the entity.") + label: str = Field(default="entity", description="The label of the node.") + properties: Dict[str, Any] = Field(default_factory=dict) + + def __str__(self) -> str: + """Return the string representation of the node.""" + return self.name + + @property + def id(self) -> str: + """Get the node id.""" + return self.name.replace('"', " ") + + +class ChunkNode(LabelledNode): + """A text chunk in a graph.""" + + text: str = Field(description="The text content of the chunk.") + id_: Optional[str] = Field( + default=None, + description="The id of the node. Defaults to a hash of the text.", + ) + label: str = Field( + default="text_chunk", description="The label of the node." 
+ ) + properties: Dict[str, Any] = Field(default_factory=dict) + + def __str__(self) -> str: + """Return the string representation of the node.""" + return self.text + + @property + def id(self) -> str: + """Get the node id.""" + return str(hash(self.text)) if self.id_ is None else self.id_ + + +class Relation(BaseModel): + """A relation connecting two entities in a graph.""" + + label: str + source_id: str + target_id: str + properties: Dict[str, Any] = Field(default_factory=dict) + + def __str__(self) -> str: + """Return the string representation of the relation.""" + return self.label + + @property + def id(self) -> str: + """Get the relation id.""" + return self.label + + +Triplet = Tuple[LabelledNode, Relation, LabelledNode] + + +class VectorStoreQueryMode(str, Enum): + """Vector store query mode.""" + + DEFAULT = "default" + SPARSE = "sparse" + HYBRID = "hybrid" + TEXT_SEARCH = "text_search" + SEMANTIC_HYBRID = "semantic_hybrid" + + # fit learners + SVM = "svm" + LOGISTIC_REGRESSION = "logistic_regression" + LINEAR_REGRESSION = "linear_regression" + + # maximum marginal relevance + MMR = "mmr" + + +class FilterOperator(str, Enum): + """Vector store filter operator.""" + + # TODO add more operators + EQ = "==" # default operator (string, int, float) + GT = ">" # greater than (int, float) + LT = "<" # less than (int, float) + NE = "!=" # not equal to (string, int, float) + GTE = ">=" # greater than or equal to (int, float) + LTE = "<=" # less than or equal to (int, float) + IN = "in" # In array (string or number) + NIN = "nin" # Not in array (string or number) + ANY = "any" # Contains any (array of strings) + ALL = "all" # Contains all (array of strings) + TEXT_MATCH = "text_match" # full text match (allows you to search for a specific substring, token or phrase within the text field) + CONTAINS = "contains" # metadata array contains value (string or number) + + +class MetadataFilter(BaseModel): + """Comprehensive metadata filter for vector stores to support more operators. + + Value uses Strict* types, as int, float and str are compatible types and were all + converted to string before. + + See: https://docs.pydantic.dev/latest/usage/types/#strict-types + """ + + key: str + value: Union[ + StrictInt, + StrictFloat, + StrictStr, + List[StrictStr], + List[StrictFloat], + List[StrictInt], + ] + operator: FilterOperator = FilterOperator.EQ + + @classmethod + def from_dict( + cls, + filter_dict: Dict, + ) -> "MetadataFilter": + """Create MetadataFilter from dictionary. + + Args: + filter_dict: Dict with key, value and operator. + + """ + return MetadataFilter.parse_obj(filter_dict) + + +# # TODO: Deprecate ExactMatchFilter and use MetadataFilter instead +# # Keep class for now so that AutoRetriever can still work with old vector stores +# class ExactMatchFilter(BaseModel): +# key: str +# value: Union[StrictInt, StrictFloat, StrictStr] + +# set ExactMatchFilter to MetadataFilter +ExactMatchFilter = MetadataFilter + + +class FilterCondition(str, Enum): + """Vector store filter conditions to combine different filters.""" + + # TODO add more conditions + AND = "and" + OR = "or" + + +class MetadataFilters(BaseModel): + """Metadata filters for vector stores.""" + + # Exact match filters and Advanced filters with operators like >, <, >=, <=, !=, etc. 
+ filters: List[Union[MetadataFilter, ExactMatchFilter, "MetadataFilters"]] + # and/or such conditions for combining different filters + condition: Optional[FilterCondition] = FilterCondition.AND + + +@dataclass +class VectorStoreQuery: + """Vector store query.""" + + query_embedding: Optional[List[float]] = None + similarity_top_k: int = 1 + doc_ids: Optional[List[str]] = None + node_ids: Optional[List[str]] = None + query_str: Optional[str] = None + output_fields: Optional[List[str]] = None + embedding_field: Optional[str] = None + + mode: VectorStoreQueryMode = VectorStoreQueryMode.DEFAULT + + # NOTE: only for hybrid search (0 for bm25, 1 for vector search) + alpha: Optional[float] = None + + # metadata filters + filters: Optional[MetadataFilters] = None + + # only for mmr + mmr_threshold: Optional[float] = None + + # NOTE: currently only used by postgres hybrid search + sparse_top_k: Optional[int] = None + # NOTE: return top k results from hybrid search. similarity_top_k is used for dense search top k + hybrid_top_k: Optional[int] = None + + +class PropertyGraphStore(ABC): + """Abstract labelled graph store protocol. + + This protocol defines the interface for a graph store, which is responsible + for storing and retrieving knowledge graph data. + + Attributes: + client: Any: The client used to connect to the graph store. + get: Callable[[str], List[List[str]]]: Get triplets for a given subject. + get_rel_map: Callable[[Optional[List[str]], int], Dict[str, List[List[str]]]]: + Get subjects' rel map in max depth. + upsert_triplet: Callable[[str, str, str], None]: Upsert a triplet. + delete: Callable[[str, str, str], None]: Delete a triplet. + persist: Callable[[str, Optional[fsspec.AbstractFileSystem]], None]: + Persist the graph store to a file. + """ + + supports_structured_queries: bool = False + supports_vector_queries: bool = False + + @property + def client(self) -> Any: + """Get client.""" + ... + + @abstractmethod + def get( + self, + properties: Optional[dict] = None, + ids: Optional[List[str]] = None, + ) -> List[LabelledNode]: + """Get nodes with matching values.""" + ... + + @abstractmethod + def get_triplets( + self, + entity_names: Optional[List[str]] = None, + relation_names: Optional[List[str]] = None, + properties: Optional[dict] = None, + ids: Optional[List[str]] = None, + ) -> List[Triplet]: + """Get triplets with matching values.""" + ... + + @abstractmethod + def get_rel_map( + self, + graph_nodes: List[LabelledNode], + depth: int = 2, + limit: int = 30, + ignore_rels: Optional[List[str]] = None, + ) -> List[Triplet]: + """Get depth-aware rel map.""" + ... + + @abstractmethod + def upsert_nodes(self, nodes: List[LabelledNode]) -> None: + """Upsert nodes.""" + ... + + @abstractmethod + def upsert_relations(self, relations: List[Relation]) -> None: + """Upsert relations.""" + ... + + @abstractmethod + def delete( + self, + entity_names: Optional[List[str]] = None, + relation_names: Optional[List[str]] = None, + properties: Optional[dict] = None, + ids: Optional[List[str]] = None, + ) -> None: + """Delete matching data.""" + ... + + @abstractmethod + def structured_query( + self, query: str, param_map: Optional[Dict[str, Any]] = None + ) -> Any: + """Query the graph store with statement and parameters.""" + ... + + @abstractmethod + def vector_query( + self, query: VectorStoreQuery, **kwargs: Any + ) -> Tuple[List[LabelledNode], List[float]]: + """Query the graph store with a vector store query.""" + ... 
+ + # def persist( + # self, persist_path: str, fs: Optional[fsspec.AbstractFileSystem] = None + # ) -> None: + # """Persist the graph store to a file.""" + # return + + def get_schema(self, refresh: bool = False) -> Any: + """Get the schema of the graph store.""" + return None + + def get_schema_str(self, refresh: bool = False) -> str: + """Get the schema of the graph store as a string.""" + return str(self.get_schema(refresh=refresh)) + + ### ----- Async Methods ----- ### + + async def aget( + self, + properties: Optional[dict] = None, + ids: Optional[List[str]] = None, + ) -> List[LabelledNode]: + """Asynchronously get nodes with matching values.""" + return self.get(properties, ids) + + async def aget_triplets( + self, + entity_names: Optional[List[str]] = None, + relation_names: Optional[List[str]] = None, + properties: Optional[dict] = None, + ids: Optional[List[str]] = None, + ) -> List[Triplet]: + """Asynchronously get triplets with matching values.""" + return self.get_triplets(entity_names, relation_names, properties, ids) + + async def aget_rel_map( + self, + graph_nodes: List[LabelledNode], + depth: int = 2, + limit: int = 30, + ignore_rels: Optional[List[str]] = None, + ) -> List[Triplet]: + """Asynchronously get depth-aware rel map.""" + return self.get_rel_map(graph_nodes, depth, limit, ignore_rels) + + async def aupsert_nodes(self, nodes: List[LabelledNode]) -> None: + """Asynchronously add nodes.""" + return self.upsert_nodes(nodes) + + async def aupsert_relations(self, relations: List[Relation]) -> None: + """Asynchronously add relations.""" + return self.upsert_relations(relations) + + async def adelete( + self, + entity_names: Optional[List[str]] = None, + relation_names: Optional[List[str]] = None, + properties: Optional[dict] = None, + ids: Optional[List[str]] = None, + ) -> None: + """Asynchronously delete matching data.""" + return self.delete(entity_names, relation_names, properties, ids) + + async def astructured_query( + self, query: str, param_map: Optional[Dict[str, Any]] = {} + ) -> Any: + """Asynchronously query the graph store with statement and parameters.""" + return self.structured_query(query, param_map) + + async def avector_query( + self, query: VectorStoreQuery, **kwargs: Any + ) -> Tuple[List[LabelledNode], List[float]]: + """Asynchronously query the graph store with a vector store query.""" + return self.vector_query(query, **kwargs) + + async def aget_schema(self, refresh: bool = False) -> str: + """Asynchronously get the schema of the graph store.""" + return self.get_schema(refresh=refresh) + + async def aget_schema_str(self, refresh: bool = False) -> str: + """Asynchronously get the schema of the graph store as a string.""" + return str(await self.aget_schema(refresh=refresh)) + + +LIST_LIMIT = 128 + + +def clean_string_values(text: str) -> str: + return text.replace("\n", " ").replace("\r", " ") + + +def value_sanitize(d: Any) -> Any: + """Sanitize the input dictionary or list. + + Sanitizes the input by removing embedding-like values, + lists with more than 128 elements, that are mostly irrelevant for + generating answers in a LLM context. These properties, if left in + results, can occupy significant context space and detract from + the LLM's performance by introducing unnecessary noise and cost. 
+ """ + if isinstance(d, dict): + new_dict = {} + for key, value in d.items(): + if isinstance(value, dict): + sanitized_value = value_sanitize(value) + if ( + sanitized_value is not None + ): # Check if the sanitized value is not None + new_dict[key] = sanitized_value + elif isinstance(value, list): + if len(value) < LIST_LIMIT: + sanitized_value = value_sanitize(value) + if ( + sanitized_value is not None + ): # Check if the sanitized value is not None + new_dict[key] = sanitized_value + # Do not include the key if the list is oversized + else: + new_dict[key] = value + return new_dict + elif isinstance(d, list): + if len(d) < LIST_LIMIT: + return [ + value_sanitize(item) + for item in d + if value_sanitize(item) is not None + ] + else: + return None + else: + return d diff --git a/R2R/r2r/base/abstractions/llm.py b/R2R/r2r/base/abstractions/llm.py new file mode 100755 index 00000000..3178d8dc --- /dev/null +++ b/R2R/r2r/base/abstractions/llm.py @@ -0,0 +1,112 @@ +"""Abstractions for the LLM model.""" + +from typing import TYPE_CHECKING, ClassVar, Optional + +from openai.types.chat import ChatCompletion, ChatCompletionChunk +from pydantic import BaseModel, Field + +if TYPE_CHECKING: + from .search import AggregateSearchResult + +LLMChatCompletion = ChatCompletion +LLMChatCompletionChunk = ChatCompletionChunk + + +class RAGCompletion: + completion: LLMChatCompletion + search_results: "AggregateSearchResult" + + def __init__( + self, + completion: LLMChatCompletion, + search_results: "AggregateSearchResult", + ): + self.completion = completion + self.search_results = search_results + + +class GenerationConfig(BaseModel): + _defaults: ClassVar[dict] = { + "model": "gpt-4o", + "temperature": 0.1, + "top_p": 1.0, + "top_k": 100, + "max_tokens_to_sample": 1024, + "stream": False, + "functions": None, + "skip_special_tokens": False, + "stop_token": None, + "num_beams": 1, + "do_sample": True, + "generate_with_chat": False, + "add_generation_kwargs": None, + "api_base": None, + } + + model: str = Field( + default_factory=lambda: GenerationConfig._defaults["model"] + ) + temperature: float = Field( + default_factory=lambda: GenerationConfig._defaults["temperature"] + ) + top_p: float = Field( + default_factory=lambda: GenerationConfig._defaults["top_p"] + ) + top_k: int = Field( + default_factory=lambda: GenerationConfig._defaults["top_k"] + ) + max_tokens_to_sample: int = Field( + default_factory=lambda: GenerationConfig._defaults[ + "max_tokens_to_sample" + ] + ) + stream: bool = Field( + default_factory=lambda: GenerationConfig._defaults["stream"] + ) + functions: Optional[list[dict]] = Field( + default_factory=lambda: GenerationConfig._defaults["functions"] + ) + skip_special_tokens: bool = Field( + default_factory=lambda: GenerationConfig._defaults[ + "skip_special_tokens" + ] + ) + stop_token: Optional[str] = Field( + default_factory=lambda: GenerationConfig._defaults["stop_token"] + ) + num_beams: int = Field( + default_factory=lambda: GenerationConfig._defaults["num_beams"] + ) + do_sample: bool = Field( + default_factory=lambda: GenerationConfig._defaults["do_sample"] + ) + generate_with_chat: bool = Field( + default_factory=lambda: GenerationConfig._defaults[ + "generate_with_chat" + ] + ) + add_generation_kwargs: Optional[dict] = Field( + default_factory=lambda: GenerationConfig._defaults[ + "add_generation_kwargs" + ] + ) + api_base: Optional[str] = Field( + default_factory=lambda: GenerationConfig._defaults["api_base"] + ) + + @classmethod + def set_default(cls, **kwargs): + for key, 
value in kwargs.items(): + if key in cls._defaults: + cls._defaults[key] = value + else: + raise AttributeError( + f"No default attribute '{key}' in GenerationConfig" + ) + + def __init__(self, **data): + model = data.pop("model", None) + if model is not None: + super().__init__(model=model, **data) + else: + super().__init__(**data) diff --git a/R2R/r2r/base/abstractions/prompt.py b/R2R/r2r/base/abstractions/prompt.py new file mode 100755 index 00000000..e37eeb5f --- /dev/null +++ b/R2R/r2r/base/abstractions/prompt.py @@ -0,0 +1,31 @@ +"""Abstraction for a prompt that can be formatted with inputs.""" + +from typing import Any + +from pydantic import BaseModel + + +class Prompt(BaseModel): + """A prompt that can be formatted with inputs.""" + + name: str + template: str + input_types: dict[str, str] + + def format_prompt(self, inputs: dict[str, Any]) -> str: + self._validate_inputs(inputs) + return self.template.format(**inputs) + + def _validate_inputs(self, inputs: dict[str, Any]) -> None: + for var, expected_type_name in self.input_types.items(): + expected_type = self._convert_type(expected_type_name) + if var not in inputs: + raise ValueError(f"Missing input: {var}") + if not isinstance(inputs[var], expected_type): + raise TypeError( + f"Input '{var}' must be of type {expected_type.__name__}, got {type(inputs[var]).__name__} instead." + ) + + def _convert_type(self, type_name: str) -> type: + type_mapping = {"int": int, "str": str} + return type_mapping.get(type_name, str) diff --git a/R2R/r2r/base/abstractions/search.py b/R2R/r2r/base/abstractions/search.py new file mode 100755 index 00000000..b13cc5aa --- /dev/null +++ b/R2R/r2r/base/abstractions/search.py @@ -0,0 +1,84 @@ +"""Abstractions for search functionality.""" + +import uuid +from typing import Any, Dict, List, Optional, Tuple + +from pydantic import BaseModel, Field + +from .llm import GenerationConfig + + +class VectorSearchRequest(BaseModel): + """Request for a search operation.""" + + query: str + limit: int + filters: Optional[dict[str, Any]] = None + + +class VectorSearchResult(BaseModel): + """Result of a search operation.""" + + id: uuid.UUID + score: float + metadata: dict[str, Any] + + def __str__(self) -> str: + return f"VectorSearchResult(id={self.id}, score={self.score}, metadata={self.metadata})" + + def __repr__(self) -> str: + return f"VectorSearchResult(id={self.id}, score={self.score}, metadata={self.metadata})" + + def dict(self) -> dict: + return { + "id": self.id, + "score": self.score, + "metadata": self.metadata, + } + + +class KGSearchRequest(BaseModel): + """Request for a knowledge graph search operation.""" + + query: str + + +# [query, ...] 
+KGSearchResult = List[Tuple[str, List[Dict[str, Any]]]] + + +class AggregateSearchResult(BaseModel): + """Result of an aggregate search operation.""" + + vector_search_results: Optional[List[VectorSearchResult]] + kg_search_results: Optional[KGSearchResult] = None + + def __str__(self) -> str: + return f"AggregateSearchResult(vector_search_results={self.vector_search_results}, kg_search_results={self.kg_search_results})" + + def __repr__(self) -> str: + return f"AggregateSearchResult(vector_search_results={self.vector_search_results}, kg_search_results={self.kg_search_results})" + + def dict(self) -> dict: + return { + "vector_search_results": ( + [result.dict() for result in self.vector_search_results] + if self.vector_search_results + else [] + ), + "kg_search_results": self.kg_search_results or [], + } + + +class VectorSearchSettings(BaseModel): + use_vector_search: bool = True + search_filters: dict[str, Any] = Field(default_factory=dict) + search_limit: int = 10 + do_hybrid_search: bool = False + + +class KGSearchSettings(BaseModel): + use_kg_search: bool = False + agent_generation_config: Optional[GenerationConfig] = Field( + default_factory=GenerationConfig + ) diff --git a/R2R/r2r/base/abstractions/vector.py b/R2R/r2r/base/abstractions/vector.py new file mode 100755 index 00000000..445f3302 --- /dev/null +++ b/R2R/r2r/base/abstractions/vector.py @@ -0,0 +1,66 @@ +"""Abstraction for a vector that can be stored in the system.""" + +from enum import Enum +from typing import Any +from uuid import UUID + + +class VectorType(Enum): + FIXED = "FIXED" + + +class Vector: + """A vector with the option to fix the number of elements.""" + + def __init__( + self, + data: list[float], + type: VectorType = VectorType.FIXED, + length: int = -1, + ): + self.data = data + self.type = type + self.length = length + + if ( + self.type == VectorType.FIXED + and length > 0 + and len(data) != length + ): + raise ValueError(f"Vector must be exactly {length} elements long.") + + def __repr__(self) -> str: + return ( + f"Vector(data={self.data}, type={self.type}, length={self.length})" + ) + + +class VectorEntry: + """A vector entry that can be stored directly in supported vector databases.""" + + def __init__(self, id: UUID, vector: Vector, metadata: dict[str, Any]): + """Create a new VectorEntry object.""" + self.vector = vector + self.id = id + self.metadata = metadata + + def to_serializable(self) -> str: + """Return a serializable representation of the VectorEntry.""" + metadata = self.metadata + + for key in metadata: + if isinstance(metadata[key], UUID): + metadata[key] = str(metadata[key]) + return { + "id": str(self.id), + "vector": self.vector.data, + "metadata": metadata, + } + + def __str__(self) -> str: + """Return a string representation of the VectorEntry.""" + return f"VectorEntry(id={self.id}, vector={self.vector}, metadata={self.metadata})" + + def __repr__(self) -> str: + """Return an unambiguous string representation of the VectorEntry.""" + return f"VectorEntry(id={self.id}, vector={self.vector}, metadata={self.metadata})" diff --git a/R2R/r2r/base/logging/__init__.py b/R2R/r2r/base/logging/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/base/logging/__init__.py diff --git a/R2R/r2r/base/logging/kv_logger.py b/R2R/r2r/base/logging/kv_logger.py new file mode 100755 index 00000000..2d444e9f --- /dev/null +++ b/R2R/r2r/base/logging/kv_logger.py @@ -0,0 +1,547 @@ +import json +import logging +import os +import uuid +from abc import abstractmethod +from 
datetime import datetime +from typing import Optional + +import asyncpg +from pydantic import BaseModel + +from ..providers.base_provider import Provider, ProviderConfig + +logger = logging.getLogger(__name__) + + +class RunInfo(BaseModel): + run_id: uuid.UUID + log_type: str + + +class LoggingConfig(ProviderConfig): + provider: str = "local" + log_table: str = "logs" + log_info_table: str = "logs_pipeline_info" + logging_path: Optional[str] = None + + def validate(self) -> None: + pass + + @property + def supported_providers(self) -> list[str]: + return ["local", "postgres", "redis"] + + +class KVLoggingProvider(Provider): + @abstractmethod + async def close(self): + pass + + @abstractmethod + async def log(self, log_id: uuid.UUID, key: str, value: str): + pass + + @abstractmethod + async def get_run_info( + self, + limit: int = 10, + log_type_filter: Optional[str] = None, + ) -> list[RunInfo]: + pass + + @abstractmethod + async def get_logs( + self, run_ids: list[uuid.UUID], limit_per_run: int + ) -> list: + pass + + +class LocalKVLoggingProvider(KVLoggingProvider): + def __init__(self, config: LoggingConfig): + self.log_table = config.log_table + self.log_info_table = config.log_info_table + self.logging_path = config.logging_path or os.getenv( + "LOCAL_DB_PATH", "local.sqlite" + ) + if not self.logging_path: + raise ValueError( + "Please set the environment variable LOCAL_DB_PATH." + ) + self.conn = None + try: + import aiosqlite + + self.aiosqlite = aiosqlite + except ImportError: + raise ImportError( + "Please install aiosqlite to use the LocalKVLoggingProvider." + ) + + async def init(self): + self.conn = await self.aiosqlite.connect(self.logging_path) + await self.conn.execute( + f""" + CREATE TABLE IF NOT EXISTS {self.log_table} ( + timestamp DATETIME, + log_id TEXT, + key TEXT, + value TEXT + ) + """ + ) + await self.conn.execute( + f""" + CREATE TABLE IF NOT EXISTS {self.log_info_table} ( + timestamp DATETIME, + log_id TEXT UNIQUE, + log_type TEXT + ) + """ + ) + await self.conn.commit() + + async def __aenter__(self): + if self.conn is None: + await self.init() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + async def close(self): + if self.conn: + await self.conn.close() + self.conn = None + + async def log( + self, + log_id: uuid.UUID, + key: str, + value: str, + is_info_log=False, + ): + collection = self.log_info_table if is_info_log else self.log_table + + if is_info_log: + if "type" not in key: + raise ValueError("Info log keys must contain the text 'type'") + await self.conn.execute( + f"INSERT INTO {collection} (timestamp, log_id, log_type) VALUES (datetime('now'), ?, ?)", + (str(log_id), value), + ) + else: + await self.conn.execute( + f"INSERT INTO {collection} (timestamp, log_id, key, value) VALUES (datetime('now'), ?, ?, ?)", + (str(log_id), key, value), + ) + await self.conn.commit() + + async def get_run_info( + self, limit: int = 10, log_type_filter: Optional[str] = None + ) -> list[RunInfo]: + cursor = await self.conn.cursor() + query = f'SELECT log_id, log_type FROM "{self.log_info_table}"' + conditions = [] + params = [] + if log_type_filter: + conditions.append("log_type = ?") + params.append(log_type_filter) + if conditions: + query += " WHERE " + " AND ".join(conditions) + query += " ORDER BY timestamp DESC LIMIT ?" 
+ params.append(limit) + await cursor.execute(query, params) + rows = await cursor.fetchall() + return [ + RunInfo(run_id=uuid.UUID(row[0]), log_type=row[1]) for row in rows + ] + + async def get_logs( + self, run_ids: list[uuid.UUID], limit_per_run: int = 10 + ) -> list: + if not run_ids: + raise ValueError("No run ids provided.") + cursor = await self.conn.cursor() + placeholders = ",".join(["?" for _ in run_ids]) + query = f""" + SELECT * + FROM ( + SELECT *, ROW_NUMBER() OVER (PARTITION BY log_id ORDER BY timestamp DESC) as rn + FROM {self.log_table} + WHERE log_id IN ({placeholders}) + ) + WHERE rn <= ? + ORDER BY timestamp DESC + """ + params = [str(ele) for ele in run_ids] + [limit_per_run] + await cursor.execute(query, params) + rows = await cursor.fetchall() + new_rows = [] + for row in rows: + new_rows.append( + (row[0], uuid.UUID(row[1]), row[2], row[3], row[4]) + ) + return [ + {desc[0]: row[i] for i, desc in enumerate(cursor.description)} + for row in new_rows + ] + + +class PostgresLoggingConfig(LoggingConfig): + provider: str = "postgres" + log_table: str = "logs" + log_info_table: str = "logs_pipeline_info" + + def validate(self) -> None: + required_env_vars = [ + "POSTGRES_DBNAME", + "POSTGRES_USER", + "POSTGRES_PASSWORD", + "POSTGRES_HOST", + "POSTGRES_PORT", + ] + for var in required_env_vars: + if not os.getenv(var): + raise ValueError(f"Environment variable {var} is not set.") + + @property + def supported_providers(self) -> list[str]: + return ["postgres"] + + +class PostgresKVLoggingProvider(KVLoggingProvider): + def __init__(self, config: PostgresLoggingConfig): + self.log_table = config.log_table + self.log_info_table = config.log_info_table + self.config = config + self.pool = None + if not os.getenv("POSTGRES_DBNAME"): + raise ValueError( + "Please set the environment variable POSTGRES_DBNAME." + ) + if not os.getenv("POSTGRES_USER"): + raise ValueError( + "Please set the environment variable POSTGRES_USER." + ) + if not os.getenv("POSTGRES_PASSWORD"): + raise ValueError( + "Please set the environment variable POSTGRES_PASSWORD." + ) + if not os.getenv("POSTGRES_HOST"): + raise ValueError( + "Please set the environment variable POSTGRES_HOST." + ) + if not os.getenv("POSTGRES_PORT"): + raise ValueError( + "Please set the environment variable POSTGRES_PORT." + ) + + async def init(self): + self.pool = await asyncpg.create_pool( + database=os.getenv("POSTGRES_DBNAME"), + user=os.getenv("POSTGRES_USER"), + password=os.getenv("POSTGRES_PASSWORD"), + host=os.getenv("POSTGRES_HOST"), + port=os.getenv("POSTGRES_PORT"), + statement_cache_size=0, # Disable statement caching + ) + async with self.pool.acquire() as conn: + await conn.execute( + f""" + CREATE TABLE IF NOT EXISTS "{self.log_table}" ( + timestamp TIMESTAMPTZ, + log_id UUID, + key TEXT, + value TEXT + ) + """ + ) + await conn.execute( + f""" + CREATE TABLE IF NOT EXISTS "{self.log_info_table}" ( + timestamp TIMESTAMPTZ, + log_id UUID UNIQUE, + log_type TEXT + ) + """ + ) + + async def __aenter__(self): + if self.pool is None: + await self.init() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + async def close(self): + if self.pool: + await self.pool.close() + self.pool = None + + async def log( + self, + log_id: uuid.UUID, + key: str, + value: str, + is_info_log=False, + ): + collection = self.log_info_table if is_info_log else self.log_table + + if is_info_log: + if "type" not in key: + raise ValueError( + "Info log key must contain the string `type`." 
+ ) + async with self.pool.acquire() as conn: + await self.pool.execute( + f'INSERT INTO "{collection}" (timestamp, log_id, log_type) VALUES (NOW(), $1, $2)', + log_id, + value, + ) + else: + async with self.pool.acquire() as conn: + await conn.execute( + f'INSERT INTO "{collection}" (timestamp, log_id, key, value) VALUES (NOW(), $1, $2, $3)', + log_id, + key, + value, + ) + + async def get_run_info( + self, limit: int = 10, log_type_filter: Optional[str] = None + ) -> list[RunInfo]: + query = f"SELECT log_id, log_type FROM {self.log_info_table}" + conditions = [] + params = [] + if log_type_filter: + conditions.append("log_type = $1") + params.append(log_type_filter) + if conditions: + query += " WHERE " + " AND ".join(conditions) + query += " ORDER BY timestamp DESC LIMIT $2" + params.append(limit) + async with self.pool.acquire() as conn: + rows = await conn.fetch(query, *params) + return [ + RunInfo(run_id=row["log_id"], log_type=row["log_type"]) + for row in rows + ] + + async def get_logs( + self, run_ids: list[uuid.UUID], limit_per_run: int = 10 + ) -> list: + if not run_ids: + raise ValueError("No run ids provided.") + + placeholders = ",".join([f"${i + 1}" for i in range(len(run_ids))]) + query = f""" + SELECT * FROM ( + SELECT *, ROW_NUMBER() OVER (PARTITION BY log_id ORDER BY timestamp DESC) as rn + FROM "{self.log_table}" + WHERE log_id::text IN ({placeholders}) + ) sub + WHERE sub.rn <= ${len(run_ids) + 1} + ORDER BY sub.timestamp DESC + """ + params = [str(run_id) for run_id in run_ids] + [limit_per_run] + async with self.pool.acquire() as conn: + rows = await conn.fetch(query, *params) + return [{key: row[key] for key in row.keys()} for row in rows] + + +class RedisLoggingConfig(LoggingConfig): + provider: str = "redis" + log_table: str = "logs" + log_info_table: str = "logs_pipeline_info" + + def validate(self) -> None: + required_env_vars = ["REDIS_CLUSTER_IP", "REDIS_CLUSTER_PORT"] + for var in required_env_vars: + if not os.getenv(var): + raise ValueError(f"Environment variable {var} is not set.") + + @property + def supported_providers(self) -> list[str]: + return ["redis"] + + +class RedisKVLoggingProvider(KVLoggingProvider): + def __init__(self, config: RedisLoggingConfig): + logger.info( + f"Initializing RedisKVLoggingProvider with config: {config}" + ) + + if not all( + [ + os.getenv("REDIS_CLUSTER_IP"), + os.getenv("REDIS_CLUSTER_PORT"), + ] + ): + raise ValueError( + "Please set the environment variables REDIS_CLUSTER_IP and REDIS_CLUSTER_PORT to run `LoggingDatabaseConnection` with `redis`." + ) + try: + from redis.asyncio import Redis + except ImportError: + raise ValueError( + "Error, `redis` is not installed. Please install it using `pip install redis`." 
+ ) + + cluster_ip = os.getenv("REDIS_CLUSTER_IP") + port = os.getenv("REDIS_CLUSTER_PORT") + self.redis = Redis(host=cluster_ip, port=port, decode_responses=True) + self.log_key = config.log_table + self.log_info_key = config.log_info_table + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + await self.close() + + async def close(self): + await self.redis.close() + + async def log( + self, + log_id: uuid.UUID, + key: str, + value: str, + is_info_log=False, + ): + timestamp = datetime.now().timestamp() + log_entry = { + "timestamp": timestamp, + "log_id": str(log_id), + "key": key, + "value": value, + } + if is_info_log: + if "type" not in key: + raise ValueError("Metadata keys must contain the text 'type'") + log_entry["log_type"] = value + await self.redis.hset( + self.log_info_key, str(log_id), json.dumps(log_entry) + ) + await self.redis.zadd( + f"{self.log_info_key}_sorted", {str(log_id): timestamp} + ) + else: + await self.redis.lpush( + f"{self.log_key}:{str(log_id)}", json.dumps(log_entry) + ) + + async def get_run_info( + self, limit: int = 10, log_type_filter: Optional[str] = None + ) -> list[RunInfo]: + run_info_list = [] + start = 0 + count_per_batch = 100 # Adjust batch size as needed + + while len(run_info_list) < limit: + log_ids = await self.redis.zrevrange( + f"{self.log_info_key}_sorted", + start, + start + count_per_batch - 1, + ) + if not log_ids: + break # No more log IDs to process + + start += count_per_batch + + for log_id in log_ids: + log_entry = json.loads( + await self.redis.hget(self.log_info_key, log_id) + ) + if log_type_filter: + if log_entry["log_type"] == log_type_filter: + run_info_list.append( + RunInfo( + run_id=uuid.UUID(log_entry["log_id"]), + log_type=log_entry["log_type"], + ) + ) + else: + run_info_list.append( + RunInfo( + run_id=uuid.UUID(log_entry["log_id"]), + log_type=log_entry["log_type"], + ) + ) + + if len(run_info_list) >= limit: + break + + return run_info_list[:limit] + + async def get_logs( + self, run_ids: list[uuid.UUID], limit_per_run: int = 10 + ) -> list: + logs = [] + for run_id in run_ids: + raw_logs = await self.redis.lrange( + f"{self.log_key}:{str(run_id)}", 0, limit_per_run - 1 + ) + for raw_log in raw_logs: + json_log = json.loads(raw_log) + json_log["log_id"] = uuid.UUID(json_log["log_id"]) + logs.append(json_log) + return logs + + +class KVLoggingSingleton: + _instance = None + _is_configured = False + + SUPPORTED_PROVIDERS = { + "local": LocalKVLoggingProvider, + "postgres": PostgresKVLoggingProvider, + "redis": RedisKVLoggingProvider, + } + + @classmethod + def get_instance(cls): + return cls.SUPPORTED_PROVIDERS[cls._config.provider](cls._config) + + @classmethod + def configure( + cls, logging_config: Optional[LoggingConfig] = LoggingConfig() + ): + if not cls._is_configured: + cls._config = logging_config + cls._is_configured = True + else: + raise Exception("KVLoggingSingleton is already configured.") + + @classmethod + async def log( + cls, + log_id: uuid.UUID, + key: str, + value: str, + is_info_log=False, + ): + try: + async with cls.get_instance() as provider: + await provider.log(log_id, key, value, is_info_log=is_info_log) + + except Exception as e: + logger.error(f"Error logging data {(log_id, key, value)}: {e}") + + @classmethod + async def get_run_info( + cls, limit: int = 10, log_type_filter: Optional[str] = None + ) -> list[RunInfo]: + async with cls.get_instance() as provider: + return await provider.get_run_info( + limit, log_type_filter=log_type_filter + ) + 
+ @classmethod + async def get_logs( + cls, run_ids: list[uuid.UUID], limit_per_run: int = 10 + ) -> list: + async with cls.get_instance() as provider: + return await provider.get_logs(run_ids, limit_per_run) diff --git a/R2R/r2r/base/logging/log_processor.py b/R2R/r2r/base/logging/log_processor.py new file mode 100755 index 00000000..e85d8de2 --- /dev/null +++ b/R2R/r2r/base/logging/log_processor.py @@ -0,0 +1,196 @@ +import contextlib +import json +import logging +import statistics +from collections import defaultdict +from typing import Any, Callable, Dict, List, Optional, Sequence + +from pydantic import BaseModel + +logger = logging.getLogger(__name__) + + +class FilterCriteria(BaseModel): + filters: Optional[dict[str, str]] = None + + +class LogProcessor: + timestamp_format = "%Y-%m-%d %H:%M:%S" + + def __init__(self, filters: Dict[str, Callable[[Dict[str, Any]], bool]]): + self.filters = filters + self.populations = {name: [] for name in filters} + + def process_log(self, log: Dict[str, Any]): + for name, filter_func in self.filters.items(): + if filter_func(log): + self.populations[name].append(log) + + +class StatisticsCalculator: + @staticmethod + def calculate_statistics( + population: List[Dict[str, Any]], + stat_functions: Dict[str, Callable[[List[Dict[str, Any]]], Any]], + ) -> Dict[str, Any]: + return { + name: func(population) for name, func in stat_functions.items() + } + + +class DistributionGenerator: + @staticmethod + def generate_distributions( + population: List[Dict[str, Any]], + dist_functions: Dict[str, Callable[[List[Dict[str, Any]]], Any]], + ) -> Dict[str, Any]: + return { + name: func(population) for name, func in dist_functions.items() + } + + +class VisualizationPreparer: + @staticmethod + def prepare_visualization_data( + data: Dict[str, Any], + vis_functions: Dict[str, Callable[[Dict[str, Any]], Any]], + ) -> Dict[str, Any]: + return {name: func(data) for name, func in vis_functions.items()} + + +class LogAnalyticsConfig: + def __init__(self, filters, stat_functions, dist_functions, vis_functions): + self.filters = filters + self.stat_functions = stat_functions + self.dist_functions = dist_functions + self.vis_functions = vis_functions + + +class AnalysisTypes(BaseModel): + analysis_types: Optional[dict[str, Sequence[str]]] = None + + @staticmethod + def generate_bar_chart_data(logs, key): + chart_data = {"labels": [], "datasets": []} + value_counts = defaultdict(int) + + for log in logs: + if "entries" in log: + for entry in log["entries"]: + if entry["key"] == key: + value_counts[entry["value"]] += 1 + elif "key" in log and log["key"] == key: + value_counts[log["value"]] += 1 + + for value, count in value_counts.items(): + chart_data["labels"].append(value) + chart_data["datasets"].append({"label": key, "data": [count]}) + + return chart_data + + @staticmethod + def calculate_basic_statistics(logs, key): + values = [] + for log in logs: + if log["key"] == "search_results": + results = json.loads(log["value"]) + scores = [ + float(json.loads(result)["score"]) for result in results + ] + values.extend(scores) + else: + value = log.get("value") + if value is not None: + with contextlib.suppress(ValueError): + values.append(float(value)) + + if not values: + return { + "Mean": None, + "Median": None, + "Mode": None, + "Standard Deviation": None, + "Variance": None, + } + + if len(values) == 1: + single_value = round(values[0], 3) + return { + "Mean": single_value, + "Median": single_value, + "Mode": single_value, + "Standard Deviation": 0, + "Variance": 0, + 
} + + mean = round(sum(values) / len(values), 3) + median = round(statistics.median(values), 3) + mode = ( + round(statistics.mode(values), 3) + if len(set(values)) != len(values) + else None + ) + std_dev = round(statistics.stdev(values) if len(values) > 1 else 0, 3) + variance = round( + statistics.variance(values) if len(values) > 1 else 0, 3 + ) + + return { + "Mean": mean, + "Median": median, + "Mode": mode, + "Standard Deviation": std_dev, + "Variance": variance, + } + + @staticmethod + def calculate_percentile(logs, key, percentile): + values = [] + for log in logs: + if log["key"] == key: + value = log.get("value") + if value is not None: + with contextlib.suppress(ValueError): + values.append(float(value)) + + if not values: + return {"percentile": percentile, "value": None} + + values.sort() + index = int((percentile / 100) * (len(values) - 1)) + return {"percentile": percentile, "value": round(values[index], 3)} + + +class LogAnalytics: + def __init__(self, logs: List[Dict[str, Any]], config: LogAnalyticsConfig): + self.logs = logs + self.log_processor = LogProcessor(config.filters) + self.statistics_calculator = StatisticsCalculator() + self.distribution_generator = DistributionGenerator() + self.visualization_preparer = VisualizationPreparer() + self.config = config + + def count_logs(self) -> Dict[str, Any]: + """Count the logs for each filter.""" + return { + name: len(population) + for name, population in self.log_processor.populations.items() + } + + def process_logs(self) -> Dict[str, Any]: + for log in self.logs: + self.log_processor.process_log(log) + + analytics = {} + for name, population in self.log_processor.populations.items(): + stats = self.statistics_calculator.calculate_statistics( + population, self.config.stat_functions + ) + dists = self.distribution_generator.generate_distributions( + population, self.config.dist_functions + ) + analytics[name] = {"statistics": stats, "distributions": dists} + + return self.visualization_preparer.prepare_visualization_data( + analytics, self.config.vis_functions + ) diff --git a/R2R/r2r/base/logging/run_manager.py b/R2R/r2r/base/logging/run_manager.py new file mode 100755 index 00000000..ac192bca --- /dev/null +++ b/R2R/r2r/base/logging/run_manager.py @@ -0,0 +1,56 @@ +import contextvars +import uuid +from contextlib import asynccontextmanager +from typing import Any + +from .kv_logger import KVLoggingSingleton + +run_id_var = contextvars.ContextVar("run_id", default=None) + + +class RunManager: + def __init__(self, logger: KVLoggingSingleton): + self.logger = logger + self.run_info = {} + + def generate_run_id(self) -> uuid.UUID: + return uuid.uuid4() + + async def set_run_info(self, pipeline_type: str): + run_id = run_id_var.get() + if run_id is None: + run_id = self.generate_run_id() + token = run_id_var.set(run_id) + self.run_info[run_id] = {"pipeline_type": pipeline_type} + else: + token = run_id_var.set(run_id) + return run_id, token + + async def get_run_info(self): + run_id = run_id_var.get() + return self.run_info.get(run_id, None) + + async def log_run_info( + self, key: str, value: Any, is_info_log: bool = False + ): + run_id = run_id_var.get() + if run_id: + await self.logger.log( + log_id=run_id, key=key, value=value, is_info_log=is_info_log + ) + + async def clear_run_info(self, token: contextvars.Token): + run_id = run_id_var.get() + run_id_var.reset(token) + if run_id and run_id in self.run_info: + del self.run_info[run_id] + + +@asynccontextmanager +async def manage_run(run_manager: RunManager, 
pipeline_type: str): + run_id, token = await run_manager.set_run_info(pipeline_type) + try: + yield run_id + finally: + # Note: Do not clear the run info to ensure the run ID remains the same + run_id_var.reset(token) diff --git a/R2R/r2r/base/parsers/__init__.py b/R2R/r2r/base/parsers/__init__.py new file mode 100755 index 00000000..d7696202 --- /dev/null +++ b/R2R/r2r/base/parsers/__init__.py @@ -0,0 +1,5 @@ +from .base_parser import AsyncParser + +__all__ = [ + "AsyncParser", +] diff --git a/R2R/r2r/base/parsers/base_parser.py b/R2R/r2r/base/parsers/base_parser.py new file mode 100755 index 00000000..f1bb49d7 --- /dev/null +++ b/R2R/r2r/base/parsers/base_parser.py @@ -0,0 +1,14 @@ +"""Abstract base class for parsers.""" + +from abc import ABC, abstractmethod +from typing import AsyncGenerator, Generic, TypeVar + +from ..abstractions.document import DataType + +T = TypeVar("T") + + +class AsyncParser(ABC, Generic[T]): + @abstractmethod + async def ingest(self, data: T) -> AsyncGenerator[DataType, None]: + pass diff --git a/R2R/r2r/base/pipeline/__init__.py b/R2R/r2r/base/pipeline/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/base/pipeline/__init__.py diff --git a/R2R/r2r/base/pipeline/base_pipeline.py b/R2R/r2r/base/pipeline/base_pipeline.py new file mode 100755 index 00000000..3c1eff9a --- /dev/null +++ b/R2R/r2r/base/pipeline/base_pipeline.py @@ -0,0 +1,233 @@ +"""Base pipeline class for running a sequence of pipes.""" + +import asyncio +import logging +from enum import Enum +from typing import Any, AsyncGenerator, Optional + +from ..logging.kv_logger import KVLoggingSingleton +from ..logging.run_manager import RunManager, manage_run +from ..pipes.base_pipe import AsyncPipe, AsyncState + +logger = logging.getLogger(__name__) + + +class PipelineTypes(Enum): + EVAL = "eval" + INGESTION = "ingestion" + SEARCH = "search" + RAG = "rag" + OTHER = "other" + + +class AsyncPipeline: + """Pipeline class for running a sequence of pipes.""" + + pipeline_type: str = "other" + + def __init__( + self, + pipe_logger: Optional[KVLoggingSingleton] = None, + run_manager: Optional[RunManager] = None, + ): + self.pipes: list[AsyncPipe] = [] + self.upstream_outputs: list[list[dict[str, str]]] = [] + self.pipe_logger = pipe_logger or KVLoggingSingleton() + self.run_manager = run_manager or RunManager(self.pipe_logger) + self.futures = {} + self.level = 0 + + def add_pipe( + self, + pipe: AsyncPipe, + add_upstream_outputs: Optional[list[dict[str, str]]] = None, + *args, + **kwargs, + ) -> None: + """Add a pipe to the pipeline.""" + self.pipes.append(pipe) + if not add_upstream_outputs: + add_upstream_outputs = [] + self.upstream_outputs.append(add_upstream_outputs) + + async def run( + self, + input: Any, + state: Optional[AsyncState] = None, + stream: bool = False, + run_manager: Optional[RunManager] = None, + log_run_info: bool = True, + *args: Any, + **kwargs: Any, + ): + """Run the pipeline.""" + run_manager = run_manager or self.run_manager + + try: + PipelineTypes(self.pipeline_type) + except ValueError: + raise ValueError( + f"Invalid pipeline type: {self.pipeline_type}, must be one of {PipelineTypes.__members__.keys()}" + ) + + self.state = state or AsyncState() + current_input = input + async with manage_run(run_manager, self.pipeline_type): + if log_run_info: + await run_manager.log_run_info( + key="pipeline_type", + value=self.pipeline_type, + is_info_log=True, + ) + try: + for pipe_num in range(len(self.pipes)): + config_name = 
self.pipes[pipe_num].config.name + self.futures[config_name] = asyncio.Future() + + current_input = self._run_pipe( + pipe_num, + current_input, + run_manager, + *args, + **kwargs, + ) + self.futures[config_name].set_result(current_input) + if not stream: + final_result = await self._consume_all(current_input) + return final_result + else: + return current_input + except Exception as error: + logger.error(f"Pipeline failed with error: {error}") + raise error + + async def _consume_all(self, gen: AsyncGenerator) -> list[Any]: + result = [] + async for item in gen: + if hasattr( + item, "__aiter__" + ): # Check if the item is an async generator + sub_result = await self._consume_all(item) + result.extend(sub_result) + else: + result.append(item) + return result + + async def _run_pipe( + self, + pipe_num: int, + input: Any, + run_manager: RunManager, + *args: Any, + **kwargs: Any, + ): + # Collect inputs, waiting for the necessary futures + pipe = self.pipes[pipe_num] + add_upstream_outputs = self.sort_upstream_outputs( + self.upstream_outputs[pipe_num] + ) + input_dict = {"message": input} + + # Group upstream outputs by prev_pipe_name + grouped_upstream_outputs = {} + for upstream_input in add_upstream_outputs: + upstream_pipe_name = upstream_input["prev_pipe_name"] + if upstream_pipe_name not in grouped_upstream_outputs: + grouped_upstream_outputs[upstream_pipe_name] = [] + grouped_upstream_outputs[upstream_pipe_name].append(upstream_input) + + for ( + upstream_pipe_name, + upstream_inputs, + ) in grouped_upstream_outputs.items(): + + async def resolve_future_output(future): + result = future.result() + # consume the async generator + return [item async for item in result] + + async def replay_items_as_async_gen(items): + for item in items: + yield item + + temp_results = await resolve_future_output( + self.futures[upstream_pipe_name] + ) + if upstream_pipe_name == self.pipes[pipe_num - 1].config.name: + input_dict["message"] = replay_items_as_async_gen(temp_results) + + for upstream_input in upstream_inputs: + outputs = await self.state.get(upstream_pipe_name, "output") + prev_output_field = upstream_input.get( + "prev_output_field", None + ) + if not prev_output_field: + raise ValueError( + "`prev_output_field` must be specified in the upstream_input" + ) + input_dict[upstream_input["input_field"]] = outputs[ + prev_output_field + ] + + # Handle the pipe generator + async for ele in await pipe.run( + pipe.Input(**input_dict), + self.state, + run_manager, + *args, + **kwargs, + ): + yield ele + + def sort_upstream_outputs( + self, add_upstream_outputs: list[dict[str, str]] + ) -> list[dict[str, str]]: + pipe_name_to_index = { + pipe.config.name: index for index, pipe in enumerate(self.pipes) + } + + def get_pipe_index(upstream_output): + return pipe_name_to_index[upstream_output["prev_pipe_name"]] + + sorted_outputs = sorted( + add_upstream_outputs, key=get_pipe_index, reverse=True + ) + return sorted_outputs + + +class EvalPipeline(AsyncPipeline): + """A pipeline for evaluation.""" + + pipeline_type: str = "eval" + + async def run( + self, + input: Any, + state: Optional[AsyncState] = None, + stream: bool = False, + run_manager: Optional[RunManager] = None, + *args: Any, + **kwargs: Any, + ): + return await super().run( + input, state, stream, run_manager, *args, **kwargs + ) + + def add_pipe( + self, + pipe: AsyncPipe, + add_upstream_outputs: Optional[list[dict[str, str]]] = None, + *args, + **kwargs, + ) -> None: + logger.debug(f"Adding pipe {pipe.config.name} to the EvalPipeline") + 
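+        # EvalPipeline adds no extra wiring of its own: it only records the addition
+        # above and defers the bookkeeping to AsyncPipeline.add_pipe below.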
return super().add_pipe(pipe, add_upstream_outputs, *args, **kwargs) + + +async def dequeue_requests(queue: asyncio.Queue) -> AsyncGenerator: + """Create an async generator to dequeue requests.""" + while True: + request = await queue.get() + if request is None: + break + yield request diff --git a/R2R/r2r/base/pipes/__init__.py b/R2R/r2r/base/pipes/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/base/pipes/__init__.py diff --git a/R2R/r2r/base/pipes/base_pipe.py b/R2R/r2r/base/pipes/base_pipe.py new file mode 100755 index 00000000..63e3d04e --- /dev/null +++ b/R2R/r2r/base/pipes/base_pipe.py @@ -0,0 +1,163 @@ +import asyncio +import logging +import uuid +from abc import abstractmethod +from enum import Enum +from typing import Any, AsyncGenerator, Optional + +from pydantic import BaseModel + +from r2r.base.logging.kv_logger import KVLoggingSingleton +from r2r.base.logging.run_manager import RunManager, manage_run + +logger = logging.getLogger(__name__) + + +class PipeType(Enum): + INGESTOR = "ingestor" + EVAL = "eval" + GENERATOR = "generator" + SEARCH = "search" + TRANSFORM = "transform" + OTHER = "other" + + +class AsyncState: + """A state object for storing data between pipes.""" + + def __init__(self): + self.data = {} + self.lock = asyncio.Lock() + + async def update(self, outer_key: str, values: dict): + """Update the state with new values.""" + async with self.lock: + if not isinstance(values, dict): + raise ValueError("Values must be contained in a dictionary.") + if outer_key not in self.data: + self.data[outer_key] = {} + for inner_key, inner_value in values.items(): + self.data[outer_key][inner_key] = inner_value + + async def get(self, outer_key: str, inner_key: str, default=None): + """Get a value from the state.""" + async with self.lock: + if outer_key not in self.data: + raise ValueError( + f"Key {outer_key} does not exist in the state." + ) + if inner_key not in self.data[outer_key]: + return default or {} + return self.data[outer_key][inner_key] + + async def delete(self, outer_key: str, inner_key: Optional[str] = None): + """Delete a value from the state.""" + async with self.lock: + if outer_key in self.data and not inner_key: + del self.data[outer_key] + else: + if inner_key not in self.data[outer_key]: + raise ValueError( + f"Key {inner_key} does not exist in the state." 
+ ) + del self.data[outer_key][inner_key] + + +class AsyncPipe: + """An asynchronous pipe for processing data with logging capabilities.""" + + class PipeConfig(BaseModel): + """Configuration for a pipe.""" + + name: str = "default_pipe" + max_log_queue_size: int = 100 + + class Config: + extra = "forbid" + arbitrary_types_allowed = True + + class Input(BaseModel): + """Input for a pipe.""" + + message: AsyncGenerator[Any, None] + + class Config: + extra = "forbid" + arbitrary_types_allowed = True + + def __init__( + self, + type: PipeType = PipeType.OTHER, + config: Optional[PipeConfig] = None, + pipe_logger: Optional[KVLoggingSingleton] = None, + run_manager: Optional[RunManager] = None, + ): + self._config = config or self.PipeConfig() + self._type = type + self.pipe_logger = pipe_logger or KVLoggingSingleton() + self.log_queue = asyncio.Queue() + self.log_worker_task = None + self._run_manager = run_manager or RunManager(self.pipe_logger) + + logger.debug( + f"Initialized pipe {self.config.name} of type {self.type}" + ) + + @property + def config(self) -> PipeConfig: + return self._config + + @property + def type(self) -> PipeType: + return self._type + + async def log_worker(self): + while True: + log_data = await self.log_queue.get() + run_id, key, value = log_data + await self.pipe_logger.log(run_id, key, value) + self.log_queue.task_done() + + async def enqueue_log(self, run_id: uuid.UUID, key: str, value: str): + if self.log_queue.qsize() < self.config.max_log_queue_size: + await self.log_queue.put((run_id, key, value)) + + async def run( + self, + input: Input, + state: AsyncState, + run_manager: Optional[RunManager] = None, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[Any, None]: + """Run the pipe with logging capabilities.""" + + run_manager = run_manager or self._run_manager + + async def wrapped_run() -> AsyncGenerator[Any, None]: + async with manage_run(run_manager, self.config.name) as run_id: + self.log_worker_task = asyncio.create_task( + self.log_worker(), name=f"log-worker-{self.config.name}" + ) + try: + async for result in self._run_logic( + input, state, run_id=run_id, *args, **kwargs + ): + yield result + finally: + await self.log_queue.join() + self.log_worker_task.cancel() + self.log_queue = asyncio.Queue() + + return wrapped_run() + + @abstractmethod + async def _run_logic( + self, + input: Input, + state: AsyncState, + run_id: uuid.UUID, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[Any, None]: + pass diff --git a/R2R/r2r/base/providers/__init__.py b/R2R/r2r/base/providers/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/base/providers/__init__.py diff --git a/R2R/r2r/base/providers/base_provider.py b/R2R/r2r/base/providers/base_provider.py new file mode 100755 index 00000000..8ee8d56a --- /dev/null +++ b/R2R/r2r/base/providers/base_provider.py @@ -0,0 +1,48 @@ +from abc import ABC, abstractmethod, abstractproperty +from typing import Any, Optional, Type + +from pydantic import BaseModel + + +class ProviderConfig(BaseModel, ABC): + """A base provider configuration class""" + + extra_fields: dict[str, Any] = {} + provider: Optional[str] = None + + class Config: + arbitrary_types_allowed = True + ignore_extra = True + + @abstractmethod + def validate(self) -> None: + pass + + @classmethod + def create(cls: Type["ProviderConfig"], **kwargs: Any) -> "ProviderConfig": + base_args = cls.__fields__.keys() + filtered_kwargs = { + k: v if v != "None" else None + for k, v in kwargs.items() + if k in base_args + } + 
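+        # Recognised base fields (with the string "None" normalised to None above) are
+        # used to build the config; any remaining kwargs are preserved on extra_fields
+        # below, e.g. provider-specific options not declared on the model.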
instance = cls(**filtered_kwargs) + for k, v in kwargs.items(): + if k not in base_args: + instance.extra_fields[k] = v + return instance + + @abstractproperty + @property + def supported_providers(self) -> list[str]: + """Define a list of supported providers.""" + pass + + +class Provider(ABC): + """A base provider class to provide a common interface for all providers.""" + + def __init__(self, config: Optional[ProviderConfig] = None): + if config: + config.validate() + self.config = config diff --git a/R2R/r2r/base/providers/embedding_provider.py b/R2R/r2r/base/providers/embedding_provider.py new file mode 100755 index 00000000..8f3af56f --- /dev/null +++ b/R2R/r2r/base/providers/embedding_provider.py @@ -0,0 +1,83 @@ +import logging +from abc import abstractmethod +from enum import Enum +from typing import Optional + +from ..abstractions.search import VectorSearchResult +from .base_provider import Provider, ProviderConfig + +logger = logging.getLogger(__name__) + + +class EmbeddingConfig(ProviderConfig): + """A base embedding configuration class""" + + provider: Optional[str] = None + base_model: Optional[str] = None + base_dimension: Optional[int] = None + rerank_model: Optional[str] = None + rerank_dimension: Optional[int] = None + rerank_transformer_type: Optional[str] = None + batch_size: int = 1 + + def validate(self) -> None: + if self.provider not in self.supported_providers: + raise ValueError(f"Provider '{self.provider}' is not supported.") + + @property + def supported_providers(self) -> list[str]: + return [None, "openai", "ollama", "sentence-transformers"] + + +class EmbeddingProvider(Provider): + """An abstract class to provide a common interface for embedding providers.""" + + class PipeStage(Enum): + BASE = 1 + RERANK = 2 + + def __init__(self, config: EmbeddingConfig): + if not isinstance(config, EmbeddingConfig): + raise ValueError( + "EmbeddingProvider must be initialized with a `EmbeddingConfig`." 
+ ) + logger.info(f"Initializing EmbeddingProvider with config {config}.") + + super().__init__(config) + + @abstractmethod + def get_embedding(self, text: str, stage: PipeStage = PipeStage.BASE): + pass + + async def async_get_embedding( + self, text: str, stage: PipeStage = PipeStage.BASE + ): + return self.get_embedding(text, stage) + + @abstractmethod + def get_embeddings( + self, texts: list[str], stage: PipeStage = PipeStage.BASE + ): + pass + + async def async_get_embeddings( + self, texts: list[str], stage: PipeStage = PipeStage.BASE + ): + return self.get_embeddings(texts, stage) + + @abstractmethod + def rerank( + self, + query: str, + results: list[VectorSearchResult], + stage: PipeStage = PipeStage.RERANK, + limit: int = 10, + ): + pass + + @abstractmethod + def tokenize_string( + self, text: str, model: str, stage: PipeStage + ) -> list[int]: + """Tokenizes the input string.""" + pass diff --git a/R2R/r2r/base/providers/eval_provider.py b/R2R/r2r/base/providers/eval_provider.py new file mode 100755 index 00000000..76053f87 --- /dev/null +++ b/R2R/r2r/base/providers/eval_provider.py @@ -0,0 +1,46 @@ +from typing import Optional, Union + +from ..abstractions.llm import GenerationConfig +from .base_provider import Provider, ProviderConfig +from .llm_provider import LLMConfig + + +class EvalConfig(ProviderConfig): + """A base eval config class""" + + llm: Optional[LLMConfig] = None + + def validate(self) -> None: + if self.provider not in self.supported_providers: + raise ValueError(f"Provider {self.provider} not supported.") + if self.provider and not self.llm: + raise ValueError( + "EvalConfig must have a `llm` attribute when specifying a provider." + ) + + @property + def supported_providers(self) -> list[str]: + return [None, "local"] + + +class EvalProvider(Provider): + """An abstract class to provide a common interface for evaluation providers.""" + + def __init__(self, config: EvalConfig): + if not isinstance(config, EvalConfig): + raise ValueError( + "EvalProvider must be initialized with a `EvalConfig`." 
+ ) + + super().__init__(config) + + def evaluate( + self, + query: str, + context: str, + completion: str, + eval_generation_config: Optional[GenerationConfig] = None, + ) -> dict[str, dict[str, Union[str, float]]]: + return self._evaluate( + query, context, completion, eval_generation_config + ) diff --git a/R2R/r2r/base/providers/kg_provider.py b/R2R/r2r/base/providers/kg_provider.py new file mode 100755 index 00000000..4ae96b11 --- /dev/null +++ b/R2R/r2r/base/providers/kg_provider.py @@ -0,0 +1,182 @@ +"""Base classes for knowledge graph providers.""" + +import json +import logging +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Optional, Tuple + +from .prompt_provider import PromptProvider + +if TYPE_CHECKING: + from r2r.main import R2RClient + +from ...base.utils.base_utils import EntityType, Relation +from ..abstractions.llama_abstractions import EntityNode, LabelledNode +from ..abstractions.llama_abstractions import Relation as LlamaRelation +from ..abstractions.llama_abstractions import VectorStoreQuery +from ..abstractions.llm import GenerationConfig +from .base_provider import ProviderConfig + +logger = logging.getLogger(__name__) + + +class KGConfig(ProviderConfig): + """A base KG config class""" + + provider: Optional[str] = None + batch_size: int = 1 + kg_extraction_prompt: Optional[str] = "few_shot_ner_kg_extraction" + kg_agent_prompt: Optional[str] = "kg_agent" + kg_extraction_config: Optional[GenerationConfig] = None + + def validate(self) -> None: + if self.provider not in self.supported_providers: + raise ValueError(f"Provider '{self.provider}' is not supported.") + + @property + def supported_providers(self) -> list[str]: + return [None, "neo4j"] + + +class KGProvider(ABC): + """An abstract class to provide a common interface for Knowledge Graphs.""" + + def __init__(self, config: KGConfig) -> None: + if not isinstance(config, KGConfig): + raise ValueError( + "KGProvider must be initialized with a `KGConfig`." 
+ ) + logger.info(f"Initializing KG provider with config: {config}") + self.config = config + self.validate_config() + + def validate_config(self) -> None: + self.config.validate() + + @property + @abstractmethod + def client(self) -> Any: + """Get client.""" + pass + + @abstractmethod + def get(self, subj: str) -> list[list[str]]: + """Abstract method to get triplets.""" + pass + + @abstractmethod + def get_rel_map( + self, + subjs: Optional[list[str]] = None, + depth: int = 2, + limit: int = 30, + ) -> dict[str, list[list[str]]]: + """Abstract method to get depth-aware rel map.""" + pass + + @abstractmethod + def upsert_nodes(self, nodes: list[EntityNode]) -> None: + """Abstract method to add triplet.""" + pass + + @abstractmethod + def upsert_relations(self, relations: list[LlamaRelation]) -> None: + """Abstract method to add triplet.""" + pass + + @abstractmethod + def delete(self, subj: str, rel: str, obj: str) -> None: + """Abstract method to delete triplet.""" + pass + + @abstractmethod + def get_schema(self, refresh: bool = False) -> str: + """Abstract method to get the schema of the graph store.""" + pass + + @abstractmethod + def structured_query( + self, query: str, param_map: Optional[dict[str, Any]] = {} + ) -> Any: + """Abstract method to query the graph store with statement and parameters.""" + pass + + @abstractmethod + def vector_query( + self, query: VectorStoreQuery, **kwargs: Any + ) -> Tuple[list[LabelledNode], list[float]]: + """Abstract method to query the graph store with a vector store query.""" + + # TODO - Type this method. + @abstractmethod + def update_extraction_prompt( + self, + prompt_provider: Any, + entity_types: list[Any], + relations: list[Relation], + ): + """Abstract method to update the KG extraction prompt.""" + pass + + # TODO - Type this method. + @abstractmethod + def update_kg_agent_prompt( + self, + prompt_provider: Any, + entity_types: list[Any], + relations: list[Relation], + ): + """Abstract method to update the KG agent prompt.""" + pass + + +def escape_braces(s: str) -> str: + """ + Escape braces in a string. + This is a placeholder function - implement the actual logic as needed. 
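+    With the current brace-doubling implementation, escape_braces("{input}")
+    returns "{{input}}".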
+ """ + # Implement your escape_braces logic here + return s.replace("{", "{{").replace("}", "}}") + + +# TODO - Make this more configurable / intelligent +def update_kg_prompt( + client: "R2RClient", + r2r_prompts: PromptProvider, + prompt_base: str, + entity_types: list[EntityType], + relations: list[Relation], +) -> None: + # Get the default extraction template + template_name: str = f"{prompt_base}_with_spec" + + new_template: str = r2r_prompts.get_prompt( + template_name, + { + "entity_types": json.dumps( + { + "entity_types": [ + str(entity.name) for entity in entity_types + ] + }, + indent=4, + ), + "relations": json.dumps( + {"predicates": [str(relation.name) for relation in relations]}, + indent=4, + ), + "input": """\n{input}""", + }, + ) + + # Escape all braces in the template, except for the {input} placeholder, for formatting + escaped_template: str = escape_braces(new_template).replace( + """{{input}}""", """{input}""" + ) + + # Update the client's prompt + client.update_prompt( + prompt_base, + template=escaped_template, + input_types={"input": "str"}, + ) diff --git a/R2R/r2r/base/providers/llm_provider.py b/R2R/r2r/base/providers/llm_provider.py new file mode 100755 index 00000000..9b6499a4 --- /dev/null +++ b/R2R/r2r/base/providers/llm_provider.py @@ -0,0 +1,66 @@ +"""Base classes for language model providers.""" + +import logging +from abc import abstractmethod +from typing import Optional + +from r2r.base.abstractions.llm import GenerationConfig + +from ..abstractions.llm import LLMChatCompletion, LLMChatCompletionChunk +from .base_provider import Provider, ProviderConfig + +logger = logging.getLogger(__name__) + + +class LLMConfig(ProviderConfig): + """A base LLM config class""" + + provider: Optional[str] = None + generation_config: Optional[GenerationConfig] = None + + def validate(self) -> None: + if not self.provider: + raise ValueError("Provider must be set.") + + if self.provider and self.provider not in self.supported_providers: + raise ValueError(f"Provider '{self.provider}' is not supported.") + + @property + def supported_providers(self) -> list[str]: + return ["litellm", "openai"] + + +class LLMProvider(Provider): + """An abstract class to provide a common interface for LLMs.""" + + def __init__( + self, + config: LLMConfig, + ) -> None: + if not isinstance(config, LLMConfig): + raise ValueError( + "LLMProvider must be initialized with a `LLMConfig`." 
+ ) + logger.info(f"Initializing LLM provider with config: {config}") + + super().__init__(config) + + @abstractmethod + def get_completion( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> LLMChatCompletion: + """Abstract method to get a chat completion from the provider.""" + pass + + @abstractmethod + def get_completion_stream( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> LLMChatCompletionChunk: + """Abstract method to get a completion stream from the provider.""" + pass diff --git a/R2R/r2r/base/providers/prompt_provider.py b/R2R/r2r/base/providers/prompt_provider.py new file mode 100755 index 00000000..78af9e11 --- /dev/null +++ b/R2R/r2r/base/providers/prompt_provider.py @@ -0,0 +1,65 @@ +import logging +from abc import abstractmethod +from typing import Any, Optional + +from .base_provider import Provider, ProviderConfig + +logger = logging.getLogger(__name__) + + +class PromptConfig(ProviderConfig): + def validate(self) -> None: + pass + + @property + def supported_providers(self) -> list[str]: + # Return a list of supported prompt providers + return ["default_prompt_provider"] + + +class PromptProvider(Provider): + def __init__(self, config: Optional[PromptConfig] = None): + if config is None: + config = PromptConfig() + elif not isinstance(config, PromptConfig): + raise ValueError( + "PromptProvider must be initialized with a `PromptConfig`." + ) + logger.info(f"Initializing PromptProvider with config {config}.") + super().__init__(config) + + @abstractmethod + def add_prompt( + self, name: str, template: str, input_types: dict[str, str] + ) -> None: + pass + + @abstractmethod + def get_prompt( + self, prompt_name: str, inputs: Optional[dict[str, Any]] = None + ) -> str: + pass + + @abstractmethod + def get_all_prompts(self) -> dict[str, str]: + pass + + @abstractmethod + def update_prompt( + self, + name: str, + template: Optional[str] = None, + input_types: Optional[dict[str, str]] = None, + ) -> None: + pass + + def _get_message_payload( + self, system_prompt: str, task_prompt: str + ) -> dict: + return [ + { + "role": "system", + "content": system_prompt, + }, + {"role": "user", "content": task_prompt}, + ] diff --git a/R2R/r2r/base/providers/vector_db_provider.py b/R2R/r2r/base/providers/vector_db_provider.py new file mode 100755 index 00000000..a6d5aaa8 --- /dev/null +++ b/R2R/r2r/base/providers/vector_db_provider.py @@ -0,0 +1,142 @@ +import logging +from abc import ABC, abstractmethod +from typing import Optional, Union + +from ..abstractions.document import DocumentInfo +from ..abstractions.search import VectorSearchResult +from ..abstractions.vector import VectorEntry +from .base_provider import Provider, ProviderConfig + +logger = logging.getLogger(__name__) + + +class VectorDBConfig(ProviderConfig): + provider: str + + def __post_init__(self): + self.validate() + # Capture additional fields + for key, value in self.extra_fields.items(): + setattr(self, key, value) + + def validate(self) -> None: + if self.provider not in self.supported_providers: + raise ValueError(f"Provider '{self.provider}' is not supported.") + + @property + def supported_providers(self) -> list[str]: + return ["local", "pgvector"] + + +class VectorDBProvider(Provider, ABC): + def __init__(self, config: VectorDBConfig): + if not isinstance(config, VectorDBConfig): + raise ValueError( + "VectorDBProvider must be initialized with a `VectorDBConfig`." 
+ ) + logger.info(f"Initializing VectorDBProvider with config {config}.") + super().__init__(config) + + @abstractmethod + def initialize_collection(self, dimension: int) -> None: + pass + + @abstractmethod + def copy(self, entry: VectorEntry, commit: bool = True) -> None: + pass + + @abstractmethod + def upsert(self, entry: VectorEntry, commit: bool = True) -> None: + pass + + @abstractmethod + def search( + self, + query_vector: list[float], + filters: dict[str, Union[bool, int, str]] = {}, + limit: int = 10, + *args, + **kwargs, + ) -> list[VectorSearchResult]: + pass + + @abstractmethod + def hybrid_search( + self, + query_text: str, + query_vector: list[float], + limit: int = 10, + filters: Optional[dict[str, Union[bool, int, str]]] = None, + # Hybrid search parameters + full_text_weight: float = 1.0, + semantic_weight: float = 1.0, + rrf_k: int = 20, # typical value is ~2x the number of results you want + *args, + **kwargs, + ) -> list[VectorSearchResult]: + pass + + @abstractmethod + def create_index(self, index_type, column_name, index_options): + pass + + def upsert_entries( + self, entries: list[VectorEntry], commit: bool = True + ) -> None: + for entry in entries: + self.upsert(entry, commit=commit) + + def copy_entries( + self, entries: list[VectorEntry], commit: bool = True + ) -> None: + for entry in entries: + self.copy(entry, commit=commit) + + @abstractmethod + def delete_by_metadata( + self, + metadata_fields: list[str], + metadata_values: list[Union[bool, int, str]], + ) -> list[str]: + if len(metadata_fields) != len(metadata_values): + raise ValueError( + "The number of metadata fields and values must be equal." + ) + pass + + @abstractmethod + def get_metadatas( + self, + metadata_fields: list[str], + filter_field: Optional[str] = None, + filter_value: Optional[str] = None, + ) -> list[str]: + pass + + @abstractmethod + def upsert_documents_overview( + self, document_infs: list[DocumentInfo] + ) -> None: + pass + + @abstractmethod + def get_documents_overview( + self, + filter_document_ids: Optional[list[str]] = None, + filter_user_ids: Optional[list[str]] = None, + ) -> list[DocumentInfo]: + pass + + @abstractmethod + def get_document_chunks(self, document_id: str) -> list[dict]: + pass + + @abstractmethod + def delete_from_documents_overview( + self, document_id: str, version: Optional[str] = None + ) -> dict: + pass + + @abstractmethod + def get_users_overview(self, user_ids: Optional[list[str]] = None) -> dict: + pass diff --git a/R2R/r2r/base/utils/__init__.py b/R2R/r2r/base/utils/__init__.py new file mode 100755 index 00000000..104d50eb --- /dev/null +++ b/R2R/r2r/base/utils/__init__.py @@ -0,0 +1,26 @@ +from .base_utils import ( + EntityType, + Relation, + format_entity_types, + format_relations, + generate_id_from_label, + generate_run_id, + increment_version, + run_pipeline, + to_async_generator, +) +from .splitter.text import RecursiveCharacterTextSplitter, TextSplitter + +__all__ = [ + "RecursiveCharacterTextSplitter", + "TextSplitter", + "run_pipeline", + "to_async_generator", + "generate_run_id", + "generate_id_from_label", + "increment_version", + "EntityType", + "Relation", + "format_entity_types", + "format_relations", +] diff --git a/R2R/r2r/base/utils/base_utils.py b/R2R/r2r/base/utils/base_utils.py new file mode 100755 index 00000000..12652833 --- /dev/null +++ b/R2R/r2r/base/utils/base_utils.py @@ -0,0 +1,63 @@ +import asyncio +import uuid +from typing import TYPE_CHECKING, Any, AsyncGenerator, Iterable + +if TYPE_CHECKING: + from 
..pipeline.base_pipeline import AsyncPipeline + + +def generate_run_id() -> uuid.UUID: + return uuid.uuid4() + + +def generate_id_from_label(label: str) -> uuid.UUID: + return uuid.uuid5(uuid.NAMESPACE_DNS, label) + + +async def to_async_generator( + iterable: Iterable[Any], +) -> AsyncGenerator[Any, None]: + for item in iterable: + yield item + + +def run_pipeline(pipeline: "AsyncPipeline", input: Any, *args, **kwargs): + if not isinstance(input, AsyncGenerator) and not isinstance(input, list): + input = to_async_generator([input]) + elif not isinstance(input, AsyncGenerator): + input = to_async_generator(input) + + async def _run_pipeline(input, *args, **kwargs): + return await pipeline.run(input, *args, **kwargs) + + return asyncio.run(_run_pipeline(input, *args, **kwargs)) + + +def increment_version(version: str) -> str: + prefix = version[:-1] + suffix = int(version[-1]) + return f"{prefix}{suffix + 1}" + + +class EntityType: + def __init__(self, name: str): + self.name = name + + +class Relation: + def __init__(self, name: str): + self.name = name + + +def format_entity_types(entity_types: list[EntityType]) -> str: + lines = [] + for entity in entity_types: + lines.append(entity.name) + return "\n".join(lines) + + +def format_relations(predicates: list[Relation]) -> str: + lines = [] + for predicate in predicates: + lines.append(predicate.name) + return "\n".join(lines) diff --git a/R2R/r2r/base/utils/splitter/__init__.py b/R2R/r2r/base/utils/splitter/__init__.py new file mode 100755 index 00000000..07a9f554 --- /dev/null +++ b/R2R/r2r/base/utils/splitter/__init__.py @@ -0,0 +1,3 @@ +from .text import RecursiveCharacterTextSplitter + +__all__ = ["RecursiveCharacterTextSplitter"] diff --git a/R2R/r2r/base/utils/splitter/text.py b/R2R/r2r/base/utils/splitter/text.py new file mode 100755 index 00000000..5458310c --- /dev/null +++ b/R2R/r2r/base/utils/splitter/text.py @@ -0,0 +1,1979 @@ +# Source - LangChain +# URL: https://github.com/langchain-ai/langchain/blob/6a5b084704afa22ca02f78d0464f35aed75d1ff2/libs/langchain/langchain/text_splitter.py#L851 +"""**Text Splitters** are classes for splitting text. + + +**Class hierarchy:** + +.. code-block:: + + BaseDocumentTransformer --> TextSplitter --> <name>TextSplitter # Example: CharacterTextSplitter + RecursiveCharacterTextSplitter --> <name>TextSplitter + +Note: **MarkdownHeaderTextSplitter** and **HTMLHeaderTextSplitter do not derive from TextSplitter. + + +**Main helpers:** + +.. 
code-block:: + + Document, Tokenizer, Language, LineType, HeaderType + +""" # noqa: E501 + +from __future__ import annotations + +import copy +import json +import logging +import pathlib +import re +from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum +from io import BytesIO, StringIO +from typing import ( + AbstractSet, + Any, + Callable, + Collection, + Dict, + Iterable, + List, + Literal, + Optional, + Sequence, + Tuple, + Type, + TypedDict, + TypeVar, + Union, + cast, +) + +import requests +from pydantic import BaseModel, Field, PrivateAttr +from typing_extensions import NotRequired + +logger = logging.getLogger(__name__) + +TS = TypeVar("TS", bound="TextSplitter") + + +class BaseSerialized(TypedDict): + """Base class for serialized objects.""" + + lc: int + id: List[str] + name: NotRequired[str] + graph: NotRequired[Dict[str, Any]] + + +class SerializedConstructor(BaseSerialized): + """Serialized constructor.""" + + type: Literal["constructor"] + kwargs: Dict[str, Any] + + +class SerializedSecret(BaseSerialized): + """Serialized secret.""" + + type: Literal["secret"] + + +class SerializedNotImplemented(BaseSerialized): + """Serialized not implemented.""" + + type: Literal["not_implemented"] + repr: Optional[str] + + +def try_neq_default(value: Any, key: str, model: BaseModel) -> bool: + """Try to determine if a value is different from the default. + + Args: + value: The value. + key: The key. + model: The model. + + Returns: + Whether the value is different from the default. + """ + try: + return model.__fields__[key].get_default() != value + except Exception: + return True + + +class Serializable(BaseModel, ABC): + """Serializable base class.""" + + @classmethod + def is_lc_serializable(cls) -> bool: + """Is this class serializable?""" + return False + + @classmethod + def get_lc_namespace(cls) -> List[str]: + """Get the namespace of the langchain object. + + For example, if the class is `langchain.llms.openai.OpenAI`, then the + namespace is ["langchain", "llms", "openai"] + """ + return cls.__module__.split(".") + + @property + def lc_secrets(self) -> Dict[str, str]: + """A map of constructor argument names to secret ids. + + For example, + {"openai_api_key": "OPENAI_API_KEY"} + """ + return dict() + + @property + def lc_attributes(self) -> Dict: + """List of attribute names that should be included in the serialized kwargs. + + These attributes must be accepted by the constructor. + """ + return {} + + @classmethod + def lc_id(cls) -> List[str]: + """A unique identifier for this class for serialization purposes. + + The unique identifier is a list of strings that describes the path + to the object. 
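+        For example, the Document class defined later in this module resolves to
+        ["langchain", "schema", "document", "Document"].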
+ """ + return [*cls.get_lc_namespace(), cls.__name__] + + class Config: + extra = "ignore" + + def __repr_args__(self) -> Any: + return [ + (k, v) + for k, v in super().__repr_args__() + if (k not in self.__fields__ or try_neq_default(v, k, self)) + ] + + _lc_kwargs = PrivateAttr(default_factory=dict) + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._lc_kwargs = kwargs + + def to_json( + self, + ) -> Union[SerializedConstructor, SerializedNotImplemented]: + if not self.is_lc_serializable(): + return self.to_json_not_implemented() + + secrets = dict() + # Get latest values for kwargs if there is an attribute with same name + lc_kwargs = { + k: getattr(self, k, v) + for k, v in self._lc_kwargs.items() + if not (self.__exclude_fields__ or {}).get(k, False) # type: ignore + } + + # Merge the lc_secrets and lc_attributes from every class in the MRO + for cls in [None, *self.__class__.mro()]: + # Once we get to Serializable, we're done + if cls is Serializable: + break + + if cls: + deprecated_attributes = [ + "lc_namespace", + "lc_serializable", + ] + + for attr in deprecated_attributes: + if hasattr(cls, attr): + raise ValueError( + f"Class {self.__class__} has a deprecated " + f"attribute {attr}. Please use the corresponding " + f"classmethod instead." + ) + + # Get a reference to self bound to each class in the MRO + this = cast( + Serializable, self if cls is None else super(cls, self) + ) + + secrets.update(this.lc_secrets) + # Now also add the aliases for the secrets + # This ensures known secret aliases are hidden. + # Note: this does NOT hide any other extra kwargs + # that are not present in the fields. + for key in list(secrets): + value = secrets[key] + if key in this.__fields__: + secrets[this.__fields__[key].alias] = value + lc_kwargs.update(this.lc_attributes) + + # include all secrets, even if not specified in kwargs + # as these secrets may be passed as an environment variable instead + for key in secrets.keys(): + secret_value = getattr(self, key, None) or lc_kwargs.get(key) + if secret_value is not None: + lc_kwargs.update({key: secret_value}) + + return { + "lc": 1, + "type": "constructor", + "id": self.lc_id(), + "kwargs": ( + lc_kwargs + if not secrets + else _replace_secrets(lc_kwargs, secrets) + ), + } + + def to_json_not_implemented(self) -> SerializedNotImplemented: + return to_json_not_implemented(self) + + +def _replace_secrets( + root: Dict[Any, Any], secrets_map: Dict[str, str] +) -> Dict[Any, Any]: + result = root.copy() + for path, secret_id in secrets_map.items(): + [*parts, last] = path.split(".") + current = result + for part in parts: + if part not in current: + break + current[part] = current[part].copy() + current = current[part] + if last in current: + current[last] = { + "lc": 1, + "type": "secret", + "id": [secret_id], + } + return result + + +def to_json_not_implemented(obj: object) -> SerializedNotImplemented: + """Serialize a "not implemented" object. 
+ + Args: + obj: object to serialize + + Returns: + SerializedNotImplemented + """ + _id: List[str] = [] + try: + if hasattr(obj, "__name__"): + _id = [*obj.__module__.split("."), obj.__name__] + elif hasattr(obj, "__class__"): + _id = [ + *obj.__class__.__module__.split("."), + obj.__class__.__name__, + ] + except Exception: + pass + + result: SerializedNotImplemented = { + "lc": 1, + "type": "not_implemented", + "id": _id, + "repr": None, + } + try: + result["repr"] = repr(obj) + except Exception: + pass + return result + + +class Document(Serializable): + """Class for storing a piece of text and associated metadata.""" + + page_content: str + """String text.""" + metadata: dict = Field(default_factory=dict) + """Arbitrary metadata about the page content (e.g., source, relationships to other + documents, etc.). + """ + type: Literal["Document"] = "Document" + + def __init__(self, page_content: str, **kwargs: Any) -> None: + """Pass page_content in as positional or named arg.""" + super().__init__(page_content=page_content, **kwargs) + + @classmethod + def is_lc_serializable(cls) -> bool: + """Return whether this class is serializable.""" + return True + + @classmethod + def get_lc_namespace(cls) -> List[str]: + """Get the namespace of the langchain object.""" + return ["langchain", "schema", "document"] + + +class BaseDocumentTransformer(ABC): + """Abstract base class for document transformation systems. + + A document transformation system takes a sequence of Documents and returns a + sequence of transformed Documents. + + Example: + .. code-block:: python + + class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel): + embeddings: Embeddings + similarity_fn: Callable = cosine_similarity + similarity_threshold: float = 0.95 + + class Config: + arbitrary_types_allowed = True + + def transform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + stateful_documents = get_stateful_documents(documents) + embedded_documents = _get_embeddings_from_stateful_docs( + self.embeddings, stateful_documents + ) + included_idxs = _filter_similar_embeddings( + embedded_documents, self.similarity_fn, self.similarity_threshold + ) + return [stateful_documents[i] for i in sorted(included_idxs)] + + async def atransform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + raise NotImplementedError + + """ # noqa: E501 + + @abstractmethod + def transform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + """Transform a list of documents. + + Args: + documents: A sequence of Documents to be transformed. + + Returns: + A list of transformed Documents. + """ + + async def atransform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + """Asynchronously transform a list of documents. + + Args: + documents: A sequence of Documents to be transformed. + + Returns: + A list of transformed Documents. + """ + raise NotImplementedError("This method is not implemented.") + # return await langchain_core.runnables.config.run_in_executor( + # None, self.transform_documents, documents, **kwargs + # ) + + +def _make_spacy_pipe_for_splitting( + pipe: str, *, max_length: int = 1_000_000 +) -> Any: # avoid importing spacy + try: + import spacy + except ImportError: + raise ImportError( + "Spacy is not installed, please install it with `pip install spacy`." 
+ ) + if pipe == "sentencizer": + from spacy.lang.en import English + + sentencizer = English() + sentencizer.add_pipe("sentencizer") + else: + sentencizer = spacy.load(pipe, exclude=["ner", "tagger"]) + sentencizer.max_length = max_length + return sentencizer + + +def _split_text_with_regex( + text: str, separator: str, keep_separator: bool +) -> List[str]: + # Now that we have the separator, split the text + if separator: + if keep_separator: + # The parentheses in the pattern keep the delimiters in the result. + _splits = re.split(f"({separator})", text) + splits = [ + _splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2) + ] + if len(_splits) % 2 == 0: + splits += _splits[-1:] + splits = [_splits[0]] + splits + else: + splits = re.split(separator, text) + else: + splits = list(text) + return [s for s in splits if s != ""] + + +class TextSplitter(BaseDocumentTransformer, ABC): + """Interface for splitting text into chunks.""" + + def __init__( + self, + chunk_size: int = 4000, + chunk_overlap: int = 200, + length_function: Callable[[str], int] = len, + keep_separator: bool = False, + add_start_index: bool = False, + strip_whitespace: bool = True, + ) -> None: + """Create a new TextSplitter. + + Args: + chunk_size: Maximum size of chunks to return + chunk_overlap: Overlap in characters between chunks + length_function: Function that measures the length of given chunks + keep_separator: Whether to keep the separator in the chunks + add_start_index: If `True`, includes chunk's start index in metadata + strip_whitespace: If `True`, strips whitespace from the start and end of + every document + """ + if chunk_overlap > chunk_size: + raise ValueError( + f"Got a larger chunk overlap ({chunk_overlap}) than chunk size " + f"({chunk_size}), should be smaller." + ) + self._chunk_size = chunk_size + self._chunk_overlap = chunk_overlap + self._length_function = length_function + self._keep_separator = keep_separator + self._add_start_index = add_start_index + self._strip_whitespace = strip_whitespace + + @abstractmethod + def split_text(self, text: str) -> List[str]: + """Split text into multiple components.""" + + def create_documents( + self, texts: List[str], metadatas: Optional[List[dict]] = None + ) -> List[Document]: + """Create documents from a list of texts.""" + _metadatas = metadatas or [{}] * len(texts) + documents = [] + for i, text in enumerate(texts): + index = 0 + previous_chunk_len = 0 + for chunk in self.split_text(text): + metadata = copy.deepcopy(_metadatas[i]) + if self._add_start_index: + offset = index + previous_chunk_len - self._chunk_overlap + index = text.find(chunk, max(0, offset)) + metadata["start_index"] = index + previous_chunk_len = len(chunk) + new_doc = Document(page_content=chunk, metadata=metadata) + documents.append(new_doc) + return documents + + def split_documents(self, documents: Iterable[Document]) -> List[Document]: + """Split documents.""" + texts, metadatas = [], [] + for doc in documents: + texts.append(doc.page_content) + metadatas.append(doc.metadata) + return self.create_documents(texts, metadatas=metadatas) + + def _join_docs(self, docs: List[str], separator: str) -> Optional[str]: + text = separator.join(docs) + if self._strip_whitespace: + text = text.strip() + if text == "": + return None + else: + return text + + def _merge_splits( + self, splits: Iterable[str], separator: str + ) -> List[str]: + # We now want to combine these smaller pieces into medium size + # chunks to send to the LLM. 
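+        # Pieces are accumulated into current_doc until adding the next one (plus the
+        # separator) would push the running length past _chunk_size; the completed chunk
+        # is then emitted and the window is trimmed from the front so that roughly
+        # _chunk_overlap worth of text (as measured by _length_function) carries over.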
+ separator_len = self._length_function(separator) + + docs = [] + current_doc: List[str] = [] + total = 0 + for d in splits: + _len = self._length_function(d) + if ( + total + _len + (separator_len if len(current_doc) > 0 else 0) + > self._chunk_size + ): + if total > self._chunk_size: + logger.warning( + f"Created a chunk of size {total}, " + f"which is longer than the specified {self._chunk_size}" + ) + if len(current_doc) > 0: + doc = self._join_docs(current_doc, separator) + if doc is not None: + docs.append(doc) + # Keep on popping if: + # - we have a larger chunk than in the chunk overlap + # - or if we still have any chunks and the length is long + while total > self._chunk_overlap or ( + total + + _len + + (separator_len if len(current_doc) > 0 else 0) + > self._chunk_size + and total > 0 + ): + total -= self._length_function(current_doc[0]) + ( + separator_len if len(current_doc) > 1 else 0 + ) + current_doc = current_doc[1:] + current_doc.append(d) + total += _len + (separator_len if len(current_doc) > 1 else 0) + doc = self._join_docs(current_doc, separator) + if doc is not None: + docs.append(doc) + return docs + + @classmethod + def from_huggingface_tokenizer( + cls, tokenizer: Any, **kwargs: Any + ) -> TextSplitter: + """Text splitter that uses HuggingFace tokenizer to count length.""" + try: + from transformers import PreTrainedTokenizerBase + + if not isinstance(tokenizer, PreTrainedTokenizerBase): + raise ValueError( + "Tokenizer received was not an instance of PreTrainedTokenizerBase" + ) + + def _huggingface_tokenizer_length(text: str) -> int: + return len(tokenizer.encode(text)) + + except ImportError: + raise ValueError( + "Could not import transformers python package. " + "Please install it with `pip install transformers`." + ) + return cls(length_function=_huggingface_tokenizer_length, **kwargs) + + @classmethod + def from_tiktoken_encoder( + cls: Type[TS], + encoding_name: str = "gpt2", + model: Optional[str] = None, + allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), + disallowed_special: Union[Literal["all"], Collection[str]] = "all", + **kwargs: Any, + ) -> TS: + """Text splitter that uses tiktoken encoder to count length.""" + try: + import tiktoken + except ImportError: + raise ImportError( + "Could not import tiktoken python package. " + "This is needed in order to calculate max_tokens_for_prompt. " + "Please install it with `pip install tiktoken`." 
+ ) + + if model is not None: + enc = tiktoken.encoding_for_model(model) + else: + enc = tiktoken.get_encoding(encoding_name) + + def _tiktoken_encoder(text: str) -> int: + return len( + enc.encode( + text, + allowed_special=allowed_special, + disallowed_special=disallowed_special, + ) + ) + + if issubclass(cls, TokenTextSplitter): + extra_kwargs = { + "encoding_name": encoding_name, + "model": model, + "allowed_special": allowed_special, + "disallowed_special": disallowed_special, + } + kwargs = {**kwargs, **extra_kwargs} + + return cls(length_function=_tiktoken_encoder, **kwargs) + + def transform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + """Transform sequence of documents by splitting them.""" + return self.split_documents(list(documents)) + + +class CharacterTextSplitter(TextSplitter): + """Splitting text that looks at characters.""" + + def __init__( + self, + separator: str = "\n\n", + is_separator_regex: bool = False, + **kwargs: Any, + ) -> None: + """Create a new TextSplitter.""" + super().__init__(**kwargs) + self._separator = separator + self._is_separator_regex = is_separator_regex + + def split_text(self, text: str) -> List[str]: + """Split incoming text and return chunks.""" + # First we naively split the large input into a bunch of smaller ones. + separator = ( + self._separator + if self._is_separator_regex + else re.escape(self._separator) + ) + splits = _split_text_with_regex(text, separator, self._keep_separator) + _separator = "" if self._keep_separator else self._separator + return self._merge_splits(splits, _separator) + + +class LineType(TypedDict): + """Line type as typed dict.""" + + metadata: Dict[str, str] + content: str + + +class HeaderType(TypedDict): + """Header type as typed dict.""" + + level: int + name: str + data: str + + +class MarkdownHeaderTextSplitter: + """Splitting markdown files based on specified headers.""" + + def __init__( + self, + headers_to_split_on: List[Tuple[str, str]], + return_each_line: bool = False, + strip_headers: bool = True, + ): + """Create a new MarkdownHeaderTextSplitter. 
+ + Args: + headers_to_split_on: Headers we want to track + return_each_line: Return each line w/ associated headers + strip_headers: Strip split headers from the content of the chunk + """ + # Output line-by-line or aggregated into chunks w/ common headers + self.return_each_line = return_each_line + # Given the headers we want to split on, + # (e.g., "#, ##, etc") order by length + self.headers_to_split_on = sorted( + headers_to_split_on, key=lambda split: len(split[0]), reverse=True + ) + # Strip headers split headers from the content of the chunk + self.strip_headers = strip_headers + + def aggregate_lines_to_chunks( + self, lines: List[LineType] + ) -> List[Document]: + """Combine lines with common metadata into chunks + Args: + lines: Line of text / associated header metadata + """ + aggregated_chunks: List[LineType] = [] + + for line in lines: + if ( + aggregated_chunks + and aggregated_chunks[-1]["metadata"] == line["metadata"] + ): + # If the last line in the aggregated list + # has the same metadata as the current line, + # append the current content to the last lines's content + aggregated_chunks[-1]["content"] += " \n" + line["content"] + elif ( + aggregated_chunks + and aggregated_chunks[-1]["metadata"] != line["metadata"] + # may be issues if other metadata is present + and len(aggregated_chunks[-1]["metadata"]) + < len(line["metadata"]) + and aggregated_chunks[-1]["content"].split("\n")[-1][0] == "#" + and not self.strip_headers + ): + # If the last line in the aggregated list + # has different metadata as the current line, + # and has shallower header level than the current line, + # and the last line is a header, + # and we are not stripping headers, + # append the current content to the last line's content + aggregated_chunks[-1]["content"] += " \n" + line["content"] + # and update the last line's metadata + aggregated_chunks[-1]["metadata"] = line["metadata"] + else: + # Otherwise, append the current line to the aggregated list + aggregated_chunks.append(line) + + return [ + Document(page_content=chunk["content"], metadata=chunk["metadata"]) + for chunk in aggregated_chunks + ] + + def split_text(self, text: str) -> List[Document]: + """Split markdown file + Args: + text: Markdown file""" + + # Split the input text by newline character ("\n"). 
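+        # Each line is then scanned in turn: fenced code blocks (``` / ~~~) are passed
+        # through untouched, header lines push/pop header_stack to keep the nested header
+        # metadata current, and ordinary text accumulates into current_content before
+        # being flushed into lines_with_metadata with the active metadata attached.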
+ lines = text.split("\n") + # Final output + lines_with_metadata: List[LineType] = [] + # Content and metadata of the chunk currently being processed + current_content: List[str] = [] + current_metadata: Dict[str, str] = {} + # Keep track of the nested header structure + # header_stack: List[Dict[str, Union[int, str]]] = [] + header_stack: List[HeaderType] = [] + initial_metadata: Dict[str, str] = {} + + in_code_block = False + opening_fence = "" + + for line in lines: + stripped_line = line.strip() + + if not in_code_block: + # Exclude inline code spans + if ( + stripped_line.startswith("```") + and stripped_line.count("```") == 1 + ): + in_code_block = True + opening_fence = "```" + elif stripped_line.startswith("~~~"): + in_code_block = True + opening_fence = "~~~" + else: + if stripped_line.startswith(opening_fence): + in_code_block = False + opening_fence = "" + + if in_code_block: + current_content.append(stripped_line) + continue + + # Check each line against each of the header types (e.g., #, ##) + for sep, name in self.headers_to_split_on: + # Check if line starts with a header that we intend to split on + if stripped_line.startswith(sep) and ( + # Header with no text OR header is followed by space + # Both are valid conditions that sep is being used a header + len(stripped_line) == len(sep) + or stripped_line[len(sep)] == " " + ): + # Ensure we are tracking the header as metadata + if name is not None: + # Get the current header level + current_header_level = sep.count("#") + + # Pop out headers of lower or same level from the stack + while ( + header_stack + and header_stack[-1]["level"] + >= current_header_level + ): + # We have encountered a new header + # at the same or higher level + popped_header = header_stack.pop() + # Clear the metadata for the + # popped header in initial_metadata + if popped_header["name"] in initial_metadata: + initial_metadata.pop(popped_header["name"]) + + # Push the current header to the stack + header: HeaderType = { + "level": current_header_level, + "name": name, + "data": stripped_line[len(sep) :].strip(), + } + header_stack.append(header) + # Update initial_metadata with the current header + initial_metadata[name] = header["data"] + + # Add the previous line to the lines_with_metadata + # only if current_content is not empty + if current_content: + lines_with_metadata.append( + { + "content": "\n".join(current_content), + "metadata": current_metadata.copy(), + } + ) + current_content.clear() + + if not self.strip_headers: + current_content.append(stripped_line) + + break + else: + if stripped_line: + current_content.append(stripped_line) + elif current_content: + lines_with_metadata.append( + { + "content": "\n".join(current_content), + "metadata": current_metadata.copy(), + } + ) + current_content.clear() + + current_metadata = initial_metadata.copy() + + if current_content: + lines_with_metadata.append( + { + "content": "\n".join(current_content), + "metadata": current_metadata, + } + ) + + # lines_with_metadata has each line with associated header metadata + # aggregate these into chunks based on common metadata + if not self.return_each_line: + return self.aggregate_lines_to_chunks(lines_with_metadata) + else: + return [ + Document( + page_content=chunk["content"], metadata=chunk["metadata"] + ) + for chunk in lines_with_metadata + ] + + +class ElementType(TypedDict): + """Element type as typed dict.""" + + url: str + xpath: str + content: str + metadata: Dict[str, str] + + +class HTMLHeaderTextSplitter: + """ + Splitting HTML files based 
on specified headers. + Requires lxml package. + """ + + def __init__( + self, + headers_to_split_on: List[Tuple[str, str]], + return_each_element: bool = False, + ): + """Create a new HTMLHeaderTextSplitter. + + Args: + headers_to_split_on: list of tuples of headers we want to track mapped to + (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4, + h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2)]. + return_each_element: Return each element w/ associated headers. + """ + # Output element-by-element or aggregated into chunks w/ common headers + self.return_each_element = return_each_element + self.headers_to_split_on = sorted(headers_to_split_on) + + def aggregate_elements_to_chunks( + self, elements: List[ElementType] + ) -> List[Document]: + """Combine elements with common metadata into chunks + + Args: + elements: HTML element content with associated identifying info and metadata + """ + aggregated_chunks: List[ElementType] = [] + + for element in elements: + if ( + aggregated_chunks + and aggregated_chunks[-1]["metadata"] == element["metadata"] + ): + # If the last element in the aggregated list + # has the same metadata as the current element, + # append the current content to the last element's content + aggregated_chunks[-1]["content"] += " \n" + element["content"] + else: + # Otherwise, append the current element to the aggregated list + aggregated_chunks.append(element) + + return [ + Document(page_content=chunk["content"], metadata=chunk["metadata"]) + for chunk in aggregated_chunks + ] + + def split_text_from_url(self, url: str) -> List[Document]: + """Split HTML from web URL + + Args: + url: web URL + """ + r = requests.get(url) + return self.split_text_from_file(BytesIO(r.content)) + + def split_text(self, text: str) -> List[Document]: + """Split HTML text string + + Args: + text: HTML text + """ + return self.split_text_from_file(StringIO(text)) + + def split_text_from_file(self, file: Any) -> List[Document]: + """Split HTML file + + Args: + file: HTML file + """ + try: + from lxml import etree + except ImportError as e: + raise ImportError( + "Unable to import lxml, please install with `pip install lxml`." + ) from e + # use lxml library to parse html document and return xml ElementTree + # Explicitly encoding in utf-8 allows non-English + # html files to be processed without garbled characters + parser = etree.HTMLParser(encoding="utf-8") + tree = etree.parse(file, parser) + + # document transformation for "structure-aware" chunking is handled with xsl. + # see comments in html_chunks_with_headers.xslt for more detailed information. 
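+        # The stylesheet path below is resolved relative to this module; the XSLT
+        # transform rewrites the parsed HTML into "chunk" / "headers" / "xpath" elements,
+        # which the loop further down collects into ElementType records.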
+ xslt_path = ( + pathlib.Path(__file__).parent + / "document_transformers/xsl/html_chunks_with_headers.xslt" + ) + xslt_tree = etree.parse(xslt_path) + transform = etree.XSLT(xslt_tree) + result = transform(tree) + result_dom = etree.fromstring(str(result)) + + # create filter and mapping for header metadata + header_filter = [header[0] for header in self.headers_to_split_on] + header_mapping = dict(self.headers_to_split_on) + + # map xhtml namespace prefix + ns_map = {"h": "http://www.w3.org/1999/xhtml"} + + # build list of elements from DOM + elements = [] + for element in result_dom.findall("*//*", ns_map): + if element.findall("*[@class='headers']") or element.findall( + "*[@class='chunk']" + ): + elements.append( + ElementType( + url=file, + xpath="".join( + [ + node.text + for node in element.findall( + "*[@class='xpath']", ns_map + ) + ] + ), + content="".join( + [ + node.text + for node in element.findall( + "*[@class='chunk']", ns_map + ) + ] + ), + metadata={ + # Add text of specified headers to metadata using header + # mapping. + header_mapping[node.tag]: node.text + for node in filter( + lambda x: x.tag in header_filter, + element.findall( + "*[@class='headers']/*", ns_map + ), + ) + }, + ) + ) + + if not self.return_each_element: + return self.aggregate_elements_to_chunks(elements) + else: + return [ + Document( + page_content=chunk["content"], metadata=chunk["metadata"] + ) + for chunk in elements + ] + + +# should be in newer Python versions (3.10+) +# @dataclass(frozen=True, kw_only=True, slots=True) +@dataclass(frozen=True) +class Tokenizer: + """Tokenizer data class.""" + + chunk_overlap: int + """Overlap in tokens between chunks""" + tokens_per_chunk: int + """Maximum number of tokens per chunk""" + decode: Callable[[List[int]], str] + """ Function to decode a list of token ids to a string""" + encode: Callable[[str], List[int]] + """ Function to encode a string to a list of token ids""" + + +def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]: + """Split incoming text and return chunks using tokenizer.""" + splits: List[str] = [] + input_ids = tokenizer.encode(text) + start_idx = 0 + cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) + chunk_ids = input_ids[start_idx:cur_idx] + while start_idx < len(input_ids): + splits.append(tokenizer.decode(chunk_ids)) + if cur_idx == len(input_ids): + break + start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap + cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) + chunk_ids = input_ids[start_idx:cur_idx] + return splits + + +class TokenTextSplitter(TextSplitter): + """Splitting text to tokens using model tokenizer.""" + + def __init__( + self, + encoding_name: str = "gpt2", + model: Optional[str] = None, + allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), + disallowed_special: Union[Literal["all"], Collection[str]] = "all", + **kwargs: Any, + ) -> None: + """Create a new TextSplitter.""" + super().__init__(**kwargs) + try: + import tiktoken + except ImportError: + raise ImportError( + "Could not import tiktoken python package. " + "This is needed in order to for TokenTextSplitter. " + "Please install it with `pip install tiktoken`." 
+ ) + + if model is not None: + enc = tiktoken.encoding_for_model(model) + else: + enc = tiktoken.get_encoding(encoding_name) + self._tokenizer = enc + self._allowed_special = allowed_special + self._disallowed_special = disallowed_special + + def split_text(self, text: str) -> List[str]: + def _encode(_text: str) -> List[int]: + return self._tokenizer.encode( + _text, + allowed_special=self._allowed_special, + disallowed_special=self._disallowed_special, + ) + + tokenizer = Tokenizer( + chunk_overlap=self._chunk_overlap, + tokens_per_chunk=self._chunk_size, + decode=self._tokenizer.decode, + encode=_encode, + ) + + return split_text_on_tokens(text=text, tokenizer=tokenizer) + + +class SentenceTransformersTokenTextSplitter(TextSplitter): + """Splitting text to tokens using sentence model tokenizer.""" + + def __init__( + self, + chunk_overlap: int = 50, + model: str = "sentence-transformers/all-mpnet-base-v2", + tokens_per_chunk: Optional[int] = None, + **kwargs: Any, + ) -> None: + """Create a new TextSplitter.""" + super().__init__(**kwargs, chunk_overlap=chunk_overlap) + + try: + from sentence_transformers import SentenceTransformer + except ImportError: + raise ImportError( + "Could not import sentence_transformer python package. " + "This is needed in order to for SentenceTransformersTokenTextSplitter. " + "Please install it with `pip install sentence-transformers`." + ) + + self.model = model + self._model = SentenceTransformer(self.model, trust_remote_code=True) + self.tokenizer = self._model.tokenizer + self._initialize_chunk_configuration(tokens_per_chunk=tokens_per_chunk) + + def _initialize_chunk_configuration( + self, *, tokens_per_chunk: Optional[int] + ) -> None: + self.maximum_tokens_per_chunk = cast(int, self._model.max_seq_length) + + if tokens_per_chunk is None: + self.tokens_per_chunk = self.maximum_tokens_per_chunk + else: + self.tokens_per_chunk = tokens_per_chunk + + if self.tokens_per_chunk > self.maximum_tokens_per_chunk: + raise ValueError( + f"The token limit of the models '{self.model}'" + f" is: {self.maximum_tokens_per_chunk}." + f" Argument tokens_per_chunk={self.tokens_per_chunk}" + f" > maximum token limit." + ) + + def split_text(self, text: str) -> List[str]: + def encode_strip_start_and_stop_token_ids(text: str) -> List[int]: + return self._encode(text)[1:-1] + + tokenizer = Tokenizer( + chunk_overlap=self._chunk_overlap, + tokens_per_chunk=self.tokens_per_chunk, + decode=self.tokenizer.decode, + encode=encode_strip_start_and_stop_token_ids, + ) + + return split_text_on_tokens(text=text, tokenizer=tokenizer) + + def count_tokens(self, *, text: str) -> int: + return len(self._encode(text)) + + _max_length_equal_32_bit_integer: int = 2**32 + + def _encode(self, text: str) -> List[int]: + token_ids_with_start_and_end_token_ids = self.tokenizer.encode( + text, + max_length=self._max_length_equal_32_bit_integer, + truncation="do_not_truncate", + ) + return token_ids_with_start_and_end_token_ids + + +class Language(str, Enum): + """Enum of the programming languages.""" + + CPP = "cpp" + GO = "go" + JAVA = "java" + KOTLIN = "kotlin" + JS = "js" + TS = "ts" + PHP = "php" + PROTO = "proto" + PYTHON = "python" + RST = "rst" + RUBY = "ruby" + RUST = "rust" + SCALA = "scala" + SWIFT = "swift" + MARKDOWN = "markdown" + LATEX = "latex" + HTML = "html" + SOL = "sol" + CSHARP = "csharp" + COBOL = "cobol" + C = "c" + LUA = "lua" + PERL = "perl" + + +class RecursiveCharacterTextSplitter(TextSplitter): + """Splitting text by recursively look at characters. 
+ + Recursively tries to split by different characters to find one + that works. + """ + + def __init__( + self, + separators: Optional[List[str]] = None, + keep_separator: bool = True, + is_separator_regex: bool = False, + **kwargs: Any, + ) -> None: + """Create a new TextSplitter.""" + super().__init__(keep_separator=keep_separator, **kwargs) + self._separators = separators or ["\n\n", "\n", " ", ""] + self._is_separator_regex = is_separator_regex + + def _split_text(self, text: str, separators: List[str]) -> List[str]: + """Split incoming text and return chunks.""" + final_chunks = [] + # Get appropriate separator to use + separator = separators[-1] + new_separators = [] + for i, _s in enumerate(separators): + _separator = _s if self._is_separator_regex else re.escape(_s) + if _s == "": + separator = _s + break + if re.search(_separator, text): + separator = _s + new_separators = separators[i + 1 :] + break + + _separator = ( + separator if self._is_separator_regex else re.escape(separator) + ) + splits = _split_text_with_regex(text, _separator, self._keep_separator) + + # Now go merging things, recursively splitting longer texts. + _good_splits = [] + _separator = "" if self._keep_separator else separator + for s in splits: + if self._length_function(s) < self._chunk_size: + _good_splits.append(s) + else: + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + _good_splits = [] + if not new_separators: + final_chunks.append(s) + else: + other_info = self._split_text(s, new_separators) + final_chunks.extend(other_info) + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + return final_chunks + + def split_text(self, text: str) -> List[str]: + return self._split_text(text, self._separators) + + @classmethod + def from_language( + cls, language: Language, **kwargs: Any + ) -> RecursiveCharacterTextSplitter: + separators = cls.get_separators_for_language(language) + return cls(separators=separators, is_separator_regex=True, **kwargs) + + @staticmethod + def get_separators_for_language(language: Language) -> List[str]: + if language == Language.CPP: + return [ + # Split along class definitions + "\nclass ", + # Split along function definitions + "\nvoid ", + "\nint ", + "\nfloat ", + "\ndouble ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nswitch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.GO: + return [ + # Split along function definitions + "\nfunc ", + "\nvar ", + "\nconst ", + "\ntype ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nswitch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.JAVA: + return [ + # Split along class definitions + "\nclass ", + # Split along method definitions + "\npublic ", + "\nprotected ", + "\nprivate ", + "\nstatic ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nswitch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.KOTLIN: + return [ + # Split along class definitions + "\nclass ", + # Split along method definitions + "\npublic ", + "\nprotected ", + "\nprivate ", + "\ninternal ", + "\ncompanion ", + "\nfun ", + "\nval ", + "\nvar ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nwhen ", + "\ncase 
", + "\nelse ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.JS: + return [ + # Split along function definitions + "\nfunction ", + "\nconst ", + "\nlet ", + "\nvar ", + "\nclass ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nswitch ", + "\ncase ", + "\ndefault ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.TS: + return [ + "\nenum ", + "\ninterface ", + "\nnamespace ", + "\ntype ", + # Split along class definitions + "\nclass ", + # Split along function definitions + "\nfunction ", + "\nconst ", + "\nlet ", + "\nvar ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nswitch ", + "\ncase ", + "\ndefault ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.PHP: + return [ + # Split along function definitions + "\nfunction ", + # Split along class definitions + "\nclass ", + # Split along control flow statements + "\nif ", + "\nforeach ", + "\nwhile ", + "\ndo ", + "\nswitch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.PROTO: + return [ + # Split along message definitions + "\nmessage ", + # Split along service definitions + "\nservice ", + # Split along enum definitions + "\nenum ", + # Split along option definitions + "\noption ", + # Split along import statements + "\nimport ", + # Split along syntax declarations + "\nsyntax ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.PYTHON: + return [ + # First, try to split along class definitions + "\nclass ", + "\ndef ", + "\n\tdef ", + # Now split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.RST: + return [ + # Split along section titles + "\n=+\n", + "\n-+\n", + "\n\\*+\n", + # Split along directive markers + "\n\n.. 
*\n\n", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.RUBY: + return [ + # Split along method definitions + "\ndef ", + "\nclass ", + # Split along control flow statements + "\nif ", + "\nunless ", + "\nwhile ", + "\nfor ", + "\ndo ", + "\nbegin ", + "\nrescue ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.RUST: + return [ + # Split along function definitions + "\nfn ", + "\nconst ", + "\nlet ", + # Split along control flow statements + "\nif ", + "\nwhile ", + "\nfor ", + "\nloop ", + "\nmatch ", + "\nconst ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.SCALA: + return [ + # Split along class definitions + "\nclass ", + "\nobject ", + # Split along method definitions + "\ndef ", + "\nval ", + "\nvar ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nmatch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.SWIFT: + return [ + # Split along function definitions + "\nfunc ", + # Split along class definitions + "\nclass ", + "\nstruct ", + "\nenum ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\ndo ", + "\nswitch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.MARKDOWN: + return [ + # First, try to split along Markdown headings (starting with level 2) + "\n#{1,6} ", + # Note the alternative syntax for headings (below) is not handled here + # Heading level 2 + # --------------- + # End of code block + "```\n", + # Horizontal lines + "\n\\*\\*\\*+\n", + "\n---+\n", + "\n___+\n", + # Note that this splitter doesn't handle horizontal lines defined + # by *three or more* of ***, ---, or ___, but this is not handled + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.LATEX: + return [ + # First, try to split along Latex sections + "\n\\\\chapter{", + "\n\\\\section{", + "\n\\\\subsection{", + "\n\\\\subsubsection{", + # Now split by environments + "\n\\\\begin{enumerate}", + "\n\\\\begin{itemize}", + "\n\\\\begin{description}", + "\n\\\\begin{list}", + "\n\\\\begin{quote}", + "\n\\\\begin{quotation}", + "\n\\\\begin{verse}", + "\n\\\\begin{verbatim}", + # Now split by math environments + "\n\\\begin{align}", + "$$", + "$", + # Now split by the normal type of lines + " ", + "", + ] + elif language == Language.HTML: + return [ + # First, try to split along HTML tags + "<body", + "<div", + "<p", + "<br", + "<li", + "<h1", + "<h2", + "<h3", + "<h4", + "<h5", + "<h6", + "<span", + "<table", + "<tr", + "<td", + "<th", + "<ul", + "<ol", + "<header", + "<footer", + "<nav", + # Head + "<head", + "<style", + "<script", + "<meta", + "<title", + "", + ] + elif language == Language.CSHARP: + return [ + "\ninterface ", + "\nenum ", + "\nimplements ", + "\ndelegate ", + "\nevent ", + # Split along class definitions + "\nclass ", + "\nabstract ", + # Split along method definitions + "\npublic ", + "\nprotected ", + "\nprivate ", + "\nstatic ", + "\nreturn ", + # Split along control flow statements + "\nif ", + "\ncontinue ", + "\nfor ", + "\nforeach ", + "\nwhile ", + "\nswitch ", + "\nbreak ", + "\ncase ", + "\nelse ", + # Split by exceptions + "\ntry ", + "\nthrow ", + "\nfinally ", + "\ncatch ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.SOL: + return [ + # Split along 
compiler information definitions + "\npragma ", + "\nusing ", + # Split along contract definitions + "\ncontract ", + "\ninterface ", + "\nlibrary ", + # Split along method definitions + "\nconstructor ", + "\ntype ", + "\nfunction ", + "\nevent ", + "\nmodifier ", + "\nerror ", + "\nstruct ", + "\nenum ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\ndo while ", + "\nassembly ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.COBOL: + return [ + # Split along divisions + "\nIDENTIFICATION DIVISION.", + "\nENVIRONMENT DIVISION.", + "\nDATA DIVISION.", + "\nPROCEDURE DIVISION.", + # Split along sections within DATA DIVISION + "\nWORKING-STORAGE SECTION.", + "\nLINKAGE SECTION.", + "\nFILE SECTION.", + # Split along sections within PROCEDURE DIVISION + "\nINPUT-OUTPUT SECTION.", + # Split along paragraphs and common statements + "\nOPEN ", + "\nCLOSE ", + "\nREAD ", + "\nWRITE ", + "\nIF ", + "\nELSE ", + "\nMOVE ", + "\nPERFORM ", + "\nUNTIL ", + "\nVARYING ", + "\nACCEPT ", + "\nDISPLAY ", + "\nSTOP RUN.", + # Split by the normal type of lines + "\n", + " ", + "", + ] + + else: + raise ValueError( + f"Language {language} is not supported! " + f"Please choose from {list(Language)}" + ) + + +class NLTKTextSplitter(TextSplitter): + """Splitting text using NLTK package.""" + + def __init__( + self, separator: str = "\n\n", language: str = "english", **kwargs: Any + ) -> None: + """Initialize the NLTK splitter.""" + super().__init__(**kwargs) + try: + from nltk.tokenize import sent_tokenize + + self._tokenizer = sent_tokenize + except ImportError: + raise ImportError( + "NLTK is not installed, please install it with `pip install nltk`." + ) + self._separator = separator + self._language = language + + def split_text(self, text: str) -> List[str]: + """Split incoming text and return chunks.""" + # First we naively split the large input into a bunch of smaller ones. + splits = self._tokenizer(text, language=self._language) + return self._merge_splits(splits, self._separator) + + +class SpacyTextSplitter(TextSplitter): + """Splitting text using Spacy package. + + + Per default, Spacy's `en_core_web_sm` model is used and + its default max_length is 1000000 (it is the length of maximum character + this model takes which can be increased for large files). For a faster, but + potentially less accurate splitting, you can use `pipe='sentencizer'`. + """ + + def __init__( + self, + separator: str = "\n\n", + pipe: str = "en_core_web_sm", + max_length: int = 1_000_000, + **kwargs: Any, + ) -> None: + """Initialize the spacy text splitter.""" + super().__init__(**kwargs) + self._tokenizer = _make_spacy_pipe_for_splitting( + pipe, max_length=max_length + ) + self._separator = separator + + def split_text(self, text: str) -> List[str]: + """Split incoming text and return chunks.""" + splits = (s.text for s in self._tokenizer(text).sents) + return self._merge_splits(splits, self._separator) + + +class KonlpyTextSplitter(TextSplitter): + """Splitting text using Konlpy package. + + It is good for splitting Korean text. 
+ """ + + def __init__( + self, + separator: str = "\n\n", + **kwargs: Any, + ) -> None: + """Initialize the Konlpy text splitter.""" + super().__init__(**kwargs) + self._separator = separator + try: + from konlpy.tag import Kkma + except ImportError: + raise ImportError( + """ + Konlpy is not installed, please install it with + `pip install konlpy` + """ + ) + self.kkma = Kkma() + + def split_text(self, text: str) -> List[str]: + """Split incoming text and return chunks.""" + splits = self.kkma.sentences(text) + return self._merge_splits(splits, self._separator) + + +# For backwards compatibility +class PythonCodeTextSplitter(RecursiveCharacterTextSplitter): + """Attempts to split the text along Python syntax.""" + + def __init__(self, **kwargs: Any) -> None: + """Initialize a PythonCodeTextSplitter.""" + separators = self.get_separators_for_language(Language.PYTHON) + super().__init__(separators=separators, **kwargs) + + +class MarkdownTextSplitter(RecursiveCharacterTextSplitter): + """Attempts to split the text along Markdown-formatted headings.""" + + def __init__(self, **kwargs: Any) -> None: + """Initialize a MarkdownTextSplitter.""" + separators = self.get_separators_for_language(Language.MARKDOWN) + super().__init__(separators=separators, **kwargs) + + +class LatexTextSplitter(RecursiveCharacterTextSplitter): + """Attempts to split the text along Latex-formatted layout elements.""" + + def __init__(self, **kwargs: Any) -> None: + """Initialize a LatexTextSplitter.""" + separators = self.get_separators_for_language(Language.LATEX) + super().__init__(separators=separators, **kwargs) + + +class RecursiveJsonSplitter: + def __init__( + self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None + ): + super().__init__() + self.max_chunk_size = max_chunk_size + self.min_chunk_size = ( + min_chunk_size + if min_chunk_size is not None + else max(max_chunk_size - 200, 50) + ) + + @staticmethod + def _json_size(data: Dict) -> int: + """Calculate the size of the serialized JSON object.""" + return len(json.dumps(data)) + + @staticmethod + def _set_nested_dict(d: Dict, path: List[str], value: Any) -> None: + """Set a value in a nested dictionary based on the given path.""" + for key in path[:-1]: + d = d.setdefault(key, {}) + d[path[-1]] = value + + def _list_to_dict_preprocessing(self, data: Any) -> Any: + if isinstance(data, dict): + # Process each key-value pair in the dictionary + return { + k: self._list_to_dict_preprocessing(v) for k, v in data.items() + } + elif isinstance(data, list): + # Convert the list to a dictionary with index-based keys + return { + str(i): self._list_to_dict_preprocessing(item) + for i, item in enumerate(data) + } + else: + # Base case: the item is neither a dict nor a list, so return it unchanged + return data + + def _json_split( + self, + data: Dict[str, Any], + current_path: List[str] = [], + chunks: List[Dict] = [{}], + ) -> List[Dict]: + """ + Split json into maximum size dictionaries while preserving structure. 
+ """ + if isinstance(data, dict): + for key, value in data.items(): + new_path = current_path + [key] + chunk_size = self._json_size(chunks[-1]) + size = self._json_size({key: value}) + remaining = self.max_chunk_size - chunk_size + + if size < remaining: + # Add item to current chunk + self._set_nested_dict(chunks[-1], new_path, value) + else: + if chunk_size >= self.min_chunk_size: + # Chunk is big enough, start a new chunk + chunks.append({}) + + # Iterate + self._json_split(value, new_path, chunks) + else: + # handle single item + self._set_nested_dict(chunks[-1], current_path, data) + return chunks + + def split_json( + self, + json_data: Dict[str, Any], + convert_lists: bool = False, + ) -> List[Dict]: + """Splits JSON into a list of JSON chunks""" + + if convert_lists: + chunks = self._json_split( + self._list_to_dict_preprocessing(json_data) + ) + else: + chunks = self._json_split(json_data) + + # Remove the last chunk if it's empty + if not chunks[-1]: + chunks.pop() + return chunks + + def split_text( + self, json_data: Dict[str, Any], convert_lists: bool = False + ) -> List[str]: + """Splits JSON into a list of JSON formatted strings""" + + chunks = self.split_json( + json_data=json_data, convert_lists=convert_lists + ) + + # Convert to string + return [json.dumps(chunk) for chunk in chunks] + + def create_documents( + self, + texts: List[Dict], + convert_lists: bool = False, + metadatas: Optional[List[dict]] = None, + ) -> List[Document]: + """Create documents from a list of json objects (Dict).""" + _metadatas = metadatas or [{}] * len(texts) + documents = [] + for i, text in enumerate(texts): + for chunk in self.split_text( + json_data=text, convert_lists=convert_lists + ): + metadata = copy.deepcopy(_metadatas[i]) + new_doc = Document(page_content=chunk, metadata=metadata) + documents.append(new_doc) + return documents diff --git a/R2R/r2r/cli/__init__.py b/R2R/r2r/cli/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/cli/__init__.py diff --git a/R2R/r2r/cli/cli.py b/R2R/r2r/cli/cli.py new file mode 100755 index 00000000..4ef38b1e --- /dev/null +++ b/R2R/r2r/cli/cli.py @@ -0,0 +1,592 @@ +import json +import os +import subprocess +import time +import uuid + +import click +from dotenv import load_dotenv + +from r2r.main.execution import R2RExecutionWrapper + + +class JsonParamType(click.ParamType): + name = "json" + + def convert(self, value, param, ctx): + try: + return json.loads(value) + except json.JSONDecodeError: + self.fail(f"'{value}' is not a valid JSON string", param, ctx) + + +JSON = JsonParamType() + + +@click.group() +@click.option( + "--config-path", default=None, help="Path to the configuration file" +) +@click.option( + "--config-name", default=None, help="Name of the configuration to use" +) +@click.option("--client-mode", default=True, help="Run in client mode") +@click.option( + "--base-url", + default="http://localhost:8000", + help="Base URL for client mode", +) +@click.pass_context +def cli(ctx, config_path, config_name, client_mode, base_url): + """R2R CLI for all core operations.""" + if config_path and config_name: + raise click.UsageError( + "Cannot specify both config_path and config_name" + ) + + # Convert relative config path to absolute path + if config_path: + config_path = os.path.abspath(config_path) + + if ctx.invoked_subcommand != "serve": + ctx.obj = R2RExecutionWrapper( + config_path, + config_name, + client_mode if ctx.invoked_subcommand != "serve" else False, + base_url, + ) + else: + ctx.obj = { + 
"config_path": config_path, + "config_name": config_name, + "base_url": base_url, + } + + +@cli.command() +@click.option("--host", default="0.0.0.0", help="Host to run the server on") +@click.option("--port", default=8000, help="Port to run the server on") +@click.option("--docker", is_flag=True, help="Run using Docker") +@click.option( + "--docker-ext-neo4j", + is_flag=True, + help="Run using Docker with external Neo4j", +) +@click.option("--project-name", default="r2r", help="Project name for Docker") +@click.pass_obj +def serve(obj, host, port, docker, docker_ext_neo4j, project_name): + """Start the R2R server.""" + # Load environment variables from .env file if it exists + load_dotenv() + + if docker: + if x := obj.get("config_path", None): + os.environ["CONFIG_PATH"] = x + else: + os.environ["CONFIG_NAME"] = ( + obj.get("config_name", None) or "default" + ) + + os.environ["OLLAMA_API_BASE"] = "http://host.docker.internal:11434" + # Check if compose files exist in the package directory + package_dir = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "..", ".." + ) + compose_yaml = os.path.join(package_dir, "compose.yaml") + compose_neo4j_yaml = os.path.join(package_dir, "compose.neo4j.yaml") + + if not os.path.exists(compose_yaml) or not os.path.exists( + compose_neo4j_yaml + ): + click.echo( + "Error: Docker Compose files not found in the package directory." + ) + return + + # Build the docker-compose command with the specified host and port + docker_command = f"docker-compose -f {compose_yaml}" + if docker_ext_neo4j: + docker_command += f" -f {compose_neo4j_yaml}" + if host != "0.0.0.0" or port != 8000: + docker_command += ( + f" --build-arg HOST={host} --build-arg PORT={port}" + ) + + docker_command += f" --project-name {project_name}" + + docker_command += " up -d" + os.system(docker_command) + else: + wrapper = R2RExecutionWrapper(**obj, client_mode=False) + wrapper.serve(host, port) + + +@cli.command() +@click.option( + "--volumes", + is_flag=True, + help="Remove named volumes declared in the `volumes` section of the Compose file", +) +@click.option( + "--remove-orphans", + is_flag=True, + help="Remove containers for services not defined in the Compose file", +) +@click.option("--project-name", default="r2r", help="Project name for Docker") +@click.pass_context +def docker_down(ctx, volumes, remove_orphans, project_name): + """Bring down the Docker Compose setup and attempt to remove the network if necessary.""" + package_dir = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "..", ".." + ) + compose_yaml = os.path.join(package_dir, "compose.yaml") + compose_neo4j_yaml = os.path.join(package_dir, "compose.neo4j.yaml") + + if not os.path.exists(compose_yaml) or not os.path.exists( + compose_neo4j_yaml + ): + click.echo( + "Error: Docker Compose files not found in the package directory." + ) + return + + docker_command = ( + f"docker-compose -f {compose_yaml} -f {compose_neo4j_yaml}" + ) + docker_command += f" --project-name {project_name}" + + if volumes: + docker_command += " --volumes" + + if remove_orphans: + docker_command += " --remove-orphans" + + docker_command += " down" + + click.echo("Bringing down Docker Compose setup...") + result = os.system(docker_command) + + if result != 0: + click.echo( + "An error occurred while bringing down the Docker Compose setup. Attempting to remove the network..." 
+ ) + + # Get the list of networks + networks = ( + subprocess.check_output( + ["docker", "network", "ls", "--format", "{{.Name}}"] + ) + .decode() + .split() + ) + + # Find the r2r network + r2r_network = next( + ( + network + for network in networks + if network.startswith("r2r_") and "network" in network + ), + None, + ) + + if r2r_network: + # Try to remove the network + for _ in range(1): # Try 1 extra times + remove_command = f"docker network rm {r2r_network}" + remove_result = os.system(remove_command) + + if remove_result == 0: + click.echo(f"Successfully removed network: {r2r_network}") + return + else: + click.echo( + f"Failed to remove network: {r2r_network}. Retrying in 5 seconds..." + ) + time.sleep(5) + + click.echo( + "Failed to remove the network after multiple attempts. Please try the following steps:" + ) + click.echo( + "1. Run 'docker ps' to check for any running containers using this network." + ) + click.echo( + "2. Stop any running containers with 'docker stop <container_id>'." + ) + click.echo( + f"3. Try removing the network manually with 'docker network rm {r2r_network}'." + ) + click.echo( + "4. If the above steps don't work, you may need to restart the Docker daemon." + ) + else: + click.echo("Could not find the r2r network to remove.") + else: + click.echo("Docker Compose setup has been successfully brought down.") + + +@cli.command() +@click.argument("file-paths", nargs=-1) +@click.option( + "--document-ids", multiple=True, help="Document IDs for ingestion" +) +@click.option("--metadatas", multiple=True, help="Metadatas for ingestion") +@click.option( + "--versions", + multiple=True, + help="Starting version for ingested files (e.g. `v1`)", +) +@click.pass_obj +def ingest_files(obj, file_paths, document_ids, metadatas, versions): + """Ingest files into R2R.""" + + t0 = time.time() + + # Default to None if empty tuples are provided + document_ids = None if not document_ids else list(document_ids) + metadatas = None if not metadatas else list(metadatas) + versions = None if not versions else list(versions) + + response = obj.ingest_files( + list(file_paths), document_ids, metadatas, versions + ) + t1 = time.time() + click.echo(f"Time taken to ingest files: {t1 - t0:.2f} seconds") + click.echo(response) + + +@cli.command() +@click.argument("file-paths", nargs=-1) +@click.option( + "--document-ids", multiple=True, help="Document IDs for ingestion" +) +@click.option("--metadatas", multiple=True, help="Metadatas for ingestion") +@click.pass_obj +def update_files(obj, file_paths, document_ids, metadatas): + """Ingest files into R2R.""" + t0 = time.time() + + # Default to None if empty tuples are provided + metadatas = None if not metadatas else list(metadatas) + + response = obj.update_files( + list(file_paths), list(document_ids), metadatas + ) + t1 = time.time() + click.echo(f"Time taken to ingest files: {t1 - t0:.2f} seconds") + click.echo(response) + + +@cli.command() +@click.option( + "--query", prompt="Enter your search query", help="The search query" +) +@click.option( + "--use-vector-search", is_flag=True, default=True, help="Use vector search" +) +@click.option( + "--search-filters", type=JsonParamType(), help="Search filters as JSON" +) +@click.option( + "--search-limit", default=10, help="Number of search results to return" +) +@click.option("--do-hybrid-search", is_flag=True, help="Perform hybrid search") +@click.option( + "--use-kg-search", is_flag=True, help="Use knowledge graph search" +) +@click.option("--kg-agent-model", default=None, help="Model for 
KG agent") +@click.pass_obj +def search( + obj, + query, + use_vector_search, + search_filters, + search_limit, + do_hybrid_search, + use_kg_search, + kg_agent_model, +): + """Perform a search query.""" + kg_agent_generation_config = {} + if kg_agent_model: + kg_agent_generation_config["model"] = kg_agent_model + + t0 = time.time() + + results = obj.search( + query, + use_vector_search, + search_filters, + search_limit, + do_hybrid_search, + use_kg_search, + kg_agent_generation_config, + ) + + if isinstance(results, dict) and "results" in results: + results = results["results"] + + if "vector_search_results" in results: + click.echo("Vector search results:") + for result in results["vector_search_results"]: + click.echo(result) + if "kg_search_results" in results and results["kg_search_results"]: + click.echo("KG search results:", results["kg_search_results"]) + + t1 = time.time() + click.echo(f"Time taken to search: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.option("--query", prompt="Enter your query", help="The query for RAG") +@click.option( + "--use-vector-search", is_flag=True, default=True, help="Use vector search" +) +@click.option( + "--search-filters", type=JsonParamType(), help="Search filters as JSON" +) +@click.option( + "--search-limit", default=10, help="Number of search results to return" +) +@click.option("--do-hybrid-search", is_flag=True, help="Perform hybrid search") +@click.option( + "--use-kg-search", is_flag=True, help="Use knowledge graph search" +) +@click.option("--kg-agent-model", default=None, help="Model for KG agent") +@click.option("--stream", is_flag=True, help="Stream the RAG response") +@click.option("--rag-model", default=None, help="Model for RAG") +@click.pass_obj +def rag( + obj, + query, + use_vector_search, + search_filters, + search_limit, + do_hybrid_search, + use_kg_search, + kg_agent_model, + stream, + rag_model, +): + """Perform a RAG query.""" + kg_agent_generation_config = {} + if kg_agent_model: + kg_agent_generation_config = {"model": kg_agent_model} + rag_generation_config = {"stream": stream} + if rag_model: + rag_generation_config["model"] = rag_model + t0 = time.time() + + response = obj.rag( + query, + use_vector_search, + search_filters, + search_limit, + do_hybrid_search, + use_kg_search, + kg_agent_generation_config, + stream, + rag_generation_config, + ) + if stream: + for chunk in response: + click.echo(chunk, nl=False) + click.echo() + else: + if obj.client_mode: + click.echo(f"Search Results:\n{response['search_results']}") + click.echo(f"Completion:\n{response['completion']}") + else: + click.echo(f"Search Results:\n{response.search_results}") + click.echo(f"Completion:\n{response.completion}") + + t1 = time.time() + click.echo(f"Time taken for RAG: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.option("--keys", multiple=True, help="Keys for deletion") +@click.option("--values", multiple=True, help="Values for deletion") +@click.pass_obj +def delete(obj, keys, values): + """Delete documents based on keys and values.""" + if len(keys) != len(values): + raise click.UsageError("Number of keys must match number of values") + + t0 = time.time() + response = obj.delete(list(keys), list(values)) + t1 = time.time() + + click.echo(response) + click.echo(f"Time taken for deletion: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.option("--log-type-filter", help="Filter for log types") +@click.pass_obj +def logs(obj, log_type_filter): + """Retrieve logs with optional type filter.""" + t0 = time.time() + response = 
obj.logs(log_type_filter) + t1 = time.time() + + click.echo(response) + click.echo(f"Time taken to retrieve logs: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.option("--document-ids", multiple=True, help="Document IDs to overview") +@click.option("--user-ids", multiple=True, help="User IDs to overview") +@click.pass_obj +def documents_overview(obj, document_ids, user_ids): + """Get an overview of documents.""" + document_ids = list(document_ids) if document_ids else None + user_ids = list(user_ids) if user_ids else None + + t0 = time.time() + response = obj.documents_overview(document_ids, user_ids) + t1 = time.time() + + for document in response: + click.echo(document) + click.echo(f"Time taken to get document overview: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.argument("document_id") +@click.pass_obj +def document_chunks(obj, document_id): + """Get chunks of a specific document.""" + t0 = time.time() + response = obj.document_chunks(document_id) + t1 = time.time() + + for chunk in response: + click.echo(chunk) + click.echo(f"Time taken to get document chunks: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.pass_obj +def app_settings(obj): + """Retrieve application settings.""" + t0 = time.time() + response = obj.app_settings() + t1 = time.time() + + click.echo(response) + click.echo(f"Time taken to get app settings: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.option("--user-ids", multiple=True, help="User IDs to overview") +@click.pass_obj +def users_overview(obj, user_ids): + """Get an overview of users.""" + user_ids = ( + [uuid.UUID(user_id) for user_id in user_ids] if user_ids else None + ) + + t0 = time.time() + response = obj.users_overview(user_ids) + t1 = time.time() + + for user in response: + click.echo(user) + click.echo(f"Time taken to get users overview: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.option( + "--filters", type=JsonParamType(), help="Filters for analytics as JSON" +) +@click.option( + "--analysis-types", type=JsonParamType(), help="Analysis types as JSON" +) +@click.pass_obj +def analytics(obj, filters, analysis_types): + """Retrieve analytics data.""" + t0 = time.time() + response = obj.analytics(filters, analysis_types) + t1 = time.time() + + click.echo(response) + click.echo(f"Time taken to get analytics: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.option( + "--limit", default=100, help="Limit the number of relationships returned" +) +@click.pass_obj +def inspect_knowledge_graph(obj, limit): + """Print relationships from the knowledge graph.""" + t0 = time.time() + response = obj.inspect_knowledge_graph(limit) + t1 = time.time() + + click.echo(response) + click.echo(f"Time taken to print relationships: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.option( + "--no-media", + default=True, + help="Exclude media files from ingestion", +) +@click.option("--option", default=0, help="Which file to ingest?") +@click.pass_obj +def ingest_sample_file(obj, no_media, option): + t0 = time.time() + response = obj.ingest_sample_file(no_media=no_media, option=option) + t1 = time.time() + + click.echo(response) + click.echo(f"Time taken to ingest sample: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.option( + "--no-media", + default=True, + help="Exclude media files from ingestion", +) +@click.pass_obj +def ingest_sample_files(obj, no_media): + """Ingest all sample files into R2R.""" + t0 = time.time() + response = obj.ingest_sample_files(no_media=no_media) + t1 = time.time() + + click.echo(response) + 
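+    # When --no-media is set (the default), image and audio samples are presumably
+    # filtered out before ingestion; see examples/scripts/sample_data_ingestor.py.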
click.echo(f"Time taken to ingest sample files: {t1 - t0:.2f} seconds") + + +@cli.command() +@click.pass_obj +def health(obj): + """Check the health of the server.""" + t0 = time.time() + response = obj.health() + t1 = time.time() + + click.echo(response) + click.echo(f"Time taken to ingest sample: {t1 - t0:.2f} seconds") + + +@cli.command() +def version(): + """Print the version of R2R.""" + from importlib.metadata import version + + click.echo(version("r2r")) + + +def main(): + cli() + + +if __name__ == "__main__": + main() diff --git a/R2R/r2r/examples/__init__.py b/R2R/r2r/examples/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/examples/__init__.py diff --git a/R2R/r2r/examples/configs/local_neo4j_kg.json b/R2R/r2r/examples/configs/local_neo4j_kg.json new file mode 100755 index 00000000..0b4254dc --- /dev/null +++ b/R2R/r2r/examples/configs/local_neo4j_kg.json @@ -0,0 +1,69 @@ +{ + "kg": { + "provider": "neo4j", + "batch_size": 1, + "text_splitter": { + "type": "recursive_character", + "chunk_size": 512, + "chunk_overlap": 0 + }, + "max_entities": 10, + "max_relations": 20, + "kg_extraction_prompt": "zero_shot_ner_kg_extraction", + "kg_extraction_config": { + "model": "ollama/sciphi/triplex", + "temperature": 1.0, + "top_p": 1.0, + "top_k": 100, + "max_tokens_to_sample": 1024, + "stream": false, + "functions": null, + "skip_special_tokens": false, + "stop_token": null, + "num_beams": 1, + "do_sample": true, + "generate_with_chat": false, + "add_generation_kwargs": {}, + "api_base": null + } + }, + "completions": { + "provider": "litellm", + "generation_config": { + "model": "ollama/llama3", + "temperature": 0.1, + "top_p": 1.0, + "top_k": 100, + "max_tokens_to_sample": 1024, + "stream": false, + "functions": null, + "skip_special_tokens": false, + "stop_token": null, + "num_beams": 1, + "do_sample": true, + "generate_with_chat": false, + "add_generation_kwargs": {}, + "api_base": null + } + }, + "embedding": { + "provider": "ollama", + "base_model": "mxbai-embed-large", + "base_dimension": 1024, + "batch_size": 32 + }, + "ingestion":{ + "excluded_parsers": [ + "gif", + "jpeg", + "jpg", + "png", + "svg", + "mp3", + "mp4" + ] + }, + "vector_database": { + "provider": "pgvector" + } +} diff --git a/R2R/r2r/examples/configs/local_ollama.json b/R2R/r2r/examples/configs/local_ollama.json new file mode 100755 index 00000000..d6fd68a5 --- /dev/null +++ b/R2R/r2r/examples/configs/local_ollama.json @@ -0,0 +1,41 @@ +{ + "completions": { + "provider": "litellm", + "generation_config": { + "model": "ollama/llama3", + "temperature": 0.1, + "top_p": 1.0, + "top_k": 100, + "max_tokens_to_sample": 1024, + "stream": false, + "functions": null, + "skip_special_tokens": false, + "stop_token": null, + "num_beams": 1, + "do_sample": true, + "generate_with_chat": false, + "add_generation_kwargs": {}, + "api_base": null + } + }, + "embedding": { + "provider": "ollama", + "base_model": "mxbai-embed-large", + "base_dimension": 1024, + "batch_size": 32 + }, + "ingestion":{ + "excluded_parsers": [ + "gif", + "jpeg", + "jpg", + "png", + "svg", + "mp3", + "mp4" + ] + }, + "vector_database": { + "provider": "pgvector" + } +} diff --git a/R2R/r2r/examples/configs/local_ollama_rerank.json b/R2R/r2r/examples/configs/local_ollama_rerank.json new file mode 100755 index 00000000..3a9abbe2 --- /dev/null +++ b/R2R/r2r/examples/configs/local_ollama_rerank.json @@ -0,0 +1,46 @@ +{ + "completions": { + "provider": "litellm", + "generation_config": { + "model": "ollama/llama3", + 
"temperature": 0.1, + "top_p": 1.0, + "top_k": 100, + "max_tokens_to_sample": 1024, + "stream": false, + "functions": null, + "skip_special_tokens": false, + "stop_token": null, + "num_beams": 1, + "do_sample": true, + "generate_with_chat": false, + "add_generation_kwargs": {}, + "api_base": null + } + }, + "embedding": { + "provider": "sentence-transformers", + "base_model": "all-MiniLM-L6-v2", + "base_dimension": 384, + "rerank_model": "jinaai/jina-reranker-v1-turbo-en", + "rerank_dimension": 384, + "rerank_transformer_type": "CrossEncoder", + "batch_size": 32, + "text_splitter": { + "type": "recursive_character", + "chunk_size": 512, + "chunk_overlap": 20 + } + }, + "ingestion":{ + "excluded_parsers": [ + "gif", + "jpeg", + "jpg", + "png", + "svg", + "mp3", + "mp4" + ] + } +} diff --git a/R2R/r2r/examples/configs/neo4j_kg.json b/R2R/r2r/examples/configs/neo4j_kg.json new file mode 100755 index 00000000..67fd0682 --- /dev/null +++ b/R2R/r2r/examples/configs/neo4j_kg.json @@ -0,0 +1,27 @@ +{ + "kg": { + "provider": "neo4j", + "batch_size": 1, + "text_splitter": { + "type": "recursive_character", + "chunk_size": 1024, + "chunk_overlap": 0 + }, + "kg_extraction_config": { + "model": "gpt-4o", + "temperature": 0.1, + "top_p": 1.0, + "top_k": 100, + "max_tokens_to_sample": 1024, + "stream": false, + "functions": null, + "skip_special_tokens": false, + "stop_token": null, + "num_beams": 1, + "do_sample": true, + "generate_with_chat": false, + "add_generation_kwargs": {}, + "api_base": null + } + } +} diff --git a/R2R/r2r/examples/configs/postgres_logging.json b/R2R/r2r/examples/configs/postgres_logging.json new file mode 100755 index 00000000..ec659bf4 --- /dev/null +++ b/R2R/r2r/examples/configs/postgres_logging.json @@ -0,0 +1,7 @@ +{ + "logging": { + "provider": "postgres", + "log_table": "logs", + "log_info_table": "log_info" + } +} diff --git a/R2R/r2r/examples/hello_r2r.py b/R2R/r2r/examples/hello_r2r.py new file mode 100755 index 00000000..97a49956 --- /dev/null +++ b/R2R/r2r/examples/hello_r2r.py @@ -0,0 +1,25 @@ +from r2r import R2R, Document, GenerationConfig + +app = R2R() # You may pass a custom configuration to `R2R` with config=... 
+ +app.ingest_documents( + [ + Document( + type="txt", + data="John is a person that works at Google.", + metadata={}, + ) + ] +) + +rag_results = app.rag( + "Who is john", GenerationConfig(model="gpt-3.5-turbo", temperature=0.0) +) +print(f"Search Results:\n{rag_results.search_results}") +print(f"Completion:\n{rag_results.completion}") + +# RAG Results: +# Search Results: +# AggregateSearchResult(vector_search_results=[VectorSearchResult(id=2d71e689-0a0e-5491-a50b-4ecb9494c832, score=0.6848798582029441, metadata={'text': 'John is a person that works at Google.', 'version': 'v0', 'chunk_order': 0, 'document_id': 'ed76b6ee-dd80-5172-9263-919d493b439a', 'extraction_id': '1ba494d7-cb2f-5f0e-9f64-76c31da11381', 'associatedQuery': 'Who is john'})], kg_search_results=None) +# Completion: +# ChatCompletion(id='chatcmpl-9g0HnjGjyWDLADe7E2EvLWa35cMkB', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='John is a person that works at Google [1].', role='assistant', function_call=None, tool_calls=None))], created=1719797903, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=11, prompt_tokens=145, total_tokens=156)) diff --git a/R2R/r2r/examples/scripts/advanced_kg_cookbook.py b/R2R/r2r/examples/scripts/advanced_kg_cookbook.py new file mode 100755 index 00000000..a4d59a79 --- /dev/null +++ b/R2R/r2r/examples/scripts/advanced_kg_cookbook.py @@ -0,0 +1,194 @@ +import json +import os + +import fire +import requests +from bs4 import BeautifulSoup, Comment + +from r2r import ( + EntityType, + R2RClient, + R2RPromptProvider, + Relation, + update_kg_prompt, +) + + +def escape_braces(text): + return text.replace("{", "{{").replace("}", "}}") + + +def get_all_yc_co_directory_urls(): + this_file_path = os.path.abspath(os.path.dirname(__file__)) + yc_company_dump_path = os.path.join( + this_file_path, "..", "data", "yc_companies.txt" + ) + + with open(yc_company_dump_path, "r") as f: + urls = f.readlines() + urls = [url.strip() for url in urls] + return {url.split("/")[-1]: url for url in urls} + + +# Function to fetch and clean HTML content +def fetch_and_clean_yc_co_data(url): + # Fetch the HTML content from the URL + response = requests.get(url) + response.raise_for_status() # Raise an error for bad status codes + html_content = response.text + + # Parse the HTML content with BeautifulSoup + soup = BeautifulSoup(html_content, "html.parser") + + # Remove all <script>, <style>, <meta>, <link>, <header>, <nav>, and <footer> elements + for element in soup( + ["script", "style", "meta", "link", "header", "nav", "footer"] + ): + element.decompose() + + # Remove comments + for comment in soup.findAll(text=lambda text: isinstance(text, Comment)): + comment.extract() + + # Select the main content (you can adjust the selector based on the structure of your target pages) + main_content = soup.select_one("main") or soup.body + + if main_content: + spans = main_content.find_all(["span", "a"]) + + proc_spans = [] + for span in spans: + proc_spans.append(span.get_text(separator=" ", strip=True)) + span_text = "\n".join(proc_spans) + + # Extract the text content from the main content + paragraphs = main_content.find_all( + ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"] + ) + cleaned_text = ( + "### Bulk:\n\n" + + "\n\n".join( + paragraph.get_text(separator=" ", strip=True) + for paragraph in paragraphs + ) + + "\n\n### Metadata:\n\n" + + span_text + ) + + return cleaned_text + else: + 
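+        # Neither a <main> element nor a <body> was found, so fall back to a
+        # sentinel string for the caller.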
return "Main content not found" + + +def execute_query(provider, query, params={}): + print(f"Executing query: {query}") + with provider.client.session(database=provider._database) as session: + result = session.run(query, params) + return [record.data() for record in result] + + +def main( + max_entries=50, + local_mode=True, + base_url="http://localhost:8000", +): + + # Specify the entity types for the KG extraction prompt + entity_types = [ + EntityType("COMPANY"), + EntityType("SCHOOL"), + EntityType("LOCATION"), + EntityType("PERSON"), + EntityType("DATE"), + EntityType("OTHER"), + EntityType("QUANTITY"), + EntityType("EVENT"), + EntityType("INDUSTRY"), + EntityType("MEDIA"), + ] + + # Specify the relations for the KG construction + relations = [ + # Founder Relations + Relation("EDUCATED_AT"), + Relation("WORKED_AT"), + Relation("FOUNDED"), + # Company relations + Relation("RAISED"), + Relation("REVENUE"), + Relation("TEAM_SIZE"), + Relation("LOCATION"), + Relation("ACQUIRED_BY"), + Relation("ANNOUNCED"), + Relation("INDUSTRY"), + # Product relations + Relation("PRODUCT"), + Relation("FEATURES"), + Relation("TECHNOLOGY"), + # Additional relations + Relation("HAS"), + Relation("AS_OF"), + Relation("PARTICIPATED"), + Relation("ASSOCIATED"), + ] + + client = R2RClient(base_url=base_url) + r2r_prompts = R2RPromptProvider() + + prompt_base = ( + "zero_shot_ner_kg_extraction" + if local_mode + else "few_shot_ner_kg_extraction" + ) + + update_kg_prompt(client, r2r_prompts, prompt_base, entity_types, relations) + + url_map = get_all_yc_co_directory_urls() + + i = 0 + # Ingest and clean the data for each company + for company, url in url_map.items(): + company_data = fetch_and_clean_yc_co_data(url) + if i >= max_entries: + break + i += 1 + + try: + # Ingest as a text document + file_name = f"{company}.txt" + with open(file_name, "w") as f: + f.write(company_data) + + client.ingest_files( + [file_name], + metadatas=[{"title": company}], + ) + os.remove(file_name) + except: + continue + + print(client.inspect_knowledge_graph(1_000)["results"]) + + if not local_mode: + + update_kg_prompt( + client, r2r_prompts, "kg_agent", entity_types, relations + ) + + result = client.search( + query="Find up to 10 founders that worked at Google", + use_kg_search=True, + )["results"] + + print("result:\n", result) + print("Search Result:\n", result["kg_search_results"]) + + result = client.rag( + query="Find up to 10 founders that worked at Google", + use_kg_search=True, + ) + print("RAG Result:\n", result) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/R2R/r2r/examples/scripts/basic_kg_cookbook.py b/R2R/r2r/examples/scripts/basic_kg_cookbook.py new file mode 100755 index 00000000..52db9cff --- /dev/null +++ b/R2R/r2r/examples/scripts/basic_kg_cookbook.py @@ -0,0 +1,67 @@ +from r2r import R2RClient + +if __name__ == "__main__": + client = R2RClient(base_url="http://localhost:8000") + + with open("john.txt", "w") as f: + f.write("John is a person that works at Google.") + with open("paul.txt", "w") as f: + f.write("Paul is a person that works at Microsoft that knows John.") + + client.ingest_files( + ["john.txt", "paul.txt"], + metadatas=[ + { + "title": "KG Document 1", + "user_id": "063edaf8-3e63-4cb9-a4d6-a855f36376c3", + }, + { + "title": "KG Document 2", + "user_id": "063edaf8-3e63-4cb9-a4d6-a855f36376c3", + }, + ], + ) + + # Get the KG provider + # neo4j_kg = app.providers.kg + + # # The expected entities + # entity_names = ["John", "Paul", "Google", "Microsoft"] + + # print("\nEntities:") 
+ # for entity in entity_names: + # print( + # f"Locating {entity}:\n", neo4j_kg.get(properties={"name": entity}) + # ) + + # relationships = neo4j_kg.get_triplets(entity_names=entity_names) + + # print("\nRelationships:") + # for triplet in relationships: + # source, relation, target = triplet + # print(f"{source} -[{relation.label}]-> {target} ") + + # # Search the vector database + # search_results = app.search(query="Who is john") + # print("\nSearch Results:\n", search_results) + + # # Semantic search over the knowledge graph + # from r2r.base import VectorStoreQuery + + # node_result = neo4j_kg.vector_query( + # VectorStoreQuery( + # query_embedding=app.providers.embedding.get_embedding("A person"), + # ) + # ) + # print("\nNode Result:", node_result) + + # # Structured query + # structured_query = """ + # MATCH (p1:person)-[:KNOWS]->(p2:person) + # RETURN p1.name AS Person1, p2.name AS Person2 + # ORDER BY p1.name + # LIMIT 10; + # """ + # print("Executing query:\n", structured_query) + # structured_result = neo4j_kg.structured_query(structured_query) + # print("Structured Results:\n", structured_result) diff --git a/R2R/r2r/examples/scripts/run_hyde.py b/R2R/r2r/examples/scripts/run_hyde.py new file mode 100755 index 00000000..c82ce525 --- /dev/null +++ b/R2R/r2r/examples/scripts/run_hyde.py @@ -0,0 +1,33 @@ +import fire + +from r2r import R2RBuilder, R2RConfig +from r2r.base.abstractions.llm import GenerationConfig +from r2r.main.assembly.factory_extensions import R2RPipeFactoryWithMultiSearch + + +def main(task_prompt_name="hyde", query="Who was aristotle?"): + # Load the configuration file + config = R2RConfig.from_json() + + app = ( + R2RBuilder(config) + .with_pipe_factory(R2RPipeFactoryWithMultiSearch) + .build( + # Add optional override arguments which propagate to the pipe factory + task_prompt_name=task_prompt_name, + ) + ) + + # Run the RAG pipeline through the R2R application + result = app.rag( + query, + query_transform_generation_config=GenerationConfig(model="gpt-4o"), + rag_generation_config=GenerationConfig(model="gpt-3.5-turbo"), + ) + + print(f"Search Results:\n\n{result.search_results}") + print(f"RAG Results:\n\n{result.completion}") + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/R2R/r2r/examples/scripts/run_web_multi_rag.py b/R2R/r2r/examples/scripts/run_web_multi_rag.py new file mode 100755 index 00000000..3ba70b74 --- /dev/null +++ b/R2R/r2r/examples/scripts/run_web_multi_rag.py @@ -0,0 +1,54 @@ +import fire + +from r2r import R2RBuilder, SerperClient, WebSearchPipe +from r2r.base.abstractions.llm import GenerationConfig +from r2r.main.assembly.factory_extensions import R2RPipeFactoryWithMultiSearch + + +def run_rag_pipeline(query="Who was Aristotle?"): + # Initialize a web search pipe + web_search_pipe = WebSearchPipe(serper_client=SerperClient()) + + # Define a new synthetic query generation template + synthetic_query_generation_template = { + "name": "synthetic_query_generation_template", + "template": """ + ### Instruction: + Given the following query, write a double newline separated list of up to {num_outputs} advanced queries meant to help answer the original query. + DO NOT generate any single query which is likely to require information from multiple distinct documents. + EACH single query will be used to carry out a cosine similarity semantic search over distinct indexed documents. 
+ FOR EXAMPLE, if asked `how do the key themes of Great Gatsby compare with 1984`, the two queries would be + `What are the key themes of Great Gatsby?` and `What are the key themes of 1984?`. + Here is the original user query to be transformed into answers: + + ### Query: + {message} + + ### Response: + """, + "input_types": {"num_outputs": "int", "message": "str"}, + } + + # Build the R2R application with the custom pipeline + app = ( + R2RBuilder() + .with_pipe_factory(R2RPipeFactoryWithMultiSearch) + .build( + # override inputs consumed in building the MultiSearchPipe + multi_inner_search_pipe_override=web_search_pipe, + query_generation_template_override=synthetic_query_generation_template, + ) + ) + + # Run the RAG pipeline through the R2R application + result = app.rag( + query, + rag_generation_config=GenerationConfig(model="gpt-4o"), + ) + + print(f"Search Results:\n\n{result.search_results}") + print(f"RAG Results:\n\n{result.completion}") + + +if __name__ == "__main__": + fire.Fire(run_rag_pipeline) diff --git a/R2R/r2r/examples/scripts/run_web_rag.py b/R2R/r2r/examples/scripts/run_web_rag.py new file mode 100755 index 00000000..7535ae27 --- /dev/null +++ b/R2R/r2r/examples/scripts/run_web_rag.py @@ -0,0 +1,26 @@ +import fire + +from r2r import R2RBuilder, SerperClient, WebSearchPipe +from r2r.base.abstractions.llm import GenerationConfig + + +def run_rag_pipeline(query="Who was Aristotle?"): + # Create search pipe override and pipes + web_search_pipe = WebSearchPipe( + serper_client=SerperClient() # TODO - Develop a `WebSearchProvider` for configurability + ) + + app = R2RBuilder().with_vector_search_pipe(web_search_pipe).build() + + # Run the RAG pipeline through the R2R application + result = app.rag( + query, + rag_generation_config=GenerationConfig(model="gpt-4o"), + ) + + print(f"Search Results:\n\n{result.search_results}") + print(f"RAG Results:\n\n{result.completion}") + + +if __name__ == "__main__": + fire.Fire(run_rag_pipeline) diff --git a/R2R/r2r/examples/scripts/sample_data_ingestor.py b/R2R/r2r/examples/scripts/sample_data_ingestor.py new file mode 100755 index 00000000..67eecd16 --- /dev/null +++ b/R2R/r2r/examples/scripts/sample_data_ingestor.py @@ -0,0 +1,81 @@ +import os +import uuid +from typing import TYPE_CHECKING + +import fire + +if TYPE_CHECKING: + from r2r.main.execution import R2RExecutionWrapper + + +class SampleDataIngestor: + USER_IDS = [ + "063edaf8-3e63-4cb9-a4d6-a855f36376c3", + "45c3f5a8-bcbe-43b1-9b20-51c07fd79f14", + "c6c23d85-6217-4caa-b391-91ec0021a000", + None, + ] + + def __init__( + self, + executor: "R2RExecutionWrapper", + ): + self.executor = executor + + @staticmethod + def get_sample_files(no_media: bool = True) -> list[str]: + examples_dir = os.path.join( + os.path.dirname(os.path.abspath(__file__)), ".." 
+ ) + + files = [ + os.path.join(examples_dir, "data", "aristotle.txt"), + os.path.join(examples_dir, "data", "got.txt"), + os.path.join(examples_dir, "data", "screen_shot.png"), + os.path.join(examples_dir, "data", "pg_essay_1.html"), + os.path.join(examples_dir, "data", "pg_essay_2.html"), + os.path.join(examples_dir, "data", "pg_essay_3.html"), + os.path.join(examples_dir, "data", "pg_essay_4.html"), + os.path.join(examples_dir, "data", "pg_essay_5.html"), + os.path.join(examples_dir, "data", "lyft_2021.pdf"), + os.path.join(examples_dir, "data", "uber_2021.pdf"), + os.path.join(examples_dir, "data", "sample.mp3"), + os.path.join(examples_dir, "data", "sample2.mp3"), + ] + if no_media: + excluded_types = ["jpeg", "jpg", "png", "svg", "mp3", "mp4"] + files = [ + file_path + for file_path in files + if file_path.split(".")[-1].lower() not in excluded_types + ] + return files + + def ingest_sample_files(self, no_media: bool = True): + sample_files = self.get_sample_files(no_media) + user_ids = [ + uuid.UUID(user_id) if user_id else None + for user_id in self.USER_IDS + ] + + response = self.executor.ingest_files( + sample_files, + [ + {"user_id": user_ids[it % len(user_ids)]} + for it in range(len(sample_files)) + ], + ) + return response + + def ingest_sample_file(self, no_media: bool = True, option: int = 0): + # Respect the no_media flag and tolerate the anonymous (None) entry in USER_IDS, + # mirroring the handling in ingest_sample_files + sample_files = self.get_sample_files(no_media) + user_id_str = self.USER_IDS[option % len(self.USER_IDS)] + user_id = uuid.UUID(user_id_str) if user_id_str else None + + response = self.executor.ingest_files( + [sample_files[option]], [{"user_id": user_id}] + ) + return response + + +if __name__ == "__main__": + fire.Fire(SampleDataIngestor) diff --git a/R2R/r2r/examples/scripts/test_e2e.py b/R2R/r2r/examples/scripts/test_e2e.py new file mode 100755 index 00000000..11829f94 --- /dev/null +++ b/R2R/r2r/examples/scripts/test_e2e.py @@ -0,0 +1,321 @@ +import json +import math +from datetime import datetime, timezone +from uuid import UUID + +from r2r.main.execution import R2RExecutionWrapper + +expected_logs = [ + { + "run_id": ..., + "run_type": "ingestion", + "entries": [ + { + "key": "document_parse_result", + "value": "Document 'aristotle.txt' processed successfully.", + } + ], + }, + { + "run_id": ..., + "run_type": "search", + "entries": [ + {"key": "search_latency", "value": "0.45"}, + { + "key": "search_results", + "value": '["{\\"id\\":\\"7ed3a01c-88dc-5a58-a68b-6e5d9f292df2\\",\\"score\\":0.773841586847122,\\"metadata\\":{\\"text\\":\\"Aristotle[A] (Greek: \\u1f08\\u03c1\\u03b9\\u03c3\\u03c4\\u03bf\\u03c4\\u03ad\\u03bb\\u03b7\\u03c2 Aristot\\u00e9l\\u0113s, pronounced [aristot\\u00e9l\\u025b\\u02d0s]; 384\\u2013322 BC) was an Ancient Greek philosopher and polymath. His writings cover a broad range of subjects spanning the natural sciences, philosophy, linguistics, economics, politics, psychology, and the arts. 
As the founder of the Peripatetic school of philosophy in the Lyceum in Athens, he began the wider Aristotelian tradition that followed, which set the groundwork for the development of modern science.\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":0,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"e6f58828-2e6d-5eb1-94f3-efbc0b7c1699\\",\\"score\\":0.669298529624939,\\"metadata\\":{\\"text\\":\\"Aristotle was revered among medieval Muslim scholars as \\\\\\"The First Teacher\\\\\\", and among medieval Christians like Thomas Aquinas as simply \\\\\\"The Philosopher\\\\\\", while the poet Dante called him \\\\\\"the master of those who know\\\\\\". His works contain the earliest known formal study of logic, and were studied by medieval scholars such as Peter Abelard and Jean Buridan. Aristotle\'s influence on logic continued well into the 19th century. In addition, his ethics, although always influential, gained renewed interest with\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":5,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"c818bc72-2ac8-581b-b51a-0ca826f5f2b8\\",\\"score\\":0.652687707703574,\\"metadata\\":{\\"text\\":\\"Aristotle was born in 384 BC[C] in Stagira, Chalcidice,[2] about 55 km (34 miles) east of modern-day Thessaloniki.[3][4] His father, Nicomachus, was the personal physician to King Amyntas of Macedon. 
While he was young, Aristotle learned about biology and medical information, which was taught by his father.[5] Both of Aristotle\'s parents died when he was about thirteen, and Proxenus of Atarneus became his guardian.[6] Although little information about Aristotle\'s childhood has survived, he probably spent\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":8,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"d8ea40eb-cd48-5bd7-b2df-05f6268aed10\\",\\"score\\":0.636079056730387,\\"metadata\\":{\\"text\\":\\"Aristotle has been called the father of logic, biology, political science, zoology, embryology, natural law, scientific method, rhetoric, psychology, realism, criticism, individualism, teleology, and meteorology.[151]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":177,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"b32f0e19-029f-5b3f-856c-e7e4141f52f5\\",\\"score\\":0.624098479747772,\\"metadata\\":{\\"text\\":\\"Among countless other achievements, Aristotle was the founder of formal logic,[146] pioneered the study of zoology, and left every future scientist and philosopher in his debt through his contributions to the scientific method.[2][147][148] Taneli Kukkonen, observes that his achievement in founding two sciences is unmatched, and his reach in influencing \\\\\\"every branch of intellectual enterprise\\\\\\" including Western ethical and political theory, theology, rhetoric, and literary analysis is equally long. As a\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":175,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"e6c5d5f6-7fc4-5bb8-847d-44cfa16f5178\\",\\"score\\":0.619364976882935,\\"metadata\\":{\\"text\\":\\"Little is known about Aristotle\'s life. He was born in the city of Stagira in northern Greece during the Classical period. His father, Nicomachus, died when Aristotle was a child, and he was brought up by a guardian. At 17 or 18, he joined Plato\'s Academy in Athens and remained there until the age of 37 (c.\\u2009347 BC). Shortly after Plato died, Aristotle left Athens and, at the request of Philip II of Macedon, tutored his son Alexander the Great beginning in 343 BC. He established a library in the Lyceum,\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":1,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"90b891ee-0a67-54ba-838a-e02e1647adab\\",\\"score\\":0.6177915291003779,\\"metadata\\":{\\"text\\":\\"Like his teacher Plato, Aristotle\'s philosophy aims at the universal. Aristotle\'s ontology places the universal (katholou) in particulars (kath\' hekaston), things in the world, whereas for Plato the universal is a separately existing form which actual things imitate. 
For Aristotle, \\\\\\"form\\\\\\" is still what phenomena are based on, but is \\\\\\"instantiated\\\\\\" in a particular substance.[34]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":37,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"5f6213d1-a46b-5ed4-a15c-e95bab271621\\",\\"score\\":0.606411576271057,\\"metadata\\":{\\"text\\":\\"Aristotle was one of the most revered Western thinkers in early Islamic theology. Most of the still extant works of Aristotle,[167] as well as a number of the original Greek commentaries, were translated into Arabic and studied by Muslim philosophers, scientists and scholars. Averroes, Avicenna and Alpharabius, who wrote on Aristotle in great depth, also influenced Thomas Aquinas and other Western Christian scholastic philosophers. Alkindus greatly admired Aristotle\'s philosophy,[168] and Averroes spoke of\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":194,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"ce43de17-635c-5a01-aae2-e160a6d56f4b\\",\\"score\\":0.601802307421038,\\"metadata\\":{\\"text\\":\\"passed to Plato\'s nephew Speusippus, although it is possible that he feared the anti-Macedonian sentiments in Athens at that time and left before Plato died.[10] Aristotle then accompanied Xenocrates to the court of his friend Hermias of Atarneus in Asia Minor. After the death of Hermias, Aristotle travelled with his pupil Theophrastus to the island of Lesbos, where together they researched the botany and zoology of the island and its sheltered lagoon. While in Lesbos, Aristotle married Pythias, either\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":12,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"8550e2b7-43f8-5a59-9c13-c9678670a2da\\",\\"score\\":0.595871686935425,\\"metadata\\":{\\"text\\":\\"The immediate influence of Aristotle\'s work was felt as the Lyceum grew into the Peripatetic school. Aristotle\'s students included Aristoxenus, Dicaearchus, Demetrius of Phalerum, Eudemos of Rhodes, Harpalus, Hephaestion, Mnason of Phocis, Nicomachus, and Theophrastus. Aristotle\'s influence over Alexander the Great is seen in the latter\'s bringing with him on his expedition a host of zoologists, botanists, and researchers. 
He had also learned a great deal about Persian customs and traditions from his\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":181,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}"]', + }, + {"key": "search_query", "value": "who is aristotle?"}, + ], + }, + # {'run_id': ..., 'run_type': 'search', 'entries': [{'key': 'search_query', 'value': 'who is aristotle?'}, {'key': 'search_latency', 'value': '0.51'}, {'key': 'search_results', 'value': '["{\\"id\\":\\"7ed3a01c-88dc-5a58-a68b-6e5d9f292df2\\",\\"score\\":0.773841586847122,\\"metadata\\":{\\"text\\":\\"Aristotle[A] (Greek: \\u1f08\\u03c1\\u03b9\\u03c3\\u03c4\\u03bf\\u03c4\\u03ad\\u03bb\\u03b7\\u03c2 Aristot\\u00e9l\\u0113s, pronounced [aristot\\u00e9l\\u025b\\u02d0s]; 384\\u2013322 BC) was an Ancient Greek philosopher and polymath. His writings cover a broad range of subjects spanning the natural sciences, philosophy, linguistics, economics, politics, psychology, and the arts. As the founder of the Peripatetic school of philosophy in the Lyceum in Athens, he began the wider Aristotelian tradition that followed, which set the groundwork for the development of modern science.\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":0,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"e6f58828-2e6d-5eb1-94f3-efbc0b7c1699\\",\\"score\\":0.669298529624939,\\"metadata\\":{\\"text\\":\\"Aristotle was revered among medieval Muslim scholars as \\\\\\"The First Teacher\\\\\\", and among medieval Christians like Thomas Aquinas as simply \\\\\\"The Philosopher\\\\\\", while the poet Dante called him \\\\\\"the master of those who know\\\\\\". His works contain the earliest known formal study of logic, and were studied by medieval scholars such as Peter Abelard and Jean Buridan. Aristotle\'s influence on logic continued well into the 19th century. In addition, his ethics, although always influential, gained renewed interest with\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":5,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"c818bc72-2ac8-581b-b51a-0ca826f5f2b8\\",\\"score\\":0.652687707703574,\\"metadata\\":{\\"text\\":\\"Aristotle was born in 384 BC[C] in Stagira, Chalcidice,[2] about 55 km (34 miles) east of modern-day Thessaloniki.[3][4] His father, Nicomachus, was the personal physician to King Amyntas of Macedon. 
While he was young, Aristotle learned about biology and medical information, which was taught by his father.[5] Both of Aristotle\'s parents died when he was about thirteen, and Proxenus of Atarneus became his guardian.[6] Although little information about Aristotle\'s childhood has survived, he probably spent\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":8,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"d8ea40eb-cd48-5bd7-b2df-05f6268aed10\\",\\"score\\":0.636050164699554,\\"metadata\\":{\\"text\\":\\"Aristotle has been called the father of logic, biology, political science, zoology, embryology, natural law, scientific method, rhetoric, psychology, realism, criticism, individualism, teleology, and meteorology.[151]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":177,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"b32f0e19-029f-5b3f-856c-e7e4141f52f5\\",\\"score\\":0.624127291194959,\\"metadata\\":{\\"text\\":\\"Among countless other achievements, Aristotle was the founder of formal logic,[146] pioneered the study of zoology, and left every future scientist and philosopher in his debt through his contributions to the scientific method.[2][147][148] Taneli Kukkonen, observes that his achievement in founding two sciences is unmatched, and his reach in influencing \\\\\\"every branch of intellectual enterprise\\\\\\" including Western ethical and political theory, theology, rhetoric, and literary analysis is equally long. As a\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":175,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"e6c5d5f6-7fc4-5bb8-847d-44cfa16f5178\\",\\"score\\":0.619364976882935,\\"metadata\\":{\\"text\\":\\"Little is known about Aristotle\'s life. He was born in the city of Stagira in northern Greece during the Classical period. His father, Nicomachus, died when Aristotle was a child, and he was brought up by a guardian. At 17 or 18, he joined Plato\'s Academy in Athens and remained there until the age of 37 (c.\\u2009347 BC). Shortly after Plato died, Aristotle left Athens and, at the request of Philip II of Macedon, tutored his son Alexander the Great beginning in 343 BC. He established a library in the Lyceum,\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":1,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"90b891ee-0a67-54ba-838a-e02e1647adab\\",\\"score\\":0.6177915291003779,\\"metadata\\":{\\"text\\":\\"Like his teacher Plato, Aristotle\'s philosophy aims at the universal. Aristotle\'s ontology places the universal (katholou) in particulars (kath\' hekaston), things in the world, whereas for Plato the universal is a separately existing form which actual things imitate. 
For Aristotle, \\\\\\"form\\\\\\" is still what phenomena are based on, but is \\\\\\"instantiated\\\\\\" in a particular substance.[34]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":37,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"5f6213d1-a46b-5ed4-a15c-e95bab271621\\",\\"score\\":0.606407422018273,\\"metadata\\":{\\"text\\":\\"Aristotle was one of the most revered Western thinkers in early Islamic theology. Most of the still extant works of Aristotle,[167] as well as a number of the original Greek commentaries, were translated into Arabic and studied by Muslim philosophers, scientists and scholars. Averroes, Avicenna and Alpharabius, who wrote on Aristotle in great depth, also influenced Thomas Aquinas and other Western Christian scholastic philosophers. Alkindus greatly admired Aristotle\'s philosophy,[168] and Averroes spoke of\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":194,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"ce43de17-635c-5a01-aae2-e160a6d56f4b\\",\\"score\\":0.601802307421038,\\"metadata\\":{\\"text\\":\\"passed to Plato\'s nephew Speusippus, although it is possible that he feared the anti-Macedonian sentiments in Athens at that time and left before Plato died.[10] Aristotle then accompanied Xenocrates to the court of his friend Hermias of Atarneus in Asia Minor. After the death of Hermias, Aristotle travelled with his pupil Theophrastus to the island of Lesbos, where together they researched the botany and zoology of the island and its sheltered lagoon. While in Lesbos, Aristotle married Pythias, either\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":12,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"8550e2b7-43f8-5a59-9c13-c9678670a2da\\",\\"score\\":0.5959202888059449,\\"metadata\\":{\\"text\\":\\"The immediate influence of Aristotle\'s work was felt as the Lyceum grew into the Peripatetic school. Aristotle\'s students included Aristoxenus, Dicaearchus, Demetrius of Phalerum, Eudemos of Rhodes, Harpalus, Hephaestion, Mnason of Phocis, Nicomachus, and Theophrastus. Aristotle\'s influence over Alexander the Great is seen in the latter\'s bringing with him on his expedition a host of zoologists, botanists, and researchers. 
He had also learned a great deal about Persian customs and traditions from his\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":181,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"1175585b-fc58-5e44-bfcb-cb1996289936\\",\\"score\\":0.594988668263635,\\"metadata\\":{\\"text\\":\\"after friends and relatives, and to deal with the latter as with beasts or plants\\\\\\".[13] By 335 BC, Aristotle had returned to Athens, establishing his own school there known as the Lyceum. Aristotle conducted courses at the school for the next twelve years. While in Athens, his wife Pythias died and Aristotle became involved with Herpyllis of Stagira. They had a son whom Aristotle named after his father, Nicomachus. If the Suda \\u2013 an uncritical compilation from the Middle Ages \\u2013 is accurate, he may also have\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":16,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"8f8f5140-2d4b-5877-9cfb-d2df590831c2\\",\\"score\\":0.5928938565520601,\\"metadata\\":{\\"text\\":\\"In Protrepticus, the character \'Aristotle\' states:[123]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":147,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"ee40bbc8-16f7-5808-8f14-c8fd16391cfc\\",\\"score\\":0.591026663780212,\\"metadata\\":{\\"text\\":\\"Transmission\\\\nFurther information: List of writers influenced by Aristotle\\\\nMore than 2300 years after his death, Aristotle remains one of the most influential people who ever lived.[142][143][144] He contributed to almost every field of human knowledge then in existence, and he was the founder of many new fields. According to the philosopher Bryan Magee, \\\\\\"it is doubtful whether any human being has ever known as much as he did\\\\\\".[145]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":174,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"703219b2-3284-533f-8df7-fee42f52c3d2\\",\\"score\\":0.5850032146276001,\\"metadata\\":{\\"text\\":\\"At the age of seventeen or eighteen, Aristotle moved to Athens to continue his education at Plato\'s Academy.[8] He probably experienced the Eleusinian Mysteries as he wrote when describing the sights one viewed at the Eleusinian Mysteries, \\\\\\"to experience is to learn\\\\\\" [\\u03c0\\u03b1\\u03b8\\u03b5\\u03af\\u03bd \\u03bc\\u03b1\\u03b8\\u03b5\\u0129\\u03bd].[9] Aristotle remained in Athens for nearly twenty years before leaving in 348/47 BC. 
The traditional story about his departure records that he was disappointed with the Academy\'s direction after control passed to Plato\'s\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":11,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"044a1a8a-7661-52b9-af63-83c243216d34\\",\\"score\\":0.5834955821337959,\\"metadata\\":{\\"text\\":\\"\\u2014\\u200aAristotle. Politics, Book 4, 1294b.10\\u201318\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":152,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"4173b9bc-4b39-5dc1-b9af-a2fc0e282389\\",\\"score\\":0.5787635539488301,\\"metadata\\":{\\"text\\":\\"This period in Athens, between 335 and 323 BC, is when Aristotle is believed to have composed many of his works.[12] He wrote many dialogues, of which only fragments have survived. Those works that have survived are in treatise form and were not, for the most part, intended for widespread publication; they are generally thought to be lecture aids for his students. His most important treatises include Physics, Metaphysics, Nicomachean Ethics, Politics, On the Soul and Poetics. Aristotle studied and made\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":19,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"038127f5-6603-5258-8658-7c60eaf3dae3\\",\\"score\\":0.575957238674164,\\"metadata\\":{\\"text\\":\\"Averroes spoke of Aristotle as the \\\\\\"exemplar\\\\\\" for all future philosophers.[169] Medieval Muslim scholars regularly described Aristotle as the \\\\\\"First Teacher\\\\\\".[167] The title was later used by Western philosophers (as in the famous poem of Dante) who were influenced by the tradition of Islamic philosophy.[170]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":195,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"5fb90839-d04c-50b0-8f7f-ffc1a938c019\\",\\"score\\":0.574642419815063,\\"metadata\\":{\\"text\\":\\"Aristotle was appointed as the head of the royal Academy of Macedon. During Aristotle\'s time in the Macedonian court, he gave lessons not only to Alexander but also to two other future kings: Ptolemy and Cassander.[13] Aristotle encouraged Alexander toward eastern conquest, and Aristotle\'s own attitude towards Persia was unabashedly ethnocentric. 
In one famous example, he counsels Alexander to be \\\\\\"a leader to the Greeks and a despot to the barbarians, to look after the former as after friends and\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":15,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"76a117a7-706b-5d7f-a856-e5fc4bb0d8a2\\",\\"score\\":0.5701740640298101,\\"metadata\\":{\\"text\\":\\"Life\\\\nIn general, the details of Aristotle\'s life are not well-established. The biographies written in ancient times are often speculative and historians only agree on a few salient points.[B]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":7,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"d64af737-640e-5ea0-9259-4c83638a714d\\",\\"score\\":0.565754532814026,\\"metadata\\":{\\"text\\":\\"Aristotle was the first person to study biology systematically,[61] and biology forms a large part of his writings. He spent two years observing and describing the zoology of Lesbos and the surrounding seas, including in particular the Pyrrha lagoon in the centre of Lesbos.[62][63] His data in History of Animals, Generation of Animals, Movement of Animals, and Parts of Animals are assembled from his own observations,[64] statements given by people with specialized knowledge, such as beekeepers and\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":85,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"d00de43e-11c4-59b9-ba09-c16ad525a3d3\\",\\"score\\":0.56225860118866,\\"metadata\\":{\\"text\\":\\"Aristotle wrote his works on papyrus scrolls, the common writing medium of that era.[O] His writings are divisible into two groups: the \\\\\\"exoteric\\\\\\", intended for the public, and the \\\\\\"esoteric\\\\\\", for use within the Lyceum school.[208][P][209] Aristotle\'s \\\\\\"lost\\\\\\" works stray considerably in characterization from the surviving Aristotelian corpus. Whereas the lost works appear to have been originally written with a view to subsequent publication, the surviving works mostly resemble lecture notes not intended for\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":222,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"dec8fec3-2b27-554b-b953-27db6a221078\\",\\"score\\":0.561229228973389,\\"metadata\\":{\\"text\\":\\"Aristotle\'s views profoundly shaped medieval scholarship. The influence of his physical science extended from late antiquity and the Early Middle Ages into the Renaissance, and was not replaced systematically until the Enlightenment and theories such as classical mechanics were developed. 
He influenced Judeo-Islamic philosophies during the Middle Ages, as well as Christian theology, especially the Neoplatonism of the Early Church and the scholastic tradition of the Catholic Church.\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":4,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"eb18cdd3-981e-54bb-835d-5988776c5bb9\\",\\"score\\":0.557450473308563,\\"metadata\\":{\\"text\\":\\"With the loss of the study of ancient Greek in the early medieval Latin West, Aristotle was practically unknown there from c.\\u2009CE 600 to c.\\u20091100 except through the Latin translation of the Organon made by Boethius. In the twelfth and thirteenth centuries, interest in Aristotle revived and Latin Christians had translations made, both from Arabic translations, such as those by Gerard of Cremona,[171] and from the original Greek, such as those by James of Venice and William of Moerbeke.\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":197,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"40014d76-eba6-5a2a-9c2b-672ac6ddd212\\",\\"score\\":0.557388577305591,\\"metadata\\":{\\"text\\":\\"The Dutch historian of science Eduard Jan Dijksterhuis writes that Aristotle and his predecessors showed the difficulty of science by \\\\\\"proceed[ing] so readily to frame a theory of such a general character\\\\\\" on limited evidence from their senses.[192] In 1985, the biologist Peter Medawar could still state in \\\\\\"pure seventeenth century\\\\\\"[193] tones that Aristotle had assembled \\\\\\"a strange and generally speaking rather tiresome farrago of hearsay, imperfect observation, wishful thinking and credulity amounting to\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":212,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"65c3d9cd-0ab5-5ef7-8296-a1baeff3592d\\",\\"score\\":0.5566293664728881,\\"metadata\\":{\\"text\\":\\"Aristotle made substantial contributions to economic thought, especially to thought in the Middle Ages.[128] In Politics, Aristotle addresses the city, property, and trade. 
His response to criticisms of private property, in Lionel Robbins\'s view, anticipated later proponents of private property among philosophers and economists, as it related to the overall utility of social arrangements.[128] Aristotle believed that although communal arrangements may seem beneficial to society, and that although private\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":156,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"a2cc5fc4-99a4-53c5-8c74-2c68399f2a7e\\",\\"score\\":0.549582594682865,\\"metadata\\":{\\"text\\":\\"Practical philosophy\\\\nAristotle\'s practical philosophy covers areas such as ethics, politics, economics, and rhetoric.[40]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":134,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"27a6a904-7b94-59ca-81ea-8b99b2816f81\\",\\"score\\":0.547278100576631,\\"metadata\\":{\\"text\\":\\"After the Scholastic Thomas Aquinas wrote his Summa Theologica, working from Moerbeke\'s translations and calling Aristotle \\\\\\"The Philosopher\\\\\\",[172] the demand for Aristotle\'s writings grew, and the Greek manuscripts returned to the West, stimulating a revival of Aristotelianism in Europe that continued into the Renaissance.[173] These thinkers blended Aristotelian philosophy with Christianity, bringing the thought of Ancient Greece into the Middle Ages. 
Scholars such as Boethius, Peter Abelard, and John\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":198,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"d4fcaaff-1d1e-5ed4-8d60-6a71a5721ea4\\",\\"score\\":0.544766214801896,\\"metadata\\":{\\"text\\":\\"Aristotle\'s view, incapable of participating in political life.[124] On this ground, proponents of feminist metaphysics have accused Aristotle of misogyny[125] and sexism.[126] However, Aristotle gave equal weight to women\'s happiness as he did to men\'s, and commented in his Rhetoric that the things that lead to happiness need to be in women as well as men.[N]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":154,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"fe38e23d-4798-5cd3-8d42-e858f7e97537\\",\\"score\\":0.54476398229599,\\"metadata\\":{\\"text\\":\\"Theoretical philosophy\\\\nLogic\\\\nMain article: Term logic\\\\nFurther information: Non-Aristotelian logic\\\\nWith the Prior Analytics, Aristotle is credited with the earliest study of formal logic,[23] and his conception of it was the dominant form of Western logic until 19th-century advances in mathematical logic.[24] Kant stated in the Critique of Pure Reason that with Aristotle, logic reached its completion.[25]\\\\n\\\\nOrganon\\\\nMain article: Organon\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":23,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"b29f4f0e-c033-5613-a8fa-62ea11f951ed\\",\\"score\\":0.5409726500511169,\\"metadata\\":{\\"text\\":\\"Zoologists have frequently mocked Aristotle for errors and unverified secondhand reports. However, modern observation has confirmed several of his more surprising claims.[195][196][197] Aristotle\'s work remains largely unknown to modern scientists, though zoologists sometimes mention him as the father of biology[150] or in particular of marine biology.[198] Practising zoologists are unlikely to adhere to Aristotle\'s chain of being, but its influence is still perceptible in the use of the terms \\\\\\"lower\\\\\\" and\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":214,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"cb6775dd-85a1-5e62-bd23-b00842a34231\\",\\"score\\":0.53917521238327,\\"metadata\\":{\\"text\\":\\"Ethics\\\\nMain article: Aristotelian ethics\\\\nAristotle considered ethics to be a practical rather than theoretical study, i.e., one aimed at becoming good and doing good rather than knowing for its own sake. 
He wrote several treatises on ethics, most notably including the Nicomachean Ethics.[117]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":136,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"31ddcd60-5b19-5b9c-9f47-6b144001308f\\",\\"score\\":0.5359366855583609,\\"metadata\\":{\\"text\\":\\"Aristotle has been depicted by major artists including Lucas Cranach the Elder,[218] Justus van Gent, Raphael, Paolo Veronese, Jusepe de Ribera,[219] Rembrandt,[220] and Francesco Hayez over the centuries. Among the best-known depictions is Raphael\'s fresco The School of Athens, in the Vatican\'s Apostolic Palace, where the figures of Plato and Aristotle are central to the image, at the architectural vanishing point, reflecting their importance.[221] Rembrandt\'s Aristotle with a Bust of Homer, too, is a\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":231,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"c3c15401-51cf-5c53-8182-feac4f7fc6cf\\",\\"score\\":0.535427896056898,\\"metadata\\":{\\"text\\":\\"Though Aristotle wrote many elegant treatises and dialogues for publication, only around a third of his original output has survived, none of it intended for publication. Aristotle provided a complex synthesis of the various philosophies existing prior to him. His teachings and methods of inquiry have had a significant impact across the world, and remain a subject of contemporary philosophical discussion.\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":3,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"b5da0460-6b55-550a-ad14-3eae61df567f\\",\\"score\\":0.53429651260376,\\"metadata\\":{\\"text\\":\\"Aristotle\'s views on women influenced later Western philosophers, who quoted him as an authority until the end of the Middle Ages, but these views have been controversial in modern times. Aristotle\'s analysis of procreation describes an active, ensouling masculine element bringing life to an inert, passive female element. 
The biological differences are a result of the fact that the female body is well-suited for reproduction, which changes her body temperature, which in turn makes her, in Aristotle\'s view,\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":153,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"bbb9c9e5-c084-5068-ac8a-019c8441eebb\\",\\"score\\":0.532585024833679,\\"metadata\\":{\\"text\\":\\"The scholar Taneli Kukkonen notes that \\\\\\"in the best 20th-century scholarship Aristotle comes alive as a thinker wrestling with the full weight of the Greek philosophical tradition.\\\\\\"[148] What follows is an overview of the transmission and influence of his texts and ideas into the modern era.\\\\n\\\\nHis successor, Theophrastus\\\\nMain articles: Theophrastus and Historia Plantarum (Theophrastus)\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":178,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"8b9081fd-5603-5704-9ba6-8957525f0c05\\",\\"score\\":0.532379746437073,\\"metadata\\":{\\"text\\":\\"Politics\\\\nMain article: Politics (Aristotle)\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":141,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"f56f324f-e2fe-5552-b4fd-a2324d080e97\\",\\"score\\":0.5316953063011169,\\"metadata\\":{\\"text\\":\\"equally long. As a result, Kukkonen argues, any analysis of reality today \\\\\\"will almost certainly carry Aristotelian overtones ... evidence of an exceptionally forceful mind.\\\\\\"[148] Jonathan Barnes wrote that \\\\\\"an account of Aristotle\'s intellectual afterlife would be little less than a history of European thought\\\\\\".[149]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":176,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"1e64fff6-6f61-5490-8743-4947487cced2\\",\\"score\\":0.5270282030105591,\\"metadata\\":{\\"text\\":\\"Present science\\\\nThe philosopher Bertrand Russell claims that \\\\\\"almost every serious intellectual advance has had to begin with an attack on some Aristotelian doctrine\\\\\\". Russell calls Aristotle\'s ethics \\\\\\"repulsive\\\\\\", and labelled his logic \\\\\\"as definitely antiquated as Ptolemaic astronomy\\\\\\". 
Russell states that these errors make it difficult to do historical justice to Aristotle, until one remembers what an advance he made upon all of his predecessors.[191]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":211,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"68446969-0dbf-5cbb-852a-07612d886c46\\",\\"score\\":0.525567233562469,\\"metadata\\":{\\"text\\":\\"Plato (left) and Aristotle in Raphael\'s 1509 fresco, The School of Athens. Aristotle holds his Nicomachean Ethics and gestures to the earth, representing his view in immanent realism, whilst Plato gestures to the heavens, indicating his Theory of Forms, and holds his Timaeus.[26][27]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":24,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"794a7a47-8157-5a75-b3d4-7bd51dd9f04e\\",\\"score\\":0.523991107940674,\\"metadata\\":{\\"text\\":\\"The works of Aristotle that have survived from antiquity through medieval manuscript transmission are collected in the Corpus Aristotelicum. These texts, as opposed to Aristotle\'s lost works, are technical philosophical treatises from within Aristotle\'s school.[205] Reference to them is made according to the organization of Immanuel Bekker\'s Royal Prussian Academy edition (Aristotelis Opera edidit Academia Regia Borussica, Berlin, 1831\\u20131870), which in turn is based on ancient classifications of these\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":219,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"30561c1d-cdf7-5281-beb5-25d98903fb8f\\",\\"score\\":0.523762373700001,\\"metadata\\":{\\"text\\":\\"Charles Darwin regarded Aristotle as the most important contributor to the subject of biology. In an 1882 letter he wrote that \\\\\\"Linnaeus and Cuvier have been my two gods, though in very different ways, but they were mere schoolboys to old Aristotle\\\\\\".[187][188] Also, in later editions of the book \\\\\\"On the Origin of Species\', Darwin traced evolutionary ideas as far back as Aristotle;[189] the text he cites is a summary by Aristotle of the ideas of the earlier Greek philosopher Empedocles.[190]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":210,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"a0dafc12-79f1-506c-8916-41733f864a76\\",\\"score\\":0.522506475448608,\\"metadata\\":{\\"text\\":\\"traditions from his teacher. 
Although his respect for Aristotle was diminished as his travels made it clear that much of Aristotle\'s geography was clearly wrong, when the old philosopher released his works to the public, Alexander complained \\\\\\"Thou hast not done well to publish thy acroamatic doctrines; for in what shall I surpass other men if those doctrines wherein I have been trained are to be all men\'s common property?\\\\\\"[155]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":182,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"d2e3d3b2-a20d-510a-a96f-9705b02411be\\",\\"score\\":0.5219854116439819,\\"metadata\\":{\\"text\\":\\"Moses Maimonides (considered to be the foremost intellectual figure of medieval Judaism)[179] adopted Aristotelianism from the Islamic scholars and based his Guide for the Perplexed on it and that became the basis of Jewish scholastic philosophy. Maimonides also considered Aristotle to be the greatest philosopher that ever lived, and styled him as the \\\\\\"chief of the philosophers\\\\\\".[180][181][182] Also, in his letter to Samuel ibn Tibbon, Maimonides observes that there is no need for Samuel to study the\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":203,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"c2c7aa9c-8a52-5ccf-b5e6-ab9e918cac18\\",\\"score\\":0.5219346284866331,\\"metadata\\":{\\"text\\":\\"Metaphysics\\\\nMain article: Metaphysics (Aristotle)\\\\nThe word \\\\\\"metaphysics\\\\\\" appears to have been coined by the first century AD editor who assembled various small selections of Aristotle\'s works to the treatise we know by the name Metaphysics.[34] Aristotle called it \\\\\\"first philosophy\\\\\\", and distinguished it from mathematics and natural science (physics) as the contemplative (theoretik\\u0113) philosophy which is \\\\\\"theological\\\\\\" and studies the divine. He wrote in his Metaphysics (1026a16):\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":30,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"fad6a42d-845f-5f75-9416-560f6bdfc622\\",\\"score\\":0.5203073620796199,\\"metadata\\":{\\"text\\":\\"Near the end of his life, Alexander and Aristotle became estranged over Alexander\'s relationship with Persia and Persians. A widespread tradition in antiquity suspected Aristotle of playing a role in Alexander\'s death, but the only evidence of this is an unlikely claim made some six years after the death.[16] Following Alexander\'s death, anti-Macedonian sentiment in Athens was rekindled. 
In 322 BC, Demophilus and Eurymedon the Hierophant reportedly denounced Aristotle for impiety,[17] prompting him to flee\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":21,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"ccfa969d-8634-5dd8-8b8f-e664667f95a0\\",\\"score\\":0.520039439201355,\\"metadata\\":{\\"text\\":\\"In addition to his works on ethics, which address the individual, Aristotle addressed the city in his work titled Politics. Aristotle considered the city to be a natural community. Moreover, he considered the city to be prior in importance to the family, which in turn is prior to the individual, \\\\\\"for the whole must of necessity be prior to the part\\\\\\".[120] He famously stated that \\\\\\"man is by nature a political animal\\\\\\" and argued that humanity\'s defining factor among others in the animal kingdom is its\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":142,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"f5db5c10-ace8-5a28-9b15-d1ef88ac2754\\",\\"score\\":0.519339799880981,\\"metadata\\":{\\"text\\":\\"sense, Aristotle\'s biology is scientific.[78]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":96,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"5c80f870-f275-5109-a752-5f9e0caf859c\\",\\"score\\":0.517741410866893,\\"metadata\\":{\\"text\\":\\"Aristotle\'s \\\\\\"natural philosophy\\\\\\" spans a wide range of natural phenomena including those now covered by physics, biology and other natural sciences.[40] In Aristotle\'s terminology, \\\\\\"natural philosophy\\\\\\" is a branch of philosophy examining the phenomena of the natural world, and includes fields that would be regarded today as physics, biology and other natural sciences. Aristotle\'s work encompassed virtually all facets of intellectual inquiry. Aristotle makes philosophy in the broad sense coextensive with\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":51,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"87154733-18e0-56c0-9d00-a6e3d3601277\\",\\"score\\":0.513434050865886,\\"metadata\\":{\\"text\\":\\"Aristotle\'s writings on motion remained influential until the Early Modern period. John Philoponus (in Late antiquity) and Galileo (in Early modern period) are said to have shown by experiment that Aristotle\'s claim that a heavier object falls faster than a lighter object is incorrect.[40] A contrary opinion is given by Carlo Rovelli, who argues that Aristotle\'s physics of motion is correct within its domain of validity, that of objects in the Earth\'s gravitational field immersed in a fluid such as air. 
In\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":64,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"d7fed20f-2710-529e-8518-33e31335ee4e\\",\\"score\\":0.513187944889069,\\"metadata\\":{\\"text\\":\\"Aristotle taught that virtue has to do with the proper function (ergon) of a thing. An eye is only a good eye in so much as it can see, because the proper function of an eye is sight. Aristotle reasoned that humans must have a function specific to humans, and that this function must be an activity of the psuch\\u0113 (soul) in accordance with reason (logos). Aristotle identified such an optimum activity (the virtuous mean, between the accompanying vices of excess or deficiency[15]) of the soul as the aim of all\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":137,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"604d5540-35cf-5575-90ac-e77cddf30b79\\",\\"score\\":0.50912079269414,\\"metadata\\":{\\"text\\":\\"Aristotle also made many observations about the hydrologic cycle. For example, he made some of the earliest observations about desalination: he observed early \\u2013 and correctly \\u2013 that when seawater is heated, freshwater evaporates and that the oceans are then replenished by the cycle of rainfall and river runoff (\\\\\\"I have proved by experiment that salt water evaporated forms fresh and the vapor does not when it condenses condense into sea water again.\\\\\\")[60]\\\\n\\\\nBiology\\\\nMain article: Aristotle\'s biology\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":83,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"a1d385d3-062f-5b72-9ee6-8b6ba0d68140\\",\\"score\\":0.506430864334106,\\"metadata\\":{\\"text\\":\\"According to Strabo and Plutarch, after Aristotle\'s death, his library and writings went to Theophrastus (Aristotle\'s successor as head of the Lycaeum and the Peripatetic school).[215] After the death of Theophrastus, the peripatetic library went to Neleus of Scepsis.[216]:\\u200a5\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":225,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"3f156748-3db2-5d2a-bce2-4267f9b46784\\",\\"score\\":0.502232283220752,\\"metadata\\":{\\"text\\":\\"Most of Aristotle\'s work is probably not in its original form, because it was most likely edited by students and later lecturers. 
The logical works of Aristotle were compiled into a set of six books called the Organon around 40 BC by Andronicus of Rhodes or others among his followers.[28] The books are:\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":25,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"75295549-fd8c-562c-8a19-26ba3276c9e8\\",\\"score\\":0.500771760848554,\\"metadata\\":{\\"text\\":\\"Islamic portrayal of Aristotle (right) in the Kit\\u0101b na\\u02bft al-\\u1e25ayaw\\u0101n, c.\\u20091220.[166]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":193,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"bf8a1c08-f7fb-5636-83bf-87b9af9561e1\\",\\"score\\":0.500191748119388,\\"metadata\\":{\\"text\\":\\"Aristotle did not do experiments in the modern sense.[74] He used the ancient Greek term pepeiramenoi to mean observations, or at most investigative procedures like dissection.[75] In Generation of Animals, he finds a fertilized hen\'s egg of a suitable stage and opens it to see the embryo\'s heart beating inside.[76][77]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":94,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"ace70c54-12d0-5dc2-9b27-b566e985e0f7\\",\\"score\\":0.49837121377162896,\\"metadata\\":{\\"text\\":\\"Portrait bust of Aristotle; an Imperial Roman (1st or 2nd century AD) copy of a lost bronze sculpture made by Lysippos.\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":18,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"900f6eb4-d13b-5129-9ed6-3f78b31b83b1\\",\\"score\\":0.49812573194503795,\\"metadata\\":{\\"text\\":\\"The first medical teacher at Alexandria, Herophilus of Chalcedon, corrected Aristotle, placing intelligence in the brain, and connected the nervous system to motion and sensation. 
Herophilus also distinguished between veins and arteries, noting that the latter pulse while the former do not.[157] Though a few ancient atomists such as Lucretius challenged the teleological viewpoint of Aristotelian ideas about life, teleology (and after the rise of Christianity, natural theology) would remain central to\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":184,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"4c6bf2b3-9140-5fe5-adba-27579b339647\\",\\"score\\":0.49481466443451305,\\"metadata\\":{\\"text\\":\\"One of Aristotle\'s types of syllogism[D]\\\\nIn words\\\\tIn\\\\nterms[E]\\\\tIn equations[F]\\\\n All men are mortal.\\\\n\\\\n All Greeks are men.\\\\n\\\\n\\u2234 All Greeks are mortal.\\\\tM a P\\\\n\\\\nS a M\\\\n\\\\nS a P\\\\nWhat is today called Aristotelian logic with its types of syllogism (methods of logical argument),[32] Aristotle himself would have labelled \\\\\\"analytics\\\\\\". The term \\\\\\"logic\\\\\\" he reserved to mean dialectics.\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":29,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"d1966c66-9f90-5d28-81c2-80e662e3b977\\",\\"score\\":0.49248007029532903,\\"metadata\\":{\\"text\\":\\"terms \\\\\\"lower\\\\\\" and \\\\\\"upper\\\\\\" to designate taxa such as groups of plants.[199] The evolutionary biologist Armand Marie Leroi has reconstructed Aristotle\'s biology,[200] while Niko Tinbergen\'s four questions, based on Aristotle\'s four causes, are used to analyse animal behaviour; they examine function, phylogeny, mechanism, and ontogeny.[201][202] The concept of homology began with Aristotle;[203] the evolutionary developmental biologist Lewis I. Held commented that he would be interested in the concept of deep\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":215,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"2eeb68da-6525-56d6-be4d-d42ab93c4172\\",\\"score\\":0.49181364403489203,\\"metadata\\":{\\"text\\":\\"Aristotle taught that to achieve a virtuous and potentially happy character requires a first stage of having the fortune to be habituated not deliberately, but by teachers, and experience, leading to a later stage in which one consciously chooses to do the best things. 
When the best people come to live life this way their practical wisdom (phronesis) and their intellect (nous) can develop with each other towards the highest possible human virtue, the wisdom of an accomplished theoretical or speculative\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":139,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"93bcfe9d-e093-5543-b47b-3e0b9b075801\\",\\"score\\":0.491194203233546,\\"metadata\\":{\\"text\\":\\"Aristotle was one of the first people to record any geological observations. He stated that geological change was too slow to be observed in one person\'s lifetime.[56][57] The geologist Charles Lyell noted that Aristotle described such change, including \\\\\\"lakes that had dried up\\\\\\" and \\\\\\"deserts that had become watered by rivers\\\\\\", giving as examples the growth of the Nile delta since the time of Homer, and \\\\\\"the upheaving of one of the Aeolian islands, previous to a volcanic eruption.\\\\\\"\'[58]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":81,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"399db731-efe2-5198-84b7-5b9cdf6d3a08\\",\\"score\\":0.48828518320829895,\\"metadata\\":{\\"text\\":\\"Poetics\\\\nMain article: Poetics (Aristotle)\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":166,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"8f369acc-7504-5cec-ad88-89cb1d3889c0\\",\\"score\\":0.487353653469465,\\"metadata\\":{\\"text\\":\\"Epistemology\\\\nAristotle\'s immanent realism means his epistemology is based on the study of things that exist or happen in the world, and rises to knowledge of the universal, whereas for Plato epistemology begins with knowledge of universal Forms (or ideas) and descends to knowledge of particular imitations of these.[31] Aristotle uses induction from examples alongside deduction, whereas Plato relies on deduction from a priori principles.[31]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":49,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"084c501e-c5e7-53c8-9316-4734ba5f5045\\",\\"score\\":0.48603272438049305,\\"metadata\\":{\\"text\\":\\"Through meticulous commentaries and critical engagements, figures like Al-Farabi and Ibn Sina (Avicenna) breathed new life into Aristotle\'s ideas. They harmonized his logic with Islamic theology, employed his scientific methodologies to explore the natural world, and even reinterpreted his ethics within the framework of Islamic morality. This revival was not mere imitation. 
Islamic thinkers embraced Aristotle\'s rigorous methods while simultaneously challenging his conclusions where they diverged from their\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":187,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"a4c8191b-7a5c-576b-883d-4ffdc9840553\\",\\"score\\":0.484963147936056,\\"metadata\\":{\\"text\\":\\"Greek Christian scribes played a crucial role in the preservation of Aristotle by copying all the extant Greek language manuscripts of the corpus. The first Greek Christians to comment extensively on Aristotle were Philoponus, Elias, and David in the sixth century, and Stephen of Alexandria in the early seventh century.[162] John Philoponus stands out for having attempted a fundamental critique of Aristotle\'s views on the eternity of the world, movement, and other elements of Aristotelian thought.[163]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":190,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"65c94370-53be-543f-bf07-c9c8ac8a5b3c\\",\\"score\\":0.48229515552520796,\\"metadata\\":{\\"text\\":\\"is Aristotle\'s division of sensation and thought, which generally differed from the concepts of previous philosophers, with the exception of Alcmaeon.[95]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":111,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"b95901dc-798d-5206-994d-9948ebf10aba\\",\\"score\\":0.48190322518348705,\\"metadata\\":{\\"text\\":\\"coextensive with reasoning, which he also would describe as \\\\\\"science\\\\\\". However, his use of the term science carries a different meaning than that covered by the term \\\\\\"scientific method\\\\\\". For Aristotle, \\\\\\"all science (dianoia) is either practical, poetical or theoretical\\\\\\" (Metaphysics 1025b25). 
His practical science includes ethics and politics; his poetical science means the study of fine arts including poetry; his theoretical science covers physics, mathematics and metaphysics.[40]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":52,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"7d161587-aad7-5a0a-9e16-01b8542cb93c\\",\\"score\\":0.47876675237120203,\\"metadata\\":{\\"text\\":\\"\\\\\\"Aristotle tutoring Alexander\\\\\\" by Jean Leon Gerome Ferris.\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":14,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"c890996c-f87d-51f2-8d47-316d460893f3\\",\\"score\\":0.477234495630966,\\"metadata\\":{\\"text\\":\\"Samuel to study the writings of philosophers who preceded Aristotle because the works of the latter are \\\\\\"sufficient by themselves and [superior] to all that were written before them. His intellect, Aristotle\'s is the extreme limit of human intellect, apart from him upon whom the divine emanation has flowed forth to such an extent that they reach the level of prophecy, there being no level higher\\\\\\".[183]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":204,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"c2892882-5619-5283-a785-c1701168b0c9\\",\\"score\\":0.476796509218425,\\"metadata\\":{\\"text\\":\\"As Plato\'s disciple Aristotle was rather critical concerning democracy and, following the outline of certain ideas from Plato\'s Statesman, he developed a coherent theory of integrating various forms of power into a so-called mixed state:\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":150,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"91f7b4b9-700c-581c-bac0-028515f34bae\\",\\"score\\":0.47289261221885703,\\"metadata\\":{\\"text\\":\\"Aristotle suggested that the reason for anything coming about can be attributed to four different types of simultaneously active factors. 
His term aitia is traditionally translated as \\\\\\"cause\\\\\\", but it does not always refer to temporal sequence; it might be better translated as \\\\\\"explanation\\\\\\", but the traditional rendering will be employed here.[48][49]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":68,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"195a8d75-0627-5531-bef5-5354cbfd8bdf\\",\\"score\\":0.472823365712862,\\"metadata\\":{\\"text\\":\\"Byzantine scholars\\\\nSee also: Commentaries on Aristotle and Byzantine Aristotelianism\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":189,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"e639d635-0587-59dc-b038-21633a6e025d\\",\\"score\\":0.47108021378517195,\\"metadata\\":{\\"text\\":\\"The order of the books (or the teachings from which they are composed) is not certain, but this list was derived from analysis of Aristotle\'s writings. It goes from the basics, the analysis of simple terms in the Categories, the analysis of propositions and their elementary relations in On Interpretation, to the study of more complex forms, namely, syllogisms (in the Analytics)[29][30] and dialectics (in the Topics and Sophistical Refutations). The first three treatises form the core of the logical theory\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":27,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"7f0b2712-d35d-5ceb-bacb-04b3916a9cc2\\",\\"score\\":0.46897770371966696,\\"metadata\\":{\\"text\\":\\"Aristotle believed the chain of thought, which ends in recollection of certain impressions, was connected systematically in relationships such as similarity, contrast, and contiguity, described in his laws of association. Aristotle believed that past experiences are hidden within the mind. A force operates to awaken the hidden material to bring up the actual experience. According to Aristotle, association is the power innate in a mental state, which operates upon the unexpressed remains of former\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":123,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"ed4d0d2c-73dc-5baa-8351-065482dbdf9b\\",\\"score\\":0.46736327008227996,\\"metadata\\":{\\"text\\":\\"According to scholar Roger Theodore Lafferty, Dante built up the philosophy of the Comedy with the works of Aristotle as a foundation, just as the scholastics used Aristotle as the basis for their thinking. 
Dante knew Aristotle directly from Latin translations of his works and indirectly through quotations in the works of Albert Magnus.[175] Dante even acknowledges Aristotle\'s influence explicitly in the poem, specifically when Virgil justifies the Inferno\'s structure by citing the Nicomachean Ethics.[176]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":200,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"e7513658-792c-5fb4-9458-162178a5b72b\\",\\"score\\":0.465195149183273,\\"metadata\\":{\\"text\\":\\"but while Aristotle was aware that new mutations or hybridizations could occur, he saw these as rare accidents. For Aristotle, accidents, like heat waves in winter, must be considered distinct from natural causes. He was thus critical of Empedocles\'s materialist theory of a \\\\\\"survival of the fittest\\\\\\" origin of living things and their organs, and ridiculed the idea that accidents could lead to orderly results.[72] To put his views into modern terms, he nowhere says that different species can have a common\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":90,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"f475f7c3-1f6a-5867-8e0c-af0054425446\\",\\"score\\":0.46305066347122203,\\"metadata\\":{\\"text\\":\\"When Sulla seized Athens in 86 BC, he seized the library and transferred it to Rome. There, Andronicus of Rhodes organized the texts into the first complete edition of Aristotle\'s works (and works attributed to him).[217] The Aristotelian texts we have today are based on these.[216]:\\u200a6\\u20138\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":229,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"a586637b-49bb-59c8-b33a-5db313349a89\\",\\"score\\":0.462634655080161,\\"metadata\\":{\\"text\\":\\"Aristotle distinguished about 500 species of animals,[82][83] arranging these in the History of Animals in a graded scale of perfection, a nonreligious version of the scala naturae, with man at the top. His system had eleven grades of animal, from highest potential to lowest, expressed in their form at birth: the highest gave live birth to hot and wet creatures, the lowest laid cold, dry mineral-like eggs. 
Animals came above plants, and these in turn were above minerals.[84][85] He grouped what the modern\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":101,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"e3fecc19-7a17-59c2-ba78-f5f2b7c98fb9\\",\\"score\\":0.46187688069435096,\\"metadata\\":{\\"text\\":\\"School of Aristotle in Mieza, Macedonia, Greece.\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":10,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"baddd21e-a650-5789-b8ef-cfe8468da3dc\\",\\"score\\":0.45838105430490195,\\"metadata\\":{\\"text\\":\\"Plato\'s forms exist as universals, like the ideal form of an apple. For Aristotle, both matter and form belong to the individual thing (hylomorphism).\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":36,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"e15c86e4-94c7-5b98-aafa-64f117bd3e75\\",\\"score\\":0.45783278597398,\\"metadata\\":{\\"text\\":\\"about it.[98] Aristotle\'s other criticism is that Plato\'s view of reincarnation entails that it is possible for a soul and its body to be mis-matched; in principle, Aristotle alleges, any soul can go with any body, according to Plato\'s theory.[99] Aristotle\'s claim that the soul is the form of a living being eliminates that possibility and thus rules out reincarnation.[100]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":113,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"feb50519-0826-5af7-9ef2-13f642f968ca\\",\\"score\\":0.45751739054711305,\\"metadata\\":{\\"text\\":\\"In On the Soul, Aristotle famously criticizes Plato\'s theory of the soul and develops his own in response. The first criticism is against Plato\'s view of the soul in the Timaeus that the soul takes up space and is able to come into physical contact with bodies.[96] 20th-century scholarship overwhelmingly opposed Aristotle\'s interpretation of Plato and maintained that he had misunderstood him.[97] Today\'s scholars have tended to re-assess Aristotle\'s interpretation and been more positive about it.[98]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":112,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"fa12e1f8-09bb-5b05-8606-f2c141b484be\\",\\"score\\":0.45727583765983604,\\"metadata\\":{\\"text\\":\\"kingdom is its rationality.[121] Aristotle conceived of politics as being like an organism rather than like a machine, and as a collection of parts none of which can exist without the others. 
Aristotle\'s conception of the city is organic, and he is considered one of the first to conceive of the city in this manner.[122]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":143,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"70eb9c23-af31-5dda-a721-5b6c8577d64e\\",\\"score\\":0.45682342221806704,\\"metadata\\":{\\"text\\":\\"Abelard, and John Buridan worked on Aristotelian logic.[174]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":199,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"bf6b1961-f96b-50c5-b8a9-c60a7b80bc0b\\",\\"score\\":0.452890230801411,\\"metadata\\":{\\"text\\":\\"studied and made significant contributions to \\\\\\"logic, metaphysics, mathematics, physics, biology, botany, ethics, politics, agriculture, medicine, dance, and theatre.\\\\\\"[15]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":20,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"694a4266-0e4c-5691-948e-05dacf99a30a\\",\\"score\\":0.45269417762756303,\\"metadata\\":{\\"text\\":\\"Revival\\\\nIn the slumbering centuries following the decline of the Roman Empire, Aristotle\'s vast philosophical and scientific corpus lay largely dormant in the West. But in the burgeoning intellectual heartland of the Abbasid Caliphate, his works underwent a remarkable revival.[159] Translated into Arabic alongside other Greek classics, Aristotle\'s logic, ethics, and natural philosophy ignited the minds of early Islamic scholars.[160]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":186,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"a614876f-67b2-58e7-aaf1-12917410baff\\",\\"score\\":0.44979420603548304,\\"metadata\\":{\\"text\\":\\"Pythias, either Hermias\'s adoptive daughter or niece. They had a daughter, whom they also named Pythias. 
In 343 BC, Aristotle was invited by Philip II of Macedon to become the tutor to his son Alexander.[11][12]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":13,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"cd03f71f-20c8-501b-9468-cfce807ab7a7\\",\\"score\\":0.44468317095242205,\\"metadata\\":{\\"text\\":\\"Economics\\\\nMain article: Politics (Aristotle)\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":155,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"8897fd3b-6eae-576b-9322-6dc057fbeb69\\",\\"score\\":0.444220006465912,\\"metadata\\":{\\"text\\":\\"not intended for publication.[210][208] Cicero\'s description of Aristotle\'s literary style as \\\\\\"a river of gold\\\\\\" must have applied to the published works, not the surviving notes.[Q] A major question in the history of Aristotle\'s works is how the exoteric writings were all lost, and how the ones now possessed came to be found.[212] The consensus is that Andronicus of Rhodes collected the esoteric works of Aristotle\'s school which existed in the form of smaller, separate works, distinguished them from those\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":223,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"f8e157ac-289f-57f7-9912-678686e2e358\\",\\"score\\":0.44353434443473805,\\"metadata\\":{\\"text\\":\\"In his On Generation and Corruption, Aristotle related each of the four elements proposed earlier by Empedocles, earth, water, air, and fire, to two of the four sensible qualities, hot, cold, wet, and dry. In the Empedoclean scheme, all matter was made of the four elements, in differing proportions. Aristotle\'s scheme added the heavenly aether, the divine substance of the heavenly spheres, stars and planets.[41]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":55,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"3728064f-4f14-5a96-a9ac-318385e7a9c2\\",\\"score\\":0.441104352474213,\\"metadata\\":{\\"text\\":\\"One component of Aristotle\'s theory of dreams disagrees with previously held beliefs. He claimed that dreams are not foretelling and not sent by a divine being. Aristotle reasoned naturalistically that instances in which dreams do resemble future events are simply coincidences.[116] Aristotle claimed that a dream is first established by the fact that the person is asleep when they experience it. 
If a person had an image appear for a moment after waking up or if they see something in the dark it is not\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":132,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"be401ab8-f58a-59da-a40a-deceae5b2fc2\\",\\"score\\":0.43738391622209105,\\"metadata\\":{\\"text\\":\\"Aristotle\'s Rhetoric proposes that a speaker can use three basic kinds of appeals to persuade his audience: ethos (an appeal to the speaker\'s character), pathos (an appeal to the audience\'s emotion), and logos (an appeal to logical reasoning).[130] He also categorizes rhetoric into three genres: epideictic (ceremonial speeches dealing with praise or blame), forensic (judicial speeches over guilt or innocence), and deliberative (speeches calling on an audience to make a decision on an issue).[131] Aristotle\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":164,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"e663ee7c-62bc-52cf-b4a1-197ae5e91d4b\\",\\"score\\":0.43465039134025596,\\"metadata\\":{\\"text\\":\\"Instead, he practiced a different style of science: systematically gathering data, discovering patterns common to whole groups of animals, and inferring possible causal explanations from these.[78][79] This style is common in modern biology when large amounts of data become available in a new field, such as genomics. It does not result in the same certainty as experimental science, but it sets out testable hypotheses and constructs a narrative explanation of what is observed. In this sense, Aristotle\'s\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":95,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"dd14665e-c7e3-56af-9d31-dac1e78b5b9f\\",\\"score\\":0.43291583257412003,\\"metadata\\":{\\"text\\":\\"Aristotle examines the concepts of substance (ousia) and essence (to ti \\u00ean einai, \\\\\\"the what it was to be\\\\\\") in his Metaphysics (Book VII), and he concludes that a particular substance is a combination of both matter and form, a philosophical theory called hylomorphism. In Book VIII, he distinguishes the matter of the substance as the substratum, or the stuff of which it is composed. For example, the matter of a house is the bricks, stones, timbers, etc., or whatever constitutes the potential house, while\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":33,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"58ba0cc5-4324-5dd5-a4d4-5338cdf69b0e\\",\\"score\\":0.432026573177988,\\"metadata\\":{\\"text\\":\\"Hellenistic science\\\\nFurther information: Ancient Greek medicine\\\\nAfter Theophrastus, the Lyceum failed to produce any original work. 
Though interest in Aristotle\'s ideas survived, they were generally taken unquestioningly.[156] It is not until the age of Alexandria under the Ptolemies that advances in biology can be again found.\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":183,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"a5ddeb77-7da5-561f-af07-ce2a71d1a8dd\\",\\"score\\":0.430962983908476,\\"metadata\\":{\\"text\\":\\"Medieval Europe\\\\nFurther information: Aristotelianism and Syllogism \\u00a7 Medieval\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":196,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"3932c7f5-e252-5209-a9a3-48c309e2bbf4\\",\\"score\\":0.43086590950882,\\"metadata\\":{\\"text\\":\\"\\ud835\\udc63\\\\n=\\\\n\\ud835\\udc50\\\\n\\ud835\\udc4a\\\\n\\ud835\\udf0c{\\\\\\\\displaystyle v=c{\\\\\\\\frac {W}{\\\\\\\\rho }}}\\\\nAristotle implies that in a vacuum the speed of fall would become infinite, and concludes from this apparent absurdity that a vacuum is not possible.[45][43] Opinions have varied on whether Aristotle intended to state quantitative laws. Henri Carteron held the \\\\\\"extreme view\\\\\\"[43] that Aristotle\'s concept of force was basically qualitative,[46] but other authors reject this.[43]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":62,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"b4134b8e-7569-5df5-93f3-9803da687c7a\\",\\"score\\":0.427144382766225,\\"metadata\\":{\\"text\\":\\"thought.[163] Philoponus questioned Aristotle\'s teaching of physics, noting its flaws and introducing the theory of impetus to explain his observations.[164]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":191,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"ef9af951-f06e-5c1a-8c57-8964ee147dfa\\",\\"score\\":0.426049922344094,\\"metadata\\":{\\"text\\":\\"For Aristotle, the soul is the form of a living being. Because all beings are composites of form and matter, the form of living beings is that which endows them with what is specific to living beings, e.g. 
the ability to initiate movement (or in the case of plants, growth and transformations, which Aristotle considers types of movement).[11] In contrast to earlier philosophers, but in accordance with the Egyptians, he placed the rational soul in the heart, rather than the brain.[94] Notable is Aristotle\'s\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":110,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"71044b07-177e-5e0a-8267-941c535226f0\\",\\"score\\":0.423012614250183,\\"metadata\\":{\\"text\\":\\"exceptions, such as that sharks had a placenta like the tetrapods. To a modern biologist, the explanation, not available to Aristotle, is convergent evolution.[86] Philosophers of science have generally concluded that Aristotle was not interested in taxonomy,[87][88] but zoologists who studied this question in the early 21st century think otherwise.[89][90][91] He believed that purposive final causes guided all natural processes; this teleological view justified his observed data as an expression of formal\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":103,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}"]'}]}, + {}, + { + "run_id": ..., + "run_type": "search", + "entries": [ + {"key": "search_latency", "value": "0.47"}, + { + "key": "search_results", + "value": '["{\\"id\\":\\"c818bc72-2ac8-581b-b51a-0ca826f5f2b8\\",\\"score\\":1.0,\\"metadata\\":{\\"text\\":\\"Aristotle was born in 384 BC[C] in Stagira, Chalcidice,[2] about 55 km (34 miles) east of modern-day Thessaloniki.[3][4] His father, Nicomachus, was the personal physician to King Amyntas of Macedon. While he was young, Aristotle learned about biology and medical information, which was taught by his father.[5] Both of Aristotle\'s parents died when he was about thirteen, and Proxenus of Atarneus became his guardian.[6] Although little information about Aristotle\'s childhood has survived, he probably spent\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":8,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"5f6213d1-a46b-5ed4-a15c-e95bab271621\\",\\"score\\":1.0,\\"metadata\\":{\\"text\\":\\"Aristotle was one of the most revered Western thinkers in early Islamic theology. Most of the still extant works of Aristotle,[167] as well as a number of the original Greek commentaries, were translated into Arabic and studied by Muslim philosophers, scientists and scholars. Averroes, Avicenna and Alpharabius, who wrote on Aristotle in great depth, also influenced Thomas Aquinas and other Western Christian scholastic philosophers. 
Alkindus greatly admired Aristotle\'s philosophy,[168] and Averroes spoke of\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":194,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"e6c5d5f6-7fc4-5bb8-847d-44cfa16f5178\\",\\"score\\":1.0,\\"metadata\\":{\\"text\\":\\"Little is known about Aristotle\'s life. He was born in the city of Stagira in northern Greece during the Classical period. His father, Nicomachus, died when Aristotle was a child, and he was brought up by a guardian. At 17 or 18, he joined Plato\'s Academy in Athens and remained there until the age of 37 (c.\\u2009347 BC). Shortly after Plato died, Aristotle left Athens and, at the request of Philip II of Macedon, tutored his son Alexander the Great beginning in 343 BC. He established a library in the Lyceum,\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":1,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"1175585b-fc58-5e44-bfcb-cb1996289936\\",\\"score\\":1.0,\\"metadata\\":{\\"text\\":\\"after friends and relatives, and to deal with the latter as with beasts or plants\\\\\\".[13] By 335 BC, Aristotle had returned to Athens, establishing his own school there known as the Lyceum. Aristotle conducted courses at the school for the next twelve years. While in Athens, his wife Pythias died and Aristotle became involved with Herpyllis of Stagira. They had a son whom Aristotle named after his father, Nicomachus. If the Suda \\u2013 an uncritical compilation from the Middle Ages \\u2013 is accurate, he may also have\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":16,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"5fb90839-d04c-50b0-8f7f-ffc1a938c019\\",\\"score\\":1.0,\\"metadata\\":{\\"text\\":\\"Aristotle was appointed as the head of the royal Academy of Macedon. During Aristotle\'s time in the Macedonian court, he gave lessons not only to Alexander but also to two other future kings: Ptolemy and Cassander.[13] Aristotle encouraged Alexander toward eastern conquest, and Aristotle\'s own attitude towards Persia was unabashedly ethnocentric. In one famous example, he counsels Alexander to be \\\\\\"a leader to the Greeks and a despot to the barbarians, to look after the former as after friends and\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":15,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"8550e2b7-43f8-5a59-9c13-c9678670a2da\\",\\"score\\":1.0,\\"metadata\\":{\\"text\\":\\"The immediate influence of Aristotle\'s work was felt as the Lyceum grew into the Peripatetic school. Aristotle\'s students included Aristoxenus, Dicaearchus, Demetrius of Phalerum, Eudemos of Rhodes, Harpalus, Hephaestion, Mnason of Phocis, Nicomachus, and Theophrastus. 
Aristotle\'s influence over Alexander the Great is seen in the latter\'s bringing with him on his expedition a host of zoologists, botanists, and researchers. He had also learned a great deal about Persian customs and traditions from his\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":181,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"7ed3a01c-88dc-5a58-a68b-6e5d9f292df2\\",\\"score\\":1.0,\\"metadata\\":{\\"text\\":\\"Aristotle[A] (Greek: \\u1f08\\u03c1\\u03b9\\u03c3\\u03c4\\u03bf\\u03c4\\u03ad\\u03bb\\u03b7\\u03c2 Aristot\\u00e9l\\u0113s, pronounced [aristot\\u00e9l\\u025b\\u02d0s]; 384\\u2013322 BC) was an Ancient Greek philosopher and polymath. His writings cover a broad range of subjects spanning the natural sciences, philosophy, linguistics, economics, politics, psychology, and the arts. As the founder of the Peripatetic school of philosophy in the Lyceum in Athens, he began the wider Aristotelian tradition that followed, which set the groundwork for the development of modern science.\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":0,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"ed4d0d2c-73dc-5baa-8351-065482dbdf9b\\",\\"score\\":1.0,\\"metadata\\":{\\"text\\":\\"According to scholar Roger Theodore Lafferty, Dante built up the philosophy of the Comedy with the works of Aristotle as a foundation, just as the scholastics used Aristotle as the basis for their thinking. Dante knew Aristotle directly from Latin translations of his works and indirectly through quotations in the works of Albert Magnus.[175] Dante even acknowledges Aristotle\'s influence explicitly in the poem, specifically when Virgil justifies the Inferno\'s structure by citing the Nicomachean Ethics.[176]\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":200,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"e6f58828-2e6d-5eb1-94f3-efbc0b7c1699\\",\\"score\\":1.0,\\"metadata\\":{\\"text\\":\\"Aristotle was revered among medieval Muslim scholars as \\\\\\"The First Teacher\\\\\\", and among medieval Christians like Thomas Aquinas as simply \\\\\\"The Philosopher\\\\\\", while the poet Dante called him \\\\\\"the master of those who know\\\\\\". His works contain the earliest known formal study of logic, and were studied by medieval scholars such as Peter Abelard and Jean Buridan. Aristotle\'s influence on logic continued well into the 19th century. 
In addition, his ethics, although always influential, gained renewed interest with\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":5,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}", "{\\"id\\":\\"5c80f870-f275-5109-a752-5f9e0caf859c\\",\\"score\\":1.0,\\"metadata\\":{\\"text\\":\\"Aristotle\'s \\\\\\"natural philosophy\\\\\\" spans a wide range of natural phenomena including those now covered by physics, biology and other natural sciences.[40] In Aristotle\'s terminology, \\\\\\"natural philosophy\\\\\\" is a branch of philosophy examining the phenomena of the natural world, and includes fields that would be regarded today as physics, biology and other natural sciences. Aristotle\'s work encompassed virtually all facets of intellectual inquiry. Aristotle makes philosophy in the broad sense coextensive with\\",\\"title\\":\\"aristotle.txt\\",\\"user_id\\":\\"063edaf8-3e63-4cb9-a4d6-a855f36376c3\\",\\"version\\":\\"v0\\",\\"chunk_order\\":51,\\"document_id\\":\\"c9bdbac7-0ea3-5c9e-b590-018bd09b127b\\",\\"extraction_id\\":\\"472d6921-b4cd-5514-bf62-90b05c9102cb\\",\\"associatedQuery\\":\\"who is aristotle?\\"}}"]', + }, + {"key": "search_query", "value": "who is aristotle?"}, + ], + }, +] + + +def approx_equal(a, b, tolerance=1e-3): + """Compare two float values for approximate equality.""" + return math.isclose(a, b, rel_tol=tolerance) + + +def compare_search_results(actual, expected): + """Compare search results while allowing for slight differences in scores.""" + actual_results = json.loads(actual) + expected_results = json.loads(expected) + + if len(actual_results) != len(expected_results): + return False + + for actual_item, expected_item in zip(actual_results, expected_results): + actual_dict = json.loads(actual_item) + expected_dict = json.loads(expected_item) + + if actual_dict["id"] != expected_dict["id"]: + raise AssertionError( + f"IDs do not match: {actual_dict['id']} != {expected_dict['id']}" + ) + + if not approx_equal( + actual_dict["score"], expected_dict["score"], tolerance=1e-2 + ): + raise AssertionError( + f"Scores do not match: {actual_dict['score']} != {expected_dict['score']}" + ) + + if actual_dict["metadata"] != expected_dict["metadata"]: + raise AssertionError( + f"Metadata does not match: {actual_dict['metadata']} != {expected_dict['metadata']}" + ) + + return True + + +def test_ingestion_success(wrapper): + """Test the initial successful ingestion process.""" + result = wrapper.ingest_sample_file() + expected_payload = { + "processed_documents": [ + "Document 'aristotle.txt' processed successfully." 
+        ],
+        "failed_documents": [],
+        "skipped_documents": [],
+    }
+    for key in expected_payload:
+        assert key in result
+        assert len(result[key]) == len(expected_payload[key])
+        for i, value in enumerate(result[key]):
+            assert value == expected_payload[key][i]
+    print("Initial ingestion test passed successfully.")
+
+
+def test_full_ingestion_success(wrapper):
+    """Test successful ingestion of the full set of sample files."""
+    result = wrapper.ingest_sample_files()
+
+    expected_processed = {
+        "Document 'lyft_2021.pdf' processed successfully.",
+        "Document 'uber_2021.pdf' processed successfully.",
+        "Document 'pg_essay_3.html' processed successfully.",
+        "Document 'pg_essay_2.html' processed successfully.",
+        "Document 'pg_essay_4.html' processed successfully.",
+        "Document 'got.txt' processed successfully.",
+        "Document 'pg_essay_5.html' processed successfully.",
+        "Document 'pg_essay_1.html' processed successfully.",
+    }
+    expected_failed = set()
+    expected_skipped = {
+        "Document 'aristotle.txt' skipped since it already exists."
+    }
+
+    assert set(result["processed_documents"]) == expected_processed
+    assert set(result["failed_documents"]) == expected_failed
+    assert set(result["skipped_documents"]) == expected_skipped
+
+    assert len(result["processed_documents"]) == len(expected_processed)
+    assert len(result["failed_documents"]) == len(expected_failed)
+    assert len(result["skipped_documents"]) == len(expected_skipped)
+
+    print("Full ingestion test passed successfully.")
+
+
+def test_ingestion_failure(wrapper):
+    """Test that re-ingesting the same file raises the expected error."""
+    try:
+        wrapper.ingest_sample_file()
+        raise AssertionError("Expected an exception, but none was raised.")
+    except Exception as e:
+        assert (
+            str(e)
+            == "Document with ID c9bdbac7-0ea3-5c9e-b590-018bd09b127b was already successfully processed."
+        )
+    print("Subsequent ingestion test passed: Expected error was raised.")
+
+
+def test_logs(wrapper, expected_length):
+    """Test the logging functionality."""
+    logs = wrapper.logs()
+    assert len(logs) == expected_length
+    log = logs[0]
+    expected_log = expected_logs[expected_length - 1]
+
+    assert log["run_id"] is not None
+    assert log["run_type"] == expected_log["run_type"]
+    assert len(log["entries"]) == len(expected_log["entries"])
+
+    for i in range(len(log["entries"])):
+        entry = log["entries"][i]
+        print("entry: ", entry)
+        # Look up the expected entry with the same key; fail if there is none.
+        expected_entry = None
+        for candidate in expected_log["entries"]:
+            if candidate["key"] == entry["key"]:
+                expected_entry = candidate
+                break
+        if expected_entry is None:
+            raise AssertionError(f"Unexpected entry: {entry}")
+        print("expected_entry: ", expected_entry)
+
+        if "latency" in entry["key"]:
+            # Latencies vary between runs, so only check that the key exists.
+            continue
+        elif entry["key"] == "search_results":
+            assert compare_search_results(
+                entry["value"], expected_entry["value"]
+            )
+        else:
+            assert entry["key"] == expected_entry["key"]
+            assert entry["value"] == expected_entry["value"]
+    print("Logs test passed.")
+
+
+def test_vector_search(
+    wrapper, query, expected_scores, do_hybrid_search=False, search_limit=10
+):
+    """Test search functionality with given parameters."""
+    search_results = wrapper.search(
+        query, do_hybrid_search=do_hybrid_search, search_limit=search_limit
+    )
+    assert "vector_search_results" in search_results
+    scores = [
+        result["score"] for result in search_results["vector_search_results"]
+    ]
+    if expected_scores:
+        assert len(scores) == len(expected_scores)
+        assert all(approx_equal(a, b) for a, b in zip(scores, expected_scores))
+    print(
+        f"Search test passed for query: '{query}', hybrid: {do_hybrid_search}, limit: {search_limit}"
+    )
+    return search_results["vector_search_results"]
+
+
+def test_documents_overview(wrapper):
+    """Test the documents_overview functionality."""
+    documents_overview = wrapper.documents_overview()
+    assert len(documents_overview) == 1
+    doc_info = documents_overview[0]
+
+    assert isinstance(doc_info.document_id, UUID)
+    assert doc_info.document_id == UUID("c9bdbac7-0ea3-5c9e-b590-018bd09b127b")
+    assert doc_info.version == "v0"
+    assert doc_info.size_in_bytes == 73353
+    assert doc_info.metadata == {
+        "title": "aristotle.txt",
+        "user_id": "063edaf8-3e63-4cb9-a4d6-a855f36376c3",
+    }
+    assert doc_info.status.value == "success"
+    assert doc_info.user_id == UUID("063edaf8-3e63-4cb9-a4d6-a855f36376c3")
+    assert doc_info.title == "aristotle.txt"
+    assert isinstance(doc_info.created_at, datetime)
+    assert isinstance(doc_info.updated_at, datetime)
+    assert doc_info.created_at.tzinfo == timezone.utc
+    assert doc_info.updated_at.tzinfo == timezone.utc
+
+    print("Documents overview test passed successfully.")
+
+
+def test_users_overview(wrapper):
+    """Test the users_overview functionality."""
+    users_overview = wrapper.users_overview()
+    assert len(users_overview) == 1
+    user_stats = users_overview[0]
+
+    assert isinstance(user_stats.user_id, UUID)
+    assert user_stats.user_id == UUID("063edaf8-3e63-4cb9-a4d6-a855f36376c3")
+    assert user_stats.num_files == 1
+    assert user_stats.total_size_in_bytes == 73353
+    assert len(user_stats.document_ids) == 1
+    assert user_stats.document_ids[0] == UUID(
+        "c9bdbac7-0ea3-5c9e-b590-018bd09b127b"
+    )
+
+    print("Users overview test passed successfully.")
+
+
+def test_document_chunks(wrapper):
+    """Test the document_chunks functionality."""
+    document_chunks = wrapper.document_chunks(
+        "c9bdbac7-0ea3-5c9e-b590-018bd09b127b"
+    )
+    assert len(document_chunks) == 233
+    first_chunk = document_chunks[0]
+    assert (
first_chunk["text"] + == "Aristotle[A] (Greek: Ἀριστοτέλης Aristotélēs, pronounced [aristotélɛːs]; 384–322 BC) was an Ancient Greek philosopher and polymath. His writings cover a broad range of subjects spanning the natural sciences, philosophy, linguistics, economics, politics, psychology, and the arts. As the founder of the Peripatetic school of philosophy in the Lyceum in Athens, he began the wider Aristotelian tradition that followed, which set the groundwork for the development of modern science." + ) + assert first_chunk["chunk_order"] == 0 + print("Document chunks test passed successfully.") + + +def main(): + wrapper = R2RExecutionWrapper(client_mode=False) + + # Test ingestion + test_ingestion_success(wrapper) + + # Test logs + test_logs(wrapper, expected_length=1) + + # Test the document overview table + _ = test_documents_overview(wrapper) + + # Test the users overview table + test_users_overview(wrapper) + + # Test the document chunks method + test_document_chunks(wrapper) + + # Test subsequent ingestion (expecting failure) + test_ingestion_failure(wrapper) + + # Test regular search + regular_expected_scores = [ + 0.7737913131713869, + 0.669298529624939, + 0.652687707703574, + 0.636050164699554, + 0.624127291194959, + 0.619364976882935, + 0.6177915291003779, + 0.606354117393494, + 0.601802307421038, + 0.595915484915322, + ] + _ = test_vector_search( + wrapper, "who is aristotle?", regular_expected_scores + ) + test_logs(wrapper, expected_length=2) + + # Test search with larger limit + large_filter_results = test_vector_search( + wrapper, "who is aristotle?", None, search_limit=100 + ) + assert len(large_filter_results) == 100 + assert approx_equal( + large_filter_results[0]["score"], regular_expected_scores[0] + ) + # test_logs(wrapper, expected_length=3) + + # Test hybrid search + hybrid_expected_zero_result = "Aristotle was born in 384 BC[C] in Stagira, Chalcidice,[2] about 55 km (34 miles) east of modern-day Thessaloniki.[3][4] His father, Nicomachus, was the personal physician to King Amyntas of Macedon. While he was young, Aristotle learned about biology and medical information, which was taught by his father.[5] Both of Aristotle's parents died when he was about thirteen, and Proxenus of Atarneus became his guardian.[6] Although little information about Aristotle's childhood has survived, he probably spent" + hybrid_expected_scores = [1] * 10 + hybrid_results = test_vector_search( + wrapper, + "who is aristotle?", + hybrid_expected_scores, + do_hybrid_search=True, + ) + assert hybrid_results[0]["metadata"]["text"] == hybrid_expected_zero_result + # test_logs(wrapper, expected_length=4) + + test_full_ingestion_success(wrapper) + + +if __name__ == "__main__": + main() diff --git a/R2R/r2r/integrations/__init__.py b/R2R/r2r/integrations/__init__.py new file mode 100755 index 00000000..0830f40c --- /dev/null +++ b/R2R/r2r/integrations/__init__.py @@ -0,0 +1,3 @@ +from .serper import SerperClient + +__all__ = ["SerperClient"] diff --git a/R2R/r2r/integrations/serper.py b/R2R/r2r/integrations/serper.py new file mode 100755 index 00000000..14333d1a --- /dev/null +++ b/R2R/r2r/integrations/serper.py @@ -0,0 +1,103 @@ +import http.client +import json +import os + + +# TODO - Move process json to dedicated data processing module +def process_json(json_object, indent=0): + """ + Recursively traverses the JSON object (dicts and lists) to create an unstructured text blob. 
+ """ + text_blob = "" + if isinstance(json_object, dict): + for key, value in json_object.items(): + padding = " " * indent + if isinstance(value, (dict, list)): + text_blob += ( + f"{padding}{key}:\n{process_json(value, indent + 1)}" + ) + else: + text_blob += f"{padding}{key}: {value}\n" + elif isinstance(json_object, list): + for index, item in enumerate(json_object): + padding = " " * indent + if isinstance(item, (dict, list)): + text_blob += f"{padding}Item {index + 1}:\n{process_json(item, indent + 1)}" + else: + text_blob += f"{padding}Item {index + 1}: {item}\n" + return text_blob + + +# TODO - Introduce abstract "Integration" ABC. +class SerperClient: + def __init__(self, api_base: str = "google.serper.dev") -> None: + api_key = os.getenv("SERPER_API_KEY") + if not api_key: + raise ValueError( + "Please set the `SERPER_API_KEY` environment variable to use `SerperClient`." + ) + + self.api_base = api_base + self.headers = { + "X-API-KEY": api_key, + "Content-Type": "application/json", + } + + @staticmethod + def _extract_results(result_data: dict) -> list: + formatted_results = [] + + for key, value in result_data.items(): + # Skip searchParameters as it's not a result entry + if key == "searchParameters": + continue + + # Handle 'answerBox' as a single item + if key == "answerBox": + value["type"] = key # Add the type key to the dictionary + formatted_results.append(value) + # Handle lists of results + elif isinstance(value, list): + for item in value: + item["type"] = key # Add the type key to the dictionary + formatted_results.append(item) + # Handle 'peopleAlsoAsk' and potentially other single item formats + elif isinstance(value, dict): + value["type"] = key # Add the type key to the dictionary + formatted_results.append(value) + + return formatted_results + + # TODO - Add explicit typing for the return value + def get_raw(self, query: str, limit: int = 10) -> list: + connection = http.client.HTTPSConnection(self.api_base) + payload = json.dumps({"q": query, "num_outputs": limit}) + connection.request("POST", "/search", payload, self.headers) + response = connection.getresponse() + data = response.read() + json_data = json.loads(data.decode("utf-8")) + return SerperClient._extract_results(json_data) + + @staticmethod + def construct_context(results: list) -> str: + # Organize results by type + organized_results = {} + for result in results: + result_type = result.metadata.pop( + "type", "Unknown" + ) # Pop the type and use as key + if result_type not in organized_results: + organized_results[result_type] = [result.metadata] + else: + organized_results[result_type].append(result.metadata) + + context = "" + # Iterate over each result type + for result_type, items in organized_results.items(): + context += f"# {result_type} Results:\n" + for index, item in enumerate(items, start=1): + # Process each item under the current type + context += f"Item {index}:\n" + context += process_json(item) + "\n" + + return context diff --git a/R2R/r2r/main/__init__.py b/R2R/r2r/main/__init__.py new file mode 100755 index 00000000..55a828d6 --- /dev/null +++ b/R2R/r2r/main/__init__.py @@ -0,0 +1,54 @@ +from .abstractions import R2RPipelines, R2RProviders +from .api.client import R2RClient +from .api.requests import ( + R2RAnalyticsRequest, + R2RDeleteRequest, + R2RDocumentChunksRequest, + R2RDocumentsOverviewRequest, + R2REvalRequest, + R2RIngestFilesRequest, + R2RRAGRequest, + R2RSearchRequest, + R2RUpdateFilesRequest, + R2RUpdatePromptRequest, + R2RUsersOverviewRequest, +) +from .app import 
R2RApp +from .assembly.builder import R2RBuilder +from .assembly.config import R2RConfig +from .assembly.factory import ( + R2RPipeFactory, + R2RPipelineFactory, + R2RProviderFactory, +) +from .assembly.factory_extensions import R2RPipeFactoryWithMultiSearch +from .engine import R2REngine +from .execution import R2RExecutionWrapper +from .r2r import R2R + +__all__ = [ + "R2R", + "R2RPipelines", + "R2RProviders", + "R2RUpdatePromptRequest", + "R2RIngestFilesRequest", + "R2RUpdateFilesRequest", + "R2RSearchRequest", + "R2RRAGRequest", + "R2REvalRequest", + "R2RDeleteRequest", + "R2RAnalyticsRequest", + "R2RUsersOverviewRequest", + "R2RDocumentsOverviewRequest", + "R2RDocumentChunksRequest", + "R2REngine", + "R2RExecutionWrapper", + "R2RConfig", + "R2RClient", + "R2RPipeFactory", + "R2RPipelineFactory", + "R2RProviderFactory", + "R2RPipeFactoryWithMultiSearch", + "R2RBuilder", + "R2RApp", +] diff --git a/R2R/r2r/main/abstractions.py b/R2R/r2r/main/abstractions.py new file mode 100755 index 00000000..3622b22d --- /dev/null +++ b/R2R/r2r/main/abstractions.py @@ -0,0 +1,58 @@ +from typing import Optional + +from pydantic import BaseModel + +from r2r.base import ( + AsyncPipe, + EmbeddingProvider, + EvalProvider, + KGProvider, + LLMProvider, + PromptProvider, + VectorDBProvider, +) +from r2r.pipelines import ( + EvalPipeline, + IngestionPipeline, + RAGPipeline, + SearchPipeline, +) + + +class R2RProviders(BaseModel): + vector_db: Optional[VectorDBProvider] + embedding: Optional[EmbeddingProvider] + llm: Optional[LLMProvider] + prompt: Optional[PromptProvider] + eval: Optional[EvalProvider] + kg: Optional[KGProvider] + + class Config: + arbitrary_types_allowed = True + + +class R2RPipes(BaseModel): + parsing_pipe: Optional[AsyncPipe] + embedding_pipe: Optional[AsyncPipe] + vector_storage_pipe: Optional[AsyncPipe] + vector_search_pipe: Optional[AsyncPipe] + rag_pipe: Optional[AsyncPipe] + streaming_rag_pipe: Optional[AsyncPipe] + eval_pipe: Optional[AsyncPipe] + kg_pipe: Optional[AsyncPipe] + kg_storage_pipe: Optional[AsyncPipe] + kg_agent_search_pipe: Optional[AsyncPipe] + + class Config: + arbitrary_types_allowed = True + + +class R2RPipelines(BaseModel): + eval_pipeline: EvalPipeline + ingestion_pipeline: IngestionPipeline + search_pipeline: SearchPipeline + rag_pipeline: RAGPipeline + streaming_rag_pipeline: RAGPipeline + + class Config: + arbitrary_types_allowed = True diff --git a/R2R/r2r/main/api/__init__.py b/R2R/r2r/main/api/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/main/api/__init__.py diff --git a/R2R/r2r/main/api/client.py b/R2R/r2r/main/api/client.py new file mode 100755 index 00000000..b0f5b966 --- /dev/null +++ b/R2R/r2r/main/api/client.py @@ -0,0 +1,377 @@ +import asyncio +import functools +import json +import os +import threading +import time +import uuid +from contextlib import ExitStack +from typing import Any, AsyncGenerator, Generator, Optional, Union + +import fire +import httpx +import nest_asyncio +import requests + +from .requests import ( + R2RAnalyticsRequest, + R2RDeleteRequest, + R2RDocumentChunksRequest, + R2RDocumentsOverviewRequest, + R2RIngestFilesRequest, + R2RLogsRequest, + R2RPrintRelationshipsRequest, + R2RRAGRequest, + R2RSearchRequest, + R2RUpdateFilesRequest, + R2RUpdatePromptRequest, + R2RUsersOverviewRequest, +) + +nest_asyncio.apply() + + +class R2RHTTPError(Exception): + def __init__(self, status_code, error_type, message): + self.status_code = status_code + self.error_type = error_type + self.message = message 
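+        # Build a readable summary such as "[404] R2RException: Document not found"
+        # so callers see the status code, error type, and message at a glance.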
+ super().__init__(f"[{status_code}] {error_type}: {message}") + + +def handle_request_error(response): + if response.status_code >= 400: + try: + error_content = response.json() + if isinstance(error_content, dict) and "detail" in error_content: + detail = error_content["detail"] + if isinstance(detail, dict): + message = detail.get("message", str(response.text)) + error_type = detail.get("error_type", "UnknownError") + else: + message = str(detail) + error_type = "HTTPException" + else: + message = str(error_content) + error_type = "UnknownError" + except json.JSONDecodeError: + message = response.text + error_type = "UnknownError" + + raise R2RHTTPError( + status_code=response.status_code, + error_type=error_type, + message=message, + ) + + +def monitor_request(func): + @functools.wraps(func) + def wrapper(*args, monitor=False, **kwargs): + if not monitor: + return func(*args, **kwargs) + + result = None + exception = None + + def run_func(): + nonlocal result, exception + try: + result = func(*args, **kwargs) + except Exception as e: + exception = e + + thread = threading.Thread(target=run_func) + thread.start() + + dots = [".", "..", "..."] + i = 0 + while thread.is_alive(): + print(f"\rRequesting{dots[i % 3]}", end="", flush=True) + i += 1 + time.sleep(0.5) + + thread.join() + + print("\r", end="", flush=True) + + if exception: + raise exception + return result + + return wrapper + + +class R2RClient: + def __init__(self, base_url: str, prefix: str = "/v1"): + self.base_url = base_url + self.prefix = prefix + + def _make_request(self, method, endpoint, **kwargs): + url = f"{self.base_url}{self.prefix}/{endpoint}" + response = requests.request(method, url, **kwargs) + handle_request_error(response) + return response.json() + + def health(self) -> dict: + return self._make_request("GET", "health") + + def update_prompt( + self, + name: str = "default_system", + template: Optional[str] = None, + input_types: Optional[dict] = None, + ) -> dict: + request = R2RUpdatePromptRequest( + name=name, template=template, input_types=input_types + ) + return self._make_request( + "POST", "update_prompt", json=json.loads(request.json()) + ) + + @monitor_request + def ingest_files( + self, + file_paths: list[str], + metadatas: Optional[list[dict]] = None, + document_ids: Optional[list[Union[uuid.UUID, str]]] = None, + versions: Optional[list[str]] = None, + ) -> dict: + all_file_paths = [] + + for path in file_paths: + if os.path.isdir(path): + for root, _, files in os.walk(path): + all_file_paths.extend( + os.path.join(root, file) for file in files + ) + else: + all_file_paths.append(path) + + files_to_upload = [ + ( + "files", + ( + os.path.basename(file), + open(file, "rb"), + "application/octet-stream", + ), + ) + for file in all_file_paths + ] + request = R2RIngestFilesRequest( + metadatas=metadatas, + document_ids=( + [str(ele) for ele in document_ids] if document_ids else None + ), + versions=versions, + ) + try: + return self._make_request( + "POST", + "ingest_files", + data={ + k: json.dumps(v) + for k, v in json.loads(request.json()).items() + }, + files=files_to_upload, + ) + finally: + for _, file_tuple in files_to_upload: + file_tuple[1].close() + + @monitor_request + def update_files( + self, + file_paths: list[str], + document_ids: list[str], + metadatas: Optional[list[dict]] = None, + ) -> dict: + request = R2RUpdateFilesRequest( + metadatas=metadatas, + document_ids=document_ids, + ) + with ExitStack() as stack: + return self._make_request( + "POST", + "update_files", + data={ + k: 
json.dumps(v) + for k, v in json.loads(request.json()).items() + }, + files=[ + ( + "files", + ( + path.split("/")[-1], + stack.enter_context(open(path, "rb")), + "application/octet-stream", + ), + ) + for path in file_paths + ], + ) + + def search( + self, + query: str, + use_vector_search: bool = True, + search_filters: Optional[dict[str, Any]] = {}, + search_limit: int = 10, + do_hybrid_search: bool = False, + use_kg_search: bool = False, + kg_agent_generation_config: Optional[dict] = None, + ) -> dict: + request = R2RSearchRequest( + query=query, + vector_search_settings={ + "use_vector_search": use_vector_search, + "search_filters": search_filters or {}, + "search_limit": search_limit, + "do_hybrid_search": do_hybrid_search, + }, + kg_search_settings={ + "use_kg_search": use_kg_search, + "agent_generation_config": kg_agent_generation_config, + }, + ) + return self._make_request( + "POST", "search", json=json.loads(request.json()) + ) + + def rag( + self, + query: str, + use_vector_search: bool = True, + search_filters: Optional[dict[str, Any]] = {}, + search_limit: int = 10, + do_hybrid_search: bool = False, + use_kg_search: bool = False, + kg_agent_generation_config: Optional[dict] = None, + rag_generation_config: Optional[dict] = None, + ) -> dict: + request = R2RRAGRequest( + query=query, + vector_search_settings={ + "use_vector_search": use_vector_search, + "search_filters": search_filters or {}, + "search_limit": search_limit, + "do_hybrid_search": do_hybrid_search, + }, + kg_search_settings={ + "use_kg_search": use_kg_search, + "agent_generation_config": kg_agent_generation_config, + }, + rag_generation_config=rag_generation_config, + ) + + if rag_generation_config and rag_generation_config.get( + "stream", False + ): + return self._stream_rag_sync(request) + else: + return self._make_request( + "POST", "rag", json=json.loads(request.json()) + ) + + async def _stream_rag( + self, rag_request: R2RRAGRequest + ) -> AsyncGenerator[str, None]: + url = f"{self.base_url}{self.prefix}/rag" + async with httpx.AsyncClient() as client: + async with client.stream( + "POST", url, json=json.loads(rag_request.json()) + ) as response: + handle_request_error(response) + async for chunk in response.aiter_text(): + yield chunk + + def _stream_rag_sync( + self, rag_request: R2RRAGRequest + ) -> Generator[str, None, None]: + async def run_async_generator(): + async for chunk in self._stream_rag(rag_request): + yield chunk + + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + async_gen = run_async_generator() + + try: + while True: + chunk = loop.run_until_complete(async_gen.__anext__()) + yield chunk + except StopAsyncIteration: + pass + finally: + loop.close() + + def delete( + self, keys: list[str], values: list[Union[bool, int, str]] + ) -> dict: + request = R2RDeleteRequest(keys=keys, values=values) + return self._make_request( + "DELETE", "delete", json=json.loads(request.json()) + ) + + def logs(self, log_type_filter: Optional[str] = None) -> dict: + request = R2RLogsRequest(log_type_filter=log_type_filter) + return self._make_request( + "GET", "logs", json=json.loads(request.json()) + ) + + def app_settings(self) -> dict: + return self._make_request("GET", "app_settings") + + def analytics(self, filter_criteria: dict, analysis_types: dict) -> dict: + request = R2RAnalyticsRequest( + filter_criteria=filter_criteria, analysis_types=analysis_types + ) + return self._make_request( + "GET", "analytics", json=json.loads(request.json()) + ) + + def users_overview( + self, user_ids: 
Optional[list[uuid.UUID]] = None
+    ) -> dict:
+        request = R2RUsersOverviewRequest(user_ids=user_ids)
+        return self._make_request(
+            "GET", "users_overview", json=json.loads(request.json())
+        )
+
+    def documents_overview(
+        self,
+        document_ids: Optional[list[str]] = None,
+        user_ids: Optional[list[str]] = None,
+    ) -> dict:
+        request = R2RDocumentsOverviewRequest(
+            document_ids=(
+                [uuid.UUID(did) for did in document_ids]
+                if document_ids
+                else None
+            ),
+            user_ids=(
+                [uuid.UUID(uid) for uid in user_ids] if user_ids else None
+            ),
+        )
+        return self._make_request(
+            "GET", "documents_overview", json=json.loads(request.json())
+        )
+
+    def document_chunks(self, document_id: str) -> dict:
+        request = R2RDocumentChunksRequest(document_id=document_id)
+        return self._make_request(
+            "GET", "document_chunks", json=json.loads(request.json())
+        )
+
+    def inspect_knowledge_graph(self, limit: int = 100) -> str:
+        request = R2RPrintRelationshipsRequest(limit=limit)
+        return self._make_request(
+            "POST", "inspect_knowledge_graph", json=json.loads(request.json())
+        )
+
+
+if __name__ == "__main__":
+    client = R2RClient(base_url="http://localhost:8000")
+    fire.Fire(client)
diff --git a/R2R/r2r/main/api/requests.py b/R2R/r2r/main/api/requests.py new file mode 100755 index 00000000..5c63ab82 --- /dev/null +++ b/R2R/r2r/main/api/requests.py @@ -0,0 +1,79 @@
+import uuid
+from typing import Optional, Union
+
+from pydantic import BaseModel
+
+from r2r.base import AnalysisTypes, FilterCriteria
+
+
+class R2RUpdatePromptRequest(BaseModel):
+    name: str
+    template: Optional[str] = None
+    input_types: Optional[dict[str, str]] = {}
+
+
+class R2RIngestFilesRequest(BaseModel):
+    document_ids: Optional[list[uuid.UUID]] = None
+    metadatas: Optional[list[dict]] = None
+    versions: Optional[list[str]] = None
+
+
+class R2RUpdateFilesRequest(BaseModel):
+    metadatas: Optional[list[dict]] = None
+    document_ids: Optional[list[uuid.UUID]] = None
+
+
+class R2RSearchRequest(BaseModel):
+    query: str
+    vector_search_settings: Optional[dict] = None
+    kg_search_settings: Optional[dict] = None
+
+
+class R2RRAGRequest(BaseModel):
+    query: str
+    vector_search_settings: Optional[dict] = None
+    kg_search_settings: Optional[dict] = None
+    rag_generation_config: Optional[dict] = None
+
+
+class R2REvalRequest(BaseModel):
+    query: str
+    context: str
+    completion: str
+
+
+class R2RDeleteRequest(BaseModel):
+    keys: list[str]
+    values: list[Union[bool, int, str]]
+
+
+class R2RAnalyticsRequest(BaseModel):
+    filter_criteria: FilterCriteria
+    analysis_types: AnalysisTypes
+
+
+class R2RUsersOverviewRequest(BaseModel):
+    user_ids: Optional[list[uuid.UUID]]
+
+
+class R2RDocumentsOverviewRequest(BaseModel):
+    document_ids: Optional[list[uuid.UUID]]
+    user_ids: Optional[list[uuid.UUID]]
+
+
+class R2RDocumentChunksRequest(BaseModel):
+    document_id: uuid.UUID
+
+
+class R2RLogsRequest(BaseModel):
+    log_type_filter: Optional[str] = None
+    max_runs_requested: int = 100
+
+
+class R2RPrintRelationshipsRequest(BaseModel):
+    limit: int = 100
+
+
+class R2RExtractionRequest(BaseModel):
+    entity_types: list[str]
+    relations: list[str]
diff --git a/R2R/r2r/main/api/routes/__init__.py b/R2R/r2r/main/api/routes/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/main/api/routes/__init__.py
diff --git a/R2R/r2r/main/api/routes/base_router.py b/R2R/r2r/main/api/routes/base_router.py new file mode 100755 index 00000000..d06a9935 --- /dev/null +++ b/R2R/r2r/main/api/routes/base_router.py @@ -0,0 +1,75 @@
+import
functools +import logging + +from fastapi import APIRouter, HTTPException +from fastapi.responses import StreamingResponse + +from r2r.base import R2RException, manage_run + +logger = logging.getLogger(__name__) + + +class BaseRouter: + def __init__(self, engine): + self.engine = engine + self.router = APIRouter() + + def base_endpoint(self, func): + @functools.wraps(func) + async def wrapper(*args, **kwargs): + async with manage_run( + self.engine.run_manager, func.__name__ + ) as run_id: + try: + results = await func(*args, **kwargs) + if isinstance(results, StreamingResponse): + return results + + return {"results": results} + except R2RException as re: + raise HTTPException( + status_code=re.status_code, + detail={ + "message": re.message, + "error_type": type(re).__name__, + }, + ) + except Exception as e: + # Get the pipeline name based on the function name + pipeline_name = f"{func.__name__.split('_')[0]}_pipeline" + + # Safely get the pipeline object and its type + pipeline = getattr( + self.engine.pipelines, pipeline_name, None + ) + pipeline_type = getattr( + pipeline, "pipeline_type", "unknown" + ) + + await self.engine.logging_connection.log( + log_id=run_id, + key="pipeline_type", + value=pipeline_type, + is_info_log=True, + ) + await self.engine.logging_connection.log( + log_id=run_id, + key="error", + value=str(e), + is_info_log=False, + ) + logger.error(f"{func.__name__}() - \n\n{str(e)})") + raise HTTPException( + status_code=500, + detail={ + "message": f"An error occurred during {func.__name__}", + "error": str(e), + "error_type": type(e).__name__, + }, + ) from e + + return wrapper + + @classmethod + def build_router(cls, engine): + return cls(engine).router diff --git a/R2R/r2r/main/api/routes/ingestion.py b/R2R/r2r/main/api/routes/ingestion.py new file mode 100755 index 00000000..be583602 --- /dev/null +++ b/R2R/r2r/main/api/routes/ingestion.py @@ -0,0 +1,42 @@ +from fastapi import Depends, File, UploadFile + +from ...engine import R2REngine +from ...services.ingestion_service import IngestionService +from ..requests import R2RIngestFilesRequest, R2RUpdateFilesRequest +from .base_router import BaseRouter + + +class IngestionRouter(BaseRouter): + def __init__(self, engine: R2REngine): + super().__init__(engine) + self.setup_routes() + + def setup_routes(self): + @self.router.post("/ingest_files") + @self.base_endpoint + async def ingest_files_app( + files: list[UploadFile] = File(...), + request: R2RIngestFilesRequest = Depends( + IngestionService.parse_ingest_files_form_data + ), + ): + return await self.engine.aingest_files( + files=files, + metadatas=request.metadatas, + document_ids=request.document_ids, + versions=request.versions, + ) + + @self.router.post("/update_files") + @self.base_endpoint + async def update_files_app( + files: list[UploadFile] = File(...), + request: R2RUpdateFilesRequest = Depends( + IngestionService.parse_update_files_form_data + ), + ): + return await self.engine.aupdate_files( + files=files, + metadatas=request.metadatas, + document_ids=request.document_ids, + ) diff --git a/R2R/r2r/main/api/routes/management.py b/R2R/r2r/main/api/routes/management.py new file mode 100755 index 00000000..921fb534 --- /dev/null +++ b/R2R/r2r/main/api/routes/management.py @@ -0,0 +1,101 @@ +from ...engine import R2REngine +from ..requests import ( + R2RAnalyticsRequest, + R2RDeleteRequest, + R2RDocumentChunksRequest, + R2RDocumentsOverviewRequest, + R2RLogsRequest, + R2RPrintRelationshipsRequest, + R2RUpdatePromptRequest, + R2RUsersOverviewRequest, +) 
+from .base_router import BaseRouter + + +class ManagementRouter(BaseRouter): + def __init__(self, engine: R2REngine): + super().__init__(engine) + self.setup_routes() + + def setup_routes(self): + @self.router.get("/health") + async def health_check(): + return {"response": "ok"} + + @self.router.post("/update_prompt") + @self.base_endpoint + async def update_prompt_app(request: R2RUpdatePromptRequest): + return await self.engine.aupdate_prompt( + request.name, request.template, request.input_types + ) + + @self.router.post("/logs") + @self.router.get("/logs") + @self.base_endpoint + async def get_logs_app(request: R2RLogsRequest): + return await self.engine.alogs( + log_type_filter=request.log_type_filter, + max_runs_requested=request.max_runs_requested, + ) + + @self.router.post("/analytics") + @self.router.get("/analytics") + @self.base_endpoint + async def get_analytics_app(request: R2RAnalyticsRequest): + return await self.engine.aanalytics( + filter_criteria=request.filter_criteria, + analysis_types=request.analysis_types, + ) + + @self.router.post("/users_overview") + @self.router.get("/users_overview") + @self.base_endpoint + async def get_users_overview_app(request: R2RUsersOverviewRequest): + return await self.engine.ausers_overview(user_ids=request.user_ids) + + @self.router.delete("/delete") + @self.base_endpoint + async def delete_app(request: R2RDeleteRequest): + return await self.engine.adelete( + keys=request.keys, values=request.values + ) + + @self.router.post("/documents_overview") + @self.router.get("/documents_overview") + @self.base_endpoint + async def get_documents_overview_app( + request: R2RDocumentsOverviewRequest, + ): + return await self.engine.adocuments_overview( + document_ids=request.document_ids, user_ids=request.user_ids + ) + + @self.router.post("/document_chunks") + @self.router.get("/document_chunks") + @self.base_endpoint + async def get_document_chunks_app(request: R2RDocumentChunksRequest): + return await self.engine.adocument_chunks(request.document_id) + + @self.router.post("/inspect_knowledge_graph") + @self.router.get("/inspect_knowledge_graph") + @self.base_endpoint + async def inspect_knowledge_graph( + request: R2RPrintRelationshipsRequest, + ): + return await self.engine.inspect_knowledge_graph( + limit=request.limit + ) + + @self.router.get("/app_settings") + @self.base_endpoint + async def get_app_settings_app(): + return await self.engine.aapp_settings() + + @self.router.get("/openapi_spec") + @self.base_endpoint + def get_openapi_spec_app(): + return self.engine.openapi_spec() + + +def create_management_router(engine: R2REngine): + return ManagementRouter(engine).router diff --git a/R2R/r2r/main/api/routes/retrieval.py b/R2R/r2r/main/api/routes/retrieval.py new file mode 100755 index 00000000..b2d352aa --- /dev/null +++ b/R2R/r2r/main/api/routes/retrieval.py @@ -0,0 +1,91 @@ +from fastapi.responses import StreamingResponse + +from r2r.base import GenerationConfig, KGSearchSettings, VectorSearchSettings + +from ...engine import R2REngine +from ..requests import R2REvalRequest, R2RRAGRequest, R2RSearchRequest +from .base_router import BaseRouter + + +class RetrievalRouter(BaseRouter): + def __init__(self, engine: R2REngine): + super().__init__(engine) + self.setup_routes() + + def setup_routes(self): + @self.router.post("/search") + @self.base_endpoint + async def search_app(request: R2RSearchRequest): + if "agent_generation_config" in request.kg_search_settings: + request.kg_search_settings["agent_generation_config"] = ( + 
GenerationConfig( + **request.kg_search_settings["agent_generation_config"] + or {} + ) + ) + + results = await self.engine.asearch( + query=request.query, + vector_search_settings=VectorSearchSettings( + **(request.vector_search_settings or {}) + ), + kg_search_settings=KGSearchSettings( + **(request.kg_search_settings or {}) + ), + ) + return results + + @self.router.post("/rag") + @self.base_endpoint + async def rag_app(request: R2RRAGRequest): + if "agent_generation_config" in request.kg_search_settings: + request.kg_search_settings["agent_generation_config"] = ( + GenerationConfig( + **( + request.kg_search_settings[ + "agent_generation_config" + ] + or {} + ) + ) + ) + response = await self.engine.arag( + query=request.query, + vector_search_settings=VectorSearchSettings( + **(request.vector_search_settings or {}) + ), + kg_search_settings=KGSearchSettings( + **(request.kg_search_settings or {}) + ), + rag_generation_config=GenerationConfig( + **(request.rag_generation_config or {}) + ), + ) + if ( + request.rag_generation_config + and request.rag_generation_config.get("stream", False) + ): + + async def stream_generator(): + async for chunk in response: + yield chunk + + return StreamingResponse( + stream_generator(), media_type="application/json" + ) + else: + return response + + @self.router.post("/evaluate") + @self.base_endpoint + async def evaluate_app(request: R2REvalRequest): + results = await self.engine.aevaluate( + query=request.query, + context=request.context, + completion=request.completion, + ) + return results + + +def create_retrieval_router(engine: R2REngine): + return RetrievalRouter(engine).router diff --git a/R2R/r2r/main/app.py b/R2R/r2r/main/app.py new file mode 100755 index 00000000..981445e4 --- /dev/null +++ b/R2R/r2r/main/app.py @@ -0,0 +1,53 @@ +from fastapi import FastAPI + +from .engine import R2REngine + + +class R2RApp: + def __init__(self, engine: R2REngine): + self.engine = engine + self._setup_routes() + self._apply_cors() + + async def openapi_spec(self, *args, **kwargs): + from fastapi.openapi.utils import get_openapi + + return get_openapi( + title="R2R Application API", + version="1.0.0", + routes=self.app.routes, + ) + + def _setup_routes(self): + from .api.routes import ingestion, management, retrieval + + self.app = FastAPI() + + # Create routers with the engine + ingestion_router = ingestion.IngestionRouter.build_router(self.engine) + management_router = management.ManagementRouter.build_router( + self.engine + ) + retrieval_router = retrieval.RetrievalRouter.build_router(self.engine) + + # Include routers in the app + self.app.include_router(ingestion_router, prefix="/v1") + self.app.include_router(management_router, prefix="/v1") + self.app.include_router(retrieval_router, prefix="/v1") + + def _apply_cors(self): + from fastapi.middleware.cors import CORSMiddleware + + origins = ["*", "http://localhost:3000", "http://localhost:8000"] + self.app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + def serve(self, host: str = "0.0.0.0", port: int = 8000): + import uvicorn + + uvicorn.run(self.app, host=host, port=port) diff --git a/R2R/r2r/main/app_entry.py b/R2R/r2r/main/app_entry.py new file mode 100755 index 00000000..29b705d7 --- /dev/null +++ b/R2R/r2r/main/app_entry.py @@ -0,0 +1,84 @@ +import logging +import os +from enum import Enum +from typing import Optional + +from fastapi import FastAPI + +from r2r import R2RBuilder, R2RConfig +from 
r2r.main.execution import R2RExecutionWrapper + +logger = logging.getLogger(__name__) +current_file_path = os.path.dirname(__file__) +configs_path = os.path.join(current_file_path, "..", "..", "..") + + +class PipelineType(Enum): + QNA = "qna" + WEB = "web" + HYDE = "hyde" + + +def r2r_app( + config_name: Optional[str] = "default", + config_path: Optional[str] = None, + client_mode: bool = False, + base_url: Optional[str] = None, + pipeline_type: PipelineType = PipelineType.QNA, +) -> FastAPI: + if pipeline_type != PipelineType.QNA: + raise ValueError("Only QNA pipeline is supported in quickstart.") + if config_path and config_name: + raise ValueError("Cannot specify both config and config_name") + + if config_path: + config = R2RConfig.from_json(config_path) + else: + config_name = os.getenv("CONFIG_NAME") or config_name + if config_name not in R2RBuilder.CONFIG_OPTIONS: + raise ValueError(f"Invalid config name: {config_name}") + config = R2RConfig.from_json(R2RBuilder.CONFIG_OPTIONS[config_name]) + + if ( + config.embedding.provider == "openai" + and "OPENAI_API_KEY" not in os.environ + ): + raise ValueError( + "Must set OPENAI_API_KEY in order to initialize OpenAIEmbeddingProvider." + ) + + wrapper = R2RExecutionWrapper( + config_name=config_name, + config_path=config_path, + client_mode=client_mode, + base_url=base_url, + ) + + return wrapper.get_app() + + +logging.basicConfig(level=logging.INFO) + +config_name = os.getenv("CONFIG_NAME", None) +config_path = os.getenv("CONFIG_PATH", None) +if not config_path and not config_name: + config_name = "default" +client_mode = os.getenv("CLIENT_MODE", "false").lower() == "true" +base_url = os.getenv("BASE_URL") +host = os.getenv("HOST", "0.0.0.0") +port = int(os.getenv("PORT", "8000")) +pipeline_type = os.getenv("PIPELINE_TYPE", "qna") + +logger.info(f"Environment CONFIG_NAME: {config_name}") +logger.info(f"Environment CONFIG_PATH: {config_path}") +logger.info(f"Environment CLIENT_MODE: {client_mode}") +logger.info(f"Environment BASE_URL: {base_url}") +logger.info(f"Environment PIPELINE_TYPE: {pipeline_type}") + +app = r2r_app( + config_name=config_name, + config_path=config_path, + client_mode=client_mode, + base_url=base_url, + pipeline_type=PipelineType(pipeline_type), +) diff --git a/R2R/r2r/main/assembly/__init__.py b/R2R/r2r/main/assembly/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/main/assembly/__init__.py diff --git a/R2R/r2r/main/assembly/builder.py b/R2R/r2r/main/assembly/builder.py new file mode 100755 index 00000000..863fc6d0 --- /dev/null +++ b/R2R/r2r/main/assembly/builder.py @@ -0,0 +1,207 @@ +import os +from typing import Optional, Type + +from r2r.base import ( + AsyncPipe, + EmbeddingProvider, + EvalProvider, + LLMProvider, + PromptProvider, + VectorDBProvider, +) +from r2r.pipelines import ( + EvalPipeline, + IngestionPipeline, + RAGPipeline, + SearchPipeline, +) + +from ..app import R2RApp +from ..engine import R2REngine +from ..r2r import R2R +from .config import R2RConfig +from .factory import R2RPipeFactory, R2RPipelineFactory, R2RProviderFactory + + +class R2RBuilder: + current_file_path = os.path.dirname(__file__) + config_root = os.path.join( + current_file_path, "..", "..", "examples", "configs" + ) + CONFIG_OPTIONS = { + "default": None, + "local_ollama": os.path.join(config_root, "local_ollama.json"), + "local_ollama_rerank": os.path.join( + config_root, "local_ollama_rerank.json" + ), + "neo4j_kg": os.path.join(config_root, "neo4j_kg.json"), + "local_neo4j_kg": 
os.path.join(config_root, "local_neo4j_kg.json"), + "postgres_logging": os.path.join(config_root, "postgres_logging.json"), + } + + @staticmethod + def _get_config(config_name): + if config_name is None: + return R2RConfig.from_json() + if config_name in R2RBuilder.CONFIG_OPTIONS: + return R2RConfig.from_json(R2RBuilder.CONFIG_OPTIONS[config_name]) + raise ValueError(f"Invalid config name: {config_name}") + + def __init__( + self, + config: Optional[R2RConfig] = None, + from_config: Optional[str] = None, + ): + if config and from_config: + raise ValueError("Cannot specify both config and config_name") + self.config = config or R2RBuilder._get_config(from_config) + self.r2r_app_override: Optional[Type[R2REngine]] = None + self.provider_factory_override: Optional[Type[R2RProviderFactory]] = ( + None + ) + self.pipe_factory_override: Optional[R2RPipeFactory] = None + self.pipeline_factory_override: Optional[R2RPipelineFactory] = None + self.vector_db_provider_override: Optional[VectorDBProvider] = None + self.embedding_provider_override: Optional[EmbeddingProvider] = None + self.eval_provider_override: Optional[EvalProvider] = None + self.llm_provider_override: Optional[LLMProvider] = None + self.prompt_provider_override: Optional[PromptProvider] = None + self.parsing_pipe_override: Optional[AsyncPipe] = None + self.embedding_pipe_override: Optional[AsyncPipe] = None + self.vector_storage_pipe_override: Optional[AsyncPipe] = None + self.vector_search_pipe_override: Optional[AsyncPipe] = None + self.rag_pipe_override: Optional[AsyncPipe] = None + self.streaming_rag_pipe_override: Optional[AsyncPipe] = None + self.eval_pipe_override: Optional[AsyncPipe] = None + self.ingestion_pipeline: Optional[IngestionPipeline] = None + self.search_pipeline: Optional[SearchPipeline] = None + self.rag_pipeline: Optional[RAGPipeline] = None + self.streaming_rag_pipeline: Optional[RAGPipeline] = None + self.eval_pipeline: Optional[EvalPipeline] = None + + def with_app(self, app: Type[R2REngine]): + self.r2r_app_override = app + return self + + def with_provider_factory(self, factory: Type[R2RProviderFactory]): + self.provider_factory_override = factory + return self + + def with_pipe_factory(self, factory: R2RPipeFactory): + self.pipe_factory_override = factory + return self + + def with_pipeline_factory(self, factory: R2RPipelineFactory): + self.pipeline_factory_override = factory + return self + + def with_vector_db_provider(self, provider: VectorDBProvider): + self.vector_db_provider_override = provider + return self + + def with_embedding_provider(self, provider: EmbeddingProvider): + self.embedding_provider_override = provider + return self + + def with_eval_provider(self, provider: EvalProvider): + self.eval_provider_override = provider + return self + + def with_llm_provider(self, provider: LLMProvider): + self.llm_provider_override = provider + return self + + def with_prompt_provider(self, provider: PromptProvider): + self.prompt_provider_override = provider + return self + + def with_parsing_pipe(self, pipe: AsyncPipe): + self.parsing_pipe_override = pipe + return self + + def with_embedding_pipe(self, pipe: AsyncPipe): + self.embedding_pipe_override = pipe + return self + + def with_vector_storage_pipe(self, pipe: AsyncPipe): + self.vector_storage_pipe_override = pipe + return self + + def with_vector_search_pipe(self, pipe: AsyncPipe): + self.vector_search_pipe_override = pipe + return self + + def with_rag_pipe(self, pipe: AsyncPipe): + self.rag_pipe_override = pipe + return self + + def 
with_streaming_rag_pipe(self, pipe: AsyncPipe): + self.streaming_rag_pipe_override = pipe + return self + + def with_eval_pipe(self, pipe: AsyncPipe): + self.eval_pipe_override = pipe + return self + + def with_ingestion_pipeline(self, pipeline: IngestionPipeline): + self.ingestion_pipeline = pipeline + return self + + def with_vector_search_pipeline(self, pipeline: SearchPipeline): + self.search_pipeline = pipeline + return self + + def with_rag_pipeline(self, pipeline: RAGPipeline): + self.rag_pipeline = pipeline + return self + + def with_streaming_rag_pipeline(self, pipeline: RAGPipeline): + self.streaming_rag_pipeline = pipeline + return self + + def with_eval_pipeline(self, pipeline: EvalPipeline): + self.eval_pipeline = pipeline + return self + + def build(self, *args, **kwargs) -> R2R: + provider_factory = self.provider_factory_override or R2RProviderFactory + pipe_factory = self.pipe_factory_override or R2RPipeFactory + pipeline_factory = self.pipeline_factory_override or R2RPipelineFactory + + providers = provider_factory(self.config).create_providers( + vector_db_provider_override=self.vector_db_provider_override, + embedding_provider_override=self.embedding_provider_override, + eval_provider_override=self.eval_provider_override, + llm_provider_override=self.llm_provider_override, + prompt_provider_override=self.prompt_provider_override, + *args, + **kwargs, + ) + + pipes = pipe_factory(self.config, providers).create_pipes( + parsing_pipe_override=self.parsing_pipe_override, + embedding_pipe_override=self.embedding_pipe_override, + vector_storage_pipe_override=self.vector_storage_pipe_override, + vector_search_pipe_override=self.vector_search_pipe_override, + rag_pipe_override=self.rag_pipe_override, + streaming_rag_pipe_override=self.streaming_rag_pipe_override, + eval_pipe_override=self.eval_pipe_override, + *args, + **kwargs, + ) + + pipelines = pipeline_factory(self.config, pipes).create_pipelines( + ingestion_pipeline=self.ingestion_pipeline, + search_pipeline=self.search_pipeline, + rag_pipeline=self.rag_pipeline, + streaming_rag_pipeline=self.streaming_rag_pipeline, + eval_pipeline=self.eval_pipeline, + *args, + **kwargs, + ) + + engine = (self.r2r_app_override or R2REngine)( + self.config, providers, pipelines + ) + r2r_app = R2RApp(engine) + return R2R(engine=engine, app=r2r_app) diff --git a/R2R/r2r/main/assembly/config.py b/R2R/r2r/main/assembly/config.py new file mode 100755 index 00000000..d52c4561 --- /dev/null +++ b/R2R/r2r/main/assembly/config.py @@ -0,0 +1,167 @@ +import json +import logging +import os +from enum import Enum +from typing import Any + +from ...base.abstractions.document import DocumentType +from ...base.abstractions.llm import GenerationConfig +from ...base.logging.kv_logger import LoggingConfig +from ...base.providers.embedding_provider import EmbeddingConfig +from ...base.providers.eval_provider import EvalConfig +from ...base.providers.kg_provider import KGConfig +from ...base.providers.llm_provider import LLMConfig +from ...base.providers.prompt_provider import PromptConfig +from ...base.providers.vector_db_provider import ProviderConfig, VectorDBConfig + +logger = logging.getLogger(__name__) + + +class R2RConfig: + REQUIRED_KEYS: dict[str, list] = { + "app": ["max_file_size_in_mb"], + "embedding": [ + "provider", + "base_model", + "base_dimension", + "batch_size", + "text_splitter", + ], + "eval": ["llm"], + "kg": [ + "provider", + "batch_size", + "kg_extraction_config", + "text_splitter", + ], + "ingestion": ["excluded_parsers"], + 
"completions": ["provider"], + "logging": ["provider", "log_table"], + "prompt": ["provider"], + "vector_database": ["provider"], + } + app: dict[str, Any] + embedding: EmbeddingConfig + completions: LLMConfig + logging: LoggingConfig + prompt: PromptConfig + vector_database: VectorDBConfig + + def __init__(self, config_data: dict[str, Any]): + # Load the default configuration + default_config = self.load_default_config() + + # Override the default configuration with the passed configuration + for key in config_data: + if key in default_config: + default_config[key].update(config_data[key]) + else: + default_config[key] = config_data[key] + + # Validate and set the configuration + for section, keys in R2RConfig.REQUIRED_KEYS.items(): + # Check the keys when provider is set + # TODO - Clean up robust null checks + if "provider" in default_config[section] and ( + default_config[section]["provider"] is not None + and default_config[section]["provider"] != "None" + and default_config[section]["provider"] != "null" + ): + self._validate_config_section(default_config, section, keys) + setattr(self, section, default_config[section]) + + self.app = self.app # for type hinting + self.ingestion = self.ingestion # for type hinting + self.ingestion["excluded_parsers"] = [ + DocumentType(k) for k in self.ingestion["excluded_parsers"] + ] + # override GenerationConfig defaults + GenerationConfig.set_default( + **self.completions.get("generation_config", {}) + ) + self.embedding = EmbeddingConfig.create(**self.embedding) + self.kg = KGConfig.create(**self.kg) + eval_llm = self.eval.pop("llm", None) + self.eval = EvalConfig.create( + **self.eval, llm=LLMConfig.create(**eval_llm) if eval_llm else None + ) + self.completions = LLMConfig.create(**self.completions) + self.logging = LoggingConfig.create(**self.logging) + self.prompt = PromptConfig.create(**self.prompt) + self.vector_database = VectorDBConfig.create(**self.vector_database) + + def _validate_config_section( + self, config_data: dict[str, Any], section: str, keys: list + ): + if section not in config_data: + raise ValueError(f"Missing '{section}' section in config") + if not all(key in config_data[section] for key in keys): + raise ValueError(f"Missing required keys in '{section}' config") + + @classmethod + def from_json(cls, config_path: str = None) -> "R2RConfig": + if config_path is None: + # Get the root directory of the project + file_dir = os.path.dirname(os.path.abspath(__file__)) + config_path = os.path.join( + file_dir, "..", "..", "..", "config.json" + ) + + # Load configuration from JSON file + with open(config_path) as f: + config_data = json.load(f) + + return cls(config_data) + + def to_json(self): + config_data = { + section: self._serialize_config(getattr(self, section)) + for section in R2RConfig.REQUIRED_KEYS.keys() + } + return json.dumps(config_data) + + def save_to_redis(self, redis_client: Any, key: str): + redis_client.set(f"R2RConfig:{key}", self.to_json()) + + @classmethod + def load_from_redis(cls, redis_client: Any, key: str) -> "R2RConfig": + config_data = redis_client.get(f"R2RConfig:{key}") + if config_data is None: + raise ValueError( + f"Configuration not found in Redis with key '{key}'" + ) + config_data = json.loads(config_data) + # config_data["ingestion"]["selected_parsers"] = { + # DocumentType(k): v + # for k, v in config_data["ingestion"]["selected_parsers"].items() + # } + return cls(config_data) + + @classmethod + def load_default_config(cls) -> dict: + # Get the root directory of the project + file_dir = 
os.path.dirname(os.path.abspath(__file__)) + default_config_path = os.path.join( + file_dir, "..", "..", "..", "config.json" + ) + # Load default configuration from JSON file + with open(default_config_path) as f: + return json.load(f) + + @staticmethod + def _serialize_config(config_section: Any) -> dict: + # TODO - Make this approach cleaner + if isinstance(config_section, ProviderConfig): + config_section = config_section.dict() + filtered_result = {} + for k, v in config_section.items(): + if isinstance(k, Enum): + k = k.value + if isinstance(v, dict): + formatted_v = { + k2.value if isinstance(k2, Enum) else k2: v2 + for k2, v2 in v.items() + } + v = formatted_v + filtered_result[k] = v + return filtered_result diff --git a/R2R/r2r/main/assembly/factory.py b/R2R/r2r/main/assembly/factory.py new file mode 100755 index 00000000..4e147337 --- /dev/null +++ b/R2R/r2r/main/assembly/factory.py @@ -0,0 +1,484 @@ +import logging +import os +from typing import Any, Optional + +from r2r.base import ( + AsyncPipe, + EmbeddingConfig, + EmbeddingProvider, + EvalProvider, + KGProvider, + KVLoggingSingleton, + LLMConfig, + LLMProvider, + PromptProvider, + VectorDBConfig, + VectorDBProvider, +) +from r2r.pipelines import ( + EvalPipeline, + IngestionPipeline, + RAGPipeline, + SearchPipeline, +) + +from ..abstractions import R2RPipelines, R2RPipes, R2RProviders +from .config import R2RConfig + +logger = logging.getLogger(__name__) + + +class R2RProviderFactory: + def __init__(self, config: R2RConfig): + self.config = config + + def create_vector_db_provider( + self, vector_db_config: VectorDBConfig, *args, **kwargs + ) -> VectorDBProvider: + vector_db_provider: Optional[VectorDBProvider] = None + if vector_db_config.provider == "pgvector": + from r2r.providers.vector_dbs import PGVectorDB + + vector_db_provider = PGVectorDB(vector_db_config) + else: + raise ValueError( + f"Vector database provider {vector_db_config.provider} not supported" + ) + if not vector_db_provider: + raise ValueError("Vector database provider not found") + + if not self.config.embedding.base_dimension: + raise ValueError("Search dimension not found in embedding config") + + vector_db_provider.initialize_collection( + self.config.embedding.base_dimension + ) + return vector_db_provider + + def create_embedding_provider( + self, embedding: EmbeddingConfig, *args, **kwargs + ) -> EmbeddingProvider: + embedding_provider: Optional[EmbeddingProvider] = None + + if embedding.provider == "openai": + if not os.getenv("OPENAI_API_KEY"): + raise ValueError( + "Must set OPENAI_API_KEY in order to initialize OpenAIEmbeddingProvider." 
+ ) + from r2r.providers.embeddings import OpenAIEmbeddingProvider + + embedding_provider = OpenAIEmbeddingProvider(embedding) + elif embedding.provider == "ollama": + from r2r.providers.embeddings import OllamaEmbeddingProvider + + embedding_provider = OllamaEmbeddingProvider(embedding) + + elif embedding.provider == "sentence-transformers": + from r2r.providers.embeddings import ( + SentenceTransformerEmbeddingProvider, + ) + + embedding_provider = SentenceTransformerEmbeddingProvider( + embedding + ) + elif embedding is None: + embedding_provider = None + else: + raise ValueError( + f"Embedding provider {embedding.provider} not supported" + ) + + return embedding_provider + + def create_eval_provider( + self, eval_config, prompt_provider, *args, **kwargs + ) -> Optional[EvalProvider]: + if eval_config.provider == "local": + from r2r.providers.eval import LLMEvalProvider + + llm_provider = self.create_llm_provider(eval_config.llm) + eval_provider = LLMEvalProvider( + eval_config, + llm_provider=llm_provider, + prompt_provider=prompt_provider, + ) + elif eval_config.provider is None: + eval_provider = None + else: + raise ValueError( + f"Eval provider {eval_config.provider} not supported." + ) + + return eval_provider + + def create_llm_provider( + self, llm_config: LLMConfig, *args, **kwargs + ) -> LLMProvider: + llm_provider: Optional[LLMProvider] = None + if llm_config.provider == "openai": + from r2r.providers.llms import OpenAILLM + + llm_provider = OpenAILLM(llm_config) + elif llm_config.provider == "litellm": + from r2r.providers.llms import LiteLLM + + llm_provider = LiteLLM(llm_config) + else: + raise ValueError( + f"Language model provider {llm_config.provider} not supported" + ) + if not llm_provider: + raise ValueError("Language model provider not found") + return llm_provider + + def create_prompt_provider( + self, prompt_config, *args, **kwargs + ) -> PromptProvider: + prompt_provider = None + if prompt_config.provider == "local": + from r2r.prompts import R2RPromptProvider + + prompt_provider = R2RPromptProvider() + else: + raise ValueError( + f"Prompt provider {prompt_config.provider} not supported" + ) + return prompt_provider + + def create_kg_provider(self, kg_config, *args, **kwargs): + if kg_config.provider == "neo4j": + from r2r.providers.kg import Neo4jKGProvider + + return Neo4jKGProvider(kg_config) + elif kg_config.provider is None: + return None + else: + raise ValueError( + f"KG provider {kg_config.provider} not supported." 
+ ) + + def create_providers( + self, + vector_db_provider_override: Optional[VectorDBProvider] = None, + embedding_provider_override: Optional[EmbeddingProvider] = None, + eval_provider_override: Optional[EvalProvider] = None, + llm_provider_override: Optional[LLMProvider] = None, + prompt_provider_override: Optional[PromptProvider] = None, + kg_provider_override: Optional[KGProvider] = None, + *args, + **kwargs, + ) -> R2RProviders: + prompt_provider = ( + prompt_provider_override + or self.create_prompt_provider(self.config.prompt, *args, **kwargs) + ) + return R2RProviders( + vector_db=vector_db_provider_override + or self.create_vector_db_provider( + self.config.vector_database, *args, **kwargs + ), + embedding=embedding_provider_override + or self.create_embedding_provider( + self.config.embedding, *args, **kwargs + ), + eval=eval_provider_override + or self.create_eval_provider( + self.config.eval, + prompt_provider=prompt_provider, + *args, + **kwargs, + ), + llm=llm_provider_override + or self.create_llm_provider( + self.config.completions, *args, **kwargs + ), + prompt=prompt_provider_override + or self.create_prompt_provider( + self.config.prompt, *args, **kwargs + ), + kg=kg_provider_override + or self.create_kg_provider(self.config.kg, *args, **kwargs), + ) + + +class R2RPipeFactory: + def __init__(self, config: R2RConfig, providers: R2RProviders): + self.config = config + self.providers = providers + + def create_pipes( + self, + parsing_pipe_override: Optional[AsyncPipe] = None, + embedding_pipe_override: Optional[AsyncPipe] = None, + kg_pipe_override: Optional[AsyncPipe] = None, + kg_storage_pipe_override: Optional[AsyncPipe] = None, + kg_agent_pipe_override: Optional[AsyncPipe] = None, + vector_storage_pipe_override: Optional[AsyncPipe] = None, + vector_search_pipe_override: Optional[AsyncPipe] = None, + rag_pipe_override: Optional[AsyncPipe] = None, + streaming_rag_pipe_override: Optional[AsyncPipe] = None, + eval_pipe_override: Optional[AsyncPipe] = None, + *args, + **kwargs, + ) -> R2RPipes: + return R2RPipes( + parsing_pipe=parsing_pipe_override + or self.create_parsing_pipe( + self.config.ingestion.get("excluded_parsers"), *args, **kwargs + ), + embedding_pipe=embedding_pipe_override + or self.create_embedding_pipe(*args, **kwargs), + kg_pipe=kg_pipe_override or self.create_kg_pipe(*args, **kwargs), + kg_storage_pipe=kg_storage_pipe_override + or self.create_kg_storage_pipe(*args, **kwargs), + kg_agent_search_pipe=kg_agent_pipe_override + or self.create_kg_agent_pipe(*args, **kwargs), + vector_storage_pipe=vector_storage_pipe_override + or self.create_vector_storage_pipe(*args, **kwargs), + vector_search_pipe=vector_search_pipe_override + or self.create_vector_search_pipe(*args, **kwargs), + rag_pipe=rag_pipe_override + or self.create_rag_pipe(*args, **kwargs), + streaming_rag_pipe=streaming_rag_pipe_override + or self.create_rag_pipe(stream=True, *args, **kwargs), + eval_pipe=eval_pipe_override + or self.create_eval_pipe(*args, **kwargs), + ) + + def create_parsing_pipe( + self, excluded_parsers: Optional[list] = None, *args, **kwargs + ) -> Any: + from r2r.pipes import ParsingPipe + + return ParsingPipe(excluded_parsers=excluded_parsers or []) + + def create_embedding_pipe(self, *args, **kwargs) -> Any: + if self.config.embedding.provider is None: + return None + + from r2r.base import RecursiveCharacterTextSplitter + from r2r.pipes import EmbeddingPipe + + text_splitter_config = self.config.embedding.extra_fields.get( + "text_splitter" + ) + if not 
text_splitter_config: + raise ValueError( + "Text splitter config not found in embedding config" + ) + + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=text_splitter_config["chunk_size"], + chunk_overlap=text_splitter_config["chunk_overlap"], + length_function=len, + is_separator_regex=False, + ) + return EmbeddingPipe( + embedding_provider=self.providers.embedding, + vector_db_provider=self.providers.vector_db, + text_splitter=text_splitter, + embedding_batch_size=self.config.embedding.batch_size, + ) + + def create_vector_storage_pipe(self, *args, **kwargs) -> Any: + if self.config.embedding.provider is None: + return None + + from r2r.pipes import VectorStoragePipe + + return VectorStoragePipe(vector_db_provider=self.providers.vector_db) + + def create_vector_search_pipe(self, *args, **kwargs) -> Any: + if self.config.embedding.provider is None: + return None + + from r2r.pipes import VectorSearchPipe + + return VectorSearchPipe( + vector_db_provider=self.providers.vector_db, + embedding_provider=self.providers.embedding, + ) + + def create_kg_pipe(self, *args, **kwargs) -> Any: + if self.config.kg.provider is None: + return None + + from r2r.base import RecursiveCharacterTextSplitter + from r2r.pipes import KGExtractionPipe + + text_splitter_config = self.config.kg.extra_fields.get("text_splitter") + if not text_splitter_config: + raise ValueError("Text splitter config not found in kg config.") + + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=text_splitter_config["chunk_size"], + chunk_overlap=text_splitter_config["chunk_overlap"], + length_function=len, + is_separator_regex=False, + ) + return KGExtractionPipe( + kg_provider=self.providers.kg, + llm_provider=self.providers.llm, + prompt_provider=self.providers.prompt, + vector_db_provider=self.providers.vector_db, + text_splitter=text_splitter, + kg_batch_size=self.config.kg.batch_size, + ) + + def create_kg_storage_pipe(self, *args, **kwargs) -> Any: + if self.config.kg.provider is None: + return None + + from r2r.pipes import KGStoragePipe + + return KGStoragePipe( + kg_provider=self.providers.kg, + embedding_provider=self.providers.embedding, + ) + + def create_kg_agent_pipe(self, *args, **kwargs) -> Any: + if self.config.kg.provider is None: + return None + + from r2r.pipes import KGAgentSearchPipe + + return KGAgentSearchPipe( + kg_provider=self.providers.kg, + llm_provider=self.providers.llm, + prompt_provider=self.providers.prompt, + ) + + def create_rag_pipe(self, stream: bool = False, *args, **kwargs) -> Any: + if stream: + from r2r.pipes import StreamingSearchRAGPipe + + return StreamingSearchRAGPipe( + llm_provider=self.providers.llm, + prompt_provider=self.providers.prompt, + ) + else: + from r2r.pipes import SearchRAGPipe + + return SearchRAGPipe( + llm_provider=self.providers.llm, + prompt_provider=self.providers.prompt, + ) + + def create_eval_pipe(self, *args, **kwargs) -> Any: + from r2r.pipes import EvalPipe + + return EvalPipe(eval_provider=self.providers.eval) + + +class R2RPipelineFactory: + def __init__(self, config: R2RConfig, pipes: R2RPipes): + self.config = config + self.pipes = pipes + + def create_ingestion_pipeline(self, *args, **kwargs) -> IngestionPipeline: + """factory method to create an ingestion pipeline.""" + ingestion_pipeline = IngestionPipeline() + + ingestion_pipeline.add_pipe( + pipe=self.pipes.parsing_pipe, parsing_pipe=True + ) + # Add embedding pipes if provider is set + if self.config.embedding.provider is not None: + ingestion_pipeline.add_pipe( + 
self.pipes.embedding_pipe, embedding_pipe=True + ) + ingestion_pipeline.add_pipe( + self.pipes.vector_storage_pipe, embedding_pipe=True + ) + # Add KG pipes if provider is set + if self.config.kg.provider is not None: + ingestion_pipeline.add_pipe(self.pipes.kg_pipe, kg_pipe=True) + ingestion_pipeline.add_pipe( + self.pipes.kg_storage_pipe, kg_pipe=True + ) + + return ingestion_pipeline + + def create_search_pipeline(self, *args, **kwargs) -> SearchPipeline: + """factory method to create an ingestion pipeline.""" + search_pipeline = SearchPipeline() + + # Add vector search pipes if embedding provider and vector provider is set + if ( + self.config.embedding.provider is not None + and self.config.vector_database.provider is not None + ): + search_pipeline.add_pipe( + self.pipes.vector_search_pipe, vector_search_pipe=True + ) + + # Add KG pipes if provider is set + if self.config.kg.provider is not None: + search_pipeline.add_pipe( + self.pipes.kg_agent_search_pipe, kg_pipe=True + ) + + return search_pipeline + + def create_rag_pipeline( + self, + search_pipeline: SearchPipeline, + stream: bool = False, + *args, + **kwargs, + ) -> RAGPipeline: + rag_pipe = ( + self.pipes.streaming_rag_pipe if stream else self.pipes.rag_pipe + ) + + rag_pipeline = RAGPipeline() + rag_pipeline.set_search_pipeline(search_pipeline) + rag_pipeline.add_pipe(rag_pipe) + return rag_pipeline + + def create_eval_pipeline(self, *args, **kwargs) -> EvalPipeline: + eval_pipeline = EvalPipeline() + eval_pipeline.add_pipe(self.pipes.eval_pipe) + return eval_pipeline + + def create_pipelines( + self, + ingestion_pipeline: Optional[IngestionPipeline] = None, + search_pipeline: Optional[SearchPipeline] = None, + rag_pipeline: Optional[RAGPipeline] = None, + streaming_rag_pipeline: Optional[RAGPipeline] = None, + eval_pipeline: Optional[EvalPipeline] = None, + *args, + **kwargs, + ) -> R2RPipelines: + try: + self.configure_logging() + except Exception as e: + logger.warn(f"Error configuring logging: {e}") + search_pipeline = search_pipeline or self.create_search_pipeline( + *args, **kwargs + ) + return R2RPipelines( + ingestion_pipeline=ingestion_pipeline + or self.create_ingestion_pipeline(*args, **kwargs), + search_pipeline=search_pipeline, + rag_pipeline=rag_pipeline + or self.create_rag_pipeline( + search_pipeline=search_pipeline, + stream=False, + *args, + **kwargs, + ), + streaming_rag_pipeline=streaming_rag_pipeline + or self.create_rag_pipeline( + search_pipeline=search_pipeline, + stream=True, + *args, + **kwargs, + ), + eval_pipeline=eval_pipeline + or self.create_eval_pipeline(*args, **kwargs), + ) + + def configure_logging(self): + KVLoggingSingleton.configure(self.config.logging) diff --git a/R2R/r2r/main/assembly/factory_extensions.py b/R2R/r2r/main/assembly/factory_extensions.py new file mode 100755 index 00000000..56e82ef7 --- /dev/null +++ b/R2R/r2r/main/assembly/factory_extensions.py @@ -0,0 +1,69 @@ +from r2r.main import R2RPipeFactory +from r2r.pipes.retrieval.multi_search import MultiSearchPipe +from r2r.pipes.retrieval.query_transform_pipe import QueryTransformPipe + + +class R2RPipeFactoryWithMultiSearch(R2RPipeFactory): + QUERY_GENERATION_TEMPLATE: dict = ( + { # TODO - Can we have stricter typing like so? `: {"template": str, "input_types": dict[str, str]} = {`` + "template": "### Instruction:\n\nGiven the following query that follows to write a double newline separated list of up to {num_outputs} queries meant to help answer the original query. 
\nDO NOT generate any single query which is likely to require information from multiple distinct documents, \nEACH single query will be used to carry out a cosine similarity semantic search over distinct indexed documents, such as varied medical documents. \nFOR EXAMPLE if asked `how do the key themes of Great Gatsby compare with 1984`, the two queries would be \n`What are the key themes of Great Gatsby?` and `What are the key themes of 1984?`.\nHere is the original user query to be transformed into answers:\n\n### Query:\n{message}\n\n### Response:\n", + "input_types": {"num_outputs": "int", "message": "str"}, + } + ) + + def create_vector_search_pipe(self, *args, **kwargs): + """ + A factory method to create a search pipe. + + Overrides include + task_prompt_name: str + multi_query_transform_pipe_override: QueryTransformPipe + multi_inner_search_pipe_override: SearchPipe + query_generation_template_override: {'template': str, 'input_types': dict[str, str]} + """ + multi_search_config = MultiSearchPipe.PipeConfig() + if kwargs.get("task_prompt_name") and kwargs.get( + "query_generation_template_override" + ): + raise ValueError( + "Cannot provide both `task_prompt_name` and `query_generation_template_override`" + ) + task_prompt_name = ( + kwargs.get("task_prompt_name") + or f"{multi_search_config.name}_task_prompt" + ) + if kwargs.get("query_generation_template_override"): + # Add a prompt for transforming the user query + template = kwargs.get("query_generation_template_override") + self.providers.prompt.add_prompt( + **( + kwargs.get("query_generation_template_override") + or self.QUERY_GENERATION_TEMPLATE + ), + ) + task_prompt_name = template["name"] + + # Initialize the new query transform pipe + query_transform_pipe = kwargs.get( + "multi_query_transform_pipe_override", None + ) or QueryTransformPipe( + llm_provider=self.providers.llm, + prompt_provider=self.providers.prompt, + config=QueryTransformPipe.QueryTransformConfig( + name=multi_search_config.name, + task_prompt=task_prompt_name, + ), + ) + # Create search pipe override and pipes + inner_search_pipe = kwargs.get( + "multi_inner_search_pipe_override", None + ) or super().create_vector_search_pipe(*args, **kwargs) + + # TODO - modify `create_..._pipe` to allow naming the pipe + inner_search_pipe.config.name = multi_search_config.name + + return MultiSearchPipe( + query_transform_pipe=query_transform_pipe, + inner_search_pipe=inner_search_pipe, + config=multi_search_config, + ) diff --git a/R2R/r2r/main/engine.py b/R2R/r2r/main/engine.py new file mode 100755 index 00000000..a73b932e --- /dev/null +++ b/R2R/r2r/main/engine.py @@ -0,0 +1,109 @@ +from typing import Optional + +from r2r.base import KVLoggingSingleton, RunManager +from r2r.base.abstractions.base import AsyncSyncMeta, syncable + +from .abstractions import R2RPipelines, R2RProviders +from .assembly.config import R2RConfig +from .services.ingestion_service import IngestionService +from .services.management_service import ManagementService +from .services.retrieval_service import RetrievalService + + +class R2REngine(metaclass=AsyncSyncMeta): + def __init__( + self, + config: R2RConfig, + providers: R2RProviders, + pipelines: R2RPipelines, + run_manager: Optional[RunManager] = None, + ): + logging_connection = KVLoggingSingleton() + run_manager = run_manager or RunManager(logging_connection) + + self.config = config + self.providers = providers + self.pipelines = pipelines + self.logging_connection = KVLoggingSingleton() + self.run_manager = run_manager + + 
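+        # Each service below shares the same config, providers, pipelines,
+        # run manager, and logging connection so that runs are tracked consistently.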
self.ingestion_service = IngestionService( + config, providers, pipelines, run_manager, logging_connection + ) + self.retrieval_service = RetrievalService( + config, providers, pipelines, run_manager, logging_connection + ) + self.management_service = ManagementService( + config, providers, pipelines, run_manager, logging_connection + ) + + # Ingestion routes + @syncable + async def aingest_documents(self, *args, **kwargs): + return await self.ingestion_service.ingest_documents(*args, **kwargs) + + @syncable + async def aupdate_documents(self, *args, **kwargs): + return await self.ingestion_service.update_documents(*args, **kwargs) + + @syncable + async def aingest_files(self, *args, **kwargs): + return await self.ingestion_service.ingest_files(*args, **kwargs) + + @syncable + async def aupdate_files(self, *args, **kwargs): + return await self.ingestion_service.update_files(*args, **kwargs) + + # Retrieval routes + @syncable + async def asearch(self, *args, **kwargs): + return await self.retrieval_service.search(*args, **kwargs) + + @syncable + async def arag(self, *args, **kwargs): + return await self.retrieval_service.rag(*args, **kwargs) + + @syncable + async def aevaluate(self, *args, **kwargs): + return await self.retrieval_service.evaluate(*args, **kwargs) + + # Management routes + @syncable + async def aupdate_prompt(self, *args, **kwargs): + return await self.management_service.update_prompt(*args, **kwargs) + + @syncable + async def alogs(self, *args, **kwargs): + return await self.management_service.alogs(*args, **kwargs) + + @syncable + async def aanalytics(self, *args, **kwargs): + return await self.management_service.aanalytics(*args, **kwargs) + + @syncable + async def aapp_settings(self, *args, **kwargs): + return await self.management_service.aapp_settings(*args, **kwargs) + + @syncable + async def ausers_overview(self, *args, **kwargs): + return await self.management_service.ausers_overview(*args, **kwargs) + + @syncable + async def adelete(self, *args, **kwargs): + return await self.management_service.delete(*args, **kwargs) + + @syncable + async def adocuments_overview(self, *args, **kwargs): + return await self.management_service.adocuments_overview( + *args, **kwargs + ) + + @syncable + async def inspect_knowledge_graph(self, *args, **kwargs): + return await self.management_service.inspect_knowledge_graph( + *args, **kwargs + ) + + @syncable + async def adocument_chunks(self, *args, **kwargs): + return await self.management_service.document_chunks(*args, **kwargs) diff --git a/R2R/r2r/main/execution.py b/R2R/r2r/main/execution.py new file mode 100755 index 00000000..187a2eea --- /dev/null +++ b/R2R/r2r/main/execution.py @@ -0,0 +1,421 @@ +import ast +import asyncio +import json +import os +import uuid +from typing import Optional, Union + +from fastapi import UploadFile + +from r2r.base import ( + AnalysisTypes, + FilterCriteria, + GenerationConfig, + KGSearchSettings, + VectorSearchSettings, + generate_id_from_label, +) + +from .api.client import R2RClient +from .assembly.builder import R2RBuilder +from .assembly.config import R2RConfig +from .r2r import R2R + + +class R2RExecutionWrapper: + """A demo class for the R2R library.""" + + def __init__( + self, + config_path: Optional[str] = None, + config_name: Optional[str] = "default", + client_mode: bool = True, + base_url="http://localhost:8000", + ): + if config_path and config_name: + raise Exception("Cannot specify both config_path and config_name") + + # Handle fire CLI + if isinstance(client_mode, str): + 
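# `fire` forwards command-line flags as strings, so an invocation such as
# `--client_mode=false` arrives here as the string "false"; coerce it back
# to a real bool before using it.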
client_mode = client_mode.lower() == "true" + self.client_mode = client_mode + self.base_url = base_url + + if self.client_mode: + self.client = R2RClient(base_url) + self.app = None + else: + config = ( + R2RConfig.from_json(config_path) + if config_path + else R2RConfig.from_json( + R2RBuilder.CONFIG_OPTIONS[config_name or "default"] + ) + ) + + self.client = None + self.app = R2R(config=config) + + def serve(self, host: str = "0.0.0.0", port: int = 8000): + if not self.client_mode: + self.app.serve(host, port) + else: + raise ValueError( + "Serve method is only available when `client_mode=False`." + ) + + def _parse_metadata_string(metadata_string: str) -> list[dict]: + """ + Convert a string representation of metadata into a list of dictionaries. + + The input string can be in one of two formats: + 1. JSON array of objects: '[{"key": "value"}, {"key2": "value2"}]' + 2. Python-like list of dictionaries: "[{'key': 'value'}, {'key2': 'value2'}]" + + Args: + metadata_string (str): The string representation of metadata. + + Returns: + list[dict]: A list of dictionaries representing the metadata. + + Raises: + ValueError: If the string cannot be parsed into a list of dictionaries. + """ + if not metadata_string: + return [] + + try: + # First, try to parse as JSON + return json.loads(metadata_string) + except json.JSONDecodeError as e: + try: + # If JSON parsing fails, try to evaluate as a Python literal + result = ast.literal_eval(metadata_string) + if not isinstance(result, list) or not all( + isinstance(item, dict) for item in result + ): + raise ValueError( + "The string does not represent a list of dictionaries" + ) from e + return result + except (ValueError, SyntaxError) as exc: + raise ValueError( + "Unable to parse the metadata string. " + "Please ensure it's a valid JSON array or Python list of dictionaries." 
+ ) from exc + + def ingest_files( + self, + file_paths: list[str], + metadatas: Optional[list[dict]] = None, + document_ids: Optional[list[Union[uuid.UUID, str]]] = None, + versions: Optional[list[str]] = None, + ): + if isinstance(file_paths, str): + file_paths = list(file_paths.split(",")) + if isinstance(metadatas, str): + metadatas = self._parse_metadata_string(metadatas) + if isinstance(document_ids, str): + document_ids = list(document_ids.split(",")) + if isinstance(versions, str): + versions = list(versions.split(",")) + + all_file_paths = [] + for path in file_paths: + if os.path.isdir(path): + for root, _, files in os.walk(path): + all_file_paths.extend( + os.path.join(root, file) for file in files + ) + else: + all_file_paths.append(path) + + if not document_ids: + document_ids = [ + generate_id_from_label(os.path.basename(file_path)) + for file_path in all_file_paths + ] + + files = [ + UploadFile( + filename=os.path.basename(file_path), + file=open(file_path, "rb"), + ) + for file_path in all_file_paths + ] + + for file in files: + file.file.seek(0, 2) + file.size = file.file.tell() + file.file.seek(0) + + try: + if self.client_mode: + return self.client.ingest_files( + file_paths=all_file_paths, + document_ids=document_ids, + metadatas=metadatas, + versions=versions, + monitor=True, + )["results"] + else: + return self.app.ingest_files( + files=files, + document_ids=document_ids, + metadatas=metadatas, + versions=versions, + ) + finally: + for file in files: + file.file.close() + + def update_files( + self, + file_paths: list[str], + document_ids: list[str], + metadatas: Optional[list[dict]] = None, + ): + if isinstance(file_paths, str): + file_paths = list(file_paths.split(",")) + if isinstance(metadatas, str): + metadatas = self._parse_metadata_string(metadatas) + if isinstance(document_ids, str): + document_ids = list(document_ids.split(",")) + + if self.client_mode: + return self.client.update_files( + file_paths=file_paths, + document_ids=document_ids, + metadatas=metadatas, + monitor=True, + )["results"] + else: + files = [ + UploadFile( + filename=file_path, + file=open(file_path, "rb"), + ) + for file_path in file_paths + ] + return self.app.update_files( + files=files, document_ids=document_ids, metadatas=metadatas + ) + + def search( + self, + query: str, + use_vector_search: bool = True, + search_filters: Optional[dict] = None, + search_limit: int = 10, + do_hybrid_search: bool = False, + use_kg_search: bool = False, + kg_agent_generation_config: Optional[dict] = None, + ): + if self.client_mode: + return self.client.search( + query, + use_vector_search, + search_filters, + search_limit, + do_hybrid_search, + use_kg_search, + kg_agent_generation_config, + )["results"] + else: + return self.app.search( + query, + VectorSearchSettings( + use_vector_search=use_vector_search, + search_filters=search_filters or {}, + search_limit=search_limit, + do_hybrid_search=do_hybrid_search, + ), + KGSearchSettings( + use_kg_search=use_kg_search, + agent_generation_config=GenerationConfig( + **(kg_agent_generation_config or {}) + ), + ), + ) + + def rag( + self, + query: str, + use_vector_search: bool = True, + search_filters: Optional[dict] = None, + search_limit: int = 10, + do_hybrid_search: bool = False, + use_kg_search: bool = False, + kg_agent_generation_config: Optional[dict] = None, + stream: bool = False, + rag_generation_config: Optional[dict] = None, + ): + if self.client_mode: + response = self.client.rag( + query=query, + use_vector_search=use_vector_search, + 
search_filters=search_filters or {}, + search_limit=search_limit, + do_hybrid_search=do_hybrid_search, + use_kg_search=use_kg_search, + kg_agent_generation_config=kg_agent_generation_config, + rag_generation_config=rag_generation_config, + ) + if not stream: + response = response["results"] + return response + else: + return response + else: + response = self.app.rag( + query, + vector_search_settings=VectorSearchSettings( + use_vector_search=use_vector_search, + search_filters=search_filters or {}, + search_limit=search_limit, + do_hybrid_search=do_hybrid_search, + ), + kg_search_settings=KGSearchSettings( + use_kg_search=use_kg_search, + agent_generation_config=GenerationConfig( + **(kg_agent_generation_config or {}) + ), + ), + rag_generation_config=GenerationConfig( + **(rag_generation_config or {}) + ), + ) + if not stream: + return response + else: + + async def async_generator(): + async for chunk in response: + yield chunk + + def sync_generator(): + try: + loop = asyncio.get_event_loop() + async_gen = async_generator() + while True: + try: + yield loop.run_until_complete( + async_gen.__anext__() + ) + except StopAsyncIteration: + break + except Exception: + pass + + return sync_generator() + + def documents_overview( + self, + document_ids: Optional[list[str]] = None, + user_ids: Optional[list[str]] = None, + ): + if self.client_mode: + return self.client.documents_overview(document_ids, user_ids)[ + "results" + ] + else: + return self.app.documents_overview(document_ids, user_ids) + + def delete( + self, + keys: list[str], + values: list[str], + ): + if self.client_mode: + return self.client.delete(keys, values)["results"] + else: + return self.app.delete(keys, values) + + def logs(self, log_type_filter: Optional[str] = None): + if self.client_mode: + return self.client.logs(log_type_filter)["results"] + else: + return self.app.logs(log_type_filter) + + def document_chunks(self, document_id: str): + doc_uuid = uuid.UUID(document_id) + if self.client_mode: + return self.client.document_chunks(doc_uuid)["results"] + else: + return self.app.document_chunks(doc_uuid) + + def app_settings(self): + if self.client_mode: + return self.client.app_settings() + else: + return self.app.app_settings() + + def users_overview(self, user_ids: Optional[list[uuid.UUID]] = None): + if self.client_mode: + return self.client.users_overview(user_ids)["results"] + else: + return self.app.users_overview(user_ids) + + def analytics( + self, + filters: Optional[str] = None, + analysis_types: Optional[str] = None, + ): + filter_criteria = FilterCriteria(filters=filters) + analysis_types = AnalysisTypes(analysis_types=analysis_types) + + if self.client_mode: + return self.client.analytics( + filter_criteria=filter_criteria.model_dump(), + analysis_types=analysis_types.model_dump(), + )["results"] + else: + return self.app.analytics( + filter_criteria=filter_criteria, analysis_types=analysis_types + ) + + def ingest_sample_file(self, no_media: bool = True, option: int = 0): + from r2r.examples.scripts.sample_data_ingestor import ( + SampleDataIngestor, + ) + + """Ingest the first sample file into R2R.""" + sample_ingestor = SampleDataIngestor(self) + return sample_ingestor.ingest_sample_file( + no_media=no_media, option=option + ) + + def ingest_sample_files(self, no_media: bool = True): + from r2r.examples.scripts.sample_data_ingestor import ( + SampleDataIngestor, + ) + + """Ingest the first sample file into R2R.""" + sample_ingestor = SampleDataIngestor(self) + return 
sample_ingestor.ingest_sample_files(no_media=no_media) + + def inspect_knowledge_graph(self, limit: int = 100) -> str: + if self.client_mode: + return self.client.inspect_knowledge_graph(limit)["results"] + else: + return self.engine.inspect_knowledge_graph(limit) + + def health(self) -> str: + if self.client_mode: + return self.client.health() + else: + pass + + def get_app(self): + if not self.client_mode: + return self.app.app.app + else: + raise Exception( + "`get_app` method is only available when running with `client_mode=False`." + ) + + +if __name__ == "__main__": + import fire + + fire.Fire(R2RExecutionWrapper) diff --git a/R2R/r2r/main/r2r.py b/R2R/r2r/main/r2r.py new file mode 100755 index 00000000..2d8601b2 --- /dev/null +++ b/R2R/r2r/main/r2r.py @@ -0,0 +1,51 @@ +from typing import Optional + +from .app import R2RApp +from .assembly.config import R2RConfig +from .engine import R2REngine + + +class R2R: + engine: R2REngine + app: R2RApp + + def __init__( + self, + engine: Optional[R2REngine] = None, + app: Optional[R2RApp] = None, + config: Optional[R2RConfig] = None, + from_config: Optional[str] = None, + *args, + **kwargs + ): + if engine and app: + self.engine = engine + self.app = app + elif (config or from_config) or ( + config is None and from_config is None + ): + from .assembly.builder import R2RBuilder + + # Handle the case where 'from_config' is None and 'config' is None + if not config and not from_config: + from_config = "default" + builder = R2RBuilder( + config=config, + from_config=from_config, + ) + built = builder.build() + self.engine = built.engine + self.app = built.app + else: + raise ValueError( + "Must provide either 'engine' and 'app', or 'config'/'from_config' to build the R2R object." + ) + + def __getattr__(self, name): + # Check if the attribute name is 'app' and return it directly + if name == "app": + return self.app + elif name == "serve": + return self.app.serve + # Otherwise, delegate to the engine + return getattr(self.engine, name) diff --git a/R2R/r2r/main/services/__init__.py b/R2R/r2r/main/services/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/main/services/__init__.py diff --git a/R2R/r2r/main/services/base.py b/R2R/r2r/main/services/base.py new file mode 100755 index 00000000..02c0675d --- /dev/null +++ b/R2R/r2r/main/services/base.py @@ -0,0 +1,22 @@ +from abc import ABC + +from r2r.base import KVLoggingSingleton, RunManager + +from ..abstractions import R2RPipelines, R2RProviders +from ..assembly.config import R2RConfig + + +class Service(ABC): + def __init__( + self, + config: R2RConfig, + providers: R2RProviders, + pipelines: R2RPipelines, + run_manager: RunManager, + logging_connection: KVLoggingSingleton, + ): + self.config = config + self.providers = providers + self.pipelines = pipelines + self.run_manager = run_manager + self.logging_connection = logging_connection diff --git a/R2R/r2r/main/services/ingestion_service.py b/R2R/r2r/main/services/ingestion_service.py new file mode 100755 index 00000000..5677807a --- /dev/null +++ b/R2R/r2r/main/services/ingestion_service.py @@ -0,0 +1,505 @@ +import json +import logging +import uuid +from collections import defaultdict +from datetime import datetime +from typing import Any, Optional, Union + +from fastapi import Form, UploadFile + +from r2r.base import ( + Document, + DocumentInfo, + DocumentType, + KVLoggingSingleton, + R2RDocumentProcessingError, + R2RException, + RunManager, + generate_id_from_label, + increment_version, + 
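# generate_id_from_label: deterministic document IDs derived from filenames;
# increment_version: bumps a document's version string on update;
# to_async_generator: wraps an in-memory list so pipelines can consume it asynchronously.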
to_async_generator, +) +from r2r.telemetry.telemetry_decorator import telemetry_event + +from ..abstractions import R2RPipelines, R2RProviders +from ..api.requests import R2RIngestFilesRequest, R2RUpdateFilesRequest +from ..assembly.config import R2RConfig +from .base import Service + +logger = logging.getLogger(__name__) +MB_CONVERSION_FACTOR = 1024 * 1024 + + +class IngestionService(Service): + def __init__( + self, + config: R2RConfig, + providers: R2RProviders, + pipelines: R2RPipelines, + run_manager: RunManager, + logging_connection: KVLoggingSingleton, + ): + super().__init__( + config, providers, pipelines, run_manager, logging_connection + ) + + def _file_to_document( + self, file: UploadFile, document_id: uuid.UUID, metadata: dict + ) -> Document: + file_extension = file.filename.split(".")[-1].lower() + if file_extension.upper() not in DocumentType.__members__: + raise R2RException( + status_code=415, + message=f"'{file_extension}' is not a valid DocumentType.", + ) + + document_title = ( + metadata.get("title", None) or file.filename.split("/")[-1] + ) + metadata["title"] = document_title + + return Document( + id=document_id, + type=DocumentType[file_extension.upper()], + data=file.file.read(), + metadata=metadata, + ) + + @telemetry_event("IngestDocuments") + async def ingest_documents( + self, + documents: list[Document], + versions: Optional[list[str]] = None, + *args: Any, + **kwargs: Any, + ): + if len(documents) == 0: + raise R2RException( + status_code=400, message="No documents provided for ingestion." + ) + + document_infos = [] + skipped_documents = [] + processed_documents = {} + duplicate_documents = defaultdict(list) + + existing_document_info = { + doc_info.document_id: doc_info + for doc_info in self.providers.vector_db.get_documents_overview() + } + + for iteration, document in enumerate(documents): + version = versions[iteration] if versions else "v0" + + # Check for duplicates within the current batch + if document.id in processed_documents: + duplicate_documents[document.id].append( + document.metadata.get("title", str(document.id)) + ) + continue + + if ( + document.id in existing_document_info + and existing_document_info[document.id].version == version + and existing_document_info[document.id].status == "success" + ): + logger.error( + f"Document with ID {document.id} was already successfully processed." + ) + if len(documents) == 1: + raise R2RException( + status_code=409, + message=f"Document with ID {document.id} was already successfully processed.", + ) + skipped_documents.append( + ( + document.id, + document.metadata.get("title", None) + or str(document.id), + ) + ) + continue + + now = datetime.now() + document_infos.append( + DocumentInfo( + document_id=document.id, + version=version, + size_in_bytes=len(document.data), + metadata=document.metadata.copy(), + title=document.metadata.get("title", str(document.id)), + user_id=document.metadata.get("user_id", None), + created_at=now, + updated_at=now, + status="processing", # Set initial status to `processing` + ) + ) + + processed_documents[document.id] = document.metadata.get( + "title", str(document.id) + ) + + if duplicate_documents: + duplicate_details = [ + f"{doc_id}: {', '.join(titles)}" + for doc_id, titles in duplicate_documents.items() + ] + warning_message = f"Duplicate documents detected: {'; '.join(duplicate_details)}. These duplicates were skipped." 
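# Duplicate IDs inside a single ingestion batch are rejected outright rather
# than silently de-duplicated; the 418 response carries the offending titles.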
+ raise R2RException(status_code=418, message=warning_message) + + if skipped_documents and len(skipped_documents) == len(documents): + logger.error("All provided documents already exist.") + raise R2RException( + status_code=409, + message="All provided documents already exist. Use the `update_documents` endpoint instead to update these documents.", + ) + + # Insert pending document infos + self.providers.vector_db.upsert_documents_overview(document_infos) + ingestion_results = await self.pipelines.ingestion_pipeline.run( + input=to_async_generator( + [ + doc + for doc in documents + if doc.id + not in [skipped[0] for skipped in skipped_documents] + ] + ), + versions=[info.version for info in document_infos], + run_manager=self.run_manager, + *args, + **kwargs, + ) + + return await self._process_ingestion_results( + ingestion_results, + document_infos, + skipped_documents, + processed_documents, + ) + + @telemetry_event("IngestFiles") + async def ingest_files( + self, + files: list[UploadFile], + metadatas: Optional[list[dict]] = None, + document_ids: Optional[list[uuid.UUID]] = None, + versions: Optional[list[str]] = None, + *args: Any, + **kwargs: Any, + ): + if not files: + raise R2RException( + status_code=400, message="No files provided for ingestion." + ) + + try: + documents = [] + for iteration, file in enumerate(files): + logger.info(f"Processing file: {file.filename}") + if ( + file.size + > self.config.app.get("max_file_size_in_mb", 32) + * MB_CONVERSION_FACTOR + ): + raise R2RException( + status_code=413, + message=f"File size exceeds maximum allowed size: {file.filename}", + ) + if not file.filename: + raise R2RException( + status_code=400, message="File name not provided." + ) + + document_metadata = metadatas[iteration] if metadatas else {} + document_id = ( + document_ids[iteration] + if document_ids + else generate_id_from_label(file.filename.split("/")[-1]) + ) + + document = self._file_to_document( + file, document_id, document_metadata + ) + documents.append(document) + + return await self.ingest_documents( + documents, versions, *args, **kwargs + ) + + finally: + for file in files: + file.file.close() + + @telemetry_event("UpdateFiles") + async def update_files( + self, + files: list[UploadFile], + document_ids: list[uuid.UUID], + metadatas: Optional[list[dict]] = None, + *args: Any, + **kwargs: Any, + ): + if not files: + raise R2RException( + status_code=400, message="No files provided for update." 
+ ) + + try: + if len(document_ids) != len(files): + raise R2RException( + status_code=400, + message="Number of ids does not match number of files.", + ) + + documents_overview = await self._documents_overview( + document_ids=document_ids + ) + if len(documents_overview) != len(files): + raise R2RException( + status_code=404, + message="One or more documents was not found.", + ) + + documents = [] + new_versions = [] + + for it, (file, doc_id, doc_info) in enumerate( + zip(files, document_ids, documents_overview) + ): + if not doc_info: + raise R2RException( + status_code=404, + message=f"Document with id {doc_id} not found.", + ) + + new_version = increment_version(doc_info.version) + new_versions.append(new_version) + + updated_metadata = ( + metadatas[it] if metadatas else doc_info.metadata + ) + updated_metadata["title"] = ( + updated_metadata.get("title", None) + or file.filename.split("/")[-1] + ) + + document = self._file_to_document( + file, doc_id, updated_metadata + ) + documents.append(document) + + ingestion_results = await self.ingest_documents( + documents, versions=new_versions, *args, **kwargs + ) + + for doc_id, old_version in zip( + document_ids, + [doc_info.version for doc_info in documents_overview], + ): + await self._delete( + ["document_id", "version"], [str(doc_id), old_version] + ) + self.providers.vector_db.delete_from_documents_overview( + doc_id, old_version + ) + + return ingestion_results + + finally: + for file in files: + file.file.close() + + async def _process_ingestion_results( + self, + ingestion_results: dict, + document_infos: list[DocumentInfo], + skipped_documents: list[tuple[str, str]], + processed_documents: dict, + ): + skipped_ids = [ele[0] for ele in skipped_documents] + failed_ids = [] + successful_ids = [] + + results = {} + if ingestion_results["embedding_pipeline_output"]: + results = { + k: v for k, v in ingestion_results["embedding_pipeline_output"] + } + for doc_id, error in results.items(): + if isinstance(error, R2RDocumentProcessingError): + logger.error( + f"Error processing document with ID {error.document_id}: {error.message}" + ) + failed_ids.append(error.document_id) + elif isinstance(error, Exception): + logger.error(f"Error processing document: {error}") + failed_ids.append(doc_id) + else: + successful_ids.append(doc_id) + + documents_to_upsert = [] + for document_info in document_infos: + if document_info.document_id not in skipped_ids: + if document_info.document_id in failed_ids: + document_info.status = "failure" + elif document_info.document_id in successful_ids: + document_info.status = "success" + documents_to_upsert.append(document_info) + + if documents_to_upsert: + self.providers.vector_db.upsert_documents_overview( + documents_to_upsert + ) + + results = { + "processed_documents": [ + f"Document '{processed_documents[document_id]}' processed successfully." + for document_id in successful_ids + ], + "failed_documents": [ + f"Document '{processed_documents[document_id]}': {results[document_id]}" + for document_id in failed_ids + ], + "skipped_documents": [ + f"Document '{filename}' skipped since it already exists." 
+ for _, filename in skipped_documents + ], + } + + # TODO - Clean up logging for document parse results + run_ids = list(self.run_manager.run_info.keys()) + if run_ids: + run_id = run_ids[0] + for key in results: + if key in ["processed_documents", "failed_documents"]: + for value in results[key]: + await self.logging_connection.log( + log_id=run_id, + key="document_parse_result", + value=value, + ) + return results + + @staticmethod + def parse_ingest_files_form_data( + metadatas: Optional[str] = Form(None), + document_ids: str = Form(None), + versions: Optional[str] = Form(None), + ) -> R2RIngestFilesRequest: + try: + parsed_metadatas = ( + json.loads(metadatas) + if metadatas and metadatas != "null" + else None + ) + if parsed_metadatas is not None and not isinstance( + parsed_metadatas, list + ): + raise ValueError("metadatas must be a list of dictionaries") + + parsed_document_ids = ( + json.loads(document_ids) + if document_ids and document_ids != "null" + else None + ) + if parsed_document_ids is not None: + parsed_document_ids = [ + uuid.UUID(doc_id) for doc_id in parsed_document_ids + ] + + parsed_versions = ( + json.loads(versions) + if versions and versions != "null" + else None + ) + + request_data = { + "metadatas": parsed_metadatas, + "document_ids": parsed_document_ids, + "versions": parsed_versions, + } + return R2RIngestFilesRequest(**request_data) + except json.JSONDecodeError as e: + raise R2RException( + status_code=400, message=f"Invalid JSON in form data: {e}" + ) + except ValueError as e: + raise R2RException(status_code=400, message=str(e)) + except Exception as e: + raise R2RException( + status_code=400, message=f"Error processing form data: {e}" + ) + + @staticmethod + def parse_update_files_form_data( + metadatas: Optional[str] = Form(None), + document_ids: str = Form(...), + ) -> R2RUpdateFilesRequest: + try: + parsed_metadatas = ( + json.loads(metadatas) + if metadatas and metadatas != "null" + else None + ) + if parsed_metadatas is not None and not isinstance( + parsed_metadatas, list + ): + raise ValueError("metadatas must be a list of dictionaries") + + if not document_ids or document_ids == "null": + raise ValueError("document_ids is required and cannot be null") + + parsed_document_ids = json.loads(document_ids) + if not isinstance(parsed_document_ids, list): + raise ValueError("document_ids must be a list") + parsed_document_ids = [ + uuid.UUID(doc_id) for doc_id in parsed_document_ids + ] + + request_data = { + "metadatas": parsed_metadatas, + "document_ids": parsed_document_ids, + } + return R2RUpdateFilesRequest(**request_data) + except json.JSONDecodeError as e: + raise R2RException( + status_code=400, message=f"Invalid JSON in form data: {e}" + ) + except ValueError as e: + raise R2RException(status_code=400, message=str(e)) + except Exception as e: + raise R2RException( + status_code=400, message=f"Error processing form data: {e}" + ) + + # TODO - Move to mgmt service for document info, delete, post orchestration buildout + async def _documents_overview( + self, + document_ids: Optional[list[uuid.UUID]] = None, + user_ids: Optional[list[uuid.UUID]] = None, + *args: Any, + **kwargs: Any, + ): + return self.providers.vector_db.get_documents_overview( + filter_document_ids=( + [str(ele) for ele in document_ids] if document_ids else None + ), + filter_user_ids=( + [str(ele) for ele in user_ids] if user_ids else None + ), + ) + + async def _delete( + self, keys: list[str], values: list[Union[bool, int, str]] + ): + logger.info( + f"Deleting documents 
which match on these keys and values: ({keys}, {values})" + ) + + ids = self.providers.vector_db.delete_by_metadata(keys, values) + if not ids: + raise R2RException( + status_code=404, message="No entries found for deletion." + ) + return "Entries deleted successfully." diff --git a/R2R/r2r/main/services/management_service.py b/R2R/r2r/main/services/management_service.py new file mode 100755 index 00000000..00f1f56e --- /dev/null +++ b/R2R/r2r/main/services/management_service.py @@ -0,0 +1,385 @@ +import logging +import uuid +from collections import defaultdict +from typing import Any, Dict, List, Optional, Tuple, Union + +from r2r.base import ( + AnalysisTypes, + FilterCriteria, + KVLoggingSingleton, + LogProcessor, + R2RException, + RunManager, +) +from r2r.telemetry.telemetry_decorator import telemetry_event + +from ..abstractions import R2RPipelines, R2RProviders +from ..assembly.config import R2RConfig +from .base import Service + +logger = logging.getLogger(__name__) + + +class ManagementService(Service): + def __init__( + self, + config: R2RConfig, + providers: R2RProviders, + pipelines: R2RPipelines, + run_manager: RunManager, + logging_connection: KVLoggingSingleton, + ): + super().__init__( + config, providers, pipelines, run_manager, logging_connection + ) + + @telemetry_event("UpdatePrompt") + async def update_prompt( + self, + name: str, + template: Optional[str] = None, + input_types: Optional[dict[str, str]] = {}, + *args, + **kwargs, + ): + self.providers.prompt.update_prompt(name, template, input_types) + return f"Prompt '{name}' added successfully." + + @telemetry_event("Logs") + async def alogs( + self, + log_type_filter: Optional[str] = None, + max_runs_requested: int = 100, + *args: Any, + **kwargs: Any, + ): + if self.logging_connection is None: + raise R2RException( + status_code=404, message="Logging provider not found." + ) + if ( + self.config.app.get("max_logs_per_request", 100) + > max_runs_requested + ): + raise R2RException( + status_code=400, + message="Max runs requested exceeds the limit.", + ) + + run_info = await self.logging_connection.get_run_info( + limit=max_runs_requested, + log_type_filter=log_type_filter, + ) + run_ids = [run.run_id for run in run_info] + if len(run_ids) == 0: + return [] + logs = await self.logging_connection.get_logs(run_ids) + # Aggregate logs by run_id and include run_type + aggregated_logs = [] + + for run in run_info: + run_logs = [log for log in logs if log["log_id"] == run.run_id] + entries = [ + {"key": log["key"], "value": log["value"]} for log in run_logs + ][ + ::-1 + ] # Reverse order so that earliest logged values appear first. 
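# Collect one record per run: its run_id, its run_type, and the
# chronologically ordered key/value entries gathered above.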
+ aggregated_logs.append( + { + "run_id": run.run_id, + "run_type": run.log_type, + "entries": entries, + } + ) + + return aggregated_logs + + @telemetry_event("Analytics") + async def aanalytics( + self, + filter_criteria: FilterCriteria, + analysis_types: AnalysisTypes, + *args, + **kwargs, + ): + run_info = await self.logging_connection.get_run_info(limit=100) + run_ids = [info.run_id for info in run_info] + + if not run_ids: + return { + "analytics_data": "No logs found.", + "filtered_logs": {}, + } + logs = await self.logging_connection.get_logs(run_ids=run_ids) + + filters = {} + if filter_criteria.filters: + for key, value in filter_criteria.filters.items(): + filters[key] = lambda log, value=value: ( + any( + entry.get("key") == value + for entry in log.get("entries", []) + ) + if "entries" in log + else log.get("key") == value + ) + + log_processor = LogProcessor(filters) + for log in logs: + if "entries" in log and isinstance(log["entries"], list): + log_processor.process_log(log) + elif "key" in log: + log_processor.process_log(log) + else: + logger.warning( + f"Skipping log due to missing or malformed 'entries': {log}" + ) + + filtered_logs = dict(log_processor.populations.items()) + results = {"filtered_logs": filtered_logs} + + if analysis_types and analysis_types.analysis_types: + for ( + filter_key, + analysis_config, + ) in analysis_types.analysis_types.items(): + if filter_key in filtered_logs: + analysis_type = analysis_config[0] + if analysis_type == "bar_chart": + extract_key = analysis_config[1] + results[filter_key] = ( + AnalysisTypes.generate_bar_chart_data( + filtered_logs[filter_key], extract_key + ) + ) + elif analysis_type == "basic_statistics": + extract_key = analysis_config[1] + results[filter_key] = ( + AnalysisTypes.calculate_basic_statistics( + filtered_logs[filter_key], extract_key + ) + ) + elif analysis_type == "percentile": + extract_key = analysis_config[1] + percentile = int(analysis_config[2]) + results[filter_key] = ( + AnalysisTypes.calculate_percentile( + filtered_logs[filter_key], + extract_key, + percentile, + ) + ) + else: + logger.warning( + f"Unknown analysis type for filter key '{filter_key}': {analysis_type}" + ) + + return results + + @telemetry_event("AppSettings") + async def aapp_settings(self, *args: Any, **kwargs: Any): + prompts = self.providers.prompt.get_all_prompts() + return { + "config": self.config.to_json(), + "prompts": { + name: prompt.dict() for name, prompt in prompts.items() + }, + } + + @telemetry_event("UsersOverview") + async def ausers_overview( + self, + user_ids: Optional[list[uuid.UUID]] = None, + *args, + **kwargs, + ): + return self.providers.vector_db.get_users_overview( + [str(ele) for ele in user_ids] if user_ids else None + ) + + @telemetry_event("Delete") + async def delete( + self, + keys: list[str], + values: list[Union[bool, int, str]], + *args, + **kwargs, + ): + metadata = ", ".join( + f"{key}={value}" for key, value in zip(keys, values) + ) + values = [str(value) for value in values] + logger.info(f"Deleting entries with metadata: {metadata}") + ids = self.providers.vector_db.delete_by_metadata(keys, values) + if not ids: + raise R2RException( + status_code=404, message="No entries found for deletion." + ) + for id in ids: + self.providers.vector_db.delete_from_documents_overview(id) + return f"Documents {ids} deleted successfully." 
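# Usage sketch (illustrative only; the service instance name and the UUID
# value below are hypothetical):
#   await management_service.delete(keys=["document_id"], values=["<document-uuid>"])
# removes the matching vector entries and their documents-overview rows.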
+ + @telemetry_event("DocumentsOverview") + async def adocuments_overview( + self, + document_ids: Optional[list[uuid.UUID]] = None, + user_ids: Optional[list[uuid.UUID]] = None, + *args: Any, + **kwargs: Any, + ): + return self.providers.vector_db.get_documents_overview( + filter_document_ids=( + [str(ele) for ele in document_ids] if document_ids else None + ), + filter_user_ids=( + [str(ele) for ele in user_ids] if user_ids else None + ), + ) + + @telemetry_event("DocumentChunks") + async def document_chunks( + self, + document_id: uuid.UUID, + *args, + **kwargs, + ): + return self.providers.vector_db.get_document_chunks(str(document_id)) + + @telemetry_event("UsersOverview") + async def users_overview( + self, + user_ids: Optional[list[uuid.UUID]], + *args, + **kwargs, + ): + return self.providers.vector_db.get_users_overview( + [str(ele) for ele in user_ids] + ) + + @telemetry_event("InspectKnowledgeGraph") + async def inspect_knowledge_graph( + self, limit=10000, *args: Any, **kwargs: Any + ): + if self.providers.kg is None: + raise R2RException( + status_code=404, message="Knowledge Graph provider not found." + ) + + rel_query = f""" + MATCH (n1)-[r]->(n2) + RETURN n1.id AS subject, type(r) AS relation, n2.id AS object + LIMIT {limit} + """ + + try: + with self.providers.kg.client.session( + database=self.providers.kg._database + ) as session: + results = session.run(rel_query) + relationships = [ + (record["subject"], record["relation"], record["object"]) + for record in results + ] + + # Create graph representation and group relationships + graph, grouped_relationships = self.process_relationships( + relationships + ) + + # Generate output + output = self.generate_output(grouped_relationships, graph) + + return "\n".join(output) + + except Exception as e: + logger.error(f"Error printing relationships: {str(e)}") + raise R2RException( + status_code=500, + message=f"An error occurred while fetching relationships: {str(e)}", + ) + + def process_relationships( + self, relationships: List[Tuple[str, str, str]] + ) -> Tuple[Dict[str, List[str]], Dict[str, Dict[str, List[str]]]]: + graph = defaultdict(list) + grouped = defaultdict(lambda: defaultdict(list)) + for subject, relation, obj in relationships: + graph[subject].append(obj) + grouped[subject][relation].append(obj) + if obj not in graph: + graph[obj] = [] + return dict(graph), dict(grouped) + + def generate_output( + self, + grouped_relationships: Dict[str, Dict[str, List[str]]], + graph: Dict[str, List[str]], + ) -> List[str]: + output = [] + + # Print grouped relationships + for subject, relations in grouped_relationships.items(): + output.append(f"\n== {subject} ==") + for relation, objects in relations.items(): + output.append(f" {relation}:") + for obj in objects: + output.append(f" - {obj}") + + # Print basic graph statistics + output.append("\n== Graph Statistics ==") + output.append(f"Number of nodes: {len(graph)}") + output.append( + f"Number of edges: {sum(len(neighbors) for neighbors in graph.values())}" + ) + output.append( + f"Number of connected components: {self.count_connected_components(graph)}" + ) + + # Find central nodes + central_nodes = self.get_central_nodes(graph) + output.append("\n== Most Central Nodes ==") + for node, centrality in central_nodes: + output.append(f" {node}: {centrality:.4f}") + + return output + + def count_connected_components(self, graph: Dict[str, List[str]]) -> int: + visited = set() + components = 0 + + def dfs(node): + visited.add(node) + for neighbor in graph[node]: + if neighbor 
not in visited: + dfs(neighbor) + + for node in graph: + if node not in visited: + dfs(node) + components += 1 + + return components + + def get_central_nodes( + self, graph: Dict[str, List[str]] + ) -> List[Tuple[str, float]]: + degree = {node: len(neighbors) for node, neighbors in graph.items()} + total_nodes = len(graph) + centrality = { + node: deg / (total_nodes - 1) for node, deg in degree.items() + } + return sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:5] + + @telemetry_event("AppSettings") + async def app_settings( + self, + *args, + **kwargs, + ): + prompts = self.providers.prompt.get_all_prompts() + return { + "config": self.config.to_json(), + "prompts": { + name: prompt.dict() for name, prompt in prompts.items() + }, + } diff --git a/R2R/r2r/main/services/retrieval_service.py b/R2R/r2r/main/services/retrieval_service.py new file mode 100755 index 00000000..c4f6aff5 --- /dev/null +++ b/R2R/r2r/main/services/retrieval_service.py @@ -0,0 +1,207 @@ +import logging +import time +import uuid +from typing import Optional + +from r2r.base import ( + GenerationConfig, + KGSearchSettings, + KVLoggingSingleton, + R2RException, + RunManager, + VectorSearchSettings, + manage_run, + to_async_generator, +) +from r2r.pipes import EvalPipe +from r2r.telemetry.telemetry_decorator import telemetry_event + +from ..abstractions import R2RPipelines, R2RProviders +from ..assembly.config import R2RConfig +from .base import Service + +logger = logging.getLogger(__name__) + + +class RetrievalService(Service): + def __init__( + self, + config: R2RConfig, + providers: R2RProviders, + pipelines: R2RPipelines, + run_manager: RunManager, + logging_connection: KVLoggingSingleton, + ): + super().__init__( + config, providers, pipelines, run_manager, logging_connection + ) + + @telemetry_event("Search") + async def search( + self, + query: str, + vector_search_settings: VectorSearchSettings = VectorSearchSettings(), + kg_search_settings: KGSearchSettings = KGSearchSettings(), + *args, + **kwargs, + ): + async with manage_run(self.run_manager, "search_app") as run_id: + t0 = time.time() + + if ( + kg_search_settings.use_kg_search + and self.config.kg.provider is None + ): + raise R2RException( + status_code=400, + message="Knowledge Graph search is not enabled in the configuration.", + ) + + if ( + vector_search_settings.use_vector_search + and self.config.vector_database.provider is None + ): + raise R2RException( + status_code=400, + message="Vector search is not enabled in the configuration.", + ) + + # TODO - Remove these transforms once we have a better way to handle this + for filter, value in vector_search_settings.search_filters.items(): + if isinstance(value, uuid.UUID): + vector_search_settings.search_filters[filter] = str(value) + + results = await self.pipelines.search_pipeline.run( + input=to_async_generator([query]), + vector_search_settings=vector_search_settings, + kg_search_settings=kg_search_settings, + run_manager=self.run_manager, + *args, + **kwargs, + ) + + t1 = time.time() + latency = f"{t1 - t0:.2f}" + + await self.logging_connection.log( + log_id=run_id, + key="search_latency", + value=latency, + is_info_log=False, + ) + + return results.dict() + + @telemetry_event("RAG") + async def rag( + self, + query: str, + rag_generation_config: GenerationConfig, + vector_search_settings: VectorSearchSettings = VectorSearchSettings(), + kg_search_settings: KGSearchSettings = KGSearchSettings(), + *args, + **kwargs, + ): + async with manage_run(self.run_manager, "rag_app") as 
run_id: + try: + t0 = time.time() + + # TODO - Remove these transforms once we have a better way to handle this + for ( + filter, + value, + ) in vector_search_settings.search_filters.items(): + if isinstance(value, uuid.UUID): + vector_search_settings.search_filters[filter] = str( + value + ) + + if rag_generation_config.stream: + t1 = time.time() + latency = f"{t1 - t0:.2f}" + + await self.logging_connection.log( + log_id=run_id, + key="rag_generation_latency", + value=latency, + is_info_log=False, + ) + + async def stream_response(): + async with manage_run(self.run_manager, "arag"): + async for ( + chunk + ) in await self.pipelines.streaming_rag_pipeline.run( + input=to_async_generator([query]), + run_manager=self.run_manager, + vector_search_settings=vector_search_settings, + kg_search_settings=kg_search_settings, + rag_generation_config=rag_generation_config, + ): + yield chunk + + return stream_response() + + results = await self.pipelines.rag_pipeline.run( + input=to_async_generator([query]), + run_manager=self.run_manager, + vector_search_settings=vector_search_settings, + kg_search_settings=kg_search_settings, + rag_generation_config=rag_generation_config, + *args, + **kwargs, + ) + + t1 = time.time() + latency = f"{t1 - t0:.2f}" + + await self.logging_connection.log( + log_id=run_id, + key="rag_generation_latency", + value=latency, + is_info_log=False, + ) + + if len(results) == 0: + raise R2RException( + status_code=404, message="No results found" + ) + if len(results) > 1: + logger.warning( + f"Multiple results found for query: {query}" + ) + # unpack the first result + return results[0] + + except Exception as e: + logger.error(f"Pipeline error: {str(e)}") + if "NoneType" in str(e): + raise R2RException( + status_code=502, + message="Ollama server not reachable or returned an invalid response", + ) + raise R2RException( + status_code=500, message="Internal Server Error" + ) + + @telemetry_event("Evaluate") + async def evaluate( + self, + query: str, + context: str, + completion: str, + eval_generation_config: Optional[GenerationConfig], + *args, + **kwargs, + ): + eval_payload = EvalPipe.EvalPayload( + query=query, + context=context, + completion=completion, + ) + result = await self.eval_pipeline.run( + input=to_async_generator([eval_payload]), + run_manager=self.run_manager, + eval_generation_config=eval_generation_config, + ) + return result diff --git a/R2R/r2r/parsers/__init__.py b/R2R/r2r/parsers/__init__.py new file mode 100755 index 00000000..bd833a95 --- /dev/null +++ b/R2R/r2r/parsers/__init__.py @@ -0,0 +1,27 @@ +from .media.audio_parser import AudioParser +from .media.docx_parser import DOCXParser +from .media.img_parser import ImageParser +from .media.movie_parser import MovieParser +from .media.pdf_parser import PDFParser +from .media.ppt_parser import PPTParser +from .structured.csv_parser import CSVParser +from .structured.json_parser import JSONParser +from .structured.xlsx_parser import XLSXParser +from .text.html_parser import HTMLParser +from .text.md_parser import MDParser +from .text.text_parser import TextParser + +__all__ = [ + "AudioParser", + "DOCXParser", + "ImageParser", + "MovieParser", + "PDFParser", + "PPTParser", + "MDParser", + "HTMLParser", + "TextParser", + "CSVParser", + "JSONParser", + "XLSXParser", +] diff --git a/R2R/r2r/parsers/media/__init__.py b/R2R/r2r/parsers/media/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/parsers/media/__init__.py diff --git a/R2R/r2r/parsers/media/audio_parser.py 
b/R2R/r2r/parsers/media/audio_parser.py new file mode 100755 index 00000000..8a7735e4 --- /dev/null +++ b/R2R/r2r/parsers/media/audio_parser.py @@ -0,0 +1,32 @@ +import os +from typing import AsyncGenerator + +from r2r.base.parsers.base_parser import AsyncParser +from r2r.parsers.media.openai_helpers import process_audio_with_openai + + +class AudioParser(AsyncParser[bytes]): + """A parser for audio data.""" + + def __init__( + self, api_base: str = "https://api.openai.com/v1/audio/transcriptions" + ): + self.api_base = api_base + self.openai_api_key = os.environ.get("OPENAI_API_KEY") + if not self.openai_api_key: + raise ValueError( + "Error, environment variable `OPENAI_API_KEY` is required to run `AudioParser`." + ) + + async def ingest(self, data: bytes) -> AsyncGenerator[str, None]: + """Ingest audio data and yield a transcription.""" + temp_audio_path = "temp_audio.wav" + with open(temp_audio_path, "wb") as f: + f.write(data) + try: + transcription_text = process_audio_with_openai( + open(temp_audio_path, "rb"), self.openai_api_key + ) + yield transcription_text + finally: + os.remove(temp_audio_path) diff --git a/R2R/r2r/parsers/media/docx_parser.py b/R2R/r2r/parsers/media/docx_parser.py new file mode 100755 index 00000000..9edced81 --- /dev/null +++ b/R2R/r2r/parsers/media/docx_parser.py @@ -0,0 +1,28 @@ +from io import BytesIO +from typing import AsyncGenerator + +from r2r.base.abstractions.document import DataType +from r2r.base.parsers.base_parser import AsyncParser + + +class DOCXParser(AsyncParser[DataType]): + """A parser for DOCX data.""" + + def __init__(self): + try: + from docx import Document + + self.Document = Document + except ImportError: + raise ValueError( + "Error, `python-docx` is required to run `DOCXParser`. Please install it using `pip install python-docx`." + ) + + async def ingest(self, data: DataType) -> AsyncGenerator[str, None]: + """Ingest DOCX data and yield text from each paragraph.""" + if isinstance(data, str): + raise ValueError("DOCX data must be in bytes format.") + + doc = self.Document(BytesIO(data)) + for paragraph in doc.paragraphs: + yield paragraph.text diff --git a/R2R/r2r/parsers/media/img_parser.py b/R2R/r2r/parsers/media/img_parser.py new file mode 100755 index 00000000..7c40656a --- /dev/null +++ b/R2R/r2r/parsers/media/img_parser.py @@ -0,0 +1,40 @@ +import os +from typing import AsyncGenerator + +from r2r.base.abstractions.document import DataType +from r2r.base.parsers.base_parser import AsyncParser +from r2r.parsers.media.openai_helpers import process_frame_with_openai + + +class ImageParser(AsyncParser[DataType]): + """A parser for image data.""" + + def __init__( + self, + model: str = "gpt-4o", + max_tokens: int = 2_048, + api_base: str = "https://api.openai.com/v1/chat/completions", + ): + self.model = model + self.max_tokens = max_tokens + self.openai_api_key = os.environ.get("OPENAI_API_KEY") + if not self.openai_api_key: + raise ValueError( + "Error, environment variable `OPENAI_API_KEY` is required to run `ImageParser`." 
+ ) + self.api_base = api_base + + async def ingest(self, data: DataType) -> AsyncGenerator[str, None]: + """Ingest image data and yield a description.""" + if isinstance(data, bytes): + import base64 + + data = base64.b64encode(data).decode("utf-8") + + yield process_frame_with_openai( + data, + self.openai_api_key, + self.model, + self.max_tokens, + self.api_base, + ) diff --git a/R2R/r2r/parsers/media/movie_parser.py b/R2R/r2r/parsers/media/movie_parser.py new file mode 100755 index 00000000..c00b80d9 --- /dev/null +++ b/R2R/r2r/parsers/media/movie_parser.py @@ -0,0 +1,108 @@ +import base64 +import os +from typing import AsyncGenerator + +from r2r.base.parsers.base_parser import AsyncParser +from r2r.parsers.media.openai_helpers import ( + process_audio_with_openai, + process_frame_with_openai, +) + + +class MovieParser(AsyncParser): + """A parser for movie data.""" + + def __init__( + self, + model: str = "gpt-4o", + max_tokens: int = 2048, + seconds_per_frame: int = 2, + max_frames: int = 10, + ): + try: + import cv2 + + self.cv2 = cv2 + except ImportError: + raise ValueError( + "Error, `opencv-python` is required to run `MovieParser`. Please install it using `pip install opencv-python`." + ) + try: + import moviepy.editor as mp + + self.mp = mp + except ImportError: + raise ValueError( + "Error, `moviepy` is required to run `MovieParser`. Please install it using `pip install moviepy`." + ) + + self.model = model + self.max_tokens = max_tokens + self.seconds_per_frame = seconds_per_frame + self.max_frames = max_frames + self.openai_api_key = os.environ.get("OPENAI_API_KEY") + if not self.openai_api_key: + raise ValueError( + "Error, environment variable `OPENAI_API_KEY` is required to run `MovieParser`." + ) + + async def ingest(self, data: bytes) -> AsyncGenerator[str, None]: + """Ingest movie data and yield a description.""" + temp_video_path = "temp_movie.mp4" + with open(temp_video_path, "wb") as f: + f.write(data) + try: + raw_frames, audio_file = self.process_video(temp_video_path) + for frame in raw_frames: + frame_text = process_frame_with_openai( + frame, self.openai_api_key + ) + yield frame_text + + if audio_file: + transcription_text = process_audio_with_openai( + audio_file, self.openai_api_key + ) + yield transcription_text + finally: + os.remove(temp_video_path) + + def process_video(self, video_path): + base64Frames = [] + base_video_path, _ = os.path.splitext(video_path) + + video = self.cv2.VideoCapture(video_path) + total_frames = int(video.get(self.cv2.CAP_PROP_FRAME_COUNT)) + fps = video.get(self.cv2.CAP_PROP_FPS) + frames_to_skip = int(fps * self.seconds_per_frame) + curr_frame = 0 + + # Calculate frames to skip based on max_frames if it is set + if self.max_frames and self.max_frames < total_frames / frames_to_skip: + frames_to_skip = max(total_frames // self.max_frames, 1) + + frame_count = 0 + while curr_frame < total_frames - 1 and ( + not self.max_frames or frame_count < self.max_frames + ): + video.set(self.cv2.CAP_PROP_POS_FRAMES, curr_frame) + success, frame = video.read() + if not success: + break + _, buffer = self.cv2.imencode(".jpg", frame) + base64Frames.append(base64.b64encode(buffer).decode("utf-8")) + curr_frame += frames_to_skip + frame_count += 1 + video.release() + + audio_path = f"{base_video_path}.wav" + audio_file = None + with self.mp.VideoFileClip(video_path) as clip: + if clip.audio is not None: + clip.audio.write_audiofile( + audio_path, codec="pcm_s16le", fps=16000 + ) + audio_file = open(audio_path, "rb") + os.remove(audio_path) + + 
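# The temporary .wav is removed right after opening it; the already-open file
# handle keeps the audio readable for the transcription step (this relies on
# POSIX unlink semantics and may not hold on Windows).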
return base64Frames, audio_file diff --git a/R2R/r2r/parsers/media/openai_helpers.py b/R2R/r2r/parsers/media/openai_helpers.py new file mode 100755 index 00000000..707dadda --- /dev/null +++ b/R2R/r2r/parsers/media/openai_helpers.py @@ -0,0 +1,58 @@ +"""Implementations of parsers for different data types.""" + +import requests + + +def process_frame_with_openai( + data: bytes, + api_key: str, + model: str = "gpt-4o", + max_tokens: int = 2_048, + api_base: str = "https://api.openai.com/v1/chat/completions", +) -> str: + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}", + } + + payload = { + "model": model, + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "First, provide a title for the image, then explain everything that you see. Be very thorough in your analysis as a user will need to understand the image without seeing it. If it is possible to transcribe the image to text directly, then do so. The more detail you provide, the better the user will understand the image.", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{data}"}, + }, + ], + } + ], + "max_tokens": max_tokens, + } + + response = requests.post(api_base, headers=headers, json=payload) + response_json = response.json() + return response_json["choices"][0]["message"]["content"] + + +def process_audio_with_openai( + audio_file, + api_key: str, + audio_api_base: str = "https://api.openai.com/v1/audio/transcriptions", +) -> str: + headers = {"Authorization": f"Bearer {api_key}"} + + transcription_response = requests.post( + audio_api_base, + headers=headers, + files={"file": audio_file}, + data={"model": "whisper-1"}, + ) + transcription = transcription_response.json() + + return transcription["text"] diff --git a/R2R/r2r/parsers/media/pdf_parser.py b/R2R/r2r/parsers/media/pdf_parser.py new file mode 100755 index 00000000..b60a9b33 --- /dev/null +++ b/R2R/r2r/parsers/media/pdf_parser.py @@ -0,0 +1,34 @@ +import string +from io import BytesIO +from typing import AsyncGenerator + +from r2r.base.abstractions.document import DataType +from r2r.base.parsers.base_parser import AsyncParser + + +class PDFParser(AsyncParser[DataType]): + """A parser for PDF data.""" + + def __init__(self): + try: + from pypdf import PdfReader + + self.PdfReader = PdfReader + except ImportError: + raise ValueError( + "Error, `pypdf` is required to run `PyPDFParser`. Please install it using `pip install pypdf`." 
+ ) + + async def ingest(self, data: DataType) -> AsyncGenerator[str, None]: + """Ingest PDF data and yield text from each page.""" + if isinstance(data, str): + raise ValueError("PDF data must be in bytes format.") + + pdf = self.PdfReader(BytesIO(data)) + for page in pdf.pages: + page_text = page.extract_text() + if page_text is not None: + page_text = "".join( + filter(lambda x: x in string.printable, page_text) + ) + yield page_text diff --git a/R2R/r2r/parsers/media/ppt_parser.py b/R2R/r2r/parsers/media/ppt_parser.py new file mode 100755 index 00000000..8f192840 --- /dev/null +++ b/R2R/r2r/parsers/media/ppt_parser.py @@ -0,0 +1,30 @@ +from io import BytesIO +from typing import AsyncGenerator + +from r2r.base.abstractions.document import DataType +from r2r.base.parsers.base_parser import AsyncParser + + +class PPTParser(AsyncParser[DataType]): + """A parser for PPT data.""" + + def __init__(self): + try: + from pptx import Presentation + + self.Presentation = Presentation + except ImportError: + raise ValueError( + "Error, `python-pptx` is required to run `PPTParser`. Please install it using `pip install python-pptx`." + ) + + async def ingest(self, data: DataType) -> AsyncGenerator[str, None]: + """Ingest PPT data and yield text from each slide.""" + if isinstance(data, str): + raise ValueError("PPT data must be in bytes format.") + + prs = self.Presentation(BytesIO(data)) + for slide in prs.slides: + for shape in slide.shapes: + if hasattr(shape, "text"): + yield shape.text diff --git a/R2R/r2r/parsers/structured/__init__.py b/R2R/r2r/parsers/structured/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/parsers/structured/__init__.py diff --git a/R2R/r2r/parsers/structured/csv_parser.py b/R2R/r2r/parsers/structured/csv_parser.py new file mode 100755 index 00000000..343d9fbf --- /dev/null +++ b/R2R/r2r/parsers/structured/csv_parser.py @@ -0,0 +1,25 @@ +from typing import AsyncGenerator, Union + +from r2r.base.abstractions.document import DataType +from r2r.base.parsers.base_parser import AsyncParser + + +class CSVParser(AsyncParser[DataType]): + """A parser for CSV data.""" + + def __init__(self): + import csv + from io import StringIO + + self.csv = csv + self.StringIO = StringIO + + async def ingest( + self, data: Union[str, bytes] + ) -> AsyncGenerator[str, None]: + """Ingest CSV data and yield text from each row.""" + if isinstance(data, bytes): + data = data.decode("utf-8") + csv_reader = self.csv.reader(self.StringIO(data)) + for row in csv_reader: + yield ", ".join(row) diff --git a/R2R/r2r/parsers/structured/json_parser.py b/R2R/r2r/parsers/structured/json_parser.py new file mode 100755 index 00000000..23d63065 --- /dev/null +++ b/R2R/r2r/parsers/structured/json_parser.py @@ -0,0 +1,49 @@ +import json +from typing import AsyncGenerator + +from r2r.base.abstractions.document import DataType +from r2r.base.parsers.base_parser import AsyncParser + + +class JSONParser(AsyncParser[DataType]): + """A parser for JSON data.""" + + async def ingest(self, data: DataType) -> AsyncGenerator[str, None]: + """Ingest JSON data and yield a formatted text representation.""" + if isinstance(data, bytes): + data = data.decode("utf-8") + yield self._parse_json(json.loads(data)) + + def _parse_json(self, data: dict) -> str: + def remove_objects_with_null(obj): + if not isinstance(obj, dict): + return obj + result = obj.copy() + for key, value in obj.items(): + if isinstance(value, dict): + result[key] = remove_objects_with_null(value) + elif value is None: + del 
result[key] + return result + + def format_json_as_text(obj, indent=0): + lines = [] + indent_str = " " * indent + + if isinstance(obj, dict): + for key, value in obj.items(): + if isinstance(value, (dict, list)): + nested = format_json_as_text(value, indent + 2) + lines.append(f"{indent_str}{key}:\n{nested}") + else: + lines.append(f"{indent_str}{key}: {value}") + elif isinstance(obj, list): + for item in obj: + nested = format_json_as_text(item, indent + 2) + lines.append(f"{nested}") + else: + return f"{indent_str}{obj}" + + return "\n".join(lines) + + return format_json_as_text(remove_objects_with_null(data)) diff --git a/R2R/r2r/parsers/structured/xlsx_parser.py b/R2R/r2r/parsers/structured/xlsx_parser.py new file mode 100755 index 00000000..68a3bdc6 --- /dev/null +++ b/R2R/r2r/parsers/structured/xlsx_parser.py @@ -0,0 +1,29 @@ +from io import BytesIO +from typing import AsyncGenerator + +from r2r.base.abstractions.document import DataType +from r2r.base.parsers.base_parser import AsyncParser + + +class XLSXParser(AsyncParser[DataType]): + """A parser for XLSX data.""" + + def __init__(self): + try: + from openpyxl import load_workbook + + self.load_workbook = load_workbook + except ImportError: + raise ValueError( + "Error, `openpyxl` is required to run `XLSXParser`. Please install it using `pip install openpyxl`." + ) + + async def ingest(self, data: bytes) -> AsyncGenerator[str, None]: + """Ingest XLSX data and yield text from each row.""" + if isinstance(data, str): + raise ValueError("XLSX data must be in bytes format.") + + wb = self.load_workbook(filename=BytesIO(data)) + for sheet in wb.worksheets: + for row in sheet.iter_rows(values_only=True): + yield ", ".join(map(str, row)) diff --git a/R2R/r2r/parsers/text/__init__.py b/R2R/r2r/parsers/text/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/parsers/text/__init__.py diff --git a/R2R/r2r/parsers/text/html_parser.py b/R2R/r2r/parsers/text/html_parser.py new file mode 100755 index 00000000..9c663fbe --- /dev/null +++ b/R2R/r2r/parsers/text/html_parser.py @@ -0,0 +1,15 @@ +from typing import AsyncGenerator + +from bs4 import BeautifulSoup + +from r2r.base.abstractions.document import DataType +from r2r.base.parsers.base_parser import AsyncParser + + +class HTMLParser(AsyncParser[DataType]): + """A parser for HTML data.""" + + async def ingest(self, data: DataType) -> AsyncGenerator[str, None]: + """Ingest HTML data and yield text.""" + soup = BeautifulSoup(data, "html.parser") + yield soup.get_text() diff --git a/R2R/r2r/parsers/text/md_parser.py b/R2R/r2r/parsers/text/md_parser.py new file mode 100755 index 00000000..ada9ae57 --- /dev/null +++ b/R2R/r2r/parsers/text/md_parser.py @@ -0,0 +1,23 @@ +from typing import AsyncGenerator + +from bs4 import BeautifulSoup + +from r2r.base.abstractions.document import DataType +from r2r.base.parsers.base_parser import AsyncParser + + +class MDParser(AsyncParser[DataType]): + """A parser for Markdown data.""" + + def __init__(self): + import markdown + + self.markdown = markdown + + async def ingest(self, data: DataType) -> AsyncGenerator[str, None]: + """Ingest Markdown data and yield text.""" + if isinstance(data, bytes): + data = data.decode("utf-8") + html = self.markdown.markdown(data) + soup = BeautifulSoup(html, "html.parser") + yield soup.get_text() diff --git a/R2R/r2r/parsers/text/text_parser.py b/R2R/r2r/parsers/text/text_parser.py new file mode 100755 index 00000000..0c8ab7ca --- /dev/null +++ b/R2R/r2r/parsers/text/text_parser.py @@ -0,0 
+1,13 @@ +from typing import AsyncGenerator + +from r2r.base.abstractions.document import DataType +from r2r.base.parsers.base_parser import AsyncParser + + +class TextParser(AsyncParser[DataType]): + """A parser for raw text data.""" + + async def ingest(self, data: DataType) -> AsyncGenerator[DataType, None]: + if isinstance(data, bytes): + data = data.decode("utf-8") + yield data diff --git a/R2R/r2r/pipelines/__init__.py b/R2R/r2r/pipelines/__init__.py new file mode 100755 index 00000000..ebe3a0c3 --- /dev/null +++ b/R2R/r2r/pipelines/__init__.py @@ -0,0 +1,11 @@ +from .eval_pipeline import EvalPipeline +from .ingestion_pipeline import IngestionPipeline +from .rag_pipeline import RAGPipeline +from .search_pipeline import SearchPipeline + +__all__ = [ + "IngestionPipeline", + "SearchPipeline", + "RAGPipeline", + "EvalPipeline", +] diff --git a/R2R/r2r/pipelines/eval_pipeline.py b/R2R/r2r/pipelines/eval_pipeline.py new file mode 100755 index 00000000..60aa50d4 --- /dev/null +++ b/R2R/r2r/pipelines/eval_pipeline.py @@ -0,0 +1,37 @@ +import logging +from typing import Any, Optional + +from r2r.base.logging.run_manager import RunManager +from r2r.base.pipeline.base_pipeline import AsyncPipeline +from r2r.base.pipes.base_pipe import AsyncPipe, AsyncState + +logger = logging.getLogger(__name__) + + +class EvalPipeline(AsyncPipeline): + """A pipeline for evaluation.""" + + pipeline_type: str = "eval" + + async def run( + self, + input: Any, + state: Optional[AsyncState] = None, + stream: bool = False, + run_manager: Optional[RunManager] = None, + *args: Any, + **kwargs: Any, + ): + return await super().run( + input, state, stream, run_manager, *args, **kwargs + ) + + def add_pipe( + self, + pipe: AsyncPipe, + add_upstream_outputs: Optional[list[dict[str, str]]] = None, + *args, + **kwargs, + ) -> None: + logger.debug(f"Adding pipe {pipe.config.name} to the EvalPipeline") + return super().add_pipe(pipe, add_upstream_outputs, *args, **kwargs) diff --git a/R2R/r2r/pipelines/ingestion_pipeline.py b/R2R/r2r/pipelines/ingestion_pipeline.py new file mode 100755 index 00000000..df1263f9 --- /dev/null +++ b/R2R/r2r/pipelines/ingestion_pipeline.py @@ -0,0 +1,144 @@ +import asyncio +import logging +from asyncio import Queue +from typing import Any, Optional + +from r2r.base.logging.kv_logger import KVLoggingSingleton +from r2r.base.logging.run_manager import RunManager, manage_run +from r2r.base.pipeline.base_pipeline import AsyncPipeline, dequeue_requests +from r2r.base.pipes.base_pipe import AsyncPipe, AsyncState + +logger = logging.getLogger(__name__) + + +class IngestionPipeline(AsyncPipeline): + """A pipeline for ingestion.""" + + pipeline_type: str = "ingestion" + + def __init__( + self, + pipe_logger: Optional[KVLoggingSingleton] = None, + run_manager: Optional[RunManager] = None, + ): + super().__init__(pipe_logger, run_manager) + self.parsing_pipe = None + self.embedding_pipeline = None + self.kg_pipeline = None + + async def run( + self, + input: Any, + state: Optional[AsyncState] = None, + stream: bool = False, + run_manager: Optional[RunManager] = None, + log_run_info: bool = True, + *args: Any, + **kwargs: Any, + ) -> None: + self.state = state or AsyncState() + async with manage_run(run_manager, self.pipeline_type): + if log_run_info: + await run_manager.log_run_info( + key="pipeline_type", + value=self.pipeline_type, + is_info_log=True, + ) + if self.parsing_pipe is None: + raise ValueError( + "parsing_pipeline must be set before running the ingestion pipeline" + ) + if 
self.embedding_pipeline is None and self.kg_pipeline is None: + raise ValueError( + "At least one of embedding_pipeline or kg_pipeline must be set before running the ingestion pipeline" + ) + # Use queues to duplicate the documents for each pipeline + embedding_queue = Queue() + kg_queue = Queue() + + async def enqueue_documents(): + async for document in await self.parsing_pipe.run( + self.parsing_pipe.Input(message=input), + state, + run_manager, + *args, + **kwargs, + ): + if self.embedding_pipeline: + await embedding_queue.put(document) + if self.kg_pipeline: + await kg_queue.put(document) + await embedding_queue.put(None) + await kg_queue.put(None) + + # Start the document enqueuing process + enqueue_task = asyncio.create_task(enqueue_documents()) + + # Start the embedding and KG pipelines in parallel + if self.embedding_pipeline: + embedding_task = asyncio.create_task( + self.embedding_pipeline.run( + dequeue_requests(embedding_queue), + state, + stream, + run_manager, + log_run_info=False, # Do not log run info since we have already done so + *args, + **kwargs, + ) + ) + + if self.kg_pipeline: + kg_task = asyncio.create_task( + self.kg_pipeline.run( + dequeue_requests(kg_queue), + state, + stream, + run_manager, + log_run_info=False, # Do not log run info since we have already done so + *args, + **kwargs, + ) + ) + + # Wait for the enqueueing task to complete + await enqueue_task + + results = {} + # Wait for the embedding and KG tasks to complete + if self.embedding_pipeline: + results["embedding_pipeline_output"] = await embedding_task + if self.kg_pipeline: + results["kg_pipeline_output"] = await kg_task + return results + + def add_pipe( + self, + pipe: AsyncPipe, + add_upstream_outputs: Optional[list[dict[str, str]]] = None, + parsing_pipe: bool = False, + kg_pipe: bool = False, + embedding_pipe: bool = False, + *args, + **kwargs, + ) -> None: + logger.debug( + f"Adding pipe {pipe.config.name} to the IngestionPipeline" + ) + + if parsing_pipe: + self.parsing_pipe = pipe + elif kg_pipe: + if not self.kg_pipeline: + self.kg_pipeline = AsyncPipeline() + self.kg_pipeline.add_pipe( + pipe, add_upstream_outputs, *args, **kwargs + ) + elif embedding_pipe: + if not self.embedding_pipeline: + self.embedding_pipeline = AsyncPipeline() + self.embedding_pipeline.add_pipe( + pipe, add_upstream_outputs, *args, **kwargs + ) + else: + raise ValueError("Pipe must be a parsing, embedding, or KG pipe") diff --git a/R2R/r2r/pipelines/rag_pipeline.py b/R2R/r2r/pipelines/rag_pipeline.py new file mode 100755 index 00000000..b257ccaa --- /dev/null +++ b/R2R/r2r/pipelines/rag_pipeline.py @@ -0,0 +1,115 @@ +import asyncio +import logging +from typing import Any, Optional + +from ..base.abstractions.llm import GenerationConfig +from ..base.abstractions.search import KGSearchSettings, VectorSearchSettings +from ..base.logging.kv_logger import KVLoggingSingleton +from ..base.logging.run_manager import RunManager, manage_run +from ..base.pipeline.base_pipeline import AsyncPipeline +from ..base.pipes.base_pipe import AsyncPipe, AsyncState +from ..base.utils import to_async_generator + +logger = logging.getLogger(__name__) + + +class RAGPipeline(AsyncPipeline): + """A pipeline for RAG.""" + + pipeline_type: str = "rag" + + def __init__( + self, + pipe_logger: Optional[KVLoggingSingleton] = None, + run_manager: Optional[RunManager] = None, + ): + super().__init__(pipe_logger, run_manager) + self._search_pipeline = None + self._rag_pipeline = None + + async def run( + self, + input: Any, + state: 
Optional[AsyncState] = None, + run_manager: Optional[RunManager] = None, + log_run_info=True, + vector_search_settings: VectorSearchSettings = VectorSearchSettings(), + kg_search_settings: KGSearchSettings = KGSearchSettings(), + rag_generation_config: GenerationConfig = GenerationConfig(), + *args: Any, + **kwargs: Any, + ): + self.state = state or AsyncState() + async with manage_run(run_manager, self.pipeline_type): + if log_run_info: + await run_manager.log_run_info( + key="pipeline_type", + value=self.pipeline_type, + is_info_log=True, + ) + + if not self._search_pipeline: + raise ValueError( + "_search_pipeline must be set before running the RAG pipeline" + ) + + async def multi_query_generator(input): + tasks = [] + async for query in input: + task = asyncio.create_task( + self._search_pipeline.run( + to_async_generator([query]), + state=state, + stream=False, # do not stream the search results + run_manager=run_manager, + log_run_info=False, # do not log the run info as it is already logged above + vector_search_settings=vector_search_settings, + kg_search_settings=kg_search_settings, + *args, + **kwargs, + ) + ) + tasks.append((query, task)) + + for query, task in tasks: + yield (query, await task) + + rag_results = await self._rag_pipeline.run( + input=multi_query_generator(input), + state=state, + stream=rag_generation_config.stream, + run_manager=run_manager, + log_run_info=False, + rag_generation_config=rag_generation_config, + *args, + **kwargs, + ) + return rag_results + + def add_pipe( + self, + pipe: AsyncPipe, + add_upstream_outputs: Optional[list[dict[str, str]]] = None, + rag_pipe: bool = True, + *args, + **kwargs, + ) -> None: + logger.debug(f"Adding pipe {pipe.config.name} to the RAGPipeline") + if not rag_pipe: + raise ValueError( + "Only pipes that are part of the RAG pipeline can be added to the RAG pipeline" + ) + if not self._rag_pipeline: + self._rag_pipeline = AsyncPipeline() + self._rag_pipeline.add_pipe( + pipe, add_upstream_outputs, *args, **kwargs + ) + + def set_search_pipeline( + self, + _search_pipeline: AsyncPipeline, + *args, + **kwargs, + ) -> None: + logger.debug(f"Setting search pipeline for the RAGPipeline") + self._search_pipeline = _search_pipeline diff --git a/R2R/r2r/pipelines/search_pipeline.py b/R2R/r2r/pipelines/search_pipeline.py new file mode 100755 index 00000000..25e0c7bb --- /dev/null +++ b/R2R/r2r/pipelines/search_pipeline.py @@ -0,0 +1,140 @@ +import asyncio +import logging +from asyncio import Queue +from typing import Any, Optional + +from ..base.abstractions.search import ( + AggregateSearchResult, + KGSearchSettings, + VectorSearchSettings, +) +from ..base.logging.kv_logger import KVLoggingSingleton +from ..base.logging.run_manager import RunManager, manage_run +from ..base.pipeline.base_pipeline import AsyncPipeline, dequeue_requests +from ..base.pipes.base_pipe import AsyncPipe, AsyncState + +logger = logging.getLogger(__name__) + + +class SearchPipeline(AsyncPipeline): + """A pipeline for search.""" + + pipeline_type: str = "search" + + def __init__( + self, + pipe_logger: Optional[KVLoggingSingleton] = None, + run_manager: Optional[RunManager] = None, + ): + super().__init__(pipe_logger, run_manager) + self._parsing_pipe = None + self._vector_search_pipeline = None + self._kg_search_pipeline = None + + async def run( + self, + input: Any, + state: Optional[AsyncState] = None, + stream: bool = False, + run_manager: Optional[RunManager] = None, + log_run_info: bool = True, + vector_search_settings: VectorSearchSettings = 
VectorSearchSettings(), + kg_search_settings: KGSearchSettings = KGSearchSettings(), + *args: Any, + **kwargs: Any, + ): + self.state = state or AsyncState() + do_vector_search = ( + self._vector_search_pipeline is not None + and vector_search_settings.use_vector_search + ) + do_kg = ( + self._kg_search_pipeline is not None + and kg_search_settings.use_kg_search + ) + async with manage_run(run_manager, self.pipeline_type): + if log_run_info: + await run_manager.log_run_info( + key="pipeline_type", + value=self.pipeline_type, + is_info_log=True, + ) + + vector_search_queue = Queue() + kg_queue = Queue() + + async def enqueue_requests(): + async for message in input: + if do_vector_search: + await vector_search_queue.put(message) + if do_kg: + await kg_queue.put(message) + + await vector_search_queue.put(None) + await kg_queue.put(None) + + # Start the document enqueuing process + enqueue_task = asyncio.create_task(enqueue_requests()) + + # Start the embedding and KG pipelines in parallel + if do_vector_search: + vector_search_task = asyncio.create_task( + self._vector_search_pipeline.run( + dequeue_requests(vector_search_queue), + state, + stream, + run_manager, + log_run_info=False, + vector_search_settings=vector_search_settings, + ) + ) + + if do_kg: + kg_task = asyncio.create_task( + self._kg_search_pipeline.run( + dequeue_requests(kg_queue), + state, + stream, + run_manager, + log_run_info=False, + kg_search_settings=kg_search_settings, + ) + ) + + await enqueue_task + + vector_search_results = ( + await vector_search_task if do_vector_search else None + ) + kg_results = await kg_task if do_kg else None + + return AggregateSearchResult( + vector_search_results=vector_search_results, + kg_search_results=kg_results, + ) + + def add_pipe( + self, + pipe: AsyncPipe, + add_upstream_outputs: Optional[list[dict[str, str]]] = None, + kg_pipe: bool = False, + vector_search_pipe: bool = False, + *args, + **kwargs, + ) -> None: + logger.debug(f"Adding pipe {pipe.config.name} to the SearchPipeline") + + if kg_pipe: + if not self._kg_search_pipeline: + self._kg_search_pipeline = AsyncPipeline() + self._kg_search_pipeline.add_pipe( + pipe, add_upstream_outputs, *args, **kwargs + ) + elif vector_search_pipe: + if not self._vector_search_pipeline: + self._vector_search_pipeline = AsyncPipeline() + self._vector_search_pipeline.add_pipe( + pipe, add_upstream_outputs, *args, **kwargs + ) + else: + raise ValueError("Pipe must be a vector search or KG pipe") diff --git a/R2R/r2r/pipes/__init__.py b/R2R/r2r/pipes/__init__.py new file mode 100755 index 00000000..b86c31c0 --- /dev/null +++ b/R2R/r2r/pipes/__init__.py @@ -0,0 +1,31 @@ +from .abstractions.search_pipe import SearchPipe +from .ingestion.embedding_pipe import EmbeddingPipe +from .ingestion.kg_extraction_pipe import KGExtractionPipe +from .ingestion.kg_storage_pipe import KGStoragePipe +from .ingestion.parsing_pipe import ParsingPipe +from .ingestion.vector_storage_pipe import VectorStoragePipe +from .other.eval_pipe import EvalPipe +from .other.web_search_pipe import WebSearchPipe +from .retrieval.kg_agent_search_pipe import KGAgentSearchPipe +from .retrieval.multi_search import MultiSearchPipe +from .retrieval.query_transform_pipe import QueryTransformPipe +from .retrieval.search_rag_pipe import SearchRAGPipe +from .retrieval.streaming_rag_pipe import StreamingSearchRAGPipe +from .retrieval.vector_search_pipe import VectorSearchPipe + +__all__ = [ + "SearchPipe", + "EmbeddingPipe", + "EvalPipe", + "KGExtractionPipe", + "ParsingPipe", + 
"QueryTransformPipe", + "SearchRAGPipe", + "StreamingSearchRAGPipe", + "VectorSearchPipe", + "VectorStoragePipe", + "WebSearchPipe", + "KGAgentSearchPipe", + "KGStoragePipe", + "MultiSearchPipe", +] diff --git a/R2R/r2r/pipes/abstractions/__init__.py b/R2R/r2r/pipes/abstractions/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/pipes/abstractions/__init__.py diff --git a/R2R/r2r/pipes/abstractions/generator_pipe.py b/R2R/r2r/pipes/abstractions/generator_pipe.py new file mode 100755 index 00000000..002ebd23 --- /dev/null +++ b/R2R/r2r/pipes/abstractions/generator_pipe.py @@ -0,0 +1,58 @@ +import uuid +from abc import abstractmethod +from typing import Any, AsyncGenerator, Optional + +from r2r.base import ( + AsyncState, + KVLoggingSingleton, + LLMProvider, + PipeType, + PromptProvider, +) +from r2r.base.abstractions.llm import GenerationConfig +from r2r.base.pipes.base_pipe import AsyncPipe + + +class GeneratorPipe(AsyncPipe): + class Config(AsyncPipe.PipeConfig): + name: str + task_prompt: str + system_prompt: str = "default_system" + + def __init__( + self, + llm_provider: LLMProvider, + prompt_provider: PromptProvider, + type: PipeType = PipeType.GENERATOR, + config: Optional[Config] = None, + pipe_logger: Optional[KVLoggingSingleton] = None, + *args, + **kwargs, + ): + super().__init__( + type=type, + config=config or self.Config(), + pipe_logger=pipe_logger, + *args, + **kwargs, + ) + self.llm_provider = llm_provider + self.prompt_provider = prompt_provider + + @abstractmethod + async def _run_logic( + self, + input: AsyncPipe.Input, + state: AsyncState, + run_id: uuid.UUID, + rag_generation_config: GenerationConfig, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[Any, None]: + pass + + @abstractmethod + def _get_message_payload( + self, message: str, *args: Any, **kwargs: Any + ) -> list: + pass diff --git a/R2R/r2r/pipes/abstractions/search_pipe.py b/R2R/r2r/pipes/abstractions/search_pipe.py new file mode 100755 index 00000000..bb0303e0 --- /dev/null +++ b/R2R/r2r/pipes/abstractions/search_pipe.py @@ -0,0 +1,62 @@ +import logging +import uuid +from abc import abstractmethod +from typing import Any, AsyncGenerator, Optional, Union + +from r2r.base import ( + AsyncPipe, + AsyncState, + KVLoggingSingleton, + PipeType, + VectorSearchResult, +) + +logger = logging.getLogger(__name__) + + +class SearchPipe(AsyncPipe): + class SearchConfig(AsyncPipe.PipeConfig): + name: str = "default_vector_search" + search_filters: dict = {} + search_limit: int = 10 + + class Input(AsyncPipe.Input): + message: Union[AsyncGenerator[str, None], str] + + def __init__( + self, + pipe_logger: Optional[KVLoggingSingleton] = None, + type: PipeType = PipeType.SEARCH, + config: Optional[AsyncPipe.PipeConfig] = None, + *args, + **kwargs, + ): + super().__init__( + pipe_logger=pipe_logger, + type=type, + config=config, + *args, + **kwargs, + ) + + @abstractmethod + async def search( + self, + query: str, + filters: dict[str, Any] = {}, + limit: int = 10, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[VectorSearchResult, None]: + pass + + @abstractmethod + async def _run_logic( + self, + input: Input, + state: AsyncState, + run_id: uuid.UUID, + *args: Any, + **kwargs, + ) -> AsyncGenerator[VectorSearchResult, None]: + pass diff --git a/R2R/r2r/pipes/ingestion/__init__.py b/R2R/r2r/pipes/ingestion/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/pipes/ingestion/__init__.py diff --git a/R2R/r2r/pipes/ingestion/embedding_pipe.py 
b/R2R/r2r/pipes/ingestion/embedding_pipe.py new file mode 100755 index 00000000..971ccc9d --- /dev/null +++ b/R2R/r2r/pipes/ingestion/embedding_pipe.py @@ -0,0 +1,218 @@ +import asyncio +import copy +import logging +import uuid +from typing import Any, AsyncGenerator, Optional, Union + +from r2r.base import ( + AsyncState, + EmbeddingProvider, + Extraction, + Fragment, + FragmentType, + KVLoggingSingleton, + PipeType, + R2RDocumentProcessingError, + TextSplitter, + Vector, + VectorEntry, + generate_id_from_label, +) +from r2r.base.pipes.base_pipe import AsyncPipe + +logger = logging.getLogger(__name__) + + +class EmbeddingPipe(AsyncPipe): + """ + Embeds and stores documents using a specified embedding model and database. + """ + + class Input(AsyncPipe.Input): + message: AsyncGenerator[ + Union[Extraction, R2RDocumentProcessingError], None + ] + + def __init__( + self, + embedding_provider: EmbeddingProvider, + text_splitter: TextSplitter, + embedding_batch_size: int = 1, + id_prefix: str = "demo", + pipe_logger: Optional[KVLoggingSingleton] = None, + type: PipeType = PipeType.INGESTOR, + config: Optional[AsyncPipe.PipeConfig] = None, + *args, + **kwargs, + ): + """ + Initializes the embedding pipe with necessary components and configurations. + """ + super().__init__( + pipe_logger=pipe_logger, + type=type, + config=config + or AsyncPipe.PipeConfig(name="default_embedding_pipe"), + ) + self.embedding_provider = embedding_provider + self.text_splitter = text_splitter + self.embedding_batch_size = embedding_batch_size + self.id_prefix = id_prefix + self.pipe_run_info = None + + async def fragment( + self, extraction: Extraction, run_id: uuid.UUID + ) -> AsyncGenerator[Fragment, None]: + """ + Splits text into manageable chunks for embedding. + """ + if not isinstance(extraction, Extraction): + raise ValueError( + f"Expected an Extraction, but received {type(extraction)}." + ) + if not isinstance(extraction.data, str): + raise ValueError( + f"Expected a string, but received {type(extraction.data)}." + ) + text_chunks = [ + ele.page_content + for ele in self.text_splitter.create_documents([extraction.data]) + ] + for iteration, chunk in enumerate(text_chunks): + fragment = Fragment( + id=generate_id_from_label(f"{extraction.id}-{iteration}"), + type=FragmentType.TEXT, + data=chunk, + metadata=copy.deepcopy(extraction.metadata), + extraction_id=extraction.id, + document_id=extraction.document_id, + ) + yield fragment + iteration += 1 + + async def transform_fragments( + self, fragments: list[Fragment], metadatas: list[dict] + ) -> AsyncGenerator[Fragment, None]: + """ + Transforms text chunks based on their metadata, e.g., adding prefixes. + """ + async for fragment, metadata in zip(fragments, metadatas): + if "chunk_prefix" in metadata: + prefix = metadata.pop("chunk_prefix") + fragment.data = f"{prefix}\n{fragment.data}" + yield fragment + + async def embed(self, fragments: list[Fragment]) -> list[float]: + return await self.embedding_provider.async_get_embeddings( + [fragment.data for fragment in fragments], + EmbeddingProvider.PipeStage.BASE, + ) + + async def _process_batch( + self, fragment_batch: list[Fragment] + ) -> list[VectorEntry]: + """ + Embeds a batch of fragments and yields vector entries. 
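+        Each fragment in the batch is embedded through the configured
+        `EmbeddingProvider`, and every resulting vector is wrapped in a
+        `VectorEntry` carrying the fragment's document_id, extraction_id,
+        text, and metadata.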
+ """ + vectors = await self.embed(fragment_batch) + return [ + VectorEntry( + id=fragment.id, + vector=Vector(data=raw_vector), + metadata={ + "document_id": fragment.document_id, + "extraction_id": fragment.extraction_id, + "text": fragment.data, + **fragment.metadata, + }, + ) + for raw_vector, fragment in zip(vectors, fragment_batch) + ] + + async def _process_and_enqueue_batch( + self, fragment_batch: list[Fragment], vector_entry_queue: asyncio.Queue + ): + try: + batch_result = await self._process_batch(fragment_batch) + for vector_entry in batch_result: + await vector_entry_queue.put(vector_entry) + except Exception as e: + logger.error(f"Error processing batch: {e}") + await vector_entry_queue.put( + R2RDocumentProcessingError( + error_message=str(e), + document_id=fragment_batch[0].document_id, + ) + ) + finally: + await vector_entry_queue.put(None) # Signal completion + + async def _run_logic( + self, + input: Input, + state: AsyncState, + run_id: uuid.UUID, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[Union[R2RDocumentProcessingError, VectorEntry], None]: + """ + Executes the embedding pipe: chunking, transforming, embedding, and storing documents. + """ + vector_entry_queue = asyncio.Queue() + fragment_batch = [] + active_tasks = 0 + + fragment_info = {} + async for extraction in input.message: + if isinstance(extraction, R2RDocumentProcessingError): + yield extraction + continue + + async for fragment in self.fragment(extraction, run_id): + if extraction.document_id in fragment_info: + fragment_info[extraction.document_id] += 1 + else: + fragment_info[extraction.document_id] = 0 # Start with 0 + fragment.metadata["chunk_order"] = fragment_info[ + extraction.document_id + ] + + version = fragment.metadata.get("version", "v0") + + # Ensure fragment ID is set correctly + if not fragment.id: + fragment.id = generate_id_from_label( + f"{extraction.id}-{fragment_info[extraction.document_id]}-{version}" + ) + + fragment_batch.append(fragment) + if len(fragment_batch) >= self.embedding_batch_size: + asyncio.create_task( + self._process_and_enqueue_batch( + fragment_batch.copy(), vector_entry_queue + ) + ) + active_tasks += 1 + fragment_batch.clear() + + logger.debug( + f"Fragmented the input document ids into counts as shown: {fragment_info}" + ) + + if fragment_batch: + asyncio.create_task( + self._process_and_enqueue_batch( + fragment_batch.copy(), vector_entry_queue + ) + ) + active_tasks += 1 + + while active_tasks > 0: + vector_entry = await vector_entry_queue.get() + if vector_entry is None: # Check for termination signal + active_tasks -= 1 + elif isinstance(vector_entry, Exception): + yield vector_entry # Propagate the exception + active_tasks -= 1 + else: + yield vector_entry diff --git a/R2R/r2r/pipes/ingestion/kg_extraction_pipe.py b/R2R/r2r/pipes/ingestion/kg_extraction_pipe.py new file mode 100755 index 00000000..13025e39 --- /dev/null +++ b/R2R/r2r/pipes/ingestion/kg_extraction_pipe.py @@ -0,0 +1,226 @@ +import asyncio +import copy +import json +import logging +import uuid +from typing import Any, AsyncGenerator, Optional + +from r2r.base import ( + AsyncState, + Extraction, + Fragment, + FragmentType, + KGExtraction, + KGProvider, + KVLoggingSingleton, + LLMProvider, + PipeType, + PromptProvider, + TextSplitter, + extract_entities, + extract_triples, + generate_id_from_label, +) +from r2r.base.pipes.base_pipe import AsyncPipe + +logger = logging.getLogger(__name__) + + +class ClientError(Exception): + """Base class for client connection errors.""" + + pass + + 
+class KGExtractionPipe(AsyncPipe): + """ + Embeds and stores documents using a specified embedding model and database. + """ + + def __init__( + self, + kg_provider: KGProvider, + llm_provider: LLMProvider, + prompt_provider: PromptProvider, + text_splitter: TextSplitter, + kg_batch_size: int = 1, + id_prefix: str = "demo", + pipe_logger: Optional[KVLoggingSingleton] = None, + type: PipeType = PipeType.INGESTOR, + config: Optional[AsyncPipe.PipeConfig] = None, + *args, + **kwargs, + ): + """ + Initializes the embedding pipe with necessary components and configurations. + """ + super().__init__( + pipe_logger=pipe_logger, + type=type, + config=config + or AsyncPipe.PipeConfig(name="default_embedding_pipe"), + ) + + self.kg_provider = kg_provider + self.prompt_provider = prompt_provider + self.llm_provider = llm_provider + self.text_splitter = text_splitter + self.kg_batch_size = kg_batch_size + self.id_prefix = id_prefix + self.pipe_run_info = None + + async def fragment( + self, extraction: Extraction, run_id: uuid.UUID + ) -> AsyncGenerator[Fragment, None]: + """ + Splits text into manageable chunks for embedding. + """ + if not isinstance(extraction, Extraction): + raise ValueError( + f"Expected an Extraction, but received {type(extraction)}." + ) + if not isinstance(extraction.data, str): + raise ValueError( + f"Expected a string, but received {type(extraction.data)}." + ) + text_chunks = [ + ele.page_content + for ele in self.text_splitter.create_documents([extraction.data]) + ] + for iteration, chunk in enumerate(text_chunks): + fragment = Fragment( + id=generate_id_from_label(f"{extraction.id}-{iteration}"), + type=FragmentType.TEXT, + data=chunk, + metadata=copy.deepcopy(extraction.metadata), + extraction_id=extraction.id, + document_id=extraction.document_id, + ) + yield fragment + + async def transform_fragments( + self, fragments: list[Fragment] + ) -> AsyncGenerator[Fragment, None]: + """ + Transforms text chunks based on their metadata, e.g., adding prefixes. + """ + async for fragment in fragments: + if "chunk_prefix" in fragment.metadata: + prefix = fragment.metadata.pop("chunk_prefix") + fragment.data = f"{prefix}\n{fragment.data}" + yield fragment + + async def extract_kg( + self, + fragment: Fragment, + retries: int = 3, + delay: int = 2, + ) -> KGExtraction: + """ + Extracts NER triples from a list of fragments with retries. 
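+        The LLM response is expected to contain a JSON payload (optionally
+        wrapped in a ```json fence); JSON decoding errors, missing keys, and
+        client errors are retried up to `retries` times with `delay` seconds
+        between attempts, and an empty KGExtraction is returned if every
+        attempt fails.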
+ """ + task_prompt = self.prompt_provider.get_prompt( + self.kg_provider.config.kg_extraction_prompt, + inputs={"input": fragment.data}, + ) + messages = self.prompt_provider._get_message_payload( + self.prompt_provider.get_prompt("default_system"), task_prompt + ) + for attempt in range(retries): + try: + response = await self.llm_provider.aget_completion( + messages, self.kg_provider.config.kg_extraction_config + ) + + kg_extraction = response.choices[0].message.content + + # Parsing JSON from the response + kg_json = ( + json.loads( + kg_extraction.split("```json")[1].split("```")[0] + ) + if """```json""" in kg_extraction + else json.loads(kg_extraction) + ) + llm_payload = kg_json.get("entities_and_triples", {}) + + # Extract triples with detailed logging + entities = extract_entities(llm_payload) + triples = extract_triples(llm_payload, entities) + + # Create KG extraction object + return KGExtraction(entities=entities, triples=triples) + except ( + ClientError, + json.JSONDecodeError, + KeyError, + IndexError, + ) as e: + logger.error(f"Error in extract_kg: {e}") + if attempt < retries - 1: + await asyncio.sleep(delay) + else: + logger.error(f"Failed after retries with {e}") + # raise e # Ensure the exception is raised after the final attempt + + return KGExtraction(entities={}, triples=[]) + + async def _process_batch( + self, + fragment_batch: list[Fragment], + ) -> list[KGExtraction]: + """ + Embeds a batch of fragments and yields vector entries. + """ + tasks = [ + asyncio.create_task(self.extract_kg(fragment)) + for fragment in fragment_batch + ] + return await asyncio.gather(*tasks) + + async def _run_logic( + self, + input: AsyncPipe.Input, + state: AsyncState, + run_id: uuid.UUID, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[KGExtraction, None]: + """ + Executes the embedding pipe: chunking, transforming, embedding, and storing documents. 
+ """ + batch_tasks = [] + fragment_batch = [] + + fragment_info = {} + async for extraction in input.message: + async for fragment in self.transform_fragments( + self.fragment(extraction, run_id) + ): + if extraction.document_id in fragment_info: + fragment_info[extraction.document_id] += 1 + else: + fragment_info[extraction.document_id] = 1 + extraction.metadata["chunk_order"] = fragment_info[ + extraction.document_id + ] + fragment_batch.append(fragment) + if len(fragment_batch) >= self.kg_batch_size: + # Here, ensure `_process_batch` is scheduled as a coroutine, not called directly + batch_tasks.append( + self._process_batch(fragment_batch.copy()) + ) # pass a copy if necessary + fragment_batch.clear() # Clear the batch for new fragments + + logger.debug( + f"Fragmented the input document ids into counts as shown: {fragment_info}" + ) + + if fragment_batch: # Process any remaining fragments + batch_tasks.append(self._process_batch(fragment_batch.copy())) + + # Process tasks as they complete + for task in asyncio.as_completed(batch_tasks): + batch_result = await task # Wait for the next task to complete + for kg_extraction in batch_result: + yield kg_extraction diff --git a/R2R/r2r/pipes/ingestion/kg_storage_pipe.py b/R2R/r2r/pipes/ingestion/kg_storage_pipe.py new file mode 100755 index 00000000..9ac63479 --- /dev/null +++ b/R2R/r2r/pipes/ingestion/kg_storage_pipe.py @@ -0,0 +1,133 @@ +import asyncio +import logging +import uuid +from typing import Any, AsyncGenerator, Optional + +from r2r.base import ( + AsyncState, + EmbeddingProvider, + KGExtraction, + KGProvider, + KVLoggingSingleton, + PipeType, +) +from r2r.base.abstractions.llama_abstractions import EntityNode, Relation +from r2r.base.pipes.base_pipe import AsyncPipe + +logger = logging.getLogger(__name__) + + +class KGStoragePipe(AsyncPipe): + class Input(AsyncPipe.Input): + message: AsyncGenerator[KGExtraction, None] + + def __init__( + self, + kg_provider: KGProvider, + embedding_provider: Optional[EmbeddingProvider] = None, + storage_batch_size: int = 1, + pipe_logger: Optional[KVLoggingSingleton] = None, + type: PipeType = PipeType.INGESTOR, + config: Optional[AsyncPipe.PipeConfig] = None, + *args, + **kwargs, + ): + """ + Initializes the async knowledge graph storage pipe with necessary components and configurations. + """ + logger.info( + f"Initializing an `KGStoragePipe` to store knowledge graph extractions in a graph database." + ) + + super().__init__( + pipe_logger=pipe_logger, + type=type, + config=config, + *args, + **kwargs, + ) + self.kg_provider = kg_provider + self.embedding_provider = embedding_provider + self.storage_batch_size = storage_batch_size + + async def store( + self, + kg_extractions: list[KGExtraction], + ) -> None: + """ + Stores a batch of knowledge graph extractions in the graph database. 
+ """ + try: + nodes = [] + relations = [] + for extraction in kg_extractions: + for entity in extraction.entities.values(): + embedding = None + if self.embedding_provider: + embedding = self.embedding_provider.get_embedding( + "Entity:\n{entity.value}\nLabel:\n{entity.category}\nSubcategory:\n{entity.subcategory}" + ) + nodes.append( + EntityNode( + name=entity.value, + label=entity.category, + embedding=embedding, + properties=( + {"subcategory": entity.subcategory} + if entity.subcategory + else {} + ), + ) + ) + for triple in extraction.triples: + relations.append( + Relation( + source_id=triple.subject, + target_id=triple.object, + label=triple.predicate, + ) + ) + self.kg_provider.upsert_nodes(nodes) + self.kg_provider.upsert_relations(relations) + except Exception as e: + error_message = f"Failed to store knowledge graph extractions in the database: {e}" + logger.error(error_message) + raise ValueError(error_message) + + async def _run_logic( + self, + input: Input, + state: AsyncState, + run_id: uuid.UUID, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[None, None]: + """ + Executes the async knowledge graph storage pipe: storing knowledge graph extractions in the graph database. + """ + batch_tasks = [] + kg_batch = [] + + async for kg_extraction in input.message: + kg_batch.append(kg_extraction) + if len(kg_batch) >= self.storage_batch_size: + # Schedule the storage task + batch_tasks.append( + asyncio.create_task( + self.store(kg_batch.copy()), + name=f"kg-store-{self.config.name}", + ) + ) + kg_batch.clear() + + if kg_batch: # Process any remaining extractions + batch_tasks.append( + asyncio.create_task( + self.store(kg_batch.copy()), + name=f"kg-store-{self.config.name}", + ) + ) + + # Wait for all storage tasks to complete + await asyncio.gather(*batch_tasks) + yield None diff --git a/R2R/r2r/pipes/ingestion/parsing_pipe.py b/R2R/r2r/pipes/ingestion/parsing_pipe.py new file mode 100755 index 00000000..f3c81ca0 --- /dev/null +++ b/R2R/r2r/pipes/ingestion/parsing_pipe.py @@ -0,0 +1,211 @@ +""" +This module contains the `DocumentParsingPipe` class, which is responsible for parsing incoming documents into plaintext. +""" + +import asyncio +import logging +import time +import uuid +from typing import AsyncGenerator, Optional, Union + +from r2r.base import ( + AsyncParser, + AsyncState, + Document, + DocumentType, + Extraction, + ExtractionType, + KVLoggingSingleton, + PipeType, + generate_id_from_label, +) +from r2r.base.abstractions.exception import R2RDocumentProcessingError +from r2r.base.pipes.base_pipe import AsyncPipe +from r2r.parsers.media.audio_parser import AudioParser +from r2r.parsers.media.docx_parser import DOCXParser +from r2r.parsers.media.img_parser import ImageParser +from r2r.parsers.media.movie_parser import MovieParser +from r2r.parsers.media.pdf_parser import PDFParser +from r2r.parsers.media.ppt_parser import PPTParser +from r2r.parsers.structured.csv_parser import CSVParser +from r2r.parsers.structured.json_parser import JSONParser +from r2r.parsers.structured.xlsx_parser import XLSXParser +from r2r.parsers.text.html_parser import HTMLParser +from r2r.parsers.text.md_parser import MDParser +from r2r.parsers.text.text_parser import TextParser + +logger = logging.getLogger(__name__) + + +class ParsingPipe(AsyncPipe): + """ + Processes incoming documents into plaintext based on their data type. + Supports TXT, JSON, HTML, and PDF formats. 
+ """ + + class Input(AsyncPipe.Input): + message: AsyncGenerator[Document, None] + + AVAILABLE_PARSERS = { + DocumentType.CSV: CSVParser, + DocumentType.DOCX: DOCXParser, + DocumentType.HTML: HTMLParser, + DocumentType.JSON: JSONParser, + DocumentType.MD: MDParser, + DocumentType.PDF: PDFParser, + DocumentType.PPTX: PPTParser, + DocumentType.TXT: TextParser, + DocumentType.XLSX: XLSXParser, + DocumentType.GIF: ImageParser, + DocumentType.JPEG: ImageParser, + DocumentType.JPG: ImageParser, + DocumentType.PNG: ImageParser, + DocumentType.SVG: ImageParser, + DocumentType.MP3: AudioParser, + DocumentType.MP4: MovieParser, + } + + IMAGE_TYPES = { + DocumentType.GIF, + DocumentType.JPG, + DocumentType.JPEG, + DocumentType.PNG, + DocumentType.SVG, + } + + def __init__( + self, + excluded_parsers: list[DocumentType], + override_parsers: Optional[dict[DocumentType, AsyncParser]] = None, + pipe_logger: Optional[KVLoggingSingleton] = None, + type: PipeType = PipeType.INGESTOR, + config: Optional[AsyncPipe.PipeConfig] = None, + *args, + **kwargs, + ): + super().__init__( + pipe_logger=pipe_logger, + type=type, + config=config + or AsyncPipe.PipeConfig(name="default_document_parsing_pipe"), + *args, + **kwargs, + ) + + self.parsers = {} + + if not override_parsers: + override_parsers = {} + + # Apply overrides if specified + for doc_type, parser in override_parsers.items(): + self.parsers[doc_type] = parser + + for doc_type, parser_info in self.AVAILABLE_PARSERS.items(): + if ( + doc_type not in excluded_parsers + and doc_type not in self.parsers + ): + self.parsers[doc_type] = parser_info() + + @property + def supported_types(self) -> list[str]: + """ + Lists the data types supported by the pipe. + """ + return [entry_type for entry_type in DocumentType] + + async def _parse( + self, + document: Document, + run_id: uuid.UUID, + version: str, + ) -> AsyncGenerator[Union[R2RDocumentProcessingError, Extraction], None]: + if document.type not in self.parsers: + yield R2RDocumentProcessingError( + document_id=document.id, + error_message=f"Parser for {document.type} not found in `ParsingPipe`.", + ) + return + parser = self.parsers[document.type] + texts = parser.ingest(document.data) + extraction_type = ExtractionType.TXT + t0 = time.time() + if document.type in self.IMAGE_TYPES: + extraction_type = ExtractionType.IMG + document.metadata["image_type"] = document.type.value + # SAVE IMAGE DATA + # try: + # import base64 + # sanitized_data = base64.b64encode(document.data).decode('utf-8') + # except Exception as e: + # sanitized_data = document.data + + # document.metadata["image_data"] = sanitized_data + elif document.type == DocumentType.MP4: + extraction_type = ExtractionType.MOV + document.metadata["audio_type"] = document.type.value + + iteration = 0 + async for text in texts: + extraction_id = generate_id_from_label( + f"{document.id}-{iteration}-{version}" + ) + document.metadata["version"] = version + extraction = Extraction( + id=extraction_id, + data=text, + metadata=document.metadata, + document_id=document.id, + type=extraction_type, + ) + yield extraction + # TODO - Add settings to enable extraction logging + # extraction_dict = extraction.dict() + # await self.enqueue_log( + # run_id=run_id, + # key="extraction", + # value=json.dumps( + # { + # "data": extraction_dict["data"], + # "document_id": str(extraction_dict["document_id"]), + # "extraction_id": str(extraction_dict["id"]), + # } + # ), + # ) + iteration += 1 + logger.debug( + f"Parsed document with id={document.id}, 
title={document.metadata.get('title', None)}, user_id={document.metadata.get('user_id', None)}, metadata={document.metadata} into {iteration} extractions in t={time.time() - t0:.2f} seconds." + ) + + async def _run_logic( + self, + input: Input, + state: AsyncState, + run_id: uuid.UUID, + versions: Optional[list[str]] = None, + *args, + **kwargs, + ) -> AsyncGenerator[Extraction, None]: + parse_tasks = [] + + iteration = 0 + async for document in input.message: + version = versions[iteration] if versions else "v0" + iteration += 1 + parse_tasks.append( + self._handle_parse_task(document, version, run_id) + ) + + # Await all tasks and yield results concurrently + for parse_task in asyncio.as_completed(parse_tasks): + for extraction in await parse_task: + yield extraction + + async def _handle_parse_task( + self, document: Document, version: str, run_id: uuid.UUID + ) -> AsyncGenerator[Extraction, None]: + extractions = [] + async for extraction in self._parse(document, run_id, version): + extractions.append(extraction) + return extractions diff --git a/R2R/r2r/pipes/ingestion/vector_storage_pipe.py b/R2R/r2r/pipes/ingestion/vector_storage_pipe.py new file mode 100755 index 00000000..9564fd22 --- /dev/null +++ b/R2R/r2r/pipes/ingestion/vector_storage_pipe.py @@ -0,0 +1,128 @@ +import asyncio +import logging +import uuid +from typing import Any, AsyncGenerator, Optional, Tuple, Union + +from r2r.base import ( + AsyncState, + KVLoggingSingleton, + PipeType, + VectorDBProvider, + VectorEntry, +) +from r2r.base.pipes.base_pipe import AsyncPipe + +from ...base.abstractions.exception import R2RDocumentProcessingError + +logger = logging.getLogger(__name__) + + +class VectorStoragePipe(AsyncPipe): + class Input(AsyncPipe.Input): + message: AsyncGenerator[ + Union[R2RDocumentProcessingError, VectorEntry], None + ] + do_upsert: bool = True + + def __init__( + self, + vector_db_provider: VectorDBProvider, + storage_batch_size: int = 128, + pipe_logger: Optional[KVLoggingSingleton] = None, + type: PipeType = PipeType.INGESTOR, + config: Optional[AsyncPipe.PipeConfig] = None, + *args, + **kwargs, + ): + """ + Initializes the async vector storage pipe with necessary components and configurations. + """ + super().__init__( + pipe_logger=pipe_logger, + type=type, + config=config, + *args, + **kwargs, + ) + self.vector_db_provider = vector_db_provider + self.storage_batch_size = storage_batch_size + + async def store( + self, + vector_entries: list[VectorEntry], + do_upsert: bool = True, + ) -> None: + """ + Stores a batch of vector entries in the database. + """ + + try: + if do_upsert: + self.vector_db_provider.upsert_entries(vector_entries) + else: + self.vector_db_provider.copy_entries(vector_entries) + except Exception as e: + error_message = ( + f"Failed to store vector entries in the database: {e}" + ) + logger.error(error_message) + raise ValueError(error_message) + + async def _run_logic( + self, + input: Input, + state: AsyncState, + run_id: uuid.UUID, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[ + Tuple[uuid.UUID, Union[str, R2RDocumentProcessingError]], None + ]: + """ + Executes the async vector storage pipe: storing embeddings in the vector database. 
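+        Vector entries are buffered into batches of `storage_batch_size` and
+        written via upsert or copy depending on `input.do_upsert`; once all
+        storage tasks finish, a per-document summary string is yielded for
+        each document, while any R2RDocumentProcessingError received from
+        upstream is yielded through unchanged.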
+ """ + batch_tasks = [] + vector_batch = [] + document_counts = {} + i = 0 + async for msg in input.message: + i += 1 + if isinstance(msg, R2RDocumentProcessingError): + yield (msg.document_id, msg) + continue + + document_id = msg.metadata.get("document_id", None) + if not document_id: + raise ValueError("Document ID not found in the metadata.") + if document_id not in document_counts: + document_counts[document_id] = 1 + else: + document_counts[document_id] += 1 + + vector_batch.append(msg) + if len(vector_batch) >= self.storage_batch_size: + # Schedule the storage task + batch_tasks.append( + asyncio.create_task( + self.store(vector_batch.copy(), input.do_upsert), + name=f"vector-store-{self.config.name}", + ) + ) + vector_batch.clear() + + if vector_batch: # Process any remaining vectors + batch_tasks.append( + asyncio.create_task( + self.store(vector_batch.copy(), input.do_upsert), + name=f"vector-store-{self.config.name}", + ) + ) + + # Wait for all storage tasks to complete + await asyncio.gather(*batch_tasks) + + for document_id, count in document_counts.items(): + yield ( + document_id, + f"Processed {count} vectors for document {document_id}.", + ) diff --git a/R2R/r2r/pipes/other/eval_pipe.py b/R2R/r2r/pipes/other/eval_pipe.py new file mode 100755 index 00000000..b1c60343 --- /dev/null +++ b/R2R/r2r/pipes/other/eval_pipe.py @@ -0,0 +1,54 @@ +import logging +import uuid +from typing import Any, AsyncGenerator, Optional + +from pydantic import BaseModel + +from r2r import AsyncState, EvalProvider, LLMChatCompletion, PipeType +from r2r.base.abstractions.llm import GenerationConfig +from r2r.base.pipes.base_pipe import AsyncPipe + +logger = logging.getLogger(__name__) + + +class EvalPipe(AsyncPipe): + class EvalPayload(BaseModel): + query: str + context: str + completion: str + + class Input(AsyncPipe.Input): + message: AsyncGenerator["EvalPipe.EvalPayload", None] + + def __init__( + self, + eval_provider: EvalProvider, + type: PipeType = PipeType.EVAL, + config: Optional[AsyncPipe.PipeConfig] = None, + *args, + **kwargs, + ): + self.eval_provider = eval_provider + super().__init__( + type=type, + config=config or AsyncPipe.PipeConfig(name="default_eval_pipe"), + *args, + **kwargs, + ) + + async def _run_logic( + self, + input: Input, + state: AsyncState, + run_id: uuid.UUID, + eval_generation_config: GenerationConfig, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[LLMChatCompletion, None]: + async for item in input.message: + yield self.eval_provider.evaluate( + item.query, + item.context, + item.completion, + eval_generation_config, + ) diff --git a/R2R/r2r/pipes/other/web_search_pipe.py b/R2R/r2r/pipes/other/web_search_pipe.py new file mode 100755 index 00000000..92e3feee --- /dev/null +++ b/R2R/r2r/pipes/other/web_search_pipe.py @@ -0,0 +1,105 @@ +import json +import logging +import uuid +from typing import Any, AsyncGenerator, Optional + +from r2r.base import ( + AsyncPipe, + AsyncState, + PipeType, + VectorSearchResult, + generate_id_from_label, +) +from r2r.integrations import SerperClient + +from ..abstractions.search_pipe import SearchPipe + +logger = logging.getLogger(__name__) + + +class WebSearchPipe(SearchPipe): + def __init__( + self, + serper_client: SerperClient, + type: PipeType = PipeType.SEARCH, + config: Optional[SearchPipe.SearchConfig] = None, + *args, + **kwargs, + ): + super().__init__( + type=type, + config=config or SearchPipe.SearchConfig(), + *args, + **kwargs, + ) + self.serper_client = serper_client + + async def search( + self, + message: str, + 
run_id: uuid.UUID, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[VectorSearchResult, None]: + search_limit_override = kwargs.get("search_limit", None) + await self.enqueue_log( + run_id=run_id, key="search_query", value=message + ) + # TODO - Make more general in the future by creating a SearchProvider interface + results = self.serper_client.get_raw( + query=message, + limit=search_limit_override or self.config.search_limit, + ) + + search_results = [] + for result in results: + if result.get("snippet") is None: + continue + result["text"] = result.pop("snippet") + search_result = VectorSearchResult( + id=generate_id_from_label(str(result)), + score=result.get( + "score", 0 + ), # TODO - Consider dynamically generating scores based on similarity + metadata=result, + ) + search_results.append(search_result) + yield search_result + + await self.enqueue_log( + run_id=run_id, + key="search_results", + value=json.dumps([ele.json() for ele in search_results]), + ) + + async def _run_logic( + self, + input: AsyncPipe.Input, + state: AsyncState, + run_id: uuid.UUID, + *args: Any, + **kwargs, + ) -> AsyncGenerator[VectorSearchResult, None]: + search_queries = [] + search_results = [] + async for search_request in input.message: + search_queries.append(search_request) + async for result in self.search( + message=search_request, run_id=run_id, *args, **kwargs + ): + search_results.append(result) + yield result + + await state.update( + self.config.name, {"output": {"search_results": search_results}} + ) + + await state.update( + self.config.name, + { + "output": { + "search_queries": search_queries, + "search_results": search_results, + } + }, + ) diff --git a/R2R/r2r/pipes/retrieval/kg_agent_search_pipe.py b/R2R/r2r/pipes/retrieval/kg_agent_search_pipe.py new file mode 100755 index 00000000..60935265 --- /dev/null +++ b/R2R/r2r/pipes/retrieval/kg_agent_search_pipe.py @@ -0,0 +1,103 @@ +import logging +import uuid +from typing import Any, Optional + +from r2r.base import ( + AsyncState, + KGProvider, + KGSearchSettings, + KVLoggingSingleton, + LLMProvider, + PipeType, + PromptProvider, +) + +from ..abstractions.generator_pipe import GeneratorPipe + +logger = logging.getLogger(__name__) + + +class KGAgentSearchPipe(GeneratorPipe): + """ + Embeds and stores documents using a specified embedding model and database. + """ + + def __init__( + self, + kg_provider: KGProvider, + llm_provider: LLMProvider, + prompt_provider: PromptProvider, + pipe_logger: Optional[KVLoggingSingleton] = None, + type: PipeType = PipeType.INGESTOR, + config: Optional[GeneratorPipe.PipeConfig] = None, + *args, + **kwargs, + ): + """ + Initializes the embedding pipe with necessary components and configurations. 
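+        This pipe prompts the LLM to generate a Cypher query for each input
+        message, executes that query against the configured `KGProvider` via
+        `structured_query`, and yields (query, result) pairs.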
+ """ + super().__init__( + llm_provider=llm_provider, + prompt_provider=prompt_provider, + type=type, + config=config + or GeneratorPipe.Config( + name="kg_rag_pipe", task_prompt="kg_agent" + ), + pipe_logger=pipe_logger, + *args, + **kwargs, + ) + self.kg_provider = kg_provider + self.llm_provider = llm_provider + self.prompt_provider = prompt_provider + self.pipe_run_info = None + + async def _run_logic( + self, + input: GeneratorPipe.Input, + state: AsyncState, + run_id: uuid.UUID, + kg_search_settings: KGSearchSettings, + *args: Any, + **kwargs: Any, + ): + async for message in input.message: + # TODO - Remove hard code + formatted_prompt = self.prompt_provider.get_prompt( + "kg_agent", {"input": message} + ) + messages = self._get_message_payload(formatted_prompt) + + result = await self.llm_provider.aget_completion( + messages=messages, + generation_config=kg_search_settings.agent_generation_config, + ) + + extraction = result.choices[0].message.content + query = extraction.split("```cypher")[1].split("```")[0] + result = self.kg_provider.structured_query(query) + yield (query, result) + + await self.enqueue_log( + run_id=run_id, + key="kg_agent_response", + value=extraction, + ) + + await self.enqueue_log( + run_id=run_id, + key="kg_agent_execution_result", + value=result, + ) + + def _get_message_payload(self, message: str) -> dict: + return [ + { + "role": "system", + "content": self.prompt_provider.get_prompt( + self.config.system_prompt, + ), + }, + {"role": "user", "content": message}, + ] diff --git a/R2R/r2r/pipes/retrieval/multi_search.py b/R2R/r2r/pipes/retrieval/multi_search.py new file mode 100755 index 00000000..6da2c34b --- /dev/null +++ b/R2R/r2r/pipes/retrieval/multi_search.py @@ -0,0 +1,79 @@ +import uuid +from copy import copy +from typing import Any, AsyncGenerator, Optional + +from r2r.base.abstractions.llm import GenerationConfig +from r2r.base.abstractions.search import VectorSearchResult +from r2r.base.pipes.base_pipe import AsyncPipe + +from ..abstractions.search_pipe import SearchPipe +from .query_transform_pipe import QueryTransformPipe + + +class MultiSearchPipe(AsyncPipe): + class PipeConfig(AsyncPipe.PipeConfig): + name: str = "multi_search_pipe" + + def __init__( + self, + query_transform_pipe: QueryTransformPipe, + inner_search_pipe: SearchPipe, + config: Optional[PipeConfig] = None, + *args, + **kwargs, + ): + self.query_transform_pipe = query_transform_pipe + self.vector_search_pipe = inner_search_pipe + if ( + not query_transform_pipe.config.name + == inner_search_pipe.config.name + ): + raise ValueError( + "The query transform pipe and search pipe must have the same name." + ) + if config and not config.name == query_transform_pipe.config.name: + raise ValueError( + "The pipe config name must match the query transform pipe name." 
+ ) + + super().__init__( + config=config + or MultiSearchPipe.PipeConfig( + name=query_transform_pipe.config.name + ), + *args, + **kwargs, + ) + + async def _run_logic( + self, + input: Any, + state: Any, + run_id: uuid.UUID, + query_transform_generation_config: Optional[GenerationConfig] = None, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[VectorSearchResult, None]: + query_transform_generation_config = ( + query_transform_generation_config + or copy(kwargs.get("rag_generation_config", None)) + or GenerationConfig(model="gpt-4o") + ) + query_transform_generation_config.stream = False + + query_generator = await self.query_transform_pipe.run( + input, + state, + query_transform_generation_config=query_transform_generation_config, + num_query_xf_outputs=3, + *args, + **kwargs, + ) + + async for search_result in await self.vector_search_pipe.run( + self.vector_search_pipe.Input(message=query_generator), + state, + *args, + **kwargs, + ): + yield search_result diff --git a/R2R/r2r/pipes/retrieval/query_transform_pipe.py b/R2R/r2r/pipes/retrieval/query_transform_pipe.py new file mode 100755 index 00000000..99df6b5b --- /dev/null +++ b/R2R/r2r/pipes/retrieval/query_transform_pipe.py @@ -0,0 +1,101 @@ +import logging +import uuid +from typing import Any, AsyncGenerator, Optional + +from r2r.base import ( + AsyncPipe, + AsyncState, + LLMProvider, + PipeType, + PromptProvider, +) +from r2r.base.abstractions.llm import GenerationConfig + +from ..abstractions.generator_pipe import GeneratorPipe + +logger = logging.getLogger(__name__) + + +class QueryTransformPipe(GeneratorPipe): + class QueryTransformConfig(GeneratorPipe.PipeConfig): + name: str = "default_query_transform" + system_prompt: str = "default_system" + task_prompt: str = "hyde" + + class Input(GeneratorPipe.Input): + message: AsyncGenerator[str, None] + + def __init__( + self, + llm_provider: LLMProvider, + prompt_provider: PromptProvider, + type: PipeType = PipeType.TRANSFORM, + config: Optional[QueryTransformConfig] = None, + *args, + **kwargs, + ): + logger.info(f"Initalizing an `QueryTransformPipe` pipe.") + super().__init__( + llm_provider=llm_provider, + prompt_provider=prompt_provider, + type=type, + config=config or QueryTransformPipe.QueryTransformConfig(), + *args, + **kwargs, + ) + + async def _run_logic( + self, + input: AsyncPipe.Input, + state: AsyncState, + run_id: uuid.UUID, + query_transform_generation_config: GenerationConfig, + num_query_xf_outputs: int = 3, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[str, None]: + async for query in input.message: + logger.info( + f"Transforming query: {query} into {num_query_xf_outputs} outputs with {self.config.task_prompt}." 
+ ) + + query_transform_request = self._get_message_payload( + query, num_outputs=num_query_xf_outputs + ) + + response = await self.llm_provider.aget_completion( + messages=query_transform_request, + generation_config=query_transform_generation_config, + ) + content = self.llm_provider.extract_content(response) + outputs = content.split("\n") + outputs = [ + output.strip() for output in outputs if output.strip() != "" + ] + await state.update( + self.config.name, {"output": {"outputs": outputs}} + ) + + for output in outputs: + logger.info(f"Yielding transformed output: {output}") + yield output + + def _get_message_payload(self, input: str, num_outputs: int) -> dict: + return [ + { + "role": "system", + "content": self.prompt_provider.get_prompt( + self.config.system_prompt, + ), + }, + { + "role": "user", + "content": self.prompt_provider.get_prompt( + self.config.task_prompt, + inputs={ + "message": input, + "num_outputs": num_outputs, + }, + ), + }, + ] diff --git a/R2R/r2r/pipes/retrieval/search_rag_pipe.py b/R2R/r2r/pipes/retrieval/search_rag_pipe.py new file mode 100755 index 00000000..4d01d2df --- /dev/null +++ b/R2R/r2r/pipes/retrieval/search_rag_pipe.py @@ -0,0 +1,130 @@ +import logging +import uuid +from typing import Any, AsyncGenerator, Optional, Tuple + +from r2r.base import ( + AggregateSearchResult, + AsyncPipe, + AsyncState, + LLMProvider, + PipeType, + PromptProvider, +) +from r2r.base.abstractions.llm import GenerationConfig, RAGCompletion + +from ..abstractions.generator_pipe import GeneratorPipe + +logger = logging.getLogger(__name__) + + +class SearchRAGPipe(GeneratorPipe): + class Input(AsyncPipe.Input): + message: AsyncGenerator[Tuple[str, AggregateSearchResult], None] + + def __init__( + self, + llm_provider: LLMProvider, + prompt_provider: PromptProvider, + type: PipeType = PipeType.GENERATOR, + config: Optional[GeneratorPipe] = None, + *args, + **kwargs, + ): + super().__init__( + llm_provider=llm_provider, + prompt_provider=prompt_provider, + type=type, + config=config + or GeneratorPipe.Config( + name="default_rag_pipe", task_prompt="default_rag" + ), + *args, + **kwargs, + ) + + async def _run_logic( + self, + input: Input, + state: AsyncState, + run_id: uuid.UUID, + rag_generation_config: GenerationConfig, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[RAGCompletion, None]: + context = "" + search_iteration = 1 + total_results = 0 + # must select a query if there are multiple + sel_query = None + async for query, search_results in input.message: + if search_iteration == 1: + sel_query = query + context_piece, total_results = await self._collect_context( + query, search_results, search_iteration, total_results + ) + context += context_piece + search_iteration += 1 + + messages = self._get_message_payload(sel_query, context) + + response = await self.llm_provider.aget_completion( + messages=messages, generation_config=rag_generation_config + ) + yield RAGCompletion(completion=response, search_results=search_results) + + await self.enqueue_log( + run_id=run_id, + key="llm_response", + value=response.choices[0].message.content, + ) + + def _get_message_payload(self, query: str, context: str) -> dict: + return [ + { + "role": "system", + "content": self.prompt_provider.get_prompt( + self.config.system_prompt, + ), + }, + { + "role": "user", + "content": self.prompt_provider.get_prompt( + self.config.task_prompt, + inputs={ + "query": query, + "context": context, + }, + ), + }, + ] + + async def _collect_context( + self, + query: str, + results: 
AggregateSearchResult, + iteration: int, + total_results: int, + ) -> Tuple[str, int]: + context = f"Query:\n{query}\n\n" + if results.vector_search_results: + context += f"Vector Search Results({iteration}):\n" + it = total_results + 1 + for result in results.vector_search_results: + context += f"[{it}]: {result.metadata['text']}\n\n" + it += 1 + total_results = ( + it - 1 + ) # Update total_results based on the last index used + if results.kg_search_results: + context += f"Knowledge Graph ({iteration}):\n" + it = total_results + 1 + for query, search_results in results.kg_search_results: # [1]: + context += f"Query: {query}\n\n" + context += f"Results:\n" + for search_result in search_results: + context += f"[{it}]: {search_result}\n\n" + it += 1 + total_results = ( + it - 1 + ) # Update total_results based on the last index used + return context, total_results diff --git a/R2R/r2r/pipes/retrieval/streaming_rag_pipe.py b/R2R/r2r/pipes/retrieval/streaming_rag_pipe.py new file mode 100755 index 00000000..b01f6445 --- /dev/null +++ b/R2R/r2r/pipes/retrieval/streaming_rag_pipe.py @@ -0,0 +1,131 @@ +import json +import logging +import uuid +from typing import Any, AsyncGenerator, Generator, Optional + +from r2r.base import ( + AsyncState, + LLMChatCompletionChunk, + LLMProvider, + PipeType, + PromptProvider, +) +from r2r.base.abstractions.llm import GenerationConfig + +from ..abstractions.generator_pipe import GeneratorPipe +from .search_rag_pipe import SearchRAGPipe + +logger = logging.getLogger(__name__) + + +class StreamingSearchRAGPipe(SearchRAGPipe): + SEARCH_STREAM_MARKER = "search" + COMPLETION_STREAM_MARKER = "completion" + + def __init__( + self, + llm_provider: LLMProvider, + prompt_provider: PromptProvider, + type: PipeType = PipeType.GENERATOR, + config: Optional[GeneratorPipe] = None, + *args, + **kwargs, + ): + super().__init__( + llm_provider=llm_provider, + prompt_provider=prompt_provider, + type=type, + config=config + or GeneratorPipe.Config( + name="default_streaming_rag_pipe", task_prompt="default_rag" + ), + *args, + **kwargs, + ) + + async def _run_logic( + self, + input: SearchRAGPipe.Input, + state: AsyncState, + run_id: uuid.UUID, + rag_generation_config: GenerationConfig, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[str, None]: + iteration = 0 + context = "" + # dump the search results and construct the context + async for query, search_results in input.message: + yield f"<{self.SEARCH_STREAM_MARKER}>" + if search_results.vector_search_results: + context += "Vector Search Results:\n" + for result in search_results.vector_search_results: + if iteration >= 1: + yield "," + yield json.dumps(result.json()) + context += ( + f"{iteration + 1}:\n{result.metadata['text']}\n\n" + ) + iteration += 1 + + # if search_results.kg_search_results: + # for result in search_results.kg_search_results: + # if iteration >= 1: + # yield "," + # yield json.dumps(result.json()) + # context += f"Result {iteration+1}:\n{result.metadata['text']}\n\n" + # iteration += 1 + + yield f"</{self.SEARCH_STREAM_MARKER}>" + + messages = self._get_message_payload(query, context) + yield f"<{self.COMPLETION_STREAM_MARKER}>" + response = "" + for chunk in self.llm_provider.get_completion_stream( + messages=messages, generation_config=rag_generation_config + ): + chunk = StreamingSearchRAGPipe._process_chunk(chunk) + response += chunk + yield chunk + + yield f"</{self.COMPLETION_STREAM_MARKER}>" + + await self.enqueue_log( + run_id=run_id, + key="llm_response", + value=response, + ) + + async def 
_yield_chunks( + self, + start_marker: str, + chunks: Generator[str, None, None], + end_marker: str, + ) -> str: + yield start_marker + for chunk in chunks: + yield chunk + yield end_marker + + def _get_message_payload( + self, query: str, context: str + ) -> list[dict[str, str]]: + return [ + { + "role": "system", + "content": self.prompt_provider.get_prompt( + self.config.system_prompt + ), + }, + { + "role": "user", + "content": self.prompt_provider.get_prompt( + self.config.task_prompt, + inputs={"query": query, "context": context}, + ), + }, + ] + + @staticmethod + def _process_chunk(chunk: LLMChatCompletionChunk) -> str: + return chunk.choices[0].delta.content or "" diff --git a/R2R/r2r/pipes/retrieval/vector_search_pipe.py b/R2R/r2r/pipes/retrieval/vector_search_pipe.py new file mode 100755 index 00000000..742de16b --- /dev/null +++ b/R2R/r2r/pipes/retrieval/vector_search_pipe.py @@ -0,0 +1,123 @@ +import json +import logging +import uuid +from typing import Any, AsyncGenerator, Optional + +from r2r.base import ( + AsyncPipe, + AsyncState, + EmbeddingProvider, + PipeType, + VectorDBProvider, + VectorSearchResult, + VectorSearchSettings, +) + +from ..abstractions.search_pipe import SearchPipe + +logger = logging.getLogger(__name__) + + +class VectorSearchPipe(SearchPipe): + def __init__( + self, + vector_db_provider: VectorDBProvider, + embedding_provider: EmbeddingProvider, + type: PipeType = PipeType.SEARCH, + config: Optional[SearchPipe.SearchConfig] = None, + *args, + **kwargs, + ): + super().__init__( + type=type, + config=config or SearchPipe.SearchConfig(), + *args, + **kwargs, + ) + self.embedding_provider = embedding_provider + self.vector_db_provider = vector_db_provider + + async def search( + self, + message: str, + run_id: uuid.UUID, + vector_search_settings: VectorSearchSettings, + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[VectorSearchResult, None]: + await self.enqueue_log( + run_id=run_id, key="search_query", value=message + ) + search_filters = ( + vector_search_settings.search_filters or self.config.search_filters + ) + search_limit = ( + vector_search_settings.search_limit or self.config.search_limit + ) + results = [] + query_vector = self.embedding_provider.get_embedding( + message, + ) + search_results = ( + self.vector_db_provider.hybrid_search( + query_vector=query_vector, + query_text=message, + filters=search_filters, + limit=search_limit, + ) + if vector_search_settings.do_hybrid_search + else self.vector_db_provider.search( + query_vector=query_vector, + filters=search_filters, + limit=search_limit, + ) + ) + reranked_results = self.embedding_provider.rerank( + query=message, results=search_results, limit=search_limit + ) + for result in reranked_results: + result.metadata["associatedQuery"] = message + results.append(result) + yield result + await self.enqueue_log( + run_id=run_id, + key="search_results", + value=json.dumps([ele.json() for ele in results]), + ) + + async def _run_logic( + self, + input: AsyncPipe.Input, + state: AsyncState, + run_id: uuid.UUID, + vector_search_settings: VectorSearchSettings = VectorSearchSettings(), + *args: Any, + **kwargs: Any, + ) -> AsyncGenerator[VectorSearchResult, None]: + search_queries = [] + search_results = [] + async for search_request in input.message: + search_queries.append(search_request) + async for result in self.search( + message=search_request, + run_id=run_id, + vector_search_settings=vector_search_settings, + *args, + **kwargs, + ): + search_results.append(result) + yield result + + await 
state.update( + self.config.name, {"output": {"search_results": search_results}} + ) + + await state.update( + self.config.name, + { + "output": { + "search_queries": search_queries, + "search_results": search_results, + } + }, + ) diff --git a/R2R/r2r/prompts/__init__.py b/R2R/r2r/prompts/__init__.py new file mode 100755 index 00000000..88ed0658 --- /dev/null +++ b/R2R/r2r/prompts/__init__.py @@ -0,0 +1,3 @@ +from .local.r2r_prompt_provider import R2RPromptProvider + +__all__ = ["R2RPromptProvider"] diff --git a/R2R/r2r/prompts/local/__init__.py b/R2R/r2r/prompts/local/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/prompts/local/__init__.py diff --git a/R2R/r2r/prompts/local/defaults.jsonl b/R2R/r2r/prompts/local/defaults.jsonl new file mode 100755 index 00000000..042136f6 --- /dev/null +++ b/R2R/r2r/prompts/local/defaults.jsonl @@ -0,0 +1,12 @@ +{"name": "default_system", "template": "You are a helpful assistant.", "input_types": {}} +{"name": "default_rag", "template": "## Task:\n\nAnswer the query given immediately below given the context which follows later. Use line item references to like [1], [2], ... refer to specifically numbered items in the provided context. Pay close attention to the title of each given source to ensure it is consistent with the query.\n\n### Query:\n{query}\n\n### Context:\n{context}\n\n### Query:\n{query}\n\nREMINDER - Use line item references to like [1], [2], ... refer to specifically numbered items in the provided context.\n## Response:\n", "input_types": {"query": "str", "context": "str"}} +{"name": "hyde", "template": "### Instruction:\n\nGiven the query that follows write a double newline separated list of {num_outputs} single paragraph distinct attempted answers to the given query. \nDO NOT generate any single answer which is likely to require information from multiple distinct documents, \nEACH single answer will be used to carry out a cosine similarity semantic search over distinct indexed documents, such as varied medical documents. \nFOR EXAMPLE if asked `how do the key themes of Great Gatsby compare with 1984`, the two attempted answers would be \n`The key themes of Great Gatsby are ... ANSWER_CONTINUED` and `The key themes of 1984 are ... ANSWER_CONTINUED`, where `ANSWER_CONTINUED` IS TO BE COMPLETED BY YOU in your response. \nHere is the original user query to be transformed into answers:\n\n### Query:\n{message}\n\n### Response:\n", "input_types": {"num_outputs": "int", "message": "str"}} +{"name": "rag_fusion_prompt", "template": "### Instruction:\n\nGiven the following query that follows to write a double newline separated list of up to {num_outputs} queries meant to help answer the original query. \nDO NOT generate any single query which is likely to require information from multiple distinct documents, \nEACH single query will be used to carry out a cosine similarity semantic search over distinct indexed documents, such as varied medical documents. \nFOR EXAMPLE if asked `how do the key themes of Great Gatsby compare with 1984`, the two queries would be \n`What are the key themes of Great Gatsby?` and `What are the key themes of 1984?`.\nHere is the original user query to be transformed into answers:\n\n### Query:\n{message}\n\n### Response:\n", "input_types": {"num_outputs": "int", "message": "str"}} +{"name": "rag_answer_eval", "template": "### Instruction:\n\nYou are given a `query`, related `context` and an associated `answer`. 
Your task is to sequentially score each sentence in the given answer as either 1 or 0, based on whether or not the given sentence is relevant to the given query and supported in full by the given context.\n### Example:\n#### Input:\n\nQuery:\nWhy does Alice prefer spending her mornings in the garden?\n\nContext:\nAlice loves to read books in her garden. She has a large collection of mystery novels. Every morning, she spends an hour reading while drinking her favorite tea. Her garden is filled with various flowers, and she especially loves the roses. On weekends, Alice's friend, Bob, often joins her for tea and they discuss the books they've read.\n\nAnswer:\nAlice enjoys her mornings in the garden because she loves to read there. She often listens to music while reading.\n####### Response:\n\n([1,0], '1/2')### Input:\nQuery:\n{query}\n\nContext:\n{context}\n\nAnswer:\n{answer}\n\nResponse:\n\n", "input_types": {"query": "str", "context": "str", "answer": "str"}} +{"name": "rag_context_eval", "template": "### Instruction:\n\nYou are given a `query` and an associated `context`. Your task is to sequentially score each sentence in the context as either 1 or 0, based on the relevancy to the given query. For instance, if the query is \"What is the capital of France?\" then the sentence \"The capital of France is Paris\" would receive a +1 value, whereas \"The french enjoy wine\" would receive a 0. Return your response as a tuple containing a list of 1s and 0s, where each value corresponds to the respective sentence in the context, and then the rational fraction of 1's to the total number of sentences (e.g. '1/4'). NOTE - do not include ANY extra text other than the requested tuple.\n\nQuery:\n{query}\n\nContext:\n{context}\n\n###Response\n\n", "input_types": {"query": "str", "context": "str"}} +{"name": "few_shot_ner_kg_extraction", "template": "### Instruction\nYou will shortly be asked to perform Named Entity Recognition (NER) and knowledge graph triplet extraction on the text that follows. NER involves identifying named entities in a text, and knowledge graph triplet extraction involves identifying relationships between these entities and other attributes in the text.\n\nA knowledge graph triplet contains the three following pieces of information:\n- `subject`: The main entity.\n- `predicate`: The relationship type.\n- `object`: The related entity.\n\nThey are represented below as `[subject]:<predicate>:[object]`.\n\n#### Process \n**Identify Named Entities**: Extract entities based on the given entity types, ensuring they appear in the order they are mentioned in the text.\n**Establish Triplets**: Form triplets using the provided predicates, again in the order they appear in the text.\n\nYour final response should follow this format:\n\n**Output:**\n```json\n{{\n \"entities_and_triples\": [\n \"[1], entity_type:entity_name\",\n \"[1] predicate [2]\",\n \"[1] predicate [3]\",\n \"[2], entity_type:entity_name\",\n ...\n ]\n}}\n```\n\n### Example:\n\n**Entity Types:**\nORGANIZATION\nCOMPANY\nCITY\nSTATE\nCOUNTRY\nOTHER\nPERSON\nYEAR\nMONTH\nDAY\nOTHER\nQUANTITY\nEVENT\n\n**Predicates:**\nFOUNDED_BY\nHEADQUARTERED_IN\nOPERATES_IN\nOWNED_BY\nACQUIRED_BY\nHAS_EMPLOYEE_COUNT\nGENERATED_REVENUE\nLISTED_ON\nINCORPORATED\nHAS_DIVISION\nALIAS\nANNOUNCED\nHAS_QUANTITY\nAS_OF\n\n**Input:**\nWalmart Inc. (formerly Wal-Mart Stores, Inc.) 
is an American multinational retail corporation that operates a chain of hypermarkets (also called supercenters), discount department stores, and grocery stores in the United States, headquartered in Bentonville, Arkansas.[10] The company was founded by brothers Sam and James \"Bud\" Walton in nearby Rogers, Arkansas in 1962 and incorporated under Delaware General Corporation Law on October 31, 1969. It also owns and operates Sam's Club retail warehouses.[11][12]\n\nAs of October 31, 2022, Walmart has 10,586 stores and clubs in 24 countries, operating under 46 different names.[2][3][4] The company operates under the name Walmart in the United States and Canada, as Walmart de M\u00e9xico y Centroam\u00e9rica in Mexico and Central America, and as Flipkart Wholesale in India.\n\n**Output:**\n```json\n{{\n \"entities_and_triples\": [\n \"[1], company:Walmart Inc.\",\n \"[2], company:Wal-Mart Stores, Inc.\",\n \"[1] ALIAS [2]\",\n \"[3], location:country:United States\",\n \"[1] OPERATES_IN [3]\",\n \"[4], location:city:Bentonville\",\n \"[1] HEADQUARTERED_IN [4]\",\n \"[5], location:state:Arkansas\",\n \"[1] HEADQUARTERED_IN [5]\",\n \"[6], person:Sam Walton\",\n \"[1] FOUNDED_BY [6]\",\n \"[7], person:James Walton\",\n \"[8], person:Bud Walton\",\n \"[7] ALIAS [8]\",\n \"[1] FOUNDED_BY [7]\",\n \"[9], location:city:Rogers\",\n \"[10], date:year:1962\",\n \"[11], event:incorporated under Delaware General Corporation Law\",\n \"[1] INCORPORATED [11]\",\n \"[12], date:day:October 31\",\n \"[1] INCORPORATED [12]\",\n \"[13], date:year:1969\",\n \"[1] INCORPORATED [13]\",\n \"[14], company:Sam's Club\",\n \"[1] INCORPORATED [14]\",\n \"[15], date:day:October 31, 2022\",\n \"[16], quantity:10,586 stores and clubs\",\n \"[16] AS_OF [15]\",\n \"[1] HAS_QUANTITY [16]\",\n \"[17], quantity:24 countries\",\n \"[18], quantity:46 different names\",\n \"[1] HAS_QUANTITY [18]\",\n \"[18], organization:company:Walmart de M\u00e9xico y Centroam\u00e9rica\",\n \"[1] ALIAS [18]\",\n \"[19], location:country:Mexico\",\n \"[1] OPERATES_IN [19]\",\n \"[20], location:region:Central America\",\n \"[1] OPERATES_IN [20]\",\n \"[21], organization:company:Flipkart Wholesale\",\n \"[1] ALIAS [21]\",\n \"[22], location:country:India\",\n \"[1] OPERATES_IN [22]\"\n ]\n}}\n```\n\n### Task:\nYour task is to perform Named Entity Recognition (NER) and knowledge graph triplet extraction on the text that follows below.\n\n**Input:**\n{input}\n\n**Output:**\n","input_types": {"input" : "str"}} +{"name": "few_shot_ner_kg_extraction_with_spec", "template": "### Instruction\nYou will shortly be asked to perform Named Entity Recognition (NER) and knowledge graph triplet extraction on the text that follows. 
NER involves identifying named entities in a text, and knowledge graph triplet extraction involves identifying relationships between these entities and other attributes in the text.\n\nA knowledge graph triplet contains the three following pieces of information:\n- `subject`: The main entity.\n- `predicate`: The relationship type.\n- `object`: The related entity.\n\nThey are represented below as `[subject]:<predicate>:[object]`.\n\n#### Process \n**Identify Named Entities**: Extract entities based on the given entity types, ensuring they appear in the order they are mentioned in the text.\n**Establish Triplets**: Form triplets using the provided predicates, again in the order they appear in the text.\n\nYour final response should follow this format:\n\n**Output:**\n```json\n{{\n \"entities_and_triples\": [\n \"[1], ENTITY_TYPE:ENTITY_NAME\",\n \"[1] PREDICATE [2]\",\n \"[1] PREDICATE [3]\",\n \"[2], ENTITY_TYPE:ENTITY_NAME\",\n ...\n ]\n}}\n```\n\n### Example:\n\n**Entity Types:**\nORGANIZATION\nCOMPANY\nCITY, STATE, COUNTRY, OTHER\nPERSON\nYEAR, MONTH, DAY, OTHER\nQUANTITY\nEVENT\n\n**Predicates:**\nFOUNDED_BY\nHEADQUARTERED_IN\nOPERATES_IN\nOWNED_BY\nACQUIRED_BY\nHAS_EMPLOYEE_COUNT\nGENERATED_REVENUE\nLISTED_ON\nINCORPORATED\nHAS_DIVISION\nALIAS\nANNOUNCED\nHAS_QUANTITY\nAS_OF\n\n**Input:**\nWalmart Inc. (formerly Wal-Mart Stores, Inc.) is an American multinational retail corporation that operates a chain of hypermarkets (also called supercenters), discount department stores, and grocery stores in the United States, headquartered in Bentonville, Arkansas.[10] The company was founded by brothers Sam and James \"Bud\" Walton in nearby Rogers, Arkansas in 1962 and incorporated under Delaware General Corporation Law on October 31, 1969. It also owns and operates Sam's Club retail warehouses.[11][12]\n\nAs of October 31, 2022, Walmart has 10,586 stores and clubs in 24 countries, operating under 46 different names.[2][3][4] The company operates under the name Walmart in the United States and Canada, as Walmart de M\u00e9xico y Centroam\u00e9rica in Mexico and Central America, and as Flipkart Wholesale in India.\n\n**Output:**\n```json\n{{\n \"entities_and_triples\": [\n \"[1], ORGANIZATION:COMPANY:Walmart Inc.\",\n \"[2], ORGANIZATION:COMPANY:Wal-Mart Stores, Inc.\",\n \"[1] ALIAS [2]\",\n \"[3], LOCATION:COUNTRY:United States\",\n \"[1] OPERATES_IN [3]\",\n \"[4], LOCATION:CITY:Bentonville\",\n \"[1] HEADQUARTERED_IN [4]\",\n \"[5], LOCATION:STATE:Arkansas\",\n \"[1] HEADQUARTERED_IN [5]\",\n \"[6], PERSON:Sam Walton\",\n \"[1] FOUNDED_BY [6]\",\n \"[7], PERSON:James Walton\",\n \"[8], PERSON:Bud Walton\",\n \"[7] ALIAS [8]\",\n \"[1] FOUNDED_BY [7]\",\n \"[9], LOCATION:CITY:Rogers\",\n \"[10], DATE:YEAR:1962\",\n \"[11], EVENT:Incorporated under Delaware General Corporation Law\",\n \"[1] INCORPORATED [11]\",\n \"[12], DATE:DAY:October 31\",\n \"[1] INCORPORATED [12]\",\n \"[13], DATE:YEAR:1969\",\n \"[1] INCORPORATED [13]\",\n \"[14], ORGANIZATION:COMPANY:Sam's Club\",\n \"[1] INCORPORATED [14]\",\n \"[15], DATE:DAY:October 31, 2022\",\n \"[16], QUANTITY:10,586 stores and clubs\",\n \"[16] AS_OF [15]\",\n \"[1] HAS_QUANTITY [16]\",\n \"[17], QUANTITY:24 countries\",\n \"[18], QUANTITY:46 different names\",\n \"[1] HAS_QUANTITY [18]\",\n \"[18], ORGANIZATION:COMPANY:Walmart de M\u00e9xico y Centroam\u00e9rica\",\n \"[1] ALIAS [18]\",\n \"[19], LOCATION:COUNTRY:Mexico\",\n \"[1] OPERATES_IN [19]\",\n \"[20], LOCATION:REGION:Central America\",\n \"[1] OPERATES_IN [20]\",\n \"[21], 
ORGANIZATION:COMPANY:Flipkart Wholesale\",\n \"[1] ALIAS [21]\",\n \"[22], LOCATION:COUNTRY:India\",\n \"[1] OPERATES_IN [22]\"\n ]\n}}\n```\n\n### Task:\nYour task is to perform Named Entity Recognition (NER) and knowledge graph triplet extraction on the text that follows below. Use the provided entities and predicates as shown\n\n**Entity Types:**\n{entity_types}\n\n**Predicates:**\n{relations}\n\n**Input:**\n{input}\n\n**Output:**\n", "input_types": {"entity_types": "str", "relations": "str", "input" : "str"}} +{"name": "zero_shot_ner_kg_extraction", "template": "Perform Named Entity Recognition (NER) and extract knowledge graph triplets from the text. NER identifies named entities of given entity types, and triple extraction identifies relationships between entities using specified predicates.\n\n**Entity Types**:\n\n[\"PERSON\", \"ORGANIZATION\", \"LOCATION\", \"DATE\", \"TIME\", \"MONEY\", \"PERCENTAGE\", \"PRODUCT\", \"EVENT\", \"LANGUAGE\", \"NATIONALITY\", \"RELIGION\", \"TITLE\", \"PROFESSION\", \"ANIMAL\", \"PLANT\", \"DISEASE\", \"MEDICATION\", \"CHEMICAL\", \"MATERIAL\", \"COLOR\", \"SHAPE\", \"MEASUREMENT\", \"WEATHER\", \"NATURAL_DISASTER\", \"AWARD\", \"LAW\", \"CRIME\", \"TECHNOLOGY\", \"SOFTWARE\", \"HARDWARE\", \"VEHICLE\", \"FOOD\", \"DRINK\", \"SPORT\", \"MUSIC_GENRE\", \"INSTRUMENT\", \"ARTWORK\", \"BOOK\", \"MOVIE\", \"TV_SHOW\", \"ACADEMIC_SUBJECT\", \"SCIENTIFIC_THEORY\", \"POLITICAL_PARTY\", \"CURRENCY\", \"STOCK_SYMBOL\", \"FILE_TYPE\", \"PROGRAMMING_LANGUAGE\", \"MEDICAL_PROCEDURE\", \"CELESTIAL_BODY\"]\n\n**Predicates**\n[\"IS_EMPLOYED_BY\", \"LIVES_IN\", \"BORN_IN\", \"DIED_IN\", \"FOUNDED\", \"INVENTED\", \"WROTE\", \"DIRECTED\", \"STARRED_IN\", \"MARRIED_TO\", \"PARENT_OF\", \"CHILD_OF\", \"SIBLING_OF\", \"MEMBER_OF\", \"OWNER_OF\", \"CEO_OF\", \"STUDIED_AT\", \"GRADUATED_FROM\", \"TEACHES_AT\", \"SPEAKS\", \"CAPITAL_OF\", \"LOCATED_IN\", \"PART_OF\", \"CONTAINS\", \"PRODUCES\", \"CONSUMES\", \"EXPORTS\", \"IMPORTS\", \"ALLIES_WITH\", \"CONFLICTS_WITH\", \"PREDECESSOR_OF\", \"SUCCESSOR_OF\", \"DISCOVERED\", \"DEVELOPED\", \"FUNDED_BY\", \"INVESTED_IN\", \"COLLABORATES_WITH\", \"COMPETES_WITH\", \"ACQUIRED\", \"MERGED_WITH\", \"SPECIALIZES_IN\", \"PERFORMS\", \"AFFECTS\", \"CAUSES\", \"PREVENTS\", \"TREATS\", \"SYMPTOMS_OF\", \"BELONGS_TO\", \"DERIVED_FROM\", \"MEASURED_IN\"]\n\n**Text**{input}", "input_types": {"input" : "str"}} +{"name": "zero_shot_ner_kg_extraction_with_spec", "template": "Perform Named Entity Recognition (NER) and extract knowledge graph triplets from the text. NER identifies named entities of given entity types, and triple extraction identifies relationships between entities using specified predicates.\n\n**Entity Types**\n{entity_types}\n\n**Predicates**\n{relations}\n\n**Text**{input}", "input_types": {"entity_types": "str", "relations": "str", "input" : "str"}} +{"name": "kg_agent", "template": "**System Message:**\n\nYou are an AI assistant capable of generating Cypher queries to interact with a Neo4j knowledge graph. The knowledge graph contains information about organizations, people, locations, and their relationships, such as founders of companies, locations of companies, and products associated with companies.\n\n**Instructions:**\n\nWhen a user asks a question, you will generate a Cypher query to retrieve the relevant information from the Neo4j knowledge graph. Later, you will be given a schema which specifies the available relationships to help you construct the query. 
First, review the examples provided to understand the expected format of the queries.\n\n### Example(s) - User Questions and Cypher Queries for an Academic Knowledge Graph\n\n**User Question:**\n\"List all courses available in the computer science department.\"\n\n**Generated Cypher Query:**\n```cypher\nMATCH (c:COURSE)-[:OFFERED_BY]->(d:DEPARTMENT)\nWHERE d.name CONTAINS 'Computer Science'\nRETURN c.id AS Course, d.name AS Department\nORDER BY c.id;\n```\n\n**User Question:**\n\"Retrieve all courses taught by professors who have published research on natural language processing.\"\n\n**Generated Cypher Query:**\n```cypher\nMATCH (pr:PERSON)-[:PUBLISHED]->(p:PAPER)\nMATCH (p)-[:TOPIC]->(t:TOPIC)\nWHERE t.name CONTAINS 'Natural Language Processing'\nMATCH (c:COURSE)-[:TAUGHT_BY]->(pr)\nRETURN DISTINCT c.id AS Course, pr.name AS Professor, t.name AS Topic\nORDER BY c.id;\n```\n\n\n### Example(s) - User Questions and Cypher Queries for an Historical Events and Figures\n\n**User Question:**\n\"List all battles that occurred in the 19th century and the generals who participated in them.\"\n\n**Generated Cypher Query:**\n```cypher\nMATCH (b:EVENT)-[:HAPPENED_AT]->(d:DATE)\nWHERE d.year >= 1800 AND d.year < 1900 AND b.type CONTAINS 'Battle'\nMATCH (g:PERSON)-[:PARTICIPATED_IN]->(b)\nRETURN b.name AS Battle, d.year AS Year, g.name AS General\nORDER BY d.year, b.name, g.name;\n```\n\n**User Question:**\n\"Find all treaties signed in Paris and the countries involved.\"\n\n\n**Generated Cypher Query:**\n```cypher\nMATCH (t:EVENT)-[:HAPPENED_AT]->(l:LOCATION)\nWHERE l.name CONTAINS 'Paris' AND t.type CONTAINS 'Treaty'\nMATCH (c:ORGANIZATION)-[:SIGNED]->(t)\nRETURN t.name AS Treaty, l.name AS Location, c.name AS Country\nORDER BY t.name, c.name;\n```\n\n\nNow, you will be provided with a schema for the entities and relationships in the Neo4j knowledge graph. Use this schema to construct Cypher queries based on user questions.\n\n- **Entities:**\n - `ORGANIZATION` (e.g.: `COMPANY`, `SCHOOL`, `NON-PROFIT`, `OTHER`)\n - `COMPANY`\n - `LOCATION` (e.g.: `CITY`, `STATE`, `COUNTRY`, `OTHER`)\n - `DATE` (e.g.: `YEAR`, `MONTH`, `DAY`, `BATCH`, `OTHER`)\n - `QUANTITY`\n - `EVENT` (e.g.: `INCORPORATION`, `FUNDING_ROUND`, `ACQUISITION`, `LAUNCH`, `OTHER`)\n\n- **Relationships:**\n - `FOUNDED_BY`\n - `HEADQUARTERED_IN`\n - `OPERATES_IN`\n - `RAISED`\n - `ACQUIRED_BY`\n - `HAS_EMPLOYEE_COUNT`\n - `GENERATED_REVENUE`\n - `LISTED_ON`\n - `INCORPORATED`\n - `HAS_DIVISION`\n - `ANNOUNCED`\n - `HAS_QUANTITY`\n\nUse the referenced examples and schema to help you construct an appropriate Cypher query based on the following question:\n\n**User Question:**\n{input}\n\n**Generated Cypher Query:**\n", "input_types": {"input" : "str"}} +{"name": "kg_agent_with_spec", "template": "**System Message:**\n\nYou are an AI assistant capable of generating Cypher queries to interact with a Neo4j knowledge graph. The knowledge graph contains information about organizations, people, locations, and their relationships, such as founders of companies, locations of companies, and products associated with companies.\n\n**Instructions:**\n\nWhen a user asks a question, you will generate a Cypher query to retrieve the relevant information from the Neo4j knowledge graph. Later, you will be given a schema which specifies the available relationships to help you construct the query. 
First, review the examples provided to understand the expected format of the queries.\n\n### Example(s) - User Questions and Cypher Queries for an Academic Knowledge Graph\n\n**User Question:**\n\"List all courses available in the computer science department.\"\n\n**Generated Cypher Query:**\n```cypher\nMATCH (c:COURSE)-[:OFFERED_BY]->(d:DEPARTMENT)\nWHERE d.name CONTAINS 'Computer Science'\nRETURN c.id AS Course, d.name AS Department\nORDER BY c.id;\n```\n\n**User Question:**\n\"Retrieve all courses taught by professors who have published research on natural language processing.\"\n\n**Generated Cypher Query:**\n```cypher\nMATCH (pr:PERSON)-[:PUBLISHED]->(p:PAPER)\nMATCH (p)-[:TOPIC]->(t:TOPIC)\nWHERE t.name CONTAINS 'Natural Language Processing'\nMATCH (c:COURSE)-[:TAUGHT_BY]->(pr)\nRETURN DISTINCT c.id AS Course, pr.name AS Professor, t.name AS Topic\nORDER BY c.id;\n```\n\n\n### Example(s) - User Questions and Cypher Queries for an Historical Events and Figures\n\n**User Question:**\n\"List all battles that occurred in the 19th century and the generals who participated in them.\"\n\n**Generated Cypher Query:**\n```cypher\nMATCH (b:EVENT)-[:HAPPENED_AT]->(d:DATE)\nWHERE d.year >= 1800 AND d.year < 1900 AND b.type CONTAINS 'Battle'\nMATCH (g:PERSON)-[:PARTICIPATED_IN]->(b)\nRETURN b.name AS Battle, d.year AS Year, g.name AS General\nORDER BY d.year, b.name, g.name;\n```\n\n**User Question:**\n\"Find all treaties signed in Paris and the countries involved.\"\n\n\n**Generated Cypher Query:**\n```cypher\nMATCH (t:EVENT)-[:HAPPENED_AT]->(l:LOCATION)\nWHERE l.name CONTAINS 'Paris' AND t.type CONTAINS 'Treaty'\nMATCH (c:ORGANIZATION)-[:SIGNED]->(t)\nRETURN t.name AS Treaty, l.name AS Location, c.name AS Country\nORDER BY t.name, c.name;\n```\n\n\nNow, you will be provided with a schema for the entities and relationships in the Neo4j knowledge graph. Use this schema to construct Cypher queries based on user questions.\n\n- **Entities:**\n{entity_types}\n\n- **Relationships:**\n{relations}\n\nUse the referenced examples and schema to help you construct an appropriate Cypher query based on the following question:\n\n**User Question:**\n{input}\n\n**Generated Cypher Query:**\n", "input_types": {"entity_types": "str", "relations": "str", "input" : "str"}}
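The JSONL entries above define named prompt templates with typed inputs; the `R2RPromptProvider` introduced in the next file of this changeset loads this file and renders a template on demand. Below is a minimal, illustrative sketch of that flow (not part of the diff): the provider class, the `"hyde"` prompt name, and its `num_outputs`/`message` inputs come from the sources above, while the example query string is invented for demonstration.

```python
# Illustrative usage sketch, assuming the r2r package from this changeset is installed.
from r2r.prompts import R2RPromptProvider

# With no file_path argument, the provider loads local/defaults.jsonl (the file above).
provider = R2RPromptProvider()

# Render the "hyde" query-transform template using its declared input_types
# ({"num_outputs": "int", "message": "str"}); the message here is a made-up example.
prompt = provider.get_prompt(
    "hyde",
    inputs={
        "num_outputs": 3,
        "message": "How do the key themes of Great Gatsby compare with 1984?",
    },
)
print(prompt)
```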
\ No newline at end of file diff --git a/R2R/r2r/prompts/local/r2r_prompt_provider.py b/R2R/r2r/prompts/local/r2r_prompt_provider.py new file mode 100755 index 00000000..830c4203 --- /dev/null +++ b/R2R/r2r/prompts/local/r2r_prompt_provider.py @@ -0,0 +1,69 @@ +import json +import logging +import os +from typing import Any, Optional + +from r2r.base import Prompt, PromptProvider + +logger = logging.getLogger(__name__) + + +class R2RPromptProvider(PromptProvider): + def __init__(self, file_path: Optional[str] = None): + self.prompts: dict[str, Prompt] = {} + self._load_prompts_from_jsonl(file_path=file_path) + + def _load_prompts_from_jsonl(self, file_path: Optional[str] = None): + if not file_path: + file_path = os.path.join( + os.path.dirname(__file__), "defaults.jsonl" + ) + try: + with open(file_path, "r") as file: + for line in file: + if line.strip(): + data = json.loads(line) + self.add_prompt( + data["name"], + data["template"], + data.get("input_types", {}), + ) + except json.JSONDecodeError as e: + error_msg = f"Error loading prompts from JSONL file: {e}" + logger.error(error_msg) + raise ValueError(error_msg) + + def add_prompt( + self, name: str, template: str, input_types: dict[str, str] + ) -> None: + if name in self.prompts: + raise ValueError(f"Prompt '{name}' already exists.") + self.prompts[name] = Prompt( + name=name, template=template, input_types=input_types + ) + + def get_prompt( + self, prompt_name: str, inputs: Optional[dict[str, Any]] = None + ) -> str: + if prompt_name not in self.prompts: + raise ValueError(f"Prompt '{prompt_name}' not found.") + prompt = self.prompts[prompt_name] + if inputs is None: + return prompt.template + return prompt.format_prompt(inputs) + + def update_prompt( + self, + name: str, + template: Optional[str] = None, + input_types: Optional[dict[str, str]] = None, + ) -> None: + if name not in self.prompts: + raise ValueError(f"Prompt '{name}' not found.") + if template: + self.prompts[name].template = template + if input_types: + self.prompts[name].input_types = input_types + + def get_all_prompts(self) -> dict[str, Prompt]: + return self.prompts diff --git a/R2R/r2r/providers/__init__.py b/R2R/r2r/providers/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/providers/__init__.py diff --git a/R2R/r2r/providers/embeddings/__init__.py b/R2R/r2r/providers/embeddings/__init__.py new file mode 100755 index 00000000..6b0c8b83 --- /dev/null +++ b/R2R/r2r/providers/embeddings/__init__.py @@ -0,0 +1,11 @@ +from .ollama.ollama_base import OllamaEmbeddingProvider +from .openai.openai_base import OpenAIEmbeddingProvider +from .sentence_transformer.sentence_transformer_base import ( + SentenceTransformerEmbeddingProvider, +) + +__all__ = [ + "OllamaEmbeddingProvider", + "OpenAIEmbeddingProvider", + "SentenceTransformerEmbeddingProvider", +] diff --git a/R2R/r2r/providers/embeddings/ollama/ollama_base.py b/R2R/r2r/providers/embeddings/ollama/ollama_base.py new file mode 100755 index 00000000..31a8c717 --- /dev/null +++ b/R2R/r2r/providers/embeddings/ollama/ollama_base.py @@ -0,0 +1,156 @@ +import asyncio +import logging +import os +import random +from typing import Any + +from ollama import AsyncClient, Client + +from r2r.base import EmbeddingConfig, EmbeddingProvider, VectorSearchResult + +logger = logging.getLogger(__name__) + + +class OllamaEmbeddingProvider(EmbeddingProvider): + def __init__(self, config: EmbeddingConfig): + super().__init__(config) + provider = config.provider + if not provider: + raise 
ValueError( + "Must set provider in order to initialize `OllamaEmbeddingProvider`." + ) + if provider != "ollama": + raise ValueError( + "OllamaEmbeddingProvider must be initialized with provider `ollama`." + ) + if config.rerank_model: + raise ValueError( + "OllamaEmbeddingProvider does not support separate reranking." + ) + + self.base_model = config.base_model + self.base_dimension = config.base_dimension + self.base_url = os.getenv("OLLAMA_API_BASE") + logger.info( + f"Using Ollama API base URL: {self.base_url or 'http://127.0.0.1:11434'}" + ) + self.client = Client(host=self.base_url) + self.aclient = AsyncClient(host=self.base_url) + + self.request_queue = asyncio.Queue() + self.max_retries = 2 + self.initial_backoff = 1 + self.max_backoff = 60 + self.concurrency_limit = 10 + self.semaphore = asyncio.Semaphore(self.concurrency_limit) + + async def process_queue(self): + while True: + task = await self.request_queue.get() + try: + result = await self.execute_task_with_backoff(task) + task["future"].set_result(result) + except Exception as e: + task["future"].set_exception(e) + finally: + self.request_queue.task_done() + + async def execute_task_with_backoff(self, task: dict[str, Any]): + retries = 0 + backoff = self.initial_backoff + while retries < self.max_retries: + try: + async with self.semaphore: + response = await asyncio.wait_for( + self.aclient.embeddings( + prompt=task["text"], model=self.base_model + ), + timeout=30, + ) + return response["embedding"] + except Exception as e: + logger.warning( + f"Request failed (attempt {retries + 1}): {str(e)}" + ) + retries += 1 + if retries == self.max_retries: + raise Exception( + f"Max retries reached. Last error: {str(e)}" + ) + await asyncio.sleep(backoff + random.uniform(0, 1)) + backoff = min(backoff * 2, self.max_backoff) + + def get_embedding( + self, + text: str, + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE, + ) -> list[float]: + if stage != EmbeddingProvider.PipeStage.BASE: + raise ValueError( + "OllamaEmbeddingProvider only supports search stage." + ) + + try: + response = self.client.embeddings( + prompt=text, model=self.base_model + ) + return response["embedding"] + except Exception as e: + logger.error(f"Error getting embedding: {str(e)}") + raise + + def get_embeddings( + self, + texts: list[str], + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE, + ) -> list[list[float]]: + return [self.get_embedding(text, stage) for text in texts] + + async def async_get_embeddings( + self, + texts: list[str], + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE, + ) -> list[list[float]]: + if stage != EmbeddingProvider.PipeStage.BASE: + raise ValueError( + "OllamaEmbeddingProvider only supports search stage." + ) + + queue_processor = asyncio.create_task(self.process_queue()) + futures = [] + for text in texts: + future = asyncio.Future() + await self.request_queue.put({"text": text, "future": future}) + futures.append(future) + + try: + results = await asyncio.gather(*futures, return_exceptions=True) + # Check if any result is an exception and raise it + exceptions = set([r for r in results if isinstance(r, Exception)]) + if exceptions: + raise Exception( + f"Embedding generation failed for one or more embeddings." 
+ ) + return results + except Exception as e: + logger.error(f"Embedding generation failed: {str(e)}") + raise + finally: + await self.request_queue.join() + queue_processor.cancel() + + def rerank( + self, + query: str, + results: list[VectorSearchResult], + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.RERANK, + limit: int = 10, + ) -> list[VectorSearchResult]: + return results[:limit] + + def tokenize_string( + self, text: str, model: str, stage: EmbeddingProvider.PipeStage + ) -> list[int]: + raise NotImplementedError( + "Tokenization is not supported by OllamaEmbeddingProvider." + ) diff --git a/R2R/r2r/providers/embeddings/openai/openai_base.py b/R2R/r2r/providers/embeddings/openai/openai_base.py new file mode 100755 index 00000000..7e7d32aa --- /dev/null +++ b/R2R/r2r/providers/embeddings/openai/openai_base.py @@ -0,0 +1,200 @@ +import logging +import os + +from openai import AsyncOpenAI, AuthenticationError, OpenAI + +from r2r.base import EmbeddingConfig, EmbeddingProvider, VectorSearchResult + +logger = logging.getLogger(__name__) + + +class OpenAIEmbeddingProvider(EmbeddingProvider): + MODEL_TO_TOKENIZER = { + "text-embedding-ada-002": "cl100k_base", + "text-embedding-3-small": "cl100k_base", + "text-embedding-3-large": "cl100k_base", + } + MODEL_TO_DIMENSIONS = { + "text-embedding-ada-002": [1536], + "text-embedding-3-small": [512, 1536], + "text-embedding-3-large": [256, 1024, 3072], + } + + def __init__(self, config: EmbeddingConfig): + super().__init__(config) + provider = config.provider + if not provider: + raise ValueError( + "Must set provider in order to initialize OpenAIEmbeddingProvider." + ) + + if provider != "openai": + raise ValueError( + "OpenAIEmbeddingProvider must be initialized with provider `openai`." + ) + if not os.getenv("OPENAI_API_KEY"): + raise ValueError( + "Must set OPENAI_API_KEY in order to initialize OpenAIEmbeddingProvider." + ) + self.client = OpenAI() + self.async_client = AsyncOpenAI() + + if config.rerank_model: + raise ValueError( + "OpenAIEmbeddingProvider does not support separate reranking." + ) + self.base_model = config.base_model + self.base_dimension = config.base_dimension + + if self.base_model not in OpenAIEmbeddingProvider.MODEL_TO_TOKENIZER: + raise ValueError( + f"OpenAI embedding model {self.base_model} not supported." + ) + if ( + self.base_dimension + and self.base_dimension + not in OpenAIEmbeddingProvider.MODEL_TO_DIMENSIONS[self.base_model] + ): + raise ValueError( + f"Dimensions {self.dimension} for {self.base_model} are not supported" + ) + + if not self.base_model or not self.base_dimension: + raise ValueError( + "Must set base_model and base_dimension in order to initialize OpenAIEmbeddingProvider." + ) + + if config.rerank_model: + raise ValueError( + "OpenAIEmbeddingProvider does not support separate reranking." + ) + + def get_embedding( + self, + text: str, + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE, + ) -> list[float]: + if stage != EmbeddingProvider.PipeStage.BASE: + raise ValueError( + "OpenAIEmbeddingProvider only supports search stage." + ) + + try: + return ( + self.client.embeddings.create( + input=[text], + model=self.base_model, + dimensions=self.base_dimension + or OpenAIEmbeddingProvider.MODEL_TO_DIMENSIONS[ + self.base_model + ][-1], + ) + .data[0] + .embedding + ) + except AuthenticationError as e: + raise ValueError( + "Invalid OpenAI API key provided. Please check your OPENAI_API_KEY environment variable." 
+ ) from e + + async def async_get_embedding( + self, + text: str, + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE, + ) -> list[float]: + if stage != EmbeddingProvider.PipeStage.BASE: + raise ValueError( + "OpenAIEmbeddingProvider only supports search stage." + ) + + try: + response = await self.async_client.embeddings.create( + input=[text], + model=self.base_model, + dimensions=self.base_dimension + or OpenAIEmbeddingProvider.MODEL_TO_DIMENSIONS[ + self.base_model + ][-1], + ) + return response.data[0].embedding + except AuthenticationError as e: + raise ValueError( + "Invalid OpenAI API key provided. Please check your OPENAI_API_KEY environment variable." + ) from e + + def get_embeddings( + self, + texts: list[str], + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE, + ) -> list[list[float]]: + if stage != EmbeddingProvider.PipeStage.BASE: + raise ValueError( + "OpenAIEmbeddingProvider only supports search stage." + ) + + try: + return [ + ele.embedding + for ele in self.client.embeddings.create( + input=texts, + model=self.base_model, + dimensions=self.base_dimension + or OpenAIEmbeddingProvider.MODEL_TO_DIMENSIONS[ + self.base_model + ][-1], + ).data + ] + except AuthenticationError as e: + raise ValueError( + "Invalid OpenAI API key provided. Please check your OPENAI_API_KEY environment variable." + ) from e + + async def async_get_embeddings( + self, + texts: list[str], + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE, + ) -> list[list[float]]: + if stage != EmbeddingProvider.PipeStage.BASE: + raise ValueError( + "OpenAIEmbeddingProvider only supports search stage." + ) + + try: + response = await self.async_client.embeddings.create( + input=texts, + model=self.base_model, + dimensions=self.base_dimension + or OpenAIEmbeddingProvider.MODEL_TO_DIMENSIONS[ + self.base_model + ][-1], + ) + return [ele.embedding for ele in response.data] + except AuthenticationError as e: + raise ValueError( + "Invalid OpenAI API key provided. Please check your OPENAI_API_KEY environment variable." + ) from e + + def rerank( + self, + query: str, + results: list[VectorSearchResult], + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.RERANK, + limit: int = 10, + ): + return results[:limit] + + def tokenize_string(self, text: str, model: str) -> list[int]: + try: + import tiktoken + except ImportError: + raise ValueError( + "Must download tiktoken library to run `tokenize_string`." 
+ ) + # tiktoken encoding - + # cl100k_base - gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large + if model not in OpenAIEmbeddingProvider.MODEL_TO_TOKENIZER: + raise ValueError(f"OpenAI embedding model {model} not supported.") + encoding = tiktoken.get_encoding( + OpenAIEmbeddingProvider.MODEL_TO_TOKENIZER[model] + ) + return encoding.encode(text) diff --git a/R2R/r2r/providers/embeddings/sentence_transformer/sentence_transformer_base.py b/R2R/r2r/providers/embeddings/sentence_transformer/sentence_transformer_base.py new file mode 100755 index 00000000..3316cb60 --- /dev/null +++ b/R2R/r2r/providers/embeddings/sentence_transformer/sentence_transformer_base.py @@ -0,0 +1,160 @@ +import logging + +from r2r.base import EmbeddingConfig, EmbeddingProvider, VectorSearchResult + +logger = logging.getLogger(__name__) + + +class SentenceTransformerEmbeddingProvider(EmbeddingProvider): + def __init__( + self, + config: EmbeddingConfig, + ): + super().__init__(config) + logger.info( + "Initializing `SentenceTransformerEmbeddingProvider` with separate models for search and rerank." + ) + provider = config.provider + if not provider: + raise ValueError( + "Must set provider in order to initialize SentenceTransformerEmbeddingProvider." + ) + if provider != "sentence-transformers": + raise ValueError( + "SentenceTransformerEmbeddingProvider must be initialized with provider `sentence-transformers`." + ) + try: + from sentence_transformers import CrossEncoder, SentenceTransformer + + self.SentenceTransformer = SentenceTransformer + # TODO - Modify this to be configurable, as `bge-reranker-large` is a `SentenceTransformer` model + self.CrossEncoder = CrossEncoder + except ImportError as e: + raise ValueError( + "Must download sentence-transformers library to run `SentenceTransformerEmbeddingProvider`." + ) from e + + # Initialize separate models for search and rerank + self.do_search = False + self.do_rerank = False + + self.search_encoder = self._init_model( + config, EmbeddingProvider.PipeStage.BASE + ) + self.rerank_encoder = self._init_model( + config, EmbeddingProvider.PipeStage.RERANK + ) + + def _init_model(self, config: EmbeddingConfig, stage: str): + stage_name = stage.name.lower() + model = config.dict().get(f"{stage_name}_model", None) + dimension = config.dict().get(f"{stage_name}_dimension", None) + + transformer_type = config.dict().get( + f"{stage_name}_transformer_type", "SentenceTransformer" + ) + + if stage == EmbeddingProvider.PipeStage.BASE: + self.do_search = True + # Check if a model is set for the stage + if not (model and dimension and transformer_type): + raise ValueError( + f"Must set {stage.name.lower()}_model and {stage.name.lower()}_dimension for {stage} stage in order to initialize SentenceTransformerEmbeddingProvider." + ) + + if stage == EmbeddingProvider.PipeStage.RERANK: + # Check if a model is set for the stage + if not (model and dimension and transformer_type): + return None + + self.do_rerank = True + if transformer_type == "SentenceTransformer": + raise ValueError( + f"`SentenceTransformer` models are not yet supported for {stage} stage in SentenceTransformerEmbeddingProvider." 
+ ) + + # Save the model_key and dimension into instance variables + setattr(self, f"{stage_name}_model", model) + setattr(self, f"{stage_name}_dimension", dimension) + setattr(self, f"{stage_name}_transformer_type", transformer_type) + + # Initialize the model + encoder = ( + self.SentenceTransformer( + model, truncate_dim=dimension, trust_remote_code=True + ) + if transformer_type == "SentenceTransformer" + else self.CrossEncoder(model, trust_remote_code=True) + ) + return encoder + + def get_embedding( + self, + text: str, + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE, + ) -> list[float]: + if stage != EmbeddingProvider.PipeStage.BASE: + raise ValueError("`get_embedding` only supports `SEARCH` stage.") + if not self.do_search: + raise ValueError( + "`get_embedding` can only be called for the search stage if a search model is set." + ) + encoder = self.search_encoder + return encoder.encode([text]).tolist()[0] + + def get_embeddings( + self, + texts: list[str], + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE, + ) -> list[list[float]]: + if stage != EmbeddingProvider.PipeStage.BASE: + raise ValueError("`get_embeddings` only supports `SEARCH` stage.") + if not self.do_search: + raise ValueError( + "`get_embeddings` can only be called for the search stage if a search model is set." + ) + encoder = ( + self.search_encoder + if stage == EmbeddingProvider.PipeStage.BASE + else self.rerank_encoder + ) + return encoder.encode(texts).tolist() + + def rerank( + self, + query: str, + results: list[VectorSearchResult], + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.RERANK, + limit: int = 10, + ) -> list[VectorSearchResult]: + if stage != EmbeddingProvider.PipeStage.RERANK: + raise ValueError("`rerank` only supports `RERANK` stage.") + if not self.do_rerank: + return results[:limit] + + from copy import copy + + texts = copy([doc.metadata["text"] for doc in results]) + # Use the rank method from the rerank_encoder, which is a CrossEncoder model + reranked_scores = self.rerank_encoder.rank( + query, texts, return_documents=False, top_k=limit + ) + # Map the reranked scores back to the original documents + reranked_results = [] + for score in reranked_scores: + corpus_id = score["corpus_id"] + new_result = results[corpus_id] + new_result.score = float(score["score"]) + reranked_results.append(new_result) + + # Sort the documents by the new scores in descending order + reranked_results.sort(key=lambda doc: doc.score, reverse=True) + return reranked_results + + def tokenize_string( + self, + stage: EmbeddingProvider.PipeStage = EmbeddingProvider.PipeStage.BASE, + ) -> list[int]: + raise ValueError( + "SentenceTransformerEmbeddingProvider does not support tokenize_string." 
+ ) diff --git a/R2R/r2r/providers/eval/__init__.py b/R2R/r2r/providers/eval/__init__.py new file mode 100755 index 00000000..3f5e1b51 --- /dev/null +++ b/R2R/r2r/providers/eval/__init__.py @@ -0,0 +1,3 @@ +from .llm.base_llm_eval import LLMEvalProvider + +__all__ = ["LLMEvalProvider"] diff --git a/R2R/r2r/providers/eval/llm/base_llm_eval.py b/R2R/r2r/providers/eval/llm/base_llm_eval.py new file mode 100755 index 00000000..7c573a34 --- /dev/null +++ b/R2R/r2r/providers/eval/llm/base_llm_eval.py @@ -0,0 +1,84 @@ +from fractions import Fraction +from typing import Union + +from r2r import EvalConfig, EvalProvider, LLMProvider, PromptProvider +from r2r.base.abstractions.llm import GenerationConfig + + +class LLMEvalProvider(EvalProvider): + def __init__( + self, + config: EvalConfig, + llm_provider: LLMProvider, + prompt_provider: PromptProvider, + ): + super().__init__(config) + + self.llm_provider = llm_provider + self.prompt_provider = prompt_provider + + def _calc_query_context_relevancy(self, query: str, context: str) -> float: + system_prompt = self.prompt_provider.get_prompt("default_system") + eval_prompt = self.prompt_provider.get_prompt( + "rag_context_eval", {"query": query, "context": context} + ) + response = self.llm_provider.get_completion( + self.prompt_provider._get_message_payload( + system_prompt, eval_prompt + ), + self.eval_generation_config, + ) + response_text = response.choices[0].message.content + fraction = ( + response_text + # Get the fraction in the returned tuple + .split(",")[-1][:-1] + # Remove any quotes and spaces + .replace("'", "") + .replace('"', "") + .strip() + ) + return float(Fraction(fraction)) + + def _calc_answer_grounding( + self, query: str, context: str, answer: str + ) -> float: + system_prompt = self.prompt_provider.get_prompt("default_system") + eval_prompt = self.prompt_provider.get_prompt( + "rag_answer_eval", + {"query": query, "context": context, "answer": answer}, + ) + response = self.llm_provider.get_completion( + self.prompt_provider._get_message_payload( + system_prompt, eval_prompt + ), + self.eval_generation_config, + ) + response_text = response.choices[0].message.content + fraction = ( + response_text + # Get the fraction in the returned tuple + .split(",")[-1][:-1] + # Remove any quotes and spaces + .replace("'", "") + .replace('"', "") + .strip() + ) + return float(Fraction(fraction)) + + def _evaluate( + self, + query: str, + context: str, + answer: str, + eval_generation_config: GenerationConfig, + ) -> dict[str, dict[str, Union[str, float]]]: + self.eval_generation_config = eval_generation_config + query_context_relevancy = self._calc_query_context_relevancy( + query, context + ) + answer_grounding = self._calc_answer_grounding(query, context, answer) + return { + "query_context_relevancy": query_context_relevancy, + "answer_grounding": answer_grounding, + } diff --git a/R2R/r2r/providers/kg/__init__.py b/R2R/r2r/providers/kg/__init__.py new file mode 100755 index 00000000..36bc79a2 --- /dev/null +++ b/R2R/r2r/providers/kg/__init__.py @@ -0,0 +1,3 @@ +from .neo4j.base_neo4j import Neo4jKGProvider + +__all__ = ["Neo4jKGProvider"] diff --git a/R2R/r2r/providers/kg/neo4j/base_neo4j.py b/R2R/r2r/providers/kg/neo4j/base_neo4j.py new file mode 100755 index 00000000..9ede2b85 --- /dev/null +++ b/R2R/r2r/providers/kg/neo4j/base_neo4j.py @@ -0,0 +1,983 @@ +# abstractions are taken from LlamaIndex +# Neo4jKGProvider is almost entirely taken from LlamaIndex Neo4jPropertyGraphStore +# https://github.com/run-llama/llama_index +import json 
+import os +from typing import Any, Dict, List, Optional, Tuple + +from r2r.base import ( + EntityType, + KGConfig, + KGProvider, + PromptProvider, + format_entity_types, + format_relations, +) +from r2r.base.abstractions.llama_abstractions import ( + LIST_LIMIT, + ChunkNode, + EntityNode, + LabelledNode, + PropertyGraphStore, + Relation, + Triplet, + VectorStoreQuery, + clean_string_values, + value_sanitize, +) + + +def remove_empty_values(input_dict): + """ + Remove entries with empty values from the dictionary. + + Parameters: + input_dict (dict): The dictionary from which empty values need to be removed. + + Returns: + dict: A new dictionary with all empty values removed. + """ + # Create a new dictionary excluding empty values + return {key: value for key, value in input_dict.items() if value} + + +BASE_ENTITY_LABEL = "__Entity__" +EXCLUDED_LABELS = ["_Bloom_Perspective_", "_Bloom_Scene_"] +EXCLUDED_RELS = ["_Bloom_HAS_SCENE_"] +EXHAUSTIVE_SEARCH_LIMIT = 10000 +# Threshold for returning all available prop values in graph schema +DISTINCT_VALUE_LIMIT = 10 + +node_properties_query = """ +CALL apoc.meta.data() +YIELD label, other, elementType, type, property +WHERE NOT type = "RELATIONSHIP" AND elementType = "node" + AND NOT label IN $EXCLUDED_LABELS +WITH label AS nodeLabels, collect({property:property, type:type}) AS properties +RETURN {labels: nodeLabels, properties: properties} AS output + +""" + +rel_properties_query = """ +CALL apoc.meta.data() +YIELD label, other, elementType, type, property +WHERE NOT type = "RELATIONSHIP" AND elementType = "relationship" + AND NOT label in $EXCLUDED_LABELS +WITH label AS nodeLabels, collect({property:property, type:type}) AS properties +RETURN {type: nodeLabels, properties: properties} AS output +""" + +rel_query = """ +CALL apoc.meta.data() +YIELD label, other, elementType, type, property +WHERE type = "RELATIONSHIP" AND elementType = "node" +UNWIND other AS other_node +WITH * WHERE NOT label IN $EXCLUDED_LABELS + AND NOT other_node IN $EXCLUDED_LABELS +RETURN {start: label, type: property, end: toString(other_node)} AS output +""" + + +class Neo4jKGProvider(PropertyGraphStore, KGProvider): + r""" + Neo4j Property Graph Store. + + This class implements a Neo4j property graph store. + + If you are using local Neo4j instead of aura, here's a helpful + command for launching the docker container: + + ```bash + docker run \ + -p 7474:7474 -p 7687:7687 \ + -v $PWD/data:/data -v $PWD/plugins:/plugins \ + --name neo4j-apoc \ + -e NEO4J_apoc_export_file_enabled=true \ + -e NEO4J_apoc_import_file_enabled=true \ + -e NEO4J_apoc_import_file_use__neo4j__config=true \ + -e NEO4JLABS_PLUGINS=\\[\"apoc\"\\] \ + neo4j:latest + ``` + + Args: + username (str): The username for the Neo4j database. + password (str): The password for the Neo4j database. + url (str): The URL for the Neo4j database. + database (Optional[str]): The name of the database to connect to. Defaults to "neo4j". 
+ + Examples: + `pip install llama-index-graph-stores-neo4j` + + ```python + from llama_index.core.indices.property_graph import PropertyGraphIndex + from llama_index.graph_stores.neo4j import Neo4jKGProvider + + # Create a Neo4jKGProvider instance + graph_store = Neo4jKGProvider( + username="neo4j", + password="neo4j", + url="bolt://localhost:7687", + database="neo4j" + ) + + # create the index + index = PropertyGraphIndex.from_documents( + documents, + property_graph_store=graph_store, + ) + ``` + """ + + supports_structured_queries: bool = True + supports_vector_queries: bool = True + + def __init__( + self, + config: KGConfig, + refresh_schema: bool = True, + sanitize_query_output: bool = True, + enhanced_schema: bool = False, + *args: Any, + **kwargs: Any, + ) -> None: + if config.provider != "neo4j": + raise ValueError( + "Neo4jKGProvider must be initialized with config with `neo4j` provider." + ) + + try: + import neo4j + except ImportError: + raise ImportError("Please install neo4j: pip install neo4j") + + username = os.getenv("NEO4J_USER") + password = os.getenv("NEO4J_PASSWORD") + url = os.getenv("NEO4J_URL") + database = os.getenv("NEO4J_DATABASE", "neo4j") + + if not username or not password or not url: + raise ValueError( + "Neo4j configuration values are missing. Please set NEO4J_USER, NEO4J_PASSWORD, and NEO4J_URL environment variables." + ) + + self.sanitize_query_output = sanitize_query_output + self.enhcnaced_schema = enhanced_schema + self._driver = neo4j.GraphDatabase.driver( + url, auth=(username, password), **kwargs + ) + self._async_driver = neo4j.AsyncGraphDatabase.driver( + url, + auth=(username, password), + **kwargs, + ) + self._database = database + self.structured_schema = {} + if refresh_schema: + self.refresh_schema() + self.neo4j = neo4j + self.config = config + + @property + def client(self): + return self._driver + + def refresh_schema(self) -> None: + """Refresh the schema.""" + node_query_results = self.structured_query( + node_properties_query, + param_map={ + "EXCLUDED_LABELS": [*EXCLUDED_LABELS, BASE_ENTITY_LABEL] + }, + ) + node_properties = ( + [el["output"] for el in node_query_results] + if node_query_results + else [] + ) + + rels_query_result = self.structured_query( + rel_properties_query, param_map={"EXCLUDED_LABELS": EXCLUDED_RELS} + ) + rel_properties = ( + [el["output"] for el in rels_query_result] + if rels_query_result + else [] + ) + + rel_objs_query_result = self.structured_query( + rel_query, + param_map={ + "EXCLUDED_LABELS": [*EXCLUDED_LABELS, BASE_ENTITY_LABEL] + }, + ) + relationships = ( + [el["output"] for el in rel_objs_query_result] + if rel_objs_query_result + else [] + ) + + # Get constraints & indexes + try: + constraint = self.structured_query("SHOW CONSTRAINTS") + index = self.structured_query( + "CALL apoc.schema.nodes() YIELD label, properties, type, size, " + "valuesSelectivity WHERE type = 'RANGE' RETURN *, " + "size * valuesSelectivity as distinctValues" + ) + except ( + self.neo4j.exceptions.ClientError + ): # Read-only user might not have access to schema information + constraint = [] + index = [] + + self.structured_schema = { + "node_props": { + el["labels"]: el["properties"] for el in node_properties + }, + "rel_props": { + el["type"]: el["properties"] for el in rel_properties + }, + "relationships": relationships, + "metadata": {"constraint": constraint, "index": index}, + } + schema_counts = self.structured_query( + "CALL apoc.meta.graphSample() YIELD nodes, relationships " + "RETURN nodes, [rel in 
relationships | {name:apoc.any.property" + "(rel, 'type'), count: apoc.any.property(rel, 'count')}]" + " AS relationships" + ) + # Update node info + for node in schema_counts[0].get("nodes", []): + # Skip bloom labels + if node["name"] in EXCLUDED_LABELS: + continue + node_props = self.structured_schema["node_props"].get(node["name"]) + if not node_props: # The node has no properties + continue + enhanced_cypher = self._enhanced_schema_cypher( + node["name"], + node_props, + node["count"] < EXHAUSTIVE_SEARCH_LIMIT, + ) + enhanced_info = self.structured_query(enhanced_cypher)[0]["output"] + for prop in node_props: + if prop["property"] in enhanced_info: + prop.update(enhanced_info[prop["property"]]) + # Update rel info + for rel in schema_counts[0].get("relationships", []): + # Skip bloom labels + if rel["name"] in EXCLUDED_RELS: + continue + rel_props = self.structured_schema["rel_props"].get(rel["name"]) + if not rel_props: # The rel has no properties + continue + enhanced_cypher = self._enhanced_schema_cypher( + rel["name"], + rel_props, + rel["count"] < EXHAUSTIVE_SEARCH_LIMIT, + is_relationship=True, + ) + try: + enhanced_info = self.structured_query(enhanced_cypher)[0][ + "output" + ] + for prop in rel_props: + if prop["property"] in enhanced_info: + prop.update(enhanced_info[prop["property"]]) + except self.neo4j.exceptions.ClientError: + # Sometimes the types are not consistent in the db + pass + + def upsert_nodes(self, nodes: List[LabelledNode]) -> None: + # Lists to hold separated types + entity_dicts: List[dict] = [] + chunk_dicts: List[dict] = [] + + # Sort by type + for item in nodes: + if isinstance(item, EntityNode): + entity_dicts.append({**item.dict(), "id": item.id}) + elif isinstance(item, ChunkNode): + chunk_dicts.append({**item.dict(), "id": item.id}) + else: + # Log that we do not support these types of nodes + # Or raise an error? 
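+                # For now, nodes that are neither EntityNode nor ChunkNode are silently ignored.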
+ pass + + if chunk_dicts: + self.structured_query( + """ + UNWIND $data AS row + MERGE (c:Chunk {id: row.id}) + SET c.text = row.text + WITH c, row + SET c += row.properties + WITH c, row.embedding AS embedding + WHERE embedding IS NOT NULL + CALL db.create.setNodeVectorProperty(c, 'embedding', embedding) + RETURN count(*) + """, + param_map={"data": chunk_dicts}, + ) + + if entity_dicts: + self.structured_query( + """ + UNWIND $data AS row + MERGE (e:`__Entity__` {id: row.id}) + SET e += apoc.map.clean(row.properties, [], []) + SET e.name = row.name + WITH e, row + CALL apoc.create.addLabels(e, [row.label]) + YIELD node + WITH e, row + CALL { + WITH e, row + WITH e, row + WHERE row.embedding IS NOT NULL + CALL db.create.setNodeVectorProperty(e, 'embedding', row.embedding) + RETURN count(*) AS count + } + WITH e, row WHERE row.properties.triplet_source_id IS NOT NULL + MERGE (c:Chunk {id: row.properties.triplet_source_id}) + MERGE (e)<-[:MENTIONS]-(c) + """, + param_map={"data": entity_dicts}, + ) + + def upsert_relations(self, relations: List[Relation]) -> None: + """Add relations.""" + params = [r.dict() for r in relations] + + self.structured_query( + """ + UNWIND $data AS row + MERGE (source {id: row.source_id}) + MERGE (target {id: row.target_id}) + WITH source, target, row + CALL apoc.merge.relationship(source, row.label, {}, row.properties, target) YIELD rel + RETURN count(*) + """, + param_map={"data": params}, + ) + + def get( + self, + properties: Optional[dict] = None, + ids: Optional[List[str]] = None, + ) -> List[LabelledNode]: + """Get nodes.""" + cypher_statement = "MATCH (e) " + + params = {} + if properties or ids: + cypher_statement += "WHERE " + + if ids: + cypher_statement += "e.id in $ids " + params["ids"] = ids + + if properties: + prop_list = [] + for i, prop in enumerate(properties): + prop_list.append(f"e.`{prop}` = $property_{i}") + params[f"property_{i}"] = properties[prop] + cypher_statement += " AND ".join(prop_list) + + return_statement = """ + WITH e + RETURN e.id AS name, + [l in labels(e) WHERE l <> '__Entity__' | l][0] AS type, + e{.* , embedding: Null, id: Null} AS properties + """ + cypher_statement += return_statement + + response = self.structured_query(cypher_statement, param_map=params) + response = response if response else [] + + nodes = [] + for record in response: + # text indicates a chunk node + # none on the type indicates an implicit node, likely a chunk node + if "text" in record["properties"] or record["type"] is None: + text = record["properties"].pop("text", "") + nodes.append( + ChunkNode( + id_=record["name"], + text=text, + properties=remove_empty_values(record["properties"]), + ) + ) + else: + nodes.append( + EntityNode( + name=record["name"], + label=record["type"], + properties=remove_empty_values(record["properties"]), + ) + ) + + return nodes + + def get_triplets( + self, + entity_names: Optional[List[str]] = None, + relation_names: Optional[List[str]] = None, + properties: Optional[dict] = None, + ids: Optional[List[str]] = None, + ) -> List[Triplet]: + # TODO: handle ids of chunk nodes + cypher_statement = "MATCH (e:`__Entity__`) " + + params = {} + if entity_names or properties or ids: + cypher_statement += "WHERE " + + if entity_names: + cypher_statement += "e.name in $entity_names " + params["entity_names"] = entity_names + + if ids: + cypher_statement += "e.id in $ids " + params["ids"] = ids + + if properties: + prop_list = [] + for i, prop in enumerate(properties): + prop_list.append(f"e.`{prop}` = $property_{i}") + 
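+                # Bind each property filter to a numbered query parameter so the Cypher stays parameterized.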
params[f"property_{i}"] = properties[prop] + cypher_statement += " AND ".join(prop_list) + + return_statement = f""" + WITH e + CALL {{ + WITH e + MATCH (e)-[r{':`' + '`|`'.join(relation_names) + '`' if relation_names else ''}]->(t) + RETURN e.name AS source_id, [l in labels(e) WHERE l <> '__Entity__' | l][0] AS source_type, + e{{.* , embedding: Null, name: Null}} AS source_properties, + type(r) AS type, + t.name AS target_id, [l in labels(t) WHERE l <> '__Entity__' | l][0] AS target_type, + t{{.* , embedding: Null, name: Null}} AS target_properties + UNION ALL + WITH e + MATCH (e)<-[r{':`' + '`|`'.join(relation_names) + '`' if relation_names else ''}]-(t) + RETURN t.name AS source_id, [l in labels(t) WHERE l <> '__Entity__' | l][0] AS source_type, + e{{.* , embedding: Null, name: Null}} AS source_properties, + type(r) AS type, + e.name AS target_id, [l in labels(e) WHERE l <> '__Entity__' | l][0] AS target_type, + t{{.* , embedding: Null, name: Null}} AS target_properties + }} + RETURN source_id, source_type, type, target_id, target_type, source_properties, target_properties""" + cypher_statement += return_statement + + data = self.structured_query(cypher_statement, param_map=params) + data = data if data else [] + + triples = [] + for record in data: + source = EntityNode( + name=record["source_id"], + label=record["source_type"], + properties=remove_empty_values(record["source_properties"]), + ) + target = EntityNode( + name=record["target_id"], + label=record["target_type"], + properties=remove_empty_values(record["target_properties"]), + ) + rel = Relation( + source_id=record["source_id"], + target_id=record["target_id"], + label=record["type"], + ) + triples.append([source, rel, target]) + return triples + + def get_rel_map( + self, + graph_nodes: List[LabelledNode], + depth: int = 2, + limit: int = 30, + ignore_rels: Optional[List[str]] = None, + ) -> List[Triplet]: + """Get depth-aware rel map.""" + triples = [] + + ids = [node.id for node in graph_nodes] + # Needs some optimization + response = self.structured_query( + f""" + MATCH (e:`__Entity__`) + WHERE e.id in $ids + MATCH p=(e)-[r*1..{depth}]-(other) + WHERE ALL(rel in relationships(p) WHERE type(rel) <> 'MENTIONS') + UNWIND relationships(p) AS rel + WITH distinct rel + WITH startNode(rel) AS source, + type(rel) AS type, + endNode(rel) AS endNode + RETURN source.id AS source_id, [l in labels(source) WHERE l <> '__Entity__' | l][0] AS source_type, + source{{.* , embedding: Null, id: Null}} AS source_properties, + type, + endNode.id AS target_id, [l in labels(endNode) WHERE l <> '__Entity__' | l][0] AS target_type, + endNode{{.* , embedding: Null, id: Null}} AS target_properties + LIMIT toInteger($limit) + """, + param_map={"ids": ids, "limit": limit}, + ) + response = response if response else [] + + ignore_rels = ignore_rels or [] + for record in response: + if record["type"] in ignore_rels: + continue + + source = EntityNode( + name=record["source_id"], + label=record["source_type"], + properties=remove_empty_values(record["source_properties"]), + ) + target = EntityNode( + name=record["target_id"], + label=record["target_type"], + properties=remove_empty_values(record["target_properties"]), + ) + rel = Relation( + source_id=record["source_id"], + target_id=record["target_id"], + label=record["type"], + ) + triples.append([source, rel, target]) + + return triples + + def structured_query( + self, query: str, param_map: Optional[Dict[str, Any]] = None + ) -> Any: + param_map = param_map or {} + + with 
self._driver.session(database=self._database) as session: + result = session.run(query, param_map) + full_result = [d.data() for d in result] + + if self.sanitize_query_output: + return value_sanitize(full_result) + + return full_result + + def vector_query( + self, query: VectorStoreQuery, **kwargs: Any + ) -> Tuple[List[LabelledNode], List[float]]: + """Query the graph store with a vector store query.""" + data = self.structured_query( + """MATCH (e:`__Entity__`) + WHERE e.embedding IS NOT NULL AND size(e.embedding) = $dimension + WITH e, vector.similarity.cosine(e.embedding, $embedding) AS score + ORDER BY score DESC LIMIT toInteger($limit) + RETURN e.id AS name, + [l in labels(e) WHERE l <> '__Entity__' | l][0] AS type, + e{.* , embedding: Null, name: Null, id: Null} AS properties, + score""", + param_map={ + "embedding": query.query_embedding, + "dimension": len(query.query_embedding), + "limit": query.similarity_top_k, + }, + ) + data = data if data else [] + + nodes = [] + scores = [] + for record in data: + node = EntityNode( + name=record["name"], + label=record["type"], + properties=remove_empty_values(record["properties"]), + ) + nodes.append(node) + scores.append(record["score"]) + + return (nodes, scores) + + def delete( + self, + entity_names: Optional[List[str]] = None, + relation_names: Optional[List[str]] = None, + properties: Optional[dict] = None, + ids: Optional[List[str]] = None, + ) -> None: + """Delete matching data.""" + if entity_names: + self.structured_query( + "MATCH (n) WHERE n.name IN $entity_names DETACH DELETE n", + param_map={"entity_names": entity_names}, + ) + + if ids: + self.structured_query( + "MATCH (n) WHERE n.id IN $ids DETACH DELETE n", + param_map={"ids": ids}, + ) + + if relation_names: + for rel in relation_names: + self.structured_query(f"MATCH ()-[r:`{rel}`]->() DELETE r") + + if properties: + cypher = "MATCH (e) WHERE " + prop_list = [] + params = {} + for i, prop in enumerate(properties): + prop_list.append(f"e.`{prop}` = $property_{i}") + params[f"property_{i}"] = properties[prop] + cypher += " AND ".join(prop_list) + self.structured_query( + cypher + " DETACH DELETE e", param_map=params + ) + + def _enhanced_schema_cypher( + self, + label_or_type: str, + properties: List[Dict[str, Any]], + exhaustive: bool, + is_relationship: bool = False, + ) -> str: + if is_relationship: + match_clause = f"MATCH ()-[n:`{label_or_type}`]->()" + else: + match_clause = f"MATCH (n:`{label_or_type}`)" + + with_clauses = [] + return_clauses = [] + output_dict = {} + if exhaustive: + for prop in properties: + prop_name = prop["property"] + prop_type = prop["type"] + if prop_type == "STRING": + with_clauses.append( + f"collect(distinct substring(toString(n.`{prop_name}`), 0, 50)) " + f"AS `{prop_name}_values`" + ) + return_clauses.append( + f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}]," + f" distinct_count: size(`{prop_name}_values`)" + ) + elif prop_type in [ + "INTEGER", + "FLOAT", + "DATE", + "DATE_TIME", + "LOCAL_DATE_TIME", + ]: + with_clauses.append( + f"min(n.`{prop_name}`) AS `{prop_name}_min`" + ) + with_clauses.append( + f"max(n.`{prop_name}`) AS `{prop_name}_max`" + ) + with_clauses.append( + f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`" + ) + return_clauses.append( + f"min: toString(`{prop_name}_min`), " + f"max: toString(`{prop_name}_max`), " + f"distinct_count: `{prop_name}_distinct`" + ) + elif prop_type == "LIST": + with_clauses.append( + f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, " + 
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`" + ) + return_clauses.append( + f"min_size: `{prop_name}_size_min`, " + f"max_size: `{prop_name}_size_max`" + ) + elif prop_type in ["BOOLEAN", "POINT", "DURATION"]: + continue + output_dict[prop_name] = "{" + return_clauses.pop() + "}" + else: + # Just sample 5 random nodes + match_clause += " WITH n LIMIT 5" + for prop in properties: + prop_name = prop["property"] + prop_type = prop["type"] + + # Check if indexed property, we can still do exhaustive + prop_index = [ + el + for el in self.structured_schema["metadata"]["index"] + if el["label"] == label_or_type + and el["properties"] == [prop_name] + and el["type"] == "RANGE" + ] + if prop_type == "STRING": + if ( + prop_index + and prop_index[0].get("size") > 0 + and prop_index[0].get("distinctValues") + <= DISTINCT_VALUE_LIMIT + ): + distinct_values = self.query( + f"CALL apoc.schema.properties.distinct(" + f"'{label_or_type}', '{prop_name}') YIELD value" + )[0]["value"] + return_clauses.append( + f"values: {distinct_values}," + f" distinct_count: {len(distinct_values)}" + ) + else: + with_clauses.append( + f"collect(distinct substring(n.`{prop_name}`, 0, 50)) " + f"AS `{prop_name}_values`" + ) + return_clauses.append(f"values: `{prop_name}_values`") + elif prop_type in [ + "INTEGER", + "FLOAT", + "DATE", + "DATE_TIME", + "LOCAL_DATE_TIME", + ]: + if not prop_index: + with_clauses.append( + f"collect(distinct toString(n.`{prop_name}`)) " + f"AS `{prop_name}_values`" + ) + return_clauses.append(f"values: `{prop_name}_values`") + else: + with_clauses.append( + f"min(n.`{prop_name}`) AS `{prop_name}_min`" + ) + with_clauses.append( + f"max(n.`{prop_name}`) AS `{prop_name}_max`" + ) + with_clauses.append( + f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`" + ) + return_clauses.append( + f"min: toString(`{prop_name}_min`), " + f"max: toString(`{prop_name}_max`), " + f"distinct_count: `{prop_name}_distinct`" + ) + + elif prop_type == "LIST": + with_clauses.append( + f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, " + f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`" + ) + return_clauses.append( + f"min_size: `{prop_name}_size_min`, " + f"max_size: `{prop_name}_size_max`" + ) + elif prop_type in ["BOOLEAN", "POINT", "DURATION"]: + continue + + output_dict[prop_name] = "{" + return_clauses.pop() + "}" + + with_clause = "WITH " + ",\n ".join(with_clauses) + return_clause = ( + "RETURN {" + + ", ".join(f"`{k}`: {v}" for k, v in output_dict.items()) + + "} AS output" + ) + + # Combine all parts of the Cypher query + return f"{match_clause}\n{with_clause}\n{return_clause}" + + def get_schema(self, refresh: bool = False) -> Any: + if refresh: + self.refresh_schema() + + return self.structured_schema + + def get_schema_str(self, refresh: bool = False) -> str: + schema = self.get_schema(refresh=refresh) + + formatted_node_props = [] + formatted_rel_props = [] + + if self.enhcnaced_schema: + # Enhanced formatting for nodes + for node_type, properties in schema["node_props"].items(): + formatted_node_props.append(f"- **{node_type}**") + for prop in properties: + example = "" + if prop["type"] == "STRING" and prop.get("values"): + if ( + prop.get("distinct_count", 11) + > DISTINCT_VALUE_LIMIT + ): + example = ( + f'Example: "{clean_string_values(prop["values"][0])}"' + if prop["values"] + else "" + ) + else: # If less than 10 possible values return all + example = ( + ( + "Available options: " + f'{[clean_string_values(el) for el in prop["values"]]}' + ) + if prop["values"] + 
else "" + ) + + elif prop["type"] in [ + "INTEGER", + "FLOAT", + "DATE", + "DATE_TIME", + "LOCAL_DATE_TIME", + ]: + if prop.get("min") is not None: + example = f'Min: {prop["min"]}, Max: {prop["max"]}' + else: + example = ( + f'Example: "{prop["values"][0]}"' + if prop.get("values") + else "" + ) + elif prop["type"] == "LIST": + # Skip embeddings + if ( + not prop.get("min_size") + or prop["min_size"] > LIST_LIMIT + ): + continue + example = f'Min Size: {prop["min_size"]}, Max Size: {prop["max_size"]}' + formatted_node_props.append( + f" - `{prop['property']}`: {prop['type']} {example}" + ) + + # Enhanced formatting for relationships + for rel_type, properties in schema["rel_props"].items(): + formatted_rel_props.append(f"- **{rel_type}**") + for prop in properties: + example = "" + if prop["type"] == "STRING": + if ( + prop.get("distinct_count", 11) + > DISTINCT_VALUE_LIMIT + ): + example = ( + f'Example: "{clean_string_values(prop["values"][0])}"' + if prop.get("values") + else "" + ) + else: # If less than 10 possible values return all + example = ( + ( + "Available options: " + f'{[clean_string_values(el) for el in prop["values"]]}' + ) + if prop.get("values") + else "" + ) + elif prop["type"] in [ + "INTEGER", + "FLOAT", + "DATE", + "DATE_TIME", + "LOCAL_DATE_TIME", + ]: + if prop.get("min"): # If we have min/max + example = ( + f'Min: {prop["min"]}, Max: {prop["max"]}' + ) + else: # return a single value + example = ( + f'Example: "{prop["values"][0]}"' + if prop.get("values") + else "" + ) + elif prop["type"] == "LIST": + # Skip embeddings + if prop["min_size"] > LIST_LIMIT: + continue + example = f'Min Size: {prop["min_size"]}, Max Size: {prop["max_size"]}' + formatted_rel_props.append( + f" - `{prop['property']}: {prop['type']}` {example}" + ) + else: + # Format node properties + for label, props in schema["node_props"].items(): + props_str = ", ".join( + [f"{prop['property']}: {prop['type']}" for prop in props] + ) + formatted_node_props.append(f"{label} {{{props_str}}}") + + # Format relationship properties using structured_schema + for type, props in schema["rel_props"].items(): + props_str = ", ".join( + [f"{prop['property']}: {prop['type']}" for prop in props] + ) + formatted_rel_props.append(f"{type} {{{props_str}}}") + + # Format relationships + formatted_rels = [ + f"(:{el['start']})-[:{el['type']}]->(:{el['end']})" + for el in schema["relationships"] + ] + + return "\n".join( + [ + "Node properties:", + "\n".join(formatted_node_props), + "Relationship properties:", + "\n".join(formatted_rel_props), + "The relationships:", + "\n".join(formatted_rels), + ] + ) + + def update_extraction_prompt( + self, + prompt_provider: PromptProvider, + entity_types: list[EntityType], + relations: list[Relation], + ): + # Fetch the kg extraction prompt with blank entity types and relations + # Note - Assumes that for given prompt there is a `_with_spec` that can have entities + relations specified + few_shot_ner_kg_extraction_with_spec = prompt_provider.get_prompt( + f"{self.config.kg_extraction_prompt}_with_spec" + ) + + # Format the prompt to include the desired entity types and relations + few_shot_ner_kg_extraction = ( + few_shot_ner_kg_extraction_with_spec.replace( + "{entity_types}", format_entity_types(entity_types) + ).replace("{relations}", format_relations(relations)) + ) + + # Update the "few_shot_ner_kg_extraction" prompt used in downstream KG construction + prompt_provider.update_prompt( + self.config.kg_extraction_prompt, + json.dumps(few_shot_ner_kg_extraction, 
ensure_ascii=False), + ) + + def update_kg_agent_prompt( + self, + prompt_provider: PromptProvider, + entity_types: list[EntityType], + relations: list[Relation], + ): + # Fetch the kg extraction prompt with blank entity types and relations + # Note - Assumes that for given prompt there is a `_with_spec` that can have entities + relations specified + few_shot_ner_kg_extraction_with_spec = prompt_provider.get_prompt( + f"{self.config.kg_agent_prompt}_with_spec" + ) + + # Format the prompt to include the desired entity types and relations + few_shot_ner_kg_extraction = ( + few_shot_ner_kg_extraction_with_spec.replace( + "{entity_types}", + format_entity_types(entity_types, ignore_subcats=True), + ).replace("{relations}", format_relations(relations)) + ) + + # Update the "few_shot_ner_kg_extraction" prompt used in downstream KG construction + prompt_provider.update_prompt( + self.config.kg_agent_prompt, + json.dumps(few_shot_ner_kg_extraction, ensure_ascii=False), + ) diff --git a/R2R/r2r/providers/llms/__init__.py b/R2R/r2r/providers/llms/__init__.py new file mode 100755 index 00000000..38a1c54a --- /dev/null +++ b/R2R/r2r/providers/llms/__init__.py @@ -0,0 +1,7 @@ +from .litellm.base_litellm import LiteLLM +from .openai.base_openai import OpenAILLM + +__all__ = [ + "LiteLLM", + "OpenAILLM", +] diff --git a/R2R/r2r/providers/llms/litellm/base_litellm.py b/R2R/r2r/providers/llms/litellm/base_litellm.py new file mode 100755 index 00000000..581cce9a --- /dev/null +++ b/R2R/r2r/providers/llms/litellm/base_litellm.py @@ -0,0 +1,142 @@ +import logging +from typing import Any, Generator, Union + +from r2r.base import ( + LLMChatCompletion, + LLMChatCompletionChunk, + LLMConfig, + LLMProvider, +) +from r2r.base.abstractions.llm import GenerationConfig + +logger = logging.getLogger(__name__) + + +class LiteLLM(LLMProvider): + """A concrete class for creating LiteLLM models.""" + + def __init__( + self, + config: LLMConfig, + *args, + **kwargs, + ) -> None: + try: + from litellm import acompletion, completion + + self.litellm_completion = completion + self.litellm_acompletion = acompletion + except ImportError: + raise ImportError( + "Error, `litellm` is required to run a LiteLLM. Please install it using `pip install litellm`." + ) + super().__init__(config) + + def get_completion( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> LLMChatCompletion: + if generation_config.stream: + raise ValueError( + "Stream must be set to False to use the `get_completion` method." + ) + return self._get_completion(messages, generation_config, **kwargs) + + def get_completion_stream( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> Generator[LLMChatCompletionChunk, None, None]: + if not generation_config.stream: + raise ValueError( + "Stream must be set to True to use the `get_completion_stream` method." 
+ ) + return self._get_completion(messages, generation_config, **kwargs) + + def extract_content(self, response: LLMChatCompletion) -> str: + return response.choices[0].message.content + + def _get_completion( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> Union[ + LLMChatCompletion, Generator[LLMChatCompletionChunk, None, None] + ]: + # Create a dictionary with the default arguments + args = self._get_base_args(generation_config) + args["messages"] = messages + + # Conditionally add the 'functions' argument if it's not None + if generation_config.functions is not None: + args["functions"] = generation_config.functions + + args = {**args, **kwargs} + response = self.litellm_completion(**args) + + if not generation_config.stream: + return LLMChatCompletion(**response.dict()) + else: + return self._get_chat_completion(response) + + def _get_chat_completion( + self, + response: Any, + ) -> Generator[LLMChatCompletionChunk, None, None]: + for part in response: + yield LLMChatCompletionChunk(**part.dict()) + + def _get_base_args( + self, + generation_config: GenerationConfig, + prompt=None, + ) -> dict: + """Get the base arguments for the LiteLLM API.""" + args = { + "model": generation_config.model, + "temperature": generation_config.temperature, + "top_p": generation_config.top_p, + "stream": generation_config.stream, + # TODO - We need to cap this to avoid potential errors when exceed max allowable context + "max_tokens": generation_config.max_tokens_to_sample, + } + return args + + async def aget_completion( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> LLMChatCompletion: + if generation_config.stream: + raise ValueError( + "Stream must be set to False to use the `aget_completion` method." + ) + return await self._aget_completion( + messages, generation_config, **kwargs + ) + + async def _aget_completion( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> Union[LLMChatCompletion, LLMChatCompletionChunk]: + """Asynchronously get a completion from the OpenAI API based on the provided messages.""" + + # Create a dictionary with the default arguments + args = self._get_base_args(generation_config) + + args["messages"] = messages + + # Conditionally add the 'functions' argument if it's not None + if generation_config.functions is not None: + args["functions"] = generation_config.functions + + args = {**args, **kwargs} + # Create the chat completion + return await self.litellm_acompletion(**args) diff --git a/R2R/r2r/providers/llms/openai/base_openai.py b/R2R/r2r/providers/llms/openai/base_openai.py new file mode 100755 index 00000000..460c0f0b --- /dev/null +++ b/R2R/r2r/providers/llms/openai/base_openai.py @@ -0,0 +1,144 @@ +"""A module for creating OpenAI model abstractions.""" + +import logging +import os +from typing import Union + +from r2r.base import ( + LLMChatCompletion, + LLMChatCompletionChunk, + LLMConfig, + LLMProvider, +) +from r2r.base.abstractions.llm import GenerationConfig + +logger = logging.getLogger(__name__) + + +class OpenAILLM(LLMProvider): + """A concrete class for creating OpenAI models.""" + + def __init__( + self, + config: LLMConfig, + *args, + **kwargs, + ) -> None: + if not isinstance(config, LLMConfig): + raise ValueError( + "The provided config must be an instance of OpenAIConfig." + ) + try: + from openai import OpenAI # noqa + except ImportError: + raise ImportError( + "Error, `openai` is required to run an OpenAILLM. 
Please install it using `pip install openai`." + ) + if config.provider != "openai": + raise ValueError( + "OpenAILLM must be initialized with config with `openai` provider." + ) + if not os.getenv("OPENAI_API_KEY"): + raise ValueError( + "OpenAI API key not found. Please set the OPENAI_API_KEY environment variable." + ) + super().__init__(config) + self.config: LLMConfig = config + self.client = OpenAI() + + def get_completion( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> LLMChatCompletion: + if generation_config.stream: + raise ValueError( + "Stream must be set to False to use the `get_completion` method." + ) + return self._get_completion(messages, generation_config, **kwargs) + + def get_completion_stream( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> LLMChatCompletionChunk: + if not generation_config.stream: + raise ValueError( + "Stream must be set to True to use the `get_completion_stream` method." + ) + return self._get_completion(messages, generation_config, **kwargs) + + def _get_completion( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> Union[LLMChatCompletion, LLMChatCompletionChunk]: + """Get a completion from the OpenAI API based on the provided messages.""" + + # Create a dictionary with the default arguments + args = self._get_base_args(generation_config) + + args["messages"] = messages + + # Conditionally add the 'functions' argument if it's not None + if generation_config.functions is not None: + args["functions"] = generation_config.functions + + args = {**args, **kwargs} + # Create the chat completion + return self.client.chat.completions.create(**args) + + def _get_base_args( + self, + generation_config: GenerationConfig, + ) -> dict: + """Get the base arguments for the OpenAI API.""" + + args = { + "model": generation_config.model, + "temperature": generation_config.temperature, + "top_p": generation_config.top_p, + "stream": generation_config.stream, + # TODO - We need to cap this to avoid potential errors when exceed max allowable context + "max_tokens": generation_config.max_tokens_to_sample, + } + + return args + + async def aget_completion( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> LLMChatCompletion: + if generation_config.stream: + raise ValueError( + "Stream must be set to False to use the `aget_completion` method." 
+ ) + return await self._aget_completion( + messages, generation_config, **kwargs + ) + + async def _aget_completion( + self, + messages: list[dict], + generation_config: GenerationConfig, + **kwargs, + ) -> Union[LLMChatCompletion, LLMChatCompletionChunk]: + """Asynchronously get a completion from the OpenAI API based on the provided messages.""" + + # Create a dictionary with the default arguments + args = self._get_base_args(generation_config) + + args["messages"] = messages + + # Conditionally add the 'functions' argument if it's not None + if generation_config.functions is not None: + args["functions"] = generation_config.functions + + args = {**args, **kwargs} + # Create the chat completion + return await self.client.chat.completions.create(**args) diff --git a/R2R/r2r/providers/vector_dbs/__init__.py b/R2R/r2r/providers/vector_dbs/__init__.py new file mode 100755 index 00000000..38ea0890 --- /dev/null +++ b/R2R/r2r/providers/vector_dbs/__init__.py @@ -0,0 +1,5 @@ +from .pgvector.pgvector_db import PGVectorDB + +__all__ = [ + "PGVectorDB", +] diff --git a/R2R/r2r/providers/vector_dbs/pgvector/pgvector_db.py b/R2R/r2r/providers/vector_dbs/pgvector/pgvector_db.py new file mode 100755 index 00000000..8cf728d1 --- /dev/null +++ b/R2R/r2r/providers/vector_dbs/pgvector/pgvector_db.py @@ -0,0 +1,610 @@ +import json +import logging +import os +import time +from typing import Literal, Optional, Union + +from sqlalchemy import exc, text +from sqlalchemy.engine.url import make_url + +from r2r.base import ( + DocumentInfo, + UserStats, + VectorDBConfig, + VectorDBProvider, + VectorEntry, + VectorSearchResult, +) +from r2r.vecs.client import Client +from r2r.vecs.collection import Collection + +logger = logging.getLogger(__name__) + + +class PGVectorDB(VectorDBProvider): + def __init__(self, config: VectorDBConfig) -> None: + super().__init__(config) + try: + import r2r.vecs + except ImportError: + raise ValueError( + f"Error, PGVectorDB requires the vecs library. Please run `pip install vecs`." + ) + + # Check if a complete Postgres URI is provided + postgres_uri = self.config.extra_fields.get( + "postgres_uri" + ) or os.getenv("POSTGRES_URI") + + if postgres_uri: + # Log loudly that Postgres URI is being used + logger.warning("=" * 50) + logger.warning( + "ATTENTION: Using provided Postgres URI for connection" + ) + logger.warning("=" * 50) + + # Validate and use the provided URI + try: + parsed_uri = make_url(postgres_uri) + if not all([parsed_uri.username, parsed_uri.database]): + raise ValueError( + "The provided Postgres URI is missing required components." 
+ ) + DB_CONNECTION = postgres_uri + + # Log the sanitized URI (without password) + sanitized_uri = parsed_uri.set(password="*****") + logger.info(f"Connecting using URI: {sanitized_uri}") + except Exception as e: + raise ValueError(f"Invalid Postgres URI provided: {e}") + else: + # Fall back to existing logic for individual connection parameters + user = self.config.extra_fields.get("user", None) or os.getenv( + "POSTGRES_USER" + ) + password = self.config.extra_fields.get( + "password", None + ) or os.getenv("POSTGRES_PASSWORD") + host = self.config.extra_fields.get("host", None) or os.getenv( + "POSTGRES_HOST" + ) + port = self.config.extra_fields.get("port", None) or os.getenv( + "POSTGRES_PORT" + ) + db_name = self.config.extra_fields.get( + "db_name", None + ) or os.getenv("POSTGRES_DBNAME") + + if not all([user, password, host, db_name]): + raise ValueError( + "Error, please set the POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_HOST, POSTGRES_DBNAME environment variables or provide them in the config." + ) + + # Check if it's a Unix socket connection + if host.startswith("/") and not port: + DB_CONNECTION = ( + f"postgresql://{user}:{password}@/{db_name}?host={host}" + ) + logger.info("Using Unix socket connection") + else: + DB_CONNECTION = ( + f"postgresql://{user}:{password}@{host}:{port}/{db_name}" + ) + logger.info("Using TCP connection") + + # The rest of the initialization remains the same + try: + self.vx: Client = r2r.vecs.create_client(DB_CONNECTION) + except Exception as e: + raise ValueError( + f"Error {e} occurred while attempting to connect to the pgvector provider with {DB_CONNECTION}." + ) + + self.collection_name = self.config.extra_fields.get( + "vecs_collection" + ) or os.getenv("POSTGRES_VECS_COLLECTION") + if not self.collection_name: + raise ValueError( + "Error, please set a valid POSTGRES_VECS_COLLECTION environment variable or set a 'vecs_collection' in the 'vector_database' settings of your `config.json`." 
+ ) + + self.collection: Optional[Collection] = None + + logger.info( + f"Successfully initialized PGVectorDB with collection: {self.collection_name}" + ) + + def initialize_collection(self, dimension: int) -> None: + self.collection = self.vx.get_or_create_collection( + name=self.collection_name, dimension=dimension + ) + self._create_document_info_table() + self._create_hybrid_search_function() + + def _create_document_info_table(self): + with self.vx.Session() as sess: + with sess.begin(): + try: + # Enable uuid-ossp extension + sess.execute( + text('CREATE EXTENSION IF NOT EXISTS "uuid-ossp";') + ) + except exc.ProgrammingError as e: + logger.error(f"Error enabling uuid-ossp extension: {e}") + raise + + # Create the table if it doesn't exist + create_table_query = f""" + CREATE TABLE IF NOT EXISTS document_info_"{self.collection_name}" ( + document_id UUID PRIMARY KEY, + title TEXT, + user_id UUID NULL, + version TEXT, + size_in_bytes INT, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW(), + metadata JSONB, + status TEXT + ); + """ + sess.execute(text(create_table_query)) + + # Add the new column if it doesn't exist + add_column_query = f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 + FROM information_schema.columns + WHERE table_name = 'document_info_"{self.collection_name}"' + AND column_name = 'status' + ) THEN + ALTER TABLE "document_info_{self.collection_name}" + ADD COLUMN status TEXT DEFAULT 'processing'; + END IF; + END $$; + """ + sess.execute(text(add_column_query)) + + sess.commit() + + def _create_hybrid_search_function(self): + hybrid_search_function = f""" + CREATE OR REPLACE FUNCTION hybrid_search_{self.collection_name}( + query_text TEXT, + query_embedding VECTOR(512), + match_limit INT, + full_text_weight FLOAT = 1, + semantic_weight FLOAT = 1, + rrf_k INT = 50, + filter_condition JSONB = NULL + ) + RETURNS SETOF vecs."{self.collection_name}" + LANGUAGE sql + AS $$ + WITH full_text AS ( + SELECT + id, + ROW_NUMBER() OVER (ORDER BY ts_rank(to_tsvector('english', metadata->>'text'), websearch_to_tsquery(query_text)) DESC) AS rank_ix + FROM vecs."{self.collection_name}" + WHERE to_tsvector('english', metadata->>'text') @@ websearch_to_tsquery(query_text) + AND (filter_condition IS NULL OR (metadata @> filter_condition)) + ORDER BY rank_ix + LIMIT LEAST(match_limit, 30) * 2 + ), + semantic AS ( + SELECT + id, + ROW_NUMBER() OVER (ORDER BY vec <#> query_embedding) AS rank_ix + FROM vecs."{self.collection_name}" + WHERE filter_condition IS NULL OR (metadata @> filter_condition) + ORDER BY rank_ix + LIMIT LEAST(match_limit, 30) * 2 + ) + SELECT + vecs."{self.collection_name}".* + FROM + full_text + FULL OUTER JOIN semantic + ON full_text.id = semantic.id + JOIN vecs."{self.collection_name}" + ON vecs."{self.collection_name}".id = COALESCE(full_text.id, semantic.id) + ORDER BY + COALESCE(1.0 / (rrf_k + full_text.rank_ix), 0.0) * full_text_weight + + COALESCE(1.0 / (rrf_k + semantic.rank_ix), 0.0) * semantic_weight + DESC + LIMIT + LEAST(match_limit, 30); + $$; + """ + retry_attempts = 5 + for attempt in range(retry_attempts): + try: + with self.vx.Session() as sess: + # Acquire an advisory lock + sess.execute(text("SELECT pg_advisory_lock(123456789)")) + try: + sess.execute(text(hybrid_search_function)) + sess.commit() + finally: + # Release the advisory lock + sess.execute( + text("SELECT pg_advisory_unlock(123456789)") + ) + break # Break the loop if successful + except exc.InternalError as e: + if "tuple concurrently updated" in str(e): + 
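+                    # Another session is redefining the same function concurrently; back off exponentially before retrying.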
time.sleep(2**attempt) # Exponential backoff + else: + raise # Re-raise the exception if it's not a concurrency issue + else: + raise RuntimeError( + "Failed to create hybrid search function after multiple attempts" + ) + + def copy(self, entry: VectorEntry, commit=True) -> None: + if self.collection is None: + raise ValueError( + "Please call `initialize_collection` before attempting to run `copy`." + ) + + serializeable_entry = entry.to_serializable() + + self.collection.copy( + records=[ + ( + serializeable_entry["id"], + serializeable_entry["vector"], + serializeable_entry["metadata"], + ) + ] + ) + + def copy_entries( + self, entries: list[VectorEntry], commit: bool = True + ) -> None: + if self.collection is None: + raise ValueError( + "Please call `initialize_collection` before attempting to run `copy_entries`." + ) + + self.collection.copy( + records=[ + ( + str(entry.id), + entry.vector.data, + entry.to_serializable()["metadata"], + ) + for entry in entries + ] + ) + + def upsert(self, entry: VectorEntry, commit=True) -> None: + if self.collection is None: + raise ValueError( + "Please call `initialize_collection` before attempting to run `upsert`." + ) + + self.collection.upsert( + records=[ + ( + str(entry.id), + entry.vector.data, + entry.to_serializable()["metadata"], + ) + ] + ) + + def upsert_entries( + self, entries: list[VectorEntry], commit: bool = True + ) -> None: + if self.collection is None: + raise ValueError( + "Please call `initialize_collection` before attempting to run `upsert_entries`." + ) + + self.collection.upsert( + records=[ + ( + str(entry.id), + entry.vector.data, + entry.to_serializable()["metadata"], + ) + for entry in entries + ] + ) + + def search( + self, + query_vector: list[float], + filters: dict[str, Union[bool, int, str]] = {}, + limit: int = 10, + *args, + **kwargs, + ) -> list[VectorSearchResult]: + if self.collection is None: + raise ValueError( + "Please call `initialize_collection` before attempting to run `search`." + ) + measure = kwargs.get("measure", "cosine_distance") + mapped_filters = { + key: {"$eq": value} for key, value in filters.items() + } + + return [ + VectorSearchResult(id=ele[0], score=float(1 - ele[1]), metadata=ele[2]) # type: ignore + for ele in self.collection.query( + data=query_vector, + limit=limit, + filters=mapped_filters, + measure=measure, + include_value=True, + include_metadata=True, + ) + ] + + def hybrid_search( + self, + query_text: str, + query_vector: list[float], + limit: int = 10, + filters: Optional[dict[str, Union[bool, int, str]]] = None, + # Hybrid search parameters + full_text_weight: float = 1.0, + semantic_weight: float = 1.0, + rrf_k: int = 20, # typical value is ~2x the number of results you want + *args, + **kwargs, + ) -> list[VectorSearchResult]: + if self.collection is None: + raise ValueError( + "Please call `initialize_collection` before attempting to run `hybrid_search`." 
+ ) + + # Convert filters to a JSON-compatible format + filter_condition = None + if filters: + filter_condition = json.dumps(filters) + + query = text( + f""" + SELECT * FROM hybrid_search_{self.collection_name}( + cast(:query_text as TEXT), cast(:query_embedding as VECTOR), cast(:match_limit as INT), + cast(:full_text_weight as FLOAT), cast(:semantic_weight as FLOAT), cast(:rrf_k as INT), + cast(:filter_condition as JSONB) + ) + """ + ) + + params = { + "query_text": str(query_text), + "query_embedding": list(query_vector), + "match_limit": limit, + "full_text_weight": full_text_weight, + "semantic_weight": semantic_weight, + "rrf_k": rrf_k, + "filter_condition": filter_condition, + } + + with self.vx.Session() as session: + result = session.execute(query, params).fetchall() + return [ + VectorSearchResult(id=row[0], score=1.0, metadata=row[-1]) + for row in result + ] + + def create_index(self, index_type, column_name, index_options): + pass + + def delete_by_metadata( + self, + metadata_fields: list[str], + metadata_values: list[Union[bool, int, str]], + logic: Literal["AND", "OR"] = "AND", + ) -> list[str]: + if logic == "OR": + raise ValueError( + "OR logic is still being tested before official support for `delete_by_metadata` in pgvector." + ) + if self.collection is None: + raise ValueError( + "Please call `initialize_collection` before attempting to run `delete_by_metadata`." + ) + + if len(metadata_fields) != len(metadata_values): + raise ValueError( + "The number of metadata fields must match the number of metadata values." + ) + + # Construct the filter + if logic == "AND": + filters = { + k: {"$eq": v} for k, v in zip(metadata_fields, metadata_values) + } + else: # OR logic + # TODO - Test 'or' logic and remove check above + filters = { + "$or": [ + {k: {"$eq": v}} + for k, v in zip(metadata_fields, metadata_values) + ] + } + return self.collection.delete(filters=filters) + + def get_metadatas( + self, + metadata_fields: list[str], + filter_field: Optional[str] = None, + filter_value: Optional[Union[bool, int, str]] = None, + ) -> list[dict]: + if self.collection is None: + raise ValueError( + "Please call `initialize_collection` before attempting to run `get_metadatas`." 
+ ) + + results = {tuple(metadata_fields): {}} + for field in metadata_fields: + unique_values = self.collection.get_unique_metadata_values( + field=field, + filter_field=filter_field, + filter_value=filter_value, + ) + for value in unique_values: + if value not in results: + results[value] = {} + results[value][field] = value + + return [ + results[key] for key in results if key != tuple(metadata_fields) + ] + + def upsert_documents_overview( + self, documents_overview: list[DocumentInfo] + ) -> None: + for document_info in documents_overview: + db_entry = document_info.convert_to_db_entry() + + # Convert 'None' string to None type for user_id + if db_entry["user_id"] == "None": + db_entry["user_id"] = None + + query = text( + f""" + INSERT INTO "document_info_{self.collection_name}" (document_id, title, user_id, version, created_at, updated_at, size_in_bytes, metadata, status) + VALUES (:document_id, :title, :user_id, :version, :created_at, :updated_at, :size_in_bytes, :metadata, :status) + ON CONFLICT (document_id) DO UPDATE SET + title = EXCLUDED.title, + user_id = EXCLUDED.user_id, + version = EXCLUDED.version, + updated_at = EXCLUDED.updated_at, + size_in_bytes = EXCLUDED.size_in_bytes, + metadata = EXCLUDED.metadata, + status = EXCLUDED.status; + """ + ) + with self.vx.Session() as sess: + sess.execute(query, db_entry) + sess.commit() + + def delete_from_documents_overview( + self, document_id: str, version: Optional[str] = None + ) -> None: + query = f""" + DELETE FROM "document_info_{self.collection_name}" + WHERE document_id = :document_id + """ + params = {"document_id": document_id} + + if version is not None: + query += " AND version = :version" + params["version"] = version + + with self.vx.Session() as sess: + with sess.begin(): + sess.execute(text(query), params) + sess.commit() + + def get_documents_overview( + self, + filter_document_ids: Optional[list[str]] = None, + filter_user_ids: Optional[list[str]] = None, + ): + conditions = [] + params = {} + + if filter_document_ids: + placeholders = ", ".join( + f":doc_id_{i}" for i in range(len(filter_document_ids)) + ) + conditions.append(f"document_id IN ({placeholders})") + params.update( + { + f"doc_id_{i}": str(document_id) + for i, document_id in enumerate(filter_document_ids) + } + ) + if filter_user_ids: + placeholders = ", ".join( + f":user_id_{i}" for i in range(len(filter_user_ids)) + ) + conditions.append(f"user_id IN ({placeholders})") + params.update( + { + f"user_id_{i}": str(user_id) + for i, user_id in enumerate(filter_user_ids) + } + ) + + query = f""" + SELECT document_id, title, user_id, version, size_in_bytes, created_at, updated_at, metadata, status + FROM "document_info_{self.collection_name}" + """ + if conditions: + query += " WHERE " + " AND ".join(conditions) + + with self.vx.Session() as sess: + results = sess.execute(text(query), params).fetchall() + return [ + DocumentInfo( + document_id=row[0], + title=row[1], + user_id=row[2], + version=row[3], + size_in_bytes=row[4], + created_at=row[5], + updated_at=row[6], + metadata=row[7], + status=row[8], + ) + for row in results + ] + + def get_document_chunks(self, document_id: str) -> list[dict]: + if not self.collection: + raise ValueError("Collection is not initialized.") + + table_name = self.collection.table.name + query = text( + f""" + SELECT metadata + FROM vecs."{table_name}" + WHERE metadata->>'document_id' = :document_id + ORDER BY CAST(metadata->>'chunk_order' AS INTEGER) + """ + ) + + params = {"document_id": document_id} + + with 
self.vx.Session() as sess: + results = sess.execute(query, params).fetchall() + return [result[0] for result in results] + + def get_users_overview(self, user_ids: Optional[list[str]] = None): + user_ids_condition = "" + params = {} + if user_ids: + user_ids_condition = "WHERE user_id IN :user_ids" + params["user_ids"] = tuple( + map(str, user_ids) + ) # Convert UUIDs to strings + + query = f""" + SELECT user_id, COUNT(document_id) AS num_files, SUM(size_in_bytes) AS total_size_in_bytes, ARRAY_AGG(document_id) AS document_ids + FROM "document_info_{self.collection_name}" + {user_ids_condition} + GROUP BY user_id + """ + + with self.vx.Session() as sess: + results = sess.execute(text(query), params).fetchall() + return [ + UserStats( + user_id=row[0], + num_files=row[1], + total_size_in_bytes=row[2], + document_ids=row[3], + ) + for row in results + if row[0] is not None + ] diff --git a/R2R/r2r/telemetry/__init__.py b/R2R/r2r/telemetry/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/R2R/r2r/telemetry/__init__.py diff --git a/R2R/r2r/telemetry/events.py b/R2R/r2r/telemetry/events.py new file mode 100755 index 00000000..5bd7528b --- /dev/null +++ b/R2R/r2r/telemetry/events.py @@ -0,0 +1,59 @@ +import uuid +from typing import Any, Dict + + +class BaseTelemetryEvent: + def __init__(self, event_type: str, properties: Dict[str, Any]): + self.event_type = event_type + self.properties = properties + self.event_id = str(uuid.uuid4()) + + +class DailyActiveUserEvent(BaseTelemetryEvent): + def __init__(self, user_id: str): + super().__init__("DailyActiveUser", {"user_id": user_id}) + + +class FeatureUsageEvent(BaseTelemetryEvent): + def __init__(self, user_id: str, feature: str): + super().__init__( + "FeatureUsage", {"user_id": user_id, "feature": feature} + ) + + +class ErrorEvent(BaseTelemetryEvent): + def __init__(self, user_id: str, endpoint: str, error_message: str): + super().__init__( + "Error", + { + "user_id": user_id, + "endpoint": endpoint, + "error_message": error_message, + }, + ) + + +class RequestLatencyEvent(BaseTelemetryEvent): + def __init__(self, endpoint: str, latency: float): + super().__init__( + "RequestLatency", {"endpoint": endpoint, "latency": latency} + ) + + +class GeographicDistributionEvent(BaseTelemetryEvent): + def __init__(self, user_id: str, country: str): + super().__init__( + "GeographicDistribution", {"user_id": user_id, "country": country} + ) + + +class SessionDurationEvent(BaseTelemetryEvent): + def __init__(self, user_id: str, duration: float): + super().__init__( + "SessionDuration", {"user_id": user_id, "duration": duration} + ) + + +class UserPathEvent(BaseTelemetryEvent): + def __init__(self, user_id: str, path: str): + super().__init__("UserPath", {"user_id": user_id, "path": path}) diff --git a/R2R/r2r/telemetry/posthog.py b/R2R/r2r/telemetry/posthog.py new file mode 100755 index 00000000..64e63895 --- /dev/null +++ b/R2R/r2r/telemetry/posthog.py @@ -0,0 +1,58 @@ +import logging +import os + +import posthog + +from r2r.telemetry.events import BaseTelemetryEvent + +logger = logging.getLogger(__name__) + + +class PosthogClient: + """ + This is a write-only project API key, so it can only create new events. It can't + read events or any of your other data stored with PostHog, so it's safe to use in public apps. + """ + + def __init__( + self, api_key: str, enabled: bool = True, debug: bool = False + ): + self.enabled = enabled + self.debug = debug + + if self.enabled: + logger.info( + "Initializing anonymized telemetry. 
To disable, set TELEMETRY_ENABLED=false in your environment." + ) + posthog.project_api_key = api_key + posthog.disable_geoip = False + else: + posthog.disabled = True + + if self.debug: + posthog.debug = True + + logger.info( + f"Posthog telemetry {'enabled' if self.enabled else 'disabled'}, debug mode {'on' if self.debug else 'off'}" + ) + + def capture(self, event: BaseTelemetryEvent): + if self.enabled: + posthog.capture(event.event_id, event.event_type, event.properties) + + +# Initialize the telemetry client with a flag to enable or disable telemetry +telemetry_enabled = os.getenv("TELEMETRY_ENABLED", "true").lower() in ( + "true", + "1", + "t", +) +debug_mode = os.getenv("DEBUG_MODE", "false").lower() in ( + "true", + "1", + "t", +) +telemetry_client = PosthogClient( + api_key="phc_OPBbibOIErCGc4NDLQsOrMuYFTKDmRwXX6qxnTr6zpU", + enabled=telemetry_enabled, +) diff --git a/R2R/r2r/telemetry/telemetry_decorator.py b/R2R/r2r/telemetry/telemetry_decorator.py new file mode 100755 index 00000000..2938a83e --- /dev/null +++ b/R2R/r2r/telemetry/telemetry_decorator.py @@ -0,0 +1,56 @@ +import asyncio +import logging +from functools import wraps + +from r2r.telemetry.events import ErrorEvent, FeatureUsageEvent +from r2r.telemetry.posthog import telemetry_client + +logger = logging.getLogger(__name__) + + +def telemetry_event(event_name): + def decorator(func): + @wraps(func) + async def async_wrapper(*args, **kwargs): + user_id = kwargs.get("user_id", "unknown_user") + try: + result = await func(*args, **kwargs) + try: + telemetry_client.capture( + FeatureUsageEvent(user_id=user_id, feature=event_name) + ) + except Exception as e: + logger.error(f"Error in telemetry event logging: {str(e)}") + return result + except Exception as e: + try: + telemetry_client.capture( + ErrorEvent( + user_id=user_id, + endpoint=event_name, + error_message=str(e), + ) + ) + except Exception as e: + logger.error(f"Error in telemetry event logging: {str(e)}") + + raise + + @wraps(func) + def sync_wrapper(*args, **kwargs): + loop = asyncio.get_event_loop() + if loop.is_running(): + future = asyncio.run_coroutine_threadsafe( + async_wrapper(*args, **kwargs), loop + ) + return future.result() + else: + return loop.run_until_complete(async_wrapper(*args, **kwargs)) + + return ( + async_wrapper + if asyncio.iscoroutinefunction(func) + else sync_wrapper + ) + + return decorator diff --git a/R2R/r2r/vecs/__init__.py b/R2R/r2r/vecs/__init__.py new file mode 100755 index 00000000..9d4f1d7e --- /dev/null +++ b/R2R/r2r/vecs/__init__.py @@ -0,0 +1,28 @@ +from . 
import exc +from .client import Client +from .collection import ( + Collection, + IndexArgsHNSW, + IndexArgsIVFFlat, + IndexMeasure, + IndexMethod, +) + +__project__ = "vecs" +__version__ = "0.4.2" + + +__all__ = [ + "IndexArgsIVFFlat", + "IndexArgsHNSW", + "IndexMethod", + "IndexMeasure", + "Collection", + "Client", + "exc", +] + + +def create_client(connection_string: str, *args, **kwargs) -> Client: + """Creates a client from a Postgres connection string""" + return Client(connection_string, *args, **kwargs) diff --git a/R2R/r2r/vecs/adapter/__init__.py b/R2R/r2r/vecs/adapter/__init__.py new file mode 100755 index 00000000..9cd9860d --- /dev/null +++ b/R2R/r2r/vecs/adapter/__init__.py @@ -0,0 +1,15 @@ +from .base import Adapter, AdapterContext, AdapterStep +from .markdown import MarkdownChunker +from .noop import NoOp +from .text import ParagraphChunker, TextEmbedding, TextEmbeddingModel + +__all__ = [ + "Adapter", + "AdapterContext", + "AdapterStep", + "NoOp", + "ParagraphChunker", + "TextEmbedding", + "TextEmbeddingModel", + "MarkdownChunker", +] diff --git a/R2R/r2r/vecs/adapter/base.py b/R2R/r2r/vecs/adapter/base.py new file mode 100755 index 00000000..7734e802 --- /dev/null +++ b/R2R/r2r/vecs/adapter/base.py @@ -0,0 +1,111 @@ +""" +The `vecs.experimental.adapter.base` module provides abstract classes and utilities +for creating and handling adapters in vecs. Adapters allow users to interact with +a collection using media types other than vectors. + +All public classes, enums, and functions are re-exported by `vecs.adapters` module. +""" + +from abc import ABC, abstractmethod +from enum import Enum +from typing import Any, Dict, Generator, Iterable, Optional, Tuple + +from vecs.exc import ArgError + + +class AdapterContext(str, Enum): + """ + An enum representing the different contexts in which a Pipeline + will be invoked. + + Attributes: + upsert (str): The Collection.upsert method + query (str): The Collection.query method + """ + + upsert = "upsert" + query = "query" + + +class AdapterStep(ABC): + """ + Abstract class representing a step in the adapter pipeline. + + Each adapter step should adapt a user media into a tuple of: + - id (str) + - media (unknown type) + - metadata (dict) + + If the user provides id or metadata, default production is overridden. + """ + + @property + def exported_dimension(self) -> Optional[int]: + """ + Property that should be overridden by subclasses to provide the output dimension + of the adapter step. + """ + return None + + @abstractmethod + def __call__( + self, + records: Iterable[Tuple[str, Any, Optional[Dict]]], + adapter_context: AdapterContext, + ) -> Generator[Tuple[str, Any, Dict], None, None]: + """ + Abstract method that should be overridden by subclasses to handle each record. + """ + + +class Adapter: + """ + Class representing a sequence of AdapterStep instances forming a pipeline. + """ + + def __init__(self, steps: list[AdapterStep]): + """ + Initialize an Adapter instance with a list of AdapterStep instances. + + Args: + steps: list of AdapterStep instances. + + Raises: + ArgError: Raised if the steps list is empty. + """ + self.steps = steps + if len(steps) < 1: + raise ArgError("Adapter must contain at least 1 step") + + @property + def exported_dimension(self) -> Optional[int]: + """ + The output dimension of the adapter. Returns the exported dimension of the last + AdapterStep that provides one (from end to start of the steps list). 
+ """ + for step in reversed(self.steps): + step_dim = step.exported_dimension + if step_dim is not None: + return step_dim + return None + + def __call__( + self, + records: Iterable[Tuple[str, Any, Optional[Dict]]], + adapter_context: AdapterContext, + ) -> Generator[Tuple[str, Any, Dict], None, None]: + """ + Invokes the adapter pipeline on an iterable of records. + + Args: + records: Iterable of tuples each containing an id, a media and an optional dict. + adapter_context: Context of the adapter. + + Yields: + Tuples each containing an id, a media and a dict. + """ + pipeline = records + for step in self.steps: + pipeline = step(pipeline, adapter_context) + + yield from pipeline # type: ignore diff --git a/R2R/r2r/vecs/adapter/markdown.py b/R2R/r2r/vecs/adapter/markdown.py new file mode 100755 index 00000000..149573f4 --- /dev/null +++ b/R2R/r2r/vecs/adapter/markdown.py @@ -0,0 +1,88 @@ +import re +from typing import Any, Dict, Generator, Iterable, Optional, Tuple + +from flupy import flu + +from .base import AdapterContext, AdapterStep + + +class MarkdownChunker(AdapterStep): + """ + MarkdownChunker is an AdapterStep that splits a markdown string into chunks where a heading signifies the start of a chunk, and yields each chunk as a separate record. + """ + + def __init__(self, *, skip_during_query: bool): + """ + Initializes the MarkdownChunker adapter. + + Args: + skip_during_query (bool): Whether to skip chunking during querying. + """ + self.skip_during_query = skip_during_query + + @staticmethod + def split_by_heading( + md: str, max_tokens: int + ) -> Generator[str, None, None]: + regex_split = r"^(#{1,6}\s+.+)$" + headings = [ + match.span()[0] + for match in re.finditer(regex_split, md, flags=re.MULTILINE) + ] + + if headings == [] or headings[0] != 0: + headings.insert(0, 0) + + sections = [md[i:j] for i, j in zip(headings, headings[1:] + [None])] + + for section in sections: + chunks = flu(section.split(" ")).chunk(max_tokens) + + is_not_useless_chunk = lambda i: not i in ["", "\n", []] + + joined_chunks = filter( + is_not_useless_chunk, [" ".join(chunk) for chunk in chunks] + ) + + for joined_chunk in joined_chunks: + yield joined_chunk + + def __call__( + self, + records: Iterable[Tuple[str, Any, Optional[Dict]]], + adapter_context: AdapterContext, + max_tokens: int = 99999999, + ) -> Generator[Tuple[str, Any, Dict], None, None]: + """ + Splits each markdown string in the records into chunks where each heading starts a new chunk, and yields each chunk + as a separate record. If the `skip_during_query` attribute is set to True, + this step is skipped during querying. + + Args: + records (Iterable[Tuple[str, Any, Optional[Dict]]]): Iterable of tuples each containing an id, a markdown string and an optional dict. + adapter_context (AdapterContext): Context of the adapter. + max_tokens (int): The maximum number of tokens per chunk + + Yields: + Tuple[str, Any, Dict]: The id appended with chunk index, the chunk, and the metadata. 
+ """ + if max_tokens and max_tokens < 1: + raise ValueError("max_tokens must be a nonzero positive integer") + + if ( + adapter_context == AdapterContext("query") + and self.skip_during_query + ): + for id, markdown, metadata in records: + yield (id, markdown, metadata or {}) + else: + for id, markdown, metadata in records: + headings = MarkdownChunker.split_by_heading( + markdown, max_tokens + ) + for heading_ix, heading in enumerate(headings): + yield ( + f"{id}_head_{str(heading_ix).zfill(3)}", + heading, + metadata or {}, + ) diff --git a/R2R/r2r/vecs/adapter/noop.py b/R2R/r2r/vecs/adapter/noop.py new file mode 100755 index 00000000..b587a552 --- /dev/null +++ b/R2R/r2r/vecs/adapter/noop.py @@ -0,0 +1,55 @@ +""" +The `vecs.experimental.adapter.noop` module provides a default no-op (no operation) adapter +that passes the inputs through without any modification. This can be useful when no specific +adapter processing is required. + +All public classes, enums, and functions are re-exported by `vecs.adapters` module. +""" + +from typing import Any, Dict, Generator, Iterable, Optional, Tuple + +from .base import AdapterContext, AdapterStep + + +class NoOp(AdapterStep): + """ + NoOp is a no-operation AdapterStep. It is a default adapter that passes through + the input records without any modifications. + """ + + def __init__(self, dimension: int): + """ + Initializes the NoOp adapter with a dimension. + + Args: + dimension (int): The dimension of the input vectors. + """ + self._dimension = dimension + + @property + def exported_dimension(self) -> Optional[int]: + """ + Returns the dimension of the adapter. + + Returns: + int: The dimension of the input vectors. + """ + return self._dimension + + def __call__( + self, + records: Iterable[Tuple[str, Any, Optional[Dict]]], + adapter_context: AdapterContext, + ) -> Generator[Tuple[str, Any, Dict], None, None]: + """ + Yields the input records without any modification. + + Args: + records: Iterable of tuples each containing an id, a media and an optional dict. + adapter_context: Context of the adapter. + + Yields: + Tuple[str, Any, Dict]: The input record. + """ + for id, media, metadata in records: + yield (id, media, metadata or {}) diff --git a/R2R/r2r/vecs/adapter/text.py b/R2R/r2r/vecs/adapter/text.py new file mode 100755 index 00000000..78ae7732 --- /dev/null +++ b/R2R/r2r/vecs/adapter/text.py @@ -0,0 +1,151 @@ +""" +The `vecs.experimental.adapter.text` module provides adapter steps specifically designed for +handling text data. It provides two main classes, `TextEmbedding` and `ParagraphChunker`. + +All public classes, enums, and functions are re-exported by `vecs.adapters` module. +""" + +from typing import Any, Dict, Generator, Iterable, Literal, Optional, Tuple + +from flupy import flu +from vecs.exc import MissingDependency + +from .base import AdapterContext, AdapterStep + +TextEmbeddingModel = Literal[ + "all-mpnet-base-v2", + "multi-qa-mpnet-base-dot-v1", + "all-distilroberta-v1", + "all-MiniLM-L12-v2", + "multi-qa-distilbert-cos-v1", + "mixedbread-ai/mxbai-embed-large-v1", + "multi-qa-MiniLM-L6-cos-v1", + "paraphrase-multilingual-mpnet-base-v2", + "paraphrase-albert-small-v2", + "paraphrase-multilingual-MiniLM-L12-v2", + "paraphrase-MiniLM-L3-v2", + "distiluse-base-multilingual-cased-v1", + "distiluse-base-multilingual-cased-v2", +] + + +class TextEmbedding(AdapterStep): + """ + TextEmbedding is an AdapterStep that converts text media into + embeddings using a specified sentence transformers model. 
+ """ + + def __init__( + self, + *, + model: TextEmbeddingModel, + batch_size: int = 8, + use_auth_token: str = None, + ): + """ + Initializes the TextEmbedding adapter with a sentence transformers model. + + Args: + model (TextEmbeddingModel): The sentence transformers model to use for embeddings. + batch_size (int): The number of records to encode simultaneously. + use_auth_token (str): The HuggingFace Hub auth token to use for private models. + + Raises: + MissingDependency: If the sentence_transformers library is not installed. + """ + try: + from sentence_transformers import SentenceTransformer as ST + except ImportError: + raise MissingDependency( + "Missing feature vecs[text_embedding]. Hint: `pip install 'vecs[text_embedding]'`" + ) + + self.model = ST(model, use_auth_token=use_auth_token) + self._exported_dimension = ( + self.model.get_sentence_embedding_dimension() + ) + self.batch_size = batch_size + + @property + def exported_dimension(self) -> Optional[int]: + """ + Returns the dimension of the embeddings produced by the sentence transformers model. + + Returns: + int: The dimension of the embeddings. + """ + return self._exported_dimension + + def __call__( + self, + records: Iterable[Tuple[str, Any, Optional[Dict]]], + adapter_context: AdapterContext, # pyright: ignore + ) -> Generator[Tuple[str, Any, Dict], None, None]: + """ + Converts each media in the records to an embedding and yields the result. + + Args: + records: Iterable of tuples each containing an id, a media and an optional dict. + adapter_context: Context of the adapter. + + Yields: + Tuple[str, Any, Dict]: The id, the embedding, and the metadata. + """ + for batch in flu(records).chunk(self.batch_size): + batch_records = [x for x in batch] + media = [text for _, text, _ in batch_records] + + embeddings = self.model.encode(media, normalize_embeddings=True) + + for (id, _, metadata), embedding in zip(batch_records, embeddings): # type: ignore + yield (id, embedding, metadata or {}) + + +class ParagraphChunker(AdapterStep): + """ + ParagraphChunker is an AdapterStep that splits text media into + paragraphs and yields each paragraph as a separate record. + """ + + def __init__(self, *, skip_during_query: bool): + """ + Initializes the ParagraphChunker adapter. + + Args: + skip_during_query (bool): Whether to skip chunking during querying. + """ + self.skip_during_query = skip_during_query + + def __call__( + self, + records: Iterable[Tuple[str, Any, Optional[Dict]]], + adapter_context: AdapterContext, + ) -> Generator[Tuple[str, Any, Dict], None, None]: + """ + Splits each media in the records into paragraphs and yields each paragraph + as a separate record. If the `skip_during_query` attribute is set to True, + this step is skipped during querying. + + Args: + records (Iterable[Tuple[str, Any, Optional[Dict]]]): Iterable of tuples each containing an id, a media and an optional dict. + adapter_context (AdapterContext): Context of the adapter. + + Yields: + Tuple[str, Any, Dict]: The id appended with paragraph index, the paragraph, and the metadata. 
+ """ + if ( + adapter_context == AdapterContext("query") + and self.skip_during_query + ): + for id, media, metadata in records: + yield (id, media, metadata or {}) + else: + for id, media, metadata in records: + paragraphs = media.split("\n\n") + + for paragraph_ix, paragraph in enumerate(paragraphs): + yield ( + f"{id}_para_{str(paragraph_ix).zfill(3)}", + paragraph, + metadata or {}, + ) diff --git a/R2R/r2r/vecs/client.py b/R2R/r2r/vecs/client.py new file mode 100755 index 00000000..6259f1d8 --- /dev/null +++ b/R2R/r2r/vecs/client.py @@ -0,0 +1,313 @@ +""" +Defines the 'Client' class + +Importing from the `vecs.client` directly is not supported. +All public classes, enums, and functions are re-exported by the top level `vecs` module. +""" + +from __future__ import annotations + +import logging +import time +from typing import TYPE_CHECKING, List, Optional + +import sqlalchemy +from deprecated import deprecated +from sqlalchemy import MetaData, create_engine, text +from sqlalchemy.orm import sessionmaker +from sqlalchemy.pool import QueuePool + +from .adapter import Adapter +from .exc import CollectionNotFound + +if TYPE_CHECKING: + from r2r.vecs.collection import Collection + +logger = logging.getLogger(__name__) + + +class Client: + """ + The `vecs.Client` class serves as an interface to a PostgreSQL database with pgvector support. It facilitates + the creation, retrieval, listing and deletion of vector collections, while managing connections to the + database. + + A `Client` instance represents a connection to a PostgreSQL database. This connection can be used to create + and manipulate vector collections, where each collection is a group of vector records in a PostgreSQL table. + + The `vecs.Client` class can be also supports usage as a context manager to ensure the connection to the database + is properly closed after operations, or used directly. + + Example usage: + + DB_CONNECTION = "postgresql://<user>:<password>@<host>:<port>/<db_name>" + + with vecs.create_client(DB_CONNECTION) as vx: + # do some work + pass + + # OR + + vx = vecs.create_client(DB_CONNECTION) + # do some work + vx.disconnect() + """ + + def __init__( + self, + connection_string: str, + pool_size: int = 1, + max_retries: int = 3, + retry_delay: int = 1, + ): + self.engine = create_engine( + connection_string, + pool_size=pool_size, + poolclass=QueuePool, + pool_recycle=300, # Recycle connections after 5 min + ) + self.meta = MetaData(schema="vecs") + self.Session = sessionmaker(self.engine) + self.max_retries = max_retries + self.retry_delay = retry_delay + self.vector_version: Optional[str] = None + self._initialize_database() + + def _initialize_database(self): + retries = 0 + error = None + while retries < self.max_retries: + try: + with self.Session() as sess: + with sess.begin(): + self._create_schema(sess) + self._create_extension(sess) + self._get_vector_version(sess) + return + except Exception as e: + logger.warning( + f"Database connection error: {str(e)}. Retrying in {self.retry_delay} seconds..." 
+ ) + retries += 1 + time.sleep(self.retry_delay) + error = e + + error_message = f"Failed to initialize database after {self.max_retries} retries with error: {str(error)}" + logger.error(error_message) + raise RuntimeError(error_message) + + def _create_schema(self, sess): + try: + sess.execute(text("CREATE SCHEMA IF NOT EXISTS vecs;")) + except Exception as e: + logger.warning(f"Failed to create schema: {str(e)}") + + def _create_extension(self, sess): + try: + sess.execute(text("CREATE EXTENSION IF NOT EXISTS vector;")) + sess.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm;")) + sess.execute(text("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;")) + except Exception as e: + logger.warning(f"Failed to create extension: {str(e)}") + + def _get_vector_version(self, sess): + try: + self.vector_version = sess.execute( + text( + "SELECT installed_version FROM pg_available_extensions WHERE name = 'vector' LIMIT 1;" + ) + ).scalar_one() + except sqlalchemy.exc.InternalError as e: + logger.error(f"Failed with internal alchemy error: {str(e)}") + + import psycopg2 + + if isinstance(e.orig, psycopg2.errors.InFailedSqlTransaction): + sess.rollback() + self.vector_version = sess.execute( + text( + "SELECT installed_version FROM pg_available_extensions WHERE name = 'vector' LIMIT 1;" + ) + ).scalar_one() + else: + raise e + except Exception as e: + logger.error(f"Failed to retrieve vector version: {str(e)}") + raise e + + def _supports_hnsw(self): + return ( + not self.vector_version.startswith("0.4") + and not self.vector_version.startswith("0.3") + and not self.vector_version.startswith("0.2") + and not self.vector_version.startswith("0.1") + and not self.vector_version.startswith("0.0") + ) + + def get_or_create_collection( + self, + name: str, + *, + dimension: Optional[int] = None, + adapter: Optional[Adapter] = None, + ) -> Collection: + """ + Get a vector collection by name, or create it if no collection with + *name* exists. + + Args: + name (str): The name of the collection. + + Keyword Args: + dimension (int): The dimensionality of the vectors in the collection. + pipeline (int): The dimensionality of the vectors in the collection. + + Returns: + Collection: The created collection. + + Raises: + CollectionAlreadyExists: If a collection with the same name already exists + """ + from r2r.vecs.collection import Collection + + adapter_dimension = adapter.exported_dimension if adapter else None + + collection = Collection( + name=name, + dimension=dimension or adapter_dimension, # type: ignore + client=self, + adapter=adapter, + ) + + return collection._create_if_not_exists() + + @deprecated("use Client.get_or_create_collection") + def create_collection(self, name: str, dimension: int) -> Collection: + """ + Create a new vector collection. + + Args: + name (str): The name of the collection. + dimension (int): The dimensionality of the vectors in the collection. + + Returns: + Collection: The created collection. + + Raises: + CollectionAlreadyExists: If a collection with the same name already exists + """ + from r2r.vecs.collection import Collection + + return Collection(name, dimension, self)._create() + + @deprecated("use Client.get_or_create_collection") + def get_collection(self, name: str) -> Collection: + """ + Retrieve an existing vector collection. + + Args: + name (str): The name of the collection. + + Returns: + Collection: The retrieved collection. + + Raises: + CollectionNotFound: If no collection with the given name exists. 
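+
+        Example (illustrative; this method is deprecated in favour of
+        `Client.get_or_create_collection`):
+
+            docs = vx.get_collection("docs")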
+ """ + from r2r.vecs.collection import Collection + + query = text( + f""" + select + relname as table_name, + atttypmod as embedding_dim + from + pg_class pc + join pg_attribute pa + on pc.oid = pa.attrelid + where + pc.relnamespace = 'vecs'::regnamespace + and pc.relkind = 'r' + and pa.attname = 'vec' + and not pc.relname ^@ '_' + and pc.relname = :name + """ + ).bindparams(name=name) + with self.Session() as sess: + query_result = sess.execute(query).fetchone() + + if query_result is None: + raise CollectionNotFound( + "No collection found with requested name" + ) + + name, dimension = query_result + return Collection( + name, + dimension, + self, + ) + + def list_collections(self) -> List["Collection"]: + """ + List all vector collections. + + Returns: + list[Collection]: A list of all collections. + """ + from r2r.vecs.collection import Collection + + return Collection._list_collections(self) + + def delete_collection(self, name: str) -> None: + """ + Delete a vector collection. + + If no collection with requested name exists, does nothing. + + Args: + name (str): The name of the collection. + + Returns: + None + """ + from r2r.vecs.collection import Collection + + Collection(name, -1, self)._drop() + return + + def disconnect(self) -> None: + """ + Disconnect the client from the database. + + Returns: + None + """ + self.engine.dispose() + logger.info("Disconnected from the database.") + return + + def __enter__(self) -> "Client": + """ + Enable use of the 'with' statement. + + Returns: + Client: The current instance of the Client. + """ + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + Disconnect the client on exiting the 'with' statement context. + + Args: + exc_type: The exception type, if any. + exc_val: The exception value, if any. + exc_tb: The traceback, if any. + + Returns: + None + """ + self.disconnect() + return diff --git a/R2R/r2r/vecs/collection.py b/R2R/r2r/vecs/collection.py new file mode 100755 index 00000000..2293d49b --- /dev/null +++ b/R2R/r2r/vecs/collection.py @@ -0,0 +1,1132 @@ +""" +Defines the 'Collection' class + +Importing from the `vecs.collection` directly is not supported. +All public classes, enums, and functions are re-exported by the top level `vecs` module. +""" + +from __future__ import annotations + +import math +import uuid +import warnings +from dataclasses import dataclass +from enum import Enum +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Iterable, + List, + Optional, + Tuple, + Union, +) + +import psycopg2 +from flupy import flu +from sqlalchemy import ( + Column, + MetaData, + String, + Table, + alias, + and_, + cast, + delete, + distinct, + func, + or_, + select, + text, +) +from sqlalchemy.dialects import postgresql +from sqlalchemy.types import Float, UserDefinedType + +from .adapter import Adapter, AdapterContext, NoOp +from .exc import ( + ArgError, + CollectionAlreadyExists, + CollectionNotFound, + FilterError, + MismatchedDimension, + Unreachable, +) + +if TYPE_CHECKING: + from vecs.client import Client + + +MetadataValues = Union[str, int, float, bool, List[str]] +Metadata = Dict[str, MetadataValues] +Numeric = Union[int, float, complex] +Record = Tuple[str, Iterable[Numeric], Metadata] + + +class IndexMethod(str, Enum): + """ + An enum representing the index methods available. + + This class currently only supports the 'ivfflat' method but may + expand in the future. + + Attributes: + auto (str): Automatically choose the best available index method. + ivfflat (str): The ivfflat index method. 
+ hnsw (str): The hnsw index method. + """ + + auto = "auto" + ivfflat = "ivfflat" + hnsw = "hnsw" + + +class IndexMeasure(str, Enum): + """ + An enum representing the types of distance measures available for indexing. + + Attributes: + cosine_distance (str): The cosine distance measure for indexing. + l2_distance (str): The Euclidean (L2) distance measure for indexing. + max_inner_product (str): The maximum inner product measure for indexing. + """ + + cosine_distance = "cosine_distance" + l2_distance = "l2_distance" + max_inner_product = "max_inner_product" + + +@dataclass +class IndexArgsIVFFlat: + """ + A class for arguments that can optionally be supplied to the index creation + method when building an IVFFlat type index. + + Attributes: + nlist (int): The number of IVF centroids that the index should use + """ + + n_lists: int + + +@dataclass +class IndexArgsHNSW: + """ + A class for arguments that can optionally be supplied to the index creation + method when building an HNSW type index. + + Ref: https://github.com/pgvector/pgvector#index-options + + Both attributes are Optional in case the user only wants to specify one and + leave the other as default + + Attributes: + m (int): Maximum number of connections per node per layer (default: 16) + ef_construction (int): Size of the dynamic candidate list for + constructing the graph (default: 64) + """ + + m: Optional[int] = 16 + ef_construction: Optional[int] = 64 + + +INDEX_MEASURE_TO_OPS = { + # Maps the IndexMeasure enum options to the SQL ops string required by + # the pgvector `create index` statement + IndexMeasure.cosine_distance: "vector_cosine_ops", + IndexMeasure.l2_distance: "vector_l2_ops", + IndexMeasure.max_inner_product: "vector_ip_ops", +} + +INDEX_MEASURE_TO_SQLA_ACC = { + IndexMeasure.cosine_distance: lambda x: x.cosine_distance, + IndexMeasure.l2_distance: lambda x: x.l2_distance, + IndexMeasure.max_inner_product: lambda x: x.max_inner_product, +} + + +class Vector(UserDefinedType): + cache_ok = True + + def __init__(self, dim=None): + super(UserDefinedType, self).__init__() + self.dim = dim + + def get_col_spec(self, **kw): + return "VECTOR" if self.dim is None else f"VECTOR({self.dim})" + + def bind_processor(self, dialect): + def process(value): + if value is None: + return value + if not isinstance(value, list): + raise ValueError("Expected a list") + if self.dim is not None and len(value) != self.dim: + raise ValueError( + f"Expected {self.dim} dimensions, not {len(value)}" + ) + return "[" + ",".join(str(float(v)) for v in value) + "]" + + return process + + def result_processor(self, dialect, coltype): + return lambda value: ( + value + if value is None + else [float(v) for v in value[1:-1].split(",")] + ) + + class comparator_factory(UserDefinedType.Comparator): + def l2_distance(self, other): + return self.op("<->", return_type=Float)(other) + + def max_inner_product(self, other): + return self.op("<#>", return_type=Float)(other) + + def cosine_distance(self, other): + return self.op("<=>", return_type=Float)(other) + + +class Collection: + """ + The `vecs.Collection` class represents a collection of vectors within a PostgreSQL database with pgvector support. + It provides methods to manage (create, delete, fetch, upsert), index, and perform similarity searches on these vector collections. + + The collections are stored in separate tables in the database, with each vector associated with an identifier and optional metadata. 
+ + Example usage: + + with vecs.create_client(DB_CONNECTION) as vx: + collection = vx.create_collection(name="docs", dimension=3) + collection.upsert([("id1", [1, 1, 1], {"key": "value"})]) + # Further operations on 'collection' + + Public Attributes: + name: The name of the vector collection. + dimension: The dimension of vectors in the collection. + + Note: Some methods of this class can raise exceptions from the `vecs.exc` module if errors occur. + """ + + def __init__( + self, + name: str, + dimension: int, + client: Client, + adapter: Optional[Adapter] = None, + ): + """ + Initializes a new instance of the `Collection` class. + + During expected use, developers initialize instances of `Collection` using the + `vecs.Client` with `vecs.Client.create_collection(...)` rather than directly. + + Args: + name (str): The name of the collection. + dimension (int): The dimension of the vectors in the collection. + client (Client): The client to use for interacting with the database. + """ + from r2r.vecs.adapter import Adapter + + self.client = client + self.name = name + self.dimension = dimension + self.table = build_table(name, client.meta, dimension) + self._index: Optional[str] = None + self.adapter = adapter or Adapter(steps=[NoOp(dimension=dimension)]) + + reported_dimensions = set( + [ + x + for x in [ + dimension, + adapter.exported_dimension if adapter else None, + ] + if x is not None + ] + ) + if len(reported_dimensions) == 0: + raise ArgError( + "One of dimension or adapter must provide a dimension" + ) + elif len(reported_dimensions) > 1: + raise MismatchedDimension( + "Mismatch in the reported dimensions of the selected vector collection and embedding model. Correct the selected embedding model or specify a new vector collection by modifying the `POSTGRES_VECS_COLLECTION` environment variable." + ) + + def __repr__(self): + """ + Returns a string representation of the `Collection` instance. + + Returns: + str: A string representation of the `Collection` instance. + """ + return ( + f'vecs.Collection(name="{self.name}", dimension={self.dimension})' + ) + + def __len__(self) -> int: + """ + Returns the number of vectors in the collection. + + Returns: + int: The number of vectors in the collection. + """ + with self.client.Session() as sess: + with sess.begin(): + stmt = select(func.count()).select_from(self.table) + return sess.execute(stmt).scalar() or 0 + + def _create_if_not_exists(self): + """ + PRIVATE + + Creates a new collection in the database if it doesn't already exist + + Returns: + Collection: The found or created collection. + """ + query = text( + f""" + select + relname as table_name, + atttypmod as embedding_dim + from + pg_class pc + join pg_attribute pa + on pc.oid = pa.attrelid + where + pc.relnamespace = 'vecs'::regnamespace + and pc.relkind = 'r' + and pa.attname = 'vec' + and not pc.relname ^@ '_' + and pc.relname = :name + """ + ).bindparams(name=self.name) + with self.client.Session() as sess: + query_result = sess.execute(query).fetchone() + + if query_result: + _, collection_dimension = query_result + else: + collection_dimension = None + + reported_dimensions = set( + [ + x + for x in [self.dimension, collection_dimension] + if x is not None + ] + ) + if len(reported_dimensions) > 1: + raise MismatchedDimension( + "Dimensions reported by adapter, dimension, and collection do not match. The likely cause of this is a mismatch between the dimensions of the selected vector collection and embedding model. 
Select the correct embedding model, or specify a new vector collection by modifying your `POSTGRES_VECS_COLLECTION` environment variable. If the selected colelction does not exist then it will be automatically with dimensions that match the selected embedding model." + ) + + if not collection_dimension: + self.table.create(self.client.engine) + + return self + + def _create(self): + """ + PRIVATE + + Creates a new collection in the database. Raises a `vecs.exc.CollectionAlreadyExists` + exception if a collection with the specified name already exists. + + Returns: + Collection: The newly created collection. + """ + + collection_exists = self.__class__._does_collection_exist( + self.client, self.name + ) + if collection_exists: + raise CollectionAlreadyExists( + "Collection with requested name already exists" + ) + self.table.create(self.client.engine) + + unique_string = str(uuid.uuid4()).replace("-", "_")[0:7] + with self.client.Session() as sess: + sess.execute( + text( + f""" + create index ix_meta_{unique_string} + on vecs."{self.table.name}" + using gin ( metadata jsonb_path_ops ) + """ + ) + ) + return self + + def _drop(self): + """ + PRIVATE + + Deletes the collection from the database. Raises a `vecs.exc.CollectionNotFound` + exception if no collection with the specified name exists. + + Returns: + Collection: The deleted collection. + """ + with self.client.Session() as sess: + sess.execute(text(f"DROP TABLE IF EXISTS {self.name} CASCADE")) + sess.commit() + + return self + + def get_unique_metadata_values( + self, + field: str, + filter_field: Optional[str] = None, + filter_value: Optional[MetadataValues] = None, + ) -> List[MetadataValues]: + """ + Fetches all unique metadata values of a specific field, optionally filtered by another metadata field. + Args: + field (str): The metadata field for which to fetch unique values. + filter_field (Optional[str], optional): The metadata field to filter on. Defaults to None. + filter_value (Optional[MetadataValues], optional): The value to filter the metadata field with. Defaults to None. + Returns: + List[MetadataValues]: A list of unique metadata values for the specified field. + """ + with self.client.Session() as sess: + with sess.begin(): + stmt = select( + distinct(self.table.c.metadata[field].astext) + ).where(self.table.c.metadata[field] != None) + + if filter_field is not None and filter_value is not None: + stmt = stmt.where( + self.table.c.metadata[filter_field].astext + == str(filter_value) + ) + + result = sess.execute(stmt) + unique_values = result.scalars().all() + + return unique_values + + def copy( + self, + records: Iterable[Tuple[str, Any, Metadata]], + skip_adapter: bool = False, + ) -> None: + """ + Copies records into the collection. + + Args: + records (Iterable[Tuple[str, Any, Metadata]]): An iterable of content to copy. + Each record is a tuple where: + - the first element is a unique string identifier + - the second element is an iterable of numeric values or relevant input type for the + adapter assigned to the collection + - the third element is metadata associated with the vector + + skip_adapter (bool): Should the adapter be skipped while copying. i.e. 
if vectors are being + provided, rather than a media type that needs to be transformed + """ + import csv + import io + import json + import os + + pipeline = flu(records) + for record in pipeline: + with psycopg2.connect( + database=os.getenv("POSTGRES_DBNAME"), + user=os.getenv("POSTGRES_USER"), + password=os.getenv("POSTGRES_PASSWORD"), + host=os.getenv("POSTGRES_HOST"), + port=os.getenv("POSTGRES_PORT"), + ) as conn: + with conn.cursor() as cur: + f = io.StringIO() + id, vec, metadata = record + + writer = csv.writer(f, delimiter=",", quotechar='"') + writer.writerow( + [ + str(id), + [float(ele) for ele in vec], + json.dumps(metadata), + ] + ) + f.seek(0) + result = f.getvalue() + + writer_name = ( + f'vecs."{self.table.fullname.split(".")[-1]}"' + ) + g = io.StringIO(result) + cur.copy_expert( + f"COPY {writer_name}(id, vec, metadata) FROM STDIN WITH (FORMAT csv)", + g, + ) + conn.commit() + cur.close() + conn.close() + + def upsert( + self, + records: Iterable[Tuple[str, Any, Metadata]], + skip_adapter: bool = False, + ) -> None: + """ + Inserts or updates *vectors* records in the collection. + + Args: + records (Iterable[Tuple[str, Any, Metadata]]): An iterable of content to upsert. + Each record is a tuple where: + - the first element is a unique string identifier + - the second element is an iterable of numeric values or relevant input type for the + adapter assigned to the collection + - the third element is metadata associated with the vector + + skip_adapter (bool): Should the adapter be skipped while upserting. i.e. if vectors are being + provided, rather than a media type that needs to be transformed + """ + + chunk_size = 512 + + if skip_adapter: + pipeline = flu(records).chunk(chunk_size) + else: + # Construct a lazy pipeline of steps to transform and chunk user input + pipeline = flu( + self.adapter(records, AdapterContext("upsert")) + ).chunk(chunk_size) + + with self.client.Session() as sess: + with sess.begin(): + for chunk in pipeline: + stmt = postgresql.insert(self.table).values(chunk) + stmt = stmt.on_conflict_do_update( + index_elements=[self.table.c.id], + set_=dict( + vec=stmt.excluded.vec, + metadata=stmt.excluded.metadata, + ), + ) + sess.execute(stmt) + return None + + def fetch(self, ids: Iterable[str]) -> List[Record]: + """ + Fetches vectors from the collection by their identifiers. + + Args: + ids (Iterable[str]): An iterable of vector identifiers. + + Returns: + List[Record]: A list of the fetched vectors. + """ + if isinstance(ids, str): + raise ArgError("ids must be a list of strings") + + chunk_size = 12 + records = [] + with self.client.Session() as sess: + with sess.begin(): + for id_chunk in flu(ids).chunk(chunk_size): + stmt = select(self.table).where( + self.table.c.id.in_(id_chunk) + ) + chunk_records = sess.execute(stmt) + records.extend(chunk_records) + return records + + def delete( + self, + ids: Optional[Iterable[str]] = None, + filters: Optional[Dict[str, Any]] = None, + ) -> List[str]: + """ + Deletes vectors from the collection by matching filters or ids. + + Args: + ids (Iterable[str], optional): An iterable of vector identifiers. + filters (Optional[Dict], optional): Filters to apply to the search. Defaults to None. + + Returns: + List[str]: A list of the document IDs of the deleted vectors. 
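+
+        Example (illustrative; pass either ids or filters, not both):
+
+            collection.delete(ids=["vec_1", "vec_2"])
+            collection.delete(filters={"document_id": {"$eq": "doc_1"}})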
+ """ + if ids is None and filters is None: + raise ArgError("Either ids or filters must be provided.") + + if ids is not None and filters is not None: + raise ArgError("Either ids or filters must be provided, not both.") + + if isinstance(ids, str): + raise ArgError("ids must be a list of strings") + + ids = ids or [] + filters = filters or {} + del_document_ids = set([]) + + with self.client.Session() as sess: + with sess.begin(): + if ids: + for id_chunk in flu(ids).chunk(12): + stmt = select(self.table.c.metadata).where( + self.table.c.id.in_(id_chunk) + ) + results = sess.execute(stmt).fetchall() + for result in results: + metadata_json = result[0] + document_id = metadata_json.get("document_id") + if document_id: + del_document_ids.add(document_id) + + delete_stmt = ( + delete(self.table) + .where(self.table.c.id.in_(id_chunk)) + .returning(self.table.c.id) + ) + sess.execute(delete_stmt) + + if filters: + meta_filter = build_filters(self.table.c.metadata, filters) + stmt = select(self.table.c.metadata).where(meta_filter) + results = sess.execute(stmt).fetchall() + for result in results: + metadata_json = result[0] + document_id = metadata_json.get("document_id") + if document_id: + del_document_ids.add(document_id) + + delete_stmt = ( + delete(self.table) + .where(meta_filter) + .returning(self.table.c.id) + ) + sess.execute(delete_stmt) + + return list(del_document_ids) + + def __getitem__(self, items): + """ + Fetches a vector from the collection by its identifier. + + Args: + items (str): The identifier of the vector. + + Returns: + Record: The fetched vector. + """ + if not isinstance(items, str): + raise ArgError("items must be a string id") + + row = self.fetch([items]) + + if row == []: + raise KeyError("no item found with requested id") + return row[0] + + def query( + self, + data: Union[Iterable[Numeric], Any], + limit: int = 10, + filters: Optional[Dict] = None, + measure: Union[IndexMeasure, str] = IndexMeasure.cosine_distance, + include_value: bool = False, + include_metadata: bool = False, + *, + probes: Optional[int] = None, + ef_search: Optional[int] = None, + skip_adapter: bool = False, + ) -> Union[List[Record], List[str]]: + """ + Executes a similarity search in the collection. + + The return type is dependent on arguments *include_value* and *include_metadata* + + Args: + data (Any): The vector to use as the query. + limit (int, optional): The maximum number of results to return. Defaults to 10. + filters (Optional[Dict], optional): Filters to apply to the search. Defaults to None. + measure (Union[IndexMeasure, str], optional): The distance measure to use for the search. Defaults to 'cosine_distance'. + include_value (bool, optional): Whether to include the distance value in the results. Defaults to False. + include_metadata (bool, optional): Whether to include the metadata in the results. Defaults to False. + probes (Optional[Int], optional): Number of ivfflat index lists to query. Higher increases accuracy but decreases speed + ef_search (Optional[Int], optional): Size of the dynamic candidate list for HNSW index search. Higher increases accuracy but decreases speed + skip_adapter (bool, optional): When True, skips any associated adapter and queries using a literal vector provided to *data* + + Returns: + Union[List[Record], List[str]]: The result of the similarity search. 
+ """ + + if probes is None: + probes = 10 + + if ef_search is None: + ef_search = 40 + + if not isinstance(probes, int): + raise ArgError("probes must be an integer") + + if probes < 1: + raise ArgError("probes must be >= 1") + + if limit > 1000: + raise ArgError("limit must be <= 1000") + + # ValueError on bad input + try: + imeasure = IndexMeasure(measure) + except ValueError: + raise ArgError("Invalid index measure") + + if not self.is_indexed_for_measure(imeasure): + warnings.warn( + UserWarning( + f"Query does not have a covering index for {imeasure}. See Collection.create_index" + ) + ) + + if skip_adapter: + adapted_query = [("", data, {})] + else: + # Adapt the query using the pipeline + adapted_query = [ + x + for x in self.adapter( + records=[("", data, {})], + adapter_context=AdapterContext("query"), + ) + ] + + if len(adapted_query) != 1: + raise ArgError( + "Failed to produce exactly one query vector from input" + ) + + _, vec, _ = adapted_query[0] + + distance_lambda = INDEX_MEASURE_TO_SQLA_ACC.get(imeasure) + if distance_lambda is None: + # unreachable + raise ArgError("invalid distance_measure") # pragma: no cover + + distance_clause = distance_lambda(self.table.c.vec)(vec) + + cols = [self.table.c.id] + + if include_value: + cols.append(distance_clause) + + if include_metadata: + cols.append(self.table.c.metadata) + + stmt = select(*cols) + if filters: + stmt = stmt.filter( + build_filters(self.table.c.metadata, filters) # type: ignore + ) + + stmt = stmt.order_by(distance_clause) + stmt = stmt.limit(limit) + + with self.client.Session() as sess: + with sess.begin(): + # index ignored if greater than n_lists + sess.execute( + text("set local ivfflat.probes = :probes").bindparams( + probes=probes + ) + ) + if self.client._supports_hnsw(): + sess.execute( + text( + "set local hnsw.ef_search = :ef_search" + ).bindparams(ef_search=ef_search) + ) + if len(cols) == 1: + return [str(x) for x in sess.scalars(stmt).fetchall()] + return sess.execute(stmt).fetchall() or [] + + @classmethod + def _list_collections(cls, client: "Client") -> List["Collection"]: + """ + PRIVATE + + Retrieves all collections from the database. + + Args: + client (Client): The database client. + + Returns: + List[Collection]: A list of all existing collections. + """ + + query = text( + """ + select + relname as table_name, + atttypmod as embedding_dim + from + pg_class pc + join pg_attribute pa + on pc.oid = pa.attrelid + where + pc.relnamespace = 'vecs'::regnamespace + and pc.relkind = 'r' + and pa.attname = 'vec' + and not pc.relname ^@ '_' + """ + ) + xc = [] + with client.Session() as sess: + for name, dimension in sess.execute(query): + existing_collection = cls(name, dimension, client) + xc.append(existing_collection) + return xc + + @classmethod + def _does_collection_exist(cls, client: "Client", name: str) -> bool: + """ + PRIVATE + + Checks if a collection with a given name exists within the database + + Args: + client (Client): The database client. + name (str): The name of the collection + + Returns: + Exists: Whether the collection exists or not + """ + + try: + client.get_collection(name) + return True + except CollectionNotFound: + return False + + @property + def index(self) -> Optional[str]: + """ + PRIVATE + + Note: + The `index` property is private and expected to undergo refactoring. + Do not rely on it's output. + + Retrieves the SQL name of the collection's vector index, if it exists. + + Returns: + Optional[str]: The name of the index, or None if no index exists. 
+ """ + + if self._index is None: + query = text( + """ + select + relname as table_name + from + pg_class pc + where + pc.relnamespace = 'vecs'::regnamespace + and relname ilike 'ix_vector%' + and pc.relkind = 'i' + """ + ) + with self.client.Session() as sess: + ix_name = sess.execute(query).scalar() + self._index = ix_name + return self._index + + def is_indexed_for_measure(self, measure: IndexMeasure): + """ + Checks if the collection is indexed for a specific measure. + + Args: + measure (IndexMeasure): The measure to check for. + + Returns: + bool: True if the collection is indexed for the measure, False otherwise. + """ + + index_name = self.index + if index_name is None: + return False + + ops = INDEX_MEASURE_TO_OPS.get(measure) + if ops is None: + return False + + if ops in index_name: + return True + + return False + + def create_index( + self, + measure: IndexMeasure = IndexMeasure.cosine_distance, + method: IndexMethod = IndexMethod.auto, + index_arguments: Optional[ + Union[IndexArgsIVFFlat, IndexArgsHNSW] + ] = None, + replace=True, + ) -> None: + """ + Creates an index for the collection. + + Note: + When `vecs` creates an index on a pgvector column in PostgreSQL, it uses a multi-step + process that enables performant indexes to be built for large collections with low end + database hardware. + + Those steps are: + + - Creates a new table with a different name + - Randomly selects records from the existing table + - Inserts the random records from the existing table into the new table + - Creates the requested vector index on the new table + - Upserts all data from the existing table into the new table + - Drops the existing table + - Renames the new table to the existing tables name + + If you create dependencies (like views) on the table that underpins + a `vecs.Collection` the `create_index` step may require you to drop those dependencies before + it will succeed. + + Args: + measure (IndexMeasure, optional): The measure to index for. Defaults to 'cosine_distance'. + method (IndexMethod, optional): The indexing method to use. Defaults to 'auto'. + index_arguments: (IndexArgsIVFFlat | IndexArgsHNSW, optional): Index type specific arguments + replace (bool, optional): Whether to replace the existing index. Defaults to True. + + Raises: + ArgError: If an invalid index method is used, or if *replace* is False and an index already exists. + """ + + if method not in ( + IndexMethod.ivfflat, + IndexMethod.hnsw, + IndexMethod.auto, + ): + raise ArgError("invalid index method") + + if index_arguments: + # Disallow case where user submits index arguments but uses the + # IndexMethod.auto index (index build arguments should only be + # used with a specific index) + if method == IndexMethod.auto: + raise ArgError( + "Index build parameters are not allowed when using the IndexMethod.auto index." + ) + # Disallow case where user specifies one index type but submits + # index build arguments for the other index type + if ( + isinstance(index_arguments, IndexArgsHNSW) + and method != IndexMethod.hnsw + ) or ( + isinstance(index_arguments, IndexArgsIVFFlat) + and method != IndexMethod.ivfflat + ): + raise ArgError( + f"{index_arguments.__class__.__name__} build parameters were supplied but {method} index was specified." + ) + + if method == IndexMethod.auto: + if self.client._supports_hnsw(): + method = IndexMethod.hnsw + else: + method = IndexMethod.ivfflat + + if method == IndexMethod.hnsw and not self.client._supports_hnsw(): + raise ArgError( + "HNSW Unavailable. 
Upgrade your pgvector installation to > 0.5.0 to enable HNSW support" + ) + + ops = INDEX_MEASURE_TO_OPS.get(measure) + if ops is None: + raise ArgError("Unknown index measure") + + unique_string = str(uuid.uuid4()).replace("-", "_")[0:7] + + with self.client.Session() as sess: + with sess.begin(): + if self.index is not None: + if replace: + sess.execute(text(f'drop index vecs."{self.index}";')) + self._index = None + else: + raise ArgError( + "replace is set to False but an index exists" + ) + + if method == IndexMethod.ivfflat: + if not index_arguments: + n_records: int = sess.execute(func.count(self.table.c.id)).scalar() # type: ignore + + n_lists = ( + int(max(n_records / 1000, 30)) + if n_records < 1_000_000 + else int(math.sqrt(n_records)) + ) + else: + # The following mypy error is ignored because mypy + # complains that `index_arguments` is typed as a union + # of IndexArgsIVFFlat and IndexArgsHNSW types, + # which both don't necessarily contain the `n_lists` + # parameter, however we have validated that the + # correct type is being used above. + n_lists = index_arguments.n_lists # type: ignore + + sess.execute( + text( + f""" + create index ix_{ops}_ivfflat_nl{n_lists}_{unique_string} + on vecs."{self.table.name}" + using ivfflat (vec {ops}) with (lists={n_lists}) + """ + ) + ) + + if method == IndexMethod.hnsw: + if not index_arguments: + index_arguments = IndexArgsHNSW() + + # See above for explanation of why the following lines + # are ignored + m = index_arguments.m # type: ignore + ef_construction = index_arguments.ef_construction # type: ignore + + sess.execute( + text( + f""" + create index ix_{ops}_hnsw_m{m}_efc{ef_construction}_{unique_string} + on vecs."{self.table.name}" + using hnsw (vec {ops}) WITH (m={m}, ef_construction={ef_construction}); + """ + ) + ) + + return None + + +def build_filters(json_col: Column, filters: Dict): + """ + Builds filters for SQL query based on provided dictionary. + + Args: + json_col (Column): The column in the database table. + filters (Dict): The dictionary specifying filter conditions. + + Raises: + FilterError: If filter conditions are not correctly formatted. + + Returns: + The filter clause for the SQL query. 
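+
+    Example (illustrative; assumes `table` is a collection's SQLAlchemy table):
+
+        build_filters(table.c.metadata, {"category": {"$eq": "news"}})
+        build_filters(table.c.metadata, {"year": {"$gte": 2020}})
+        # each key takes exactly one operator: $eq, $ne, $lt, $lte, $gt, $gte or $in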
+ """ + if not isinstance(filters, dict): + raise FilterError("filters must be a dict") + + filter_clauses = [] + + for key, value in filters.items(): + if not isinstance(key, str): + raise FilterError("*filters* keys must be strings") + + if isinstance(value, dict): + if len(value) > 1: + raise FilterError("only one operator permitted per key") + for operator, clause in value.items(): + if operator not in ( + "$eq", + "$ne", + "$lt", + "$lte", + "$gt", + "$gte", + "$in", + ): + raise FilterError("unknown operator") + + if operator == "$eq" and not hasattr(clause, "__len__"): + contains_value = cast({key: clause}, postgresql.JSONB) + filter_clauses.append(json_col.op("@>")(contains_value)) + elif operator == "$in": + if not isinstance(clause, list): + raise FilterError( + "argument to $in filter must be a list" + ) + for elem in clause: + if not isinstance(elem, (int, str, float)): + raise FilterError( + "argument to $in filter must be a list of scalars" + ) + contains_value = [ + cast(elem, postgresql.JSONB) for elem in clause + ] + filter_clauses.append( + json_col.op("->")(key).in_(contains_value) + ) + else: + matches_value = cast(clause, postgresql.JSONB) + if operator == "$eq": + filter_clauses.append( + json_col.op("->")(key) == matches_value + ) + elif operator == "$ne": + filter_clauses.append( + json_col.op("->")(key) != matches_value + ) + elif operator == "$lt": + filter_clauses.append( + json_col.op("->")(key) < matches_value + ) + elif operator == "$lte": + filter_clauses.append( + json_col.op("->")(key) <= matches_value + ) + elif operator == "$gt": + filter_clauses.append( + json_col.op("->")(key) > matches_value + ) + elif operator == "$gte": + filter_clauses.append( + json_col.op("->")(key) >= matches_value + ) + else: + raise Unreachable() + else: + raise FilterError("Filter value must be a dict with an operator") + + if len(filter_clauses) == 1: + return filter_clauses[0] + else: + return and_(*filter_clauses) + + +def build_table(name: str, meta: MetaData, dimension: int) -> Table: + """ + PRIVATE + + Builds a SQLAlchemy model underpinning a `vecs.Collection`. + + Args: + name (str): The name of the table. + meta (MetaData): MetaData instance associated with the SQL database. + dimension: The dimension of the vectors in the collection. + + Returns: + Table: The constructed SQL table. + """ + return Table( + name, + meta, + Column("id", String, primary_key=True), + Column("vec", Vector(dimension), nullable=False), + Column( + "metadata", + postgresql.JSONB, + server_default=text("'{}'::jsonb"), + nullable=False, + ), + extend_existing=True, + ) diff --git a/R2R/r2r/vecs/exc.py b/R2R/r2r/vecs/exc.py new file mode 100755 index 00000000..0ae4500c --- /dev/null +++ b/R2R/r2r/vecs/exc.py @@ -0,0 +1,83 @@ +__all__ = [ + "VecsException", + "CollectionAlreadyExists", + "CollectionNotFound", + "ArgError", + "FilterError", + "IndexNotFound", + "Unreachable", +] + + +class VecsException(Exception): + """ + Base exception class for the 'vecs' package. + All custom exceptions in the 'vecs' package should derive from this class. + """ + + ... + + +class CollectionAlreadyExists(VecsException): + """ + Exception raised when attempting to create a collection that already exists. + """ + + ... + + +class CollectionNotFound(VecsException): + """ + Exception raised when attempting to access or manipulate a collection that does not exist. + """ + + ... + + +class ArgError(VecsException): + """ + Exception raised for invalid arguments when calling a method. + """ + + ... 
+ + +class MismatchedDimension(ArgError): + """ + Exception raised when multiple sources of truth for a collection's embedding dimension do not match. + """ + + ... + + +class FilterError(VecsException): + """ + Exception raised when there's an error related to filter usage in a query. + """ + + ... + + +class IndexNotFound(VecsException): + """ + Exception raised when attempting to access an index that does not exist. + """ + + ... + + +class Unreachable(VecsException): + """ + Exception raised when an unreachable part of the code is executed. + This is typically used for error handling in cases that should be logically impossible. + """ + + ... + + +class MissingDependency(VecsException, ImportError): + """ + Exception raised when attempting to access a feature that requires an optional dependency when the optional dependency is not present. + """ + + ...
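The vecs client, collection, and adapter modules added above fit together roughly as follows. This is a minimal illustrative sketch rather than part of the diff; the connection string, collection name, vectors, and metadata are placeholder values:

    import r2r.vecs as vecs

    # connect and create (or fetch) a 3-dimensional collection
    vx = vecs.create_client("postgresql://user:password@localhost:5432/mydb")
    docs = vx.get_or_create_collection(name="docs", dimension=3)

    # upsert a few vectors with metadata, then build a cosine-distance index
    docs.upsert(
        [
            ("vec0", [0.1, 0.2, 0.3], {"year": 2012}),
            ("vec1", [0.7, 0.8, 0.9], {"year": 2013}),
        ]
    )
    docs.create_index(measure=vecs.IndexMeasure.cosine_distance)

    # similarity search returning ids and metadata for the closest match
    results = docs.query(data=[0.15, 0.25, 0.35], limit=1, include_metadata=True)

    vx.disconnect()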