diff options
Diffstat (limited to 'R2R/r2r/integrations')
-rwxr-xr-x | R2R/r2r/integrations/__init__.py | 3 | ||||
-rwxr-xr-x | R2R/r2r/integrations/serper.py | 103 |
2 files changed, 106 insertions, 0 deletions
# Reconstructed from the diff that adds R2R/r2r/integrations/serper.py.
#
# The same diff also adds R2R/r2r/integrations/__init__.py, whose entire
# content is the package re-export:
#
#     from .serper import SerperClient
#
#     __all__ = ["SerperClient"]

import http.client
import json
import os


# TODO - Move process json to dedicated data processing module
def process_json(json_object, indent=0):
    """
    Recursively traverses the JSON object (dicts and lists) to create an unstructured text blob.

    Dict entries are rendered as ``key: value`` and list entries as
    ``Item <n>: value`` (1-based). Nested dicts/lists are rendered on the
    following lines with one extra level of padding. Any other input type
    produces an empty string.

    Args:
        json_object: A dict or list (typically decoded JSON).
        indent: Current nesting depth; controls the leading padding.

    Returns:
        The rendered text, with every entry terminated by a newline.
    """
    # NOTE(review): the indent unit is a single space as rendered in the
    # source this was recovered from; whitespace may have been collapsed
    # there -- confirm against the repository.
    padding = " " * indent

    if isinstance(json_object, dict):
        entries = json_object.items()
    elif isinstance(json_object, list):
        entries = (
            (f"Item {position}", item)
            for position, item in enumerate(json_object, start=1)
        )
    else:
        return ""

    parts = []
    for label, value in entries:
        if isinstance(value, (dict, list)):
            parts.append(
                f"{padding}{label}:\n{process_json(value, indent + 1)}"
            )
        else:
            parts.append(f"{padding}{label}: {value}\n")
    return "".join(parts)


# TODO - Introduce abstract "Integration" ABC.
class SerperClient:
    """Minimal client for the Serper search API (``POST /search``)."""

    def __init__(self, api_base: str = "google.serper.dev") -> None:
        """
        Read the API key from the environment and prepare request headers.

        Args:
            api_base: Host name the HTTPS connection is opened against.

        Raises:
            ValueError: If ``SERPER_API_KEY`` is unset or empty.
        """
        api_key = os.getenv("SERPER_API_KEY")
        if not api_key:
            raise ValueError(
                "Please set the `SERPER_API_KEY` environment variable to use `SerperClient`."
            )

        self.api_base = api_base
        self.headers = {
            "X-API-KEY": api_key,
            "Content-Type": "application/json",
        }

    @staticmethod
    def _extract_results(result_data: dict) -> list:
        """
        Flatten a decoded search response into a list of result dicts.

        Every top-level key except ``searchParameters`` is inspected. A dict
        value (e.g. ``answerBox``) yields one result; a list value yields one
        result per dict element. Each result is a shallow copy of the
        original entry with a ``"type"`` key set to the top-level key it came
        from, so the caller's ``result_data`` is never modified. Scalar
        values and non-dict list elements are skipped.

        Args:
            result_data: The decoded JSON response body.

        Returns:
            A list of dicts, in response order, each tagged with ``"type"``.
        """
        formatted_results = []

        for key, value in result_data.items():
            # Skip searchParameters as it's not a result entry
            if key == "searchParameters":
                continue

            if isinstance(value, dict):
                entries = [value]
            elif isinstance(value, list):
                entries = value
            else:
                continue

            for entry in entries:
                # Only dict entries can carry the type tag.
                if isinstance(entry, dict):
                    formatted_results.append({**entry, "type": key})

        return formatted_results

    # TODO - Add explicit typing for the return value
    def get_raw(self, query: str, limit: int = 10) -> list:
        """
        Run a search and return the flattened, type-tagged results.

        Args:
            query: The search query string.
            limit: Number of results to request (sent as Serper's ``num``).

        Returns:
            The list produced by ``_extract_results`` for the response body.
        """
        connection = http.client.HTTPSConnection(self.api_base)
        try:
            # Serper's request parameter for the result count is "num".
            payload = json.dumps({"q": query, "num": limit})
            connection.request("POST", "/search", payload, self.headers)
            response = connection.getresponse()
            data = response.read()
        finally:
            # Always release the socket, even if the request fails.
            connection.close()

        json_data = json.loads(data.decode("utf-8"))
        return SerperClient._extract_results(json_data)

    @staticmethod
    def construct_context(results: list) -> str:
        """
        Render results as a text context grouped by result type.

        Args:
            results: Objects exposing a dict-like ``metadata`` attribute. The
                ``"type"`` entry of the metadata (default ``"Unknown"``)
                selects the group; it is not rendered in the item body.

        Returns:
            One ``# <type> Results:`` section per type, in first-seen order,
            each listing its items as ``Item <n>:`` followed by the
            ``process_json`` rendering of the remaining metadata.
        """
        # Organize results by type
        organized_results = {}
        for result in results:
            # Work on a copy so the caller's metadata keeps its "type" key.
            metadata = dict(result.metadata)
            result_type = metadata.pop("type", "Unknown")
            organized_results.setdefault(result_type, []).append(metadata)

        sections = []
        # Iterate over each result type
        for result_type, items in organized_results.items():
            sections.append(f"# {result_type} Results:\n")
            for index, item in enumerate(items, start=1):
                # Process each item under the current type
                sections.append(f"Item {index}:\n")
                sections.append(process_json(item) + "\n")

        return "".join(sections)