# R2R/r2r/integrations/serper.py

import http.client
import json
import os


# TODO - Move process json to dedicated data processing module
def process_json(json_object, indent=0):
    """
    Flatten a decoded JSON value into an indented, human-readable text blob.

    Dict entries are rendered as ``key: value`` lines and list entries as
    ``Item N: value`` lines (N is 1-based). Nested dicts and lists are
    rendered on the lines that follow their label, indented by two extra
    spaces per nesting level.

    Args:
        json_object: The value to render. Anything that is neither a dict
            nor a list produces an empty string.
        indent: Nesting depth of ``json_object``; each level contributes two
            spaces of left padding.

    Returns:
        The rendered text, with every emitted line terminated by a newline.
    """
    # Normalise both container kinds into a stream of (label, child) pairs so
    # that a single loop can render either of them.
    if isinstance(json_object, dict):
        labelled = json_object.items()
    elif isinstance(json_object, list):
        labelled = (
            (f"Item {position}", entry)
            for position, entry in enumerate(json_object, start=1)
        )
    else:
        # Scalars have no label at this level, so there is nothing to emit.
        return ""

    prefix = "  " * indent
    pieces = []
    for label, child in labelled:
        if isinstance(child, (dict, list)):
            # Containers get a header line; their contents follow one level
            # deeper.
            pieces.append(f"{prefix}{label}:\n")
            pieces.append(process_json(child, indent + 1))
        else:
            pieces.append(f"{prefix}{label}: {child}\n")
    return "".join(pieces)


# TODO - Introduce abstract "Integration" ABC.
class SerperClient:
    """
    Small HTTPS client for a Serper-style search endpoint.

    The API key is read from the ``SERPER_API_KEY`` environment variable at
    construction time and sent in the ``X-API-KEY`` header of every request.
    """

    def __init__(self, api_base: str = "google.serper.dev") -> None:
        """
        Build the client and its request headers.

        Args:
            api_base: Host name the HTTPS connection is opened against.

        Raises:
            ValueError: If ``SERPER_API_KEY`` is unset or empty.
        """
        api_key = os.getenv("SERPER_API_KEY")
        if not api_key:
            raise ValueError(
                "Please set the `SERPER_API_KEY` environment variable to use `SerperClient`."
            )

        self.api_base = api_base
        self.headers = {
            "X-API-KEY": api_key,
            "Content-Type": "application/json",
        }

    @staticmethod
    def _extract_results(result_data: dict) -> list:
        """
        Flatten a decoded search response into a list of tagged result dicts.

        Every top-level section except ``searchParameters`` is visited. A
        section holding a dict (e.g. ``answerBox``) contributes one entry; a
        section holding a list contributes one entry per dict element. Each
        entry is a shallow copy of the source dict with ``"type"`` set to the
        section name. Top-level scalars and non-dict list elements carry no
        result fields and are skipped.

        Args:
            result_data: The decoded JSON response body.

        Returns:
            A new list of result dicts; ``result_data`` is left unmodified.
        """
        formatted_results = []

        for key, value in result_data.items():
            # Skip searchParameters as it's not a result entry
            if key == "searchParameters":
                continue

            if isinstance(value, dict):
                # Single-item sections such as 'answerBox'.
                candidates = [value]
            elif isinstance(value, list):
                candidates = value
            else:
                continue

            for item in candidates:
                if not isinstance(item, dict):
                    # A bare scalar cannot carry a "type" tag; ignore it the
                    # same way top-level scalars are ignored.
                    continue
                # Tag a copy so the caller's decoded response is not mutated.
                formatted_results.append({**item, "type": key})

        return formatted_results

    # TODO - Add explicit typing for the return value
    def get_raw(self, query: str, limit: int = 10) -> list:
        """
        Run a search and return the flattened, type-tagged results.

        Args:
            query: The search query, sent as ``q``.
            limit: Requested number of results, sent as ``num_outputs``.

        Returns:
            The list produced by ``_extract_results`` for the response body.
        """
        connection = http.client.HTTPSConnection(self.api_base)
        try:
            # NOTE(review): Serper's public examples appear to use "num" for
            # the result count; confirm whether "num_outputs" is honoured.
            payload = json.dumps({"q": query, "num_outputs": limit})
            connection.request("POST", "/search", payload, self.headers)
            response = connection.getresponse()
            data = response.read()
        finally:
            # Always release the socket, including when the request fails.
            connection.close()
        json_data = json.loads(data.decode("utf-8"))
        return SerperClient._extract_results(json_data)

    @staticmethod
    def construct_context(results: list) -> str:
        """
        Render search results as a text context grouped by result type.

        Args:
            results: Objects exposing a ``metadata`` mapping (assumed to be a
                dict - TODO confirm against callers). The ``"type"`` entry
                selects the group; entries without one are grouped under
                ``"Unknown"``.

        Returns:
            One ``# <type> Results:`` section per type, in first-seen order,
            each listing its items as ``Item N:`` followed by the item's
            remaining metadata rendered by ``process_json``.
        """
        # Organize results by type
        organized_results = {}
        for result in results:
            # Pop "type" from a shallow copy: the caller's metadata must keep
            # it, otherwise a second call would file everything as "Unknown".
            metadata = dict(result.metadata)
            result_type = metadata.pop("type", "Unknown")
            organized_results.setdefault(result_type, []).append(metadata)

        parts = []
        # Iterate over each result type
        for result_type, items in organized_results.items():
            parts.append(f"# {result_type} Results:\n")
            for index, item in enumerate(items, start=1):
                # Process each item under the current type
                parts.append(f"Item {index}:\n")
                parts.append(process_json(item) + "\n")

        return "".join(parts)