diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/unstructured_client | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/unstructured_client')
32 files changed, 3040 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/__init__.py b/.venv/lib/python3.12/site-packages/unstructured_client/__init__.py new file mode 100644 index 00000000..d8d60c47 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/__init__.py @@ -0,0 +1,4 @@ +"""Code generated by Speakeasy (https://speakeasy.com). DO NOT EDIT.""" + +from .sdk import * +from .sdkconfiguration import * diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/__init__.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/__init__.py new file mode 100644 index 00000000..2ee66cdd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/__init__.py @@ -0,0 +1,5 @@ +"""Code generated by Speakeasy (https://speakeasy.com). DO NOT EDIT.""" + +from .sdkhooks import * +from .types import * +from .registration import * diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/__init__.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/__init__.py new file mode 100644 index 00000000..321a6b9c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/__init__.py @@ -0,0 +1,5 @@ +from .clean_server_url_hook import CleanServerUrlSDKInitHook +from .logger_hook import LoggerHook +from .suggest_defining_url import SuggestDefiningUrlIf401AfterErrorHook +from .split_pdf_hook import SplitPdfHook +import logging diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/clean_server_url_hook.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/clean_server_url_hook.py new file mode 100644 index 00000000..5a2d6e3f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/clean_server_url_hook.py @@ -0,0 +1,34 @@ +from typing import Tuple +from urllib.parse import ParseResult, urlparse, urlunparse + +import requests + +from unstructured_client._hooks.types import 
SDKInitHook + + +class CleanServerUrlSDKInitHook(SDKInitHook): + """Hook fixing common mistakes by users in defining `server_url` in the unstructured-client""" + + def clean_server_url(self, base_url) -> str: + """Fix url scheme and remove the '/general/v0/general' path.""" + + # -- add a url scheme if not present (urllib.parse does not work reliably without it) + if "http" not in base_url: + base_url = "http://" + base_url + + parsed_url: ParseResult = urlparse(base_url) + + if "api.unstructuredapp.io" in base_url: + if parsed_url.scheme != "https": + parsed_url = parsed_url._replace(scheme="https") + + # -- path should always be empty + return urlunparse(parsed_url._replace(path="")) + + def sdk_init( + self, base_url: str, client: requests.Session + ) -> Tuple[str, requests.Session]: + """Concrete implementation for SDKInitHook.""" + cleaned_url = self.clean_server_url(base_url) + + return cleaned_url, client diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/common.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/common.py new file mode 100644 index 00000000..ec759932 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/common.py @@ -0,0 +1 @@ +UNSTRUCTURED_CLIENT_LOGGER_NAME = "unstructured-client" diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/form_utils.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/form_utils.py new file mode 100644 index 00000000..bf97faa3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/form_utils.py @@ -0,0 +1,228 @@ +from __future__ import annotations + +import logging +from typing import Union + +from requests_toolbelt.multipart.decoder import MultipartDecoder + +from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME +from unstructured_client.models import shared + +logger = 
logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME) +FormData = dict[str, Union[str, shared.Files, list[str]]] + +PARTITION_FORM_FILES_KEY = "files" +PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page" +PARTITION_FORM_PAGE_RANGE_KEY = "split_pdf_page_range[]" +PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY = "split_pdf_allow_failed" +PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number" +PARTITION_FORM_CONCURRENCY_LEVEL_KEY = "split_pdf_concurrency_level" + + +def get_page_range(form_data: FormData, key: str, max_pages: int) -> tuple[int, int]: + """Retrieves the split page range from the given form data. + + If the range is invalid or outside the bounds of the page count, + returns (1, num_pages), i.e. the full range. + + Args: + form_data: The form data containing the page range + key: The key to look for in the form data. + + Returns: + The range of pages to send in the request in the form (start, end) + """ + try: + _page_range = form_data.get(key) + + if _page_range is not None: + page_range = (int(_page_range[0]), int(_page_range[1])) + else: + page_range = (1, max_pages) + + except (ValueError, IndexError) as exc: + msg = f"{_page_range} is not a valid page range." + logger.error(msg) + raise ValueError(msg) from exc + + start, end = page_range + + if not 0 < start <= max_pages or not 0 < end <= max_pages or not start <= end: + msg = f"Page range {page_range} is out of bounds. Start and end values should be between 1 and {max_pages}." + logger.error(msg) + raise ValueError(msg) + + return page_range + + +def get_starting_page_number(form_data: FormData, key: str, fallback_value: int) -> int: + """Retrieves the starting page number from the given form data. + + In case given starting page number is not a valid integer or less than 1, it will + use the default value. + + Args: + form_data: The form data containing the starting page number. + key: The key to look for in the form data. + fallback_value: The default value to use in case of an error. 
+ + Returns: + The starting page number. + """ + starting_page_number = fallback_value + try: + _starting_page_number = form_data.get(key) or fallback_value + starting_page_number = int(_starting_page_number) # type: ignore + except ValueError: + logger.warning( + "'%s' is not a valid integer. Using default value '%d'.", + key, + fallback_value, + ) + + if starting_page_number < 1: + logger.warning( + "'%s' is less than 1. Using default value '%d'.", + key, + fallback_value, + ) + starting_page_number = fallback_value + + return starting_page_number + +def get_split_pdf_allow_failed_param( + form_data: FormData, key: str, fallback_value: bool, +) -> bool: + """Retrieves the value for allow failed that should be used for splitting pdf. + + In case given the number is not a "false" or "true" literal, it will use the + default value. + + Args: + form_data: The form data containing the desired concurrency level. + key: The key to look for in the form data. + fallback_value: The default value to use in case of an error. + + Returns: + The concurrency level after validation. + """ + allow_failed = form_data.get(key) + + if allow_failed is None: + return fallback_value + + if allow_failed.lower() not in ["true", "false"]: + logger.warning( + "'%s' is not a valid boolean. Using default value '%s'.", + key, + fallback_value, + ) + return fallback_value + + return allow_failed.lower() == "true" + +def get_split_pdf_concurrency_level_param( + form_data: FormData, key: str, fallback_value: int, max_allowed: int +) -> int: + """Retrieves the value for concurreny level that should be used for splitting pdf. + + In case given the number is not a valid integer or less than 1, it will use the + default value. + + Args: + form_data: The form data containing the desired concurrency level. + key: The key to look for in the form data. + fallback_value: The default value to use in case of an error. + max_allowed: The maximum allowed value for the concurrency level. 
+ + Returns: + The concurrency level after validation. + """ + concurrency_level_str = form_data.get(key) + + if concurrency_level_str is None: + return fallback_value + + try: + concurrency_level = int(concurrency_level_str) + except ValueError: + logger.warning( + "'%s' is not a valid integer. Using default value '%s'.", + key, + fallback_value, + ) + return fallback_value + + if concurrency_level < 1: + logger.warning( + "'%s' is less than 1. Using the default value = %s.", + key, + fallback_value, + ) + return fallback_value + + if concurrency_level > max_allowed: + logger.warning( + "'%s' is greater than %s. Using the maximum allowed value = %s.", + key, + max_allowed, + max_allowed, + ) + return max_allowed + + return concurrency_level + + +def decode_content_disposition(content_disposition: bytes) -> dict[str, str]: + """Decode the `Content-Disposition` header and return the parameters as a dictionary. + + Args: + content_disposition: The `Content-Disposition` header as bytes. + + Returns: + A dictionary containing the parameters extracted from the + `Content-Disposition` header. + """ + data = content_disposition.decode().split("; ")[1:] + parameters = [d.split("=") for d in data] + parameters_dict = {p[0]: p[1].strip('"') for p in parameters} + return parameters_dict + + +def parse_form_data(decoded_data: MultipartDecoder) -> FormData: + """Parses the form data from the decoded multipart data. + + Args: + decoded_data: The decoded multipart data. + + Returns: + The parsed form data. + """ + form_data: FormData = {} + + for part in decoded_data.parts: + content_disposition = part.headers.get(b"Content-Disposition") + if content_disposition is None: + raise RuntimeError("Content-Disposition header not found. 
Can't split pdf file.") + part_params = decode_content_disposition(content_disposition) + name = part_params.get("name") + + if name is None: + continue + + if name == PARTITION_FORM_FILES_KEY: + filename = part_params.get("filename") + if filename is None or not filename.strip(): + raise ValueError("Filename can't be an empty string.") + form_data[PARTITION_FORM_FILES_KEY] = shared.Files(part.content, filename) + else: + content = part.content.decode() + if name in form_data: + if isinstance(form_data[name], list): + form_data[name].append(content) + else: + form_data[name] = [form_data[name], content] + else: + form_data[name] = content + + return form_data diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/logger_hook.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/logger_hook.py new file mode 100644 index 00000000..354f9ccd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/logger_hook.py @@ -0,0 +1,83 @@ +import logging +from typing import Optional, Tuple, Union, DefaultDict + +import requests + +from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME +from unstructured_client._hooks.types import ( + AfterSuccessContext, + AfterErrorContext, + AfterErrorHook, + SDKInitHook, + AfterSuccessHook, +) +from collections import defaultdict + +logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME) + + +class LoggerHook(AfterErrorHook, AfterSuccessHook, SDKInitHook): + """Hook providing custom logging""" + + def __init__(self) -> None: + self.retries_counter: DefaultDict[str, int] = defaultdict(int) + + def log_retries(self, response: Optional[requests.Response], error: Optional[Exception], operation_id: str,): + """Log retries to give users visibility into requests.""" + + if response is not None and response.status_code // 100 == 5: + logger.info( + "Failed to process a request due to API server error with status code %d. 
" + "Attempting retry number %d after sleep.", + response.status_code, + self.retries_counter[operation_id], + ) + if response.text: + logger.info("Server message - %s", response.text) + + elif error is not None and isinstance(error, requests.exceptions.ConnectionError): + logger.info( + "Failed to process a request due to connection error - %s. " + "Attempting retry number %d after sleep.", + error, + self.retries_counter[operation_id], + ) + + + def sdk_init( + self, base_url: str, client: requests.Session + ) -> Tuple[str, requests.Session]: + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + return base_url, client + + def after_success( + self, hook_ctx: AfterSuccessContext, response: requests.Response + ) -> Union[requests.Response, Exception]: + self.retries_counter.pop(hook_ctx.operation_id, None) + # NOTE: In case of split page partition this means - at least one of the splits was partitioned successfully + logger.info("Successfully partitioned the document.") + return response + + def after_error( + self, + hook_ctx: AfterErrorContext, + response: Optional[requests.Response], + error: Optional[Exception], + ) -> Union[Tuple[Optional[requests.Response], Optional[Exception]], Exception]: + """Concrete implementation for AfterErrorHook.""" + self.retries_counter[hook_ctx.operation_id] += 1 + self.log_retries(response, error, hook_ctx.operation_id) + + if response and response.status_code == 200: + # NOTE: Even though this is an after_error method, due to split_pdf_hook logic we may get + # a success here when one of the split requests was partitioned successfully + logger.info("Successfully partitioned the document.") + + else: + logger.error("Failed to partition the document.") + if response: + logger.error("Server responded with %d - %s", response.status_code, response.text) + if error is not None: + logger.error("Following error occurred - %s", error) + + return response, error diff --git 
a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py new file mode 100644 index 00000000..27dc5e03 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/pdf_utils.py @@ -0,0 +1,79 @@ +import io +import logging +from typing import Generator, Tuple, Optional + +from pypdf import PdfReader, PdfWriter +from pypdf.errors import PdfReadError + +from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME +from unstructured_client.models import shared + +logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME) + +# Loading pdfs with strict=False can dump a lot of warnings +# We don't need to display these +pdf_logger = logging.getLogger("pypdf") +pdf_logger.setLevel(logging.ERROR) + + +def get_pdf_pages( + pdf: PdfReader, split_size: int = 1, page_start: int = 1, page_end: Optional[int] = None +) -> Generator[Tuple[io.BytesIO, int, int], None, None]: + """Reads given bytes of a pdf file and split it into n file-like objects, each + with `split_size` pages. + + Args: + file_content: Content of the PDF file. + split_size: Split size, e.g. if the given file has 10 pages + and this value is set to 2 it will yield 5 documents, each containing 2 pages + of the original document. By default it will split each page to a separate file. + page_start: Begin splitting at this page number + page_end: If provided, split up to and including this page number + + Yields: + The file contents with their page number and overall pages number of the original document. 
+ """ + + offset = page_start - 1 + offset_end = page_end or len(pdf.pages) + + while offset < offset_end: + new_pdf = PdfWriter() + pdf_buffer = io.BytesIO() + + end = min(offset + split_size, offset_end) + + for page in list(pdf.pages[offset:end]): + new_pdf.add_page(page) + + new_pdf.write(pdf_buffer) + pdf_buffer.seek(0) + + yield pdf_buffer, offset, offset_end + offset += split_size + + +def is_pdf(file: shared.Files) -> bool: + """Checks if the given file is a PDF. + + First it checks the file extension and if it is equal to `.pdf`, then + it tries to read that file. If there is no error then we assume it is a proper PDF. + + Args: + file: The file to be checked. + + Returns: + True if the file is a PDF, False otherwise. + """ + if not file.file_name.endswith(".pdf"): + logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.") + return False + + try: + PdfReader(io.BytesIO(file.content), strict=True) + except (PdfReadError, UnicodeDecodeError) as exc: + logger.error(exc) + logger.warning("The file does not appear to be a valid PDF.") + return False + + return True diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/request_utils.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/request_utils.py new file mode 100644 index 00000000..1512e80b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/request_utils.py @@ -0,0 +1,190 @@ +from __future__ import annotations + +import asyncio +import copy +import io +import json +import logging +from typing import Optional, Tuple, Any + +import httpx +import requests +from requests.structures import CaseInsensitiveDict +from requests_toolbelt.multipart.encoder import MultipartEncoder + +from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME +from unstructured_client._hooks.custom.form_utils import ( + PARTITION_FORM_FILES_KEY, + PARTITION_FORM_SPLIT_PDF_PAGE_KEY, + 
PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY, + PARTITION_FORM_PAGE_RANGE_KEY, + PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, + FormData, +) + +logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME) + + +def create_request_body( + form_data: FormData, page_content: io.BytesIO, filename: str, page_number: int +) -> MultipartEncoder: + payload = prepare_request_payload(form_data) + + payload_fields: list[tuple[str, Any]] = [] + for key, value in payload.items(): + if isinstance(value, list): + payload_fields.extend([(key, list_value) for list_value in value]) + else: + payload_fields.append((key, value)) + + payload_fields.append((PARTITION_FORM_FILES_KEY, ( + filename, + page_content, + "application/pdf", + ))) + + payload_fields.append((PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, str(page_number))) + + body = MultipartEncoder( + fields=payload_fields + ) + return body + + +def create_httpx_request( + original_request: requests.Request, body: MultipartEncoder +) -> httpx.Request: + headers = prepare_request_headers(original_request.headers) + return httpx.Request( + method="POST", + url=original_request.url or "", + content=body.to_string(), + headers={**headers, "Content-Type": body.content_type}, + ) + + +def create_request( + request: requests.PreparedRequest, + body: MultipartEncoder, +) -> requests.Request: + headers = prepare_request_headers(request.headers) + return requests.Request( + method="POST", + url=request.url or "", + data=body, + headers={**headers, "Content-Type": body.content_type}, + ) + + +async def call_api_async( + client: httpx.AsyncClient, + page: Tuple[io.BytesIO, int], + original_request: requests.Request, + form_data: FormData, + filename: str, + limiter: asyncio.Semaphore, +) -> tuple[int, dict]: + page_content, page_number = page + body = create_request_body(form_data, page_content, filename, page_number) + new_request = create_httpx_request(original_request, body) + async with limiter: + try: + response = await client.send(new_request) + 
return response.status_code, response.json() + except Exception: + logger.error("Failed to send request for page %d", page_number) + return 500, {} + + +def call_api( + client: Optional[requests.Session], + page: Tuple[io.BytesIO, int], + request: requests.PreparedRequest, + form_data: FormData, + filename: str, +) -> requests.Response: + if client is None: + raise RuntimeError("HTTP client not accessible!") + page_content, page_number = page + + body = create_request_body(form_data, page_content, filename, page_number) + new_request = create_request(request, body) + prepared_request = client.prepare_request(new_request) + + try: + return client.send(prepared_request) + except Exception: + logger.error("Failed to send request for page %d", page_number) + return requests.Response() + + +def prepare_request_headers( + headers: CaseInsensitiveDict[str], +) -> CaseInsensitiveDict[str]: + """Prepare the request headers by removing the 'Content-Type' and 'Content-Length' headers. + + Args: + headers: The original request headers. + + Returns: + The modified request headers. + """ + headers = copy.deepcopy(headers) + headers.pop("Content-Type", None) + headers.pop("Content-Length", None) + return headers + + +def prepare_request_payload(form_data: FormData) -> FormData: + """Prepares the request payload by removing unnecessary keys and updating the file. + + Args: + form_data: The original form data. + + Returns: + The updated request payload. 
+ """ + payload = copy.deepcopy(form_data) + payload.pop(PARTITION_FORM_SPLIT_PDF_PAGE_KEY, None) + payload.pop(PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY, None) + payload.pop(PARTITION_FORM_FILES_KEY, None) + payload.pop(PARTITION_FORM_PAGE_RANGE_KEY, None) + payload.pop(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, None) + updated_parameters = { + PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false", + } + payload.update(updated_parameters) + return payload + + +def create_response(response: requests.Response, elements: list) -> requests.Response: + """ + Creates a modified response object with updated content. + + Args: + response: The original response object. + elements: The list of elements to be serialized and added to + the response. + + Returns: + The modified response object with updated content. + """ + response_copy = copy.deepcopy(response) + content = json.dumps(elements).encode() + content_length = str(len(content)) + response_copy.headers.update({"Content-Length": content_length}) + setattr(response_copy, "_content", content) + return response_copy + + +def log_after_split_response(status_code: int, split_number: int): + if status_code == 200: + logger.info( + "Successfully partitioned set #%d, elements added to the final result.", + split_number, + ) + else: + logger.warning( + "Failed to partition set #%d, its elements will be omitted in the final result.", + split_number, + ) diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/split_pdf_hook.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/split_pdf_hook.py new file mode 100644 index 00000000..54a3b89e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -0,0 +1,445 @@ +from __future__ import annotations + +import asyncio +import io +import json +import logging +import math +from collections.abc import Awaitable +from typing import Any, Coroutine, Optional, Tuple, Union + +import httpx +import 
nest_asyncio +import requests +from pypdf import PdfReader +from requests_toolbelt.multipart.decoder import MultipartDecoder + +from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils +from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME +from unstructured_client._hooks.custom.form_utils import ( + PARTITION_FORM_CONCURRENCY_LEVEL_KEY, + PARTITION_FORM_FILES_KEY, + PARTITION_FORM_PAGE_RANGE_KEY, + PARTITION_FORM_SPLIT_PDF_PAGE_KEY, + PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY, + PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, +) +from unstructured_client._hooks.types import ( + AfterErrorContext, + AfterErrorHook, + AfterSuccessContext, + AfterSuccessHook, + BeforeRequestContext, + BeforeRequestHook, + SDKInitHook, +) +from unstructured_client.models import shared + +logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME) + +DEFAULT_STARTING_PAGE_NUMBER = 1 +DEFAULT_ALLOW_FAILED = False +DEFAULT_CONCURRENCY_LEVEL = 8 +MAX_CONCURRENCY_LEVEL = 15 +MIN_PAGES_PER_SPLIT = 2 +MAX_PAGES_PER_SPLIT = 20 + + +async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, requests.Response]: + response = await coro + return index, response + + +async def run_tasks(coroutines: list[Awaitable], allow_failed: bool = False) -> list[tuple[int, requests.Response]]: + if allow_failed: + responses = await asyncio.gather(*coroutines, return_exceptions=False) + return list(enumerate(responses, 1)) + # TODO: replace with asyncio.TaskGroup for python >3.11 # pylint: disable=fixme + tasks = [asyncio.create_task(_order_keeper(index, coro)) for index, coro in enumerate(coroutines, 1)] + results = [] + remaining_tasks = dict(enumerate(tasks, 1)) + for future in asyncio.as_completed(tasks): + index, response = await future + if response.status_code != 200: + # cancel all remaining tasks + for remaining_task in remaining_tasks.values(): + remaining_task.cancel() + results.append((index, response)) + break + results.append((index, 
response)) + # remove task from remaining_tasks that should be cancelled in case of failure + del remaining_tasks[index] + # return results in the original order + return sorted(results, key=lambda x: x[0]) + + +def context_is_uvloop(): + """Return true if uvloop is installed and we're currently in a uvloop context. Our asyncio splitting code currently doesn't work under uvloop.""" + try: + import uvloop # pylint: disable=import-outside-toplevel + loop = asyncio.get_event_loop() + return isinstance(loop, uvloop.Loop) + except (ImportError, RuntimeError): + return False + +def get_optimal_split_size(num_pages: int, concurrency_level: int) -> int: + """Distributes pages to workers evenly based on the number of pages and desired concurrency level.""" + if num_pages < MAX_PAGES_PER_SPLIT * concurrency_level: + split_size = math.ceil(num_pages / concurrency_level) + else: + split_size = MAX_PAGES_PER_SPLIT + + return max(split_size, MIN_PAGES_PER_SPLIT) + + +class SplitPdfHook(SDKInitHook, BeforeRequestHook, AfterSuccessHook, AfterErrorHook): + """ + A hook class that splits a PDF file into multiple pages and sends each page as + a separate request. This hook is designed to be used with an Speakeasy SDK. + + Usage: + 1. Create an instance of the `SplitPdfHook` class. + 2. Register SDK Init, Before Request, After Success and After Error hooks. + """ + + def __init__(self) -> None: + self.client: Optional[requests.Session] = None + self.coroutines_to_execute: dict[ + str, list[Coroutine[Any, Any, requests.Response]] + ] = {} + self.api_successful_responses: dict[str, list[requests.Response]] = {} + self.api_failed_responses: dict[str, list[requests.Response]] = {} + self.allow_failed: bool = DEFAULT_ALLOW_FAILED + + def sdk_init( + self, base_url: str, client: requests.Session + ) -> Tuple[str, requests.Session]: + """Initializes Split PDF Hook. + + Args: + base_url (str): URL of the API. + client (requests.Session): HTTP Client. 
+ + Returns: + Tuple[str, requests.Session]: The initialized SDK options. + """ + self.client = client + return base_url, client + + + # pylint: disable=too-many-return-statements + def before_request( + self, hook_ctx: BeforeRequestContext, request: requests.PreparedRequest + ) -> Union[requests.PreparedRequest, Exception]: + """If `splitPdfPage` is set to `true` in the request, the PDF file is split into + separate pages. Each page is sent as a separate request in parallel. The last + page request is returned by this method. It will return the original request + when: `splitPdfPage` is set to `false`, the file is not a PDF, or the HTTP + has not been initialized. + + Args: + hook_ctx (BeforeRequestContext): The hook context containing information about + the operation. + request (requests.PreparedRequest): The request object. + + Returns: + Union[requests.PreparedRequest, Exception]: If `splitPdfPage` is set to `true`, + the last page request; otherwise, the original request. + """ + if self.client is None: + logger.warning("HTTP client not accessible! Continuing without splitting.") + return request + + if context_is_uvloop(): + logger.warning("Splitting is currently incompatible with uvloop. 
Continuing without splitting.") + return request + + # This allows us to use an event loop in an env with an existing loop + # Temporary fix until we can improve the async splitting behavior + nest_asyncio.apply() + operation_id = hook_ctx.operation_id + content_type = request.headers.get("Content-Type") + body = request.body + if not isinstance(body, bytes) or content_type is None: + return request + + decoded_body = MultipartDecoder(body, content_type) + form_data = form_utils.parse_form_data(decoded_body) + split_pdf_page = form_data.get(PARTITION_FORM_SPLIT_PDF_PAGE_KEY) + if split_pdf_page is None or split_pdf_page == "false": + logger.info("Partitioning without split.") + return request + + logger.info("Preparing to split document for partition.") + file = form_data.get(PARTITION_FORM_FILES_KEY) + if ( + file is None + or not isinstance(file, shared.Files) + or not pdf_utils.is_pdf(file) + ): + logger.info("Partitioning without split.") + return request + + starting_page_number = form_utils.get_starting_page_number( + form_data, + key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, + fallback_value=DEFAULT_STARTING_PAGE_NUMBER, + ) + if starting_page_number > 1: + logger.info("Starting page number set to %d", starting_page_number) + logger.info("Starting page number set to %d", starting_page_number) + + self.allow_failed = form_utils.get_split_pdf_allow_failed_param( + form_data, + key=PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY, + fallback_value=DEFAULT_ALLOW_FAILED, + ) + logger.info("Allow failed set to %d", self.allow_failed) + + concurrency_level = form_utils.get_split_pdf_concurrency_level_param( + form_data, + key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY, + fallback_value=DEFAULT_CONCURRENCY_LEVEL, + max_allowed=MAX_CONCURRENCY_LEVEL, + ) + logger.info("Concurrency level set to %d", concurrency_level) + limiter = asyncio.Semaphore(concurrency_level) + + pdf = PdfReader(io.BytesIO(file.content)) + + page_range_start, page_range_end = form_utils.get_page_range( + 
form_data, + key=PARTITION_FORM_PAGE_RANGE_KEY, + max_pages=len(pdf.pages), + ) + + page_count = page_range_end - page_range_start + 1 + logger.info( + "Splitting pages %d to %d (%d total)", + page_range_start, + page_range_end, + page_count, + ) + + split_size = get_optimal_split_size( + num_pages=page_count, concurrency_level=concurrency_level + ) + logger.info("Determined optimal split size of %d pages.", split_size) + + # If the doc is small enough, and we aren't slicing it with a page range: + # do not split, just continue with the original request + if split_size >= page_count and page_count == len(pdf.pages): + logger.info( + "Document has too few pages (%d) to be split efficiently. Partitioning without split.", + page_count, + ) + return request + + pages = pdf_utils.get_pdf_pages(pdf, split_size=split_size, page_start=page_range_start, page_end=page_range_end) + logger.info( + "Partitioning %d files with %d page(s) each.", + math.floor(page_count / split_size), + split_size, + ) + + # Log the remainder pages if there are any + if page_count % split_size > 0: + logger.info( + "Partitioning 1 file with %d page(s).", + page_count % split_size, + ) + + async def call_api_partial(page): + async with httpx.AsyncClient() as client: + status_code, json_response = await request_utils.call_api_async( + client=client, + original_request=request, + form_data=form_data, + filename=file.file_name, + page=page, + limiter=limiter, + ) + + # convert httpx response to requests.Response to preserve + # compatibility with the synchronous SDK generated by speakeasy + response = requests.Response() + response.status_code = status_code + response._content = json.dumps( # pylint: disable=W0212 + json_response + ).encode() + response.headers["Content-Type"] = "application/json" + return response + + self.coroutines_to_execute[operation_id] = [] + last_page_content = io.BytesIO() + last_page_number = 0 + set_index = 1 + for page_content, page_index, all_pages_number in pages: + 
page_number = page_index + starting_page_number + logger.info( + "Partitioning set #%d (pages %d-%d).", + set_index, + page_number, + min(page_number + split_size - 1, all_pages_number), + ) + # Check if this set of pages is the last one + if page_index + split_size >= all_pages_number: + last_page_content = page_content + last_page_number = page_number + break + coroutine = call_api_partial((page_content, page_number)) + self.coroutines_to_execute[operation_id].append(coroutine) + set_index += 1 + # `before_request` method needs to return a request so we skip sending the last page in parallel + # and return that last page at the end of this method + + body = request_utils.create_request_body( + form_data, last_page_content, file.file_name, last_page_number + ) + last_page_request = request_utils.create_request(request, body) + last_page_prepared_request = self.client.prepare_request(last_page_request) + return last_page_prepared_request + + def _await_elements( + self, operation_id: str, response: requests.Response + ) -> Optional[list]: + """ + Waits for the partition requests to complete and returns the flattened + elements. + + Args: + operation_id (str): The ID of the operation. + response (requests.Response): The response object. + + Returns: + Optional[list]: The flattened elements if the partition requests are + completed, otherwise None. 
+ """ + tasks = self.coroutines_to_execute.get(operation_id) + if tasks is None: + return None + + ioloop = asyncio.get_event_loop() + task_responses: list[tuple[int, requests.Response]] = ioloop.run_until_complete( + run_tasks(tasks, allow_failed=self.allow_failed) + ) + + if task_responses is None: + return None + + successful_responses = [] + failed_responses = [] + elements = [] + for response_number, res in task_responses: + request_utils.log_after_split_response(res.status_code, response_number) + if res.status_code == 200: + successful_responses.append(res) + elements.append(res.json()) + else: + failed_responses.append(res) + + if self.allow_failed or not failed_responses: + last_response_number = len(task_responses) + 1 + request_utils.log_after_split_response( + response.status_code, last_response_number + ) + if response.status_code == 200: + elements.append(response.json()) + successful_responses.append(response) + else: + failed_responses.append(response) + + self.api_successful_responses[operation_id] = successful_responses + self.api_failed_responses[operation_id] = failed_responses + flattened_elements = [element for sublist in elements for element in sublist] + return flattened_elements + + def after_success( + self, hook_ctx: AfterSuccessContext, response: requests.Response + ) -> Union[requests.Response, Exception]: + """Executes after a successful API request. Awaits all parallel requests and + combines the responses into a single response object. + + Args: + hook_ctx (AfterSuccessContext): The context object containing information + about the hook execution. + response (requests.Response): The response object returned from the API + request. + + Returns: + Union[requests.Response, Exception]: If requests were run in parallel, a + combined response object; otherwise, the original response. Can return + exception if it ocurred during the execution. 
+ """ + operation_id = hook_ctx.operation_id + # Because in `before_request` method we skipped sending last page in parallel + # we need to pass response, which contains last page, to `_await_elements` method + elements = self._await_elements(operation_id, response) + + # if fails are disallowed, return the first failed response + if not self.allow_failed and self.api_failed_responses.get(operation_id): + return self.api_failed_responses[operation_id][0] + + if elements is None: + return response + + updated_response = request_utils.create_response(response, elements) + self._clear_operation(operation_id) + return updated_response + + def after_error( + self, + hook_ctx: AfterErrorContext, + response: Optional[requests.Response], + error: Optional[Exception], + ) -> Union[Tuple[Optional[requests.Response], Optional[Exception]], Exception]: + """Executes after an unsuccessful API request. Awaits all parallel requests, + if at least one request was successful, combines the responses into a single + response object and doesn't throw an error. It will return an error only if + all requests failed, or there was no PDF split. + + Args: + hook_ctx (AfterErrorContext): The AfterErrorContext object containing + information about the hook context. + response (Optional[requests.Response]): The Response object representing + the response received before the exception occurred. + error (Optional[Exception]): The exception object that was thrown. + + Returns: + Union[Tuple[Optional[requests.Response], Optional[Exception]], Exception]: + If requests were run in parallel, and at least one was successful, a combined + response object; otherwise, the original response and exception. 
+ """ + + # if fails are disallowed - return response and error objects immediately + if not self.allow_failed: + return (response, error) + + operation_id = hook_ctx.operation_id + # We know that this request failed so we pass a failed or empty response to `_await_elements` method + # where it checks if at least on of the other requests succeeded + elements = self._await_elements(operation_id, response or requests.Response()) + successful_responses = self.api_successful_responses.get(operation_id) + + if elements is None or successful_responses is None: + return (response, error) + + if len(successful_responses) == 0: + self._clear_operation(operation_id) + return (response, error) + + updated_response = request_utils.create_response( + successful_responses[0], elements + ) + self._clear_operation(operation_id) + return (updated_response, None) + + def _clear_operation(self, operation_id: str) -> None: + """ + Clears the operation data associated with the given operation ID. + + Args: + operation_id (str): The ID of the operation to clear. 
+ """ + self.coroutines_to_execute.pop(operation_id, None) + self.api_successful_responses.pop(operation_id, None) diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/suggest_defining_url.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/suggest_defining_url.py new file mode 100644 index 00000000..d9268159 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/custom/suggest_defining_url.py @@ -0,0 +1,30 @@ +from typing import Optional, Tuple, Union + +import logging +import requests + +from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME +from unstructured_client._hooks.types import AfterErrorContext, AfterErrorHook + +logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME) + + +class SuggestDefiningUrlIf401AfterErrorHook(AfterErrorHook): + """Hook advising users to check that 'server_url' is defined if a 401 error is encountered.""" + + def warn_if_401(self, response: Optional[requests.Response]): + """If the paid API returns 401, warn the user in case they meant to use the free api.""" + if response is not None and response.status_code == 401: + logger.warning( + "This API key is invalid against the paid API. If intending to use the free API, please initialize UnstructuredClient with `server='free-api'`." 
+ ) + + def after_error( + self, + hook_ctx: AfterErrorContext, + response: Optional[requests.Response], + error: Optional[Exception], + ) -> Union[Tuple[Optional[requests.Response], Optional[Exception]], Exception]: + """Concrete implementation for AfterErrorHook.""" + self.warn_if_401(response) + return response, error diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/registration.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/registration.py new file mode 100644 index 00000000..f2a7fc60 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/registration.py @@ -0,0 +1,48 @@ +"""Registration of custom, human-written hooks.""" + +from .custom import ( + CleanServerUrlSDKInitHook, + LoggerHook, + SuggestDefiningUrlIf401AfterErrorHook, + SplitPdfHook, +) +from .types import Hooks + + +# This file is only ever generated once on the first generation and then is free to be modified. +# Any hooks you wish to add should be registered in the init_hooks function. Feel free to define +# them in this file or in separate files in the hooks folder. + + +def init_hooks(hooks: Hooks): + # pylint: disable=unused-argument + """Add hooks by calling `hooks.register_<type_or>_hook` with an instance of that hook. + + Hooks are registered per SDK instance, and are valid for the lifetime of the SDK instance + """ + + # Initialize custom hooks + clean_server_url_hook = CleanServerUrlSDKInitHook() + suggest_defining_url_hook = SuggestDefiningUrlIf401AfterErrorHook() + logger_hook = LoggerHook() + split_pdf_hook = SplitPdfHook() + + # NOTE: logger_hook should stay registered last as logs the status of + # request and whether it will be retried which can be changed by e.g. 
split_pdf_hook + + # Register SDK Init hooks + hooks.register_sdk_init_hook(clean_server_url_hook) + hooks.register_sdk_init_hook(logger_hook) + hooks.register_sdk_init_hook(split_pdf_hook) + + # Register Before Request hooks + hooks.register_before_request_hook(split_pdf_hook) + + # Register After Error hooks + hooks.register_after_success_hook(split_pdf_hook) + hooks.register_after_success_hook(logger_hook) + + # Register After Error hooks + hooks.register_after_error_hook(suggest_defining_url_hook) + hooks.register_after_error_hook(split_pdf_hook) + hooks.register_after_error_hook(logger_hook) diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/sdkhooks.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/sdkhooks.py new file mode 100644 index 00000000..1fdae95a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/sdkhooks.py @@ -0,0 +1,57 @@ +"""Code generated by Speakeasy (https://speakeasy.com). DO NOT EDIT.""" + +import requests +from .types import SDKInitHook, BeforeRequestContext, BeforeRequestHook, AfterSuccessContext, AfterSuccessHook, AfterErrorContext, AfterErrorHook, Hooks +from .registration import init_hooks +from typing import List, Optional, Tuple + + +class SDKHooks(Hooks): + def __init__(self): + self.sdk_init_hooks: List[SDKInitHook] = [] + self.before_request_hooks: List[BeforeRequestHook] = [] + self.after_success_hooks: List[AfterSuccessHook] = [] + self.after_error_hooks: List[AfterErrorHook] = [] + init_hooks(self) + + def register_sdk_init_hook(self, hook: SDKInitHook) -> None: + self.sdk_init_hooks.append(hook) + + def register_before_request_hook(self, hook: BeforeRequestHook) -> None: + self.before_request_hooks.append(hook) + + def register_after_success_hook(self, hook: AfterSuccessHook) -> None: + self.after_success_hooks.append(hook) + + def register_after_error_hook(self, hook: AfterErrorHook) -> None: + self.after_error_hooks.append(hook) + + def sdk_init(self, 
base_url: str, client: requests.Session) -> Tuple[str, requests.Session]: + for hook in self.sdk_init_hooks: + base_url, client = hook.sdk_init(base_url, client) + return base_url, client + + def before_request(self, hook_ctx: BeforeRequestContext, request: requests.PreparedRequest) -> requests.PreparedRequest: + for hook in self.before_request_hooks: + out = hook.before_request(hook_ctx, request) + if isinstance(out, Exception): + raise out + request = out + + return request + + def after_success(self, hook_ctx: AfterSuccessContext, response: requests.Response) -> requests.Response: + for hook in self.after_success_hooks: + out = hook.after_success(hook_ctx, response) + if isinstance(out, Exception): + raise out + response = out + return response + + def after_error(self, hook_ctx: AfterErrorContext, response: Optional[requests.Response], error: Optional[Exception]) -> Tuple[Optional[requests.Response], Optional[Exception]]: + for hook in self.after_error_hooks: + result = hook.after_error(hook_ctx, response, error) + if isinstance(result, Exception): + raise result + response, error = result + return response, error diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/types.py b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/types.py new file mode 100644 index 00000000..72ab059b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/_hooks/types.py @@ -0,0 +1,74 @@ +"""Code generated by Speakeasy (https://speakeasy.com). 
DO NOT EDIT.""" + +import requests as requests_http +from abc import ABC, abstractmethod +from typing import Any, Callable, List, Optional, Tuple, Union + + +class HookContext: + operation_id: str + oauth2_scopes: Optional[List[str]] = None + security_source: Optional[Union[Any, Callable[[], Any]]] = None + + def __init__(self, operation_id: str, oauth2_scopes: Optional[List[str]], security_source: Optional[Union[Any, Callable[[], Any]]]): + self.operation_id = operation_id + self.oauth2_scopes = oauth2_scopes + self.security_source = security_source + + +class BeforeRequestContext(HookContext): + def __init__(self, hook_ctx: HookContext): + super().__init__(hook_ctx.operation_id, hook_ctx.oauth2_scopes, hook_ctx.security_source) + + +class AfterSuccessContext(HookContext): + def __init__(self, hook_ctx: HookContext): + super().__init__(hook_ctx.operation_id, hook_ctx.oauth2_scopes, hook_ctx.security_source) + + + +class AfterErrorContext(HookContext): + def __init__(self, hook_ctx: HookContext): + super().__init__(hook_ctx.operation_id, hook_ctx.oauth2_scopes, hook_ctx.security_source) + + +class SDKInitHook(ABC): + @abstractmethod + def sdk_init(self, base_url: str, client: requests_http.Session) -> Tuple[str, requests_http.Session]: + pass + + +class BeforeRequestHook(ABC): + @abstractmethod + def before_request(self, hook_ctx: BeforeRequestContext, request: requests_http.PreparedRequest) -> Union[requests_http.PreparedRequest, Exception]: + pass + + +class AfterSuccessHook(ABC): + @abstractmethod + def after_success(self, hook_ctx: AfterSuccessContext, response: requests_http.Response) -> Union[requests_http.Response, Exception]: + pass + + +class AfterErrorHook(ABC): + @abstractmethod + def after_error(self, hook_ctx: AfterErrorContext, response: Optional[requests_http.Response], error: Optional[Exception]) -> Union[Tuple[Optional[requests_http.Response], Optional[Exception]], Exception]: + pass + + +class Hooks(ABC): + @abstractmethod + def 
register_sdk_init_hook(self, hook: SDKInitHook): + pass + + @abstractmethod + def register_before_request_hook(self, hook: BeforeRequestHook): + pass + + @abstractmethod + def register_after_success_hook(self, hook: AfterSuccessHook): + pass + + @abstractmethod + def register_after_error_hook(self, hook: AfterErrorHook): + pass diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/general.py b/.venv/lib/python3.12/site-packages/unstructured_client/general.py new file mode 100644 index 00000000..e9d06a9d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/general.py @@ -0,0 +1,117 @@ +"""Code generated by Speakeasy (https://speakeasy.com). DO NOT EDIT.""" + +import requests as requests_http +from .sdkconfiguration import SDKConfiguration +from typing import Any, Dict, List, Optional +from unstructured_client import utils +from unstructured_client._hooks import AfterErrorContext, AfterSuccessContext, BeforeRequestContext, HookContext +from unstructured_client.models import errors, operations + +class General: + sdk_configuration: SDKConfiguration + + def __init__(self, sdk_config: SDKConfiguration) -> None: + self.sdk_configuration = sdk_config + + + + def partition(self, request: operations.PartitionRequest, retries: Optional[utils.RetryConfig] = None) -> operations.PartitionResponse: + r"""Summary + Description + """ + hook_ctx = HookContext(operation_id='partition', oauth2_scopes=[], security_source=self.sdk_configuration.security) + base_url = utils.template_url(*self.sdk_configuration.get_server_details()) + + url = base_url + '/general/v0/general' + + if callable(self.sdk_configuration.security): + headers, query_params = utils.get_security(self.sdk_configuration.security()) + else: + headers, query_params = utils.get_security(self.sdk_configuration.security) + + headers = { **utils.get_headers(request), **headers } + req_content_type, data, form = utils.serialize_request_body(request, operations.PartitionRequest, 
"partition_parameters", False, False, 'multipart') + if req_content_type is not None and req_content_type not in ('multipart/form-data', 'multipart/mixed'): + headers['content-type'] = req_content_type + if data is None and form is None: + raise Exception('request body is required') + headers['Accept'] = 'application/json' + headers['user-agent'] = self.sdk_configuration.user_agent + client = self.sdk_configuration.client + + global_retry_config = self.sdk_configuration.retry_config + retry_config = retries + if retry_config is None: + if global_retry_config: + retry_config = global_retry_config + else: + retry_config = utils.RetryConfig('backoff', utils.BackoffStrategy(500, 60000, 1.5, 900000), True) + + req = None + def do_request(): + nonlocal req + try: + req = client.prepare_request(requests_http.Request('POST', url, params=query_params, data=data, files=form, headers=headers)) + req = self.sdk_configuration.get_hooks().before_request(BeforeRequestContext(hook_ctx), req) + http_res = client.send(req) + except Exception as e: + _, err = self.sdk_configuration.get_hooks().after_error(AfterErrorContext(hook_ctx), None, e) + if err is not None: + raise err from e + raise e + + if utils.match_status_codes(['422','4XX','5XX'], http_res.status_code): + result, e = self.sdk_configuration.get_hooks().after_error(AfterErrorContext(hook_ctx), http_res, None) + if e is not None: + raise e + if result is not None: + http_res = result + else: + raise errors.SDKError('Unexpected error occurred', -1, '', None) + else: + http_res = self.sdk_configuration.get_hooks().after_success(AfterSuccessContext(hook_ctx), http_res) + + return http_res + + http_res = utils.retry(do_request, utils.Retries(retry_config, [ + '502', + '503', + '504' + ])) + + + res = operations.PartitionResponse(status_code=http_res.status_code, content_type=http_res.headers.get('Content-Type') or '', raw_response=http_res) + + if http_res.status_code == 200: + # pylint: disable=no-else-return + if 
utils.match_content_type(http_res.headers.get('Content-Type') or '', 'application/json'): + out = utils.unmarshal_json(http_res.text, Optional[List[Dict[str, Any]]]) + res.elements = out + else: + content_type = http_res.headers.get('Content-Type') + raise errors.SDKError(f'unknown content-type received: {content_type}', http_res.status_code, http_res.text, http_res) + elif http_res.status_code == 422: + # pylint: disable=no-else-return + if utils.match_content_type(http_res.headers.get('Content-Type') or '', 'application/json'): + out = utils.unmarshal_json(http_res.text, errors.HTTPValidationError) + raise out + else: + content_type = http_res.headers.get('Content-Type') + raise errors.SDKError(f'unknown content-type received: {content_type}', http_res.status_code, http_res.text, http_res) + elif http_res.status_code >= 400 and http_res.status_code < 500: + raise errors.SDKError('API error occurred', http_res.status_code, http_res.text, http_res) + elif http_res.status_code >= 500 and http_res.status_code < 600: + # pylint: disable=no-else-return + if utils.match_content_type(http_res.headers.get('Content-Type') or '', 'application/json'): + out = utils.unmarshal_json(http_res.text, errors.ServerError) + raise out + else: + content_type = http_res.headers.get('Content-Type') + raise errors.SDKError(f'unknown content-type received: {content_type}', http_res.status_code, http_res.text, http_res) + else: + raise errors.SDKError('unknown status code received', http_res.status_code, http_res.text, http_res) + + return res + + + diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/models/__init__.py b/.venv/lib/python3.12/site-packages/unstructured_client/models/__init__.py new file mode 100644 index 00000000..97d7cbfd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/models/__init__.py @@ -0,0 +1,4 @@ +"""Code generated by Speakeasy (https://speakeasy.com). 
DO NOT EDIT.""" + + +# package diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/__init__.py b/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/__init__.py new file mode 100644 index 00000000..03839458 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/__init__.py @@ -0,0 +1,7 @@ +"""Code generated by Speakeasy (https://speakeasy.com). DO NOT EDIT.""" + +from .httpvalidationerror import * +from .sdkerror import * +from .servererror import * + +__all__ = ["Detail","HTTPValidationError","SDKError","ServerError"] diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/httpvalidationerror.py b/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/httpvalidationerror.py new file mode 100644 index 00000000..9b943ef5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/httpvalidationerror.py @@ -0,0 +1,21 @@ +"""Code generated by Speakeasy (https://speakeasy.com). 
DO NOT EDIT.""" + +from __future__ import annotations +import dataclasses +from ...models.shared import validationerror as shared_validationerror +from dataclasses_json import Undefined, dataclass_json +from typing import List, Optional, Union +from unstructured_client import utils + + +@dataclass_json(undefined=Undefined.EXCLUDE) + +@dataclasses.dataclass +class HTTPValidationError(Exception): + detail: Optional[Detail] = dataclasses.field(default=None, metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('detail'), 'exclude': lambda f: f is None }}) + + + def __str__(self) -> str: + return utils.marshal_json(self, type(self)) + +Detail = Union[List[shared_validationerror.ValidationError], str] diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/sdkerror.py b/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/sdkerror.py new file mode 100644 index 00000000..2e7ef211 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/sdkerror.py @@ -0,0 +1,24 @@ +"""Code generated by Speakeasy (https://speakeasy.com). 
DO NOT EDIT.""" + +import requests as requests_http + + +class SDKError(Exception): + """Represents an error returned by the API.""" + message: str + status_code: int + body: str + raw_response: requests_http.Response + + def __init__(self, message: str, status_code: int, body: str, raw_response: requests_http.Response): + self.message = message + self.status_code = status_code + self.body = body + self.raw_response = raw_response + + def __str__(self): + body = '' + if len(self.body) > 0: + body = f'\n{self.body}' + + return f'{self.message}: Status {self.status_code}{body}' diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/servererror.py b/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/servererror.py new file mode 100644 index 00000000..14e1bd77 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/models/errors/servererror.py @@ -0,0 +1,18 @@ +"""Code generated by Speakeasy (https://speakeasy.com). DO NOT EDIT.""" + +from __future__ import annotations +import dataclasses +from dataclasses_json import Undefined, dataclass_json +from typing import Optional +from unstructured_client import utils + + +@dataclass_json(undefined=Undefined.EXCLUDE) + +@dataclasses.dataclass +class ServerError(Exception): + detail: Optional[str] = dataclasses.field(default=None, metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('detail'), 'exclude': lambda f: f is None }}) + + + def __str__(self) -> str: + return utils.marshal_json(self, type(self)) diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/models/operations/__init__.py b/.venv/lib/python3.12/site-packages/unstructured_client/models/operations/__init__.py new file mode 100644 index 00000000..53ed8933 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/models/operations/__init__.py @@ -0,0 +1,5 @@ +"""Code generated by Speakeasy (https://speakeasy.com). 
DO NOT EDIT.""" + +from .partition import * + +__all__ = ["PartitionRequest","PartitionResponse"] diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/models/operations/partition.py b/.venv/lib/python3.12/site-packages/unstructured_client/models/operations/partition.py new file mode 100644 index 00000000..cb462c2a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/models/operations/partition.py @@ -0,0 +1,30 @@ +"""Code generated by Speakeasy (https://speakeasy.com). DO NOT EDIT.""" + +from __future__ import annotations +import dataclasses +import requests as requests_http +from ...models.shared import partition_parameters as shared_partition_parameters +from typing import Any, Dict, List, Optional + + +@dataclasses.dataclass +class PartitionRequest: + UNSET='__SPEAKEASY_UNSET__' + partition_parameters: shared_partition_parameters.PartitionParameters = dataclasses.field(metadata={'request': { 'media_type': 'multipart/form-data' }}) + unstructured_api_key: Optional[str] = dataclasses.field(default=UNSET, metadata={'header': { 'field_name': 'unstructured-api-key', 'style': 'simple', 'explode': False }}) + + + + +@dataclasses.dataclass +class PartitionResponse: + content_type: str = dataclasses.field() + r"""HTTP response content type for this operation""" + status_code: int = dataclasses.field() + r"""HTTP response status code for this operation""" + raw_response: requests_http.Response = dataclasses.field() + r"""Raw HTTP response; suitable for custom response parsing""" + elements: Optional[List[Dict[str, Any]]] = dataclasses.field(default=None) + r"""Successful Response""" + + diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/models/shared/__init__.py b/.venv/lib/python3.12/site-packages/unstructured_client/models/shared/__init__.py new file mode 100644 index 00000000..b6a7718a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/models/shared/__init__.py @@ -0,0 +1,7 @@ +"""Code 
generated by Speakeasy (https://speakeasy.com). DO NOT EDIT.""" + +from .partition_parameters import * +from .security import * +from .validationerror import * + +__all__ = ["ChunkingStrategy","Files","Loc","OutputFormat","PartitionParameters","Security","Strategy","ValidationError"] diff --git a/.venv/lib/python3.12/site-packages/unstructured_client/models/shared/partition_parameters.py b/.venv/lib/python3.12/site-packages/unstructured_client/models/shared/partition_parameters.py new file mode 100644 index 00000000..69cbcf8d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/unstructured_client/models/shared/partition_parameters.py @@ -0,0 +1,103 @@ +"""Code generated by Speakeasy (https://speakeasy.com). DO NOT EDIT.""" + +from __future__ import annotations +import dataclasses +from enum import Enum +from typing import List, Optional +from unstructured_client import utils + + +class ChunkingStrategy(str, Enum, metaclass=utils.OpenEnumMeta): + BASIC = 'basic' + BY_PAGE = 'by_page' + BY_SIMILARITY = 'by_similarity' + BY_TITLE = 'by_title' + + +@dataclasses.dataclass +class Files: + content: bytes = dataclasses.field(metadata={'multipart_form': { 'content': True }}) + file_name: str = dataclasses.field(metadata={'multipart_form': { 'field_name': 'files' }}) + + + + +class OutputFormat(str, Enum, metaclass=utils.OpenEnumMeta): + r"""The format of the response. Supported formats are application/json and text/csv. Default: application/json.""" + APPLICATION_JSON = 'application/json' + TEXT_CSV = 'text/csv' + + +class Strategy(str, Enum, metaclass=utils.OpenEnumMeta): + r"""The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. 
Default: auto""" + FAST = 'fast' + HI_RES = 'hi_res' + AUTO = 'auto' + OCR_ONLY = 'ocr_only' + + +@dataclasses.dataclass +class PartitionParameters: + UNSET='__SPEAKEASY_UNSET__' + files: Files = dataclasses.field(metadata={'multipart_form': { 'file': True }}) + r"""The file to extract""" + chunking_strategy: Optional[ChunkingStrategy] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'chunking_strategy' }}) + r"""Use one of the supported strategies to chunk the returned elements after partitioning. When 'chunking_strategy' is not specified, no chunking is performed and any other chunking parameters provided are ignored. Supported strategies: 'basic', 'by_page', 'by_similarity', or 'by_title'""" + combine_under_n_chars: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'combine_under_n_chars' }}) + r"""If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500""" + content_type: Optional[str] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'content_type' }}) + r"""A hint about the content type to use (such as text/markdown), when there are problems processing a specific file. This value is a MIME type in the format type/subtype.""" + coordinates: Optional[bool] = dataclasses.field(default=False, metadata={'multipart_form': { 'field_name': 'coordinates' }}) + r"""If `True`, return coordinates for each element extracted via OCR. Default: `False`""" + encoding: Optional[str] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'encoding' }}) + r"""The encoding method used to decode the text input. 
Default: utf-8""" + extract_image_block_types: Optional[List[str]] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'extract_image_block_types' }}) + r"""The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields.""" + gz_uncompressed_content_type: Optional[str] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'gz_uncompressed_content_type' }}) + r"""If file is gzipped, use this content type after unzipping.""" + hi_res_model_name: Optional[str] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'hi_res_model_name' }}) + r"""The name of the inference model used when strategy is hi_res""" + include_orig_elements: Optional[bool] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'include_orig_elements' }}) + r"""When a chunking strategy is specified, each returned chunk will include the elements consolidated to form that chunk as `.metadata.orig_elements`. Default: true.""" + include_page_breaks: Optional[bool] = dataclasses.field(default=False, metadata={'multipart_form': { 'field_name': 'include_page_breaks' }}) + r"""If true, the output will include page breaks if the filetype supports it. Default: false""" + languages: Optional[List[str]] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'languages' }}) + r"""The languages present in the document, for use in partitioning and/or OCR. See the Tesseract documentation for a full list of languages.""" + max_characters: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'max_characters' }}) + r"""If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). 
Default: 500""" + multipage_sections: Optional[bool] = dataclasses.field(default=True, metadata={'multipart_form': { 'field_name': 'multipage_sections' }}) + r"""If chunking strategy is set, determines if sections can span multiple sections. Default: true""" + new_after_n_chars: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'new_after_n_chars' }}) + r"""If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500""" + ocr_languages: Optional[List[str]] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'ocr_languages' }}) + r"""Deprecated! The languages present in the document, for use in partitioning and/or OCR""" + output_format: Optional[OutputFormat] = dataclasses.field(default=OutputFormat.APPLICATION_JSON, metadata={'multipart_form': { 'field_name': 'output_format' }}) + r"""The format of the response. Supported formats are application/json and text/csv. Default: application/json.""" + overlap: Optional[int] = dataclasses.field(default=0, metadata={'multipart_form': { 'field_name': 'overlap' }}) + r"""Specifies the length of a string ('tail') to be drawn from each chunk and prefixed to the next chunk as a context-preserving mechanism. By default, this only applies to split-chunks where an oversized element is divided into multiple chunks by text-splitting. Default: 0""" + overlap_all: Optional[bool] = dataclasses.field(default=False, metadata={'multipart_form': { 'field_name': 'overlap_all' }}) + r"""When `True`, apply overlap between 'normal' chunks formed from whole elements and not subject to text-splitting. Use this with caution as it entails a certain level of 'pollution' of otherwise clean semantic chunk boundaries. Default: False""" + pdf_infer_table_structure: Optional[bool] = dataclasses.field(default=True, metadata={'multipart_form': { 'field_name': 'pdf_infer_table_structure' }}) + r"""Deprecated! 
Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents.""" + similarity_threshold: Optional[float] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'similarity_threshold' }}) + r"""A value between 0.0 and 1.0 describing the minimum similarity two elements must have to be included in the same chunk. Note that similar elements may be separated to meet chunk-size criteria; this value can only guarantees that two elements with similarity below the threshold will appear in separate chunks.""" + skip_infer_table_types: Optional[List[str]] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'skip_infer_table_types' }}) + r"""The document types that you want to skip table extraction with. Default: []""" + split_pdf_allow_failed: Optional[bool] = dataclasses.field(default=False, metadata={'multipart_form': { 'field_name': 'split_pdf_allow_failed' }}) + r"""When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request send to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resuling list of Elements will miss the data from errored pages.""" + split_pdf_concurrency_level: Optional[int] = dataclasses.field(default=5, metadata={'multipart_form': { 'field_name': 'split_pdf_concurrency_level' }}) + r"""When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. 
It's an internal parameter for the Python client and is not sent to the backend.""" + split_pdf_page: Optional[bool] = dataclasses.field(default=True, metadata={'multipart_form': { 'field_name': 'split_pdf_page' }}) + r"""This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend.""" + split_pdf_page_range: Optional[List[int]] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'split_pdf_page_range' }}) + r"""When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. A ValueError is thrown if the given range is invalid. It's an internal parameter for the Python client and is not sent to the backend.""" + starting_page_number: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'starting_page_number' }}) + r"""When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27.""" + strategy: Optional[Strategy] = dataclasses.field(default=Strategy.AUTO, metadata={'multipart_form': { 'field_name': 'strategy' }}) + r"""The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto""" + unique_element_ids: Optional[bool] = dataclasses.field(default=False, metadata={'multipart_form': { 'field_name': 'unique_element_ids' }}) + r"""When `True`, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: `False`""" + xml_keep_tags: Optional[bool] = dataclasses.field(default=False, metadata={'multipart_form': { 'field_name': 'xml_keep_tags' }}) + r"""If `True`, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. 
@dataclasses.dataclass
class Security:
    """Credentials for authenticating against the Unstructured API.

    The API key, when present, is emitted as the ``unstructured-api-key``
    request header (see the ``security`` metadata consumed by
    ``utils.get_security``).
    """

    api_key_auth: Optional[str] = dataclasses.field(
        default=None,
        metadata={
            'security': {
                'scheme': True,
                'type': 'apiKey',
                'sub_type': 'header',
                'field_name': 'unstructured-api-key',
            }
        },
    )
@dataclass_json(undefined=Undefined.EXCLUDE)
@dataclasses.dataclass
class ValidationError:
    """A single validation error entry as returned by the API (FastAPI/Pydantic
    style): the error location path, a human-readable message, and an error type.

    Unknown JSON keys are dropped (``Undefined.EXCLUDE``).
    """

    # Path to the offending input, e.g. ["body", "files"]; each element may be
    # a dict key (str) or a list index (int) — see the Loc alias below.
    loc: List[Loc] = dataclasses.field(metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('loc') }})
    # Human-readable description of the error.
    msg: str = dataclasses.field(metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('msg') }})
    # Machine-readable error category.
    type: str = dataclasses.field(metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('type') }})



# Defined after the class; the forward reference in `loc` resolves because the
# module uses `from __future__ import annotations` (annotations are lazy).
Loc = Union[str, int]
class UnstructuredClient:
    # Sub-SDK exposing the general partition operations.
    general: General

    # Shared configuration (HTTP session, security, server URL, retries, hooks).
    sdk_configuration: SDKConfiguration

    def __init__(self,
                 api_key_auth: Union[Optional[str], Callable[[], Optional[str]]] = None,
                 server: Optional[str] = None,
                 server_url: Optional[str] = None,
                 url_params: Optional[Dict[str, str]] = None,
                 client: Optional[requests_http.Session] = None,
                 retry_config: Optional[RetryConfig] = None
                 ) -> None:
        """Instantiates the SDK configuring it with the provided parameters.

        :param api_key_auth: The api_key_auth required for authentication;
            either a plain key or a zero-argument callable returning one
        :type api_key_auth: Union[Optional[str], Callable[[], Optional[str]]]
        :param server: The server by name to use for all operations
        :type server: str
        :param server_url: The server URL to use for all operations
        :type server_url: str
        :param url_params: Parameters to optionally template the server URL with
        :type url_params: Dict[str, str]
        :param client: The requests.Session HTTP client to use for all operations
        :type client: requests_http.Session
        :param retry_config: The utils.RetryConfig to use globally
        :type retry_config: RetryConfig
        """
        # Default to a fresh requests session so the SDK works out of the box.
        if client is None:
            client = requests_http.Session()

        # A callable credential is wrapped in a closure so it is re-evaluated
        # lazily; a plain value is captured once in a Security instance.
        if callable(api_key_auth):
            def security():
                return shared.Security(api_key_auth = api_key_auth())
        else:
            security = shared.Security(api_key_auth = api_key_auth)

        # Template an explicit server URL with the supplied params, if any.
        if server_url is not None:
            if url_params is not None:
                server_url = utils.template_url(server_url, url_params)


        self.sdk_configuration = SDKConfiguration(
            client,
            security,
            server_url,
            server,
            retry_config=retry_config
        )

        hooks = SDKHooks()

        # Let SDK-init hooks rewrite the base URL and/or the HTTP session
        # before any request is issued.
        current_server_url, *_ = self.sdk_configuration.get_server_details()
        server_url, self.sdk_configuration.client = hooks.sdk_init(current_server_url, self.sdk_configuration.client)
        if current_server_url != server_url:
            self.sdk_configuration.server_url = server_url

        # Store the hook registry directly in the instance dict, replacing the
        # SDKHooks created in SDKConfiguration.__post_init__.
        # pylint: disable=protected-access
        self.sdk_configuration.__dict__['_hooks'] = hooks

        self._init_sdks()


    def _init_sdks(self):
        # Instantiate the sub-SDKs that share this configuration.
        self.general = General(self.sdk_configuration)
# Symbolic server names, mapped to concrete base URLs in SERVERS below.
SERVER_SAAS_API = 'saas-api'
r"""Serverless SaaS API"""
SERVER_FREE_API = 'free-api'
r"""Hosted API Free"""
SERVER_DEVELOPMENT = 'development'
r"""Development server"""
SERVERS = {
    SERVER_SAAS_API: 'https://api.unstructuredapp.io',
    SERVER_FREE_API: 'https://api.unstructured.io',
    SERVER_DEVELOPMENT: 'http://localhost:8000',
}
"""Contains the list of servers available to the SDK"""


@dataclass
class SDKConfiguration:
    """Shared runtime configuration for the SDK: HTTP session, security,
    server selection, version metadata, retry policy, and hook registry.
    """

    # The requests session used for all HTTP calls.
    client: requests_http.Session
    # NOTE(review): typed non-Optional but defaulted to None — confirm callers
    # always supply a Security value or callable before requests are made.
    security: Union[shared.Security,Callable[[], shared.Security]] = None
    # Explicit base URL; when non-empty it takes precedence over `server`.
    server_url: Optional[str] = ''
    # Symbolic server name; resolved via SERVERS in get_server_details().
    server: Optional[str] = ''
    language: str = 'python'
    openapi_doc_version: str = '1.0.44'
    sdk_version: str = '0.25.5'
    gen_version: str = '2.393.4'
    user_agent: str = 'speakeasy-sdk/python 0.25.5 2.393.4 1.0.44 unstructured-client'
    retry_config: Optional[RetryConfig] = None

    def __post_init__(self):
        # Default hook registry; UnstructuredClient.__init__ may replace it.
        self._hooks = SDKHooks()

    def get_server_details(self) -> Tuple[str, Dict[str, str]]:
        """Return (base_url, url_params): an explicit server_url wins,
        otherwise the named server (defaulting to saas-api) is looked up.

        :raises ValueError: if `server` is not a known server name.
        """
        if self.server_url is not None and self.server_url != '':
            # Normalize by stripping a single trailing slash.
            return utils.remove_suffix(self.server_url, '/'), {}
        if not self.server:
            self.server = SERVER_SAAS_API

        if self.server not in SERVERS:
            raise ValueError(f"Invalid server \"{self.server}\"")

        return SERVERS[self.server], {}


    def get_hooks(self) -> SDKHooks:
        """Return the hook registry attached to this configuration."""
        return self._hooks
class OpenEnumMeta(enum.EnumMeta):
    """Metaclass implementing "open" enums.

    Looking up a value that is not a declared member returns the raw value
    instead of raising ``ValueError``, so unrecognized values coming from the
    server deserialize without error. Functional-API class creation (when
    ``names`` is given) behaves exactly like ``enum.EnumMeta``.
    """

    def __call__(cls, value, names=None, *, module=None, qualname=None, type=None, start=1):
        # `type` shadows the builtin on purpose to mirror EnumMeta's API.
        # pylint: disable=redefined-builtin
        call_kwargs = dict(
            names=names, module=module, qualname=qualname, type=type, start=start
        )

        # Creating a new enum class via the functional API: delegate as-is.
        if names is not None:
            return super().__call__(value, **call_kwargs)

        try:
            # Ordinary member lookup by value.
            return super().__call__(value, **call_kwargs)
        except ValueError:
            # Unknown member: pass the value through unchanged.
            return value
class BackoffStrategy:
    """Timing parameters for exponential backoff. All intervals/durations are
    in milliseconds; `exponent` is the per-retry growth factor."""

    initial_interval: int
    max_interval: int
    exponent: float
    max_elapsed_time: int

    def __init__(self, initial_interval: int, max_interval: int, exponent: float, max_elapsed_time: int):
        self.initial_interval = initial_interval
        self.max_interval = max_interval
        self.exponent = exponent
        self.max_elapsed_time = max_elapsed_time


class RetryConfig:
    """Retry policy: strategy name ('backoff' enables retries in retry()),
    backoff timing, and whether connection/timeout errors are retried."""

    strategy: str
    backoff: BackoffStrategy
    retry_connection_errors: bool

    def __init__(self, strategy: str, backoff: BackoffStrategy, retry_connection_errors: bool):
        self.strategy = strategy
        self.backoff = backoff
        self.retry_connection_errors = retry_connection_errors


class Retries:
    """A RetryConfig plus the HTTP status codes (exact like '429' or wildcard
    like '5XX') that should trigger a retry."""

    config: RetryConfig
    status_codes: List[str]

    def __init__(self, config: RetryConfig, status_codes: List[str]):
        self.config = config
        self.status_codes = status_codes


class TemporaryError(Exception):
    """Raised internally when a response matched a retryable status code.
    NOTE: super().__init__ is not called, so the exception carries no message —
    only the response attribute."""

    response: requests.Response

    def __init__(self, response: requests.Response):
        self.response = response


class PermanentError(Exception):
    """Raised internally to abort retrying; wraps the original exception,
    which retry_with_backoff re-raises. super().__init__ is not called."""

    inner: Exception

    def __init__(self, inner: Exception):
        self.inner = inner


def retry(func, retries: Retries):
    """Invoke `func` (a zero-arg callable returning a requests.Response),
    retrying with exponential backoff when the strategy is 'backoff'.

    Responses whose status matches `retries.status_codes` are treated as
    temporary failures; connection/timeout errors are retried only when
    `retry_connection_errors` is set. Any other strategy calls `func` once.
    """
    if retries.config.strategy == 'backoff':
        def do_request():
            res: requests.Response
            try:
                res = func()

                for code in retries.status_codes:
                    if "X" in code.upper():
                        # Wildcard class such as '5XX': match on the hundreds digit.
                        code_range = int(code[0])

                        status_major = res.status_code / 100

                        if status_major >= code_range and status_major < code_range + 1:
                            raise TemporaryError(res)
                    else:
                        # Exact status code match, e.g. '429'.
                        parsed_code = int(code)

                        if res.status_code == parsed_code:
                            raise TemporaryError(res)
            except requests.exceptions.ConnectionError as exception:
                if retries.config.retry_connection_errors:
                    # Re-raised as-is: caught generically by retry_with_backoff
                    # and retried.
                    raise

                raise PermanentError(exception) from exception
            except requests.exceptions.Timeout as exception:
                if retries.config.retry_connection_errors:
                    raise

                raise PermanentError(exception) from exception
            except TemporaryError:
                # Already classified as retryable; let it propagate.
                raise
            except Exception as exception:
                # Anything unexpected aborts retrying.
                raise PermanentError(exception) from exception

            return res

        return retry_with_backoff(do_request, retries.config.backoff.initial_interval, retries.config.backoff.max_interval, retries.config.backoff.exponent, retries.config.backoff.max_elapsed_time)

    return func()


def retry_with_backoff(func, initial_interval=500, max_interval=60000, exponent=1.5, max_elapsed_time=3600000):
    """Call `func` until it succeeds, sleeping initial_interval * exponent**n
    (plus jitter, capped at max_interval) between attempts. After
    max_elapsed_time ms the last retryable response is returned (for
    TemporaryError) or the exception re-raised. Intervals are milliseconds.
    """
    start = round(time.time()*1000)
    retries = 0

    while True:
        try:
            return func()
        except PermanentError as exception:
            # Unwrap and surface the original non-retryable error.
            raise exception.inner
        except Exception as exception: # pylint: disable=broad-exception-caught
            now = round(time.time()*1000)
            if now - start > max_elapsed_time:
                # Budget exhausted: hand back the last response if we have one.
                if isinstance(exception, TemporaryError):
                    return exception.response

                raise
            # Exponential backoff with up to 1s of random jitter (seconds).
            sleep = ((initial_interval/1000) *
                     exponent**retries + random.uniform(0, 1))
            sleep = min(sleep, max_interval / 1000)
            time.sleep(sleep)
            retries += 1
def get_security(security: Any) -> Tuple[Dict[str, str], Dict[str, str]]:
    """Derive request security from a security dataclass.

    Walks the dataclass fields carrying ``security`` metadata and returns a
    ``(headers, query_params)`` pair. ``None`` input yields two empty dicts.
    """
    header_out: Dict[str, str] = {}
    query_out: Dict[str, str] = {}

    if security is None:
        return header_out, query_out

    for sec_field in fields(security):
        field_value = getattr(security, sec_field.name)
        sec_meta = sec_field.metadata.get("security")
        if field_value is None or sec_meta is None:
            continue

        if sec_meta.get("option"):
            # An "option" groups alternative schemes; the first populated one
            # wins and processing stops immediately.
            _parse_security_option(header_out, query_out, field_value)
            return header_out, query_out

        if sec_meta.get("scheme"):
            # Basic auth may be flattened onto the parent security object
            # instead of living in its own nested dataclass.
            flattened_basic = (
                sec_meta.get("sub_type") == "basic" and not is_dataclass(field_value)
            )
            target = security if flattened_basic else field_value
            _parse_security_scheme(header_out, query_out, sec_meta, target)

    return header_out, query_out
def _parse_security_option(
    headers: Dict[str, str], query_params: Dict[str, str], option: Any
):
    """Apply every scheme-bearing field of an option group to headers/query."""
    opt_fields: Tuple[Field, ...] = fields(option)
    for opt_field in opt_fields:
        metadata = opt_field.metadata.get("security")
        if metadata is None or metadata.get("scheme") is None:
            continue
        _parse_security_scheme(
            headers, query_params, metadata, getattr(option, opt_field.name)
        )


def _parse_security_scheme(
    headers: Dict[str, str],
    query_params: Dict[str, str],
    scheme_metadata: Dict,
    scheme: Any,
):
    """Apply one security scheme (a dataclass of credential fields, or a bare
    value) to the outgoing headers/query params, driven by its metadata."""
    scheme_type = scheme_metadata.get("type")
    sub_type = scheme_metadata.get("sub_type")

    if is_dataclass(scheme):
        if scheme_type == "http" and sub_type == "basic":
            # Basic auth combines username+password into one header.
            _parse_basic_auth_scheme(headers, scheme)
            return

        scheme_fields: Tuple[Field, ...] = fields(scheme)
        for scheme_field in scheme_fields:
            metadata = scheme_field.metadata.get("security")
            if metadata is None or metadata.get("field_name") is None:
                continue

            value = getattr(scheme, scheme_field.name)

            _parse_security_scheme_value(
                headers, query_params, scheme_metadata, metadata, value
            )
    else:
        # Bare value: the scheme metadata doubles as the field metadata.
        _parse_security_scheme_value(
            headers, query_params, scheme_metadata, scheme_metadata, scheme
        )


def _parse_security_scheme_value(
    headers: Dict[str, str],
    query_params: Dict[str, str],
    scheme_metadata: Dict,
    security_metadata: Dict,
    value: Any,
):
    """Place one credential value into headers or query params per its scheme.

    :raises Exception: for unsupported scheme/sub-type combinations.
    """
    scheme_type = scheme_metadata.get("type")
    sub_type = scheme_metadata.get("sub_type")

    # Despite the name, this is also used as the query-param key below.
    header_name = str(security_metadata.get("field_name"))

    if scheme_type == "apiKey":
        if sub_type == "header":
            headers[header_name] = value
        elif sub_type == "query":
            query_params[header_name] = value
        else:
            raise Exception("not supported")
    elif scheme_type == "openIdConnect":
        headers[header_name] = _apply_bearer(value)
    elif scheme_type == "oauth2":
        # client_credentials tokens are acquired elsewhere; nothing to emit.
        if sub_type != "client_credentials":
            headers[header_name] = _apply_bearer(value)
    elif scheme_type == "http":
        if sub_type == "bearer":
            headers[header_name] = _apply_bearer(value)
        else:
            raise Exception("not supported")
    else:
        raise Exception("not supported")
def _apply_bearer(token: str) -> str:
    """Return *token* prefixed with ``"Bearer "`` unless it already starts
    with the prefix (case-insensitive).

    Fix: replaces the fragile ``cond and a or b`` idiom with a conditional
    expression; behavior is identical for all string inputs.
    """
    return token if token.lower().startswith("bearer ") else f"Bearer {token}"


def _parse_basic_auth_scheme(headers: Dict[str, str], scheme: Any):
    """Build an HTTP Basic ``Authorization`` header from a credentials
    dataclass whose fields carry ``security`` metadata naming ``username``
    and ``password``. Missing fields default to the empty string.
    """
    username = ""
    password = ""

    scheme_fields: Tuple[Field, ...] = fields(scheme)
    for scheme_field in scheme_fields:
        metadata = scheme_field.metadata.get("security")
        if metadata is None or metadata.get("field_name") is None:
            continue

        field_name = metadata.get("field_name")
        value = getattr(scheme_field and scheme, scheme_field.name)

        if field_name == "username":
            username = value
        if field_name == "password":
            password = value

    # RFC 7617: base64("user:pass").
    data = f"{username}:{password}".encode()
    headers["Authorization"] = f"Basic {base64.b64encode(data).decode()}"


def generate_url(
    server_url: str,
    path: str,
    path_params: Any,
    gbls: Optional[Any] = None,
) -> str:
    """Join *server_url* and *path*, substituting ``{name}`` placeholders from
    the path-params dataclass (globals in *gbls* fill any fields the params
    object leaves unset). The server URL's trailing slash is stripped first.
    """
    path_param_values: Dict[str, str] = {}

    globals_already_populated = _populate_path_params(
        path_params, gbls, path_param_values, []
    )
    if gbls is not None:
        # Only fill from globals the fields not already set above.
        _populate_path_params(gbls, None, path_param_values, globals_already_populated)

    for key, value in path_param_values.items():
        path = path.replace("{" + key + "}", value, 1)

    return remove_suffix(server_url, "/") + path
def _populate_path_params(
    path_params: Any,
    gbls: Any,
    path_param_values: Dict[str, str],
    skip_fields: List[str],
) -> List[str]:
    """Serialize dataclass fields carrying ``path_param`` metadata into
    *path_param_values* (name -> rendered string).

    Fields listed in *skip_fields* are ignored; values found via *gbls*
    (globals) are recorded in the returned list so a second pass can skip them.
    """
    globals_already_populated: List[str] = []

    path_param_fields: Tuple[Field, ...] = fields(path_params)
    for field in path_param_fields:
        if field.name in skip_fields:
            continue

        param_metadata = field.metadata.get("path_param")
        if param_metadata is None:
            continue

        param = getattr(path_params, field.name) if path_params is not None else None
        param, global_found = _populate_from_globals(
            field.name, param, "path_param", gbls
        )
        if global_found:
            globals_already_populated.append(field.name)

        if param is None:
            continue

        f_name = param_metadata.get("field_name", field.name)
        serialization = param_metadata.get("serialization", "")
        if serialization != "":
            # Custom serialization (e.g. JSON) takes precedence over styles.
            serialized_params = _get_serialized_params(
                param_metadata, field.type, f_name, param
            )
            for key, value in serialized_params.items():
                path_param_values[key] = value
        else:
            # Only the OpenAPI "simple" style is handled here.
            if param_metadata.get("style", "simple") == "simple":
                if isinstance(param, List):
                    # simple list: comma-joined values, Nones dropped.
                    pp_vals: List[str] = []
                    for pp_val in param:
                        if pp_val is None:
                            continue
                        pp_vals.append(_val_to_string(pp_val))
                    path_param_values[param_metadata.get("field_name", field.name)] = (
                        ",".join(pp_vals)
                    )
                elif isinstance(param, Dict):
                    # simple dict: "k=v" pairs when exploded, else "k,v" pairs.
                    pp_vals: List[str] = []
                    for pp_key in param:
                        if param[pp_key] is None:
                            continue
                        if param_metadata.get("explode"):
                            pp_vals.append(f"{pp_key}={_val_to_string(param[pp_key])}")
                        else:
                            pp_vals.append(f"{pp_key},{_val_to_string(param[pp_key])}")
                    path_param_values[param_metadata.get("field_name", field.name)] = (
                        ",".join(pp_vals)
                    )
                elif not isinstance(param, (str, int, float, complex, bool, Decimal)):
                    # Nested dataclass: serialize its own path_param fields.
                    pp_vals: List[str] = []
                    param_fields: Tuple[Field, ...] = fields(param)
                    for param_field in param_fields:
                        param_value_metadata = param_field.metadata.get("path_param")
                        if not param_value_metadata:
                            continue

                        # NOTE(review): falls back to the OUTER field.name, not
                        # param_field.name — confirm this is intended.
                        param_name = param_value_metadata.get("field_name", field.name)

                        param_field_val = getattr(param, param_field.name)
                        if param_field_val is None:
                            continue
                        if param_metadata.get("explode"):
                            pp_vals.append(
                                f"{param_name}={_val_to_string(param_field_val)}"
                            )
                        else:
                            pp_vals.append(
                                f"{param_name},{_val_to_string(param_field_val)}"
                            )
                    path_param_values[param_metadata.get("field_name", field.name)] = (
                        ",".join(pp_vals)
                    )
                else:
                    # Scalar: stringify directly.
                    path_param_values[param_metadata.get("field_name", field.name)] = (
                        _val_to_string(param)
                    )

    return globals_already_populated


def is_optional(field):
    """True when the annotation is Optional[...] (a Union including None)."""
    return get_origin(field) is Union and type(None) in get_args(field)


def template_url(url_with_params: str, params: Dict[str, str]) -> str:
    """Replace every ``{key}`` placeholder in the URL with its value."""
    for key, value in params.items():
        url_with_params = url_with_params.replace("{" + key + "}", value)

    return url_with_params


def get_query_params(
    query_params: Any,
    gbls: Optional[Any] = None,
) -> Dict[str, List[str]]:
    """Serialize a query-params dataclass (plus optional globals) into a
    name -> list-of-values mapping; globals only fill fields left unset."""
    params: Dict[str, List[str]] = {}

    globals_already_populated = _populate_query_params(query_params, gbls, params, [])
    if gbls is not None:
        _populate_query_params(gbls, None, params, globals_already_populated)

    return params
def _populate_query_params(
    query_params: Any,
    gbls: Any,
    query_param_values: Dict[str, List[str]],
    skip_fields: List[str],
) -> List[str]:
    """Serialize dataclass fields carrying ``query_param`` metadata into
    *query_param_values*, honoring the field's serialization or style
    (deepObject / form / pipeDelimited). Returns the field names that were
    filled from *gbls* so a later pass can skip them.
    """
    globals_already_populated: List[str] = []

    param_fields: Tuple[Field, ...] = fields(query_params)
    for field in param_fields:
        if field.name in skip_fields:
            continue

        metadata = field.metadata.get("query_param")
        if not metadata:
            continue

        param_name = field.name
        value = getattr(query_params, param_name) if query_params is not None else None

        value, global_found = _populate_from_globals(
            param_name, value, "query_param", gbls
        )
        if global_found:
            globals_already_populated.append(param_name)

        f_name = metadata.get("field_name")
        serialization = metadata.get("serialization", "")
        if serialization != "":
            serialized_parms = _get_serialized_params(
                metadata, field.type, f_name, value
            )
            # NOTE: the loop variable shadows the outer `value` from here on.
            for key, value in serialized_parms.items():
                if key in query_param_values:
                    query_param_values[key].extend(value)
                else:
                    query_param_values[key] = [value]
        else:
            style = metadata.get("style", "form")
            if style == "deepObject":
                _populate_deep_object_query_params(
                    metadata, f_name, value, query_param_values
                )
            elif style == "form":
                _populate_delimited_query_params(
                    metadata, f_name, value, ",", query_param_values
                )
            elif style == "pipeDelimited":
                _populate_delimited_query_params(
                    metadata, f_name, value, "|", query_param_values
                )
            else:
                raise Exception("not yet implemented")

    return globals_already_populated


def get_headers(headers_params: Any, gbls: Optional[Any] = None) -> Dict[str, str]:
    """Serialize a headers dataclass (plus optional globals) into a header
    name -> value mapping; globals only fill fields left unset."""
    headers: Dict[str, str] = {}

    globals_already_populated = []
    if headers_params is not None:
        globals_already_populated = _populate_headers(headers_params, gbls, headers, [])
    if gbls is not None:
        _populate_headers(gbls, None, headers, globals_already_populated)

    return headers
def _populate_headers(
    headers_params: Any,
    gbls: Any,
    header_values: Dict[str, str],
    skip_fields: List[str],
) -> List[str]:
    """Serialize dataclass fields carrying ``header`` metadata into
    *header_values*; empty serializations are dropped. Returns the field
    names filled from *gbls* so a second pass can skip them."""
    globals_already_populated: List[str] = []

    param_fields: Tuple[Field, ...] = fields(headers_params)
    for field in param_fields:
        if field.name in skip_fields:
            continue

        metadata = field.metadata.get("header")
        if not metadata:
            continue

        value, global_found = _populate_from_globals(
            field.name, getattr(headers_params, field.name), "header", gbls
        )
        if global_found:
            globals_already_populated.append(field.name)
        value = _serialize_header(metadata.get("explode", False), value)

        if value != "":
            header_values[metadata.get("field_name", field.name)] = value

    return globals_already_populated


def _get_serialized_params(
    metadata: Dict, field_type: type, field_name: str, obj: Any
) -> Dict[str, str]:
    """Serialize *obj* per the metadata's ``serialization`` mode. Only
    ``json`` is supported; anything else yields an empty dict."""
    params: Dict[str, str] = {}

    serialization = metadata.get("serialization", "")
    if serialization == "json":
        params[metadata.get("field_name", field_name)] = marshal_json(obj, field_type)

    return params


def _populate_deep_object_query_params(
    metadata: Dict, field_name: str, obj: Any, params: Dict[str, List[str]]
):
    """Entry point for deepObject-style query params: dispatch on whether the
    value is a dataclass or a dict; other types are ignored."""
    if obj is None:
        return

    if is_dataclass(obj):
        _populate_deep_object_query_params_dataclass(metadata.get("field_name", field_name), obj, params)
    elif isinstance(obj, Dict):
        _populate_deep_object_query_params_dict(metadata.get("field_name", field_name), obj, params)


def _populate_deep_object_query_params_dataclass(
    prior_params_key: str, obj: Any, params: Dict[str, List[str]]
):
    """Recursively flatten a dataclass into ``parent[child]...`` query keys."""
    if obj is None:
        return

    if not is_dataclass(obj):
        return

    obj_fields: Tuple[Field, ...] = fields(obj)
    for obj_field in obj_fields:
        obj_param_metadata = obj_field.metadata.get("query_param")
        if not obj_param_metadata:
            continue

        obj_val = getattr(obj, obj_field.name)
        if obj_val is None:
            continue

        params_key = f'{prior_params_key}[{obj_param_metadata.get("field_name", obj_field.name)}]'

        if is_dataclass(obj_val):
            _populate_deep_object_query_params_dataclass(params_key, obj_val, params)
        elif isinstance(obj_val, Dict):
            _populate_deep_object_query_params_dict(params_key, obj_val, params)
        elif isinstance(obj_val, List):
            _populate_deep_object_query_params_list(params_key, obj_val, params)
        else:
            params[params_key] = [_val_to_string(obj_val)]


def _populate_deep_object_query_params_dict(
    prior_params_key: str, value: Dict, params: Dict[str, List[str]]
):
    """Recursively flatten a dict into ``parent[key]...`` query keys."""
    if value is None:
        return

    for key, val in value.items():
        if val is None:
            continue

        params_key = f'{prior_params_key}[{key}]'

        if is_dataclass(val):
            _populate_deep_object_query_params_dataclass(params_key, val, params)
        elif isinstance(val, Dict):
            _populate_deep_object_query_params_dict(params_key, val, params)
        elif isinstance(val, List):
            _populate_deep_object_query_params_list(params_key, val, params)
        else:
            params[params_key] = [_val_to_string(val)]


def _populate_deep_object_query_params_list(
    params_key: str, value: List, params: Dict[str, List[str]]
):
    """Append each non-None list element as another value under *params_key*."""
    if value is None:
        return

    for val in value:
        if val is None:
            continue

        if params.get(params_key) is None:
            params[params_key] = []

        params[params_key].append(_val_to_string(val))


def _get_query_param_field_name(obj_field: Field) -> str:
    """Wire name for a query-param field, or "" when it has no metadata."""
    obj_param_metadata = obj_field.metadata.get("query_param")

    if not obj_param_metadata:
        return ""

    return obj_param_metadata.get("field_name", obj_field.name)
def _populate_delimited_query_params(
    metadata: Dict,
    field_name: str,
    obj: Any,
    delimiter: str,
    query_param_values: Dict[str, List[str]],
):
    """form/pipeDelimited query-param styles share _populate_form, differing
    only in the delimiter ("," vs "|")."""
    _populate_form(
        field_name,
        metadata.get("explode", True),
        obj,
        _get_query_param_field_name,
        delimiter,
        query_param_values,
    )


# Maps the generator's serialization-method names to HTTP content types.
SERIALIZATION_METHOD_TO_CONTENT_TYPE = {
    "json": "application/json",
    "form": "application/x-www-form-urlencoded",
    "multipart": "multipart/form-data",
    "raw": "application/octet-stream",
    "string": "text/plain",
}


def serialize_request_body(
    request: Any,
    request_type: type,
    request_field_name: str,
    nullable: bool,
    optional: bool,
    serialization_method: str,
    encoder=None,
) -> Tuple[Optional[str], Optional[Any], Optional[Any]]:
    """Serialize *request* for sending, returning
    (media_type, body, multipart_form) — any of which may be None.

    A missing optional (non-nullable) body short-circuits to (None, None, None).
    When *request* is a dataclass exposing *request_field_name*, that field's
    ``request`` metadata chooses the media type; otherwise the whole object is
    serialized per *serialization_method*.

    :raises Exception: if the field exists but has no ``request`` metadata.
    """
    if request is None:
        if not nullable and optional:
            return None, None, None

    if not is_dataclass(request) or not hasattr(request, request_field_name):
        return serialize_content_type(
            request_field_name,
            request_type,
            SERIALIZATION_METHOD_TO_CONTENT_TYPE[serialization_method],
            request,
            encoder,
        )

    request_val = getattr(request, request_field_name)

    if request_val is None:
        if not nullable and optional:
            return None, None, None

    request_fields: Tuple[Field, ...] = fields(request)
    request_metadata = None

    for field in request_fields:
        if field.name == request_field_name:
            request_metadata = field.metadata.get("request")
            break

    if request_metadata is None:
        raise Exception("invalid request type")

    # NOTE(review): `encoder` is not forwarded on this path, unlike the call
    # above — confirm whether custom encoders should apply here too.
    return serialize_content_type(
        request_field_name,
        request_type,
        request_metadata.get("media_type", "application/octet-stream"),
        request_val,
    )


def serialize_content_type(
    field_name: str, request_type: Any, media_type: str, request: Any, encoder=None
) -> Tuple[Optional[str], Optional[Any], Optional[List[List[Any]]]]:
    """Serialize *request* according to *media_type* (JSON, multipart,
    urlencoded form, or raw bytes/str), returning
    (media_type, body, multipart_form).

    :raises Exception: for unserializable type / media-type combinations.
    """
    if re.match(r"(application|text)\/.*?\+*json.*", media_type) is not None:
        return media_type, marshal_json(request, request_type, encoder), None
    if re.match(r"multipart\/.*", media_type) is not None:
        return serialize_multipart_form(media_type, request)
    if re.match(r"application\/x-www-form-urlencoded.*", media_type) is not None:
        return media_type, serialize_form_data(field_name, request), None
    if isinstance(request, (bytes, bytearray)):
        return media_type, request, None
    if isinstance(request, str):
        return media_type, request, None

    raise Exception(
        f"invalid request body type {type(request)} for mediaType {media_type}"
    )


def serialize_multipart_form(
    media_type: str, request: Any
) -> Tuple[str, Any, List[List[Any]]]:
    """Build a requests-style multipart form list from a dataclass whose
    fields carry ``multipart_form`` metadata (file parts, JSON parts, and
    plain fields; list values are exploded as ``name[]`` entries).

    :raises Exception: when a file part is missing name/filename/content.
    """
    form: List[List[Any]] = []
    request_fields = fields(request)

    for field in request_fields:
        val = getattr(request, field.name)
        if val is None:
            continue

        field_metadata = field.metadata.get("multipart_form")
        if not field_metadata:
            continue

        if field_metadata.get("file") is True:
            # File part: a nested dataclass with content + file-name fields.
            file_fields = fields(val)

            file_name = ""
            field_name = ""
            content = bytes()

            for file_field in file_fields:
                file_metadata = file_field.metadata.get("multipart_form")
                if file_metadata is None:
                    continue

                if file_metadata.get("content") is True:
                    content = getattr(val, file_field.name)
                else:
                    field_name = file_metadata.get("field_name", file_field.name)
                    file_name = getattr(val, file_field.name)
            if field_name == "" or file_name == "" or content == bytes():
                raise Exception("invalid multipart/form-data file")

            form.append([field_name, [file_name, content]])
        elif field_metadata.get("json") is True:
            # JSON part: serialized value with explicit content type.
            to_append = [
                field_metadata.get("field_name", field.name),
                [None, marshal_json(val, field.type), "application/json"],
            ]
            form.append(to_append)
        else:
            field_name = field_metadata.get("field_name", field.name)
            if isinstance(val, List):
                for value in val:
                    if value is None:
                        continue
                    form.append([field_name + "[]", [None, _val_to_string(value)]])
            else:
                form.append([field_name, [None, _val_to_string(val)]])
    return media_type, None, form


def serialize_dict(
    original: Dict, explode: bool, field_name, existing: Optional[Dict[str, List[str]]]
) -> Dict[str, List[str]]:
    """Merge *original* into *existing* (created when None): exploded entries
    keep their own keys; otherwise all pairs collapse into one comma-joined
    "k,v,k,v" value under *field_name*."""
    if existing is None:
        existing = {}

    if explode is True:
        for key, val in original.items():
            if key not in existing:
                existing[key] = []
            existing[key].append(val)
    else:
        temp = []
        for key, val in original.items():
            temp.append(str(key))
            temp.append(str(val))
        if field_name not in existing:
            existing[field_name] = []
        existing[field_name].append(",".join(temp))
    return existing


def serialize_form_data(field_name: str, data: Any) -> Dict[str, Any]:
    """Serialize *data* (a dataclass with ``form`` metadata, or a plain dict)
    into urlencoded-form values.

    :raises Exception: for unsupported form styles or body types.
    """
    form: Dict[str, List[str]] = {}

    if is_dataclass(data):
        for field in fields(data):
            val = getattr(data, field.name)
            if val is None:
                continue

            metadata = field.metadata.get("form")
            if metadata is None:
                continue

            # NOTE: rebinds the `field_name` parameter from here on.
            field_name = metadata.get("field_name", field.name)

            if metadata.get("json"):
                form[field_name] = [marshal_json(val, field.type)]
            else:
                if metadata.get("style", "form") == "form":
                    _populate_form(
                        field_name,
                        metadata.get("explode", True),
                        val,
                        _get_form_field_name,
                        ",",
                        form,
                    )
                else:
                    raise Exception(f"Invalid form style for field {field.name}")
    elif isinstance(data, Dict):
        for key, value in data.items():
            form[key] = [_val_to_string(value)]
    else:
        raise Exception(f"Invalid request body type for field {field_name}")

    return form


def _get_form_field_name(obj_field: Field) -> str:
    """Wire name for a form field, or "" when it has no ``form`` metadata."""
    obj_param_metadata = obj_field.metadata.get("form")

    if not obj_param_metadata:
        return ""

    return obj_param_metadata.get("field_name", obj_field.name)


def _populate_form(
    field_name: str,
    explode: boolean,
    obj: Any,
    get_field_name_func: Callable,
    delimiter: str,
    form: Dict[str, List[str]],
):
    """Shared form-style serializer for dataclasses, dicts, lists, and
    scalars: exploded values get their own keys, otherwise entries are
    joined with *delimiter* under *field_name*. Mutates and returns *form*.
    """
    if obj is None:
        return form

    if is_dataclass(obj):
        items = []

        obj_fields: Tuple[Field, ...] = fields(obj)
        for obj_field in obj_fields:
            obj_field_name = get_field_name_func(obj_field)
            if obj_field_name == "":
                continue

            val = getattr(obj, obj_field.name)
            if val is None:
                continue

            if explode:
                form[obj_field_name] = [_val_to_string(val)]
            else:
                items.append(f"{obj_field_name}{delimiter}{_val_to_string(val)}")

        if len(items) > 0:
            form[field_name] = [delimiter.join(items)]
    elif isinstance(obj, Dict):
        items = []
        for key, value in obj.items():
            if value is None:
                continue

            if explode:
                form[key] = [_val_to_string(value)]
            else:
                items.append(f"{key}{delimiter}{_val_to_string(value)}")

        if len(items) > 0:
            form[field_name] = [delimiter.join(items)]
    elif isinstance(obj, List):
        items = []

        for value in obj:
            if value is None:
                continue

            if explode:
                if not field_name in form:
                    form[field_name] = []
                form[field_name].append(_val_to_string(value))
            else:
                items.append(_val_to_string(value))

        if len(items) > 0:
            form[field_name] = [delimiter.join([str(item) for item in items])]
    else:
        form[field_name] = [_val_to_string(obj)]

    return form
= fields(obj) + for obj_field in obj_fields: + obj_param_metadata = obj_field.metadata.get("header") + + if not obj_param_metadata: + continue + + obj_field_name = obj_param_metadata.get("field_name", obj_field.name) + if obj_field_name == "": + continue + + val = getattr(obj, obj_field.name) + if val is None: + continue + + if explode: + items.append(f"{obj_field_name}={_val_to_string(val)}") + else: + items.append(obj_field_name) + items.append(_val_to_string(val)) + + if len(items) > 0: + return ",".join(items) + elif isinstance(obj, Dict): + items = [] + + for key, value in obj.items(): + if value is None: + continue + + if explode: + items.append(f"{key}={_val_to_string(value)}") + else: + items.append(key) + items.append(_val_to_string(value)) + + if len(items) > 0: + return ",".join([str(item) for item in items]) + elif isinstance(obj, List): + items = [] + + for value in obj: + if value is None: + continue + + items.append(_val_to_string(value)) + + if len(items) > 0: + return ",".join(items) + else: + return f"{_val_to_string(obj)}" + + return "" + + +def unmarshal_json(data, typ, decoder=None, infer_missing=False): + unmarshal = make_dataclass("Unmarshal", [("res", typ)], bases=(DataClassJsonMixin,)) + json_dict = json.loads(data) + try: + out = unmarshal.from_dict({"res": json_dict}, infer_missing=infer_missing) + except AttributeError as attr_err: + raise AttributeError( + f"unable to unmarshal {data} as {typ} - {attr_err}" + ) from attr_err + + return out.res if decoder is None else decoder(out.res) + + +def marshal_json(val, typ, encoder=None): + if not is_optional_type(typ) and val is None: + raise ValueError(f"Could not marshal None into non-optional type: {typ}") + + marshal = make_dataclass("Marshal", [("res", typ)], bases=(DataClassJsonMixin,)) + marshaller = marshal(res=val) + json_dict = marshaller.to_dict() + val = json_dict["res"] if encoder is None else encoder(json_dict["res"]) + + return json.dumps(val, separators=(",", ":"), 
sort_keys=True) + + +def match_content_type(content_type: str, pattern: str) -> boolean: + if pattern in (content_type, "*", "*/*"): + return True + + msg = Message() + msg["content-type"] = content_type + media_type = msg.get_content_type() + + if media_type == pattern: + return True + + parts = media_type.split("/") + if len(parts) == 2: + if pattern in (f"{parts[0]}/*", f"*/{parts[1]}"): + return True + + return False + + +def match_status_codes(status_codes: List[str], status_code: int) -> bool: + for code in status_codes: + if code == str(status_code): + return True + + if code.endswith("XX") and code.startswith(str(status_code)[:1]): + return True + return False + + +def datetimeisoformat(optional: bool): + def isoformatoptional(val): + if optional and val is None: + return None + return _val_to_string(val) + + return isoformatoptional + + +def dateisoformat(optional: bool): + def isoformatoptional(val): + if optional and val is None: + return None + return date.isoformat(val) + + return isoformatoptional + + +def datefromisoformat(date_str: str): + return dateutil.parser.parse(date_str).date() + + +def bigintencoder(optional: bool): + def bigintencode(val: int): + if optional and val is None: + return None + return str(val) + + return bigintencode + + +def bigintdecoder(val): + if val is None: + return None + + if isinstance(val, float): + raise ValueError(f"{val} is a float") + return int(val) + +def integerstrencoder(optional: bool): + def integerstrencode(val: int): + if optional and val is None: + return None + return str(val) + + return integerstrencode + + +def integerstrdecoder(val): + if val is None: + return None + + if isinstance(val, float): + raise ValueError(f"{val} is a float") + return int(val) + + +def numberstrencoder(optional: bool): + def numberstrencode(val: float): + if optional and val is None: + return None + return str(val) + + return numberstrencode + + +def numberstrdecoder(val): + if val is None: + return None + + return float(val) 
+ + +def decimalencoder(optional: bool, as_str: bool): + def decimalencode(val: Decimal): + if optional and val is None: + return None + + if as_str: + return str(val) + + return float(val) + + return decimalencode + + +def decimaldecoder(val): + if val is None: + return None + + return Decimal(str(val)) + + +def map_encoder(optional: bool, value_encoder: Callable): + def map_encode(val: Dict): + if optional and val is None: + return None + + encoded = {} + for key, value in val.items(): + encoded[key] = value_encoder(value) + + return encoded + + return map_encode + + +def map_decoder(value_decoder: Callable): + def map_decode(val: Dict): + decoded = {} + for key, value in val.items(): + decoded[key] = value_decoder(value) + + return decoded + + return map_decode + + +def list_encoder(optional: bool, value_encoder: Callable): + def list_encode(val: List): + if optional and val is None: + return None + + encoded = [] + for value in val: + encoded.append(value_encoder(value)) + + return encoded + + return list_encode + + +def list_decoder(value_decoder: Callable): + def list_decode(val: List): + decoded = [] + for value in val: + decoded.append(value_decoder(value)) + + return decoded + + return list_decode + + +def union_encoder(all_encoders: Dict[str, Callable]): + def selective_encoder(val: Any): + if type(val) in all_encoders: + return all_encoders[type(val)](val) + return val + + return selective_encoder + + +def union_decoder(all_decoders: List[Callable]): + def selective_decoder(val: Any): + decoded = val + for decoder in all_decoders: + try: + decoded = decoder(val) + break + except (TypeError, ValueError): + continue + return decoded + + return selective_decoder + + +def get_field_name(name): + def override(_, _field_name=name): + return _field_name + + return override + + +def _val_to_string(val) -> str: + if isinstance(val, bool): + return str(val).lower() + if isinstance(val, datetime): + return str(val.isoformat().replace("+00:00", "Z")) + if 
isinstance(val, Enum): + return str(val.value) + + return str(val) + + +def _populate_from_globals( + param_name: str, value: Any, param_type: str, gbls: Any +) -> Tuple[Any, bool]: + if gbls is None: + return value, False + + global_fields = fields(gbls) + + found = False + for field in global_fields: + if field.name is not param_name: + continue + + found = True + + if value is not None: + return value, True + + global_value = getattr(gbls, field.name) + + param_metadata = field.metadata.get(param_type) + if param_metadata is None: + return value, True + + return global_value, True + + return value, found + + +def decoder_with_discriminator(field_name): + def decode_fx(obj): + kls = getattr(sys.modules["sdk.models.shared"], obj[field_name]) + return unmarshal_json(json.dumps(obj), kls) + + return decode_fx + + +def remove_suffix(input_string, suffix): + if suffix and input_string.endswith(suffix): + return input_string[: -len(suffix)] + return input_string |