author     S. Solomon Darnell    2025-03-28 21:52:21 -0500
committer  S. Solomon Darnell    2025-03-28 21:52:21 -0500
commit     4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree       ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/huggingface_hub/inference
parent     cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download   gn-ai-master.tar.gz
two version of R2R are here (HEAD, master)
Diffstat (limited to '.venv/lib/python3.12/site-packages/huggingface_hub/inference')
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/__init__.py  0
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_client.py  3516
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_common.py  422
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/__init__.py  0
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/_async_client.py  3628
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/__init__.py  187
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/audio_classification.py  43
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/audio_to_audio.py  30
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py  114
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/base.py  161
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/chat_completion.py  301
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/depth_estimation.py  28
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/document_question_answering.py  80
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/feature_extraction.py  36
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/fill_mask.py  47
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_classification.py  43
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_segmentation.py  51
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_to_image.py  54
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_to_text.py  101
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/object_detection.py  58
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/question_answering.py  74
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/sentence_similarity.py  27
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/summarization.py  41
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/table_question_answering.py  62
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text2text_generation.py  42
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_classification.py  41
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_generation.py  168
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_audio.py  100
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_image.py  50
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_speech.py  100
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_video.py  46
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/token_classification.py  51
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/translation.py  49
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/video_classification.py  45
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/visual_question_answering.py  49
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_classification.py  45
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_image_classification.py  40
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py  52
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/__init__.py  135
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/_common.py  241
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/black_forest_labs.py  66
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/cerebras.py  6
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/cohere.py  15
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/fal_ai.py  90
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/fireworks_ai.py  6
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/hf_inference.py  122
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/hyperbolic.py  43
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/nebius.py  41
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/novita.py  26
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/replicate.py  53
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/sambanova.py  6
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/together.py  59
52 files changed, 10891 insertions, 0 deletions
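
For orientation, a minimal usage sketch of the InferenceClient added by this commit, based on the docstrings in inference/_client.py below; the token, audio file, and top_k value are placeholders, not part of the diff:

    from huggingface_hub import InferenceClient

    # `api_key` is an alias for `token` (OpenAI-style); "hf-inference" is the documented default provider.
    client = InferenceClient(provider="hf-inference", api_key="hf_...")  # placeholder token

    # Any task method works the same way; audio_classification() is documented in the diff below.
    predictions = client.audio_classification("audio.flac", top_k=3)  # placeholder audio file
    for item in predictions:
        print(item.label, item.score)
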
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/__init__.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/__init__.py
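
Before the _client.py diff itself, a short sketch of the migration implied by its `post` deprecation notice (planned removal in 0.31.0): raw POST calls are replaced by task methods such as `chat_completion`. The model name and token below are placeholders taken from the docstring examples:

    from huggingface_hub import InferenceClient

    client = InferenceClient(token="hf_...")  # placeholder token

    # Deprecated: client.post(json={"inputs": "..."}, model="...", task="text-generation")
    # Preferred: call the task method directly.
    response = client.chat_completion(
        messages=[{"role": "user", "content": "What is the capital of France?"}],
        model="meta-llama/Meta-Llama-3-8B-Instruct",  # example model from the docstrings
        max_tokens=100,
    )
    print(response.choices[0].message.content)
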
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_client.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_client.py
new file mode 100644
index 00000000..988c8156
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_client.py
@@ -0,0 +1,3516 @@
+# coding=utf-8
+# Copyright 2023-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Related resources:
+#    https://huggingface.co/tasks
+#    https://huggingface.co/docs/huggingface.js/inference/README
+#    https://github.com/huggingface/huggingface.js/tree/main/packages/inference/src
+#    https://github.com/huggingface/text-generation-inference/tree/main/clients/python
+#    https://github.com/huggingface/text-generation-inference/blob/main/clients/python/text_generation/client.py
+#    https://huggingface.slack.com/archives/C03E4DQ9LAJ/p1680169099087869
+#    https://github.com/huggingface/unity-api#tasks
+#
+# Some TODO:
+# - add all tasks
+#
+# NOTE: the philosophy of this client is "let's make it as easy as possible to use it, even if less optimized". Some
+# examples of how it translates:
+# - Timeout / Server unavailable is handled by the client in a single "timeout" parameter.
+# - Files can be provided as bytes, file paths, or URLs and the client will try to "guess" the type.
+# - Images are parsed as PIL.Image for easier manipulation.
+# - Provides a "recommended model" for each task => suboptimal, but quicker for the user to get a first script running.
+# - Only the main parameters are publicly exposed. Power users can always read the docs for more options.
+import base64
+import logging
+import re
+import warnings
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Literal, Optional, Union, overload
+
+from requests import HTTPError
+
+from huggingface_hub import constants
+from huggingface_hub.errors import BadRequestError, InferenceTimeoutError
+from huggingface_hub.inference._common import (
+    TASKS_EXPECTING_IMAGES,
+    ContentT,
+    ModelStatus,
+    RequestParameters,
+    _b64_encode,
+    _b64_to_image,
+    _bytes_to_dict,
+    _bytes_to_image,
+    _bytes_to_list,
+    _get_unsupported_text_generation_kwargs,
+    _import_numpy,
+    _open_as_binary,
+    _set_unsupported_text_generation_kwargs,
+    _stream_chat_completion_response,
+    _stream_text_generation_response,
+    raise_text_generation_error,
+)
+from huggingface_hub.inference._generated.types import (
+    AudioClassificationOutputElement,
+    AudioClassificationOutputTransform,
+    AudioToAudioOutputElement,
+    AutomaticSpeechRecognitionOutput,
+    ChatCompletionInputGrammarType,
+    ChatCompletionInputStreamOptions,
+    ChatCompletionInputTool,
+    ChatCompletionInputToolChoiceClass,
+    ChatCompletionInputToolChoiceEnum,
+    ChatCompletionOutput,
+    ChatCompletionStreamOutput,
+    DocumentQuestionAnsweringOutputElement,
+    FillMaskOutputElement,
+    ImageClassificationOutputElement,
+    ImageClassificationOutputTransform,
+    ImageSegmentationOutputElement,
+    ImageSegmentationSubtask,
+    ImageToImageTargetSize,
+    ImageToTextOutput,
+    ObjectDetectionOutputElement,
+    Padding,
+    QuestionAnsweringOutputElement,
+    SummarizationOutput,
+    SummarizationTruncationStrategy,
+    TableQuestionAnsweringOutputElement,
+    TextClassificationOutputElement,
+    TextClassificationOutputTransform,
+    TextGenerationInputGrammarType,
+    TextGenerationOutput,
+    TextGenerationStreamOutput,
+    TextToSpeechEarlyStoppingEnum,
+    TokenClassificationAggregationStrategy,
+    TokenClassificationOutputElement,
+    TranslationOutput,
+    TranslationTruncationStrategy,
+    VisualQuestionAnsweringOutputElement,
+    ZeroShotClassificationOutputElement,
+    ZeroShotImageClassificationOutputElement,
+)
+from huggingface_hub.inference._providers import PROVIDER_T, HFInferenceTask, get_provider_helper
+from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
+from huggingface_hub.utils._deprecation import _deprecate_arguments, _deprecate_method
+
+
+if TYPE_CHECKING:
+    import numpy as np
+    from PIL.Image import Image
+
+logger = logging.getLogger(__name__)
+
+
+MODEL_KWARGS_NOT_USED_REGEX = re.compile(r"The following `model_kwargs` are not used by the model: \[(.*?)\]")
+
+
+class InferenceClient:
+    """
+    Initialize a new Inference Client.
+
+    [`InferenceClient`] aims to provide a unified experience to perform inference. The client can be used
+    seamlessly with the (free) Inference API, self-hosted Inference Endpoints, or third-party Inference Providers.
+
+    Args:
+        model (`str`, `optional`):
+            The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `meta-llama/Meta-Llama-3-8B-Instruct`
+            or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
+            automatically selected for the task.
+            Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2
+            arguments are mutually exclusive. If using `base_url` for chat completion, the `/chat/completions` suffix
+            path will be appended to the base URL (see the [TGI Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api)
+            documentation for details). When passing a URL as `model`, the client will not append any suffix path to it.
+        provider (`str`, *optional*):
+            Name of the provider to use for inference. Can be `"black-forest-labs"`, `"cerebras"`, `"cohere"`, `"fal-ai"`, `"fireworks-ai"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"replicate"`, `"sambanova"` or `"together"`.
+            Defaults to `"hf-inference"` (Hugging Face Serverless Inference API).
+            If model is a URL or `base_url` is passed, then `provider` is not used.
+        token (`str` or `bool`, *optional*):
+            Hugging Face token. Will default to the locally saved token if not provided.
+            Pass `token=False` if you don't want to send your token to the server.
+            Note: for better compatibility with OpenAI's client, `token` has been aliased as `api_key`. Those 2
+            arguments are mutually exclusive and have the exact same behavior.
+        timeout (`float`, `optional`):
+            The maximum number of seconds to wait for a response from the server. Loading a new model in Inference
+            API can take up to several minutes. Defaults to None, meaning it will loop until the server is available.
+        headers (`Dict[str, str]`, `optional`):
+            Additional headers to send to the server. By default only the authorization and user-agent headers are sent.
+            Values in this dictionary will override the default values.
+        cookies (`Dict[str, str]`, `optional`):
+            Additional cookies to send to the server.
+        proxies (`Any`, `optional`):
+            Proxies to use for the request.
+        base_url (`str`, `optional`):
+            Base URL to run inference. This is a duplicated argument from `model` to make [`InferenceClient`]
+            follow the same pattern as `openai.OpenAI` client. Cannot be used if `model` is set. Defaults to None.
+        api_key (`str`, `optional`):
+            Token to use for authentication. This is a duplicated argument from `token` to make [`InferenceClient`]
+            follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        model: Optional[str] = None,
+        *,
+        provider: Optional[PROVIDER_T] = None,
+        token: Optional[str] = None,
+        timeout: Optional[float] = None,
+        headers: Optional[Dict[str, str]] = None,
+        cookies: Optional[Dict[str, str]] = None,
+        proxies: Optional[Any] = None,
+        # OpenAI compatibility
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+    ) -> None:
+        if model is not None and base_url is not None:
+            raise ValueError(
+                "Received both `model` and `base_url` arguments. Please provide only one of them."
+                " `base_url` is an alias for `model` to make the API compatible with OpenAI's client."
+                " If using `base_url` for chat completion, the `/chat/completions` suffix path will be appended to the base url."
+                " When passing a URL as `model`, the client will not append any suffix path to it."
+            )
+        if token is not None and api_key is not None:
+            raise ValueError(
+                "Received both `token` and `api_key` arguments. Please provide only one of them."
+                " `api_key` is an alias for `token` to make the API compatible with OpenAI's client."
+                " It has the exact same behavior as `token`."
+            )
+
+        self.model: Optional[str] = base_url or model
+        self.token: Optional[str] = token if token is not None else api_key
+        self.headers = headers if headers is not None else {}
+
+        # Configure provider
+        self.provider = provider if provider is not None else "hf-inference"
+
+        self.cookies = cookies
+        self.timeout = timeout
+        self.proxies = proxies
+
+    def __repr__(self):
+        return f"<InferenceClient(model='{self.model if self.model else ''}', timeout={self.timeout})>"
+
+    @overload
+    def post(  # type: ignore[misc]
+        self,
+        *,
+        json: Optional[Union[str, Dict, List]] = None,
+        data: Optional[ContentT] = None,
+        model: Optional[str] = None,
+        task: Optional[str] = None,
+        stream: Literal[False] = ...,
+    ) -> bytes: ...
+
+    @overload
+    def post(  # type: ignore[misc]
+        self,
+        *,
+        json: Optional[Union[str, Dict, List]] = None,
+        data: Optional[ContentT] = None,
+        model: Optional[str] = None,
+        task: Optional[str] = None,
+        stream: Literal[True] = ...,
+    ) -> Iterable[bytes]: ...
+
+    @overload
+    def post(
+        self,
+        *,
+        json: Optional[Union[str, Dict, List]] = None,
+        data: Optional[ContentT] = None,
+        model: Optional[str] = None,
+        task: Optional[str] = None,
+        stream: bool = False,
+    ) -> Union[bytes, Iterable[bytes]]: ...
+
+    @_deprecate_method(
+        version="0.31.0",
+        message=(
+            "Making direct POST requests to the inference server is not supported anymore. "
+            "Please use task methods instead (e.g. `InferenceClient.chat_completion`). "
+            "If your use case is not supported, please open an issue in https://github.com/huggingface/huggingface_hub."
+        ),
+    )
+    def post(
+        self,
+        *,
+        json: Optional[Union[str, Dict, List]] = None,
+        data: Optional[ContentT] = None,
+        model: Optional[str] = None,
+        task: Optional[str] = None,
+        stream: bool = False,
+    ) -> Union[bytes, Iterable[bytes]]:
+        """
+        Make a POST request to the inference server.
+
+        This method is deprecated and will be removed in the future.
+        Please use task methods instead (e.g. `InferenceClient.chat_completion`).
+        """
+        if self.provider != "hf-inference":
+            raise ValueError(
+                "Cannot use `post` with another provider than `hf-inference`. "
+                "`InferenceClient.post` is deprecated and should not be used directly anymore."
+            )
+        provider_helper = HFInferenceTask(task or "unknown")
+        mapped_model = provider_helper._prepare_mapped_model(model or self.model)
+        url = provider_helper._prepare_url(self.token, mapped_model)  # type: ignore[arg-type]
+        headers = provider_helper._prepare_headers(self.headers, self.token)  # type: ignore[arg-type]
+        return self._inner_post(
+            request_parameters=RequestParameters(
+                url=url,
+                task=task or "unknown",
+                model=model or "unknown",
+                json=json,
+                data=data,
+                headers=headers,
+            ),
+            stream=stream,
+        )
+
+    @overload
+    def _inner_post(  # type: ignore[misc]
+        self, request_parameters: RequestParameters, *, stream: Literal[False] = ...
+    ) -> bytes: ...
+
+    @overload
+    def _inner_post(  # type: ignore[misc]
+        self, request_parameters: RequestParameters, *, stream: Literal[True] = ...
+    ) -> Iterable[bytes]: ...
+
+    @overload
+    def _inner_post(
+        self, request_parameters: RequestParameters, *, stream: bool = False
+    ) -> Union[bytes, Iterable[bytes]]: ...
+
+    def _inner_post(
+        self, request_parameters: RequestParameters, *, stream: bool = False
+    ) -> Union[bytes, Iterable[bytes]]:
+        """Make a request to the inference server."""
+        # TODO: this should be handled in provider helpers directly
+        if request_parameters.task in TASKS_EXPECTING_IMAGES and "Accept" not in request_parameters.headers:
+            request_parameters.headers["Accept"] = "image/png"
+
+        while True:
+            with _open_as_binary(request_parameters.data) as data_as_binary:
+                try:
+                    response = get_session().post(
+                        request_parameters.url,
+                        json=request_parameters.json,
+                        data=data_as_binary,
+                        headers=request_parameters.headers,
+                        cookies=self.cookies,
+                        timeout=self.timeout,
+                        stream=stream,
+                        proxies=self.proxies,
+                    )
+                except TimeoutError as error:
+                    # Convert any `TimeoutError` to an `InferenceTimeoutError`
+                    raise InferenceTimeoutError(f"Inference call timed out: {request_parameters.url}") from error  # type: ignore
+
+            try:
+                hf_raise_for_status(response)
+                return response.iter_lines() if stream else response.content
+            except HTTPError as error:
+                if error.response.status_code == 422 and request_parameters.task != "unknown":
+                    msg = str(error.args[0])
+                    if len(error.response.text) > 0:
+                        msg += f"\n{error.response.text}\n"
+                    error.args = (msg,) + error.args[1:]
+                raise
+
+    def audio_classification(
+        self,
+        audio: ContentT,
+        *,
+        model: Optional[str] = None,
+        top_k: Optional[int] = None,
+        function_to_apply: Optional["AudioClassificationOutputTransform"] = None,
+    ) -> List[AudioClassificationOutputElement]:
+        """
+        Perform audio classification on the provided audio content.
+
+        Args:
+            audio (Union[str, Path, bytes, BinaryIO]):
+                The audio content to classify. It can be raw audio bytes, a local audio file, or a URL pointing to an
+                audio file.
+            model (`str`, *optional*):
+                The model to use for audio classification. Can be a model ID hosted on the Hugging Face Hub
+                or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for
+                audio classification will be used.
+            top_k (`int`, *optional*):
+                When specified, limits the output to the top K most probable classes.
+            function_to_apply (`"AudioClassificationOutputTransform"`, *optional*):
+                The function to apply to the model outputs in order to retrieve the scores.
+
+        Returns:
+            `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.audio_classification("audio.flac")
+        [
+            AudioClassificationOutputElement(score=0.4976358711719513, label='hap'),
+            AudioClassificationOutputElement(score=0.3677836060523987, label='neu'),
+            ...
+        ]
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="audio-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=audio,
+            parameters={"function_to_apply": function_to_apply, "top_k": top_k},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return AudioClassificationOutputElement.parse_obj_as_list(response)
+
+    def audio_to_audio(
+        self,
+        audio: ContentT,
+        *,
+        model: Optional[str] = None,
+    ) -> List[AudioToAudioOutputElement]:
+        """
+        Performs multiple tasks related to audio-to-audio depending on the model (e.g. speech enhancement, source separation).
+
+        Args:
+            audio (Union[str, Path, bytes, BinaryIO]):
+                The audio content for the model. It can be raw audio bytes, a local audio file, or a URL pointing to an
+                audio file.
+            model (`str`, *optional*):
+                The model can be any model which takes an audio file and returns another audio file. Can be a model ID hosted on the Hugging Face Hub
+                or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for
+                audio_to_audio will be used.
+
+        Returns:
+            `List[AudioToAudioOutputElement]`: A list of [`AudioToAudioOutputElement`] items containing the audio label, content type, and audio content as a blob.
+
+        Raises:
+            `InferenceTimeoutError`:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> audio_output = client.audio_to_audio("audio.flac")
+        >>> for i, item in enumerate(audio_output):
+        ...     with open(f"output_{i}.flac", "wb") as f:
+        ...         f.write(item.blob)
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="audio-to-audio")
+        request_parameters = provider_helper.prepare_request(
+            inputs=audio,
+            parameters={},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        audio_output = AudioToAudioOutputElement.parse_obj_as_list(response)
+        for item in audio_output:
+            item.blob = base64.b64decode(item.blob)
+        return audio_output
+
+    def automatic_speech_recognition(
+        self,
+        audio: ContentT,
+        *,
+        model: Optional[str] = None,
+        extra_body: Optional[Dict] = None,
+    ) -> AutomaticSpeechRecognitionOutput:
+        """
+        Perform automatic speech recognition (ASR or audio-to-text) on the given audio content.
+
+        Args:
+            audio (Union[str, Path, bytes, BinaryIO]):
+                The content to transcribe. It can be raw audio bytes, a local audio file, or a URL to an audio file.
+            model (`str`, *optional*):
+                The model to use for ASR. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended model for ASR will be used.
+            extra_body (`Dict`, *optional*):
+                Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
+                for supported parameters.
+        Returns:
+            [`AutomaticSpeechRecognitionOutput`]: An item containing the transcribed text and optionally the timestamp chunks.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.automatic_speech_recognition("hello_world.flac").text
+        "hello world"
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="automatic-speech-recognition")
+        request_parameters = provider_helper.prepare_request(
+            inputs=audio,
+            parameters={**(extra_body or {})},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return AutomaticSpeechRecognitionOutput.parse_obj_as_instance(response)
+
+    @overload
+    def chat_completion(  # type: ignore
+        self,
+        messages: List[Dict],
+        *,
+        model: Optional[str] = None,
+        stream: Literal[False] = False,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
+        tool_prompt: Optional[str] = None,
+        tools: Optional[List[ChatCompletionInputTool]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        extra_body: Optional[Dict] = None,
+    ) -> ChatCompletionOutput: ...
+
+    @overload
+    def chat_completion(  # type: ignore
+        self,
+        messages: List[Dict],
+        *,
+        model: Optional[str] = None,
+        stream: Literal[True] = True,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
+        tool_prompt: Optional[str] = None,
+        tools: Optional[List[ChatCompletionInputTool]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        extra_body: Optional[Dict] = None,
+    ) -> Iterable[ChatCompletionStreamOutput]: ...
+
+    @overload
+    def chat_completion(
+        self,
+        messages: List[Dict],
+        *,
+        model: Optional[str] = None,
+        stream: bool = False,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
+        tool_prompt: Optional[str] = None,
+        tools: Optional[List[ChatCompletionInputTool]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        extra_body: Optional[Dict] = None,
+    ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]: ...
+
+    def chat_completion(
+        self,
+        messages: List[Dict],
+        *,
+        model: Optional[str] = None,
+        stream: bool = False,
+        # Parameters from ChatCompletionInput (handled manually)
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
+        tool_prompt: Optional[str] = None,
+        tools: Optional[List[ChatCompletionInputTool]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        extra_body: Optional[Dict] = None,
+    ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]:
+        """
+        A method for completing conversations using a specified language model.
+
+        <Tip>
+
+        The `client.chat_completion` method is aliased as `client.chat.completions.create` for compatibility with OpenAI's client.
+        Inputs and outputs are strictly the same and using either syntax will yield the same results.
+        Check out the [Inference guide](https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility)
+        for more details about OpenAI's compatibility.
+
+        </Tip>
+
+        <Tip>
+        You can pass provider-specific parameters to the model by using the `extra_body` argument.
+        </Tip>
+
+        Args:
+            messages (List of [`ChatCompletionInputMessage`]):
+                Conversation history consisting of roles and content pairs.
+            model (`str`, *optional*):
+                The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used.
+                See https://huggingface.co/tasks/text-generation for more details.
+                If `model` is a model ID, it is passed to the server as the `model` parameter. If you want to define a
+                custom URL while setting `model` in the request payload, you must set `base_url` when initializing [`InferenceClient`].
+            frequency_penalty (`float`, *optional*):
+                Penalizes new tokens based on their existing frequency
+                in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0.
+            logit_bias (`List[float]`, *optional*):
+                Adjusts the likelihood of specific tokens appearing in the generated output.
+            logprobs (`bool`, *optional*):
+                Whether to return log probabilities of the output tokens or not. If true, returns the log
+                probabilities of each output token returned in the content of message.
+            max_tokens (`int`, *optional*):
+                Maximum number of tokens allowed in the response. Defaults to 100.
+            n (`int`, *optional*):
+                The number of completions to generate for each prompt.
+            presence_penalty (`float`, *optional*):
+                Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the
+                text so far, increasing the model's likelihood to talk about new topics.
+            response_format ([`ChatCompletionInputGrammarType`], *optional*):
+                Grammar constraints. Can be either a JSONSchema or a regex.
+            seed (Optional[`int`], *optional*):
+                Seed for reproducible control flow. Defaults to None.
+            stop (`List[str]`, *optional*):
+                Up to four strings which trigger the end of the response.
+                Defaults to None.
+            stream (`bool`, *optional*):
+                Enable realtime streaming of responses. Defaults to False.
+            stream_options ([`ChatCompletionInputStreamOptions`], *optional*):
+                Options for streaming completions.
+            temperature (`float`, *optional*):
+                Controls randomness of the generations. Lower values ensure
+                less random completions. Range: [0, 2]. Defaults to 1.0.
+            top_logprobs (`int`, *optional*):
+                An integer between 0 and 5 specifying the number of most likely tokens to return at each token
+                position, each with an associated log probability. logprobs must be set to true if this parameter is
+                used.
+            top_p (`float`, *optional*):
+                Fraction of the most likely next words to sample from.
+                Must be between 0 and 1. Defaults to 1.0.
+            tool_choice ([`ChatCompletionInputToolChoiceClass`] or [`ChatCompletionInputToolChoiceEnum`], *optional*):
+                The tool to use for the completion. Defaults to "auto".
+            tool_prompt (`str`, *optional*):
+                A prompt to be appended before the tools.
+            tools (List of [`ChatCompletionInputTool`], *optional*):
+                A list of tools the model may call. Currently, only functions are supported as a tool. Use this to
+                provide a list of functions the model may generate JSON inputs for.
+            extra_body (`Dict`, *optional*):
+                Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
+                for supported parameters.
+        Returns:
+            [`ChatCompletionOutput`] or Iterable of [`ChatCompletionStreamOutput`]:
+            Generated text returned from the server:
+            - if `stream=False`, the generated text is returned as a [`ChatCompletionOutput`] (default).
+            - if `stream=True`, the generated text is returned token by token as a sequence of [`ChatCompletionStreamOutput`].
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
+        >>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
+        >>> client.chat_completion(messages, max_tokens=100)
+        ChatCompletionOutput(
+            choices=[
+                ChatCompletionOutputComplete(
+                    finish_reason='eos_token',
+                    index=0,
+                    message=ChatCompletionOutputMessage(
+                        role='assistant',
+                        content='The capital of France is Paris.',
+                        name=None,
+                        tool_calls=None
+                    ),
+                    logprobs=None
+                )
+            ],
+            created=1719907176,
+            id='',
+            model='meta-llama/Meta-Llama-3-8B-Instruct',
+            object='text_completion',
+            system_fingerprint='2.0.4-sha-f426a33',
+            usage=ChatCompletionOutputUsage(
+                completion_tokens=8,
+                prompt_tokens=17,
+                total_tokens=25
+            )
+        )
+        ```
+
+        Example using streaming:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
+        >>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
+        >>> for token in client.chat_completion(messages, max_tokens=10, stream=True):
+        ...     print(token)
+        ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+        ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+        (...)
+        ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+        ```
+
+        Example using OpenAI's syntax:
+        ```py
+        # instead of `from openai import OpenAI`
+        from huggingface_hub import InferenceClient
+
+        # instead of `client = OpenAI(...)`
+        client = InferenceClient(
+            base_url=...,
+            api_key=...,
+        )
+
+        output = client.chat.completions.create(
+            model="meta-llama/Meta-Llama-3-8B-Instruct",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Count to 10"},
+            ],
+            stream=True,
+            max_tokens=1024,
+        )
+
+        for chunk in output:
+            print(chunk.choices[0].delta.content)
+        ```
+
+        Example using a third-party provider directly with extra (provider-specific) parameters. Usage will be billed on your Together AI account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="together",  # Use Together AI provider
+        ...     api_key="<together_api_key>",  # Pass your Together API key directly
+        ... )
+        >>> client.chat_completion(
+        ...     model="meta-llama/Meta-Llama-3-8B-Instruct",
+        ...     messages=[{"role": "user", "content": "What is the capital of France?"}],
+        ...     extra_body={"safety_model": "Meta-Llama/Llama-Guard-7b"},
+        ... )
+        ```
+
+        Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="sambanova",  # Use Sambanova provider
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> client.chat_completion(
+        ...     model="meta-llama/Meta-Llama-3-8B-Instruct",
+        ...     messages=[{"role": "user", "content": "What is the capital of France?"}],
+        ... )
+        ```
+
+        Example using Image + Text as input:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+
+        # provide a remote URL
+        >>> image_url ="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        # or a base64-encoded image
+        >>> image_path = "/path/to/image.jpeg"
+        >>> with open(image_path, "rb") as f:
+        ...     base64_image = base64.b64encode(f.read()).decode("utf-8")
+        >>> image_url = f"data:image/jpeg;base64,{base64_image}"
+
+        >>> client = InferenceClient("meta-llama/Llama-3.2-11B-Vision-Instruct")
+        >>> output = client.chat.completions.create(
+        ...     messages=[
+        ...         {
+        ...             "role": "user",
+        ...             "content": [
+        ...                 {
+        ...                     "type": "image_url",
+        ...                     "image_url": {"url": image_url},
+        ...                 },
+        ...                 {
+        ...                     "type": "text",
+        ...                     "text": "Describe this image in one sentence.",
+        ...                 },
+        ...             ],
+        ...         },
+        ...     ],
+        ... )
+        >>> output
+        The image depicts the iconic Statue of Liberty situated in New York Harbor, New York, on a clear day.
+        ```
+
+        Example using tools:
+        ```py
+        >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+        >>> messages = [
+        ...     {
+        ...         "role": "system",
+        ...         "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.",
+        ...     },
+        ...     {
+        ...         "role": "user",
+        ...         "content": "What's the weather like the next 3 days in San Francisco, CA?",
+        ...     },
+        ... ]
+        >>> tools = [
+        ...     {
+        ...         "type": "function",
+        ...         "function": {
+        ...             "name": "get_current_weather",
+        ...             "description": "Get the current weather",
+        ...             "parameters": {
+        ...                 "type": "object",
+        ...                 "properties": {
+        ...                     "location": {
+        ...                         "type": "string",
+        ...                         "description": "The city and state, e.g. San Francisco, CA",
+        ...                     },
+        ...                     "format": {
+        ...                         "type": "string",
+        ...                         "enum": ["celsius", "fahrenheit"],
+        ...                         "description": "The temperature unit to use. Infer this from the users location.",
+        ...                     },
+        ...                 },
+        ...                 "required": ["location", "format"],
+        ...             },
+        ...         },
+        ...     },
+        ...     {
+        ...         "type": "function",
+        ...         "function": {
+        ...             "name": "get_n_day_weather_forecast",
+        ...             "description": "Get an N-day weather forecast",
+        ...             "parameters": {
+        ...                 "type": "object",
+        ...                 "properties": {
+        ...                     "location": {
+        ...                         "type": "string",
+        ...                         "description": "The city and state, e.g. San Francisco, CA",
+        ...                     },
+        ...                     "format": {
+        ...                         "type": "string",
+        ...                         "enum": ["celsius", "fahrenheit"],
+        ...                         "description": "The temperature unit to use. Infer this from the users location.",
+        ...                     },
+        ...                     "num_days": {
+        ...                         "type": "integer",
+        ...                         "description": "The number of days to forecast",
+        ...                     },
+        ...                 },
+        ...                 "required": ["location", "format", "num_days"],
+        ...             },
+        ...         },
+        ...     },
+        ... ]
+
+        >>> response = client.chat_completion(
+        ...     model="meta-llama/Meta-Llama-3-70B-Instruct",
+        ...     messages=messages,
+        ...     tools=tools,
+        ...     tool_choice="auto",
+        ...     max_tokens=500,
+        ... )
+        >>> response.choices[0].message.tool_calls[0].function
+        ChatCompletionOutputFunctionDefinition(
+            arguments={
+                'location': 'San Francisco, CA',
+                'format': 'fahrenheit',
+                'num_days': 3
+            },
+            name='get_n_day_weather_forecast',
+            description=None
+        )
+        ```
+
+        Example using response_format:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": "I saw a puppy, a cat, and a raccoon during my bike ride in the park. What did I see and when?",
+        ...     },
+        ... ]
+        >>> response_format = {
+        ...     "type": "json",
+        ...     "value": {
+        ...         "properties": {
+        ...             "location": {"type": "string"},
+        ...             "activity": {"type": "string"},
+        ...             "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5},
+        ...             "animals": {"type": "array", "items": {"type": "string"}},
+        ...         },
+        ...         "required": ["location", "activity", "animals_seen", "animals"],
+        ...     },
+        ... }
+        >>> response = client.chat_completion(
+        ...     messages=messages,
+        ...     response_format=response_format,
+        ...     max_tokens=500,
+        ... )
+        >>> response.choices[0].message.content
+        '{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}'
+        ```
+        """
+        # Get the provider helper
+        provider_helper = get_provider_helper(self.provider, task="conversational")
+
+        # Since `chat_completion(..., model=xxx)` is also a payload parameter for the server, we need to handle 'model' differently.
+        # `self.model` takes precedence over 'model' argument for building URL.
+        # `model` takes precedence for payload value.
+        model_id_or_url = self.model or model
+        payload_model = model or self.model
+
+        # Prepare the payload
+        parameters = {
+            "model": payload_model,
+            "frequency_penalty": frequency_penalty,
+            "logit_bias": logit_bias,
+            "logprobs": logprobs,
+            "max_tokens": max_tokens,
+            "n": n,
+            "presence_penalty": presence_penalty,
+            "response_format": response_format,
+            "seed": seed,
+            "stop": stop,
+            "temperature": temperature,
+            "tool_choice": tool_choice,
+            "tool_prompt": tool_prompt,
+            "tools": tools,
+            "top_logprobs": top_logprobs,
+            "top_p": top_p,
+            "stream": stream,
+            "stream_options": stream_options,
+            **(extra_body or {}),
+        }
+        request_parameters = provider_helper.prepare_request(
+            inputs=messages,
+            parameters=parameters,
+            headers=self.headers,
+            model=model_id_or_url,
+            api_key=self.token,
+        )
+        data = self._inner_post(request_parameters, stream=stream)
+
+        if stream:
+            return _stream_chat_completion_response(data)  # type: ignore[arg-type]
+
+        return ChatCompletionOutput.parse_obj_as_instance(data)  # type: ignore[arg-type]
+
+    def document_question_answering(
+        self,
+        image: ContentT,
+        question: str,
+        *,
+        model: Optional[str] = None,
+        doc_stride: Optional[int] = None,
+        handle_impossible_answer: Optional[bool] = None,
+        lang: Optional[str] = None,
+        max_answer_len: Optional[int] = None,
+        max_question_len: Optional[int] = None,
+        max_seq_len: Optional[int] = None,
+        top_k: Optional[int] = None,
+        word_boxes: Optional[List[Union[List[float], str]]] = None,
+    ) -> List[DocumentQuestionAnsweringOutputElement]:
+        """
+        Answer questions on document images.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The input image for the context. It can be raw bytes, an image file, or a URL to an online image.
+            question (`str`):
+                Question to be answered.
+            model (`str`, *optional*):
+                The model to use for the document question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended document question answering model will be used.
+                Defaults to None.
+            doc_stride (`int`, *optional*):
+                If the words in the document are too long to fit with the question for the model, it will be split in
+                several chunks with some overlap. This argument controls the size of that overlap.
+            handle_impossible_answer (`bool`, *optional*):
+                Whether to accept impossible as an answer.
+            lang (`str`, *optional*):
+                Language to use while running OCR. Defaults to English.
+            max_answer_len (`int`, *optional*):
+                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+            max_question_len (`int`, *optional*):
+                The maximum length of the question after tokenization. It will be truncated if needed.
+            max_seq_len (`int`, *optional*):
+                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+                model. The context will be split in several chunks (using doc_stride as overlap) if needed.
+            top_k (`int`, *optional*):
+                The number of answers to return (chosen in order of likelihood). Can return fewer than top_k
+                answers if there are not enough options available within the context.
+            word_boxes (`List[Union[List[float], str]]`, *optional*):
+                A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR
+                step and use the provided bounding boxes instead.
+        Returns:
+            `List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
+        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16)]
+        ```
+        """
+        inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
+        provider_helper = get_provider_helper(self.provider, task="document-question-answering")
+        request_parameters = provider_helper.prepare_request(
+            inputs=inputs,
+            parameters={
+                "doc_stride": doc_stride,
+                "handle_impossible_answer": handle_impossible_answer,
+                "lang": lang,
+                "max_answer_len": max_answer_len,
+                "max_question_len": max_question_len,
+                "max_seq_len": max_seq_len,
+                "top_k": top_k,
+                "word_boxes": word_boxes,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)
+
+    def feature_extraction(
+        self,
+        text: str,
+        *,
+        normalize: Optional[bool] = None,
+        prompt_name: Optional[str] = None,
+        truncate: Optional[bool] = None,
+        truncation_direction: Optional[Literal["Left", "Right"]] = None,
+        model: Optional[str] = None,
+    ) -> "np.ndarray":
+        """
+        Generate embeddings for a given text.
+
+        Args:
+            text (`str`):
+                The text to embed.
+            model (`str`, *optional*):
+                The model to use for the feature extraction task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended feature extraction model will be used.
+                Defaults to None.
+            normalize (`bool`, *optional*):
+                Whether to normalize the embeddings or not.
+                Only available on servers powered by Text-Embedding-Inference.
+            prompt_name (`str`, *optional*):
+                The name of the prompt that should be used for encoding. If not set, no prompt will be applied.
+                Must be a key in the `Sentence Transformers` configuration `prompts` dictionary.
+                For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ",...},
+                then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?"
+                because the prompt text will be prepended before any text to encode.
+            truncate (`bool`, *optional*):
+                Whether to truncate the input text if it exceeds the model's maximum length.
+                Only available on servers powered by Text-Embedding-Inference.
+            truncation_direction (`Literal["Left", "Right"]`, *optional*):
+                Which side of the input should be truncated when `truncate=True` is passed.
+
+        Returns:
+            `np.ndarray`: The embedding representing the input text as a float32 numpy array.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.feature_extraction("Hi, who are you?")
+        array([[ 2.424802  ,  2.93384   ,  1.1750331 , ...,  1.240499, -0.13776633, -0.7889173 ],
+        [-0.42943227, -0.6364878 , -1.693462  , ...,  0.41978157, -2.4336355 ,  0.6162071 ],
+        ...,
+        [ 0.28552425, -0.928395  , -1.2077185 , ...,  0.76810825, -2.1069427 ,  0.6236161 ]], dtype=float32)
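+
+        # Illustrative: encode with a named prompt and normalize the output. This assumes the deployed
+        # model's `Sentence Transformers` configuration defines a "query" prompt and that the server
+        # runs Text-Embedding-Inference (required for `normalize`).
+        >>> client.feature_extraction("What is the capital of France?", prompt_name="query", normalize=True)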
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="feature-extraction")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "normalize": normalize,
+                "prompt_name": prompt_name,
+                "truncate": truncate,
+                "truncation_direction": truncation_direction,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
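+        # numpy is imported lazily so that it remains an optional dependency of the package.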
+        np = _import_numpy()
+        return np.array(_bytes_to_dict(response), dtype="float32")
+
+    def fill_mask(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        targets: Optional[List[str]] = None,
+        top_k: Optional[int] = None,
+    ) -> List[FillMaskOutputElement]:
+        """
+        Fill in a hole with a missing word (token to be precise).
+
+        Args:
+            text (`str`):
+                The text to complete; it must contain the mask token (check the model card for its exact name, e.g. `[MASK]` or `<mask>`).
+            model (`str`, *optional*):
+                The model to use for the fill mask task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended fill mask model will be used.
+            targets (`List[str]`, *optional*):
+                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
+                vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first
+                resulting token will be used (with a warning, and that might be slower).
+            top_k (`int`, *optional*):
+                When passed, overrides the number of predictions to return.
+        Returns:
+            `List[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
+            probability, token reference, and completed text.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.fill_mask("The goal of life is <mask>.")
+        [
+            FillMaskOutputElement(score=0.06897063553333282, token=11098, token_str=' happiness', sequence='The goal of life is happiness.'),
+            FillMaskOutputElement(score=0.06554922461509705, token=45075, token_str=' immortality', sequence='The goal of life is immortality.')
+        ]
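+
+        # Illustrative: keep only the single most likely completion (the mask token depends on the model).
+        >>> client.fill_mask("The goal of life is <mask>.", top_k=1)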
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="fill-mask")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={"targets": targets, "top_k": top_k},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return FillMaskOutputElement.parse_obj_as_list(response)
+
+    def image_classification(
+        self,
+        image: ContentT,
+        *,
+        model: Optional[str] = None,
+        function_to_apply: Optional["ImageClassificationOutputTransform"] = None,
+        top_k: Optional[int] = None,
+    ) -> List[ImageClassificationOutputElement]:
+        """
+        Perform image classification on the given image using the specified model.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The image to classify. It can be raw bytes, an image file, or a URL to an online image.
+            model (`str`, *optional*):
+                The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
+                deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
+            function_to_apply (`"ImageClassificationOutputTransform"`, *optional*):
+                The function to apply to the model outputs in order to retrieve the scores.
+            top_k (`int`, *optional*):
+                When specified, limits the output to the top K most probable classes.
+        Returns:
+            `List[ImageClassificationOutputElement]`: a list of [`ImageClassificationOutputElement`] items containing the predicted label and associated probability.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.image_classification("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg")
+        [ImageClassificationOutputElement(label='Blenheim spaniel', score=0.9779096841812134), ...]
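+
+        # Illustrative: limit the output to the 3 most probable classes.
+        >>> client.image_classification("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg", top_k=3)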
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="image-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={"function_to_apply": function_to_apply, "top_k": top_k},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return ImageClassificationOutputElement.parse_obj_as_list(response)
+
+    def image_segmentation(
+        self,
+        image: ContentT,
+        *,
+        model: Optional[str] = None,
+        mask_threshold: Optional[float] = None,
+        overlap_mask_area_threshold: Optional[float] = None,
+        subtask: Optional["ImageSegmentationSubtask"] = None,
+        threshold: Optional[float] = None,
+    ) -> List[ImageSegmentationOutputElement]:
+        """
+        Perform image segmentation on the given image using the specified model.
+
+        <Tip warning={true}>
+
+        You must have `PIL` installed if you want to work with images (`pip install Pillow`).
+
+        </Tip>
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The image to segment. It can be raw bytes, an image file, or a URL to an online image.
+            model (`str`, *optional*):
+                The model to use for image segmentation. Can be a model ID hosted on the Hugging Face Hub or a URL to a
+                deployed Inference Endpoint. If not provided, the default recommended model for image segmentation will be used.
+            mask_threshold (`float`, *optional*):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*):
+                Mask overlap threshold to eliminate small, disconnected segments.
+            subtask (`"ImageSegmentationSubtask"`, *optional*):
+                Segmentation task to be performed, depending on model capabilities.
+            threshold (`float`, *optional*):
+                Probability threshold to filter out predicted masks.
+        Returns:
+            `List[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.image_segmentation("cat.jpg")
+        [ImageSegmentationOutputElement(score=0.989008, label='LABEL_184', mask=<PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>), ...]
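+
+        # Illustrative: each returned mask is a `PIL.Image` and can be saved directly.
+        >>> masks = client.image_segmentation("cat.jpg")
+        >>> masks[0].mask.save("mask_0.png")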
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="audio-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={
+                "mask_threshold": mask_threshold,
+                "overlap_mask_area_threshold": overlap_mask_area_threshold,
+                "subtask": subtask,
+                "threshold": threshold,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        output = ImageSegmentationOutputElement.parse_obj_as_list(response)
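+        # Each mask is returned base64-encoded by the server; decode it into a `PIL.Image` for convenience.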
+        for item in output:
+            item.mask = _b64_to_image(item.mask)  # type: ignore [assignment]
+        return output
+
+    def image_to_image(
+        self,
+        image: ContentT,
+        prompt: Optional[str] = None,
+        *,
+        negative_prompt: Optional[str] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        model: Optional[str] = None,
+        target_size: Optional[ImageToImageTargetSize] = None,
+        **kwargs,
+    ) -> "Image":
+        """
+        Perform image-to-image translation using a specified model.
+
+        <Tip warning={true}>
+
+        You must have `PIL` installed if you want to work with images (`pip install Pillow`).
+
+        </Tip>
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The input image for translation. It can be raw bytes, an image file, or a URL to an online image.
+            prompt (`str`, *optional*):
+                The text prompt to guide the image generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
+            num_inference_steps (`int`, *optional*):
+                For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
+                quality image at the expense of slower inference.
+            guidance_scale (`float`, *optional*):
+                For diffusion models. A higher guidance scale value encourages the model to generate images closely
+                linked to the text prompt at the expense of lower image quality.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+            target_size (`ImageToImageTargetSize`, *optional*):
+                The size in pixel of the output image.
+
+        Returns:
+            `Image`: The translated image.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> image = client.image_to_image("cat.jpg", prompt="turn the cat into a tiger")
+        >>> image.save("tiger.jpg")
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="image-to-image")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={
+                "prompt": prompt,
+                "negative_prompt": negative_prompt,
+                "target_size": target_size,
+                "num_inference_steps": num_inference_steps,
+                "guidance_scale": guidance_scale,
+                **kwargs,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return _bytes_to_image(response)
+
+    def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
+        """
+        Takes an input image and returns text.
+
+        Models can have very different outputs depending on your use case (image captioning, optical character recognition
+        (OCR), Pix2Struct, etc.). Please have a look at the model card to learn more about a model's specificities.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The input image to caption. It can be raw bytes, an image file, or a URL to an online image.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+        Returns:
+            [`ImageToTextOutput`]: The generated text.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.image_to_text("cat.jpg")
+        'a cat standing in a grassy field '
+        >>> client.image_to_text("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg")
+        'a dog laying on the grass next to a flower pot '
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="image-to-text")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
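+        # Some providers wrap the result in a single-element list; unwrap it so a single output is always returned.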
+        output = ImageToTextOutput.parse_obj(response)
+        return output[0] if isinstance(output, list) else output
+
+    def object_detection(
+        self, image: ContentT, *, model: Optional[str] = None, threshold: Optional[float] = None
+    ) -> List[ObjectDetectionOutputElement]:
+        """
+        Perform object detection on the given image using the specified model.
+
+        <Tip warning={true}>
+
+        You must have `PIL` installed if you want to work with images (`pip install Pillow`).
+
+        </Tip>
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The image to detect objects on. It can be raw bytes, an image file, or a URL to an online image.
+            model (`str`, *optional*):
+                The model to use for object detection. Can be a model ID hosted on the Hugging Face Hub or a URL to a
+                deployed Inference Endpoint. If not provided, the default recommended model for object detection (DETR) will be used.
+            threshold (`float`, *optional*):
+                The probability necessary to make a prediction.
+        Returns:
+            `List[ObjectDetectionOutputElement]`: A list of [`ObjectDetectionOutputElement`] items containing the bounding boxes and associated attributes.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+            `ValueError`:
+                If the request output is not a List.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.object_detection("people.jpg")
+        [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...]
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="object-detection")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={"threshold": threshold},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return ObjectDetectionOutputElement.parse_obj_as_list(response)
+
+    def question_answering(
+        self,
+        question: str,
+        context: str,
+        *,
+        model: Optional[str] = None,
+        align_to_words: Optional[bool] = None,
+        doc_stride: Optional[int] = None,
+        handle_impossible_answer: Optional[bool] = None,
+        max_answer_len: Optional[int] = None,
+        max_question_len: Optional[int] = None,
+        max_seq_len: Optional[int] = None,
+        top_k: Optional[int] = None,
+    ) -> Union[QuestionAnsweringOutputElement, List[QuestionAnsweringOutputElement]]:
+        """
+        Retrieve the answer to a question from a given text.
+
+        Args:
+            question (`str`):
+                Question to be answered.
+            context (`str`):
+                The context of the question.
+            model (`str`, *optional*):
+                The model to use for the question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint.
+            align_to_words (`bool`, *optional*):
+                Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt
+                on non-space-separated languages (like Japanese or Chinese)
+            doc_stride (`int`, *optional*):
+                If the context is too long to fit with the question for the model, it will be split in several chunks
+                with some overlap. This argument controls the size of that overlap.
+            handle_impossible_answer (`bool`, *optional*):
+                Whether to accept impossible as an answer.
+            max_answer_len (`int`, *optional*):
+                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+            max_question_len (`int`, *optional*):
+                The maximum length of the question after tokenization. It will be truncated if needed.
+            max_seq_len (`int`, *optional*):
+                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+                model. The context will be split in several chunks (using `doc_stride` as overlap) if needed.
+            top_k (`int`, *optional*):
+                The number of answers to return (will be chosen by order of likelihood). Note that fewer than
+                top_k answers may be returned if there are not enough options available within the context.
+
+        Returns:
+            Union[`QuestionAnsweringOutputElement`, List[`QuestionAnsweringOutputElement`]]:
+                When top_k is 1 or not provided, it returns a single `QuestionAnsweringOutputElement`.
+                When top_k is greater than 1, it returns a list of `QuestionAnsweringOutputElement`.
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.question_answering(question="What's my name?", context="My name is Clara and I live in Berkeley.")
+        QuestionAnsweringOutputElement(answer='Clara', end=16, score=0.9326565265655518, start=11)
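+
+        # Illustrative: with top_k > 1, a list of answers is returned instead of a single element.
+        >>> client.question_answering(question="What's my name?", context="My name is Clara and I live in Berkeley.", top_k=2)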
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="question-answering")
+        request_parameters = provider_helper.prepare_request(
+            inputs=None,
+            parameters={
+                "align_to_words": align_to_words,
+                "doc_stride": doc_stride,
+                "handle_impossible_answer": handle_impossible_answer,
+                "max_answer_len": max_answer_len,
+                "max_question_len": max_question_len,
+                "max_seq_len": max_seq_len,
+                "top_k": top_k,
+            },
+            extra_payload={"question": question, "context": context},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        # Parse the response as a single `QuestionAnsweringOutputElement` when top_k is 1 or not provided, or a list of `QuestionAnsweringOutputElement` to ensure backward compatibility.
+        output = QuestionAnsweringOutputElement.parse_obj(response)
+        return output
+
+    def sentence_similarity(
+        self, sentence: str, other_sentences: List[str], *, model: Optional[str] = None
+    ) -> List[float]:
+        """
+        Compute the semantic similarity between a sentence and a list of other sentences by comparing their embeddings.
+
+        Args:
+            sentence (`str`):
+                The main sentence to compare to others.
+            other_sentences (`List[str]`):
+                The list of sentences to compare to.
+            model (`str`, *optional*):
+                The model to use for the sentence similarity task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended sentence similarity model will be used.
+                Defaults to None.
+
+        Returns:
+            `List[float]`: The similarity scores computed between the source sentence and each of the other sentences.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.sentence_similarity(
+        ...     "Machine learning is so easy.",
+        ...     other_sentences=[
+        ...         "Deep learning is so straightforward.",
+        ...         "This is so difficult, like rocket science.",
+        ...         "I can't believe how much I struggled with this.",
+        ...     ],
+        ... )
+        [0.7785726189613342, 0.45876261591911316, 0.2906220555305481]
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="sentence-similarity")
+        request_parameters = provider_helper.prepare_request(
+            inputs=None,
+            parameters={},
+            extra_payload={"source_sentence": sentence, "sentences": other_sentences},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return _bytes_to_list(response)
+
+    def summarization(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        generate_parameters: Optional[Dict[str, Any]] = None,
+        truncation: Optional["SummarizationTruncationStrategy"] = None,
+    ) -> SummarizationOutput:
+        """
+        Generate a summary of a given text using a specified model.
+
+        Args:
+            text (`str`):
+                The input text to summarize.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended model for summarization will be used.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether to clean up the potential extra spaces in the text output.
+            generate_parameters (`Dict[str, Any]`, *optional*):
+                Additional parametrization of the text generation algorithm.
+            truncation (`"SummarizationTruncationStrategy"`, *optional*):
+                The truncation strategy to use.
+        Returns:
+            [`SummarizationOutput`]: The generated summary text.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.summarization("The Eiffel tower...")
+        SummarizationOutput(generated_text="The Eiffel tower is one of the most famous landmarks in the world....")
+        ```
+        """
+        parameters = {
+            "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
+            "generate_parameters": generate_parameters,
+            "truncation": truncation,
+        }
+        provider_helper = get_provider_helper(self.provider, task="summarization")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters=parameters,
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return SummarizationOutput.parse_obj_as_list(response)[0]
+
+    def table_question_answering(
+        self,
+        table: Dict[str, Any],
+        query: str,
+        *,
+        model: Optional[str] = None,
+        padding: Optional["Padding"] = None,
+        sequential: Optional[bool] = None,
+        truncation: Optional[bool] = None,
+    ) -> TableQuestionAnsweringOutputElement:
+        """
+        Retrieve the answer to a question from information given in a table.
+
+        Args:
+            table (`Dict[str, Any]`):
+                A table of data represented as a dict of lists where entries are headers and the lists are all the
+                values, all lists must have the same size.
+            query (`str`):
+                The query in plain text that you want to ask the table.
+            model (`str`, *optional*):
+                The model to use for the table-question-answering task. Can be a model ID hosted on the Hugging Face
+                Hub or a URL to a deployed Inference Endpoint.
+            padding (`"Padding"`, *optional*):
+                Activates and controls padding.
+            sequential (`bool`, *optional*):
+                Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
+                inference to be done sequentially to extract relations within sequences, given their conversational
+                nature.
+            truncation (`bool`, *optional*):
+                Activates and controls truncation.
+
+        Returns:
+            [`TableQuestionAnsweringOutputElement`]: a table question answering output containing the answer, coordinates, cells and the aggregator used.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> query = "How many stars does the transformers repository have?"
+        >>> table = {"Repository": ["Transformers", "Datasets", "Tokenizers"], "Stars": ["36542", "4512", "3934"]}
+        >>> client.table_question_answering(table, query, model="google/tapas-base-finetuned-wtq")
+        TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE')
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="table-question-answering")
+        request_parameters = provider_helper.prepare_request(
+            inputs=None,
+            parameters={"model": model, "padding": padding, "sequential": sequential, "truncation": truncation},
+            extra_payload={"query": query, "table": table},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return TableQuestionAnsweringOutputElement.parse_obj_as_instance(response)
+
+    def tabular_classification(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[str]:
+        """
+        Classify a target category (a group) based on a set of attributes.
+
+        Args:
+            table (`Dict[str, Any]`):
+                Set of attributes to classify.
+            model (`str`, *optional*):
+                The model to use for the tabular classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended tabular classification model will be used.
+                Defaults to None.
+
+        Returns:
+            `List[str]`: a list of labels, one per row in the initial table.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> table = {
+        ...     "fixed_acidity": ["7.4", "7.8", "10.3"],
+        ...     "volatile_acidity": ["0.7", "0.88", "0.32"],
+        ...     "citric_acid": ["0", "0", "0.45"],
+        ...     "residual_sugar": ["1.9", "2.6", "6.4"],
+        ...     "chlorides": ["0.076", "0.098", "0.073"],
+        ...     "free_sulfur_dioxide": ["11", "25", "5"],
+        ...     "total_sulfur_dioxide": ["34", "67", "13"],
+        ...     "density": ["0.9978", "0.9968", "0.9976"],
+        ...     "pH": ["3.51", "3.2", "3.23"],
+        ...     "sulphates": ["0.56", "0.68", "0.82"],
+        ...     "alcohol": ["9.4", "9.8", "12.6"],
+        ... }
+        >>> client.tabular_classification(table=table, model="julien-c/wine-quality")
+        ["5", "5", "5"]
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="tabular-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=None,
+            extra_payload={"table": table},
+            parameters={},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return _bytes_to_list(response)
+
+    def tabular_regression(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[float]:
+        """
+        Predict a numerical target value given a set of attributes/features in a table.
+
+        Args:
+            table (`Dict[str, Any]`):
+                Set of attributes stored in a table. The attributes used to predict the target can be both numerical and categorical.
+            model (`str`, *optional*):
+                The model to use for the tabular regression task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended tabular regression model will be used.
+                Defaults to None.
+
+        Returns:
+            `List[float]`: a list of predicted numerical target values.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> table = {
+        ...     "Height": ["11.52", "12.48", "12.3778"],
+        ...     "Length1": ["23.2", "24", "23.9"],
+        ...     "Length2": ["25.4", "26.3", "26.5"],
+        ...     "Length3": ["30", "31.2", "31.1"],
+        ...     "Species": ["Bream", "Bream", "Bream"],
+        ...     "Width": ["4.02", "4.3056", "4.6961"],
+        ... }
+        >>> client.tabular_regression(table, model="scikit-learn/Fish-Weight")
+        [110, 120, 130]
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="tabular-regression")
+        request_parameters = provider_helper.prepare_request(
+            inputs=None,
+            parameters={},
+            extra_payload={"table": table},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return _bytes_to_list(response)
+
+    def text_classification(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        top_k: Optional[int] = None,
+        function_to_apply: Optional["TextClassificationOutputTransform"] = None,
+    ) -> List[TextClassificationOutputElement]:
+        """
+        Perform text classification (e.g. sentiment-analysis) on the given text.
+
+        Args:
+            text (`str`):
+                A string to be classified.
+            model (`str`, *optional*):
+                The model to use for the text classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended text classification model will be used.
+                Defaults to None.
+            top_k (`int`, *optional*):
+                When specified, limits the output to the top K most probable classes.
+            function_to_apply (`"TextClassificationOutputTransform"`, *optional*):
+                The function to apply to the model outputs in order to retrieve the scores.
+
+        Returns:
+            `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.text_classification("I like you")
+        [
+            TextClassificationOutputElement(label='POSITIVE', score=0.9998695850372314),
+            TextClassificationOutputElement(label='NEGATIVE', score=0.0001304351753788069),
+        ]
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="text-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "function_to_apply": function_to_apply,
+                "top_k": top_k,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
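+        # The raw response is a nested list of predictions; keep the inner list of per-label elements.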
+        return TextClassificationOutputElement.parse_obj_as_list(response)[0]  # type: ignore [return-value]
+
+    @overload
+    def text_generation(  # type: ignore
+        self,
+        prompt: str,
+        *,
+        details: Literal[False] = ...,
+        stream: Literal[False] = ...,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> str: ...
+
+    @overload
+    def text_generation(  # type: ignore
+        self,
+        prompt: str,
+        *,
+        details: Literal[True] = ...,
+        stream: Literal[False] = ...,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> TextGenerationOutput: ...
+
+    @overload
+    def text_generation(  # type: ignore
+        self,
+        prompt: str,
+        *,
+        details: Literal[False] = ...,
+        stream: Literal[True] = ...,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> Iterable[str]: ...
+
+    @overload
+    def text_generation(  # type: ignore
+        self,
+        prompt: str,
+        *,
+        details: Literal[True] = ...,
+        stream: Literal[True] = ...,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> Iterable[TextGenerationStreamOutput]: ...
+
+    @overload
+    def text_generation(
+        self,
+        prompt: str,
+        *,
+        details: Literal[True] = ...,
+        stream: bool = ...,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> Union[TextGenerationOutput, Iterable[TextGenerationStreamOutput]]: ...
+
+    def text_generation(
+        self,
+        prompt: str,
+        *,
+        details: bool = False,
+        stream: bool = False,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> Union[str, TextGenerationOutput, Iterable[str], Iterable[TextGenerationStreamOutput]]:
+        """
+        Given a prompt, generate the following text.
+
+        <Tip>
+
+        If you want to generate a response from chat messages, you should use the [`InferenceClient.chat_completion`] method.
+        It accepts a list of messages instead of a single text prompt and handles the chat templating for you.
+
+        </Tip>
+
+        Args:
+            prompt (`str`):
+                Input text.
+            details (`bool`, *optional*):
+                By default, text_generation returns a string. Pass `details=True` if you want a detailed output (tokens,
+                probabilities, seed, finish reason, etc.). Only available for models running with the
+                `text-generation-inference` backend.
+            stream (`bool`, *optional*):
+                By default, text_generation returns the full generated text. Pass `stream=True` if you want a stream of
+                tokens to be returned. Only available for models running with the `text-generation-inference`
+                backend.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+            adapter_id (`str`, *optional*):
+                Lora adapter id.
+            best_of (`int`, *optional*):
+                Generate best_of sequences and return the one with the highest token logprobs.
+            decoder_input_details (`bool`, *optional*):
+                Return the decoder input token logprobs and ids. You must set `details=True` as well for it to be taken
+                into account. Defaults to `False`.
+            do_sample (`bool`, *optional*):
+                Activate logits sampling
+            frequency_penalty (`float`, *optional*):
+                Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in
+                the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+            grammar ([`TextGenerationInputGrammarType`], *optional*):
+                Grammar constraints. Can be either a JSONSchema or a regex.
+            max_new_tokens (`int`, *optional*):
+                Maximum number of generated tokens. Defaults to 100.
+            repetition_penalty (`float`, *optional*):
+                The parameter for repetition penalty. 1.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            return_full_text (`bool`, *optional*):
+                Whether to prepend the prompt to the generated text
+            seed (`int`, *optional*):
+                Random sampling seed
+            stop (`List[str]`, *optional*):
+                Stop generating tokens if a member of `stop` is generated.
+            stop_sequences (`List[str]`, *optional*):
+                Deprecated argument. Use `stop` instead.
+            temperature (`float`, *optional*):
+                The value used to module the logits distribution.
+            top_n_tokens (`int`, *optional*):
+                Return information about the `top_n_tokens` most likely tokens at each generation step, instead of
+                just the sampled token.
+            top_k (`int`, *optional*):
+                The number of highest probability vocabulary tokens to keep for top-k-filtering.
+            top_p (`float`, *optional*):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation.
+            truncate (`int`, *optional*):
+                Truncate input tokens to the given size.
+            typical_p (`float`, *optional*):
+                Typical Decoding mass.
+                See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.
+            watermark (`bool`, *optional*):
+                Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226).
+
+        Returns:
+            `Union[str, TextGenerationOutput, Iterable[str], Iterable[TextGenerationStreamOutput]]`:
+            Generated text returned from the server:
+            - if `stream=False` and `details=False`, the generated text is returned as a `str` (default)
+            - if `stream=True` and `details=False`, the generated text is returned token by token as an `Iterable[str]`
+            - if `stream=False` and `details=True`, the generated text is returned with more details as a [`~huggingface_hub.TextGenerationOutput`]
+            - if `details=True` and `stream=True`, the generated text is returned token by token as an iterable of [`~huggingface_hub.TextGenerationStreamOutput`]
+
+        Raises:
+            `ValidationError`:
+                If input values are not valid. No HTTP call is made to the server.
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+
+        # Case 1: generate text
+        >>> client.text_generation("The huggingface_hub library is ", max_new_tokens=12)
+        '100% open source and built to be easy to use.'
+
+        # Case 2: iterate over the generated tokens. Useful for large generation.
+        >>> for token in client.text_generation("The huggingface_hub library is ", max_new_tokens=12, stream=True):
+        ...     print(token)
+        100
+        %
+        open
+        source
+        and
+        built
+        to
+        be
+        easy
+        to
+        use
+        .
+
+        # Case 3: get more details about the generation process.
+        >>> client.text_generation("The huggingface_hub library is ", max_new_tokens=12, details=True)
+        TextGenerationOutput(
+            generated_text='100% open source and built to be easy to use.',
+            details=TextGenerationDetails(
+                finish_reason='length',
+                generated_tokens=12,
+                seed=None,
+                prefill=[
+                    TextGenerationPrefillOutputToken(id=487, text='The', logprob=None),
+                    TextGenerationPrefillOutputToken(id=53789, text=' hugging', logprob=-13.171875),
+                    (...)
+                    TextGenerationPrefillOutputToken(id=204, text=' ', logprob=-7.0390625)
+                ],
+                tokens=[
+                    TokenElement(id=1425, text='100', logprob=-1.0175781, special=False),
+                    TokenElement(id=16, text='%', logprob=-0.0463562, special=False),
+                    (...)
+                    TokenElement(id=25, text='.', logprob=-0.5703125, special=False)
+                ],
+                best_of_sequences=None
+            )
+        )
+
+        # Case 4: iterate over the generated tokens with more details.
+        # Last object is more complete, containing the full generated text and the finish reason.
+        >>> for details in client.text_generation("The huggingface_hub library is ", max_new_tokens=12, details=True, stream=True):
+        ...     print(details)
+        ...
+        TextGenerationStreamOutput(token=TokenElement(id=1425, text='100', logprob=-1.0175781, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=16, text='%', logprob=-0.0463562, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=1314, text=' open', logprob=-1.3359375, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=3178, text=' source', logprob=-0.28100586, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=273, text=' and', logprob=-0.5961914, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=3426, text=' built', logprob=-1.9423828, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=271, text=' to', logprob=-1.4121094, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=314, text=' be', logprob=-1.5224609, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=1833, text=' easy', logprob=-2.1132812, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=271, text=' to', logprob=-0.08520508, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=745, text=' use', logprob=-0.39453125, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(
+            id=25,
+            text='.',
+            logprob=-0.5703125,
+            special=False),
+            generated_text='100% open source and built to be easy to use.',
+            details=TextGenerationStreamOutputStreamDetails(finish_reason='length', generated_tokens=12, seed=None)
+        )
+
+        # Case 5: generate constrained output using grammar
+        >>> response = client.text_generation(
+        ...     prompt="I saw a puppy a cat and a raccoon during my bike ride in the park",
+        ...     model="HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1",
+        ...     max_new_tokens=100,
+        ...     repetition_penalty=1.3,
+        ...     grammar={
+        ...         "type": "json",
+        ...         "value": {
+        ...             "properties": {
+        ...                 "location": {"type": "string"},
+        ...                 "activity": {"type": "string"},
+        ...                 "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5},
+        ...                 "animals": {"type": "array", "items": {"type": "string"}},
+        ...             },
+        ...             "required": ["location", "activity", "animals_seen", "animals"],
+        ...         },
+        ...     },
+        ... )
+        >>> json.loads(response)
+        {
+            "activity": "bike riding",
+            "animals": ["puppy", "cat", "raccoon"],
+            "animals_seen": 3,
+            "location": "park"
+        }
+        ```
+        """
+        if decoder_input_details and not details:
+            warnings.warn(
+                "`decoder_input_details=True` has been passed to the server but `details=False` is set meaning that"
+                " the output from the server will be truncated."
+            )
+            decoder_input_details = False
+
+        if stop_sequences is not None:
+            warnings.warn(
+                "`stop_sequences` is a deprecated argument for `text_generation` task"
+                " and will be removed in version '0.28.0'. Use `stop` instead.",
+                FutureWarning,
+            )
+        if stop is None:
+            stop = stop_sequences  # use deprecated arg if provided
+
+        # Build payload
+        parameters = {
+            "adapter_id": adapter_id,
+            "best_of": best_of,
+            "decoder_input_details": decoder_input_details,
+            "details": details,
+            "do_sample": do_sample,
+            "frequency_penalty": frequency_penalty,
+            "grammar": grammar,
+            "max_new_tokens": max_new_tokens,
+            "repetition_penalty": repetition_penalty,
+            "return_full_text": return_full_text,
+            "seed": seed,
+            "stop": stop if stop is not None else [],
+            "temperature": temperature,
+            "top_k": top_k,
+            "top_n_tokens": top_n_tokens,
+            "top_p": top_p,
+            "truncate": truncate,
+            "typical_p": typical_p,
+            "watermark": watermark,
+        }
+
+        # Remove some parameters if not a TGI server
+        unsupported_kwargs = _get_unsupported_text_generation_kwargs(model)
+        if len(unsupported_kwargs) > 0:
+            # The server does not support some parameters
+            # => means it is not a TGI server
+            # => remove unsupported parameters and warn the user
+
+            ignored_parameters = []
+            for key in unsupported_kwargs:
+                if parameters.get(key):
+                    ignored_parameters.append(key)
+                parameters.pop(key, None)
+            if len(ignored_parameters) > 0:
+                warnings.warn(
+                    "API endpoint/model for text-generation is not served via TGI. Ignoring the following parameters:"
+                    f" {', '.join(ignored_parameters)}.",
+                    UserWarning,
+                )
+            if details:
+                warnings.warn(
+                    "API endpoint/model for text-generation is not served via TGI. Parameter `details=True` will"
+                    " be ignored meaning only the generated text will be returned.",
+                    UserWarning,
+                )
+                details = False
+            if stream:
+                raise ValueError(
+                    "API endpoint/model for text-generation is not served via TGI. Cannot return output as a stream."
+                    " Please pass `stream=False` as input."
+                )
+
+        provider_helper = get_provider_helper(self.provider, task="text-generation")
+        request_parameters = provider_helper.prepare_request(
+            inputs=prompt,
+            parameters=parameters,
+            extra_payload={"stream": stream},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+
+        # Handle errors separately for more precise error messages
+        try:
+            bytes_output = self._inner_post(request_parameters, stream=stream)
+        except HTTPError as e:
+            match = MODEL_KWARGS_NOT_USED_REGEX.search(str(e))
+            if isinstance(e, BadRequestError) and match:
+                unused_params = [kwarg.strip("' ") for kwarg in match.group(1).split(",")]
+                _set_unsupported_text_generation_kwargs(model, unused_params)
+                return self.text_generation(  # type: ignore
+                    prompt=prompt,
+                    details=details,
+                    stream=stream,
+                    model=model or self.model,
+                    adapter_id=adapter_id,
+                    best_of=best_of,
+                    decoder_input_details=decoder_input_details,
+                    do_sample=do_sample,
+                    frequency_penalty=frequency_penalty,
+                    grammar=grammar,
+                    max_new_tokens=max_new_tokens,
+                    repetition_penalty=repetition_penalty,
+                    return_full_text=return_full_text,
+                    seed=seed,
+                    stop=stop,
+                    temperature=temperature,
+                    top_k=top_k,
+                    top_n_tokens=top_n_tokens,
+                    top_p=top_p,
+                    truncate=truncate,
+                    typical_p=typical_p,
+                    watermark=watermark,
+                )
+            raise_text_generation_error(e)
+
+        # Parse output
+        if stream:
+            return _stream_text_generation_response(bytes_output, details)  # type: ignore
+
+        data = _bytes_to_dict(bytes_output)  # type: ignore[arg-type]
+
+        # Data can be a single element (dict) or a list of dicts, in which case we select the first element.
+        if isinstance(data, list):
+            data = data[0]
+
+        return TextGenerationOutput.parse_obj_as_instance(data) if details else data["generated_text"]
+
+    def text_to_image(
+        self,
+        prompt: str,
+        *,
+        negative_prompt: Optional[str] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        model: Optional[str] = None,
+        scheduler: Optional[str] = None,
+        seed: Optional[int] = None,
+        extra_body: Optional[Dict[str, Any]] = None,
+    ) -> "Image":
+        """
+        Generate an image based on a given text using a specified model.
+
+        <Tip warning={true}>
+
+        You must have `PIL` installed if you want to work with images (`pip install Pillow`).
+
+        </Tip>
+
+        <Tip>
+        You can pass provider-specific parameters to the model by using the `extra_body` argument.
+        </Tip>
+
+        Args:
+            prompt (`str`):
+                The prompt to generate an image from.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
+            height (`int`, *optional*):
+                The height in pixels of the output image.
+            width (`int`, *optional*):
+                The width in pixels of the output image.
+            num_inference_steps (`int`, *optional*):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                prompt, but values too high may cause saturation and other artifacts.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended text-to-image model will be used.
+                Defaults to None.
+            scheduler (`str`, *optional*):
+                Override the scheduler with a compatible one.
+            seed (`int`, *optional*):
+                Seed for the random number generator.
+            extra_body (`Dict[str, Any]`, *optional*):
+                Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
+                for supported parameters.
+
+        Returns:
+            `Image`: The generated image.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+
+        >>> image = client.text_to_image("An astronaut riding a horse on the moon.")
+        >>> image.save("astronaut.png")
+
+        >>> image = client.text_to_image(
+        ...     "An astronaut riding a horse on the moon.",
+        ...     negative_prompt="low resolution, blurry",
+        ...     model="stabilityai/stable-diffusion-2-1",
+        ... )
+        >>> image.save("better_astronaut.png")
+        ```
+        Example using a third-party provider directly. Usage will be billed on your fal.ai account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="fal-ai",  # Use fal.ai provider
+        ...     api_key="fal-ai-api-key",  # Pass your fal.ai API key
+        ... )
+        >>> image = client.text_to_image(
+        ...     "A majestic lion in a fantasy forest",
+        ...     model="black-forest-labs/FLUX.1-schnell",
+        ... )
+        >>> image.save("lion.png")
+        ```
+
+        Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",  # Use replicate provider
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> image = client.text_to_image(
+        ...     "An astronaut riding a horse on the moon.",
+        ...     model="black-forest-labs/FLUX.1-dev",
+        ... )
+        >>> image.save("astronaut.png")
+        ```
+
+        Example using Replicate provider with extra parameters
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",  # Use replicate provider
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> image = client.text_to_image(
+        ...     "An astronaut riding a horse on the moon.",
+        ...     model="black-forest-labs/FLUX.1-schnell",
+        ...     extra_body={"output_quality": 100},
+        ... )
+        >>> image.save("astronaut.png")
+        ```
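+
+        Example passing extra generation parameters. This is an illustrative sketch: `height`, `width`,
+        `num_inference_steps` and `seed` are documented above, but whether they are honored depends on the
+        selected model and provider.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> image = client.text_to_image(
+        ...     "An astronaut riding a horse on the moon.",
+        ...     height=768,  # illustrative value
+        ...     width=768,  # illustrative value
+        ...     num_inference_steps=30,
+        ...     seed=42,  # fix the seed for reproducible outputs
+        ... )
+        >>> image.save("astronaut_seeded.png")
+        ```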
+        """
+        provider_helper = get_provider_helper(self.provider, task="text-to-image")
+        request_parameters = provider_helper.prepare_request(
+            inputs=prompt,
+            parameters={
+                "negative_prompt": negative_prompt,
+                "height": height,
+                "width": width,
+                "num_inference_steps": num_inference_steps,
+                "guidance_scale": guidance_scale,
+                "scheduler": scheduler,
+                "seed": seed,
+                **(extra_body or {}),
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        response = provider_helper.get_response(response)
+        return _bytes_to_image(response)
+
+    def text_to_video(
+        self,
+        prompt: str,
+        *,
+        model: Optional[str] = None,
+        guidance_scale: Optional[float] = None,
+        negative_prompt: Optional[List[str]] = None,
+        num_frames: Optional[float] = None,
+        num_inference_steps: Optional[int] = None,
+        seed: Optional[int] = None,
+        extra_body: Optional[Dict[str, Any]] = None,
+    ) -> bytes:
+        """
+        Generate a video based on a given text.
+
+        <Tip>
+        You can pass provider-specific parameters to the model by using the `extra_body` argument.
+        </Tip>
+
+        Args:
+            prompt (`str`):
+                The prompt to generate a video from.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended text-to-video model will be used.
+                Defaults to None.
+            guidance_scale (`float`, *optional*):
+                A higher guidance scale value encourages the model to generate videos closely linked to the text
+                prompt, but values too high may cause saturation and other artifacts.
+            negative_prompt (`List[str]`, *optional*):
+                One or several prompts to guide what NOT to include in the video generation.
+            num_frames (`float`, *optional*):
+                The number of frames to generate in the output video.
+            num_inference_steps (`int`, *optional*):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
+            seed (`int`, *optional*):
+                Seed for the random number generator.
+            extra_body (`Dict[str, Any]`, *optional*):
+                Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
+                for supported parameters.
+
+        Returns:
+            `bytes`: The generated video.
+
+        Example:
+
+        Example using a third-party provider directly. Usage will be billed on your fal.ai account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="fal-ai",  # Using fal.ai provider
+        ...     api_key="fal-ai-api-key",  # Pass your fal.ai API key
+        ... )
+        >>> video = client.text_to_video(
+        ...     "A majestic lion running in a fantasy forest",
+        ...     model="tencent/HunyuanVideo",
+        ... )
+        >>> with open("lion.mp4", "wb") as file:
+        ...     file.write(video)
+        ```
+
+        Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",  # Using replicate provider
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> video = client.text_to_video(
+        ...     "A cat running in a park",
+        ...     model="genmo/mochi-1-preview",
+        ... )
+        >>> with open("cat.mp4", "wb") as file:
+        ...     file.write(video)
+        ```
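+
+        Example passing extra generation parameters. This is an illustrative sketch: `num_frames`,
+        `num_inference_steps` and `seed` are documented above, but support depends on the selected model and provider.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="fal-ai",
+        ...     api_key="fal-ai-api-key",  # Pass your fal.ai API key
+        ... )
+        >>> video = client.text_to_video(
+        ...     "A majestic lion running in a fantasy forest",
+        ...     model="tencent/HunyuanVideo",
+        ...     num_frames=64,  # illustrative value
+        ...     num_inference_steps=25,  # illustrative value
+        ...     seed=42,
+        ... )
+        >>> with open("lion_seeded.mp4", "wb") as file:
+        ...     file.write(video)
+        ```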
+        """
+        provider_helper = get_provider_helper(self.provider, task="text-to-video")
+        request_parameters = provider_helper.prepare_request(
+            inputs=prompt,
+            parameters={
+                "guidance_scale": guidance_scale,
+                "negative_prompt": negative_prompt,
+                "num_frames": num_frames,
+                "num_inference_steps": num_inference_steps,
+                "seed": seed,
+                **(extra_body or {}),
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        response = provider_helper.get_response(response)
+        return response
+
+    def text_to_speech(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        do_sample: Optional[bool] = None,
+        early_stopping: Optional[Union[bool, "TextToSpeechEarlyStoppingEnum"]] = None,
+        epsilon_cutoff: Optional[float] = None,
+        eta_cutoff: Optional[float] = None,
+        max_length: Optional[int] = None,
+        max_new_tokens: Optional[int] = None,
+        min_length: Optional[int] = None,
+        min_new_tokens: Optional[int] = None,
+        num_beam_groups: Optional[int] = None,
+        num_beams: Optional[int] = None,
+        penalty_alpha: Optional[float] = None,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        typical_p: Optional[float] = None,
+        use_cache: Optional[bool] = None,
+        extra_body: Optional[Dict[str, Any]] = None,
+    ) -> bytes:
+        """
+        Synthesize an audio of a voice pronouncing a given text.
+
+        <Tip>
+        You can pass provider-specific parameters to the model by using the `extra_body` argument.
+        </Tip>
+
+        Args:
+            text (`str`):
+                The text to synthesize.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended text-to-speech model will be used.
+                Defaults to None.
+            do_sample (`bool`, *optional*):
+                Whether to use sampling instead of greedy decoding when generating new tokens.
+            early_stopping (`Union[bool, "TextToSpeechEarlyStoppingEnum"]`, *optional*):
+                Controls the stopping condition for beam-based methods.
+            epsilon_cutoff (`float`, *optional*):
+                If set to float strictly between 0 and 1, only tokens with a conditional probability greater than
+                epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on
+                the size of the model. See [Truncation Sampling as Language Model
+                Desmoothing](https://hf.co/papers/2210.15191) for more details.
+            eta_cutoff (`float`, *optional*):
+                Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly
+                between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff)
+                * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token
+                probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3,
+                depending on the size of the model. See [Truncation Sampling as Language Model
+                Desmoothing](https://hf.co/papers/2210.15191) for more details.
+            max_length (`int`, *optional*):
+                The maximum length (in tokens) of the generated text, including the input.
+            max_new_tokens (`int`, *optional*):
+                The maximum number of tokens to generate. Takes precedence over max_length.
+            min_length (`int`, *optional*):
+                The minimum length (in tokens) of the generated text, including the input.
+            min_new_tokens (`int`, *optional*):
+                The minimum number of tokens to generate. Takes precedence over min_length.
+            num_beam_groups (`int`, *optional*):
+                Number of groups to divide num_beams into in order to ensure diversity among different groups of beams.
+                See [this paper](https://hf.co/papers/1610.02424) for more details.
+            num_beams (`int`, *optional*):
+                Number of beams to use for beam search.
+            penalty_alpha (`float`, *optional*):
+                The value balances the model confidence and the degeneration penalty in contrastive search decoding.
+            temperature (`float`, *optional*):
+                The value used to modulate the next token probabilities.
+            top_k (`int`, *optional*):
+                The number of highest probability vocabulary tokens to keep for top-k-filtering.
+            top_p (`float`, *optional*):
+                If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to
+                top_p or higher are kept for generation.
+            typical_p (`float`, *optional*):
+                Local typicality measures how similar the conditional probability of predicting a target token next is
+                to the expected conditional probability of predicting a random token next, given the partial text
+                already generated. If set to float < 1, the smallest set of the most locally typical tokens with
+                probabilities that add up to typical_p or higher are kept for generation. See [this
+                paper](https://hf.co/papers/2202.00666) for more details.
+            use_cache (`bool`, *optional*):
+                Whether the model should use the past key/values attention cache to speed up decoding.
+            extra_body (`Dict[str, Any]`, *optional*):
+                Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
+                for supported parameters.
+        Returns:
+            `bytes`: The generated audio.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from pathlib import Path
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+
+        >>> audio = client.text_to_speech("Hello world")
+        >>> Path("hello_world.flac").write_bytes(audio)
+        ```
+
+        Example using a third-party provider directly. Usage will be billed on your Replicate account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",
+        ...     api_key="your-replicate-api-key",  # Pass your Replicate API key directly
+        ... )
+        >>> audio = client.text_to_speech(
+        ...     text="Hello world",
+        ...     model="OuteAI/OuteTTS-0.3-500M",
+        ... )
+        >>> Path("hello_world.flac").write_bytes(audio)
+        ```
+
+        Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> audio = client.text_to_speech(
+        ...     text="Hello world",
+        ...     model="OuteAI/OuteTTS-0.3-500M",
+        ... )
+        >>> Path("hello_world.flac").write_bytes(audio)
+        ```
+        Example using Replicate provider with extra parameters
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",  # Use replicate provider
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> audio = client.text_to_speech(
+        ...     "Hello, my name is Kokoro, an awesome text-to-speech model.",
+        ...     model="hexgrad/Kokoro-82M",
+        ...     extra_body={"voice": "af_nicole"},
+        ... )
+        >>> Path("hello.flac").write_bytes(audio)
+        ```
+
+        Example music-gen using "YuE-s1-7B-anneal-en-cot" on fal.ai
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> lyrics = '''
+        ... [verse]
+        ... In the town where I was born
+        ... Lived a man who sailed to sea
+        ... And he told us of his life
+        ... In the land of submarines
+        ... So we sailed on to the sun
+        ... 'Til we found a sea of green
+        ... And we lived beneath the waves
+        ... In our yellow submarine
+        ...
+        ... [chorus]
+        ... We all live in a yellow submarine
+        ... Yellow submarine, yellow submarine
+        ... We all live in a yellow submarine
+        ... Yellow submarine, yellow submarine
+        ... '''
+        >>> genres = "pavarotti-style tenor voice"
+        >>> client = InferenceClient(
+        ...     provider="fal-ai",
+        ...     model="m-a-p/YuE-s1-7B-anneal-en-cot",
+        ...     api_key=...,
+        ... )
+        >>> audio = client.text_to_speech(lyrics, extra_body={"genres": genres})
+        >>> with open("output.mp3", "wb") as f:
+        ...     f.write(audio)
+        ```
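+
+        Example passing generation parameters. This is an illustrative sketch: `do_sample`, `temperature` and
+        `top_p` are documented above, but they are only honored by models that support them.
+        ```py
+        >>> from pathlib import Path
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> audio = client.text_to_speech(
+        ...     "Hello world",
+        ...     do_sample=True,
+        ...     temperature=0.7,  # illustrative value
+        ...     top_p=0.9,  # illustrative value
+        ... )
+        >>> Path("hello_world_sampled.flac").write_bytes(audio)
+        ```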
+        """
+        provider_helper = get_provider_helper(self.provider, task="text-to-speech")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "do_sample": do_sample,
+                "early_stopping": early_stopping,
+                "epsilon_cutoff": epsilon_cutoff,
+                "eta_cutoff": eta_cutoff,
+                "max_length": max_length,
+                "max_new_tokens": max_new_tokens,
+                "min_length": min_length,
+                "min_new_tokens": min_new_tokens,
+                "num_beam_groups": num_beam_groups,
+                "num_beams": num_beams,
+                "penalty_alpha": penalty_alpha,
+                "temperature": temperature,
+                "top_k": top_k,
+                "top_p": top_p,
+                "typical_p": typical_p,
+                "use_cache": use_cache,
+                **(extra_body or {}),
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        response = provider_helper.get_response(response)
+        return response
+
+    def token_classification(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        aggregation_strategy: Optional["TokenClassificationAggregationStrategy"] = None,
+        ignore_labels: Optional[List[str]] = None,
+        stride: Optional[int] = None,
+    ) -> List[TokenClassificationOutputElement]:
+        """
+        Perform token classification on the given text.
+        Commonly used for sentence parsing, either grammatical or Named Entity Recognition (NER), to identify keywords in a text.
+
+        Args:
+            text (`str`):
+                A string to be classified.
+            model (`str`, *optional*):
+                The model to use for the token classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended token classification model will be used.
+                Defaults to None.
+            aggregation_strategy (`"TokenClassificationAggregationStrategy"`, *optional*):
+                The strategy used to fuse tokens based on model predictions.
+            ignore_labels (`List[str]`, *optional*):
+                A list of labels to ignore.
+            stride (`int`, *optional*):
+                The number of overlapping tokens between chunks when splitting the input text.
+
+        Returns:
+            `List[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.token_classification("My name is Sarah Jessica Parker but you can call me Jessica")
+        [
+            TokenClassificationOutputElement(
+                entity_group='PER',
+                score=0.9971321225166321,
+                word='Sarah Jessica Parker',
+                start=11,
+                end=31,
+            ),
+            TokenClassificationOutputElement(
+                entity_group='PER',
+                score=0.9773476123809814,
+                word='Jessica',
+                start=52,
+                end=59,
+            )
+        ]
+        ```
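+
+        Example tuning how tokens are grouped. This is an illustrative sketch: the `"simple"` aggregation strategy
+        and the `"O"` (non-entity) label are common conventions, but the accepted values depend on the model.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> entities = client.token_classification(
+        ...     "My name is Sarah Jessica Parker but you can call me Jessica",
+        ...     aggregation_strategy="simple",  # fuse sub-tokens into whole entities
+        ...     ignore_labels=["O"],  # drop the non-entity label
+        ... )
+        ```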
+        """
+        provider_helper = get_provider_helper(self.provider, task="token-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "aggregation_strategy": aggregation_strategy,
+                "ignore_labels": ignore_labels,
+                "stride": stride,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return TokenClassificationOutputElement.parse_obj_as_list(response)
+
+    def translation(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        src_lang: Optional[str] = None,
+        tgt_lang: Optional[str] = None,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        truncation: Optional["TranslationTruncationStrategy"] = None,
+        generate_parameters: Optional[Dict[str, Any]] = None,
+    ) -> TranslationOutput:
+        """
+        Convert text from one language to another.
+
+        Check out https://huggingface.co/tasks/translation for more information on how to choose the best model for
+        your specific use case. Source and target languages usually depend on the model.
+        However, it is possible to specify source and target languages for certain models. If you are working with one of these models,
+        you can use `src_lang` and `tgt_lang` arguments to pass the relevant information.
+
+        Args:
+            text (`str`):
+                A string to be translated.
+            model (`str`, *optional*):
+                The model to use for the translation task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended translation model will be used.
+                Defaults to None.
+            src_lang (`str`, *optional*):
+                The source language of the text. Required for models that can translate from multiple languages.
+            tgt_lang (`str`, *optional*):
+                Target language to translate to. Required for models that can translate to multiple languages.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether to clean up the potential extra spaces in the text output.
+            truncation (`"TranslationTruncationStrategy"`, *optional*):
+                The truncation strategy to use.
+            generate_parameters (`Dict[str, Any]`, *optional*):
+                Additional parametrization of the text generation algorithm.
+
+        Returns:
+            [`TranslationOutput`]: The generated translated text.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+            `ValueError`:
+                If only one of the `src_lang` and `tgt_lang` arguments is provided.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.translation("My name is Wolfgang and I live in Berlin")
+        'Mein Name ist Wolfgang und ich lebe in Berlin.'
+        >>> client.translation("My name is Wolfgang and I live in Berlin", model="Helsinki-NLP/opus-mt-en-fr")
+        TranslationOutput(translation_text="Je m'appelle Wolfgang et je vis à Berlin.")
+        ```
+
+        Specifying languages:
+        ```py
+        >>> client.translation("My name is Sarah Jessica Parker but you can call me Jessica", model="facebook/mbart-large-50-many-to-many-mmt", src_lang="en_XX", tgt_lang="fr_XX")
+        "Mon nom est Sarah Jessica Parker mais vous pouvez m'appeler Jessica"
+        ```
+        """
+        # Throw error if only one of `src_lang` and `tgt_lang` was given
+        if src_lang is not None and tgt_lang is None:
+            raise ValueError("You cannot specify `src_lang` without specifying `tgt_lang`.")
+
+        if src_lang is None and tgt_lang is not None:
+            raise ValueError("You cannot specify `tgt_lang` without specifying `src_lang`.")
+
+        provider_helper = get_provider_helper(self.provider, task="translation")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "src_lang": src_lang,
+                "tgt_lang": tgt_lang,
+                "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
+                "truncation": truncation,
+                "generate_parameters": generate_parameters,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return TranslationOutput.parse_obj_as_list(response)[0]
+
+    def visual_question_answering(
+        self,
+        image: ContentT,
+        question: str,
+        *,
+        model: Optional[str] = None,
+        top_k: Optional[int] = None,
+    ) -> List[VisualQuestionAnsweringOutputElement]:
+        """
+        Answering open-ended questions based on an image.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The input image for the context. It can be raw bytes, an image file, or a URL to an online image.
+            question (`str`):
+                Question to be answered.
+            model (`str`, *optional*):
+                The model to use for the visual question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended visual question answering model will be used.
+                Defaults to None.
+            top_k (`int`, *optional*):
+                The number of answers to return (chosen by order of likelihood). Note that fewer than `top_k` answers
+                are returned if there are not enough options available within the context.
+        Returns:
+            `List[VisualQuestionAnsweringOutputElement]`: a list of [`VisualQuestionAnsweringOutputElement`] items containing the predicted label and associated probability.
+
+        Raises:
+            `InferenceTimeoutError`:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.visual_question_answering(
+        ...     image="https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg",
+        ...     question="What is the animal doing?"
+        ... )
+        [
+            VisualQuestionAnsweringOutputElement(score=0.778609573841095, answer='laying down'),
+            VisualQuestionAnsweringOutputElement(score=0.6957435607910156, answer='sitting'),
+        ]
+        ```
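+
+        Example limiting the number of returned answers with `top_k` (illustrative sketch using the same image and
+        question as above):
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> answers = client.visual_question_answering(
+        ...     image="https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg",
+        ...     question="What is the animal doing?",
+        ...     top_k=1,  # keep only the most likely answer
+        ... )
+        ```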
+        """
+        provider_helper = get_provider_helper(self.provider, task="visual-question-answering")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={"top_k": top_k},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+            extra_payload={"question": question, "image": _b64_encode(image)},
+        )
+        response = self._inner_post(request_parameters)
+        return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)
+
+    @_deprecate_arguments(
+        version="0.30.0",
+        deprecated_args=["labels"],
+        custom_message="`labels` has been renamed to `candidate_labels` and will be removed in huggingface_hub>=0.30.0.",
+    )
+    def zero_shot_classification(
+        self,
+        text: str,
+        # temporarily keeping it optional for backward compatibility.
+        candidate_labels: List[str] = None,  # type: ignore
+        *,
+        multi_label: Optional[bool] = False,
+        hypothesis_template: Optional[str] = None,
+        model: Optional[str] = None,
+        # deprecated argument
+        labels: List[str] = None,  # type: ignore
+    ) -> List[ZeroShotClassificationOutputElement]:
+        """
+        Provide as input a text and a set of candidate labels to classify the input text.
+
+        Args:
+            text (`str`):
+                The input text to classify.
+            candidate_labels (`List[str]`):
+                The set of possible class labels to classify the text into.
+            labels (`List[str]`, *optional*):
+                (deprecated) List of strings. Each string is the verbalization of a possible label for the input text.
+            multi_label (`bool`, *optional*):
+                Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of
+                the label likelihoods for each sequence is 1. If true, the labels are considered independent and
+                probabilities are normalized for each candidate.
+            hypothesis_template (`str`, *optional*):
+                The sentence used in conjunction with `candidate_labels` to attempt the text classification by
+                replacing the placeholder with the candidate labels.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.
+
+        Returns:
+            `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example with `multi_label=False`:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> text = (
+        ...     "A new model offers an explanation for how the Galilean satellites formed around the solar system's"
+        ...     "largest world. Konstantin Batygin did not set out to solve one of the solar system's most puzzling"
+        ...     " mysteries when he went for a run up a hill in Nice, France."
+        ... )
+        >>> labels = ["space & cosmos", "scientific discovery", "microbiology", "robots", "archeology"]
+        >>> client.zero_shot_classification(text, labels)
+        [
+            ZeroShotClassificationOutputElement(label='scientific discovery', score=0.7961668968200684),
+            ZeroShotClassificationOutputElement(label='space & cosmos', score=0.18570658564567566),
+            ZeroShotClassificationOutputElement(label='microbiology', score=0.00730885099619627),
+            ZeroShotClassificationOutputElement(label='archeology', score=0.006258360575884581),
+            ZeroShotClassificationOutputElement(label='robots', score=0.004559356719255447),
+        ]
+        >>> client.zero_shot_classification(text, labels, multi_label=True)
+        [
+            ZeroShotClassificationOutputElement(label='scientific discovery', score=0.9829297661781311),
+            ZeroShotClassificationOutputElement(label='space & cosmos', score=0.755190908908844),
+            ZeroShotClassificationOutputElement(label='microbiology', score=0.0005462635890580714),
+            ZeroShotClassificationOutputElement(label='archeology', score=0.00047131875180639327),
+            ZeroShotClassificationOutputElement(label='robots', score=0.00030448526376858354),
+        ]
+        ```
+
+        Example with `multi_label=True` and a custom `hypothesis_template`:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.zero_shot_classification(
+        ...    text="I really like our dinner and I'm very happy. I don't like the weather though.",
+        ...    labels=["positive", "negative", "pessimistic", "optimistic"],
+        ...    multi_label=True,
+        ...    hypothesis_template="This text is {} towards the weather"
+        ... )
+        [
+            ZeroShotClassificationOutputElement(label='negative', score=0.9231801629066467),
+            ZeroShotClassificationOutputElement(label='pessimistic', score=0.8760990500450134),
+            ZeroShotClassificationOutputElement(label='optimistic', score=0.0008674879791215062),
+            ZeroShotClassificationOutputElement(label='positive', score=0.0005250611575320363)
+        ]
+        ```
+        """
+        # handle deprecation
+        if labels is not None:
+            if candidate_labels is not None:
+                raise ValueError(
+                    "Cannot specify both `labels` and `candidate_labels`. Use `candidate_labels` instead."
+                )
+            candidate_labels = labels
+        elif candidate_labels is None:
+            raise ValueError("Must specify `candidate_labels`")
+
+        provider_helper = get_provider_helper(self.provider, task="zero-shot-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "candidate_labels": candidate_labels,
+                "multi_label": multi_label,
+                "hypothesis_template": hypothesis_template,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        output = _bytes_to_dict(response)
+        return [
+            ZeroShotClassificationOutputElement.parse_obj_as_instance({"label": label, "score": score})
+            for label, score in zip(output["labels"], output["scores"])
+        ]
+
+    @_deprecate_arguments(
+        version="0.30.0",
+        deprecated_args=["labels"],
+        custom_message="`labels` has been renamed to `candidate_labels` and will be removed in huggingface_hub>=0.30.0.",
+    )
+    def zero_shot_image_classification(
+        self,
+        image: ContentT,
+        # temporarily keeping it optional for backward compatibility.
+        candidate_labels: List[str] = None,  # type: ignore
+        *,
+        model: Optional[str] = None,
+        hypothesis_template: Optional[str] = None,
+        # deprecated argument
+        labels: List[str] = None,  # type: ignore
+    ) -> List[ZeroShotImageClassificationOutputElement]:
+        """
+        Provide input image and text labels to predict text labels for the image.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The input image to caption. It can be raw bytes, an image file, or a URL to an online image.
+            candidate_labels (`List[str]`):
+                The candidate labels for this image
+            labels (`List[str]`, *optional*):
+                (deprecated) List of string possible labels. There must be at least 2 labels.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot image classification model will be used.
+            hypothesis_template (`str`, *optional*):
+                The sentence used in conjunction with `candidate_labels` to attempt the image classification by
+                replacing the placeholder with the candidate labels.
+
+        Returns:
+            `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+
+        >>> client.zero_shot_image_classification(
+        ...     "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
+        ...     labels=["dog", "cat", "horse"],
+        ... )
+        [ZeroShotImageClassificationOutputElement(label='dog', score=0.956),...]
+        ```
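+
+        Example with a custom `hypothesis_template`. This is an illustrative sketch: the template below is an
+        assumption, not a value mandated by the API.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> predictions = client.zero_shot_image_classification(
+        ...     "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
+        ...     candidate_labels=["dog", "cat", "horse"],
+        ...     hypothesis_template="This is a photo of a {}.",  # "{}" is replaced by each candidate label
+        ... )
+        ```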
+        """
+        # handle deprecation
+        if labels is not None:
+            if candidate_labels is not None:
+                raise ValueError(
+                    "Cannot specify both `labels` and `candidate_labels`. Use `candidate_labels` instead."
+                )
+            candidate_labels = labels
+        elif candidate_labels is None:
+            raise ValueError("Must specify `candidate_labels`")
+        # Raise ValueError if fewer than 2 labels are provided
+        if len(candidate_labels) < 2:
+            raise ValueError("You must specify at least 2 classes to compare.")
+
+        provider_helper = get_provider_helper(self.provider, task="zero-shot-image-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={
+                "candidate_labels": candidate_labels,
+                "hypothesis_template": hypothesis_template,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response)
+
+    @_deprecate_method(
+        version="0.33.0",
+        message=(
+            "HF Inference API is getting revamped and will only support warm models in the future (no cold start allowed)."
+            " Use `HfApi.list_models(..., inference_provider='...')` to list warm models per provider."
+        ),
+    )
+    def list_deployed_models(
+        self, frameworks: Union[None, str, Literal["all"], List[str]] = None
+    ) -> Dict[str, List[str]]:
+        """
+        List models deployed on the HF Serverless Inference API service.
+
+        This helper checks deployed models framework by framework. By default, it will check the 4 main frameworks that
+        are supported and account for 95% of the hosted models. However, if you want a complete list of models you can
+        specify `frameworks="all"` as input. Alternatively, if you know beforehand which framework you are interested
+        in, you can also restrict the search to it (e.g. `frameworks="text-generation-inference"`). The more
+        frameworks are checked, the longer the call takes.
+
+        <Tip warning={true}>
+
+        This endpoint method does not return a live list of all models available for the HF Inference API service.
+        It searches over a cached list of models that were recently available and the list may not be up to date.
+        If you want to know the live status of a specific model, use [`~InferenceClient.get_model_status`].
+
+        </Tip>
+
+        <Tip>
+
+        This endpoint method is mostly useful for discoverability. If you already know which model you want to use and want to
+        check its availability, you can directly use [`~InferenceClient.get_model_status`].
+
+        </Tip>
+
+        Args:
+            frameworks (`Literal["all"]` or `List[str]` or `str`, *optional*):
+                The frameworks to filter on. By default only a subset of the available frameworks are tested. If set to
+                "all", all available frameworks will be tested. It is also possible to provide a single framework or a
+                custom set of frameworks to check.
+
+        Returns:
+            `Dict[str, List[str]]`: A dictionary mapping task names to a sorted list of model IDs.
+
+        Example:
+        ```python
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+
+        # Discover zero-shot-classification models currently deployed
+        >>> models = client.list_deployed_models()
+        >>> models["zero-shot-classification"]
+        ['Narsil/deberta-large-mnli-zero-cls', 'facebook/bart-large-mnli', ...]
+
+        # List from only 1 framework
+        >>> client.list_deployed_models("text-generation-inference")
+        {'text-generation': ['bigcode/starcoder', 'meta-llama/Llama-2-70b-chat-hf', ...], ...}
+        ```
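+
+        Checking every available framework is also possible but slower (illustrative sketch):
+        ```python
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        # Exhaustive (and slower) scan over all supported frameworks
+        >>> all_models = client.list_deployed_models(frameworks="all")
+        ```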
+        """
+        if self.provider != "hf-inference":
+            raise ValueError(f"Listing deployed models is not supported on '{self.provider}'.")
+
+        # Resolve which frameworks to check
+        if frameworks is None:
+            frameworks = constants.MAIN_INFERENCE_API_FRAMEWORKS
+        elif frameworks == "all":
+            frameworks = constants.ALL_INFERENCE_API_FRAMEWORKS
+        elif isinstance(frameworks, str):
+            frameworks = [frameworks]
+        frameworks = list(set(frameworks))
+
+        # Fetch them iteratively
+        models_by_task: Dict[str, List[str]] = {}
+
+        def _unpack_response(framework: str, items: List[Dict]) -> None:
+            for model in items:
+                if framework == "sentence-transformers":
+                    # Models running with the `sentence-transformers` framework can work with both tasks even if not
+                    # branded as such in the API response
+                    models_by_task.setdefault("feature-extraction", []).append(model["model_id"])
+                    models_by_task.setdefault("sentence-similarity", []).append(model["model_id"])
+                else:
+                    models_by_task.setdefault(model["task"], []).append(model["model_id"])
+
+        for framework in frameworks:
+            response = get_session().get(
+                f"{constants.INFERENCE_ENDPOINT}/framework/{framework}", headers=build_hf_headers(token=self.token)
+            )
+            hf_raise_for_status(response)
+            _unpack_response(framework, response.json())
+
+        # Sort alphabetically for discoverability and return
+        for task, models in models_by_task.items():
+            models_by_task[task] = sorted(set(models), key=lambda x: x.lower())
+        return models_by_task
+
+    def get_endpoint_info(self, *, model: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Get information about the deployed endpoint.
+
+        This endpoint is only available on endpoints powered by Text-Generation-Inference (TGI) or Text-Embedding-Inference (TEI).
+        Endpoints powered by `transformers` return an empty payload.
+
+        Args:
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+        Returns:
+            `Dict[str, Any]`: Information about the endpoint.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+        >>> client.get_endpoint_info()
+        {
+            'model_id': 'meta-llama/Meta-Llama-3-70B-Instruct',
+            'model_sha': None,
+            'model_dtype': 'torch.float16',
+            'model_device_type': 'cuda',
+            'model_pipeline_tag': None,
+            'max_concurrent_requests': 128,
+            'max_best_of': 2,
+            'max_stop_sequences': 4,
+            'max_input_length': 8191,
+            'max_total_tokens': 8192,
+            'waiting_served_ratio': 0.3,
+            'max_batch_total_tokens': 1259392,
+            'max_waiting_tokens': 20,
+            'max_batch_size': None,
+            'validation_workers': 32,
+            'max_client_batch_size': 4,
+            'version': '2.0.2',
+            'sha': 'dccab72549635c7eb5ddb17f43f0b7cdff07c214',
+            'docker_label': 'sha-dccab72'
+        }
+        ```
+        """
+        if self.provider != "hf-inference":
+            raise ValueError(f"Getting endpoint info is not supported on '{self.provider}'.")
+
+        model = model or self.model
+        if model is None:
+            raise ValueError("Model id not provided.")
+        if model.startswith(("http://", "https://")):
+            url = model.rstrip("/") + "/info"
+        else:
+            url = f"{constants.INFERENCE_ENDPOINT}/models/{model}/info"
+
+        response = get_session().get(url, headers=build_hf_headers(token=self.token))
+        hf_raise_for_status(response)
+        return response.json()
+
+    def health_check(self, model: Optional[str] = None) -> bool:
+        """
+        Check the health of the deployed endpoint.
+
+        Health check is only available with Inference Endpoints powered by Text-Generation-Inference (TGI) or Text-Embedding-Inference (TEI).
+        For Inference API, please use [`InferenceClient.get_model_status`] instead.
+
+        Args:
+            model (`str`, *optional*):
+                URL of the Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+        Returns:
+            `bool`: True if everything is working fine.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient("https://jzgu0buei5.us-east-1.aws.endpoints.huggingface.cloud")
+        >>> client.health_check()
+        True
+        ```
+        """
+        if self.provider != "hf-inference":
+            raise ValueError(f"Health check is not supported on '{self.provider}'.")
+
+        model = model or self.model
+        if model is None:
+            raise ValueError("Model id not provided.")
+        if not model.startswith(("http://", "https://")):
+            raise ValueError(
+                "Model must be an Inference Endpoint URL. For serverless Inference API, please use `InferenceClient.get_model_status`."
+            )
+        url = model.rstrip("/") + "/health"
+
+        response = get_session().get(url, headers=build_hf_headers(token=self.token))
+        return response.status_code == 200
+
+    @_deprecate_method(
+        version="0.33.0",
+        message=(
+            "HF Inference API is getting revamped and will only support warm models in the future (no cold start allowed)."
+            " Use `HfApi.model_info` to get the model status both with HF Inference API and external providers."
+        ),
+    )
+    def get_model_status(self, model: Optional[str] = None) -> ModelStatus:
+        """
+        Get the status of a model hosted on the HF Inference API.
+
+        <Tip>
+
+        This endpoint is mostly useful when you already know which model you want to use and want to check its
+        availability. If you want to discover already deployed models, you should rather use [`~InferenceClient.list_deployed_models`].
+
+        </Tip>
+
+        Args:
+            model (`str`, *optional*):
+                Identifier of the model for which the status will be checked. If not provided, the model associated
+                with this instance of [`InferenceClient`] will be used. Only the HF Inference API service can be
+                checked, so the identifier cannot be a URL.
+
+        Returns:
+            [`ModelStatus`]: An instance of the ModelStatus dataclass, containing information about the state of the
+                model: load, state, compute type and framework.
+
+        Example:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> client.get_model_status("meta-llama/Meta-Llama-3-8B-Instruct")
+        ModelStatus(loaded=True, state='Loaded', compute_type='gpu', framework='text-generation-inference')
+        ```
+        """
+        if self.provider != "hf-inference":
+            raise ValueError(f"Getting model status is not supported on '{self.provider}'.")
+
+        model = model or self.model
+        if model is None:
+            raise ValueError("Model id not provided.")
+        if model.startswith("https://"):
+            raise NotImplementedError("Model status is only available for Inference API endpoints.")
+        url = f"{constants.INFERENCE_ENDPOINT}/status/{model}"
+
+        response = get_session().get(url, headers=build_hf_headers(token=self.token))
+        hf_raise_for_status(response)
+        response_data = response.json()
+
+        if "error" in response_data:
+            raise ValueError(response_data["error"])
+
+        return ModelStatus(
+            loaded=response_data["loaded"],
+            state=response_data["state"],
+            compute_type=response_data["compute_type"],
+            framework=response_data["framework"],
+        )
+
+    @property
+    def chat(self) -> "ProxyClientChat":
+        return ProxyClientChat(self)
+
+
+class _ProxyClient:
+    """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+    def __init__(self, client: InferenceClient):
+        self._client = client
+
+
+class ProxyClientChat(_ProxyClient):
+    """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+    @property
+    def completions(self) -> "ProxyClientChatCompletions":
+        return ProxyClientChatCompletions(self._client)
+
+
+class ProxyClientChatCompletions(_ProxyClient):
+    """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+    @property
+    def create(self):
+        return self._client.chat_completion
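+
+
+# Illustrative sketch (not additional API surface): the proxy chain above is what lets
+# the OpenAI-style syntax below resolve to `InferenceClient.chat_completion`. The model
+# id and messages are placeholders.
+#
+#     >>> from huggingface_hub import InferenceClient
+#     >>> client = InferenceClient()
+#     >>> # equivalent to client.chat_completion(messages=[...], model=...)
+#     >>> client.chat.completions.create(
+#     ...     model="meta-llama/Meta-Llama-3-8B-Instruct",
+#     ...     messages=[{"role": "user", "content": "Hello!"}],
+#     ... )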
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_common.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_common.py
new file mode 100644
index 00000000..574f726b
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_common.py
@@ -0,0 +1,422 @@
+# coding=utf-8
+# Copyright 2023-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains utilities used by both the sync and async inference clients."""
+
+import base64
+import io
+import json
+import logging
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    AsyncIterable,
+    BinaryIO,
+    ContextManager,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Literal,
+    NoReturn,
+    Optional,
+    Union,
+    overload,
+)
+
+from requests import HTTPError
+
+from huggingface_hub.errors import (
+    GenerationError,
+    IncompleteGenerationError,
+    OverloadedError,
+    TextGenerationError,
+    UnknownError,
+    ValidationError,
+)
+
+from ..utils import get_session, is_aiohttp_available, is_numpy_available, is_pillow_available
+from ._generated.types import ChatCompletionStreamOutput, TextGenerationStreamOutput
+
+
+if TYPE_CHECKING:
+    from aiohttp import ClientResponse, ClientSession
+    from PIL.Image import Image
+
+# TYPES
+UrlT = str
+PathT = Union[str, Path]
+BinaryT = Union[bytes, BinaryIO]
+ContentT = Union[BinaryT, PathT, UrlT]
+
+# Used to set an `Accept: image/png` header
+TASKS_EXPECTING_IMAGES = {"text-to-image", "image-to-image"}
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class RequestParameters:
+    url: str
+    task: str
+    model: Optional[str]
+    json: Optional[Union[str, Dict, List]]
+    data: Optional[ContentT]
+    headers: Dict[str, Any]
+
+
+# Dataclass for ModelStatus. Used in the `get_model_status` function.
+@dataclass
+class ModelStatus:
+    """
+    This Dataclass represents the model status in the HF Inference API.
+
+    Args:
+        loaded (`bool`):
+            Whether the model is currently loaded into HF's Inference API. Models
+            are loaded on-demand, leading to the user's first request taking longer.
+            If a model is loaded, you can be assured that it is in a healthy state.
+        state (`str`):
+            The current state of the model. This can be 'Loaded', 'Loadable', 'TooBig'.
+            If a model's state is 'Loadable', it's not too big and has a supported
+            backend. Loadable models are automatically loaded when the user first
+            requests inference on the endpoint. This means it is transparent for the
+            user to load a model, except that the first call takes longer to complete.
+        compute_type (`Dict`):
+            Information about the compute resource the model is using or will use, such as 'gpu' type and number of
+            replicas.
+        framework (`str`):
+            The name of the framework that the model was built with, such as 'transformers'
+            or 'text-generation-inference'.
+    """
+
+    loaded: bool
+    state: str
+    compute_type: Dict
+    framework: str
+
+
+## IMPORT UTILS
+
+
+def _import_aiohttp():
+    # Make sure `aiohttp` is installed on the machine.
+    if not is_aiohttp_available():
+        raise ImportError("Please install aiohttp to use `AsyncInferenceClient` (`pip install aiohttp`).")
+    import aiohttp
+
+    return aiohttp
+
+
+def _import_numpy():
+    """Make sure `numpy` is installed on the machine."""
+    if not is_numpy_available():
+        raise ImportError("Please install numpy to use deal with embeddings (`pip install numpy`).")
+    import numpy
+
+    return numpy
+
+
+def _import_pil_image():
+    """Make sure `PIL` is installed on the machine."""
+    if not is_pillow_available():
+        raise ImportError(
+            "Please install Pillow to use deal with images (`pip install Pillow`). If you don't want the image to be"
+            " post-processed, use `client.post(...)` and get the raw response from the server."
+        )
+    from PIL import Image
+
+    return Image
+
+
+## ENCODING / DECODING UTILS
+
+
+@overload
+def _open_as_binary(
+    content: ContentT,
+) -> ContextManager[BinaryT]: ...  # means "if input is not None, output is not None"
+
+
+@overload
+def _open_as_binary(
+    content: Literal[None],
+) -> ContextManager[Literal[None]]: ...  # means "if input is None, output is None"
+
+
+@contextmanager  # type: ignore
+def _open_as_binary(content: Optional[ContentT]) -> Generator[Optional[BinaryT], None, None]:
+    """Open `content` as a binary file, either from a URL, a local path, or raw bytes.
+
+    Do nothing if `content` is None.
+
+    TODO: handle a PIL.Image as input
+    TODO: handle base64 as input
+    """
+    # If content is a string => must be either a URL or a path
+    if isinstance(content, str):
+        if content.startswith("https://") or content.startswith("http://"):
+            logger.debug(f"Downloading content from {content}")
+            yield get_session().get(content).content  # TODO: retrieve as stream and pipe to post request ?
+            return
+        content = Path(content)
+        if not content.exists():
+            raise FileNotFoundError(
+                f"File not found at {content}. If `data` is a string, it must either be a URL or a path to a local"
+                " file. To pass raw content, please encode it as bytes first."
+            )
+
+    # If content is a Path => open it
+    if isinstance(content, Path):
+        logger.debug(f"Opening content from {content}")
+        with content.open("rb") as f:
+            yield f
+    else:
+        # Otherwise: already a file-like object or None
+        yield content
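+
+
+# Illustrative sketch of the accepted `content` forms (the URL and file name below are
+# placeholders): a URL is downloaded, a local path is opened, and raw bytes or file
+# objects are yielded unchanged.
+#
+#     >>> with _open_as_binary("https://example.com/audio.flac") as data:  # downloaded bytes
+#     ...     ...
+#     >>> with _open_as_binary("audio.flac") as data:  # opened local file
+#     ...     ...
+#     >>> with _open_as_binary(b"raw bytes") as data:  # passed through as-is
+#     ...     ...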
+
+
+def _b64_encode(content: ContentT) -> str:
+    """Encode a raw file (image, audio) into base64. Can be bytes, an opened file, a path or a URL."""
+    with _open_as_binary(content) as data:
+        data_as_bytes = data if isinstance(data, bytes) else data.read()
+        return base64.b64encode(data_as_bytes).decode()
+
+
+def _b64_to_image(encoded_image: str) -> "Image":
+    """Parse a base64-encoded string into a PIL Image."""
+    Image = _import_pil_image()
+    return Image.open(io.BytesIO(base64.b64decode(encoded_image)))
+
+
+def _bytes_to_list(content: bytes) -> List:
+    """Parse bytes from a Response object into a Python list.
+
+    Expects the response body to be JSON-encoded data.
+
+    NOTE: This is exactly the same implementation as `_bytes_to_dict` and will not complain if the returned data is a
+    dictionary. The only advantage of having both is to help the user (and mypy) understand what kind of data to expect.
+    """
+    return json.loads(content.decode())
+
+
+def _bytes_to_dict(content: bytes) -> Dict:
+    """Parse bytes from a Response object into a Python dictionary.
+
+    Expects the response body to be JSON-encoded data.
+
+    NOTE: This is exactly the same implementation as `_bytes_to_list` and will not complain if the returned data is a
+    list. The only advantage of having both is to help the user (and mypy) understand what kind of data to expect.
+    """
+    return json.loads(content.decode())
+
+
+def _bytes_to_image(content: bytes) -> "Image":
+    """Parse bytes from a Response object into a PIL Image.
+
+    Expects the response body to be raw bytes. To deal with b64 encoded images, use `_b64_to_image` instead.
+    """
+    Image = _import_pil_image()
+    return Image.open(io.BytesIO(content))
+
+
+def _as_dict(response: Union[bytes, Dict]) -> Dict:
+    return json.loads(response) if isinstance(response, bytes) else response
+
+
+## PAYLOAD UTILS
+
+
+## STREAMING UTILS
+
+
+def _stream_text_generation_response(
+    bytes_output_as_lines: Iterable[bytes], details: bool
+) -> Union[Iterable[str], Iterable[TextGenerationStreamOutput]]:
+    """Used in `InferenceClient.text_generation`."""
+    # Parse ServerSentEvents
+    for byte_payload in bytes_output_as_lines:
+        try:
+            output = _format_text_generation_stream_output(byte_payload, details)
+        except StopIteration:
+            break
+        if output is not None:
+            yield output
+
+
+async def _async_stream_text_generation_response(
+    bytes_output_as_lines: AsyncIterable[bytes], details: bool
+) -> Union[AsyncIterable[str], AsyncIterable[TextGenerationStreamOutput]]:
+    """Used in `AsyncInferenceClient.text_generation`."""
+    # Parse ServerSentEvents
+    async for byte_payload in bytes_output_as_lines:
+        try:
+            output = _format_text_generation_stream_output(byte_payload, details)
+        except StopIteration:
+            break
+        if output is not None:
+            yield output
+
+
+def _format_text_generation_stream_output(
+    byte_payload: bytes, details: bool
+) -> Optional[Union[str, TextGenerationStreamOutput]]:
+    if not byte_payload.startswith(b"data:"):
+        return None  # empty line
+
+    if byte_payload.strip() == b"data: [DONE]":
+        raise StopIteration("[DONE] signal received.")
+
+    # Decode payload
+    payload = byte_payload.decode("utf-8")
+    json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
+
+    # Either an error is being returned
+    if json_payload.get("error") is not None:
+        raise _parse_text_generation_error(json_payload["error"], json_payload.get("error_type"))
+
+    # Or parse token payload
+    output = TextGenerationStreamOutput.parse_obj_as_instance(json_payload)
+    return output.token.text if not details else output
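+
+
+# Illustrative sketch of the server-sent-event lines handled above (payload keys are
+# shortened for readability): lines that do not start with "data:" are skipped,
+# "data: [DONE]" ends the stream, and anything else is parsed as JSON.
+#
+#     b""                                             -> None (empty/keep-alive line)
+#     b"data: [DONE]"                                 -> StopIteration
+#     b'data: {"token": {"text": "Hello", ...}, ...}' -> "Hello" (or the full
+#                                                        TextGenerationStreamOutput if details=True)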
+
+
+def _stream_chat_completion_response(
+    bytes_lines: Iterable[bytes],
+) -> Iterable[ChatCompletionStreamOutput]:
+    """Used in `InferenceClient.chat_completion` if model is served with TGI."""
+    for item in bytes_lines:
+        try:
+            output = _format_chat_completion_stream_output(item)
+        except StopIteration:
+            break
+        if output is not None:
+            yield output
+
+
+async def _async_stream_chat_completion_response(
+    bytes_lines: AsyncIterable[bytes],
+) -> AsyncIterable[ChatCompletionStreamOutput]:
+    """Used in `AsyncInferenceClient.chat_completion`."""
+    async for item in bytes_lines:
+        try:
+            output = _format_chat_completion_stream_output(item)
+        except StopIteration:
+            break
+        if output is not None:
+            yield output
+
+
+def _format_chat_completion_stream_output(
+    byte_payload: bytes,
+) -> Optional[ChatCompletionStreamOutput]:
+    if not byte_payload.startswith(b"data:"):
+        return None  # empty line
+
+    if byte_payload.strip() == b"data: [DONE]":
+        raise StopIteration("[DONE] signal received.")
+
+    # Decode payload
+    payload = byte_payload.decode("utf-8")
+    json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
+
+    # Either an error is being returned
+    if json_payload.get("error") is not None:
+        raise _parse_text_generation_error(json_payload["error"], json_payload.get("error_type"))
+
+    # Or parse token payload
+    return ChatCompletionStreamOutput.parse_obj_as_instance(json_payload)
+
+
+async def _async_yield_from(client: "ClientSession", response: "ClientResponse") -> AsyncIterable[bytes]:
+    async for byte_payload in response.content:
+        yield byte_payload.strip()
+    await client.close()
+
+
+# "TGI servers" are servers running with the `text-generation-inference` backend.
+# This backend is the go-to solution to run large language models at scale. However,
+# for some smaller models (e.g. "gpt2") the default `transformers` + `api-inference`
+# solution is still in use.
+#
+# Both approaches have very similar APIs, but not exactly the same. What we do first in
+# the `text_generation` method is to assume the model is served via TGI. If we realize
+# it's not the case (i.e. we receive an HTTP 400 Bad Request), we fall back to the
+# default API with a warning message. When that's the case, we remember the unsupported
+# attributes for this model in the `_UNSUPPORTED_TEXT_GENERATION_KWARGS` global variable.
+#
+# In addition, TGI servers have a built-in API route for chat-completion, which is not
+# available on the default API. We use this route to provide a more consistent behavior
+# when available.
+#
+# For more details, see https://github.com/huggingface/text-generation-inference and
+# https://huggingface.co/docs/api-inference/detailed_parameters#text-generation-task.
+
+_UNSUPPORTED_TEXT_GENERATION_KWARGS: Dict[Optional[str], List[str]] = {}
+
+
+def _set_unsupported_text_generation_kwargs(model: Optional[str], unsupported_kwargs: List[str]) -> None:
+    _UNSUPPORTED_TEXT_GENERATION_KWARGS.setdefault(model, []).extend(unsupported_kwargs)
+
+
+def _get_unsupported_text_generation_kwargs(model: Optional[str]) -> List[str]:
+    return _UNSUPPORTED_TEXT_GENERATION_KWARGS.get(model, [])
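+
+
+# Illustrative sketch of the per-model cache above (the model id and kwargs are
+# placeholders):
+#
+#     >>> _set_unsupported_text_generation_kwargs("gpt2", ["watermark", "details"])
+#     >>> _get_unsupported_text_generation_kwargs("gpt2")
+#     ['watermark', 'details']
+#     >>> _get_unsupported_text_generation_kwargs("some-other-model")
+#     []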
+
+
+# TEXT GENERATION ERRORS
+# ----------------------
+# Text-generation errors are parsed separately to handle as much as possible the errors returned by the text generation
+# inference project (https://github.com/huggingface/text-generation-inference).
+# ----------------------
+
+
+def raise_text_generation_error(http_error: HTTPError) -> NoReturn:
+    """
+    Try to parse a text-generation-inference error message and raise the corresponding
+    `TextGenerationError`; fall back to re-raising the original HTTPError otherwise.
+
+    Args:
+        http_error (`HTTPError`):
+            The HTTPError that has been raised.
+    """
+    # Try to parse a Text Generation Inference error
+
+    try:
+        # Hacky way to retrieve payload in case of aiohttp error
+        payload = getattr(http_error, "response_error_payload", None) or http_error.response.json()
+        error = payload.get("error")
+        error_type = payload.get("error_type")
+    except Exception:  # no payload
+        raise http_error
+
+    # If error_type => more information than `hf_raise_for_status`
+    if error_type is not None:
+        exception = _parse_text_generation_error(error, error_type)
+        raise exception from http_error
+
+    # Otherwise, fallback to default error
+    raise http_error
+
+
+def _parse_text_generation_error(error: Optional[str], error_type: Optional[str]) -> TextGenerationError:
+    if error_type == "generation":
+        return GenerationError(error)  # type: ignore
+    if error_type == "incomplete_generation":
+        return IncompleteGenerationError(error)  # type: ignore
+    if error_type == "overloaded":
+        return OverloadedError(error)  # type: ignore
+    if error_type == "validation":
+        return ValidationError(error)  # type: ignore
+    return UnknownError(error)  # type: ignore
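+
+
+# Illustrative sketch of the mapping above (the error messages are placeholders):
+#
+#     >>> _parse_text_generation_error("Input is too long", "validation")   # -> ValidationError
+#     >>> _parse_text_generation_error("Model is overloaded", "overloaded") # -> OverloadedError
+#     >>> _parse_text_generation_error("Something went wrong", None)        # -> UnknownError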
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/__init__.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/__init__.py
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/_async_client.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/_async_client.py
new file mode 100644
index 00000000..df7bf4ea
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/_async_client.py
@@ -0,0 +1,3628 @@
+# coding=utf-8
+# Copyright 2023-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# WARNING
+# This entire file has been adapted from the sync-client code in `src/huggingface_hub/inference/_client.py`.
+# Any change in InferenceClient will be automatically reflected in AsyncInferenceClient.
+# To re-generate the code, run `make style` or `python ./utils/generate_async_inference_client.py --update`.
+# WARNING
+import asyncio
+import base64
+import logging
+import re
+import warnings
+from typing import TYPE_CHECKING, Any, AsyncIterable, Dict, List, Literal, Optional, Set, Union, overload
+
+from huggingface_hub import constants
+from huggingface_hub.errors import InferenceTimeoutError
+from huggingface_hub.inference._common import (
+    TASKS_EXPECTING_IMAGES,
+    ContentT,
+    ModelStatus,
+    RequestParameters,
+    _async_stream_chat_completion_response,
+    _async_stream_text_generation_response,
+    _b64_encode,
+    _b64_to_image,
+    _bytes_to_dict,
+    _bytes_to_image,
+    _bytes_to_list,
+    _get_unsupported_text_generation_kwargs,
+    _import_numpy,
+    _open_as_binary,
+    _set_unsupported_text_generation_kwargs,
+    raise_text_generation_error,
+)
+from huggingface_hub.inference._generated.types import (
+    AudioClassificationOutputElement,
+    AudioClassificationOutputTransform,
+    AudioToAudioOutputElement,
+    AutomaticSpeechRecognitionOutput,
+    ChatCompletionInputGrammarType,
+    ChatCompletionInputStreamOptions,
+    ChatCompletionInputTool,
+    ChatCompletionInputToolChoiceClass,
+    ChatCompletionInputToolChoiceEnum,
+    ChatCompletionOutput,
+    ChatCompletionStreamOutput,
+    DocumentQuestionAnsweringOutputElement,
+    FillMaskOutputElement,
+    ImageClassificationOutputElement,
+    ImageClassificationOutputTransform,
+    ImageSegmentationOutputElement,
+    ImageSegmentationSubtask,
+    ImageToImageTargetSize,
+    ImageToTextOutput,
+    ObjectDetectionOutputElement,
+    Padding,
+    QuestionAnsweringOutputElement,
+    SummarizationOutput,
+    SummarizationTruncationStrategy,
+    TableQuestionAnsweringOutputElement,
+    TextClassificationOutputElement,
+    TextClassificationOutputTransform,
+    TextGenerationInputGrammarType,
+    TextGenerationOutput,
+    TextGenerationStreamOutput,
+    TextToSpeechEarlyStoppingEnum,
+    TokenClassificationAggregationStrategy,
+    TokenClassificationOutputElement,
+    TranslationOutput,
+    TranslationTruncationStrategy,
+    VisualQuestionAnsweringOutputElement,
+    ZeroShotClassificationOutputElement,
+    ZeroShotImageClassificationOutputElement,
+)
+from huggingface_hub.inference._providers import PROVIDER_T, HFInferenceTask, get_provider_helper
+from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
+from huggingface_hub.utils._deprecation import _deprecate_arguments, _deprecate_method
+
+from .._common import _async_yield_from, _import_aiohttp
+
+
+if TYPE_CHECKING:
+    import numpy as np
+    from aiohttp import ClientResponse, ClientSession
+    from PIL.Image import Image
+
+logger = logging.getLogger(__name__)
+
+
+MODEL_KWARGS_NOT_USED_REGEX = re.compile(r"The following `model_kwargs` are not used by the model: \[(.*?)\]")
+
+
+class AsyncInferenceClient:
+    """
+    Initialize a new Inference Client.
+
+    [`InferenceClient`] aims to provide a unified experience to perform inference. The client can be used
+    seamlessly with either the (free) Inference API, self-hosted Inference Endpoints, or third-party Inference Providers.
+
+    Args:
+        model (`str`, `optional`):
+            The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `meta-llama/Meta-Llama-3-8B-Instruct`
+            or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
+            automatically selected for the task.
+            Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2
+            arguments are mutually exclusive. If using `base_url` for chat completion, the `/chat/completions` suffix
+            path will be appended to the base URL (see the [TGI Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api)
+            documentation for details). When passing a URL as `model`, the client will not append any suffix path to it.
+        provider (`str`, *optional*):
+            Name of the provider to use for inference. Can be `"black-forest-labs"`, `"cerebras"`, `"cohere"`, `"fal-ai"`, `"fireworks-ai"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"replicate"`, `"sambanova"` or `"together"`.
+            Defaults to `"hf-inference"` (Hugging Face Serverless Inference API).
+            If model is a URL or `base_url` is passed, then `provider` is not used.
+        token (`str` or `bool`, *optional*):
+            Hugging Face token. Will default to the locally saved token if not provided.
+            Pass `token=False` if you don't want to send your token to the server.
+            Note: for better compatibility with OpenAI's client, `token` has been aliased as `api_key`. Those 2
+            arguments are mutually exclusive and have the exact same behavior.
+        timeout (`float`, `optional`):
+            The maximum number of seconds to wait for a response from the server. Loading a new model in Inference
+            API can take up to several minutes. Defaults to None, meaning it will loop until the server is available.
+        headers (`Dict[str, str]`, `optional`):
+            Additional headers to send to the server. By default only the authorization and user-agent headers are sent.
+            Values in this dictionary will override the default values.
+        cookies (`Dict[str, str]`, `optional`):
+            Additional cookies to send to the server.
+        trust_env (`bool`, `optional`):
+            Trust environment settings for proxy configuration if the parameter is `True` (`False` by default).
+        proxies (`Any`, `optional`):
+            Proxies to use for the request.
+        base_url (`str`, `optional`):
+            Base URL to run inference. This is a duplicated argument from `model` to make [`InferenceClient`]
+            follow the same pattern as `openai.OpenAI` client. Cannot be used if `model` is set. Defaults to None.
+        api_key (`str`, `optional`):
+            Token to use for authentication. This is a duplicated argument from `token` to make [`InferenceClient`]
+            follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        model: Optional[str] = None,
+        *,
+        provider: Optional[PROVIDER_T] = None,
+        token: Optional[str] = None,
+        timeout: Optional[float] = None,
+        headers: Optional[Dict[str, str]] = None,
+        cookies: Optional[Dict[str, str]] = None,
+        trust_env: bool = False,
+        proxies: Optional[Any] = None,
+        # OpenAI compatibility
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+    ) -> None:
+        if model is not None and base_url is not None:
+            raise ValueError(
+                "Received both `model` and `base_url` arguments. Please provide only one of them."
+                " `base_url` is an alias for `model` to make the API compatible with OpenAI's client."
+                " If using `base_url` for chat completion, the `/chat/completions` suffix path will be appended to the base url."
+                " When passing a URL as `model`, the client will not append any suffix path to it."
+            )
+        if token is not None and api_key is not None:
+            raise ValueError(
+                "Received both `token` and `api_key` arguments. Please provide only one of them."
+                " `api_key` is an alias for `token` to make the API compatible with OpenAI's client."
+                " It has the exact same behavior as `token`."
+            )
+
+        self.model: Optional[str] = base_url or model
+        self.token: Optional[str] = token if token is not None else api_key
+        self.headers = headers if headers is not None else {}
+
+        # Configure provider
+        self.provider = provider if provider is not None else "hf-inference"
+
+        self.cookies = cookies
+        self.timeout = timeout
+        self.trust_env = trust_env
+        self.proxies = proxies
+
+        # Keep track of the sessions to close them properly
+        self._sessions: Dict["ClientSession", Set["ClientResponse"]] = dict()
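+
+    # Illustrative sketch (the endpoint URL and token are placeholders): `base_url` and
+    # `api_key` are aliases of `model` and `token`, so the two constructions below are
+    # equivalent.
+    #
+    #     >>> AsyncInferenceClient(model="https://my-endpoint.example", token="hf_***")
+    #     >>> AsyncInferenceClient(base_url="https://my-endpoint.example", api_key="hf_***")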
+
+    def __repr__(self):
+        return f"<InferenceClient(model='{self.model if self.model else ''}', timeout={self.timeout})>"
+
+    @overload
+    async def post(  # type: ignore[misc]
+        self,
+        *,
+        json: Optional[Union[str, Dict, List]] = None,
+        data: Optional[ContentT] = None,
+        model: Optional[str] = None,
+        task: Optional[str] = None,
+        stream: Literal[False] = ...,
+    ) -> bytes: ...
+
+    @overload
+    async def post(  # type: ignore[misc]
+        self,
+        *,
+        json: Optional[Union[str, Dict, List]] = None,
+        data: Optional[ContentT] = None,
+        model: Optional[str] = None,
+        task: Optional[str] = None,
+        stream: Literal[True] = ...,
+    ) -> AsyncIterable[bytes]: ...
+
+    @overload
+    async def post(
+        self,
+        *,
+        json: Optional[Union[str, Dict, List]] = None,
+        data: Optional[ContentT] = None,
+        model: Optional[str] = None,
+        task: Optional[str] = None,
+        stream: bool = False,
+    ) -> Union[bytes, AsyncIterable[bytes]]: ...
+
+    @_deprecate_method(
+        version="0.31.0",
+        message=(
+            "Making direct POST requests to the inference server is not supported anymore. "
+            "Please use task methods instead (e.g. `InferenceClient.chat_completion`). "
+            "If your use case is not supported, please open an issue in https://github.com/huggingface/huggingface_hub."
+        ),
+    )
+    async def post(
+        self,
+        *,
+        json: Optional[Union[str, Dict, List]] = None,
+        data: Optional[ContentT] = None,
+        model: Optional[str] = None,
+        task: Optional[str] = None,
+        stream: bool = False,
+    ) -> Union[bytes, AsyncIterable[bytes]]:
+        """
+        Make a POST request to the inference server.
+
+        This method is deprecated and will be removed in the future.
+        Please use task methods instead (e.g. `InferenceClient.chat_completion`).
+        """
+        if self.provider != "hf-inference":
+            raise ValueError(
+                "Cannot use `post` with another provider than `hf-inference`. "
+                "`InferenceClient.post` is deprecated and should not be used directly anymore."
+            )
+        provider_helper = HFInferenceTask(task or "unknown")
+        mapped_model = provider_helper._prepare_mapped_model(model or self.model)
+        url = provider_helper._prepare_url(self.token, mapped_model)  # type: ignore[arg-type]
+        headers = provider_helper._prepare_headers(self.headers, self.token)  # type: ignore[arg-type]
+        return await self._inner_post(
+            request_parameters=RequestParameters(
+                url=url,
+                task=task or "unknown",
+                model=model or "unknown",
+                json=json,
+                data=data,
+                headers=headers,
+            ),
+            stream=stream,
+        )
+
+    @overload
+    async def _inner_post(  # type: ignore[misc]
+        self, request_parameters: RequestParameters, *, stream: Literal[False] = ...
+    ) -> bytes: ...
+
+    @overload
+    async def _inner_post(  # type: ignore[misc]
+        self, request_parameters: RequestParameters, *, stream: Literal[True] = ...
+    ) -> AsyncIterable[bytes]: ...
+
+    @overload
+    async def _inner_post(
+        self, request_parameters: RequestParameters, *, stream: bool = False
+    ) -> Union[bytes, AsyncIterable[bytes]]: ...
+
+    async def _inner_post(
+        self, request_parameters: RequestParameters, *, stream: bool = False
+    ) -> Union[bytes, AsyncIterable[bytes]]:
+        """Make a request to the inference server."""
+
+        aiohttp = _import_aiohttp()
+
+        # TODO: this should be handled in provider helpers directly
+        if request_parameters.task in TASKS_EXPECTING_IMAGES and "Accept" not in request_parameters.headers:
+            request_parameters.headers["Accept"] = "image/png"
+
+        while True:
+            with _open_as_binary(request_parameters.data) as data_as_binary:
+                # Do not use context manager as we don't want to close the connection immediately when returning
+                # a stream
+                session = self._get_client_session(headers=request_parameters.headers)
+
+                try:
+                    response = await session.post(
+                        request_parameters.url, json=request_parameters.json, data=data_as_binary, proxy=self.proxies
+                    )
+                    response_error_payload = None
+                    if response.status != 200:
+                        try:
+                            response_error_payload = await response.json()  # get payload before connection closed
+                        except Exception:
+                            pass
+                    response.raise_for_status()
+                    if stream:
+                        return _async_yield_from(session, response)
+                    else:
+                        content = await response.read()
+                        await session.close()
+                        return content
+                except asyncio.TimeoutError as error:
+                    await session.close()
+                    # Convert any `TimeoutError` to an `InferenceTimeoutError`
+                    raise InferenceTimeoutError(f"Inference call timed out: {request_parameters.url}") from error  # type: ignore
+                except aiohttp.ClientResponseError as error:
+                    error.response_error_payload = response_error_payload
+                    await session.close()
+                    raise error
+                except Exception:
+                    await session.close()
+                    raise
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        await self.close()
+
+    def __del__(self):
+        if len(self._sessions) > 0:
+            warnings.warn(
+                "Deleting 'AsyncInferenceClient' client but some sessions are still open. "
+                "This can happen if you've stopped streaming data from the server before the stream was complete. "
+                "To close the client properly, you must call `await client.close()` "
+                "or use an async context (e.g. `async with AsyncInferenceClient(): ...`."
+            )
+
+    async def close(self):
+        """Close all open sessions.
+
+        By default, 'aiohttp.ClientSession' objects are closed automatically when a call is completed. However, if you
+        are streaming data from the server and you stop before the stream is complete, you must call this method to
+        close the session properly.
+
+        Another possibility is to use an async context (e.g. `async with AsyncInferenceClient(): ...`).
+        """
+        await asyncio.gather(*[session.close() for session in self._sessions.keys()])
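+
+    # Illustrative sketch (the `messages` variable is a placeholder): using the client
+    # as an async context manager guarantees that any session left open, e.g. after an
+    # interrupted stream, is closed on exit.
+    #
+    #     >>> async with AsyncInferenceClient() as client:
+    #     ...     async for token in await client.chat_completion(messages, stream=True):
+    #     ...         break  # stop early; remaining sessions are closed on __aexit__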
+
+    async def audio_classification(
+        self,
+        audio: ContentT,
+        *,
+        model: Optional[str] = None,
+        top_k: Optional[int] = None,
+        function_to_apply: Optional["AudioClassificationOutputTransform"] = None,
+    ) -> List[AudioClassificationOutputElement]:
+        """
+        Perform audio classification on the provided audio content.
+
+        Args:
+            audio (Union[str, Path, bytes, BinaryIO]):
+                The audio content to classify. It can be raw audio bytes, a local audio file, or a URL pointing to an
+                audio file.
+            model (`str`, *optional*):
+                The model to use for audio classification. Can be a model ID hosted on the Hugging Face Hub
+                or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for
+                audio classification will be used.
+            top_k (`int`, *optional*):
+                When specified, limits the output to the top K most probable classes.
+            function_to_apply (`"AudioClassificationOutputTransform"`, *optional*):
+                The function to apply to the model outputs in order to retrieve the scores.
+
+        Returns:
+            `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.audio_classification("audio.flac")
+        [
+            AudioClassificationOutputElement(score=0.4976358711719513, label='hap'),
+            AudioClassificationOutputElement(score=0.3677836060523987, label='neu'),
+            ...
+        ]
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="audio-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=audio,
+            parameters={"function_to_apply": function_to_apply, "top_k": top_k},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return AudioClassificationOutputElement.parse_obj_as_list(response)
+
+    async def audio_to_audio(
+        self,
+        audio: ContentT,
+        *,
+        model: Optional[str] = None,
+    ) -> List[AudioToAudioOutputElement]:
+        """
+        Performs multiple audio-to-audio tasks depending on the model (e.g. speech enhancement, source separation).
+
+        Args:
+            audio (Union[str, Path, bytes, BinaryIO]):
+                The audio content for the model. It can be raw audio bytes, a local audio file, or a URL pointing to an
+                audio file.
+            model (`str`, *optional*):
+                The model can be any model which takes an audio file and returns another audio file. Can be a model ID hosted on the Hugging Face Hub
+                or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for
+                audio_to_audio will be used.
+
+        Returns:
+            `List[AudioToAudioOutputElement]`: A list of [`AudioToAudioOutputElement`] items containing the audio label, content-type, and audio content as a blob.
+
+        Raises:
+            `InferenceTimeoutError`:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> audio_output = await client.audio_to_audio("audio.flac")
+        >>> for i, item in enumerate(audio_output):
+        ...     with open(f"output_{i}.flac", "wb") as f:
+        ...         f.write(item.blob)
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="audio-to-audio")
+        request_parameters = provider_helper.prepare_request(
+            inputs=audio,
+            parameters={},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        audio_output = AudioToAudioOutputElement.parse_obj_as_list(response)
+        for item in audio_output:
+            item.blob = base64.b64decode(item.blob)
+        return audio_output
+
+    async def automatic_speech_recognition(
+        self,
+        audio: ContentT,
+        *,
+        model: Optional[str] = None,
+        extra_body: Optional[Dict] = None,
+    ) -> AutomaticSpeechRecognitionOutput:
+        """
+        Perform automatic speech recognition (ASR or audio-to-text) on the given audio content.
+
+        Args:
+            audio (Union[str, Path, bytes, BinaryIO]):
+                The content to transcribe. It can be raw audio bytes, local audio file, or a URL to an audio file.
+            model (`str`, *optional*):
+                The model to use for ASR. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended model for ASR will be used.
+            extra_body (`Dict`, *optional*):
+                Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
+                for supported parameters.
+        Returns:
+            [`AutomaticSpeechRecognitionOutput`]: An item containing the transcribed text and optionally the timestamp chunks.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> (await client.automatic_speech_recognition("hello_world.flac")).text
+        "hello world"
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="automatic-speech-recognition")
+        request_parameters = provider_helper.prepare_request(
+            inputs=audio,
+            parameters={**(extra_body or {})},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return AutomaticSpeechRecognitionOutput.parse_obj_as_instance(response)
+
+    @overload
+    async def chat_completion(  # type: ignore
+        self,
+        messages: List[Dict],
+        *,
+        model: Optional[str] = None,
+        stream: Literal[False] = False,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
+        tool_prompt: Optional[str] = None,
+        tools: Optional[List[ChatCompletionInputTool]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        extra_body: Optional[Dict] = None,
+    ) -> ChatCompletionOutput: ...
+
+    @overload
+    async def chat_completion(  # type: ignore
+        self,
+        messages: List[Dict],
+        *,
+        model: Optional[str] = None,
+        stream: Literal[True] = True,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
+        tool_prompt: Optional[str] = None,
+        tools: Optional[List[ChatCompletionInputTool]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        extra_body: Optional[Dict] = None,
+    ) -> AsyncIterable[ChatCompletionStreamOutput]: ...
+
+    @overload
+    async def chat_completion(
+        self,
+        messages: List[Dict],
+        *,
+        model: Optional[str] = None,
+        stream: bool = False,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
+        tool_prompt: Optional[str] = None,
+        tools: Optional[List[ChatCompletionInputTool]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        extra_body: Optional[Dict] = None,
+    ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]: ...
+
+    async def chat_completion(
+        self,
+        messages: List[Dict],
+        *,
+        model: Optional[str] = None,
+        stream: bool = False,
+        # Parameters from ChatCompletionInput (handled manually)
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[ChatCompletionInputGrammarType] = None,
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stream_options: Optional[ChatCompletionInputStreamOptions] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None,
+        tool_prompt: Optional[str] = None,
+        tools: Optional[List[ChatCompletionInputTool]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        extra_body: Optional[Dict] = None,
+    ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]:
+        """
+        A method for completing conversations using a specified language model.
+
+        <Tip>
+
+        The `client.chat_completion` method is aliased as `client.chat.completions.create` for compatibility with OpenAI's client.
+        Inputs and outputs are strictly the same and using either syntax will yield the same results.
+        Check out the [Inference guide](https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility)
+        for more details about OpenAI's compatibility.
+
+        </Tip>
+
+        <Tip>
+        You can pass provider-specific parameters to the model by using the `extra_body` argument.
+        </Tip>
+
+        Args:
+            messages (List of [`ChatCompletionInputMessage`]):
+                Conversation history consisting of roles and content pairs.
+            model (`str`, *optional*):
+                The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used.
+                See https://huggingface.co/tasks/text-generation for more details.
+                If `model` is a model ID, it is passed to the server as the `model` parameter. If you want to define a
+                custom URL while setting `model` in the request payload, you must set `base_url` when initializing [`InferenceClient`].
+            frequency_penalty (`float`, *optional*):
+                Penalizes new tokens based on their existing frequency
+                in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0.
+            logit_bias (`List[float]`, *optional*):
+                Adjusts the likelihood of specific tokens appearing in the generated output.
+            logprobs (`bool`, *optional*):
+                Whether to return log probabilities of the output tokens or not. If true, returns the log
+                probabilities of each output token returned in the content of message.
+            max_tokens (`int`, *optional*):
+                Maximum number of tokens allowed in the response. Defaults to 100.
+            n (`int`, *optional*):
+                The number of completions to generate for each prompt.
+            presence_penalty (`float`, *optional*):
+                Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the
+                text so far, increasing the model's likelihood to talk about new topics.
+            response_format ([`ChatCompletionInputGrammarType`], *optional*):
+                Grammar constraints. Can be either a JSONSchema or a regex.
+            seed (Optional[`int`], *optional*):
+                Seed for reproducible control flow. Defaults to None.
+            stop (`List[str]`, *optional*):
+                Up to four strings which trigger the end of the response.
+                Defaults to None.
+            stream (`bool`, *optional*):
+                Enable realtime streaming of responses. Defaults to False.
+            stream_options ([`ChatCompletionInputStreamOptions`], *optional*):
+                Options for streaming completions.
+            temperature (`float`, *optional*):
+                Controls randomness of the generations. Lower values ensure
+                less random completions. Range: [0, 2]. Defaults to 1.0.
+            top_logprobs (`int`, *optional*):
+                An integer between 0 and 5 specifying the number of most likely tokens to return at each token
+                position, each with an associated log probability. logprobs must be set to true if this parameter is
+                used.
+            top_p (`float`, *optional*):
+                Fraction of the most likely next words to sample from.
+                Must be between 0 and 1. Defaults to 1.0.
+            tool_choice ([`ChatCompletionInputToolChoiceClass`] or [`ChatCompletionInputToolChoiceEnum`], *optional*):
+                The tool to use for the completion. Defaults to "auto".
+            tool_prompt (`str`, *optional*):
+                A prompt to be appended before the tools.
+            tools (List of [`ChatCompletionInputTool`], *optional*):
+                A list of tools the model may call. Currently, only functions are supported as a tool. Use this to
+                provide a list of functions the model may generate JSON inputs for.
+            extra_body (`Dict`, *optional*):
+                Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
+                for supported parameters.
+        Returns:
+            [`ChatCompletionOutput`] or Iterable of [`ChatCompletionStreamOutput`]:
+            Generated text returned from the server:
+            - if `stream=False`, the generated text is returned as a [`ChatCompletionOutput`] (default).
+            - if `stream=True`, the generated text is returned token by token as a sequence of [`ChatCompletionStreamOutput`].
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
+        >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
+        >>> await client.chat_completion(messages, max_tokens=100)
+        ChatCompletionOutput(
+            choices=[
+                ChatCompletionOutputComplete(
+                    finish_reason='eos_token',
+                    index=0,
+                    message=ChatCompletionOutputMessage(
+                        role='assistant',
+                        content='The capital of France is Paris.',
+                        name=None,
+                        tool_calls=None
+                    ),
+                    logprobs=None
+                )
+            ],
+            created=1719907176,
+            id='',
+            model='meta-llama/Meta-Llama-3-8B-Instruct',
+            object='text_completion',
+            system_fingerprint='2.0.4-sha-f426a33',
+            usage=ChatCompletionOutputUsage(
+                completion_tokens=8,
+                prompt_tokens=17,
+                total_tokens=25
+            )
+        )
+        ```
+
+        Example using streaming:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
+        >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
+        >>> async for token in await client.chat_completion(messages, max_tokens=10, stream=True):
+        ...     print(token)
+        ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+        ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+        (...)
+        ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504)
+        ```
+
+        Example using OpenAI's syntax:
+        ```py
+        # Must be run in an async context
+        # instead of `from openai import OpenAI`
+        from huggingface_hub import AsyncInferenceClient
+
+        # instead of `client = OpenAI(...)`
+        client = AsyncInferenceClient(
+            base_url=...,
+            api_key=...,
+        )
+
+        output = await client.chat.completions.create(
+            model="meta-llama/Meta-Llama-3-8B-Instruct",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Count to 10"},
+            ],
+            stream=True,
+            max_tokens=1024,
+        )
+
+        async for chunk in output:
+            print(chunk.choices[0].delta.content)
+        ```
+
+        Example using a third-party provider directly with extra (provider-specific) parameters. Usage will be billed on your Together AI account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="together",  # Use Together AI provider
+        ...     api_key="<together_api_key>",  # Pass your Together API key directly
+        ... )
+        >>> client.chat_completion(
+        ...     model="meta-llama/Meta-Llama-3-8B-Instruct",
+        ...     messages=[{"role": "user", "content": "What is the capital of France?"}],
+        ...     extra_body={"safety_model": "Meta-Llama/Llama-Guard-7b"},
+        ... )
+        ```
+
+        Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="sambanova",  # Use Sambanova provider
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> client.chat_completion(
+        ...     model="meta-llama/Meta-Llama-3-8B-Instruct",
+        ...     messages=[{"role": "user", "content": "What is the capital of France?"}],
+        ... )
+        ```
+
+        Example using Image + Text as input:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+
+        # provide a remote URL
+        >>> image_url ="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        # or a base64-encoded image
+        >>> image_path = "/path/to/image.jpeg"
+        >>> with open(image_path, "rb") as f:
+        ...     base64_image = base64.b64encode(f.read()).decode("utf-8")
+        >>> image_url = f"data:image/jpeg;base64,{base64_image}"
+
+        >>> client = AsyncInferenceClient("meta-llama/Llama-3.2-11B-Vision-Instruct")
+        >>> output = await client.chat.completions.create(
+        ...     messages=[
+        ...         {
+        ...             "role": "user",
+        ...             "content": [
+        ...                 {
+        ...                     "type": "image_url",
+        ...                     "image_url": {"url": image_url},
+        ...                 },
+        ...                 {
+        ...                     "type": "text",
+        ...                     "text": "Describe this image in one sentence.",
+        ...                 },
+        ...             ],
+        ...         },
+        ...     ],
+        ... )
+        >>> output.choices[0].message.content
+        'The image depicts the iconic Statue of Liberty situated in New York Harbor, New York, on a clear day.'
+        ```
+
+        Example using tools:
+        ```py
+        # Must be run in an async context
+        >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+        >>> messages = [
+        ...     {
+        ...         "role": "system",
+        ...         "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.",
+        ...     },
+        ...     {
+        ...         "role": "user",
+        ...         "content": "What's the weather like the next 3 days in San Francisco, CA?",
+        ...     },
+        ... ]
+        >>> tools = [
+        ...     {
+        ...         "type": "function",
+        ...         "function": {
+        ...             "name": "get_current_weather",
+        ...             "description": "Get the current weather",
+        ...             "parameters": {
+        ...                 "type": "object",
+        ...                 "properties": {
+        ...                     "location": {
+        ...                         "type": "string",
+        ...                         "description": "The city and state, e.g. San Francisco, CA",
+        ...                     },
+        ...                     "format": {
+        ...                         "type": "string",
+        ...                         "enum": ["celsius", "fahrenheit"],
+        ...                         "description": "The temperature unit to use. Infer this from the users location.",
+        ...                     },
+        ...                 },
+        ...                 "required": ["location", "format"],
+        ...             },
+        ...         },
+        ...     },
+        ...     {
+        ...         "type": "function",
+        ...         "function": {
+        ...             "name": "get_n_day_weather_forecast",
+        ...             "description": "Get an N-day weather forecast",
+        ...             "parameters": {
+        ...                 "type": "object",
+        ...                 "properties": {
+        ...                     "location": {
+        ...                         "type": "string",
+        ...                         "description": "The city and state, e.g. San Francisco, CA",
+        ...                     },
+        ...                     "format": {
+        ...                         "type": "string",
+        ...                         "enum": ["celsius", "fahrenheit"],
+        ...                         "description": "The temperature unit to use. Infer this from the users location.",
+        ...                     },
+        ...                     "num_days": {
+        ...                         "type": "integer",
+        ...                         "description": "The number of days to forecast",
+        ...                     },
+        ...                 },
+        ...                 "required": ["location", "format", "num_days"],
+        ...             },
+        ...         },
+        ...     },
+        ... ]
+
+        >>> response = await client.chat_completion(
+        ...     model="meta-llama/Meta-Llama-3-70B-Instruct",
+        ...     messages=messages,
+        ...     tools=tools,
+        ...     tool_choice="auto",
+        ...     max_tokens=500,
+        ... )
+        >>> response.choices[0].message.tool_calls[0].function
+        ChatCompletionOutputFunctionDefinition(
+            arguments={
+                'location': 'San Francisco, CA',
+                'format': 'fahrenheit',
+                'num_days': 3
+            },
+            name='get_n_day_weather_forecast',
+            description=None
+        )
+        ```
+
+        Example using response_format:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?",
+        ...     },
+        ... ]
+        >>> response_format = {
+        ...     "type": "json",
+        ...     "value": {
+        ...         "properties": {
+        ...             "location": {"type": "string"},
+        ...             "activity": {"type": "string"},
+        ...             "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5},
+        ...             "animals": {"type": "array", "items": {"type": "string"}},
+        ...         },
+        ...         "required": ["location", "activity", "animals_seen", "animals"],
+        ...     },
+        ... }
+        >>> response = await client.chat_completion(
+        ...     messages=messages,
+        ...     response_format=response_format,
+        ...     max_tokens=500,
+        ... )
+        >>> response.choices[0].message.content
+        '{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}'
+        ```
+        """
+        # Get the provider helper
+        provider_helper = get_provider_helper(self.provider, task="conversational")
+
+        # Since `chat_completion(..., model=xxx)` is also a payload parameter for the server, we need to handle 'model' differently.
+        # `self.model` takes precedence over 'model' argument for building URL.
+        # `model` takes precedence for payload value.
+        model_id_or_url = self.model or model
+        payload_model = model or self.model
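+        # e.g. with `AsyncInferenceClient(model="http://localhost:8080")` and `chat_completion(..., model="tgi")`,
+        # the URL is used to route the request while "tgi" is sent as the payload's `model` field.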
+
+        # Prepare the payload
+        parameters = {
+            "model": payload_model,
+            "frequency_penalty": frequency_penalty,
+            "logit_bias": logit_bias,
+            "logprobs": logprobs,
+            "max_tokens": max_tokens,
+            "n": n,
+            "presence_penalty": presence_penalty,
+            "response_format": response_format,
+            "seed": seed,
+            "stop": stop,
+            "temperature": temperature,
+            "tool_choice": tool_choice,
+            "tool_prompt": tool_prompt,
+            "tools": tools,
+            "top_logprobs": top_logprobs,
+            "top_p": top_p,
+            "stream": stream,
+            "stream_options": stream_options,
+            **(extra_body or {}),
+        }
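+        # Note: `extra_body` is unpacked last, so provider-specific fields (and any overlapping keys)
+        # take precedence over the named arguments above.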
+        request_parameters = provider_helper.prepare_request(
+            inputs=messages,
+            parameters=parameters,
+            headers=self.headers,
+            model=model_id_or_url,
+            api_key=self.token,
+        )
+        data = await self._inner_post(request_parameters, stream=stream)
+
+        if stream:
+            return _async_stream_chat_completion_response(data)  # type: ignore[arg-type]
+
+        return ChatCompletionOutput.parse_obj_as_instance(data)  # type: ignore[arg-type]
+
+    async def document_question_answering(
+        self,
+        image: ContentT,
+        question: str,
+        *,
+        model: Optional[str] = None,
+        doc_stride: Optional[int] = None,
+        handle_impossible_answer: Optional[bool] = None,
+        lang: Optional[str] = None,
+        max_answer_len: Optional[int] = None,
+        max_question_len: Optional[int] = None,
+        max_seq_len: Optional[int] = None,
+        top_k: Optional[int] = None,
+        word_boxes: Optional[List[Union[List[float], str]]] = None,
+    ) -> List[DocumentQuestionAnsweringOutputElement]:
+        """
+        Answer questions on document images.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The input image for the context. It can be raw bytes, an image file, or a URL to an online image.
+            question (`str`):
+                Question to be answered.
+            model (`str`, *optional*):
+                The model to use for the document question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended document question answering model will be used.
+                Defaults to None.
+            doc_stride (`int`, *optional*):
+                If the words in the document are too long to fit with the question for the model, it will be split in
+                several chunks with some overlap. This argument controls the size of that overlap.
+            handle_impossible_answer (`bool`, *optional*):
+                Whether to accept impossible as an answer.
+            lang (`str`, *optional*):
+                Language to use while running OCR. Defaults to English.
+            max_answer_len (`int`, *optional*):
+                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+            max_question_len (`int`, *optional*):
+                The maximum length of the question after tokenization. It will be truncated if needed.
+            max_seq_len (`int`, *optional*):
+                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+                model. The context will be split in several chunks (using doc_stride as overlap) if needed.
+            top_k (`int`, *optional*):
+                The number of answers to return (will be chosen by order of likelihood). Can return fewer than top_k
+                answers if there are not enough options available within the context.
+            word_boxes (`List[Union[List[float], str]]`, *optional*):
+                A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR
+                step and use the provided bounding boxes instead.
+        Returns:
+            `List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
+        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16)]
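+        # `top_k` can be passed to request several candidate answers (illustrative call, output omitted):
+        >>> await client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?", top_k=3)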
+        ```
+        """
+        inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
+        provider_helper = get_provider_helper(self.provider, task="document-question-answering")
+        request_parameters = provider_helper.prepare_request(
+            inputs=inputs,
+            parameters={
+                "doc_stride": doc_stride,
+                "handle_impossible_answer": handle_impossible_answer,
+                "lang": lang,
+                "max_answer_len": max_answer_len,
+                "max_question_len": max_question_len,
+                "max_seq_len": max_seq_len,
+                "top_k": top_k,
+                "word_boxes": word_boxes,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response)
+
+    async def feature_extraction(
+        self,
+        text: str,
+        *,
+        normalize: Optional[bool] = None,
+        prompt_name: Optional[str] = None,
+        truncate: Optional[bool] = None,
+        truncation_direction: Optional[Literal["Left", "Right"]] = None,
+        model: Optional[str] = None,
+    ) -> "np.ndarray":
+        """
+        Generate embeddings for a given text.
+
+        Args:
+            text (`str`):
+                The text to embed.
+            model (`str`, *optional*):
+                The model to use for the feature extraction task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended feature extraction model will be used.
+                Defaults to None.
+            normalize (`bool`, *optional*):
+                Whether to normalize the embeddings or not.
+                Only available on servers powered by Text-Embedding-Inference.
+            prompt_name (`str`, *optional*):
+                The name of the prompt that should be used for encoding. If not set, no prompt will be applied.
+                Must be a key in the `Sentence Transformers` configuration `prompts` dictionary.
+                For example, if ``prompt_name`` is "query" and ``prompts`` is {"query": "query: ", ...},
+                then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?"
+                because the prompt text will be prepended to any text to encode.
+            truncate (`bool`, *optional*):
+                Whether to truncate the input text or not.
+                Only available on servers powered by Text-Embedding-Inference.
+            truncation_direction (`Literal["Left", "Right"]`, *optional*):
+                Which side of the input should be truncated when `truncate=True` is passed.
+
+        Returns:
+            `np.ndarray`: The embedding representing the input text as a float32 numpy array.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.feature_extraction("Hi, who are you?")
+        array([[ 2.424802  ,  2.93384   ,  1.1750331 , ...,  1.240499, -0.13776633, -0.7889173 ],
+        [-0.42943227, -0.6364878 , -1.693462  , ...,  0.41978157, -2.4336355 ,  0.6162071 ],
+        ...,
+        [ 0.28552425, -0.928395  , -1.2077185 , ...,  0.76810825, -2.1069427 ,  0.6236161 ]], dtype=float32)
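+        # On servers powered by Text-Embedding-Inference, the `normalize` flag can be passed as well (illustrative call):
+        >>> await client.feature_extraction("Hi, who are you?", normalize=True)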
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="feature-extraction")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "normalize": normalize,
+                "prompt_name": prompt_name,
+                "truncate": truncate,
+                "truncation_direction": truncation_direction,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        np = _import_numpy()
+        return np.array(_bytes_to_dict(response), dtype="float32")
+
+    async def fill_mask(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        targets: Optional[List[str]] = None,
+        top_k: Optional[int] = None,
+    ) -> List[FillMaskOutputElement]:
+        """
+        Fill in a hole with a missing word (token to be precise).
+
+        Args:
+            text (`str`):
+                a string to be filled in, must contain the [MASK] token (check the model card for the exact name of the mask).
+            model (`str`, *optional*):
+                The model to use for the fill mask task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended fill mask model will be used.
+            targets (`List[str]`, *optional*):
+                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
+                vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first
+                resulting token will be used (with a warning, and that might be slower).
+            top_k (`int`, *optional*):
+                When passed, overrides the number of predictions to return.
+        Returns:
+            `List[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
+            probability, token reference, and completed text.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.fill_mask("The goal of life is <mask>.")
+        [
+            FillMaskOutputElement(score=0.06897063553333282, token=11098, token_str=' happiness', sequence='The goal of life is happiness.'),
+            FillMaskOutputElement(score=0.06554922461509705, token=45075, token_str=' immortality', sequence='The goal of life is immortality.')
+        ]
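+        # `targets` restricts scoring to candidate tokens (illustrative call, output omitted):
+        >>> await client.fill_mask("The goal of life is <mask>.", targets=[" happiness", " immortality"])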
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="fill-mask")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={"targets": targets, "top_k": top_k},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return FillMaskOutputElement.parse_obj_as_list(response)
+
+    async def image_classification(
+        self,
+        image: ContentT,
+        *,
+        model: Optional[str] = None,
+        function_to_apply: Optional["ImageClassificationOutputTransform"] = None,
+        top_k: Optional[int] = None,
+    ) -> List[ImageClassificationOutputElement]:
+        """
+        Perform image classification on the given image using the specified model.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The image to classify. It can be raw bytes, an image file, or a URL to an online image.
+            model (`str`, *optional*):
+                The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
+                deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
+            function_to_apply (`"ImageClassificationOutputTransform"`, *optional*):
+                The function to apply to the model outputs in order to retrieve the scores.
+            top_k (`int`, *optional*):
+                When specified, limits the output to the top K most probable classes.
+        Returns:
+            `List[ImageClassificationOutputElement]`: a list of [`ImageClassificationOutputElement`] items containing the predicted label and associated probability.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.image_classification("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg")
+        [ImageClassificationOutputElement(label='Blenheim spaniel', score=0.9779096841812134), ...]
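+        # Limit the output to the 3 most probable classes (illustrative call):
+        >>> await client.image_classification("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg", top_k=3)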
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="image-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={"function_to_apply": function_to_apply, "top_k": top_k},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return ImageClassificationOutputElement.parse_obj_as_list(response)
+
+    async def image_segmentation(
+        self,
+        image: ContentT,
+        *,
+        model: Optional[str] = None,
+        mask_threshold: Optional[float] = None,
+        overlap_mask_area_threshold: Optional[float] = None,
+        subtask: Optional["ImageSegmentationSubtask"] = None,
+        threshold: Optional[float] = None,
+    ) -> List[ImageSegmentationOutputElement]:
+        """
+        Perform image segmentation on the given image using the specified model.
+
+        <Tip warning={true}>
+
+        You must have `PIL` installed if you want to work with images (`pip install Pillow`).
+
+        </Tip>
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The image to segment. It can be raw bytes, an image file, or a URL to an online image.
+            model (`str`, *optional*):
+                The model to use for image segmentation. Can be a model ID hosted on the Hugging Face Hub or a URL to a
+                deployed Inference Endpoint. If not provided, the default recommended model for image segmentation will be used.
+            mask_threshold (`float`, *optional*):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*):
+                Mask overlap threshold to eliminate small, disconnected segments.
+            subtask (`"ImageSegmentationSubtask"`, *optional*):
+                Segmentation task to be performed, depending on model capabilities.
+            threshold (`float`, *optional*):
+                Probability threshold to filter out predicted masks.
+        Returns:
+            `List[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.image_segmentation("cat.jpg")
+        [ImageSegmentationOutputElement(score=0.989008, label='LABEL_184', mask=<PIL.PngImagePlugin.PngImageFile image mode=L size=400x300 at 0x7FDD2B129CC0>), ...]
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="audio-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={
+                "mask_threshold": mask_threshold,
+                "overlap_mask_area_threshold": overlap_mask_area_threshold,
+                "subtask": subtask,
+                "threshold": threshold,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        output = ImageSegmentationOutputElement.parse_obj_as_list(response)
+        for item in output:
+            item.mask = _b64_to_image(item.mask)  # type: ignore [assignment]
+        return output
+
+    async def image_to_image(
+        self,
+        image: ContentT,
+        prompt: Optional[str] = None,
+        *,
+        negative_prompt: Optional[str] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        model: Optional[str] = None,
+        target_size: Optional[ImageToImageTargetSize] = None,
+        **kwargs,
+    ) -> "Image":
+        """
+        Perform image-to-image translation using a specified model.
+
+        <Tip warning={true}>
+
+        You must have `PIL` installed if you want to work with images (`pip install Pillow`).
+
+        </Tip>
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The input image for translation. It can be raw bytes, an image file, or a URL to an online image.
+            prompt (`str`, *optional*):
+                The text prompt to guide the image generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
+            num_inference_steps (`int`, *optional*):
+                For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
+                quality image at the expense of slower inference.
+            guidance_scale (`float`, *optional*):
+                For diffusion models. A higher guidance scale value encourages the model to generate images closely
+                linked to the text prompt at the expense of lower image quality.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+            target_size (`ImageToImageTargetSize`, *optional*):
+                The size in pixels of the output image.
+
+        Returns:
+            `Image`: The translated image.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> image = await client.image_to_image("cat.jpg", prompt="turn the cat into a tiger")
+        >>> image.save("tiger.jpg")
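+        # Diffusion-specific parameters from the signature can be tuned as well (illustrative values):
+        >>> image = await client.image_to_image("cat.jpg", prompt="turn the cat into a tiger", num_inference_steps=30, guidance_scale=7.5)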
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="image-to-image")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={
+                "prompt": prompt,
+                "negative_prompt": negative_prompt,
+                "target_size": target_size,
+                "num_inference_steps": num_inference_steps,
+                "guidance_scale": guidance_scale,
+                **kwargs,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return _bytes_to_image(response)
+
+    async def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
+        """
+        Takes an input image and returns text.
+
+        Models can have very different outputs depending on your use case (image captioning, optical character recognition
+        (OCR), Pix2Struct, etc). Please have a look at the model card to learn more about a model's specificities.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The input image to caption. It can be raw bytes, an image file, or a URL to an online image.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+        Returns:
+            [`ImageToTextOutput`]: The generated text.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.image_to_text("cat.jpg")
+        'a cat standing in a grassy field '
+        >>> await client.image_to_text("https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg")
+        'a dog laying on the grass next to a flower pot '
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="image-to-text")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        output = ImageToTextOutput.parse_obj(response)
+        return output[0] if isinstance(output, list) else output
+
+    async def object_detection(
+        self, image: ContentT, *, model: Optional[str] = None, threshold: Optional[float] = None
+    ) -> List[ObjectDetectionOutputElement]:
+        """
+        Perform object detection on the given image using the specified model.
+
+        <Tip warning={true}>
+
+        You must have `PIL` installed if you want to work with images (`pip install Pillow`).
+
+        </Tip>
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The image to detect objects on. It can be raw bytes, an image file, or a URL to an online image.
+            model (`str`, *optional*):
+                The model to use for object detection. Can be a model ID hosted on the Hugging Face Hub or a URL to a
+                deployed Inference Endpoint. If not provided, the default recommended model for object detection (DETR) will be used.
+            threshold (`float`, *optional*):
+                The probability necessary to make a prediction.
+        Returns:
+            `List[ObjectDetectionOutputElement]`: A list of [`ObjectDetectionOutputElement`] items containing the bounding boxes and associated attributes.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+            `ValueError`:
+                If the request output is not a List.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.object_detection("people.jpg")
+        [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...]
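+        # Keep only detections above a given confidence (illustrative value):
+        >>> await client.object_detection("people.jpg", threshold=0.8)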
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="object-detection")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={"threshold": threshold},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return ObjectDetectionOutputElement.parse_obj_as_list(response)
+
+    async def question_answering(
+        self,
+        question: str,
+        context: str,
+        *,
+        model: Optional[str] = None,
+        align_to_words: Optional[bool] = None,
+        doc_stride: Optional[int] = None,
+        handle_impossible_answer: Optional[bool] = None,
+        max_answer_len: Optional[int] = None,
+        max_question_len: Optional[int] = None,
+        max_seq_len: Optional[int] = None,
+        top_k: Optional[int] = None,
+    ) -> Union[QuestionAnsweringOutputElement, List[QuestionAnsweringOutputElement]]:
+        """
+        Retrieve the answer to a question from a given text.
+
+        Args:
+            question (`str`):
+                Question to be answered.
+            context (`str`):
+                The context of the question.
+            model (`str`):
+                The model to use for the question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint.
+            align_to_words (`bool`, *optional*):
+                Attempts to align the answer to real words. Improves quality on space-separated languages. Might hurt
+                on non-space-separated languages (like Japanese or Chinese).
+            doc_stride (`int`, *optional*):
+                If the context is too long to fit with the question for the model, it will be split in several chunks
+                with some overlap. This argument controls the size of that overlap.
+            handle_impossible_answer (`bool`, *optional*):
+                Whether to accept impossible as an answer.
+            max_answer_len (`int`, *optional*):
+                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+            max_question_len (`int`, *optional*):
+                The maximum length of the question after tokenization. It will be truncated if needed.
+            max_seq_len (`int`, *optional*):
+                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+                model. The context will be split in several chunks (using doc_stride as overlap) if needed.
+            top_k (`int`, *optional*):
+                The number of answers to return (will be chosen by order of likelihood). Note that fewer than top_k
+                answers are returned if there are not enough options available within the context.
+
+        Returns:
+            Union[`QuestionAnsweringOutputElement`, List[`QuestionAnsweringOutputElement`]]:
+                When top_k is 1 or not provided, it returns a single `QuestionAnsweringOutputElement`.
+                When top_k is greater than 1, it returns a list of `QuestionAnsweringOutputElement`.
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.question_answering(question="What's my name?", context="My name is Clara and I live in Berkeley.")
+        QuestionAnsweringOutputElement(answer='Clara', end=16, score=0.9326565265655518, start=11)
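+        # With `top_k` greater than 1, a list of candidate answers is returned instead (illustrative call, output omitted):
+        >>> await client.question_answering(question="What's my name?", context="My name is Clara and I live in Berkeley.", top_k=2)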
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="question-answering")
+        request_parameters = provider_helper.prepare_request(
+            inputs=None,
+            parameters={
+                "align_to_words": align_to_words,
+                "doc_stride": doc_stride,
+                "handle_impossible_answer": handle_impossible_answer,
+                "max_answer_len": max_answer_len,
+                "max_question_len": max_question_len,
+                "max_seq_len": max_seq_len,
+                "top_k": top_k,
+            },
+            extra_payload={"question": question, "context": context},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        # Parse the response as a single `QuestionAnsweringOutputElement` when top_k is 1 or not provided, or a list of `QuestionAnsweringOutputElement` to ensure backward compatibility.
+        output = QuestionAnsweringOutputElement.parse_obj(response)
+        return output
+
+    async def sentence_similarity(
+        self, sentence: str, other_sentences: List[str], *, model: Optional[str] = None
+    ) -> List[float]:
+        """
+        Compute the semantic similarity between a sentence and a list of other sentences by comparing their embeddings.
+
+        Args:
+            sentence (`str`):
+                The main sentence to compare to others.
+            other_sentences (`List[str]`):
+                The list of sentences to compare to.
+            model (`str`, *optional*):
+                The model to use for the sentence similarity task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended sentence similarity model will be used.
+                Defaults to None.
+
+        Returns:
+            `List[float]`: a list of similarity scores, one for each sentence in `other_sentences`.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.sentence_similarity(
+        ...     "Machine learning is so easy.",
+        ...     other_sentences=[
+        ...         "Deep learning is so straightforward.",
+        ...         "This is so difficult, like rocket science.",
+        ...         "I can't believe how much I struggled with this.",
+        ...     ],
+        ... )
+        [0.7785726189613342, 0.45876261591911316, 0.2906220555305481]
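+        # Each score corresponds to the sentence at the same position in `other_sentences` (higher means more similar).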
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="sentence-similarity")
+        request_parameters = provider_helper.prepare_request(
+            inputs=None,
+            parameters={},
+            extra_payload={"source_sentence": sentence, "sentences": other_sentences},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return _bytes_to_list(response)
+
+    async def summarization(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        generate_parameters: Optional[Dict[str, Any]] = None,
+        truncation: Optional["SummarizationTruncationStrategy"] = None,
+    ) -> SummarizationOutput:
+        """
+        Generate a summary of a given text using a specified model.
+
+        Args:
+            text (`str`):
+                The input text to summarize.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended model for summarization will be used.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether to clean up the potential extra spaces in the text output.
+            generate_parameters (`Dict[str, Any]`, *optional*):
+                Additional parametrization of the text generation algorithm.
+            truncation (`"SummarizationTruncationStrategy"`, *optional*):
+                The truncation strategy to use.
+        Returns:
+            [`SummarizationOutput`]: The generated summary text.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.summarization("The Eiffel tower...")
+        SummarizationOutput(generated_text="The Eiffel tower is one of the most famous landmarks in the world....")
+        ```
+        """
+        parameters = {
+            "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
+            "generate_parameters": generate_parameters,
+            "truncation": truncation,
+        }
+        provider_helper = get_provider_helper(self.provider, task="summarization")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters=parameters,
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return SummarizationOutput.parse_obj_as_list(response)[0]
+
+    async def table_question_answering(
+        self,
+        table: Dict[str, Any],
+        query: str,
+        *,
+        model: Optional[str] = None,
+        padding: Optional["Padding"] = None,
+        sequential: Optional[bool] = None,
+        truncation: Optional[bool] = None,
+    ) -> TableQuestionAnsweringOutputElement:
+        """
+        Retrieve the answer to a question from information given in a table.
+
+        Args:
+            table (`Dict[str, Any]`):
+                A table of data represented as a dict of lists where entries are headers and the lists are all the
+                values, all lists must have the same size.
+            query (`str`):
+                The query in plain text that you want to ask the table.
+            model (`str`):
+                The model to use for the table-question-answering task. Can be a model ID hosted on the Hugging Face
+                Hub or a URL to a deployed Inference Endpoint.
+            padding (`"Padding"`, *optional*):
+                Activates and controls padding.
+            sequential (`bool`, *optional*):
+                Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
+                inference to be done sequentially to extract relations within sequences, given their conversational
+                nature.
+            truncation (`bool`, *optional*):
+                Activates and controls truncation.
+
+        Returns:
+            [`TableQuestionAnsweringOutputElement`]: a table question answering output containing the answer, coordinates, cells and the aggregator used.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> query = "How many stars does the transformers repository have?"
+        >>> table = {"Repository": ["Transformers", "Datasets", "Tokenizers"], "Stars": ["36542", "4512", "3934"]}
+        >>> await client.table_question_answering(table, query, model="google/tapas-base-finetuned-wtq")
+        TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE')
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="table-question-answering")
+        request_parameters = provider_helper.prepare_request(
+            inputs=None,
+            parameters={"model": model, "padding": padding, "sequential": sequential, "truncation": truncation},
+            extra_payload={"query": query, "table": table},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return TableQuestionAnsweringOutputElement.parse_obj_as_instance(response)
+
+    async def tabular_classification(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[str]:
+        """
+        Classify a target category (a group) based on a set of attributes.
+
+        Args:
+            table (`Dict[str, Any]`):
+                Set of attributes to classify.
+            model (`str`, *optional*):
+                The model to use for the tabular classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended tabular classification model will be used.
+                Defaults to None.
+
+        Returns:
+            `List`: a list of labels, one per row in the initial table.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> table = {
+        ...     "fixed_acidity": ["7.4", "7.8", "10.3"],
+        ...     "volatile_acidity": ["0.7", "0.88", "0.32"],
+        ...     "citric_acid": ["0", "0", "0.45"],
+        ...     "residual_sugar": ["1.9", "2.6", "6.4"],
+        ...     "chlorides": ["0.076", "0.098", "0.073"],
+        ...     "free_sulfur_dioxide": ["11", "25", "5"],
+        ...     "total_sulfur_dioxide": ["34", "67", "13"],
+        ...     "density": ["0.9978", "0.9968", "0.9976"],
+        ...     "pH": ["3.51", "3.2", "3.23"],
+        ...     "sulphates": ["0.56", "0.68", "0.82"],
+        ...     "alcohol": ["9.4", "9.8", "12.6"],
+        ... }
+        >>> await client.tabular_classification(table=table, model="julien-c/wine-quality")
+        ["5", "5", "5"]
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="tabular-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=None,
+            extra_payload={"table": table},
+            parameters={},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return _bytes_to_list(response)
+
+    async def tabular_regression(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[float]:
+        """
+        Predict a numerical target value given a set of attributes/features in a table.
+
+        Args:
+            table (`Dict[str, Any]`):
+                Set of attributes stored in a table. The attributes used to predict the target can be both numerical and categorical.
+            model (`str`, *optional*):
+                The model to use for the tabular regression task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended tabular regression model will be used.
+                Defaults to None.
+
+        Returns:
+            `List`: a list of predicted numerical target values.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> table = {
+        ...     "Height": ["11.52", "12.48", "12.3778"],
+        ...     "Length1": ["23.2", "24", "23.9"],
+        ...     "Length2": ["25.4", "26.3", "26.5"],
+        ...     "Length3": ["30", "31.2", "31.1"],
+        ...     "Species": ["Bream", "Bream", "Bream"],
+        ...     "Width": ["4.02", "4.3056", "4.6961"],
+        ... }
+        >>> await client.tabular_regression(table, model="scikit-learn/Fish-Weight")
+        [110, 120, 130]
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="tabular-regression")
+        request_parameters = provider_helper.prepare_request(
+            inputs=None,
+            parameters={},
+            extra_payload={"table": table},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return _bytes_to_list(response)
+
+    async def text_classification(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        top_k: Optional[int] = None,
+        function_to_apply: Optional["TextClassificationOutputTransform"] = None,
+    ) -> List[TextClassificationOutputElement]:
+        """
+        Perform text classification (e.g. sentiment-analysis) on the given text.
+
+        Args:
+            text (`str`):
+                A string to be classified.
+            model (`str`, *optional*):
+                The model to use for the text classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended text classification model will be used.
+                Defaults to None.
+            top_k (`int`, *optional*):
+                When specified, limits the output to the top K most probable classes.
+            function_to_apply (`"TextClassificationOutputTransform"`, *optional*):
+                The function to apply to the model outputs in order to retrieve the scores.
+
+        Returns:
+            `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.text_classification("I like you")
+        [
+            TextClassificationOutputElement(label='POSITIVE', score=0.9998695850372314),
+            TextClassificationOutputElement(label='NEGATIVE', score=0.0001304351753788069),
+        ]
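+        # `top_k` limits the number of returned labels (illustrative call, output omitted):
+        >>> await client.text_classification("I like you", top_k=1)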
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="text-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "function_to_apply": function_to_apply,
+                "top_k": top_k,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return TextClassificationOutputElement.parse_obj_as_list(response)[0]  # type: ignore [return-value]
+
+    @overload
+    async def text_generation(  # type: ignore
+        self,
+        prompt: str,
+        *,
+        details: Literal[False] = ...,
+        stream: Literal[False] = ...,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> str: ...
+
+    @overload
+    async def text_generation(  # type: ignore
+        self,
+        prompt: str,
+        *,
+        details: Literal[True] = ...,
+        stream: Literal[False] = ...,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> TextGenerationOutput: ...
+
+    @overload
+    async def text_generation(  # type: ignore
+        self,
+        prompt: str,
+        *,
+        details: Literal[False] = ...,
+        stream: Literal[True] = ...,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> AsyncIterable[str]: ...
+
+    @overload
+    async def text_generation(  # type: ignore
+        self,
+        prompt: str,
+        *,
+        details: Literal[True] = ...,
+        stream: Literal[True] = ...,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> AsyncIterable[TextGenerationStreamOutput]: ...
+
+    @overload
+    async def text_generation(
+        self,
+        prompt: str,
+        *,
+        details: Literal[True] = ...,
+        stream: bool = ...,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> Union[TextGenerationOutput, AsyncIterable[TextGenerationStreamOutput]]: ...
+
+    async def text_generation(
+        self,
+        prompt: str,
+        *,
+        details: bool = False,
+        stream: bool = False,
+        model: Optional[str] = None,
+        # Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
+        adapter_id: Optional[str] = None,
+        best_of: Optional[int] = None,
+        decoder_input_details: Optional[bool] = None,
+        do_sample: Optional[bool] = False,  # Manual default value
+        frequency_penalty: Optional[float] = None,
+        grammar: Optional[TextGenerationInputGrammarType] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: Optional[bool] = False,  # Manual default value
+        seed: Optional[int] = None,
+        stop: Optional[List[str]] = None,
+        stop_sequences: Optional[List[str]] = None,  # Deprecated, use `stop` instead
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_n_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: Optional[bool] = None,
+    ) -> Union[str, TextGenerationOutput, AsyncIterable[str], AsyncIterable[TextGenerationStreamOutput]]:
+        """
+        Given a prompt, generate the following text.
+
+        <Tip>
+
+        If you want to generate a response from chat messages, you should use the [`InferenceClient.chat_completion`] method.
+        It accepts a list of messages instead of a single text prompt and handles the chat templating for you.
+
+        </Tip>
+
+        Args:
+            prompt (`str`):
+                Input text.
+            details (`bool`, *optional*):
+                By default, text_generation returns a string. Pass `details=True` if you want a detailed output (tokens,
+                probabilities, seed, finish reason, etc.). Only available for models served with the
+                `text-generation-inference` backend.
+            stream (`bool`, *optional*):
+                By default, text_generation returns the full generated text. Pass `stream=True` if you want a stream of
+                tokens to be returned. Only available for models served with the `text-generation-inference`
+                backend.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+            adapter_id (`str`, *optional*):
+                Lora adapter id.
+            best_of (`int`, *optional*):
+                Generate best_of sequences and return the one with the highest token logprobs.
+            decoder_input_details (`bool`, *optional*):
+                Return the decoder input token logprobs and ids. You must set `details=True` as well for it to be taken
+                into account. Defaults to `False`.
+            do_sample (`bool`, *optional*):
+                Activate logits sampling
+            frequency_penalty (`float`, *optional*):
+                Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in
+                the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+            grammar ([`TextGenerationInputGrammarType`], *optional*):
+                Grammar constraints. Can be either a JSONSchema or a regex.
+            max_new_tokens (`int`, *optional*):
+                Maximum number of generated tokens. Defaults to 100.
+            repetition_penalty (`float`, *optional*):
+                The parameter for repetition penalty. 1.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            return_full_text (`bool`, *optional*):
+                Whether to prepend the prompt to the generated text
+            seed (`int`, *optional*):
+                Random sampling seed
+            stop (`List[str]`, *optional*):
+                Stop generating tokens if a member of `stop` is generated.
+            stop_sequences (`List[str]`, *optional*):
+                Deprecated argument. Use `stop` instead.
+            temperature (`float`, *optional*):
+                The value used to modulate the logits distribution.
+            top_n_tokens (`int`, *optional*):
+                Return information about the `top_n_tokens` most likely tokens at each generation step, instead of
+                just the sampled token.
+            top_k (`int`, *optional*):
+                The number of highest probability vocabulary tokens to keep for top-k-filtering.
+            top_p (`float`, *optional*):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation.
+            truncate (`int`, *optional*):
+                Truncate input tokens to the given size.
+            typical_p (`float`, *optional*):
+                Typical Decoding mass.
+                See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.
+            watermark (`bool`, *optional*):
+                Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226).
+
+        Returns:
+            `Union[str, TextGenerationOutput, AsyncIterable[str], AsyncIterable[TextGenerationStreamOutput]]`:
+            Generated text returned from the server:
+            - if `stream=False` and `details=False`, the generated text is returned as a `str` (default)
+            - if `stream=True` and `details=False`, the generated text is returned token by token as an `AsyncIterable[str]`
+            - if `stream=False` and `details=True`, the generated text is returned with more details as a [`~huggingface_hub.TextGenerationOutput`]
+            - if `details=True` and `stream=True`, the generated text is returned token by token as an `AsyncIterable` of [`~huggingface_hub.TextGenerationStreamOutput`]
+
+        Raises:
+            `ValidationError`:
+                If input values are not valid. No HTTP call is made to the server.
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+
+        # Case 1: generate text
+        >>> await client.text_generation("The huggingface_hub library is ", max_new_tokens=12)
+        '100% open source and built to be easy to use.'
+
+        # Case 2: iterate over the generated tokens. Useful for large generation.
+        >>> async for token in await client.text_generation("The huggingface_hub library is ", max_new_tokens=12, stream=True):
+        ...     print(token)
+        100
+        %
+        open
+        source
+        and
+        built
+        to
+        be
+        easy
+        to
+        use
+        .
+
+        # Case 3: get more details about the generation process.
+        >>> await client.text_generation("The huggingface_hub library is ", max_new_tokens=12, details=True)
+        TextGenerationOutput(
+            generated_text='100% open source and built to be easy to use.',
+            details=TextGenerationDetails(
+                finish_reason='length',
+                generated_tokens=12,
+                seed=None,
+                prefill=[
+                    TextGenerationPrefillOutputToken(id=487, text='The', logprob=None),
+                    TextGenerationPrefillOutputToken(id=53789, text=' hugging', logprob=-13.171875),
+                    (...)
+                    TextGenerationPrefillOutputToken(id=204, text=' ', logprob=-7.0390625)
+                ],
+                tokens=[
+                    TokenElement(id=1425, text='100', logprob=-1.0175781, special=False),
+                    TokenElement(id=16, text='%', logprob=-0.0463562, special=False),
+                    (...)
+                    TokenElement(id=25, text='.', logprob=-0.5703125, special=False)
+                ],
+                best_of_sequences=None
+            )
+        )
+
+        # Case 4: iterate over the generated tokens with more details.
+        # Last object is more complete, containing the full generated text and the finish reason.
+        >>> async for details in await client.text_generation("The huggingface_hub library is ", max_new_tokens=12, details=True, stream=True):
+        ...     print(details)
+        ...
+        TextGenerationStreamOutput(token=TokenElement(id=1425, text='100', logprob=-1.0175781, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=16, text='%', logprob=-0.0463562, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=1314, text=' open', logprob=-1.3359375, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=3178, text=' source', logprob=-0.28100586, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=273, text=' and', logprob=-0.5961914, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=3426, text=' built', logprob=-1.9423828, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=271, text=' to', logprob=-1.4121094, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=314, text=' be', logprob=-1.5224609, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=1833, text=' easy', logprob=-2.1132812, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=271, text=' to', logprob=-0.08520508, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(token=TokenElement(id=745, text=' use', logprob=-0.39453125, special=False), generated_text=None, details=None)
+        TextGenerationStreamOutput(
+            token=TokenElement(id=25, text='.', logprob=-0.5703125, special=False),
+            generated_text='100% open source and built to be easy to use.',
+            details=TextGenerationStreamOutputStreamDetails(finish_reason='length', generated_tokens=12, seed=None)
+        )
+
+        # Case 5: generate constrained output using grammar
+        >>> response = await client.text_generation(
+        ...     prompt="I saw a puppy a cat and a raccoon during my bike ride in the park",
+        ...     model="HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1",
+        ...     max_new_tokens=100,
+        ...     repetition_penalty=1.3,
+        ...     grammar={
+        ...         "type": "json",
+        ...         "value": {
+        ...             "properties": {
+        ...                 "location": {"type": "string"},
+        ...                 "activity": {"type": "string"},
+        ...                 "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5},
+        ...                 "animals": {"type": "array", "items": {"type": "string"}},
+        ...             },
+        ...             "required": ["location", "activity", "animals_seen", "animals"],
+        ...         },
+        ...     },
+        ... )
+        >>> json.loads(response)
+        {
+            "activity": "bike riding",
+            "animals": ["puppy", "cat", "raccoon"],
+            "animals_seen": 3,
+            "location": "park"
+        }
+        ```
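+
+        Regex-constrained generation is also possible (a sketch, assuming the serving backend supports grammar constraints):
+        ```py
+        >>> await client.text_generation(
+        ...     prompt="The capital of France is",
+        ...     model="HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1",
+        ...     max_new_tokens=5,
+        ...     grammar={"type": "regex", "value": "(Paris|London|Berlin)"},
+        ... )
+        ```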
+        """
+        if decoder_input_details and not details:
+            warnings.warn(
+                "`decoder_input_details=True` has been passed to the server but `details=False` is set meaning that"
+                " the output from the server will be truncated."
+            )
+            decoder_input_details = False
+
+        if stop_sequences is not None:
+            warnings.warn(
+                "`stop_sequences` is a deprecated argument for `text_generation` task"
+                " and will be removed in version '0.28.0'. Use `stop` instead.",
+                FutureWarning,
+            )
+        if stop is None:
+            stop = stop_sequences  # use deprecated arg if provided
+
+        # Build payload
+        parameters = {
+            "adapter_id": adapter_id,
+            "best_of": best_of,
+            "decoder_input_details": decoder_input_details,
+            "details": details,
+            "do_sample": do_sample,
+            "frequency_penalty": frequency_penalty,
+            "grammar": grammar,
+            "max_new_tokens": max_new_tokens,
+            "repetition_penalty": repetition_penalty,
+            "return_full_text": return_full_text,
+            "seed": seed,
+            "stop": stop if stop is not None else [],
+            "temperature": temperature,
+            "top_k": top_k,
+            "top_n_tokens": top_n_tokens,
+            "top_p": top_p,
+            "truncate": truncate,
+            "typical_p": typical_p,
+            "watermark": watermark,
+        }
+
+        # Remove some parameters if not a TGI server
+        unsupported_kwargs = _get_unsupported_text_generation_kwargs(model)
+        if len(unsupported_kwargs) > 0:
+            # The server does not support some parameters
+            # => means it is not a TGI server
+            # => remove unsupported parameters and warn the user
+
+            ignored_parameters = []
+            for key in unsupported_kwargs:
+                if parameters.get(key):
+                    ignored_parameters.append(key)
+                parameters.pop(key, None)
+            if len(ignored_parameters) > 0:
+                warnings.warn(
+                    "API endpoint/model for text-generation is not served via TGI. Ignoring following parameters:"
+                    f" {', '.join(ignored_parameters)}.",
+                    UserWarning,
+                )
+            if details:
+                warnings.warn(
+                    "API endpoint/model for text-generation is not served via TGI. Parameter `details=True` will"
+                    " be ignored meaning only the generated text will be returned.",
+                    UserWarning,
+                )
+                details = False
+            if stream:
+                raise ValueError(
+                    "API endpoint/model for text-generation is not served via TGI. Cannot return output as a stream."
+                    " Please pass `stream=False` as input."
+                )
+
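+        # Resolve the helper that knows how to build a "text-generation" request for the configured provider.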
+        provider_helper = get_provider_helper(self.provider, task="text-generation")
+        request_parameters = provider_helper.prepare_request(
+            inputs=prompt,
+            parameters=parameters,
+            extra_payload={"stream": stream},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+
+        # Handle errors separately for more precise error messages
+        try:
+            bytes_output = await self._inner_post(request_parameters, stream=stream)
+        except _import_aiohttp().ClientResponseError as e:
+            match = MODEL_KWARGS_NOT_USED_REGEX.search(e.response_error_payload["error"])
+            if e.status == 400 and match:
+                unused_params = [kwarg.strip("' ") for kwarg in match.group(1).split(",")]
+                _set_unsupported_text_generation_kwargs(model, unused_params)
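+                # Retry once without the parameters the server rejected; they are now recorded as
+                # unsupported for this model and will be dropped (with a warning) when the payload is rebuilt.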
+                return await self.text_generation(  # type: ignore
+                    prompt=prompt,
+                    details=details,
+                    stream=stream,
+                    model=model or self.model,
+                    adapter_id=adapter_id,
+                    best_of=best_of,
+                    decoder_input_details=decoder_input_details,
+                    do_sample=do_sample,
+                    frequency_penalty=frequency_penalty,
+                    grammar=grammar,
+                    max_new_tokens=max_new_tokens,
+                    repetition_penalty=repetition_penalty,
+                    return_full_text=return_full_text,
+                    seed=seed,
+                    stop=stop,
+                    temperature=temperature,
+                    top_k=top_k,
+                    top_n_tokens=top_n_tokens,
+                    top_p=top_p,
+                    truncate=truncate,
+                    typical_p=typical_p,
+                    watermark=watermark,
+                )
+            raise_text_generation_error(e)
+
+        # Parse output
+        if stream:
+            return _async_stream_text_generation_response(bytes_output, details)  # type: ignore
+
+        data = _bytes_to_dict(bytes_output)  # type: ignore[arg-type]
+
+        # Data can be a single element (dict) or a list of dicts, in which case we select the first element.
+        if isinstance(data, list):
+            data = data[0]
+
+        return TextGenerationOutput.parse_obj_as_instance(data) if details else data["generated_text"]
+
+    async def text_to_image(
+        self,
+        prompt: str,
+        *,
+        negative_prompt: Optional[str] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        model: Optional[str] = None,
+        scheduler: Optional[str] = None,
+        seed: Optional[int] = None,
+        extra_body: Optional[Dict[str, Any]] = None,
+    ) -> "Image":
+        """
+        Generate an image based on a given text using a specified model.
+
+        <Tip warning={true}>
+
+        You must have `PIL` installed if you want to work with images (`pip install Pillow`).
+
+        </Tip>
+
+        <Tip>
+        You can pass provider-specific parameters to the model by using the `extra_body` argument.
+        </Tip>
+
+        Args:
+            prompt (`str`):
+                The prompt to generate an image from.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
+            height (`int`, *optional*):
+                The height in pixels of the output image
+            width (`int`, *optional*):
+                The width in pixels of the output image
+            num_inference_steps (`int`, *optional*):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                prompt, but values too high may cause saturation and other artifacts.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended text-to-image model will be used.
+                Defaults to None.
+            scheduler (`str`, *optional*):
+                Override the scheduler with a compatible one.
+            seed (`int`, *optional*):
+                Seed for the random number generator.
+            extra_body (`Dict[str, Any]`, *optional*):
+                Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
+                for supported parameters.
+
+        Returns:
+            `Image`: The generated image.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+
+        >>> image = await client.text_to_image("An astronaut riding a horse on the moon.")
+        >>> image.save("astronaut.png")
+
+        >>> image = await client.text_to_image(
+        ...     "An astronaut riding a horse on the moon.",
+        ...     negative_prompt="low resolution, blurry",
+        ...     model="stabilityai/stable-diffusion-2-1",
+        ... )
+        >>> image.save("better_astronaut.png")
+        ```
+        Example using a third-party provider directly. Usage will be billed on your fal.ai account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="fal-ai",  # Use fal.ai provider
+        ...     api_key="fal-ai-api-key",  # Pass your fal.ai API key
+        ... )
+        >>> image = client.text_to_image(
+        ...     "A majestic lion in a fantasy forest",
+        ...     model="black-forest-labs/FLUX.1-schnell",
+        ... )
+        >>> image.save("lion.png")
+        ```
+
+        Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",  # Use replicate provider
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> image = client.text_to_image(
+        ...     "An astronaut riding a horse on the moon.",
+        ...     model="black-forest-labs/FLUX.1-dev",
+        ... )
+        >>> image.save("astronaut.png")
+        ```
+
+        Example using Replicate provider with extra parameters
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",  # Use replicate provider
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> image = client.text_to_image(
+        ...     "An astronaut riding a horse on the moon.",
+        ...     model="black-forest-labs/FLUX.1-schnell",
+        ...     extra_body={"output_quality": 100},
+        ... )
+        >>> image.save("astronaut.png")
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="text-to-image")
+        request_parameters = provider_helper.prepare_request(
+            inputs=prompt,
+            parameters={
+                "negative_prompt": negative_prompt,
+                "height": height,
+                "width": width,
+                "num_inference_steps": num_inference_steps,
+                "guidance_scale": guidance_scale,
+                "scheduler": scheduler,
+                "seed": seed,
+                **(extra_body or {}),
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        response = provider_helper.get_response(response)
+        return _bytes_to_image(response)
+
+    async def text_to_video(
+        self,
+        prompt: str,
+        *,
+        model: Optional[str] = None,
+        guidance_scale: Optional[float] = None,
+        negative_prompt: Optional[List[str]] = None,
+        num_frames: Optional[float] = None,
+        num_inference_steps: Optional[int] = None,
+        seed: Optional[int] = None,
+        extra_body: Optional[Dict[str, Any]] = None,
+    ) -> bytes:
+        """
+        Generate a video based on a given text.
+
+        <Tip>
+        You can pass provider-specific parameters to the model by using the `extra_body` argument.
+        </Tip>
+
+        Args:
+            prompt (`str`):
+                The prompt to generate a video from.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended text-to-video model will be used.
+                Defaults to None.
+            guidance_scale (`float`, *optional*):
+                A higher guidance scale value encourages the model to generate videos closely linked to the text
+                prompt, but values too high may cause saturation and other artifacts.
+            negative_prompt (`List[str]`, *optional*):
+                One or several prompts to guide what NOT to include in video generation.
+            num_frames (`float`, *optional*):
+                The num_frames parameter determines how many video frames are generated.
+            num_inference_steps (`int`, *optional*):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
+            seed (`int`, *optional*):
+                Seed for the random number generator.
+            extra_body (`Dict[str, Any]`, *optional*):
+                Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
+                for supported parameters.
+
+        Returns:
+            `bytes`: The generated video.
+
+        Example:
+
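+        A minimal sketch using the default recommended text-to-video model (availability depends on your provider setup):
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> video = await client.text_to_video("A young man walking on the street")
+        >>> with open("video.mp4", "wb") as file:
+        ...     file.write(video)
+        ```
+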
+        Example using a third-party provider directly. Usage will be billed on your fal.ai account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="fal-ai",  # Using fal.ai provider
+        ...     api_key="fal-ai-api-key",  # Pass your fal.ai API key
+        ... )
+        >>> video = client.text_to_video(
+        ...     "A majestic lion running in a fantasy forest",
+        ...     model="tencent/HunyuanVideo",
+        ... )
+        >>> with open("lion.mp4", "wb") as file:
+        ...     file.write(video)
+        ```
+
+        Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",  # Using replicate provider
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> video = client.text_to_video(
+        ...     "A cat running in a park",
+        ...     model="genmo/mochi-1-preview",
+        ... )
+        >>> with open("cat.mp4", "wb") as file:
+        ...     file.write(video)
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="text-to-video")
+        request_parameters = provider_helper.prepare_request(
+            inputs=prompt,
+            parameters={
+                "guidance_scale": guidance_scale,
+                "negative_prompt": negative_prompt,
+                "num_frames": num_frames,
+                "num_inference_steps": num_inference_steps,
+                "seed": seed,
+                **(extra_body or {}),
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        response = provider_helper.get_response(response)
+        return response
+
+    async def text_to_speech(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        do_sample: Optional[bool] = None,
+        early_stopping: Optional[Union[bool, "TextToSpeechEarlyStoppingEnum"]] = None,
+        epsilon_cutoff: Optional[float] = None,
+        eta_cutoff: Optional[float] = None,
+        max_length: Optional[int] = None,
+        max_new_tokens: Optional[int] = None,
+        min_length: Optional[int] = None,
+        min_new_tokens: Optional[int] = None,
+        num_beam_groups: Optional[int] = None,
+        num_beams: Optional[int] = None,
+        penalty_alpha: Optional[float] = None,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        typical_p: Optional[float] = None,
+        use_cache: Optional[bool] = None,
+        extra_body: Optional[Dict[str, Any]] = None,
+    ) -> bytes:
+        """
+        Synthesize an audio of a voice pronouncing a given text.
+
+        <Tip>
+        You can pass provider-specific parameters to the model by using the `extra_body` argument.
+        </Tip>
+
+        Args:
+            text (`str`):
+                The text to synthesize.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. If not provided, the default recommended text-to-speech model will be used.
+                Defaults to None.
+            do_sample (`bool`, *optional*):
+                Whether to use sampling instead of greedy decoding when generating new tokens.
+            early_stopping (`Union[bool, "TextToSpeechEarlyStoppingEnum"]`, *optional*):
+                Controls the stopping condition for beam-based methods.
+            epsilon_cutoff (`float`, *optional*):
+                If set to float strictly between 0 and 1, only tokens with a conditional probability greater than
+                epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on
+                the size of the model. See [Truncation Sampling as Language Model
+                Desmoothing](https://hf.co/papers/2210.15191) for more details.
+            eta_cutoff (`float`, *optional*):
+                Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly
+                between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff)
+                * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token
+                probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3,
+                depending on the size of the model. See [Truncation Sampling as Language Model
+                Desmoothing](https://hf.co/papers/2210.15191) for more details.
+            max_length (`int`, *optional*):
+                The maximum length (in tokens) of the generated text, including the input.
+            max_new_tokens (`int`, *optional*):
+                The maximum number of tokens to generate. Takes precedence over max_length.
+            min_length (`int`, *optional*):
+                The minimum length (in tokens) of the generated text, including the input.
+            min_new_tokens (`int`, *optional*):
+                The minimum number of tokens to generate. Takes precedence over min_length.
+            num_beam_groups (`int`, *optional*):
+                Number of groups to divide num_beams into in order to ensure diversity among different groups of beams.
+                See [this paper](https://hf.co/papers/1610.02424) for more details.
+            num_beams (`int`, *optional*):
+                Number of beams to use for beam search.
+            penalty_alpha (`float`, *optional*):
+                The value balances the model confidence and the degeneration penalty in contrastive search decoding.
+            temperature (`float`, *optional*):
+                The value used to modulate the next token probabilities.
+            top_k (`int`, *optional*):
+                The number of highest probability vocabulary tokens to keep for top-k-filtering.
+            top_p (`float`, *optional*):
+                If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to
+                top_p or higher are kept for generation.
+            typical_p (`float`, *optional*):
+                Local typicality measures how similar the conditional probability of predicting a target token next is
+                to the expected conditional probability of predicting a random token next, given the partial text
+                already generated. If set to float < 1, the smallest set of the most locally typical tokens with
+                probabilities that add up to typical_p or higher are kept for generation. See [this
+                paper](https://hf.co/papers/2202.00666) for more details.
+            use_cache (`bool`, *optional*):
+                Whether the model should use the past key/values attentions to speed up decoding.
+            extra_body (`Dict[str, Any]`, *optional*):
+                Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
+                for supported parameters.
+        Returns:
+            `bytes`: The generated audio.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from pathlib import Path
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+
+        >>> audio = await client.text_to_speech("Hello world")
+        >>> Path("hello_world.flac").write_bytes(audio)
+        ```
+
+        Example using a third-party provider directly. Usage will be billed on your Replicate account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",
+        ...     api_key="your-replicate-api-key",  # Pass your Replicate API key directly
+        ... )
+        >>> audio = client.text_to_speech(
+        ...     text="Hello world",
+        ...     model="OuteAI/OuteTTS-0.3-500M",
+        ... )
+        >>> Path("hello_world.flac").write_bytes(audio)
+        ```
+
+        Example using a third-party provider through Hugging Face Routing. Usage will be billed on your Hugging Face account.
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> audio = client.text_to_speech(
+        ...     text="Hello world",
+        ...     model="OuteAI/OuteTTS-0.3-500M",
+        ... )
+        >>> Path("hello_world.flac").write_bytes(audio)
+        ```
+        Example using Replicate provider with extra parameters
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient(
+        ...     provider="replicate",  # Use replicate provider
+        ...     api_key="hf_...",  # Pass your HF token
+        ... )
+        >>> audio = client.text_to_speech(
+        ...     "Hello, my name is Kororo, an awesome text-to-speech model.",
+        ...     model="hexgrad/Kokoro-82M",
+        ...     extra_body={"voice": "af_nicole"},
+        ... )
+        >>> Path("hello.flac").write_bytes(audio)
+        ```
+
+        Example music-gen using "YuE-s1-7B-anneal-en-cot" on fal.ai
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> lyrics = '''
+        ... [verse]
+        ... In the town where I was born
+        ... Lived a man who sailed to sea
+        ... And he told us of his life
+        ... In the land of submarines
+        ... So we sailed on to the sun
+        ... 'Til we found a sea of green
+        ... And we lived beneath the waves
+        ... In our yellow submarine
+        ...
+        ... [chorus]
+        ... We all live in a yellow submarine
+        ... Yellow submarine, yellow submarine
+        ... We all live in a yellow submarine
+        ... Yellow submarine, yellow submarine
+        ... '''
+        >>> genres = "pavarotti-style tenor voice"
+        >>> client = InferenceClient(
+        ...     provider="fal-ai",
+        ...     model="m-a-p/YuE-s1-7B-anneal-en-cot",
+        ...     api_key=...,
+        ... )
+        >>> audio = client.text_to_speech(lyrics, extra_body={"genres": genres})
+        >>> with open("output.mp3", "wb") as f:
+        ...     f.write(audio)
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="text-to-speech")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "do_sample": do_sample,
+                "early_stopping": early_stopping,
+                "epsilon_cutoff": epsilon_cutoff,
+                "eta_cutoff": eta_cutoff,
+                "max_length": max_length,
+                "max_new_tokens": max_new_tokens,
+                "min_length": min_length,
+                "min_new_tokens": min_new_tokens,
+                "num_beam_groups": num_beam_groups,
+                "num_beams": num_beams,
+                "penalty_alpha": penalty_alpha,
+                "temperature": temperature,
+                "top_k": top_k,
+                "top_p": top_p,
+                "typical_p": typical_p,
+                "use_cache": use_cache,
+                **(extra_body or {}),
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        response = provider_helper.get_response(response)
+        return response
+
+    async def token_classification(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        aggregation_strategy: Optional["TokenClassificationAggregationStrategy"] = None,
+        ignore_labels: Optional[List[str]] = None,
+        stride: Optional[int] = None,
+    ) -> List[TokenClassificationOutputElement]:
+        """
+        Perform token classification on the given text.
+        Usually used for sentence parsing, either grammatical or Named Entity Recognition (NER), to understand keywords contained within text.
+
+        Args:
+            text (`str`):
+                A string to be classified.
+            model (`str`, *optional*):
+                The model to use for the token classification task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended token classification model will be used.
+                Defaults to None.
+            aggregation_strategy (`"TokenClassificationAggregationStrategy"`, *optional*):
+                The strategy used to fuse tokens based on model predictions.
+            ignore_labels (`List[str]`, *optional*):
+                A list of labels to ignore.
+            stride (`int`, *optional*):
+                The number of overlapping tokens between chunks when splitting the input text.
+
+        Returns:
+            `List[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.token_classification("My name is Sarah Jessica Parker but you can call me Jessica")
+        [
+            TokenClassificationOutputElement(
+                entity_group='PER',
+                score=0.9971321225166321,
+                word='Sarah Jessica Parker',
+                start=11,
+                end=31,
+            ),
+            TokenClassificationOutputElement(
+                entity_group='PER',
+                score=0.9773476123809814,
+                word='Jessica',
+                start=52,
+                end=59,
+            )
+        ]
+        ```
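+
+        A short sketch with optional parameters (assuming the selected model supports the `simple` aggregation strategy):
+        ```py
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.token_classification(
+        ...     "My name is Sarah Jessica Parker but you can call me Jessica",
+        ...     aggregation_strategy="simple",
+        ...     ignore_labels=["O"],
+        ... )
+        ```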
+        """
+        provider_helper = get_provider_helper(self.provider, task="token-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "aggregation_strategy": aggregation_strategy,
+                "ignore_labels": ignore_labels,
+                "stride": stride,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return TokenClassificationOutputElement.parse_obj_as_list(response)
+
+    async def translation(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        src_lang: Optional[str] = None,
+        tgt_lang: Optional[str] = None,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        truncation: Optional["TranslationTruncationStrategy"] = None,
+        generate_parameters: Optional[Dict[str, Any]] = None,
+    ) -> TranslationOutput:
+        """
+        Convert text from one language to another.
+
+        Check out https://huggingface.co/tasks/translation for more information on how to choose the best model for
+        your specific use case. Source and target languages usually depend on the model.
+        However, it is possible to specify source and target languages for certain models. If you are working with one of these models,
+        you can use `src_lang` and `tgt_lang` arguments to pass the relevant information.
+
+        Args:
+            text (`str`):
+                A string to be translated.
+            model (`str`, *optional*):
+                The model to use for the translation task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended translation model will be used.
+                Defaults to None.
+            src_lang (`str`, *optional*):
+                The source language of the text. Required for models that can translate from multiple languages.
+            tgt_lang (`str`, *optional*):
+                Target language to translate to. Required for models that can translate to multiple languages.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether to clean up the potential extra spaces in the text output.
+            truncation (`"TranslationTruncationStrategy"`, *optional*):
+                The truncation strategy to use.
+            generate_parameters (`Dict[str, Any]`, *optional*):
+                Additional parametrization of the text generation algorithm.
+
+        Returns:
+            [`TranslationOutput`]: The generated translated text.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+            `ValueError`:
+                If only one of the `src_lang` and `tgt_lang` arguments is provided.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.translation("My name is Wolfgang and I live in Berlin")
+        'Mein Name ist Wolfgang und ich lebe in Berlin.'
+        >>> await client.translation("My name is Wolfgang and I live in Berlin", model="Helsinki-NLP/opus-mt-en-fr")
+        TranslationOutput(translation_text="Je m'appelle Wolfgang et je vis à Berlin.")
+        ```
+
+        Specifying languages:
+        ```py
+        >>> await client.translation("My name is Sarah Jessica Parker but you can call me Jessica", model="facebook/mbart-large-50-many-to-many-mmt", src_lang="en_XX", tgt_lang="fr_XX")
+        "Mon nom est Sarah Jessica Parker mais vous pouvez m'appeler Jessica"
+        ```
+        """
+        # Throw error if only one of `src_lang` and `tgt_lang` was given
+        if src_lang is not None and tgt_lang is None:
+            raise ValueError("You cannot specify `src_lang` without specifying `tgt_lang`.")
+
+        if src_lang is None and tgt_lang is not None:
+            raise ValueError("You cannot specify `tgt_lang` without specifying `src_lang`.")
+
+        provider_helper = get_provider_helper(self.provider, task="translation")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "src_lang": src_lang,
+                "tgt_lang": tgt_lang,
+                "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
+                "truncation": truncation,
+                "generate_parameters": generate_parameters,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return TranslationOutput.parse_obj_as_list(response)[0]
+
+    async def visual_question_answering(
+        self,
+        image: ContentT,
+        question: str,
+        *,
+        model: Optional[str] = None,
+        top_k: Optional[int] = None,
+    ) -> List[VisualQuestionAnsweringOutputElement]:
+        """
+        Answering open-ended questions based on an image.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The input image for the context. It can be raw bytes, an image file, or a URL to an online image.
+            question (`str`):
+                Question to be answered.
+            model (`str`, *optional*):
+                The model to use for the visual question answering task. Can be a model ID hosted on the Hugging Face Hub or a URL to
+                a deployed Inference Endpoint. If not provided, the default recommended visual question answering model will be used.
+                Defaults to None.
+            top_k (`int`, *optional*):
+                The number of answers to return (will be chosen by order of likelihood). Note that fewer than `top_k`
+                answers are returned if there are not enough options available within the context.
+        Returns:
+            `List[VisualQuestionAnsweringOutputElement]`: a list of [`VisualQuestionAnsweringOutputElement`] items containing the predicted label and associated probability.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.visual_question_answering(
+        ...     image="https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg",
+        ...     question="What is the animal doing?"
+        ... )
+        [
+            VisualQuestionAnsweringOutputElement(score=0.778609573841095, answer='laying down'),
+            VisualQuestionAnsweringOutputElement(score=0.6957435607910156, answer='sitting'),
+        ]
+        ```
+        """
+        provider_helper = get_provider_helper(self.provider, task="visual-question-answering")
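+        # The image is base64-encoded and sent in the payload together with the question.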
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={"top_k": top_k},
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+            extra_payload={"question": question, "image": _b64_encode(image)},
+        )
+        response = await self._inner_post(request_parameters)
+        return VisualQuestionAnsweringOutputElement.parse_obj_as_list(response)
+
+    @_deprecate_arguments(
+        version="0.30.0",
+        deprecated_args=["labels"],
+        custom_message="`labels` has been renamed to `candidate_labels` and will be removed in huggingface_hub>=0.30.0.",
+    )
+    async def zero_shot_classification(
+        self,
+        text: str,
+        # temporarily keeping it optional for backward compatibility.
+        candidate_labels: List[str] = None,  # type: ignore
+        *,
+        multi_label: Optional[bool] = False,
+        hypothesis_template: Optional[str] = None,
+        model: Optional[str] = None,
+        # deprecated argument
+        labels: List[str] = None,  # type: ignore
+    ) -> List[ZeroShotClassificationOutputElement]:
+        """
+        Provide as input a text and a set of candidate labels to classify the input text.
+
+        Args:
+            text (`str`):
+                The input text to classify.
+            candidate_labels (`List[str]`):
+                The set of possible class labels to classify the text into.
+            labels (`List[str]`, *optional*):
+                (deprecated) List of strings. Each string is the verbalization of a possible label for the input text.
+            multi_label (`bool`, *optional*):
+                Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of
+                the label likelihoods for each sequence is 1. If true, the labels are considered independent and
+                probabilities are normalized for each candidate.
+            hypothesis_template (`str`, *optional*):
+                The sentence used in conjunction with `candidate_labels` to attempt the text classification by
+                replacing the placeholder with the candidate labels.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.
+
+
+        Returns:
+            `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example with `multi_label=False`:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> text = (
+        ...     "A new model offers an explanation for how the Galilean satellites formed around the solar system's"
+        ...     "largest world. Konstantin Batygin did not set out to solve one of the solar system's most puzzling"
+        ...     " mysteries when he went for a run up a hill in Nice, France."
+        ... )
+        >>> labels = ["space & cosmos", "scientific discovery", "microbiology", "robots", "archeology"]
+        >>> await client.zero_shot_classification(text, labels)
+        [
+            ZeroShotClassificationOutputElement(label='scientific discovery', score=0.7961668968200684),
+            ZeroShotClassificationOutputElement(label='space & cosmos', score=0.18570658564567566),
+            ZeroShotClassificationOutputElement(label='microbiology', score=0.00730885099619627),
+            ZeroShotClassificationOutputElement(label='archeology', score=0.006258360575884581),
+            ZeroShotClassificationOutputElement(label='robots', score=0.004559356719255447),
+        ]
+        >>> await client.zero_shot_classification(text, labels, multi_label=True)
+        [
+            ZeroShotClassificationOutputElement(label='scientific discovery', score=0.9829297661781311),
+            ZeroShotClassificationOutputElement(label='space & cosmos', score=0.755190908908844),
+            ZeroShotClassificationOutputElement(label='microbiology', score=0.0005462635890580714),
+            ZeroShotClassificationOutputElement(label='archeology', score=0.00047131875180639327),
+            ZeroShotClassificationOutputElement(label='robots', score=0.00030448526376858354),
+        ]
+        ```
+
+        Example with `multi_label=True` and a custom `hypothesis_template`:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.zero_shot_classification(
+        ...    text="I really like our dinner and I'm very happy. I don't like the weather though.",
+        ...    labels=["positive", "negative", "pessimistic", "optimistic"],
+        ...    multi_label=True,
+        ...    hypothesis_template="This text is {} towards the weather"
+        ... )
+        [
+            ZeroShotClassificationOutputElement(label='negative', score=0.9231801629066467),
+            ZeroShotClassificationOutputElement(label='pessimistic', score=0.8760990500450134),
+            ZeroShotClassificationOutputElement(label='optimistic', score=0.0008674879791215062),
+            ZeroShotClassificationOutputElement(label='positive', score=0.0005250611575320363)
+        ]
+        ```
+        """
+        # handle deprecation
+        if labels is not None:
+            if candidate_labels is not None:
+                raise ValueError(
+                    "Cannot specify both `labels` and `candidate_labels`. Use `candidate_labels` instead."
+                )
+            candidate_labels = labels
+        elif candidate_labels is None:
+            raise ValueError("Must specify `candidate_labels`")
+
+        provider_helper = get_provider_helper(self.provider, task="zero-shot-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=text,
+            parameters={
+                "candidate_labels": candidate_labels,
+                "multi_label": multi_label,
+                "hypothesis_template": hypothesis_template,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        output = _bytes_to_dict(response)
+        return [
+            ZeroShotClassificationOutputElement.parse_obj_as_instance({"label": label, "score": score})
+            for label, score in zip(output["labels"], output["scores"])
+        ]
+
+    @_deprecate_arguments(
+        version="0.30.0",
+        deprecated_args=["labels"],
+        custom_message="`labels` has been renamed to `candidate_labels` and will be removed in huggingface_hub>=0.30.0.",
+    )
+    async def zero_shot_image_classification(
+        self,
+        image: ContentT,
+        # temporarily keeping it optional for backward compatibility.
+        candidate_labels: List[str] = None,  # type: ignore
+        *,
+        model: Optional[str] = None,
+        hypothesis_template: Optional[str] = None,
+        # deprecated argument
+        labels: List[str] = None,  # type: ignore
+    ) -> List[ZeroShotImageClassificationOutputElement]:
+        """
+        Provide input image and text labels to predict text labels for the image.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO]`):
+                The input image to classify. It can be raw bytes, an image file, or a URL to an online image.
+            candidate_labels (`List[str]`):
+                The candidate labels for this image. There must be at least 2 labels.
+            labels (`List[str]`, *optional*):
+                (deprecated) List of possible string labels. There must be at least 2 labels.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot image classification model will be used.
+            hypothesis_template (`str`, *optional*):
+                The sentence used in conjunction with `candidate_labels` to attempt the image classification by
+                replacing the placeholder with the candidate labels.
+
+        Returns:
+            `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.
+
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `aiohttp.ClientResponseError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+
+        >>> await client.zero_shot_image_classification(
+        ...     "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
+        ...     candidate_labels=["dog", "cat", "horse"],
+        ... )
+        [ZeroShotImageClassificationOutputElement(label='dog', score=0.956),...]
+        ```
+        """
+        # handle deprecation
+        if labels is not None:
+            if candidate_labels is not None:
+                raise ValueError(
+                    "Cannot specify both `labels` and `candidate_labels`. Use `candidate_labels` instead."
+                )
+            candidate_labels = labels
+        elif candidate_labels is None:
+            raise ValueError("Must specify `candidate_labels`")
+        # Raise ValueError if input is less than 2 labels
+        if len(candidate_labels) < 2:
+            raise ValueError("You must specify at least 2 classes to compare.")
+
+        provider_helper = get_provider_helper(self.provider, task="zero-shot-image-classification")
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={
+                "candidate_labels": candidate_labels,
+                "hypothesis_template": hypothesis_template,
+            },
+            headers=self.headers,
+            model=model or self.model,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response)
+
+    @_deprecate_method(
+        version="0.33.0",
+        message=(
+            "HF Inference API is getting revamped and will only support warm models in the future (no cold start allowed)."
+            " Use `HfApi.list_models(..., inference_provider='...')` to list warm models per provider."
+        ),
+    )
+    async def list_deployed_models(
+        self, frameworks: Union[None, str, Literal["all"], List[str]] = None
+    ) -> Dict[str, List[str]]:
+        """
+        List models deployed on the HF Serverless Inference API service.
+
+        This helper checks deployed models framework by framework. By default, it will check the 4 main frameworks that
+        are supported and account for 95% of the hosted models. However, if you want a complete list of models you can
+        specify `frameworks="all"` as input. Alternatively, if you know beforehand which framework you are interested
+        in, you can also restrict the search to this one (e.g. `frameworks="text-generation-inference"`). The more
+        frameworks are checked, the more time it will take.
+
+        <Tip warning={true}>
+
+        This endpoint method does not return a live list of all models available for the HF Inference API service.
+        It searches over a cached list of models that were recently available and the list may not be up to date.
+        If you want to know the live status of a specific model, use [`~InferenceClient.get_model_status`].
+
+        </Tip>
+
+        <Tip>
+
+        This endpoint method is mostly useful for discoverability. If you already know which model you want to use and want to
+        check its availability, you can directly use [`~InferenceClient.get_model_status`].
+
+        </Tip>
+
+        Args:
+            frameworks (`Literal["all"]` or `List[str]` or `str`, *optional*):
+                The frameworks to filter on. By default only a subset of the available frameworks are tested. If set to
+                "all", all available frameworks will be tested. It is also possible to provide a single framework or a
+                custom set of frameworks to check.
+
+        Returns:
+            `Dict[str, List[str]]`: A dictionary mapping task names to a sorted list of model IDs.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+
+        # Discover zero-shot-classification models currently deployed
+        >>> models = await client.list_deployed_models()
+        >>> models["zero-shot-classification"]
+        ['Narsil/deberta-large-mnli-zero-cls', 'facebook/bart-large-mnli', ...]
+
+        # List from only 1 framework
+        >>> await client.list_deployed_models("text-generation-inference")
+        {'text-generation': ['bigcode/starcoder', 'meta-llama/Llama-2-70b-chat-hf', ...], ...}
+        ```
+        """
+        if self.provider != "hf-inference":
+            raise ValueError(f"Listing deployed models is not supported on '{self.provider}'.")
+
+        # Resolve which frameworks to check
+        if frameworks is None:
+            frameworks = constants.MAIN_INFERENCE_API_FRAMEWORKS
+        elif frameworks == "all":
+            frameworks = constants.ALL_INFERENCE_API_FRAMEWORKS
+        elif isinstance(frameworks, str):
+            frameworks = [frameworks]
+        frameworks = list(set(frameworks))
+
+        # Fetch them iteratively
+        models_by_task: Dict[str, List[str]] = {}
+
+        def _unpack_response(framework: str, items: List[Dict]) -> None:
+            for model in items:
+                if framework == "sentence-transformers":
+                    # Models running with the `sentence-transformers` framework can work with both tasks even if not
+                    # branded as such in the API response
+                    models_by_task.setdefault("feature-extraction", []).append(model["model_id"])
+                    models_by_task.setdefault("sentence-similarity", []).append(model["model_id"])
+                else:
+                    models_by_task.setdefault(model["task"], []).append(model["model_id"])
+
+        for framework in frameworks:
+            response = get_session().get(
+                f"{constants.INFERENCE_ENDPOINT}/framework/{framework}", headers=build_hf_headers(token=self.token)
+            )
+            hf_raise_for_status(response)
+            _unpack_response(framework, response.json())
+
+        # Sort alphabetically for discoverability and return
+        for task, models in models_by_task.items():
+            models_by_task[task] = sorted(set(models), key=lambda x: x.lower())
+        return models_by_task
+
+    def _get_client_session(self, headers: Optional[Dict] = None) -> "ClientSession":
+        aiohttp = _import_aiohttp()
+        client_headers = self.headers.copy()
+        if headers is not None:
+            client_headers.update(headers)
+
+        # Return a new aiohttp ClientSession with correct settings.
+        session = aiohttp.ClientSession(
+            headers=client_headers,
+            cookies=self.cookies,
+            timeout=aiohttp.ClientTimeout(self.timeout),
+            trust_env=self.trust_env,
+        )
+
+        # Keep track of sessions to close them later
+        self._sessions[session] = set()
+
+        # Override the `._request` method to register responses to be closed
+        session._wrapped_request = session._request
+
+        async def _request(method, url, **kwargs):
+            response = await session._wrapped_request(method, url, **kwargs)
+            self._sessions[session].add(response)
+            return response
+
+        session._request = _request
+
+        # Override the 'close' method to
+        # 1. close ongoing responses
+        # 2. deregister the session when closed
+        session._close = session.close
+
+        async def close_session():
+            for response in self._sessions[session]:
+                response.close()
+            await session._close()
+            self._sessions.pop(session, None)
+
+        session.close = close_session
+        return session
+
+    async def get_endpoint_info(self, *, model: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Get information about the deployed endpoint.
+
+        This endpoint is only available on endpoints powered by Text-Generation-Inference (TGI) or Text-Embedding-Inference (TEI).
+        Endpoints powered by `transformers` return an empty payload.
+
+        Args:
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+        Returns:
+            `Dict[str, Any]`: Information about the endpoint.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
+        >>> await client.get_endpoint_info()
+        {
+            'model_id': 'meta-llama/Meta-Llama-3-70B-Instruct',
+            'model_sha': None,
+            'model_dtype': 'torch.float16',
+            'model_device_type': 'cuda',
+            'model_pipeline_tag': None,
+            'max_concurrent_requests': 128,
+            'max_best_of': 2,
+            'max_stop_sequences': 4,
+            'max_input_length': 8191,
+            'max_total_tokens': 8192,
+            'waiting_served_ratio': 0.3,
+            'max_batch_total_tokens': 1259392,
+            'max_waiting_tokens': 20,
+            'max_batch_size': None,
+            'validation_workers': 32,
+            'max_client_batch_size': 4,
+            'version': '2.0.2',
+            'sha': 'dccab72549635c7eb5ddb17f43f0b7cdff07c214',
+            'docker_label': 'sha-dccab72'
+        }
+        ```
+        """
+        if self.provider != "hf-inference":
+            raise ValueError(f"Getting endpoint info is not supported on '{self.provider}'.")
+
+        model = model or self.model
+        if model is None:
+            raise ValueError("Model id not provided.")
+        if model.startswith(("http://", "https://")):
+            url = model.rstrip("/") + "/info"
+        else:
+            url = f"{constants.INFERENCE_ENDPOINT}/models/{model}/info"
+
+        async with self._get_client_session(headers=build_hf_headers(token=self.token)) as client:
+            response = await client.get(url, proxy=self.proxies)
+            response.raise_for_status()
+            return await response.json()
+
+    async def health_check(self, model: Optional[str] = None) -> bool:
+        """
+        Check the health of the deployed endpoint.
+
+        Health check is only available with Inference Endpoints powered by Text-Generation-Inference (TGI) or Text-Embedding-Inference (TEI).
+        For Inference API, please use [`InferenceClient.get_model_status`] instead.
+
+        Args:
+            model (`str`, *optional*):
+                URL of the Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+
+        Returns:
+            `bool`: True if everything is working fine.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient("https://jzgu0buei5.us-east-1.aws.endpoints.huggingface.cloud")
+        >>> await client.health_check()
+        True
+        ```
+        """
+        if self.provider != "hf-inference":
+            raise ValueError(f"Health check is not supported on '{self.provider}'.")
+
+        model = model or self.model
+        if model is None:
+            raise ValueError("Model id not provided.")
+        if not model.startswith(("http://", "https://")):
+            raise ValueError(
+                "Model must be an Inference Endpoint URL. For serverless Inference API, please use `InferenceClient.get_model_status`."
+            )
+        url = model.rstrip("/") + "/health"
+
+        async with self._get_client_session(headers=build_hf_headers(token=self.token)) as client:
+            response = await client.get(url, proxy=self.proxies)
+            return response.status == 200
+
+    @_deprecate_method(
+        version="0.33.0",
+        message=(
+            "HF Inference API is getting revamped and will only support warm models in the future (no cold start allowed)."
+            " Use `HfApi.model_info` to get the model status both with HF Inference API and external providers."
+        ),
+    )
+    async def get_model_status(self, model: Optional[str] = None) -> ModelStatus:
+        """
+        Get the status of a model hosted on the HF Inference API.
+
+        <Tip>
+
+        This endpoint is mostly useful when you already know which model you want to use and want to check its
+        availability. If you want to discover already deployed models, you should rather use [`~InferenceClient.list_deployed_models`].
+
+        </Tip>
+
+        Args:
+            model (`str`, *optional*):
+                Identifier of the model for which the status will be checked. If no model is provided,
+                the model associated with this instance of [`InferenceClient`] will be used. Only the HF Inference API service can be checked, so the
+                identifier cannot be a URL.
+
+
+        Returns:
+            [`ModelStatus`]: An instance of the ModelStatus dataclass, containing information
+                         about the state of the model: loaded, state, compute type and framework.
+
+        Example:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> await client.get_model_status("meta-llama/Meta-Llama-3-8B-Instruct")
+        ModelStatus(loaded=True, state='Loaded', compute_type='gpu', framework='text-generation-inference')
+        ```
+        """
+        if self.provider != "hf-inference":
+            raise ValueError(f"Getting model status is not supported on '{self.provider}'.")
+
+        model = model or self.model
+        if model is None:
+            raise ValueError("Model id not provided.")
+        if model.startswith("https://"):
+            raise NotImplementedError("Model status is only available for Inference API endpoints.")
+        url = f"{constants.INFERENCE_ENDPOINT}/status/{model}"
+
+        async with self._get_client_session(headers=build_hf_headers(token=self.token)) as client:
+            response = await client.get(url, proxy=self.proxies)
+            response.raise_for_status()
+            response_data = await response.json()
+
+        if "error" in response_data:
+            raise ValueError(response_data["error"])
+
+        return ModelStatus(
+            loaded=response_data["loaded"],
+            state=response_data["state"],
+            compute_type=response_data["compute_type"],
+            framework=response_data["framework"],
+        )
+
+    @property
+    def chat(self) -> "ProxyClientChat":
+        return ProxyClientChat(self)
+
+
+class _ProxyClient:
+    """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+    def __init__(self, client: AsyncInferenceClient):
+        self._client = client
+
+
+class ProxyClientChat(_ProxyClient):
+    """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+    @property
+    def completions(self) -> "ProxyClientChatCompletions":
+        return ProxyClientChatCompletions(self._client)
+
+
+class ProxyClientChatCompletions(_ProxyClient):
+    """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client."""
+
+    @property
+    def create(self):
+        return self._client.chat_completion
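The proxy chain above mirrors the OpenAI SDK surface: `client.chat.completions.create(...)` resolves to `AsyncInferenceClient.chat_completion`. A minimal usage sketch follows; the model ID and prompt are placeholders, and a configured token/provider is assumed.

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()  # assumes HF_TOKEN (or equivalent) is configured
    # OpenAI-style call path: client.chat -> ProxyClientChat.completions -> .create
    response = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model ID
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
        max_tokens=64,
    )
    print(response.choices[0].message.content)


asyncio.run(main())
```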
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/__init__.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/__init__.py
new file mode 100644
index 00000000..edbc967c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/__init__.py
@@ -0,0 +1,187 @@
+# This file is auto-generated by `utils/generate_inference_types.py`.
+# Do not modify it manually.
+#
+# ruff: noqa: F401
+
+from .audio_classification import (
+    AudioClassificationInput,
+    AudioClassificationOutputElement,
+    AudioClassificationOutputTransform,
+    AudioClassificationParameters,
+)
+from .audio_to_audio import AudioToAudioInput, AudioToAudioOutputElement
+from .automatic_speech_recognition import (
+    AutomaticSpeechRecognitionEarlyStoppingEnum,
+    AutomaticSpeechRecognitionGenerationParameters,
+    AutomaticSpeechRecognitionInput,
+    AutomaticSpeechRecognitionOutput,
+    AutomaticSpeechRecognitionOutputChunk,
+    AutomaticSpeechRecognitionParameters,
+)
+from .base import BaseInferenceType
+from .chat_completion import (
+    ChatCompletionInput,
+    ChatCompletionInputFunctionDefinition,
+    ChatCompletionInputFunctionName,
+    ChatCompletionInputGrammarType,
+    ChatCompletionInputGrammarTypeType,
+    ChatCompletionInputMessage,
+    ChatCompletionInputMessageChunk,
+    ChatCompletionInputMessageChunkType,
+    ChatCompletionInputStreamOptions,
+    ChatCompletionInputTool,
+    ChatCompletionInputToolChoiceClass,
+    ChatCompletionInputToolChoiceEnum,
+    ChatCompletionInputURL,
+    ChatCompletionOutput,
+    ChatCompletionOutputComplete,
+    ChatCompletionOutputFunctionDefinition,
+    ChatCompletionOutputLogprob,
+    ChatCompletionOutputLogprobs,
+    ChatCompletionOutputMessage,
+    ChatCompletionOutputToolCall,
+    ChatCompletionOutputTopLogprob,
+    ChatCompletionOutputUsage,
+    ChatCompletionStreamOutput,
+    ChatCompletionStreamOutputChoice,
+    ChatCompletionStreamOutputDelta,
+    ChatCompletionStreamOutputDeltaToolCall,
+    ChatCompletionStreamOutputFunction,
+    ChatCompletionStreamOutputLogprob,
+    ChatCompletionStreamOutputLogprobs,
+    ChatCompletionStreamOutputTopLogprob,
+    ChatCompletionStreamOutputUsage,
+)
+from .depth_estimation import DepthEstimationInput, DepthEstimationOutput
+from .document_question_answering import (
+    DocumentQuestionAnsweringInput,
+    DocumentQuestionAnsweringInputData,
+    DocumentQuestionAnsweringOutputElement,
+    DocumentQuestionAnsweringParameters,
+)
+from .feature_extraction import FeatureExtractionInput, FeatureExtractionInputTruncationDirection
+from .fill_mask import FillMaskInput, FillMaskOutputElement, FillMaskParameters
+from .image_classification import (
+    ImageClassificationInput,
+    ImageClassificationOutputElement,
+    ImageClassificationOutputTransform,
+    ImageClassificationParameters,
+)
+from .image_segmentation import (
+    ImageSegmentationInput,
+    ImageSegmentationOutputElement,
+    ImageSegmentationParameters,
+    ImageSegmentationSubtask,
+)
+from .image_to_image import ImageToImageInput, ImageToImageOutput, ImageToImageParameters, ImageToImageTargetSize
+from .image_to_text import (
+    ImageToTextEarlyStoppingEnum,
+    ImageToTextGenerationParameters,
+    ImageToTextInput,
+    ImageToTextOutput,
+    ImageToTextParameters,
+)
+from .object_detection import (
+    ObjectDetectionBoundingBox,
+    ObjectDetectionInput,
+    ObjectDetectionOutputElement,
+    ObjectDetectionParameters,
+)
+from .question_answering import (
+    QuestionAnsweringInput,
+    QuestionAnsweringInputData,
+    QuestionAnsweringOutputElement,
+    QuestionAnsweringParameters,
+)
+from .sentence_similarity import SentenceSimilarityInput, SentenceSimilarityInputData
+from .summarization import (
+    SummarizationInput,
+    SummarizationOutput,
+    SummarizationParameters,
+    SummarizationTruncationStrategy,
+)
+from .table_question_answering import (
+    Padding,
+    TableQuestionAnsweringInput,
+    TableQuestionAnsweringInputData,
+    TableQuestionAnsweringOutputElement,
+    TableQuestionAnsweringParameters,
+)
+from .text2text_generation import (
+    Text2TextGenerationInput,
+    Text2TextGenerationOutput,
+    Text2TextGenerationParameters,
+    Text2TextGenerationTruncationStrategy,
+)
+from .text_classification import (
+    TextClassificationInput,
+    TextClassificationOutputElement,
+    TextClassificationOutputTransform,
+    TextClassificationParameters,
+)
+from .text_generation import (
+    TextGenerationInput,
+    TextGenerationInputGenerateParameters,
+    TextGenerationInputGrammarType,
+    TextGenerationOutput,
+    TextGenerationOutputBestOfSequence,
+    TextGenerationOutputDetails,
+    TextGenerationOutputFinishReason,
+    TextGenerationOutputPrefillToken,
+    TextGenerationOutputToken,
+    TextGenerationStreamOutput,
+    TextGenerationStreamOutputStreamDetails,
+    TextGenerationStreamOutputToken,
+    TypeEnum,
+)
+from .text_to_audio import (
+    TextToAudioEarlyStoppingEnum,
+    TextToAudioGenerationParameters,
+    TextToAudioInput,
+    TextToAudioOutput,
+    TextToAudioParameters,
+)
+from .text_to_image import TextToImageInput, TextToImageOutput, TextToImageParameters
+from .text_to_speech import (
+    TextToSpeechEarlyStoppingEnum,
+    TextToSpeechGenerationParameters,
+    TextToSpeechInput,
+    TextToSpeechOutput,
+    TextToSpeechParameters,
+)
+from .text_to_video import TextToVideoInput, TextToVideoOutput, TextToVideoParameters
+from .token_classification import (
+    TokenClassificationAggregationStrategy,
+    TokenClassificationInput,
+    TokenClassificationOutputElement,
+    TokenClassificationParameters,
+)
+from .translation import TranslationInput, TranslationOutput, TranslationParameters, TranslationTruncationStrategy
+from .video_classification import (
+    VideoClassificationInput,
+    VideoClassificationOutputElement,
+    VideoClassificationOutputTransform,
+    VideoClassificationParameters,
+)
+from .visual_question_answering import (
+    VisualQuestionAnsweringInput,
+    VisualQuestionAnsweringInputData,
+    VisualQuestionAnsweringOutputElement,
+    VisualQuestionAnsweringParameters,
+)
+from .zero_shot_classification import (
+    ZeroShotClassificationInput,
+    ZeroShotClassificationOutputElement,
+    ZeroShotClassificationParameters,
+)
+from .zero_shot_image_classification import (
+    ZeroShotImageClassificationInput,
+    ZeroShotImageClassificationOutputElement,
+    ZeroShotImageClassificationParameters,
+)
+from .zero_shot_object_detection import (
+    ZeroShotObjectDetectionBoundingBox,
+    ZeroShotObjectDetectionInput,
+    ZeroShotObjectDetectionOutputElement,
+    ZeroShotObjectDetectionParameters,
+)
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/audio_classification.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/audio_classification.py
new file mode 100644
index 00000000..05305578
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/audio_classification.py
@@ -0,0 +1,43 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Literal, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+AudioClassificationOutputTransform = Literal["sigmoid", "softmax", "none"]
+
+
+@dataclass_with_extra
+class AudioClassificationParameters(BaseInferenceType):
+    """Additional inference parameters for Audio Classification"""
+
+    function_to_apply: Optional["AudioClassificationOutputTransform"] = None
+    """The function to apply to the model outputs in order to retrieve the scores."""
+    top_k: Optional[int] = None
+    """When specified, limits the output to the top K most probable classes."""
+
+
+@dataclass_with_extra
+class AudioClassificationInput(BaseInferenceType):
+    """Inputs for Audio Classification inference"""
+
+    inputs: str
+    """The input audio data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the audio data as a raw bytes payload.
+    """
+    parameters: Optional[AudioClassificationParameters] = None
+    """Additional inference parameters for Audio Classification"""
+
+
+@dataclass_with_extra
+class AudioClassificationOutputElement(BaseInferenceType):
+    """Outputs for Audio Classification inference"""
+
+    label: str
+    """The predicted class label."""
+    score: float
+    """The corresponding probability."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/audio_to_audio.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/audio_to_audio.py
new file mode 100644
index 00000000..43f376b5
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/audio_to_audio.py
@@ -0,0 +1,30 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class AudioToAudioInput(BaseInferenceType):
+    """Inputs for Audio to Audio inference"""
+
+    inputs: Any
+    """The input audio data"""
+
+
+@dataclass_with_extra
+class AudioToAudioOutputElement(BaseInferenceType):
+    """Outputs of inference for the Audio To Audio task
+    A generated audio file with its label.
+    """
+
+    blob: Any
+    """The generated audio file."""
+    content_type: str
+    """The content type of audio file."""
+    label: str
+    """The label of the audio file."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py
new file mode 100644
index 00000000..1e885b6c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py
@@ -0,0 +1,114 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import List, Literal, Optional, Union
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+AutomaticSpeechRecognitionEarlyStoppingEnum = Literal["never"]
+
+
+@dataclass_with_extra
+class AutomaticSpeechRecognitionGenerationParameters(BaseInferenceType):
+    """Parametrization of the text generation process"""
+
+    do_sample: Optional[bool] = None
+    """Whether to use sampling instead of greedy decoding when generating new tokens."""
+    early_stopping: Optional[Union[bool, "AutomaticSpeechRecognitionEarlyStoppingEnum"]] = None
+    """Controls the stopping condition for beam-based methods."""
+    epsilon_cutoff: Optional[float] = None
+    """If set to float strictly between 0 and 1, only tokens with a conditional probability
+    greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+    3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+    Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+    """
+    eta_cutoff: Optional[float] = None
+    """Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+    float strictly between 0 and 1, a token is only considered if it is greater than either
+    eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+    term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+    the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+    See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+    for more details.
+    """
+    max_length: Optional[int] = None
+    """The maximum length (in tokens) of the generated text, including the input."""
+    max_new_tokens: Optional[int] = None
+    """The maximum number of tokens to generate. Takes precedence over max_length."""
+    min_length: Optional[int] = None
+    """The minimum length (in tokens) of the generated text, including the input."""
+    min_new_tokens: Optional[int] = None
+    """The minimum number of tokens to generate. Takes precedence over min_length."""
+    num_beam_groups: Optional[int] = None
+    """Number of groups to divide num_beams into in order to ensure diversity among different
+    groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+    """
+    num_beams: Optional[int] = None
+    """Number of beams to use for beam search."""
+    penalty_alpha: Optional[float] = None
+    """The value balances the model confidence and the degeneration penalty in contrastive
+    search decoding.
+    """
+    temperature: Optional[float] = None
+    """The value used to modulate the next token probabilities."""
+    top_k: Optional[int] = None
+    """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
+    top_p: Optional[float] = None
+    """If set to float < 1, only the smallest set of most probable tokens with probabilities
+    that add up to top_p or higher are kept for generation.
+    """
+    typical_p: Optional[float] = None
+    """Local typicality measures how similar the conditional probability of predicting a target
+    token next is to the expected conditional probability of predicting a random token next,
+    given the partial text already generated. If set to float < 1, the smallest set of the
+    most locally typical tokens with probabilities that add up to typical_p or higher are
+    kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+    """
+    use_cache: Optional[bool] = None
+    """Whether the model should use the past last key/values attentions to speed up decoding"""
+
+
+@dataclass_with_extra
+class AutomaticSpeechRecognitionParameters(BaseInferenceType):
+    """Additional inference parameters for Automatic Speech Recognition"""
+
+    return_timestamps: Optional[bool] = None
+    """Whether to output corresponding timestamps with the generated text"""
+    # Will be deprecated in the future when the renaming to `generation_parameters` is implemented in transformers
+    generate_kwargs: Optional[AutomaticSpeechRecognitionGenerationParameters] = None
+    """Parametrization of the text generation process"""
+
+
+@dataclass_with_extra
+class AutomaticSpeechRecognitionInput(BaseInferenceType):
+    """Inputs for Automatic Speech Recognition inference"""
+
+    inputs: str
+    """The input audio data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the audio data as a raw bytes payload.
+    """
+    parameters: Optional[AutomaticSpeechRecognitionParameters] = None
+    """Additional inference parameters for Automatic Speech Recognition"""
+
+
+@dataclass_with_extra
+class AutomaticSpeechRecognitionOutputChunk(BaseInferenceType):
+    text: str
+    """A chunk of text identified by the model"""
+    timestamp: List[float]
+    """The start and end timestamps corresponding with the text"""
+
+
+@dataclass_with_extra
+class AutomaticSpeechRecognitionOutput(BaseInferenceType):
+    """Outputs of inference for the Automatic Speech Recognition task"""
+
+    text: str
+    """The recognized text."""
+    chunks: Optional[List[AutomaticSpeechRecognitionOutputChunk]] = None
+    """When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
+    the model.
+    """
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/base.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/base.py
new file mode 100644
index 00000000..1f0c4687
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/base.py
@@ -0,0 +1,161 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains a base class for all inference types."""
+
+import inspect
+import json
+from dataclasses import asdict, dataclass
+from typing import Any, Dict, List, Type, TypeVar, Union, get_args
+
+
+T = TypeVar("T", bound="BaseInferenceType")
+
+
+def _repr_with_extra(self):
+    fields = list(self.__dataclass_fields__.keys())
+    other_fields = list(k for k in self.__dict__ if k not in fields)
+    return f"{self.__class__.__name__}({', '.join(f'{k}={self.__dict__[k]!r}' for k in fields + other_fields)})"
+
+
+def dataclass_with_extra(cls: Type[T]) -> Type[T]:
+    """Decorator to add a custom __repr__ method to a dataclass, showing all fields, including extra ones.
+
+    This decorator only works with dataclasses that inherit from `BaseInferenceType`.
+    """
+    cls = dataclass(cls)
+    cls.__repr__ = _repr_with_extra  # type: ignore[method-assign]
+    return cls
+
+
+@dataclass
+class BaseInferenceType(dict):
+    """Base class for all inference types.
+
+    The object is both a dataclass and a dict for backward compatibility, but the plan is to remove the dict part in the future.
+
+    Handle parsing from dict, list and json strings in a permissive way to ensure future-compatibility (e.g. all fields
+    are made optional, and non-expected fields are added as dict attributes).
+    """
+
+    @classmethod
+    def parse_obj_as_list(cls: Type[T], data: Union[bytes, str, List, Dict]) -> List[T]:
+        """Alias to parse server response and return a single instance.
+
+        See `parse_obj` for more details.
+        """
+        output = cls.parse_obj(data)
+        if not isinstance(output, list):
+            raise ValueError(f"Invalid input data for {cls}. Expected a list, but got {type(output)}.")
+        return output
+
+    @classmethod
+    def parse_obj_as_instance(cls: Type[T], data: Union[bytes, str, List, Dict]) -> T:
+        """Alias to parse server response and return a single instance.
+
+        See `parse_obj` for more details.
+        """
+        output = cls.parse_obj(data)
+        if isinstance(output, list):
+            raise ValueError(f"Invalid input data for {cls}. Expected a single instance, but got a list.")
+        return output
+
+    @classmethod
+    def parse_obj(cls: Type[T], data: Union[bytes, str, List, Dict]) -> Union[List[T], T]:
+        """Parse server response as a dataclass or list of dataclasses.
+
+        To enable future-compatibility, we want to handle cases where the server returns more fields than expected.
+        In such cases, we don't want to raise an error but still create the dataclass object. Remaining fields are
+        added as dict attributes.
+        """
+        # Parse server response (from bytes)
+        if isinstance(data, bytes):
+            data = data.decode()
+        if isinstance(data, str):
+            data = json.loads(data)
+
+        # If a list, parse each item individually
+        if isinstance(data, List):
+            return [cls.parse_obj(d) for d in data]  # type: ignore [misc]
+
+        # At this point, we expect a dict
+        if not isinstance(data, dict):
+            raise ValueError(f"Invalid data type: {type(data)}")
+
+        init_values = {}
+        other_values = {}
+        for key, value in data.items():
+            key = normalize_key(key)
+            if key in cls.__dataclass_fields__ and cls.__dataclass_fields__[key].init:
+                if isinstance(value, dict) or isinstance(value, list):
+                    field_type = cls.__dataclass_fields__[key].type
+
+                    # if `field_type` is a `BaseInferenceType`, parse it
+                    if inspect.isclass(field_type) and issubclass(field_type, BaseInferenceType):
+                        value = field_type.parse_obj(value)
+
+                    # otherwise, recursively parse nested dataclasses (if possible)
+                    # `get_args` handles Union and Optional for us
+                    else:
+                        expected_types = get_args(field_type)
+                        for expected_type in expected_types:
+                            if getattr(expected_type, "_name", None) == "List":
+                                expected_type = get_args(expected_type)[
+                                    0
+                                ]  # assume same type for all items in the list
+                            if inspect.isclass(expected_type) and issubclass(expected_type, BaseInferenceType):
+                                value = expected_type.parse_obj(value)
+                                break
+                init_values[key] = value
+            else:
+                other_values[key] = value
+
+        # Make all missing fields default to None
+        # => ensure that dataclass initialization will never fail even if the server does not return all fields.
+        for key in cls.__dataclass_fields__:
+            if key not in init_values:
+                init_values[key] = None
+
+        # Initialize dataclass with expected values
+        item = cls(**init_values)
+
+        # Add remaining fields as dict attributes
+        item.update(other_values)
+
+        # Add remaining fields as extra dataclass fields.
+        # They won't be part of the dataclass fields but will be accessible as attributes.
+        # Use @dataclass_with_extra to show them in __repr__.
+        item.__dict__.update(other_values)
+        return item
+
+    def __post_init__(self):
+        self.update(asdict(self))
+
+    def __setitem__(self, __key: Any, __value: Any) -> None:
+        # Hacky way to keep dataclass values in sync when dict is updated
+        super().__setitem__(__key, __value)
+        if __key in self.__dataclass_fields__ and getattr(self, __key, None) != __value:
+            self.__setattr__(__key, __value)
+        return
+
+    def __setattr__(self, __name: str, __value: Any) -> None:
+        # Hacky way to keep dict values in sync when dataclass is updated
+        super().__setattr__(__name, __value)
+        if self.get(__name) != __value:
+            self[__name] = __value
+        return
+
+
+def normalize_key(key: str) -> str:
+    # e.g "content-type" -> "content_type", "Accept" -> "accept"
+    return key.replace("-", "_").replace(" ", "_").lower()
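A short illustration of the permissive parsing described above, using a hypothetical `ExampleOutput` subclass (not part of the library) and the private `base` module shown in this file: unknown server fields do not break parsing and stay accessible afterwards.

```py
from typing import Optional

from huggingface_hub.inference._generated.types.base import (
    BaseInferenceType,
    dataclass_with_extra,
)


@dataclass_with_extra
class ExampleOutput(BaseInferenceType):
    """Hypothetical output type used only for illustration."""

    label: str
    score: Optional[float] = None


# The server payload contains an unexpected "rank" field: parsing still succeeds,
# the extra field is kept as a dict entry / attribute and shows up in __repr__.
item = ExampleOutput.parse_obj_as_instance(b'{"label": "dog", "score": 0.9, "rank": 1}')
print(item.label, item.score, item.rank)  # dog 0.9 1
print(item["label"])                      # dict-style access still works
```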
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/chat_completion.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/chat_completion.py
new file mode 100644
index 00000000..a817d3f8
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/chat_completion.py
@@ -0,0 +1,301 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, List, Literal, Optional, Union
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class ChatCompletionInputURL(BaseInferenceType):
+    url: str
+
+
+ChatCompletionInputMessageChunkType = Literal["text", "image_url"]
+
+
+@dataclass_with_extra
+class ChatCompletionInputMessageChunk(BaseInferenceType):
+    type: "ChatCompletionInputMessageChunkType"
+    image_url: Optional[ChatCompletionInputURL] = None
+    text: Optional[str] = None
+
+
+@dataclass_with_extra
+class ChatCompletionInputMessage(BaseInferenceType):
+    content: Union[List[ChatCompletionInputMessageChunk], str]
+    role: str
+    name: Optional[str] = None
+
+
+ChatCompletionInputGrammarTypeType = Literal["json", "regex"]
+
+
+@dataclass_with_extra
+class ChatCompletionInputGrammarType(BaseInferenceType):
+    type: "ChatCompletionInputGrammarTypeType"
+    value: Any
+    """A string that represents a [JSON Schema](https://json-schema.org/).
+    JSON Schema is a declarative language that allows you to annotate JSON documents
+    with types and descriptions.
+    """
+
+
+@dataclass_with_extra
+class ChatCompletionInputStreamOptions(BaseInferenceType):
+    include_usage: bool
+    """If set, an additional chunk will be streamed before the data: [DONE] message. The usage
+    field on this chunk shows the token usage statistics for the entire request, and the
+    choices field will always be an empty array. All other chunks will also include a usage
+    field, but with a null value.
+    """
+
+
+@dataclass_with_extra
+class ChatCompletionInputFunctionName(BaseInferenceType):
+    name: str
+
+
+@dataclass_with_extra
+class ChatCompletionInputToolChoiceClass(BaseInferenceType):
+    function: ChatCompletionInputFunctionName
+
+
+ChatCompletionInputToolChoiceEnum = Literal["auto", "none", "required"]
+
+
+@dataclass_with_extra
+class ChatCompletionInputFunctionDefinition(BaseInferenceType):
+    arguments: Any
+    name: str
+    description: Optional[str] = None
+
+
+@dataclass_with_extra
+class ChatCompletionInputTool(BaseInferenceType):
+    function: ChatCompletionInputFunctionDefinition
+    type: str
+
+
+@dataclass_with_extra
+class ChatCompletionInput(BaseInferenceType):
+    """Chat Completion Input.
+    Auto-generated from TGI specs.
+    For more details, check out
+    https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tgi-import.ts.
+    """
+
+    messages: List[ChatCompletionInputMessage]
+    """A list of messages comprising the conversation so far."""
+    frequency_penalty: Optional[float] = None
+    """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing
+    frequency in the text so far,
+    decreasing the model's likelihood to repeat the same line verbatim.
+    """
+    logit_bias: Optional[List[float]] = None
+    """UNUSED
+    Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON
+    object that maps tokens
+    (specified by their token ID in the tokenizer) to an associated bias value from -100 to
+    100. Mathematically,
+    the bias is added to the logits generated by the model prior to sampling. The exact
+    effect will vary per model,
+    but values between -1 and 1 should decrease or increase likelihood of selection; values
+    like -100 or 100 should
+    result in a ban or exclusive selection of the relevant token.
+    """
+    logprobs: Optional[bool] = None
+    """Whether to return log probabilities of the output tokens or not. If true, returns the log
+    probabilities of each
+    output token returned in the content of message.
+    """
+    max_tokens: Optional[int] = None
+    """The maximum number of tokens that can be generated in the chat completion."""
+    model: Optional[str] = None
+    """[UNUSED] ID of the model to use. See the model endpoint compatibility table for details
+    on which models work with the Chat API.
+    """
+    n: Optional[int] = None
+    """UNUSED
+    How many chat completion choices to generate for each input message. Note that you will
+    be charged based on the
+    number of generated tokens across all of the choices. Keep n as 1 to minimize costs.
+    """
+    presence_penalty: Optional[float] = None
+    """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they
+    appear in the text so far,
+    increasing the model's likelihood to talk about new topics
+    """
+    response_format: Optional[ChatCompletionInputGrammarType] = None
+    seed: Optional[int] = None
+    stop: Optional[List[str]] = None
+    """Up to 4 sequences where the API will stop generating further tokens."""
+    stream: Optional[bool] = None
+    stream_options: Optional[ChatCompletionInputStreamOptions] = None
+    temperature: Optional[float] = None
+    """What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the
+    output more random, while
+    lower values like 0.2 will make it more focused and deterministic.
+    We generally recommend altering this or `top_p` but not both.
+    """
+    tool_choice: Optional[Union[ChatCompletionInputToolChoiceClass, "ChatCompletionInputToolChoiceEnum"]] = None
+    tool_prompt: Optional[str] = None
+    """A prompt to be appended before the tools"""
+    tools: Optional[List[ChatCompletionInputTool]] = None
+    """A list of tools the model may call. Currently, only functions are supported as a tool.
+    Use this to provide a list of
+    functions the model may generate JSON inputs for.
+    """
+    top_logprobs: Optional[int] = None
+    """An integer between 0 and 5 specifying the number of most likely tokens to return at each
+    token position, each with
+    an associated log probability. logprobs must be set to true if this parameter is used.
+    """
+    top_p: Optional[float] = None
+    """An alternative to sampling with temperature, called nucleus sampling, where the model
+    considers the results of the
+    tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10%
+    probability mass are considered.
+    """
+
+
+@dataclass_with_extra
+class ChatCompletionOutputTopLogprob(BaseInferenceType):
+    logprob: float
+    token: str
+
+
+@dataclass_with_extra
+class ChatCompletionOutputLogprob(BaseInferenceType):
+    logprob: float
+    token: str
+    top_logprobs: List[ChatCompletionOutputTopLogprob]
+
+
+@dataclass_with_extra
+class ChatCompletionOutputLogprobs(BaseInferenceType):
+    content: List[ChatCompletionOutputLogprob]
+
+
+@dataclass_with_extra
+class ChatCompletionOutputFunctionDefinition(BaseInferenceType):
+    arguments: Any
+    name: str
+    description: Optional[str] = None
+
+
+@dataclass_with_extra
+class ChatCompletionOutputToolCall(BaseInferenceType):
+    function: ChatCompletionOutputFunctionDefinition
+    id: str
+    type: str
+
+
+@dataclass_with_extra
+class ChatCompletionOutputMessage(BaseInferenceType):
+    role: str
+    content: Optional[str] = None
+    tool_calls: Optional[List[ChatCompletionOutputToolCall]] = None
+
+
+@dataclass_with_extra
+class ChatCompletionOutputComplete(BaseInferenceType):
+    finish_reason: str
+    index: int
+    message: ChatCompletionOutputMessage
+    logprobs: Optional[ChatCompletionOutputLogprobs] = None
+
+
+@dataclass_with_extra
+class ChatCompletionOutputUsage(BaseInferenceType):
+    completion_tokens: int
+    prompt_tokens: int
+    total_tokens: int
+
+
+@dataclass_with_extra
+class ChatCompletionOutput(BaseInferenceType):
+    """Chat Completion Output.
+    Auto-generated from TGI specs.
+    For more details, check out
+    https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tgi-import.ts.
+    """
+
+    choices: List[ChatCompletionOutputComplete]
+    created: int
+    id: str
+    model: str
+    system_fingerprint: str
+    usage: ChatCompletionOutputUsage
+
+
+@dataclass_with_extra
+class ChatCompletionStreamOutputFunction(BaseInferenceType):
+    arguments: str
+    name: Optional[str] = None
+
+
+@dataclass_with_extra
+class ChatCompletionStreamOutputDeltaToolCall(BaseInferenceType):
+    function: ChatCompletionStreamOutputFunction
+    id: str
+    index: int
+    type: str
+
+
+@dataclass_with_extra
+class ChatCompletionStreamOutputDelta(BaseInferenceType):
+    role: str
+    content: Optional[str] = None
+    tool_calls: Optional[ChatCompletionStreamOutputDeltaToolCall] = None
+
+
+@dataclass_with_extra
+class ChatCompletionStreamOutputTopLogprob(BaseInferenceType):
+    logprob: float
+    token: str
+
+
+@dataclass_with_extra
+class ChatCompletionStreamOutputLogprob(BaseInferenceType):
+    logprob: float
+    token: str
+    top_logprobs: List[ChatCompletionStreamOutputTopLogprob]
+
+
+@dataclass_with_extra
+class ChatCompletionStreamOutputLogprobs(BaseInferenceType):
+    content: List[ChatCompletionStreamOutputLogprob]
+
+
+@dataclass_with_extra
+class ChatCompletionStreamOutputChoice(BaseInferenceType):
+    delta: ChatCompletionStreamOutputDelta
+    index: int
+    finish_reason: Optional[str] = None
+    logprobs: Optional[ChatCompletionStreamOutputLogprobs] = None
+
+
+@dataclass_with_extra
+class ChatCompletionStreamOutputUsage(BaseInferenceType):
+    completion_tokens: int
+    prompt_tokens: int
+    total_tokens: int
+
+
+@dataclass_with_extra
+class ChatCompletionStreamOutput(BaseInferenceType):
+    """Chat Completion Stream Output.
+    Auto-generated from TGI specs.
+    For more details, check out
+    https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tgi-import.ts.
+    """
+
+    choices: List[ChatCompletionStreamOutputChoice]
+    created: int
+    id: str
+    model: str
+    system_fingerprint: str
+    usage: Optional[ChatCompletionStreamOutputUsage] = None
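`ChatCompletionInputGrammarType` pairs a grammar `type` with a `value` holding the schema itself. The sketch below only shows the data shape of such a payload; whether a given model or provider honors a particular schema is an assumption, not a guarantee.

```py
from huggingface_hub.inference._generated.types import ChatCompletionInputGrammarType

# "json" grammar: `value` carries a JSON Schema describing the expected output.
response_format = ChatCompletionInputGrammarType(
    type="json",
    value={
        "type": "object",
        "properties": {"answer": {"type": "string"}},
        "required": ["answer"],
    },
)
print(response_format.type)   # 'json'
print(response_format.value)  # the JSON Schema dict
```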
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/depth_estimation.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/depth_estimation.py
new file mode 100644
index 00000000..1e09bdff
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/depth_estimation.py
@@ -0,0 +1,28 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Dict, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class DepthEstimationInput(BaseInferenceType):
+    """Inputs for Depth Estimation inference"""
+
+    inputs: Any
+    """The input image data"""
+    parameters: Optional[Dict[str, Any]] = None
+    """Additional inference parameters for Depth Estimation"""
+
+
+@dataclass_with_extra
+class DepthEstimationOutput(BaseInferenceType):
+    """Outputs of inference for the Depth Estimation task"""
+
+    depth: Any
+    """The predicted depth as an image"""
+    predicted_depth: Any
+    """The predicted depth as a tensor"""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/document_question_answering.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/document_question_answering.py
new file mode 100644
index 00000000..2457d2c8
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/document_question_answering.py
@@ -0,0 +1,80 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, List, Optional, Union
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class DocumentQuestionAnsweringInputData(BaseInferenceType):
+    """One (document, question) pair to answer"""
+
+    image: Any
+    """The image on which the question is asked"""
+    question: str
+    """A question to ask of the document"""
+
+
+@dataclass_with_extra
+class DocumentQuestionAnsweringParameters(BaseInferenceType):
+    """Additional inference parameters for Document Question Answering"""
+
+    doc_stride: Optional[int] = None
+    """If the words in the document are too long to fit with the question for the model, it will
+    be split in several chunks with some overlap. This argument controls the size of that
+    overlap.
+    """
+    handle_impossible_answer: Optional[bool] = None
+    """Whether to accept impossible as an answer"""
+    lang: Optional[str] = None
+    """Language to use while running OCR. Defaults to english."""
+    max_answer_len: Optional[int] = None
+    """The maximum length of predicted answers (e.g., only answers with a shorter length are
+    considered).
+    """
+    max_question_len: Optional[int] = None
+    """The maximum length of the question after tokenization. It will be truncated if needed."""
+    max_seq_len: Optional[int] = None
+    """The maximum length of the total sentence (context + question) in tokens of each chunk
+    passed to the model. The context will be split in several chunks (using doc_stride as
+    overlap) if needed.
+    """
+    top_k: Optional[int] = None
+    """The number of answers to return (will be chosen by order of likelihood). Can return less
+    than top_k answers if there are not enough options available within the context.
+    """
+    word_boxes: Optional[List[Union[List[float], str]]] = None
+    """A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+    skip the OCR step and use the provided bounding boxes instead.
+    """
+
+
+@dataclass_with_extra
+class DocumentQuestionAnsweringInput(BaseInferenceType):
+    """Inputs for Document Question Answering inference"""
+
+    inputs: DocumentQuestionAnsweringInputData
+    """One (document, question) pair to answer"""
+    parameters: Optional[DocumentQuestionAnsweringParameters] = None
+    """Additional inference parameters for Document Question Answering"""
+
+
+@dataclass_with_extra
+class DocumentQuestionAnsweringOutputElement(BaseInferenceType):
+    """Outputs of inference for the Document Question Answering task"""
+
+    answer: str
+    """The answer to the question."""
+    end: int
+    """The end word index of the answer (in the OCR’d version of the input or provided word
+    boxes).
+    """
+    score: float
+    """The probability associated to the answer."""
+    start: int
+    """The start word index of the answer (in the OCR’d version of the input or provided word
+    boxes).
+    """
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/feature_extraction.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/feature_extraction.py
new file mode 100644
index 00000000..e965ddba
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/feature_extraction.py
@@ -0,0 +1,36 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import List, Literal, Optional, Union
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+FeatureExtractionInputTruncationDirection = Literal["Left", "Right"]
+
+
+@dataclass_with_extra
+class FeatureExtractionInput(BaseInferenceType):
+    """Feature Extraction Input.
+    Auto-generated from TEI specs.
+    For more details, check out
+    https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.
+    """
+
+    inputs: Union[List[str], str]
+    """The text or list of texts to embed."""
+    normalize: Optional[bool] = None
+    prompt_name: Optional[str] = None
+    """The name of the prompt that should be used by for encoding. If not set, no prompt
+    will be applied.
+    Must be a key in the `sentence-transformers` configuration `prompts` dictionary.
+    For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ",
+    ...},
+    then the sentence "What is the capital of France?" will be encoded as
+    "query: What is the capital of France?" because the prompt text will be prepended before
+    any text to encode.
+    """
+    truncate: Optional[bool] = None
+    truncation_direction: Optional["FeatureExtractionInputTruncationDirection"] = None
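
`inputs` here accepts either a single string or a list of strings, and `truncation_direction` is restricted to the `Literal` values declared above. A small sketch with made-up sentences:

    from huggingface_hub.inference._generated.types.feature_extraction import FeatureExtractionInput

    # Sentences and settings are illustrative.
    payload = FeatureExtractionInput(
        inputs=["What is the capital of France?", "Paris is the capital of France."],
        normalize=True,
        truncate=True,
        truncation_direction="Right",
    )
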
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/fill_mask.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/fill_mask.py
new file mode 100644
index 00000000..dfcdc56b
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/fill_mask.py
@@ -0,0 +1,47 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, List, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class FillMaskParameters(BaseInferenceType):
+    """Additional inference parameters for Fill Mask"""
+
+    targets: Optional[List[str]] = None
+    """When passed, the model will limit the scores to the passed targets instead of looking up
+    in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+    tokenized and the first resulting token will be used (with a warning, and that might be
+    slower).
+    """
+    top_k: Optional[int] = None
+    """When passed, overrides the number of predictions to return."""
+
+
+@dataclass_with_extra
+class FillMaskInput(BaseInferenceType):
+    """Inputs for Fill Mask inference"""
+
+    inputs: str
+    """The text with masked tokens"""
+    parameters: Optional[FillMaskParameters] = None
+    """Additional inference parameters for Fill Mask"""
+
+
+@dataclass_with_extra
+class FillMaskOutputElement(BaseInferenceType):
+    """Outputs of inference for the Fill Mask task"""
+
+    score: float
+    """The corresponding probability"""
+    sequence: str
+    """The corresponding input with the mask token prediction."""
+    token: int
+    """The predicted token id (to replace the masked one)."""
+    token_str: Any
+    fill_mask_output_token_str: Optional[str] = None
+    """The predicted token (to replace the masked one)."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_classification.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_classification.py
new file mode 100644
index 00000000..0fdda6c8
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_classification.py
@@ -0,0 +1,43 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Literal, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+ImageClassificationOutputTransform = Literal["sigmoid", "softmax", "none"]
+
+
+@dataclass_with_extra
+class ImageClassificationParameters(BaseInferenceType):
+    """Additional inference parameters for Image Classification"""
+
+    function_to_apply: Optional["ImageClassificationOutputTransform"] = None
+    """The function to apply to the model outputs in order to retrieve the scores."""
+    top_k: Optional[int] = None
+    """When specified, limits the output to the top K most probable classes."""
+
+
+@dataclass_with_extra
+class ImageClassificationInput(BaseInferenceType):
+    """Inputs for Image Classification inference"""
+
+    inputs: str
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload.
+    """
+    parameters: Optional[ImageClassificationParameters] = None
+    """Additional inference parameters for Image Classification"""
+
+
+@dataclass_with_extra
+class ImageClassificationOutputElement(BaseInferenceType):
+    """Outputs of inference for the Image Classification task"""
+
+    label: str
+    """The predicted class label."""
+    score: float
+    """The corresponding probability."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_segmentation.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_segmentation.py
new file mode 100644
index 00000000..3dbf61db
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_segmentation.py
@@ -0,0 +1,51 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Literal, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+ImageSegmentationSubtask = Literal["instance", "panoptic", "semantic"]
+
+
+@dataclass_with_extra
+class ImageSegmentationParameters(BaseInferenceType):
+    """Additional inference parameters for Image Segmentation"""
+
+    mask_threshold: Optional[float] = None
+    """Threshold to use when turning the predicted masks into binary values."""
+    overlap_mask_area_threshold: Optional[float] = None
+    """Mask overlap threshold to eliminate small, disconnected segments."""
+    subtask: Optional["ImageSegmentationSubtask"] = None
+    """Segmentation task to be performed, depending on model capabilities."""
+    threshold: Optional[float] = None
+    """Probability threshold to filter out predicted masks."""
+
+
+@dataclass_with_extra
+class ImageSegmentationInput(BaseInferenceType):
+    """Inputs for Image Segmentation inference"""
+
+    inputs: str
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload.
+    """
+    parameters: Optional[ImageSegmentationParameters] = None
+    """Additional inference parameters for Image Segmentation"""
+
+
+@dataclass_with_extra
+class ImageSegmentationOutputElement(BaseInferenceType):
+    """Outputs of inference for the Image Segmentation task
+    A predicted mask / segment
+    """
+
+    label: str
+    """The label of the predicted segment."""
+    mask: str
+    """The corresponding mask as a black-and-white image (base64-encoded)."""
+    score: Optional[float] = None
+    """The score or confidence degree the model has."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_to_image.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_to_image.py
new file mode 100644
index 00000000..0deb0374
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_to_image.py
@@ -0,0 +1,54 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class ImageToImageTargetSize(BaseInferenceType):
+    """The size in pixel of the output image."""
+
+    height: int
+    width: int
+
+
+@dataclass_with_extra
+class ImageToImageParameters(BaseInferenceType):
+    """Additional inference parameters for Image To Image"""
+
+    guidance_scale: Optional[float] = None
+    """For diffusion models. A higher guidance scale value encourages the model to generate
+    images closely linked to the text prompt at the expense of lower image quality.
+    """
+    negative_prompt: Optional[str] = None
+    """One prompt to guide what NOT to include in image generation."""
+    num_inference_steps: Optional[int] = None
+    """For diffusion models. The number of denoising steps. More denoising steps usually lead to
+    a higher quality image at the expense of slower inference.
+    """
+    target_size: Optional[ImageToImageTargetSize] = None
+    """The size in pixel of the output image."""
+
+
+@dataclass_with_extra
+class ImageToImageInput(BaseInferenceType):
+    """Inputs for Image To Image inference"""
+
+    inputs: str
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload.
+    """
+    parameters: Optional[ImageToImageParameters] = None
+    """Additional inference parameters for Image To Image"""
+
+
+@dataclass_with_extra
+class ImageToImageOutput(BaseInferenceType):
+    """Outputs of inference for the Image To Image task"""
+
+    image: Any
+    """The output image returned as raw bytes in the payload."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_to_text.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_to_text.py
new file mode 100644
index 00000000..550d95e1
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/image_to_text.py
@@ -0,0 +1,101 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Literal, Optional, Union
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+ImageToTextEarlyStoppingEnum = Literal["never"]
+
+
+@dataclass_with_extra
+class ImageToTextGenerationParameters(BaseInferenceType):
+    """Parametrization of the text generation process"""
+
+    do_sample: Optional[bool] = None
+    """Whether to use sampling instead of greedy decoding when generating new tokens."""
+    early_stopping: Optional[Union[bool, "ImageToTextEarlyStoppingEnum"]] = None
+    """Controls the stopping condition for beam-based methods."""
+    epsilon_cutoff: Optional[float] = None
+    """If set to float strictly between 0 and 1, only tokens with a conditional probability
+    greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+    3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+    Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+    """
+    eta_cutoff: Optional[float] = None
+    """Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+    float strictly between 0 and 1, a token is only considered if it is greater than either
+    eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+    term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+    the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+    See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+    for more details.
+    """
+    max_length: Optional[int] = None
+    """The maximum length (in tokens) of the generated text, including the input."""
+    max_new_tokens: Optional[int] = None
+    """The maximum number of tokens to generate. Takes precedence over max_length."""
+    min_length: Optional[int] = None
+    """The minimum length (in tokens) of the generated text, including the input."""
+    min_new_tokens: Optional[int] = None
+    """The minimum number of tokens to generate. Takes precedence over min_length."""
+    num_beam_groups: Optional[int] = None
+    """Number of groups to divide num_beams into in order to ensure diversity among different
+    groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+    """
+    num_beams: Optional[int] = None
+    """Number of beams to use for beam search."""
+    penalty_alpha: Optional[float] = None
+    """The value balances the model confidence and the degeneration penalty in contrastive
+    search decoding.
+    """
+    temperature: Optional[float] = None
+    """The value used to modulate the next token probabilities."""
+    top_k: Optional[int] = None
+    """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
+    top_p: Optional[float] = None
+    """If set to float < 1, only the smallest set of most probable tokens with probabilities
+    that add up to top_p or higher are kept for generation.
+    """
+    typical_p: Optional[float] = None
+    """Local typicality measures how similar the conditional probability of predicting a target
+    token next is to the expected conditional probability of predicting a random token next,
+    given the partial text already generated. If set to float < 1, the smallest set of the
+    most locally typical tokens with probabilities that add up to typical_p or higher are
+    kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+    """
+    use_cache: Optional[bool] = None
+    """Whether the model should use the past last key/values attentions to speed up decoding"""
+
+
+@dataclass_with_extra
+class ImageToTextParameters(BaseInferenceType):
+    """Additional inference parameters for Image To Text"""
+
+    max_new_tokens: Optional[int] = None
+    """The amount of maximum tokens to generate."""
+    # Will be deprecated in the future when the renaming to `generation_parameters` is implemented in transformers
+    generate_kwargs: Optional[ImageToTextGenerationParameters] = None
+    """Parametrization of the text generation process"""
+
+
+@dataclass_with_extra
+class ImageToTextInput(BaseInferenceType):
+    """Inputs for Image To Text inference"""
+
+    inputs: Any
+    """The input image data"""
+    parameters: Optional[ImageToTextParameters] = None
+    """Additional inference parameters for Image To Text"""
+
+
+@dataclass_with_extra
+class ImageToTextOutput(BaseInferenceType):
+    """Outputs of inference for the Image To Text task"""
+
+    generated_text: Any
+    image_to_text_output_generated_text: Optional[str] = None
+    """The generated text."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/object_detection.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/object_detection.py
new file mode 100644
index 00000000..75f3ebcf
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/object_detection.py
@@ -0,0 +1,58 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class ObjectDetectionParameters(BaseInferenceType):
+    """Additional inference parameters for Object Detection"""
+
+    threshold: Optional[float] = None
+    """The probability necessary to make a prediction."""
+
+
+@dataclass_with_extra
+class ObjectDetectionInput(BaseInferenceType):
+    """Inputs for Object Detection inference"""
+
+    inputs: str
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload.
+    """
+    parameters: Optional[ObjectDetectionParameters] = None
+    """Additional inference parameters for Object Detection"""
+
+
+@dataclass_with_extra
+class ObjectDetectionBoundingBox(BaseInferenceType):
+    """The predicted bounding box. Coordinates are relative to the top left corner of the input
+    image.
+    """
+
+    xmax: int
+    """The x-coordinate of the bottom-right corner of the bounding box."""
+    xmin: int
+    """The x-coordinate of the top-left corner of the bounding box."""
+    ymax: int
+    """The y-coordinate of the bottom-right corner of the bounding box."""
+    ymin: int
+    """The y-coordinate of the top-left corner of the bounding box."""
+
+
+@dataclass_with_extra
+class ObjectDetectionOutputElement(BaseInferenceType):
+    """Outputs of inference for the Object Detection task"""
+
+    box: ObjectDetectionBoundingBox
+    """The predicted bounding box. Coordinates are relative to the top left corner of the input
+    image.
+    """
+    label: str
+    """The predicted label for the bounding box."""
+    score: float
+    """The associated score / probability."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/question_answering.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/question_answering.py
new file mode 100644
index 00000000..014ab418
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/question_answering.py
@@ -0,0 +1,74 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class QuestionAnsweringInputData(BaseInferenceType):
+    """One (context, question) pair to answer"""
+
+    context: str
+    """The context to be used for answering the question"""
+    question: str
+    """The question to be answered"""
+
+
+@dataclass_with_extra
+class QuestionAnsweringParameters(BaseInferenceType):
+    """Additional inference parameters for Question Answering"""
+
+    align_to_words: Optional[bool] = None
+    """Attempts to align the answer to real words. Improves quality on space separated
+    languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+    """
+    doc_stride: Optional[int] = None
+    """If the context is too long to fit with the question for the model, it will be split in
+    several chunks with some overlap. This argument controls the size of that overlap.
+    """
+    handle_impossible_answer: Optional[bool] = None
+    """Whether to accept impossible as an answer."""
+    max_answer_len: Optional[int] = None
+    """The maximum length of predicted answers (e.g., only answers with a shorter length are
+    considered).
+    """
+    max_question_len: Optional[int] = None
+    """The maximum length of the question after tokenization. It will be truncated if needed."""
+    max_seq_len: Optional[int] = None
+    """The maximum length of the total sentence (context + question) in tokens of each chunk
+    passed to the model. The context will be split into several chunks (using doc_stride as
+    overlap) if needed.
+    """
+    top_k: Optional[int] = None
+    """The number of answers to return (will be chosen by order of likelihood). Note that we
+    return less than topk answers if there are not enough options available within the
+    context.
+    """
+
+
+@dataclass_with_extra
+class QuestionAnsweringInput(BaseInferenceType):
+    """Inputs for Question Answering inference"""
+
+    inputs: QuestionAnsweringInputData
+    """One (context, question) pair to answer"""
+    parameters: Optional[QuestionAnsweringParameters] = None
+    """Additional inference parameters for Question Answering"""
+
+
+@dataclass_with_extra
+class QuestionAnsweringOutputElement(BaseInferenceType):
+    """Outputs of inference for the Question Answering task"""
+
+    answer: str
+    """The answer to the question."""
+    end: int
+    """The character position in the input where the answer ends."""
+    score: float
+    """The probability associated to the answer."""
+    start: int
+    """The character position in the input where the answer begins."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/sentence_similarity.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/sentence_similarity.py
new file mode 100644
index 00000000..66e8bb4d
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/sentence_similarity.py
@@ -0,0 +1,27 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Dict, List, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class SentenceSimilarityInputData(BaseInferenceType):
+    sentences: List[str]
+    """A list of strings which will be compared against the source_sentence."""
+    source_sentence: str
+    """The string that you wish to compare the other strings with. This can be a phrase,
+    sentence, or longer passage, depending on the model being used.
+    """
+
+
+@dataclass_with_extra
+class SentenceSimilarityInput(BaseInferenceType):
+    """Inputs for Sentence similarity inference"""
+
+    inputs: SentenceSimilarityInputData
+    parameters: Optional[Dict[str, Any]] = None
+    """Additional inference parameters for Sentence Similarity"""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/summarization.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/summarization.py
new file mode 100644
index 00000000..33eae6fc
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/summarization.py
@@ -0,0 +1,41 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Dict, Literal, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+SummarizationTruncationStrategy = Literal["do_not_truncate", "longest_first", "only_first", "only_second"]
+
+
+@dataclass_with_extra
+class SummarizationParameters(BaseInferenceType):
+    """Additional inference parameters for summarization."""
+
+    clean_up_tokenization_spaces: Optional[bool] = None
+    """Whether to clean up the potential extra spaces in the text output."""
+    generate_parameters: Optional[Dict[str, Any]] = None
+    """Additional parametrization of the text generation algorithm."""
+    truncation: Optional["SummarizationTruncationStrategy"] = None
+    """The truncation strategy to use."""
+
+
+@dataclass_with_extra
+class SummarizationInput(BaseInferenceType):
+    """Inputs for Summarization inference"""
+
+    inputs: str
+    """The input text to summarize."""
+    parameters: Optional[SummarizationParameters] = None
+    """Additional inference parameters for summarization."""
+
+
+@dataclass_with_extra
+class SummarizationOutput(BaseInferenceType):
+    """Outputs of inference for the Summarization task"""
+
+    summary_text: str
+    """The summarized text."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/table_question_answering.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/table_question_answering.py
new file mode 100644
index 00000000..10e208ee
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/table_question_answering.py
@@ -0,0 +1,62 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Dict, List, Literal, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class TableQuestionAnsweringInputData(BaseInferenceType):
+    """One (table, question) pair to answer"""
+
+    question: str
+    """The question to be answered about the table"""
+    table: Dict[str, List[str]]
+    """The table to serve as context for the questions"""
+
+
+Padding = Literal["do_not_pad", "longest", "max_length"]
+
+
+@dataclass_with_extra
+class TableQuestionAnsweringParameters(BaseInferenceType):
+    """Additional inference parameters for Table Question Answering"""
+
+    padding: Optional["Padding"] = None
+    """Activates and controls padding."""
+    sequential: Optional[bool] = None
+    """Whether to do inference sequentially or as a batch. Batching is faster, but models like
+    SQA require the inference to be done sequentially to extract relations within sequences,
+    given their conversational nature.
+    """
+    truncation: Optional[bool] = None
+    """Activates and controls truncation."""
+
+
+@dataclass_with_extra
+class TableQuestionAnsweringInput(BaseInferenceType):
+    """Inputs for Table Question Answering inference"""
+
+    inputs: TableQuestionAnsweringInputData
+    """One (table, question) pair to answer"""
+    parameters: Optional[TableQuestionAnsweringParameters] = None
+    """Additional inference parameters for Table Question Answering"""
+
+
+@dataclass_with_extra
+class TableQuestionAnsweringOutputElement(BaseInferenceType):
+    """Outputs of inference for the Table Question Answering task"""
+
+    answer: str
+    """The answer of the question given the table. If there is an aggregator, the answer will be
+    preceded by `AGGREGATOR >`.
+    """
+    cells: List[str]
+    """List of strings made up of the answer cell values."""
+    coordinates: List[List[int]]
+    """Coordinates of the cells of the answers."""
+    aggregator: Optional[str] = None
+    """If the model has an aggregator, this returns the aggregator."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text2text_generation.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text2text_generation.py
new file mode 100644
index 00000000..34ac74e2
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text2text_generation.py
@@ -0,0 +1,42 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Dict, Literal, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+Text2TextGenerationTruncationStrategy = Literal["do_not_truncate", "longest_first", "only_first", "only_second"]
+
+
+@dataclass_with_extra
+class Text2TextGenerationParameters(BaseInferenceType):
+    """Additional inference parameters for Text2text Generation"""
+
+    clean_up_tokenization_spaces: Optional[bool] = None
+    """Whether to clean up the potential extra spaces in the text output."""
+    generate_parameters: Optional[Dict[str, Any]] = None
+    """Additional parametrization of the text generation algorithm"""
+    truncation: Optional["Text2TextGenerationTruncationStrategy"] = None
+    """The truncation strategy to use"""
+
+
+@dataclass_with_extra
+class Text2TextGenerationInput(BaseInferenceType):
+    """Inputs for Text2text Generation inference"""
+
+    inputs: str
+    """The input text data"""
+    parameters: Optional[Text2TextGenerationParameters] = None
+    """Additional inference parameters for Text2text Generation"""
+
+
+@dataclass_with_extra
+class Text2TextGenerationOutput(BaseInferenceType):
+    """Outputs of inference for the Text2text Generation task"""
+
+    generated_text: Any
+    text2_text_generation_output_generated_text: Optional[str] = None
+    """The generated text."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_classification.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_classification.py
new file mode 100644
index 00000000..9a172b23
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_classification.py
@@ -0,0 +1,41 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Literal, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+TextClassificationOutputTransform = Literal["sigmoid", "softmax", "none"]
+
+
+@dataclass_with_extra
+class TextClassificationParameters(BaseInferenceType):
+    """Additional inference parameters for Text Classification"""
+
+    function_to_apply: Optional["TextClassificationOutputTransform"] = None
+    """The function to apply to the model outputs in order to retrieve the scores."""
+    top_k: Optional[int] = None
+    """When specified, limits the output to the top K most probable classes."""
+
+
+@dataclass_with_extra
+class TextClassificationInput(BaseInferenceType):
+    """Inputs for Text Classification inference"""
+
+    inputs: str
+    """The text to classify"""
+    parameters: Optional[TextClassificationParameters] = None
+    """Additional inference parameters for Text Classification"""
+
+
+@dataclass_with_extra
+class TextClassificationOutputElement(BaseInferenceType):
+    """Outputs of inference for the Text Classification task"""
+
+    label: str
+    """The predicted class label."""
+    score: float
+    """The corresponding probability."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_generation.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_generation.py
new file mode 100644
index 00000000..7ad5c039
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_generation.py
@@ -0,0 +1,168 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, List, Literal, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+TypeEnum = Literal["json", "regex"]
+
+
+@dataclass_with_extra
+class TextGenerationInputGrammarType(BaseInferenceType):
+    type: "TypeEnum"
+    value: Any
+    """A string that represents a [JSON Schema](https://json-schema.org/).
+    JSON Schema is a declarative language for annotating JSON documents
+    with types and descriptions.
+    """
+
+
+@dataclass_with_extra
+class TextGenerationInputGenerateParameters(BaseInferenceType):
+    adapter_id: Optional[str] = None
+    """Lora adapter id"""
+    best_of: Optional[int] = None
+    """Generate best_of sequences and return the one if the highest token logprobs."""
+    decoder_input_details: Optional[bool] = None
+    """Whether to return decoder input token logprobs and ids."""
+    details: Optional[bool] = None
+    """Whether to return generation details."""
+    do_sample: Optional[bool] = None
+    """Activate logits sampling."""
+    frequency_penalty: Optional[float] = None
+    """The parameter for frequency penalty. 1.0 means no penalty
+    Penalize new tokens based on their existing frequency in the text so far,
+    decreasing the model's likelihood to repeat the same line verbatim.
+    """
+    grammar: Optional[TextGenerationInputGrammarType] = None
+    max_new_tokens: Optional[int] = None
+    """Maximum number of tokens to generate."""
+    repetition_penalty: Optional[float] = None
+    """The parameter for repetition penalty. 1.0 means no penalty.
+    See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+    """
+    return_full_text: Optional[bool] = None
+    """Whether to prepend the prompt to the generated text"""
+    seed: Optional[int] = None
+    """Random sampling seed."""
+    stop: Optional[List[str]] = None
+    """Stop generating tokens if a member of `stop` is generated."""
+    temperature: Optional[float] = None
+    """The value used to module the logits distribution."""
+    top_k: Optional[int] = None
+    """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
+    top_n_tokens: Optional[int] = None
+    """The number of highest probability vocabulary tokens to keep for top-n-filtering."""
+    top_p: Optional[float] = None
+    """Top-p value for nucleus sampling."""
+    truncate: Optional[int] = None
+    """Truncate inputs tokens to the given size."""
+    typical_p: Optional[float] = None
+    """Typical Decoding mass
+    See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666)
+    for more information.
+    """
+    watermark: Optional[bool] = None
+    """Watermarking with [A Watermark for Large Language
+    Models](https://arxiv.org/abs/2301.10226).
+    """
+
+
+@dataclass_with_extra
+class TextGenerationInput(BaseInferenceType):
+    """Text Generation Input.
+    Auto-generated from TGI specs.
+    For more details, check out
+    https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tgi-import.ts.
+    """
+
+    inputs: str
+    parameters: Optional[TextGenerationInputGenerateParameters] = None
+    stream: Optional[bool] = None
+
+
+TextGenerationOutputFinishReason = Literal["length", "eos_token", "stop_sequence"]
+
+
+@dataclass_with_extra
+class TextGenerationOutputPrefillToken(BaseInferenceType):
+    id: int
+    logprob: float
+    text: str
+
+
+@dataclass_with_extra
+class TextGenerationOutputToken(BaseInferenceType):
+    id: int
+    logprob: float
+    special: bool
+    text: str
+
+
+@dataclass_with_extra
+class TextGenerationOutputBestOfSequence(BaseInferenceType):
+    finish_reason: "TextGenerationOutputFinishReason"
+    generated_text: str
+    generated_tokens: int
+    prefill: List[TextGenerationOutputPrefillToken]
+    tokens: List[TextGenerationOutputToken]
+    seed: Optional[int] = None
+    top_tokens: Optional[List[List[TextGenerationOutputToken]]] = None
+
+
+@dataclass_with_extra
+class TextGenerationOutputDetails(BaseInferenceType):
+    finish_reason: "TextGenerationOutputFinishReason"
+    generated_tokens: int
+    prefill: List[TextGenerationOutputPrefillToken]
+    tokens: List[TextGenerationOutputToken]
+    best_of_sequences: Optional[List[TextGenerationOutputBestOfSequence]] = None
+    seed: Optional[int] = None
+    top_tokens: Optional[List[List[TextGenerationOutputToken]]] = None
+
+
+@dataclass_with_extra
+class TextGenerationOutput(BaseInferenceType):
+    """Text Generation Output.
+    Auto-generated from TGI specs.
+    For more details, check out
+    https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tgi-import.ts.
+    """
+
+    generated_text: str
+    details: Optional[TextGenerationOutputDetails] = None
+
+
+@dataclass_with_extra
+class TextGenerationStreamOutputStreamDetails(BaseInferenceType):
+    finish_reason: "TextGenerationOutputFinishReason"
+    generated_tokens: int
+    input_length: int
+    seed: Optional[int] = None
+
+
+@dataclass_with_extra
+class TextGenerationStreamOutputToken(BaseInferenceType):
+    id: int
+    logprob: float
+    special: bool
+    text: str
+
+
+@dataclass_with_extra
+class TextGenerationStreamOutput(BaseInferenceType):
+    """Text Generation Stream Output.
+    Auto-generated from TGI specs.
+    For more details, check out
+    https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tgi-import.ts.
+    """
+
+    index: int
+    token: TextGenerationStreamOutputToken
+    details: Optional[TextGenerationStreamOutputStreamDetails] = None
+    generated_text: Optional[str] = None
+    top_tokens: Optional[List[TextGenerationStreamOutputToken]] = None
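
The TGI-derived input supports constrained generation through `grammar`: when `type` is "json", `value` carries a JSON Schema. A sketch with an illustrative prompt and schema:

    from huggingface_hub.inference._generated.types.text_generation import (
        TextGenerationInput,
        TextGenerationInputGenerateParameters,
        TextGenerationInputGrammarType,
    )

    payload = TextGenerationInput(
        inputs="Return the capital of France as JSON.",
        parameters=TextGenerationInputGenerateParameters(
            max_new_tokens=64,
            temperature=0.2,
            grammar=TextGenerationInputGrammarType(
                type="json",
                value={"type": "object", "properties": {"capital": {"type": "string"}}},
            ),
        ),
        stream=False,
    )
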
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_audio.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_audio.py
new file mode 100644
index 00000000..963b4406
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_audio.py
@@ -0,0 +1,100 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Literal, Optional, Union
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+TextToAudioEarlyStoppingEnum = Literal["never"]
+
+
+@dataclass_with_extra
+class TextToAudioGenerationParameters(BaseInferenceType):
+    """Parametrization of the text generation process"""
+
+    do_sample: Optional[bool] = None
+    """Whether to use sampling instead of greedy decoding when generating new tokens."""
+    early_stopping: Optional[Union[bool, "TextToAudioEarlyStoppingEnum"]] = None
+    """Controls the stopping condition for beam-based methods."""
+    epsilon_cutoff: Optional[float] = None
+    """If set to float strictly between 0 and 1, only tokens with a conditional probability
+    greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+    3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+    Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+    """
+    eta_cutoff: Optional[float] = None
+    """Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+    float strictly between 0 and 1, a token is only considered if it is greater than either
+    eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+    term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+    the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+    See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+    for more details.
+    """
+    max_length: Optional[int] = None
+    """The maximum length (in tokens) of the generated text, including the input."""
+    max_new_tokens: Optional[int] = None
+    """The maximum number of tokens to generate. Takes precedence over max_length."""
+    min_length: Optional[int] = None
+    """The minimum length (in tokens) of the generated text, including the input."""
+    min_new_tokens: Optional[int] = None
+    """The minimum number of tokens to generate. Takes precedence over min_length."""
+    num_beam_groups: Optional[int] = None
+    """Number of groups to divide num_beams into in order to ensure diversity among different
+    groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+    """
+    num_beams: Optional[int] = None
+    """Number of beams to use for beam search."""
+    penalty_alpha: Optional[float] = None
+    """The value balances the model confidence and the degeneration penalty in contrastive
+    search decoding.
+    """
+    temperature: Optional[float] = None
+    """The value used to modulate the next token probabilities."""
+    top_k: Optional[int] = None
+    """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
+    top_p: Optional[float] = None
+    """If set to float < 1, only the smallest set of most probable tokens with probabilities
+    that add up to top_p or higher are kept for generation.
+    """
+    typical_p: Optional[float] = None
+    """Local typicality measures how similar the conditional probability of predicting a target
+    token next is to the expected conditional probability of predicting a random token next,
+    given the partial text already generated. If set to float < 1, the smallest set of the
+    most locally typical tokens with probabilities that add up to typical_p or higher are
+    kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+    """
+    use_cache: Optional[bool] = None
+    """Whether the model should use the past last key/values attentions to speed up decoding"""
+
+
+@dataclass_with_extra
+class TextToAudioParameters(BaseInferenceType):
+    """Additional inference parameters for Text To Audio"""
+
+    # Will be deprecated in the future when the renaming to `generation_parameters` is implemented in transformers
+    generate_kwargs: Optional[TextToAudioGenerationParameters] = None
+    """Parametrization of the text generation process"""
+
+
+@dataclass_with_extra
+class TextToAudioInput(BaseInferenceType):
+    """Inputs for Text To Audio inference"""
+
+    inputs: str
+    """The input text data"""
+    parameters: Optional[TextToAudioParameters] = None
+    """Additional inference parameters for Text To Audio"""
+
+
+@dataclass_with_extra
+class TextToAudioOutput(BaseInferenceType):
+    """Outputs of inference for the Text To Audio task"""
+
+    audio: Any
+    """The generated audio waveform."""
+    sampling_rate: float
+    """The sampling rate of the generated audio waveform."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_image.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_image.py
new file mode 100644
index 00000000..20c96373
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_image.py
@@ -0,0 +1,50 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class TextToImageParameters(BaseInferenceType):
+    """Additional inference parameters for Text To Image"""
+
+    guidance_scale: Optional[float] = None
+    """A higher guidance scale value encourages the model to generate images closely linked to
+    the text prompt, but values too high may cause saturation and other artifacts.
+    """
+    height: Optional[int] = None
+    """The height in pixels of the output image"""
+    negative_prompt: Optional[str] = None
+    """One prompt to guide what NOT to include in image generation."""
+    num_inference_steps: Optional[int] = None
+    """The number of denoising steps. More denoising steps usually lead to a higher quality
+    image at the expense of slower inference.
+    """
+    scheduler: Optional[str] = None
+    """Override the scheduler with a compatible one."""
+    seed: Optional[int] = None
+    """Seed for the random number generator."""
+    width: Optional[int] = None
+    """The width in pixels of the output image"""
+
+
+@dataclass_with_extra
+class TextToImageInput(BaseInferenceType):
+    """Inputs for Text To Image inference"""
+
+    inputs: str
+    """The input text data (sometimes called "prompt")"""
+    parameters: Optional[TextToImageParameters] = None
+    """Additional inference parameters for Text To Image"""
+
+
+@dataclass_with_extra
+class TextToImageOutput(BaseInferenceType):
+    """Outputs of inference for the Text To Image task"""
+
+    image: Any
+    """The generated image returned as raw bytes in the payload."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_speech.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_speech.py
new file mode 100644
index 00000000..4399e2ae
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_speech.py
@@ -0,0 +1,100 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Literal, Optional, Union
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+TextToSpeechEarlyStoppingEnum = Literal["never"]
+
+
+@dataclass_with_extra
+class TextToSpeechGenerationParameters(BaseInferenceType):
+    """Parametrization of the text generation process"""
+
+    do_sample: Optional[bool] = None
+    """Whether to use sampling instead of greedy decoding when generating new tokens."""
+    early_stopping: Optional[Union[bool, "TextToSpeechEarlyStoppingEnum"]] = None
+    """Controls the stopping condition for beam-based methods."""
+    epsilon_cutoff: Optional[float] = None
+    """If set to float strictly between 0 and 1, only tokens with a conditional probability
+    greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+    3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+    Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+    """
+    eta_cutoff: Optional[float] = None
+    """Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+    float strictly between 0 and 1, a token is only considered if it is greater than either
+    eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+    term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+    the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+    See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+    for more details.
+    """
+    max_length: Optional[int] = None
+    """The maximum length (in tokens) of the generated text, including the input."""
+    max_new_tokens: Optional[int] = None
+    """The maximum number of tokens to generate. Takes precedence over max_length."""
+    min_length: Optional[int] = None
+    """The minimum length (in tokens) of the generated text, including the input."""
+    min_new_tokens: Optional[int] = None
+    """The minimum number of tokens to generate. Takes precedence over min_length."""
+    num_beam_groups: Optional[int] = None
+    """Number of groups to divide num_beams into in order to ensure diversity among different
+    groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+    """
+    num_beams: Optional[int] = None
+    """Number of beams to use for beam search."""
+    penalty_alpha: Optional[float] = None
+    """The value balances the model confidence and the degeneration penalty in contrastive
+    search decoding.
+    """
+    temperature: Optional[float] = None
+    """The value used to modulate the next token probabilities."""
+    top_k: Optional[int] = None
+    """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
+    top_p: Optional[float] = None
+    """If set to float < 1, only the smallest set of most probable tokens with probabilities
+    that add up to top_p or higher are kept for generation.
+    """
+    typical_p: Optional[float] = None
+    """Local typicality measures how similar the conditional probability of predicting a target
+    token next is to the expected conditional probability of predicting a random token next,
+    given the partial text already generated. If set to float < 1, the smallest set of the
+    most locally typical tokens with probabilities that add up to typical_p or higher are
+    kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+    """
+    use_cache: Optional[bool] = None
+    """Whether the model should use the past last key/values attentions to speed up decoding"""
+
+
+@dataclass_with_extra
+class TextToSpeechParameters(BaseInferenceType):
+    """Additional inference parameters for Text To Speech"""
+
+    # Will be deprecated in the future when the renaming to `generation_parameters` is implemented in transformers
+    generate_kwargs: Optional[TextToSpeechGenerationParameters] = None
+    """Parametrization of the text generation process"""
+
+
+@dataclass_with_extra
+class TextToSpeechInput(BaseInferenceType):
+    """Inputs for Text To Speech inference"""
+
+    inputs: str
+    """The input text data"""
+    parameters: Optional[TextToSpeechParameters] = None
+    """Additional inference parameters for Text To Speech"""
+
+
+@dataclass_with_extra
+class TextToSpeechOutput(BaseInferenceType):
+    """Outputs of inference for the Text To Speech task"""
+
+    audio: Any
+    """The generated audio"""
+    sampling_rate: Optional[float] = None
+    """The sampling rate of the generated audio waveform."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_video.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_video.py
new file mode 100644
index 00000000..e54a1bc0
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/text_to_video.py
@@ -0,0 +1,46 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, List, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class TextToVideoParameters(BaseInferenceType):
+    """Additional inference parameters for Text To Video"""
+
+    guidance_scale: Optional[float] = None
+    """A higher guidance scale value encourages the model to generate videos closely linked to
+    the text prompt, but values too high may cause saturation and other artifacts.
+    """
+    negative_prompt: Optional[List[str]] = None
+    """One or several prompt to guide what NOT to include in video generation."""
+    num_frames: Optional[float] = None
+    """The num_frames parameter determines how many video frames are generated."""
+    num_inference_steps: Optional[int] = None
+    """The number of denoising steps. More denoising steps usually lead to a higher quality
+    video at the expense of slower inference.
+    """
+    seed: Optional[int] = None
+    """Seed for the random number generator."""
+
+
+@dataclass_with_extra
+class TextToVideoInput(BaseInferenceType):
+    """Inputs for Text To Video inference"""
+
+    inputs: str
+    """The input text data (sometimes called "prompt")"""
+    parameters: Optional[TextToVideoParameters] = None
+    """Additional inference parameters for Text To Video"""
+
+
+@dataclass_with_extra
+class TextToVideoOutput(BaseInferenceType):
+    """Outputs of inference for the Text To Video task"""
+
+    video: Any
+    """The generated video returned as raw bytes in the payload."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/token_classification.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/token_classification.py
new file mode 100644
index 00000000..e039b6a1
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/token_classification.py
@@ -0,0 +1,51 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import List, Literal, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+TokenClassificationAggregationStrategy = Literal["none", "simple", "first", "average", "max"]
+
+
+@dataclass_with_extra
+class TokenClassificationParameters(BaseInferenceType):
+    """Additional inference parameters for Token Classification"""
+
+    aggregation_strategy: Optional["TokenClassificationAggregationStrategy"] = None
+    """The strategy used to fuse tokens based on model predictions"""
+    ignore_labels: Optional[List[str]] = None
+    """A list of labels to ignore"""
+    stride: Optional[int] = None
+    """The number of overlapping tokens between chunks when splitting the input text."""
+
+
+@dataclass_with_extra
+class TokenClassificationInput(BaseInferenceType):
+    """Inputs for Token Classification inference"""
+
+    inputs: str
+    """The input text data"""
+    parameters: Optional[TokenClassificationParameters] = None
+    """Additional inference parameters for Token Classification"""
+
+
+@dataclass_with_extra
+class TokenClassificationOutputElement(BaseInferenceType):
+    """Outputs of inference for the Token Classification task"""
+
+    end: int
+    """The character position in the input where this group ends."""
+    score: float
+    """The associated score / probability"""
+    start: int
+    """The character position in the input where this group begins."""
+    word: str
+    """The corresponding text"""
+    entity: Optional[str] = None
+    """The predicted label for a single token"""
+    entity_group: Optional[str] = None
+    """The predicted label for a group of one or more tokens"""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/translation.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/translation.py
new file mode 100644
index 00000000..df95b7db
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/translation.py
@@ -0,0 +1,49 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Dict, Literal, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+TranslationTruncationStrategy = Literal["do_not_truncate", "longest_first", "only_first", "only_second"]
+
+
+@dataclass_with_extra
+class TranslationParameters(BaseInferenceType):
+    """Additional inference parameters for Translation"""
+
+    clean_up_tokenization_spaces: Optional[bool] = None
+    """Whether to clean up the potential extra spaces in the text output."""
+    generate_parameters: Optional[Dict[str, Any]] = None
+    """Additional parametrization of the text generation algorithm."""
+    src_lang: Optional[str] = None
+    """The source language of the text. Required for models that can translate from multiple
+    languages.
+    """
+    tgt_lang: Optional[str] = None
+    """Target language to translate to. Required for models that can translate to multiple
+    languages.
+    """
+    truncation: Optional["TranslationTruncationStrategy"] = None
+    """The truncation strategy to use."""
+
+
+@dataclass_with_extra
+class TranslationInput(BaseInferenceType):
+    """Inputs for Translation inference"""
+
+    inputs: str
+    """The text to translate."""
+    parameters: Optional[TranslationParameters] = None
+    """Additional inference parameters for Translation"""
+
+
+@dataclass_with_extra
+class TranslationOutput(BaseInferenceType):
+    """Outputs of inference for the Translation task"""
+
+    translation_text: str
+    """The translated text."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/video_classification.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/video_classification.py
new file mode 100644
index 00000000..e1d7a15b
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/video_classification.py
@@ -0,0 +1,45 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Literal, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+VideoClassificationOutputTransform = Literal["sigmoid", "softmax", "none"]
+
+
+@dataclass_with_extra
+class VideoClassificationParameters(BaseInferenceType):
+    """Additional inference parameters for Video Classification"""
+
+    frame_sampling_rate: Optional[int] = None
+    """The sampling rate used to select frames from the video."""
+    function_to_apply: Optional["VideoClassificationOutputTransform"] = None
+    """The function to apply to the model outputs in order to retrieve the scores."""
+    num_frames: Optional[int] = None
+    """The number of sampled frames to consider for classification."""
+    top_k: Optional[int] = None
+    """When specified, limits the output to the top K most probable classes."""
+
+
+@dataclass_with_extra
+class VideoClassificationInput(BaseInferenceType):
+    """Inputs for Video Classification inference"""
+
+    inputs: Any
+    """The input video data"""
+    parameters: Optional[VideoClassificationParameters] = None
+    """Additional inference parameters for Video Classification"""
+
+
+@dataclass_with_extra
+class VideoClassificationOutputElement(BaseInferenceType):
+    """Outputs of inference for the Video Classification task"""
+
+    label: str
+    """The predicted class label."""
+    score: float
+    """The corresponding probability."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/visual_question_answering.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/visual_question_answering.py
new file mode 100644
index 00000000..d368f162
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/visual_question_answering.py
@@ -0,0 +1,49 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class VisualQuestionAnsweringInputData(BaseInferenceType):
+    """One (image, question) pair to answer"""
+
+    image: Any
+    """The image."""
+    question: str
+    """The question to answer based on the image."""
+
+
+@dataclass_with_extra
+class VisualQuestionAnsweringParameters(BaseInferenceType):
+    """Additional inference parameters for Visual Question Answering"""
+
+    top_k: Optional[int] = None
+    """The number of answers to return (will be chosen by order of likelihood). Note that we
+    return less than topk answers if there are not enough options available within the
+    context.
+    """
+
+
+@dataclass_with_extra
+class VisualQuestionAnsweringInput(BaseInferenceType):
+    """Inputs for Visual Question Answering inference"""
+
+    inputs: VisualQuestionAnsweringInputData
+    """One (image, question) pair to answer"""
+    parameters: Optional[VisualQuestionAnsweringParameters] = None
+    """Additional inference parameters for Visual Question Answering"""
+
+
+@dataclass_with_extra
+class VisualQuestionAnsweringOutputElement(BaseInferenceType):
+    """Outputs of inference for the Visual Question Answering task"""
+
+    score: float
+    """The associated score / probability"""
+    answer: Optional[str] = None
+    """The answer to the question"""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_classification.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_classification.py
new file mode 100644
index 00000000..47b32492
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_classification.py
@@ -0,0 +1,45 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import List, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class ZeroShotClassificationParameters(BaseInferenceType):
+    """Additional inference parameters for Zero Shot Classification"""
+
+    candidate_labels: List[str]
+    """The set of possible class labels to classify the text into."""
+    hypothesis_template: Optional[str] = None
+    """The sentence used in conjunction with `candidate_labels` to attempt the text
+    classification by replacing the placeholder with the candidate labels.
+    """
+    multi_label: Optional[bool] = None
+    """Whether multiple candidate labels can be true. If false, the scores are normalized such
+    that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+    considered independent and probabilities are normalized for each candidate.
+    """
+
+
+@dataclass_with_extra
+class ZeroShotClassificationInput(BaseInferenceType):
+    """Inputs for Zero Shot Classification inference"""
+
+    inputs: str
+    """The text to classify"""
+    parameters: ZeroShotClassificationParameters
+    """Additional inference parameters for Zero Shot Classification"""
+
+
+@dataclass_with_extra
+class ZeroShotClassificationOutputElement(BaseInferenceType):
+    """Outputs of inference for the Zero Shot Classification task"""
+
+    label: str
+    """The predicted class label."""
+    score: float
+    """The corresponding probability."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_image_classification.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_image_classification.py
new file mode 100644
index 00000000..998d66b6
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_image_classification.py
@@ -0,0 +1,40 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import List, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class ZeroShotImageClassificationParameters(BaseInferenceType):
+    """Additional inference parameters for Zero Shot Image Classification"""
+
+    candidate_labels: List[str]
+    """The candidate labels for this image"""
+    hypothesis_template: Optional[str] = None
+    """The sentence used in conjunction with `candidate_labels` to attempt the image
+    classification by replacing the placeholder with the candidate labels.
+    """
+
+
+@dataclass_with_extra
+class ZeroShotImageClassificationInput(BaseInferenceType):
+    """Inputs for Zero Shot Image Classification inference"""
+
+    inputs: str
+    """The input image data to classify as a base64-encoded string."""
+    parameters: ZeroShotImageClassificationParameters
+    """Additional inference parameters for Zero Shot Image Classification"""
+
+
+@dataclass_with_extra
+class ZeroShotImageClassificationOutputElement(BaseInferenceType):
+    """Outputs of inference for the Zero Shot Image Classification task"""
+
+    label: str
+    """The predicted class label."""
+    score: float
+    """The corresponding probability."""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py
new file mode 100644
index 00000000..8ef76b5f
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py
@@ -0,0 +1,52 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import List
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class ZeroShotObjectDetectionParameters(BaseInferenceType):
+    """Additional inference parameters for Zero Shot Object Detection"""
+
+    candidate_labels: List[str]
+    """The candidate labels for this image"""
+
+
+@dataclass_with_extra
+class ZeroShotObjectDetectionInput(BaseInferenceType):
+    """Inputs for Zero Shot Object Detection inference"""
+
+    inputs: str
+    """The input image data as a base64-encoded string."""
+    parameters: ZeroShotObjectDetectionParameters
+    """Additional inference parameters for Zero Shot Object Detection"""
+
+
+@dataclass_with_extra
+class ZeroShotObjectDetectionBoundingBox(BaseInferenceType):
+    """The predicted bounding box. Coordinates are relative to the top left corner of the input
+    image.
+    """
+
+    xmax: int
+    xmin: int
+    ymax: int
+    ymin: int
+
+
+@dataclass_with_extra
+class ZeroShotObjectDetectionOutputElement(BaseInferenceType):
+    """Outputs of inference for the Zero Shot Object Detection task"""
+
+    box: ZeroShotObjectDetectionBoundingBox
+    """The predicted bounding box. Coordinates are relative to the top left corner of the input
+    image.
+    """
+    label: str
+    """A candidate label"""
+    score: float
+    """The associated score / probability"""
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/__init__.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/__init__.py
new file mode 100644
index 00000000..34003125
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/__init__.py
@@ -0,0 +1,135 @@
+from typing import Dict, Literal
+
+from ._common import TaskProviderHelper
+from .black_forest_labs import BlackForestLabsTextToImageTask
+from .cerebras import CerebrasConversationalTask
+from .cohere import CohereConversationalTask
+from .fal_ai import (
+    FalAIAutomaticSpeechRecognitionTask,
+    FalAITextToImageTask,
+    FalAITextToSpeechTask,
+    FalAITextToVideoTask,
+)
+from .fireworks_ai import FireworksAIConversationalTask
+from .hf_inference import HFInferenceBinaryInputTask, HFInferenceConversational, HFInferenceTask
+from .hyperbolic import HyperbolicTextGenerationTask, HyperbolicTextToImageTask
+from .nebius import NebiusConversationalTask, NebiusTextGenerationTask, NebiusTextToImageTask
+from .novita import NovitaConversationalTask, NovitaTextGenerationTask
+from .replicate import ReplicateTask, ReplicateTextToSpeechTask
+from .sambanova import SambanovaConversationalTask
+from .together import TogetherConversationalTask, TogetherTextGenerationTask, TogetherTextToImageTask
+
+
+PROVIDER_T = Literal[
+    "black-forest-labs",
+    "cerebras",
+    "cohere",
+    "fal-ai",
+    "fireworks-ai",
+    "hf-inference",
+    "hyperbolic",
+    "nebius",
+    "novita",
+    "replicate",
+    "sambanova",
+    "together",
+]
+
+PROVIDERS: Dict[PROVIDER_T, Dict[str, TaskProviderHelper]] = {
+    "black-forest-labs": {
+        "text-to-image": BlackForestLabsTextToImageTask(),
+    },
+    "cerebras": {
+        "conversational": CerebrasConversationalTask(),
+    },
+    "cohere": {
+        "conversational": CohereConversationalTask(),
+    },
+    "fal-ai": {
+        "automatic-speech-recognition": FalAIAutomaticSpeechRecognitionTask(),
+        "text-to-image": FalAITextToImageTask(),
+        "text-to-speech": FalAITextToSpeechTask(),
+        "text-to-video": FalAITextToVideoTask(),
+    },
+    "fireworks-ai": {
+        "conversational": FireworksAIConversationalTask(),
+    },
+    "hf-inference": {
+        "text-to-image": HFInferenceTask("text-to-image"),
+        "conversational": HFInferenceConversational(),
+        "text-generation": HFInferenceTask("text-generation"),
+        "text-classification": HFInferenceTask("text-classification"),
+        "question-answering": HFInferenceTask("question-answering"),
+        "audio-classification": HFInferenceBinaryInputTask("audio-classification"),
+        "automatic-speech-recognition": HFInferenceBinaryInputTask("automatic-speech-recognition"),
+        "fill-mask": HFInferenceTask("fill-mask"),
+        "feature-extraction": HFInferenceTask("feature-extraction"),
+        "image-classification": HFInferenceBinaryInputTask("image-classification"),
+        "image-segmentation": HFInferenceBinaryInputTask("image-segmentation"),
+        "document-question-answering": HFInferenceTask("document-question-answering"),
+        "image-to-text": HFInferenceBinaryInputTask("image-to-text"),
+        "object-detection": HFInferenceBinaryInputTask("object-detection"),
+        "audio-to-audio": HFInferenceBinaryInputTask("audio-to-audio"),
+        "zero-shot-image-classification": HFInferenceBinaryInputTask("zero-shot-image-classification"),
+        "zero-shot-classification": HFInferenceTask("zero-shot-classification"),
+        "image-to-image": HFInferenceBinaryInputTask("image-to-image"),
+        "sentence-similarity": HFInferenceTask("sentence-similarity"),
+        "table-question-answering": HFInferenceTask("table-question-answering"),
+        "tabular-classification": HFInferenceTask("tabular-classification"),
+        "text-to-speech": HFInferenceTask("text-to-speech"),
+        "token-classification": HFInferenceTask("token-classification"),
+        "translation": HFInferenceTask("translation"),
+        "summarization": HFInferenceTask("summarization"),
+        "visual-question-answering": HFInferenceBinaryInputTask("visual-question-answering"),
+    },
+    "hyperbolic": {
+        "text-to-image": HyperbolicTextToImageTask(),
+        "conversational": HyperbolicTextGenerationTask("conversational"),
+        "text-generation": HyperbolicTextGenerationTask("text-generation"),
+    },
+    "nebius": {
+        "text-to-image": NebiusTextToImageTask(),
+        "conversational": NebiusConversationalTask(),
+        "text-generation": NebiusTextGenerationTask(),
+    },
+    "novita": {
+        "text-generation": NovitaTextGenerationTask(),
+        "conversational": NovitaConversationalTask(),
+    },
+    "replicate": {
+        "text-to-image": ReplicateTask("text-to-image"),
+        "text-to-speech": ReplicateTextToSpeechTask(),
+        "text-to-video": ReplicateTask("text-to-video"),
+    },
+    "sambanova": {
+        "conversational": SambanovaConversationalTask(),
+    },
+    "together": {
+        "text-to-image": TogetherTextToImageTask(),
+        "conversational": TogetherConversationalTask(),
+        "text-generation": TogetherTextGenerationTask(),
+    },
+}
+
+
+def get_provider_helper(provider: PROVIDER_T, task: str) -> TaskProviderHelper:
+    """Get provider helper instance by name and task.
+
+    Args:
+        provider (str): Name of the provider
+        task (str): Name of the task
+
+    Returns:
+        TaskProviderHelper: Helper instance for the specified provider and task
+
+    Raises:
+        ValueError: If provider or task is not supported
+    """
+    if provider not in PROVIDERS:
+        raise ValueError(f"Provider '{provider}' not supported. Available providers: {list(PROVIDERS.keys())}")
+    if task not in PROVIDERS[provider]:
+        raise ValueError(
+            f"Task '{task}' not supported for provider '{provider}'. "
+            f"Available tasks: {list(PROVIDERS[provider].keys())}"
+        )
+    return PROVIDERS[provider][task]
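
A usage sketch of the registry above; the provider/task pair used here is one of the combinations declared in PROVIDERS.

# Illustrative only: resolving a (provider, task) pair to its helper instance.
from huggingface_hub.inference._providers import get_provider_helper

helper = get_provider_helper("together", "conversational")
print(type(helper).__name__)  # TogetherConversationalTask
print(helper.task)            # conversational

# Unsupported combinations raise a ValueError listing the available options.
try:
    get_provider_helper("together", "text-to-video")
except ValueError as err:
    print(err)
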
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/_common.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/_common.py
new file mode 100644
index 00000000..a30b5cf3
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/_common.py
@@ -0,0 +1,241 @@
+from functools import lru_cache
+from typing import Any, Dict, Optional, Union
+
+from huggingface_hub import constants
+from huggingface_hub.inference._common import RequestParameters
+from huggingface_hub.utils import build_hf_headers, get_token, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+# Dev purposes only.
+# If you want to try to run inference for a new model locally before it's registered on huggingface.co
+# for a given Inference Provider, you can add it to the following dictionary.
+HARDCODED_MODEL_ID_MAPPING: Dict[str, Dict[str, str]] = {
+    # "HF model ID" => "Model ID on Inference Provider's side"
+    #
+    # Example:
+    # "Qwen/Qwen2.5-Coder-32B-Instruct": "Qwen2.5-Coder-32B-Instruct",
+    "cerebras": {},
+    "cohere": {},
+    "fal-ai": {},
+    "fireworks-ai": {},
+    "hf-inference": {},
+    "hyperbolic": {},
+    "nebius": {},
+    "replicate": {},
+    "sambanova": {},
+    "together": {},
+}
+
+
+def filter_none(d: Dict[str, Any]) -> Dict[str, Any]:
+    return {k: v for k, v in d.items() if v is not None}
+
+
+class TaskProviderHelper:
+    """Base class for task-specific provider helpers."""
+
+    def __init__(self, provider: str, base_url: str, task: str) -> None:
+        self.provider = provider
+        self.task = task
+        self.base_url = base_url
+
+    def prepare_request(
+        self,
+        *,
+        inputs: Any,
+        parameters: Dict[str, Any],
+        headers: Dict,
+        model: Optional[str],
+        api_key: Optional[str],
+        extra_payload: Optional[Dict[str, Any]] = None,
+    ) -> RequestParameters:
+        """
+        Prepare the request to be sent to the provider.
+
+        Each step (api_key, model, headers, url, payload) can be customized in subclasses.
+        """
+        # api_key from user, or local token, or raise error
+        api_key = self._prepare_api_key(api_key)
+
+        # mapped model from HF model ID
+        mapped_model = self._prepare_mapped_model(model)
+
+        # default HF headers + user headers (to customize in subclasses)
+        headers = self._prepare_headers(headers, api_key)
+
+        # routed URL if HF token, or direct URL (to customize in '_prepare_route' in subclasses)
+        url = self._prepare_url(api_key, mapped_model)
+
+        # prepare payload (to customize in subclasses)
+        payload = self._prepare_payload_as_dict(inputs, parameters, mapped_model=mapped_model)
+        if payload is not None:
+            payload = recursive_merge(payload, extra_payload or {})
+
+        # body data (to customize in subclasses)
+        data = self._prepare_payload_as_bytes(inputs, parameters, mapped_model, extra_payload)
+
+        # check if both payload and data are set and return
+        if payload is not None and data is not None:
+            raise ValueError("Both payload and data cannot be set in the same request.")
+        if payload is None and data is None:
+            raise ValueError("Either payload or data must be set in the request.")
+        return RequestParameters(url=url, task=self.task, model=mapped_model, json=payload, data=data, headers=headers)
+
+    def get_response(self, response: Union[bytes, Dict]) -> Any:
+        """
+        Return the response in the expected format.
+
+        Override this method in subclasses for customized response handling."""
+        return response
+
+    def _prepare_api_key(self, api_key: Optional[str]) -> str:
+        """Return the API key to use for the request.
+
+        Usually not overridden in subclasses."""
+        if api_key is None:
+            api_key = get_token()
+        if api_key is None:
+            raise ValueError(
+                f"You must provide an api_key to work with {self.provider} API or log in with `huggingface-cli login`."
+            )
+        return api_key
+
+    def _prepare_mapped_model(self, model: Optional[str]) -> str:
+        """Return the mapped model ID to use for the request.
+
+        Usually not overridden in subclasses."""
+        if model is None:
+            raise ValueError(f"Please provide an HF model ID supported by {self.provider}.")
+
+        # hardcoded mapping for local testing
+        if HARDCODED_MODEL_ID_MAPPING.get(self.provider, {}).get(model):
+            return HARDCODED_MODEL_ID_MAPPING[self.provider][model]
+
+        provider_mapping = _fetch_inference_provider_mapping(model).get(self.provider)
+        if provider_mapping is None:
+            raise ValueError(f"Model {model} is not supported by provider {self.provider}.")
+
+        if provider_mapping.task != self.task:
+            raise ValueError(
+                f"Model {model} is not supported for task {self.task} and provider {self.provider}. "
+                f"Supported task: {provider_mapping.task}."
+            )
+
+        if provider_mapping.status == "staging":
+            logger.warning(
+                f"Model {model} is in staging mode for provider {self.provider}. Meant for test purposes only."
+            )
+        return provider_mapping.provider_id
+
+    def _prepare_headers(self, headers: Dict, api_key: str) -> Dict:
+        """Return the headers to use for the request.
+
+        Override this method in subclasses for customized headers.
+        """
+        return {**build_hf_headers(token=api_key), **headers}
+
+    def _prepare_url(self, api_key: str, mapped_model: str) -> str:
+        """Return the URL to use for the request.
+
+        Usually not overridden in subclasses."""
+        base_url = self._prepare_base_url(api_key)
+        route = self._prepare_route(mapped_model)
+        return f"{base_url.rstrip('/')}/{route.lstrip('/')}"
+
+    def _prepare_base_url(self, api_key: str) -> str:
+        """Return the base URL to use for the request.
+
+        Usually not overridden in subclasses."""
+        # Route to the proxy if the api_key is a HF TOKEN
+        if api_key.startswith("hf_"):
+            logger.info(f"Calling '{self.provider}' provider through Hugging Face router.")
+            return constants.INFERENCE_PROXY_TEMPLATE.format(provider=self.provider)
+        else:
+            logger.info(f"Calling '{self.provider}' provider directly.")
+            return self.base_url
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        """Return the route to use for the request.
+
+        Override this method in subclasses for customized routes.
+        """
+        return ""
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        """Return the payload to use for the request, as a dict.
+
+        Override this method in subclasses for customized payloads.
+        Only one of `_prepare_payload_as_dict` and `_prepare_payload_as_bytes` should return a value.
+        """
+        return None
+
+    def _prepare_payload_as_bytes(
+        self, inputs: Any, parameters: Dict, mapped_model: str, extra_payload: Optional[Dict]
+    ) -> Optional[bytes]:
+        """Return the body to use for the request, as bytes.
+
+        Override this method in subclasses for customized body data.
+        Only one of `_prepare_payload_as_dict` and `_prepare_payload_as_bytes` should return a value.
+        """
+        return None
+
+
+class BaseConversationalTask(TaskProviderHelper):
+    """
+    Base class for conversational (chat completion) tasks.
+    The schema follows the OpenAI API format defined here: https://platform.openai.com/docs/api-reference/chat
+    """
+
+    def __init__(self, provider: str, base_url: str):
+        super().__init__(provider=provider, base_url=base_url, task="conversational")
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        return "/v1/chat/completions"
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        return {"messages": inputs, **filter_none(parameters), "model": mapped_model}
+
+
+class BaseTextGenerationTask(TaskProviderHelper):
+    """
+    Base class for text-generation (completion) tasks.
+    The schema follows the OpenAI API format defined here: https://platform.openai.com/docs/api-reference/completions
+    """
+
+    def __init__(self, provider: str, base_url: str):
+        super().__init__(provider=provider, base_url=base_url, task="text-generation")
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        return "/v1/completions"
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        return {"prompt": inputs, **filter_none(parameters), "model": mapped_model}
+
+
+@lru_cache(maxsize=None)
+def _fetch_inference_provider_mapping(model: str) -> Dict:
+    """
+    Fetch provider mappings for a model from the Hub.
+    """
+    from huggingface_hub.hf_api import HfApi
+
+    info = HfApi().model_info(model, expand=["inferenceProviderMapping"])
+    provider_mapping = info.inference_provider_mapping
+    if provider_mapping is None:
+        raise ValueError(f"No provider mapping found for model {model}")
+    return provider_mapping
+
+
+def recursive_merge(dict1: Dict, dict2: Dict) -> Dict:
+    return {
+        **dict1,
+        **{
+            key: recursive_merge(dict1[key], value)
+            if (key in dict1 and isinstance(dict1[key], dict) and isinstance(value, dict))
+            else value
+            for key, value in dict2.items()
+        },
+    }
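
A hypothetical subclass sketching the extension points that prepare_request() calls into; the provider name and base URL are invented for illustration and do not correspond to a registered provider.

# Hypothetical example: a custom TaskProviderHelper overriding the usual hooks.
# "example-provider" and its base URL are made up for illustration only.
from typing import Any, Dict, Optional

from huggingface_hub.inference._providers._common import TaskProviderHelper, filter_none


class ExampleTextToImageTask(TaskProviderHelper):
    def __init__(self) -> None:
        super().__init__(provider="example-provider", base_url="https://api.example.invalid", task="text-to-image")

    def _prepare_route(self, mapped_model: str) -> str:
        # Fixed route; the provider-side model ID is sent in the JSON payload instead.
        return "/v1/images/generations"

    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
        # Drop None-valued parameters and attach the mapped model ID.
        return {"prompt": inputs, **filter_none(parameters), "model": mapped_model}
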
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/black_forest_labs.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/black_forest_labs.py
new file mode 100644
index 00000000..14d8eb3d
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/black_forest_labs.py
@@ -0,0 +1,66 @@
+import time
+from typing import Any, Dict, Optional, Union
+
+from huggingface_hub.inference._common import _as_dict
+from huggingface_hub.inference._providers._common import TaskProviderHelper, filter_none
+from huggingface_hub.utils import logging
+from huggingface_hub.utils._http import get_session
+
+
+logger = logging.get_logger(__name__)
+
+MAX_POLLING_ATTEMPTS = 6
+POLLING_INTERVAL = 1.0
+
+
+class BlackForestLabsTextToImageTask(TaskProviderHelper):
+    def __init__(self):
+        super().__init__(provider="black-forest-labs", base_url="https://api.us1.bfl.ai/v1", task="text-to-image")
+
+    def _prepare_headers(self, headers: Dict, api_key: str) -> Dict:
+        headers = super()._prepare_headers(headers, api_key)
+        if not api_key.startswith("hf_"):
+            _ = headers.pop("authorization")
+            headers["X-Key"] = api_key
+        return headers
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        return mapped_model
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        parameters = filter_none(parameters)
+        if "num_inference_steps" in parameters:
+            parameters["steps"] = parameters.pop("num_inference_steps")
+        if "guidance_scale" in parameters:
+            parameters["guidance"] = parameters.pop("guidance_scale")
+
+        return {"prompt": inputs, **parameters}
+
+    def get_response(self, response: Union[bytes, Dict]) -> Any:
+        """
+        Polling mechanism for Black Forest Labs since the API is asynchronous.
+        """
+        url = _as_dict(response).get("polling_url")
+        session = get_session()
+        for _ in range(MAX_POLLING_ATTEMPTS):
+            time.sleep(POLLING_INTERVAL)
+
+            response = session.get(url, headers={"Content-Type": "application/json"})  # type: ignore
+            response.raise_for_status()  # type: ignore
+            response_json: Dict = response.json()  # type: ignore
+            status = response_json.get("status")
+            logger.info(
+                f"Polling generation result from {url}. Current status: {status}. "
+                f"Will retry after {POLLING_INTERVAL} seconds if not ready."
+            )
+
+            if (
+                status == "Ready"
+                and isinstance(response_json.get("result"), dict)
+                and (sample_url := response_json["result"].get("sample"))
+            ):
+                image_resp = session.get(sample_url)
+                image_resp.raise_for_status()
+                return image_resp.content
+
+        raise TimeoutError(f"Failed to get the image URL after {MAX_POLLING_ATTEMPTS} attempts.")
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/cerebras.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/cerebras.py
new file mode 100644
index 00000000..12b18158
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/cerebras.py
@@ -0,0 +1,6 @@
+from huggingface_hub.inference._providers._common import BaseConversationalTask
+
+
+class CerebrasConversationalTask(BaseConversationalTask):
+    def __init__(self):
+        super().__init__(provider="cerebras", base_url="https://api.cerebras.ai")
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/cohere.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/cohere.py
new file mode 100644
index 00000000..2fbe6fbf
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/cohere.py
@@ -0,0 +1,15 @@
+from huggingface_hub.inference._providers._common import (
+    BaseConversationalTask,
+)
+
+
+_PROVIDER = "cohere"
+_BASE_URL = "https://api.cohere.com"
+
+
+class CohereConversationalTask(BaseConversationalTask):
+    def __init__(self):
+        super().__init__(provider=_PROVIDER, base_url=_BASE_URL)
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        return "/compatibility/v1/chat/completions"
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/fal_ai.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/fal_ai.py
new file mode 100644
index 00000000..e6d2e4bd
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/fal_ai.py
@@ -0,0 +1,90 @@
+import base64
+from abc import ABC
+from typing import Any, Dict, Optional, Union
+
+from huggingface_hub.inference._common import _as_dict
+from huggingface_hub.inference._providers._common import TaskProviderHelper, filter_none
+from huggingface_hub.utils import get_session
+
+
+class FalAITask(TaskProviderHelper, ABC):
+    def __init__(self, task: str):
+        super().__init__(provider="fal-ai", base_url="https://fal.run", task=task)
+
+    def _prepare_headers(self, headers: Dict, api_key: str) -> Dict:
+        headers = super()._prepare_headers(headers, api_key)
+        if not api_key.startswith("hf_"):
+            headers["authorization"] = f"Key {api_key}"
+        return headers
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        return f"/{mapped_model}"
+
+
+class FalAIAutomaticSpeechRecognitionTask(FalAITask):
+    def __init__(self):
+        super().__init__("automatic-speech-recognition")
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        if isinstance(inputs, str) and inputs.startswith(("http://", "https://")):
+            # If input is a URL, pass it directly
+            audio_url = inputs
+        else:
+            # If input is a file path, read it first
+            if isinstance(inputs, str):
+                with open(inputs, "rb") as f:
+                    inputs = f.read()
+
+            audio_b64 = base64.b64encode(inputs).decode()
+            content_type = "audio/mpeg"
+            audio_url = f"data:{content_type};base64,{audio_b64}"
+
+        return {"audio_url": audio_url, **filter_none(parameters)}
+
+    def get_response(self, response: Union[bytes, Dict]) -> Any:
+        text = _as_dict(response)["text"]
+        if not isinstance(text, str):
+            raise ValueError(f"Unexpected output format from FalAI API. Expected string, got {type(text)}.")
+        return text
+
+
+class FalAITextToImageTask(FalAITask):
+    def __init__(self):
+        super().__init__("text-to-image")
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        parameters = filter_none(parameters)
+        if "width" in parameters and "height" in parameters:
+            parameters["image_size"] = {
+                "width": parameters.pop("width"),
+                "height": parameters.pop("height"),
+            }
+        return {"prompt": inputs, **parameters}
+
+    def get_response(self, response: Union[bytes, Dict]) -> Any:
+        url = _as_dict(response)["images"][0]["url"]
+        return get_session().get(url).content
+
+
+class FalAITextToSpeechTask(FalAITask):
+    def __init__(self):
+        super().__init__("text-to-speech")
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        return {"lyrics": inputs, **filter_none(parameters)}
+
+    def get_response(self, response: Union[bytes, Dict]) -> Any:
+        url = _as_dict(response)["audio"]["url"]
+        return get_session().get(url).content
+
+
+class FalAITextToVideoTask(FalAITask):
+    def __init__(self):
+        super().__init__("text-to-video")
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        return {"prompt": inputs, **filter_none(parameters)}
+
+    def get_response(self, response: Union[bytes, Dict]) -> Any:
+        url = _as_dict(response)["video"]["url"]
+        return get_session().get(url).content
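
A quick sketch of the width/height remapping done by FalAITextToImageTask above; the model ID is illustrative, and the private method is called directly only for demonstration.

# Illustrative only: FalAITextToImageTask folds width/height into an "image_size" object.
from huggingface_hub.inference._providers.fal_ai import FalAITextToImageTask

payload = FalAITextToImageTask()._prepare_payload_as_dict(
    "a watercolor fox",
    {"width": 768, "height": 512, "seed": None},  # None values are dropped by filter_none
    mapped_model="fal-ai/some-model",  # illustrative provider-side model ID
)
print(payload)
# {'prompt': 'a watercolor fox', 'image_size': {'width': 768, 'height': 512}}
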
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/fireworks_ai.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/fireworks_ai.py
new file mode 100644
index 00000000..bac95c29
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/fireworks_ai.py
@@ -0,0 +1,6 @@
+from ._common import BaseConversationalTask
+
+
+class FireworksAIConversationalTask(BaseConversationalTask):
+    def __init__(self):
+        super().__init__(provider="fireworks-ai", base_url="https://api.fireworks.ai/inference")
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/hf_inference.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/hf_inference.py
new file mode 100644
index 00000000..2377f91b
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/hf_inference.py
@@ -0,0 +1,122 @@
+import json
+from functools import lru_cache
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from huggingface_hub import constants
+from huggingface_hub.inference._common import _b64_encode, _open_as_binary
+from huggingface_hub.inference._providers._common import TaskProviderHelper, filter_none
+from huggingface_hub.utils import build_hf_headers, get_session, get_token, hf_raise_for_status
+
+
+class HFInferenceTask(TaskProviderHelper):
+    """Base class for HF Inference API tasks."""
+
+    def __init__(self, task: str):
+        super().__init__(
+            provider="hf-inference",
+            base_url=constants.INFERENCE_PROXY_TEMPLATE.format(provider="hf-inference"),
+            task=task,
+        )
+
+    def _prepare_api_key(self, api_key: Optional[str]) -> str:
+        # special case: for HF Inference we allow not providing an API key
+        return api_key or get_token()  # type: ignore[return-value]
+
+    def _prepare_mapped_model(self, model: Optional[str]) -> str:
+        if model is not None:
+            return model
+        model = _fetch_recommended_models().get(self.task)
+        if model is None:
+            raise ValueError(
+                f"Task {self.task} has no recommended model for HF Inference. Please specify a model"
+                " explicitly. Visit https://huggingface.co/tasks for more info."
+            )
+        return model
+
+    def _prepare_url(self, api_key: str, mapped_model: str) -> str:
+        # hf-inference provider can handle URLs (e.g. Inference Endpoints or TGI deployment)
+        if mapped_model.startswith(("http://", "https://")):
+            return mapped_model
+        return (
+            # Feature-extraction and sentence-similarity are the only cases where we handle models with several tasks.
+            f"{self.base_url}/pipeline/{self.task}/{mapped_model}"
+            if self.task in ("feature-extraction", "sentence-similarity")
+            # Otherwise, we use the default endpoint
+            else f"{self.base_url}/models/{mapped_model}"
+        )
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        if isinstance(inputs, bytes):
+            raise ValueError(f"Unexpected binary input for task {self.task}.")
+        if isinstance(inputs, Path):
+            raise ValueError(f"Unexpected path input for task {self.task} (got {inputs})")
+        return {"inputs": inputs, "parameters": filter_none(parameters)}
+
+
+class HFInferenceBinaryInputTask(HFInferenceTask):
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        return None
+
+    def _prepare_payload_as_bytes(
+        self, inputs: Any, parameters: Dict, mapped_model: str, extra_payload: Optional[Dict]
+    ) -> Optional[bytes]:
+        parameters = filter_none(parameters)
+        extra_payload = extra_payload or {}
+        has_parameters = len(parameters) > 0 or len(extra_payload) > 0
+
+        # Raise if not a binary object or a local path or a URL.
+        if not isinstance(inputs, (bytes, Path)) and not isinstance(inputs, str):
+            raise ValueError(f"Expected binary inputs or a local path or a URL. Got {inputs}")
+
+        # Send inputs as raw content when no parameters are provided
+        if not has_parameters:
+            with _open_as_binary(inputs) as data:
+                data_as_bytes = data if isinstance(data, bytes) else data.read()
+                return data_as_bytes
+
+        # Otherwise encode as b64
+        return json.dumps({"inputs": _b64_encode(inputs), "parameters": parameters, **extra_payload}).encode("utf-8")
+
+
+class HFInferenceConversational(HFInferenceTask):
+    def __init__(self):
+        super().__init__("text-generation")
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        payload_model = parameters.get("model") or mapped_model
+
+        if payload_model is None or payload_model.startswith(("http://", "https://")):
+            payload_model = "dummy"
+
+        return {**filter_none(parameters), "model": payload_model, "messages": inputs}
+
+    def _prepare_url(self, api_key: str, mapped_model: str) -> str:
+        base_url = (
+            mapped_model
+            if mapped_model.startswith(("http://", "https://"))
+            else f"{constants.INFERENCE_PROXY_TEMPLATE.format(provider='hf-inference')}/models/{mapped_model}"
+        )
+        return _build_chat_completion_url(base_url)
+
+
+def _build_chat_completion_url(model_url: str) -> str:
+    # Strip trailing /
+    model_url = model_url.rstrip("/")
+
+    # If the URL already ends with /v1, only append /chat/completions
+    if model_url.endswith("/v1"):
+        model_url += "/chat/completions"
+
+    # Otherwise, append /v1/chat/completions unless it is already present
+    if not model_url.endswith("/chat/completions"):
+        model_url += "/v1/chat/completions"
+
+    return model_url
+
+
+@lru_cache(maxsize=1)
+def _fetch_recommended_models() -> Dict[str, Optional[str]]:
+    response = get_session().get(f"{constants.ENDPOINT}/api/tasks", headers=build_hf_headers())
+    hf_raise_for_status(response)
+    return {task: next(iter(details["widgetModels"]), None) for task, details in response.json().items()}
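
A few worked examples of the URL normalization performed by _build_chat_completion_url; the endpoint URLs are placeholders.

# Illustrative only: how _build_chat_completion_url normalizes different base URLs.
from huggingface_hub.inference._providers.hf_inference import _build_chat_completion_url

print(_build_chat_completion_url("https://my-endpoint.example/v1"))
# https://my-endpoint.example/v1/chat/completions
print(_build_chat_completion_url("https://my-endpoint.example"))
# https://my-endpoint.example/v1/chat/completions
print(_build_chat_completion_url("https://my-endpoint.example/v1/chat/completions/"))
# https://my-endpoint.example/v1/chat/completions
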
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/hyperbolic.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/hyperbolic.py
new file mode 100644
index 00000000..919a3818
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/hyperbolic.py
@@ -0,0 +1,43 @@
+import base64
+from typing import Any, Dict, Optional, Union
+
+from huggingface_hub.inference._common import _as_dict
+from huggingface_hub.inference._providers._common import BaseConversationalTask, TaskProviderHelper, filter_none
+
+
+class HyperbolicTextToImageTask(TaskProviderHelper):
+    def __init__(self):
+        super().__init__(provider="hyperbolic", base_url="https://api.hyperbolic.xyz", task="text-to-image")
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        return "/v1/images/generations"
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        parameters = filter_none(parameters)
+        if "num_inference_steps" in parameters:
+            parameters["steps"] = parameters.pop("num_inference_steps")
+        if "guidance_scale" in parameters:
+            parameters["cfg_scale"] = parameters.pop("guidance_scale")
+        # For Hyperbolic, the width and height are required parameters
+        if "width" not in parameters:
+            parameters["width"] = 512
+        if "height" not in parameters:
+            parameters["height"] = 512
+        return {"prompt": inputs, "model_name": mapped_model, **parameters}
+
+    def get_response(self, response: Union[bytes, Dict]) -> Any:
+        response_dict = _as_dict(response)
+        return base64.b64decode(response_dict["images"][0]["image"])
+
+
+class HyperbolicTextGenerationTask(BaseConversationalTask):
+    """
+    Special case for Hyperbolic, where the text-generation task is handled as a conversational task.
+    """
+
+    def __init__(self, task: str):
+        super().__init__(
+            provider="hyperbolic",
+            base_url="https://api.hyperbolic.xyz",
+        )
+        self.task = task
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/nebius.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/nebius.py
new file mode 100644
index 00000000..d6b37356
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/nebius.py
@@ -0,0 +1,41 @@
+import base64
+from typing import Any, Dict, Optional, Union
+
+from huggingface_hub.inference._common import _as_dict
+from huggingface_hub.inference._providers._common import (
+    BaseConversationalTask,
+    BaseTextGenerationTask,
+    TaskProviderHelper,
+    filter_none,
+)
+
+
+class NebiusTextGenerationTask(BaseTextGenerationTask):
+    def __init__(self):
+        super().__init__(provider="nebius", base_url="https://api.studio.nebius.ai")
+
+
+class NebiusConversationalTask(BaseConversationalTask):
+    def __init__(self):
+        super().__init__(provider="nebius", base_url="https://api.studio.nebius.ai")
+
+
+class NebiusTextToImageTask(TaskProviderHelper):
+    def __init__(self):
+        super().__init__(task="text-to-image", provider="nebius", base_url="https://api.studio.nebius.ai")
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        return "/v1/images/generations"
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        parameters = filter_none(parameters)
+        if "guidance_scale" in parameters:
+            parameters.pop("guidance_scale")
+        if parameters.get("response_format") not in ("b64_json", "url"):
+            parameters["response_format"] = "b64_json"
+
+        return {"prompt": inputs, **parameters, "model": mapped_model}
+
+    def get_response(self, response: Union[bytes, Dict]) -> Any:
+        response_dict = _as_dict(response)
+        return base64.b64decode(response_dict["data"][0]["b64_json"])
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/novita.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/novita.py
new file mode 100644
index 00000000..3fc836a3
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/novita.py
@@ -0,0 +1,26 @@
+from huggingface_hub.inference._providers._common import (
+    BaseConversationalTask,
+    BaseTextGenerationTask,
+)
+
+
+_PROVIDER = "novita"
+_BASE_URL = "https://api.novita.ai/v3/openai"
+
+
+class NovitaTextGenerationTask(BaseTextGenerationTask):
+    def __init__(self):
+        super().__init__(provider=_PROVIDER, base_url=_BASE_URL)
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        # There is no /v1 prefix in Novita's routes
+        return "/completions"
+
+
+class NovitaConversationalTask(BaseConversationalTask):
+    def __init__(self):
+        super().__init__(provider=_PROVIDER, base_url=_BASE_URL)
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        # There is no /v1 prefix in Novita's routes
+        return "/chat/completions"
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/replicate.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/replicate.py
new file mode 100644
index 00000000..dc84f69f
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/replicate.py
@@ -0,0 +1,53 @@
+from typing import Any, Dict, Optional, Union
+
+from huggingface_hub.inference._common import _as_dict
+from huggingface_hub.inference._providers._common import TaskProviderHelper, filter_none
+from huggingface_hub.utils import get_session
+
+
+_PROVIDER = "replicate"
+_BASE_URL = "https://api.replicate.com"
+
+
+class ReplicateTask(TaskProviderHelper):
+    def __init__(self, task: str):
+        super().__init__(provider=_PROVIDER, base_url=_BASE_URL, task=task)
+
+    def _prepare_headers(self, headers: Dict, api_key: str) -> Dict:
+        headers = super()._prepare_headers(headers, api_key)
+        headers["Prefer"] = "wait"
+        return headers
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        if ":" in mapped_model:
+            return "/v1/predictions"
+        return f"/v1/models/{mapped_model}/predictions"
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        payload: Dict[str, Any] = {"input": {"prompt": inputs, **filter_none(parameters)}}
+        if ":" in mapped_model:
+            version = mapped_model.split(":", 1)[1]
+            payload["version"] = version
+        return payload
+
+    def get_response(self, response: Union[bytes, Dict]) -> Any:
+        response_dict = _as_dict(response)
+        if response_dict.get("output") is None:
+            raise TimeoutError(
+                f"Inference request timed out after 60 seconds. No output generated for model {response_dict.get('model')}"
+                "The model might be in cold state or starting up. Please try again later."
+            )
+        output_url = (
+            response_dict["output"] if isinstance(response_dict["output"], str) else response_dict["output"][0]
+        )
+        return get_session().get(output_url).content
+
+
+class ReplicateTextToSpeechTask(ReplicateTask):
+    def __init__(self):
+        super().__init__("text-to-speech")
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        payload: Dict = super()._prepare_payload_as_dict(inputs, parameters, mapped_model)  # type: ignore[assignment]
+        payload["input"]["text"] = payload["input"].pop("prompt")  # rename "prompt" to "text" for TTS
+        return payload
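
A sketch of how ReplicateTask routes versioned vs. unversioned model IDs; the model IDs are made up, and the private methods are called directly only to show the resulting shapes.

# Illustrative only: route and payload differences for "model" vs "model:version" IDs.
from huggingface_hub.inference._providers.replicate import ReplicateTask

task = ReplicateTask("text-to-image")

# Plain model ID -> model-scoped predictions route, no "version" field in the payload.
print(task._prepare_route("owner/some-model"))
# /v1/models/owner/some-model/predictions

# "model:version" ID -> generic predictions route plus an explicit "version" field.
print(task._prepare_route("owner/some-model:abc123"))
# /v1/predictions
print(task._prepare_payload_as_dict("a neon city at night", {}, "owner/some-model:abc123"))
# {'input': {'prompt': 'a neon city at night'}, 'version': 'abc123'}
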
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/sambanova.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/sambanova.py
new file mode 100644
index 00000000..3678e942
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/sambanova.py
@@ -0,0 +1,6 @@
+from huggingface_hub.inference._providers._common import BaseConversationalTask
+
+
+class SambanovaConversationalTask(BaseConversationalTask):
+    def __init__(self):
+        super().__init__(provider="sambanova", base_url="https://api.sambanova.ai")
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/together.py b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/together.py
new file mode 100644
index 00000000..6e2c1eb4
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_providers/together.py
@@ -0,0 +1,59 @@
+import base64
+from abc import ABC
+from typing import Any, Dict, Optional, Union
+
+from huggingface_hub.inference._common import _as_dict
+from huggingface_hub.inference._providers._common import (
+    BaseConversationalTask,
+    BaseTextGenerationTask,
+    TaskProviderHelper,
+    filter_none,
+)
+
+
+_PROVIDER = "together"
+_BASE_URL = "https://api.together.xyz"
+
+
+class TogetherTask(TaskProviderHelper, ABC):
+    """Base class for Together API tasks."""
+
+    def __init__(self, task: str):
+        super().__init__(provider=_PROVIDER, base_url=_BASE_URL, task=task)
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        if self.task == "text-to-image":
+            return "/v1/images/generations"
+        elif self.task == "conversational":
+            return "/v1/chat/completions"
+        elif self.task == "text-generation":
+            return "/v1/completions"
+        raise ValueError(f"Unsupported task '{self.task}' for Together API.")
+
+
+class TogetherTextGenerationTask(BaseTextGenerationTask):
+    def __init__(self):
+        super().__init__(provider=_PROVIDER, base_url=_BASE_URL)
+
+
+class TogetherConversationalTask(BaseConversationalTask):
+    def __init__(self):
+        super().__init__(provider=_PROVIDER, base_url=_BASE_URL)
+
+
+class TogetherTextToImageTask(TogetherTask):
+    def __init__(self):
+        super().__init__("text-to-image")
+
+    def _prepare_payload_as_dict(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
+        parameters = filter_none(parameters)
+        if "num_inference_steps" in parameters:
+            parameters["steps"] = parameters.pop("num_inference_steps")
+        if "guidance_scale" in parameters:
+            parameters["guidance"] = parameters.pop("guidance_scale")
+
+        return {"prompt": inputs, "response_format": "base64", **parameters, "model": mapped_model}
+
+    def get_response(self, response: Union[bytes, Dict]) -> Any:
+        response_dict = _as_dict(response)
+        return base64.b64decode(response_dict["data"][0]["b64_json"])