aboutsummaryrefslogtreecommitdiff
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import copy
import hashlib
import logging
import os
import re
import shutil
import subprocess
import tempfile
import zipfile
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from threading import Lock
from typing import Iterable, List, Optional, Union

from typing_extensions import Literal

from azure.ai.ml.constants._common import DefaultOpenEncoding

from ._http_utils import HttpPipeline
from .utils import get_base_directory_for_cache

_logger = logging.getLogger(__name__)


class ArtifactCache:
    """Disk cache of azure artifact packages.

    The key of the cache is path of artifact packages in local, like this
    azure-ai-ml/components/additional_includes/artifacts/{organization}/{project}/{feed}/{package_name}/{version}.
    The value is the files/folders in this cache folder.
    """

    # artifact cache is shared across SDK versions and across workspaces/registries
    DEFAULT_DISK_CACHE_DIRECTORY = get_base_directory_for_cache().joinpath(
        "components",
        "additional_includes",
        "artifacts",
    )
    POSTFIX_CHECKSUM = "checksum"
    _instance_lock = Lock()
    _instance = None

    def __new__(cls):
        """Singleton creation disk cache."""
        if cls._instance is None:
            with cls._instance_lock:
                if cls._instance is None:
                    cls._instance = object.__new__(cls)
                    cls.check_artifact_extension()
        return cls._instance

    @staticmethod
    def check_artifact_extension():
        # check az extension azure-devops installed. Install it if not installed.
        result = subprocess.run(
            [shutil.which("az"), "artifacts", "--help", "--yes"],
            capture_output=True,
            check=False,
        )

        if result.returncode != 0:
            raise RuntimeError(
                "Auto-installation failed. Please install azure-devops "
                "extension by 'az extension add --name azure-devops'."
            )

    def __init__(self, cache_directory=None):
        self._cache_directory = cache_directory or self.DEFAULT_DISK_CACHE_DIRECTORY
        Path(self._cache_directory).mkdir(exist_ok=True, parents=True)
        self._artifacts_tool_path = None
        self._download_locks = defaultdict(Lock)

    @property
    def cache_directory(self) -> Path:
        """Cache directory path.

        :return: The cache directory
        :rtype: Path
        """
        return self._cache_directory

    @staticmethod
    def hash_files_content(file_list: List[Union[str, os.PathLike]]) -> str:
        """Hash the file content in the file list.

        :param file_list: The list of files to hash
        :type file_list: List[Union[str, os.PathLike]]
        :return: Hashed file contents
        :rtype: str
        """
        ordered_file_list = copy.copy(file_list)
        hasher = hashlib.sha256()
        ordered_file_list.sort()
        for item in ordered_file_list:
            with open(item, "rb") as f:
                hasher.update(f.read())
        return hasher.hexdigest()

    @staticmethod
    def _format_organization_name(organization):
        pattern = r'[<>:"\\/|?*]'
        normalized_organization_name = re.sub(pattern, "_", organization)
        return normalized_organization_name

    @staticmethod
    def get_organization_project_by_git():
        """Get organization and project from git remote url. For example, the git remote url is
        "https://organization.visualstudio.com/xxx/project_name/_git/repositry_name" or
        "https://dev.azure.com/{organization}/project".

        :return organization_url, project: organization_url, project
        :rtype organization_url, project: str, str
        """
        result = subprocess.run(
            [shutil.which("git"), "config", "--get", "remote.origin.url"],
            capture_output=True,
            encoding="utf-8",
            check=False,
        )

        if result.returncode != 0:
            # When organization and project cannot be retrieved from the origin url.
            raise RuntimeError(
                f"Get the git origin url failed, you must be in a local Git directory, "
                f"error message: {result.stderr}"
            )
        origin_url = result.stdout.strip()

        # Organization URL has two format, https://dev.azure.com/{organization} and
        # https://{organization}.visualstudio.com
        # https://learn.microsoft.com/azure/devops/extend/develop/work-with-urls?view=azure-devops&tabs=http
        if "dev.azure.com" in origin_url:
            regex = r"^https:\/\/\w*@?dev\.azure\.com\/(\w*)\/(\w*)"
            results = re.findall(regex, origin_url)
            if results:
                organization, project = results[0]
                return f"https://dev.azure.com/{organization}", project
        elif "visualstudio.com" in origin_url:
            regex = r"https:\/\/(\w*)\.visualstudio\.com.*\/(\w*)\/_git"
            results = re.findall(regex, origin_url)
            if results:
                organization, project = results[0]
                return f"https://{organization}.visualstudio.com", project

        # When organization and project cannot be retrieved from the origin url.
        raise RuntimeError(
            f'Cannot get organization and project from git origin url "{origin_url}", '
            f'you must be in a local Git directory that has a "remote" referencing a '
            f"Azure DevOps or Azure DevOps Server repository."
        )

    @classmethod
    def _get_checksum_path(cls, path):
        artifact_path = Path(path)
        return artifact_path.parent / f"{artifact_path.name}_{cls.POSTFIX_CHECKSUM}"

    def _redirect_artifacts_tool_path(self, organization: Optional[str]):
        """Downloads the artifacts tool and redirects `az artifact` command to it.

        Done to avoid the transient issue when download artifacts

        :param organization:  The organization url. If None, is determined by local git repo
        :type organization: Optional[str]
        """
        from azure.identity import DefaultAzureCredential

        if not organization:
            organization, _ = self.get_organization_project_by_git()

        organization_pattern = r"https:\/\/(.*)\.visualstudio\.com"
        result = re.findall(pattern=organization_pattern, string=organization)
        if result:
            organization_name = result[0]
        else:
            organization_pattern = r"https:\/\/dev\.azure\.com\/(.*)"
            result = re.findall(pattern=organization_pattern, string=organization)
            if not result:
                raise RuntimeError("Cannot find artifact organization.")
            organization_name = result[0]

        if not self._artifacts_tool_path:
            os_name = "Windows" if os.name == "nt" else "Linux"
            credential = DefaultAzureCredential()
            token = credential.get_token("https://management.azure.com/.default")
            header = {"Authorization": "Bearer " + token.token}

            # The underlying HttpTransport is meant to be user configurable.
            # MLClient instances have a user configured Pipeline for sending http requests
            # TODO: Replace this with MlCLient._requests_pipeline
            requests_pipeline = HttpPipeline()
            url = (
                f"https://{organization_name}.vsblob.visualstudio.com/_apis/clienttools/ArtifactTool/release?"
                f"osName={os_name}&arch=AMD64"
            )
            response = requests_pipeline.get(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
                url, headers=header
            )
            if response.status_code == 200:
                artifacts_tool_path = tempfile.mkdtemp()  # nosec B306
                artifacts_tool_uri = response.json()["uri"]
                response = requests_pipeline.get(artifacts_tool_uri)  # pylint: disable=too-many-function-args
                with zipfile.ZipFile(BytesIO(response.content)) as zip_file:
                    zip_file.extractall(artifacts_tool_path)
                os.environ["AZURE_DEVOPS_EXT_ARTIFACTTOOL_OVERRIDE_PATH"] = str(artifacts_tool_path.resolve())
                self._artifacts_tool_path = artifacts_tool_path
            else:
                _logger.warning("Download artifact tool failed: %s", response.text)

    def _download_artifacts(
        self,
        download_cmd: Iterable[str],
        organization: Optional[str],
        name: str,
        version: str,
        feed: str,
        max_retries: int = 3,
    ):
        """Download artifacts with retry.

        :param download_cmd: The command used to download the artifact
        :type download_cmd: Iterable[str]
        :param organization: The artifact organization
        :type organization: Optional[str]
        :param name: The package name
        :type name: str
        :param version: The package version
        :type version: str
        :param feed: The download feed
        :type feed: str
        :param max_retries: The number of times to retry the download. Defaults to 3
        :type max_retries: int
        """
        retries = 0
        while retries <= max_retries:
            try:
                self._redirect_artifacts_tool_path(organization)
            except Exception as e:  # pylint: disable=W0718
                _logger.warning("Redirect artifacts tool path failed, details: %s", e)

            retries += 1
            result = subprocess.run(
                download_cmd,
                capture_output=True,
                encoding="utf-8",
                check=False,
            )

            if result.returncode != 0:
                error_msg = (
                    f"Download package {name}:{version} from the feed {feed} failed {retries} times: {result.stderr}"
                )
                if retries < max_retries:
                    _logger.warning(error_msg)
                else:
                    error_msg = error_msg + f"\nDownload artifact debug info: {result.stdout}"
                    raise RuntimeError(error_msg)
            else:
                return

    def _check_artifacts(self, artifact_package_path: Union[str, os.PathLike]) -> bool:
        """Check the artifact folder is legal.

        :param artifact_package_path: The artifact package path
        :type artifact_package_path: Union[str, os.PathLike]
        :return:
          * If the artifact folder or checksum file does not exist, return false.
          * If the checksum file exists and does not equal to the hash of artifact folder, return False.
          * If the checksum file equals to the hash of artifact folder, return true.
        :rtype: bool
        """
        path = Path(artifact_package_path)
        if not path.exists():
            return False
        checksum_path = self._get_checksum_path(artifact_package_path)
        if checksum_path.exists():
            with open(checksum_path, "r", encoding=DefaultOpenEncoding.READ) as f:
                checksum = f.read()
                file_list = [os.path.join(root, f) for root, _, files in os.walk(path) for f in files]
                artifact_hash = self.hash_files_content(file_list)
                return checksum == artifact_hash
        return False

    def get(
        self,
        feed: str,
        name: str,
        version: str,
        scope: Literal["project", "organization"],
        organization: Optional[str] = None,
        project: Optional[str] = None,
        resolve: bool = True,
    ) -> Optional[Path]:
        """Get the catch path of artifact package. Package path like this azure-ai-
        ml/components/additional_includes/artifacts/{organization}/{project}/{feed}/{package_name}/{version}. If the
        path exits, it will return the package path. If the path not exist and resolve=True, it will download the
        artifact package and return package path. If the path not exist and resolve=False, it will return None.

        :param feed: Name or ID of the feed.
        :type feed: str
        :param name: Name of the package.
        :type name: str
        :param version: Version of the package.
        :type version: str
        :param scope: Scope of the feed: 'project' if the feed was created in a project, and 'organization' otherwise.
        :type scope: Literal["project", "organization"]
        :param organization: Azure DevOps organization URL.
        :type organization: str
        :param project: Name or ID of the project.
        :type project: str
        :param resolve: Whether download package when package does not exist in local.
        :type resolve: bool
        :return artifact_package_path: Cache path of the artifact package
        :rtype: Optional[Path]
        """
        if not all([organization, project]):
            org_val, project_val = self.get_organization_project_by_git()
            organization = organization or org_val
            project = project or project_val
        artifact_package_path = (
            Path(self.DEFAULT_DISK_CACHE_DIRECTORY)
            / self._format_organization_name(organization)
            / project
            / feed
            / name
            / version
        )
        # Use lock to avoid downloading the same package at the same time.
        with self._download_locks[artifact_package_path]:
            if self._check_artifacts(artifact_package_path):
                # When the cache folder of artifact package exists, it's sure that the package has been downloaded.
                return artifact_package_path.absolute().resolve()
            if resolve:
                check_sum_path = self._get_checksum_path(artifact_package_path)
                if Path(check_sum_path).exists():
                    os.unlink(check_sum_path)
                if artifact_package_path.exists():
                    # Remove invalid artifact package to avoid affecting download artifact.
                    temp_folder = tempfile.mkdtemp()  # nosec B306
                    os.rename(artifact_package_path, temp_folder)
                    shutil.rmtree(temp_folder)
                # Download artifact
                return self.set(
                    feed=feed,
                    name=name,
                    version=version,
                    organization=organization,
                    project=project,
                    scope=scope,
                )
        return None

    def set(
        self,
        feed: str,
        name: str,
        version: str,
        scope: Literal["project", "organization"],
        organization: Optional[str] = None,
        project: Optional[str] = None,
    ) -> Path:
        """Set the artifact package to the cache. The key of the cache is path of artifact packages in local. The value
        is the files/folders in this cache folder. If package path exists, directly return package path.

        :param feed: Name or ID of the feed.
        :type feed: str
        :param name: Name of the package.
        :type name: str
        :param version: Version of the package.
        :type version: str
        :param scope: Scope of the feed: 'project' if the feed was created in a project, and 'organization' otherwise.
        :type scope: Literal["project", "organization"]
        :param organization: Azure DevOps organization URL.
        :type organization: str
        :param project: Name or ID of the project.
        :type project: str
        :return artifact_package_path: Cache path of the artifact package
        :rtype: Path
        """
        tempdir = tempfile.mkdtemp()  # nosec B306
        download_cmd = [
            shutil.which("az"),
            "artifacts",
            "universal",
            "download",
            "--feed",
            feed,
            "--name",
            name,
            "--version",
            version,
            "--scope",
            scope,
            "--path",
            tempdir,
        ]
        if organization:
            download_cmd.extend(["--org", organization])
        if project:
            download_cmd.extend(["--project", project])
        _logger.info("Start downloading artifacts %s:%s from %s.", name, version, feed)
        result = subprocess.run(
            download_cmd,
            capture_output=True,
            encoding="utf-8",
            check=False,
        )

        if result.returncode != 0:
            artifacts_tool_not_find_error_pattern = "No such file or directory: .*artifacttool"
            if re.findall(artifacts_tool_not_find_error_pattern, result.stderr):
                # When download artifacts tool failed retry download artifacts command
                _logger.warning(
                    "Download package %s:%s from the feed %s failed: %s", name, version, feed, result.stderr
                )
                download_cmd.append("--debug")
                self._download_artifacts(download_cmd, organization, name, version, feed)
            else:
                raise RuntimeError(f"Download package {name}:{version} from the feed {feed} failed: {result.stderr}")
        try:
            # Copy artifact package from temp folder to the cache path.
            if not all([organization, project]):
                org_val, project_val = self.get_organization_project_by_git()
                organization = organization or org_val
                project = project or project_val
            artifact_package_path = (
                Path(self.DEFAULT_DISK_CACHE_DIRECTORY)
                / self._format_organization_name(organization)
                / project
                / feed
                / name
                / version
            )
            artifact_package_path.parent.mkdir(exist_ok=True, parents=True)
            file_list = [os.path.join(root, f) for root, _, files in os.walk(tempdir) for f in files]
            artifact_hash = self.hash_files_content(file_list)
            os.rename(tempdir, artifact_package_path)
            temp_checksum_file = os.path.join(tempfile.mkdtemp(), f"{version}_{self.POSTFIX_CHECKSUM}")
            with open(temp_checksum_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
                f.write(artifact_hash)
            os.rename(
                temp_checksum_file,
                artifact_package_path.parent / f"{version}_{self.POSTFIX_CHECKSUM}",
            )
        except (FileExistsError, PermissionError, OSError):
            # On Windows, if dst exists a FileExistsError is always raised.
            # On Unix, if dst is a non-empty directory, an OSError is raised.
            # If dst is being used by another process will raise PermissionError.
            # https://docs.python.org/3/library/os.html#os.rename
            pass
        return artifact_package_path.absolute().resolve()