# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import copy
import hashlib
import logging
import os
import re
import shutil
import subprocess
import tempfile
import zipfile
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from threading import Lock
from typing import Iterable, List, Optional, Union
from typing_extensions import Literal
from azure.ai.ml.constants._common import DefaultOpenEncoding
from ._http_utils import HttpPipeline
from .utils import get_base_directory_for_cache
_logger = logging.getLogger(__name__)
class ArtifactCache:
"""Disk cache of azure artifact packages.
The key of the cache is path of artifact packages in local, like this
azure-ai-ml/components/additional_includes/artifacts/{organization}/{project}/{feed}/{package_name}/{version}.
The value is the files/folders in this cache folder.
"""
# artifact cache is shared across SDK versions and across workspaces/registries
DEFAULT_DISK_CACHE_DIRECTORY = get_base_directory_for_cache().joinpath(
"components",
"additional_includes",
"artifacts",
)
POSTFIX_CHECKSUM = "checksum"
_instance_lock = Lock()
_instance = None
def __new__(cls):
"""Singleton creation disk cache."""
if cls._instance is None:
with cls._instance_lock:
if cls._instance is None:
cls._instance = object.__new__(cls)
cls.check_artifact_extension()
return cls._instance
@staticmethod
def check_artifact_extension():
# check az extension azure-devops installed. Install it if not installed.
result = subprocess.run(
[shutil.which("az"), "artifacts", "--help", "--yes"],
capture_output=True,
check=False,
)
if result.returncode != 0:
raise RuntimeError(
"Auto-installation failed. Please install azure-devops "
"extension by 'az extension add --name azure-devops'."
)
def __init__(self, cache_directory=None):
self._cache_directory = cache_directory or self.DEFAULT_DISK_CACHE_DIRECTORY
Path(self._cache_directory).mkdir(exist_ok=True, parents=True)
self._artifacts_tool_path = None
self._download_locks = defaultdict(Lock)
@property
def cache_directory(self) -> Path:
"""Cache directory path.
:return: The cache directory
:rtype: Path
"""
return self._cache_directory
@staticmethod
def hash_files_content(file_list: List[Union[str, os.PathLike]]) -> str:
"""Hash the file content in the file list.
:param file_list: The list of files to hash
:type file_list: List[Union[str, os.PathLike]]
:return: Hashed file contents
:rtype: str
"""
ordered_file_list = copy.copy(file_list)
hasher = hashlib.sha256()
ordered_file_list.sort()
for item in ordered_file_list:
with open(item, "rb") as f:
hasher.update(f.read())
return hasher.hexdigest()
@staticmethod
def _format_organization_name(organization):
pattern = r'[<>:"\\/|?*]'
normalized_organization_name = re.sub(pattern, "_", organization)
return normalized_organization_name
@staticmethod
def get_organization_project_by_git():
"""Get organization and project from git remote url. For example, the git remote url is
"https://organization.visualstudio.com/xxx/project_name/_git/repositry_name" or
"https://dev.azure.com/{organization}/project".
:return organization_url, project: organization_url, project
:rtype organization_url, project: str, str
"""
result = subprocess.run(
[shutil.which("git"), "config", "--get", "remote.origin.url"],
capture_output=True,
encoding="utf-8",
check=False,
)
if result.returncode != 0:
# When organization and project cannot be retrieved from the origin url.
raise RuntimeError(
f"Get the git origin url failed, you must be in a local Git directory, "
f"error message: {result.stderr}"
)
origin_url = result.stdout.strip()
# Organization URL has two format, https://dev.azure.com/{organization} and
# https://{organization}.visualstudio.com
# https://learn.microsoft.com/azure/devops/extend/develop/work-with-urls?view=azure-devops&tabs=http
if "dev.azure.com" in origin_url:
regex = r"^https:\/\/\w*@?dev\.azure\.com\/(\w*)\/(\w*)"
results = re.findall(regex, origin_url)
if results:
organization, project = results[0]
return f"https://dev.azure.com/{organization}", project
elif "visualstudio.com" in origin_url:
regex = r"https:\/\/(\w*)\.visualstudio\.com.*\/(\w*)\/_git"
results = re.findall(regex, origin_url)
if results:
organization, project = results[0]
return f"https://{organization}.visualstudio.com", project
# When organization and project cannot be retrieved from the origin url.
raise RuntimeError(
f'Cannot get organization and project from git origin url "{origin_url}", '
f'you must be in a local Git directory that has a "remote" referencing a '
f"Azure DevOps or Azure DevOps Server repository."
)
@classmethod
def _get_checksum_path(cls, path):
artifact_path = Path(path)
return artifact_path.parent / f"{artifact_path.name}_{cls.POSTFIX_CHECKSUM}"
def _redirect_artifacts_tool_path(self, organization: Optional[str]):
"""Downloads the artifacts tool and redirects `az artifact` command to it.
Done to avoid the transient issue when download artifacts
:param organization: The organization url. If None, is determined by local git repo
:type organization: Optional[str]
"""
from azure.identity import DefaultAzureCredential
if not organization:
organization, _ = self.get_organization_project_by_git()
organization_pattern = r"https:\/\/(.*)\.visualstudio\.com"
result = re.findall(pattern=organization_pattern, string=organization)
if result:
organization_name = result[0]
else:
organization_pattern = r"https:\/\/dev\.azure\.com\/(.*)"
result = re.findall(pattern=organization_pattern, string=organization)
if not result:
raise RuntimeError("Cannot find artifact organization.")
organization_name = result[0]
if not self._artifacts_tool_path:
os_name = "Windows" if os.name == "nt" else "Linux"
credential = DefaultAzureCredential()
token = credential.get_token("https://management.azure.com/.default")
header = {"Authorization": "Bearer " + token.token}
# The underlying HttpTransport is meant to be user configurable.
# MLClient instances have a user configured Pipeline for sending http requests
# TODO: Replace this with MlCLient._requests_pipeline
requests_pipeline = HttpPipeline()
url = (
f"https://{organization_name}.vsblob.visualstudio.com/_apis/clienttools/ArtifactTool/release?"
f"osName={os_name}&arch=AMD64"
)
response = requests_pipeline.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg
url, headers=header
)
if response.status_code == 200:
artifacts_tool_path = tempfile.mkdtemp() # nosec B306
artifacts_tool_uri = response.json()["uri"]
response = requests_pipeline.get(artifacts_tool_uri) # pylint: disable=too-many-function-args
with zipfile.ZipFile(BytesIO(response.content)) as zip_file:
zip_file.extractall(artifacts_tool_path)
os.environ["AZURE_DEVOPS_EXT_ARTIFACTTOOL_OVERRIDE_PATH"] = str(artifacts_tool_path.resolve())
self._artifacts_tool_path = artifacts_tool_path
else:
_logger.warning("Download artifact tool failed: %s", response.text)
def _download_artifacts(
self,
download_cmd: Iterable[str],
organization: Optional[str],
name: str,
version: str,
feed: str,
max_retries: int = 3,
):
"""Download artifacts with retry.
:param download_cmd: The command used to download the artifact
:type download_cmd: Iterable[str]
:param organization: The artifact organization
:type organization: Optional[str]
:param name: The package name
:type name: str
:param version: The package version
:type version: str
:param feed: The download feed
:type feed: str
:param max_retries: The number of times to retry the download. Defaults to 3
:type max_retries: int
"""
retries = 0
while retries <= max_retries:
try:
self._redirect_artifacts_tool_path(organization)
except Exception as e: # pylint: disable=W0718
_logger.warning("Redirect artifacts tool path failed, details: %s", e)
retries += 1
result = subprocess.run(
download_cmd,
capture_output=True,
encoding="utf-8",
check=False,
)
if result.returncode != 0:
error_msg = (
f"Download package {name}:{version} from the feed {feed} failed {retries} times: {result.stderr}"
)
if retries < max_retries:
_logger.warning(error_msg)
else:
error_msg = error_msg + f"\nDownload artifact debug info: {result.stdout}"
raise RuntimeError(error_msg)
else:
return
def _check_artifacts(self, artifact_package_path: Union[str, os.PathLike]) -> bool:
"""Check the artifact folder is legal.
:param artifact_package_path: The artifact package path
:type artifact_package_path: Union[str, os.PathLike]
:return:
* If the artifact folder or checksum file does not exist, return false.
* If the checksum file exists and does not equal to the hash of artifact folder, return False.
* If the checksum file equals to the hash of artifact folder, return true.
:rtype: bool
"""
path = Path(artifact_package_path)
if not path.exists():
return False
checksum_path = self._get_checksum_path(artifact_package_path)
if checksum_path.exists():
with open(checksum_path, "r", encoding=DefaultOpenEncoding.READ) as f:
checksum = f.read()
file_list = [os.path.join(root, f) for root, _, files in os.walk(path) for f in files]
artifact_hash = self.hash_files_content(file_list)
return checksum == artifact_hash
return False
def get(
self,
feed: str,
name: str,
version: str,
scope: Literal["project", "organization"],
organization: Optional[str] = None,
project: Optional[str] = None,
resolve: bool = True,
) -> Optional[Path]:
"""Get the catch path of artifact package. Package path like this azure-ai-
ml/components/additional_includes/artifacts/{organization}/{project}/{feed}/{package_name}/{version}. If the
path exits, it will return the package path. If the path not exist and resolve=True, it will download the
artifact package and return package path. If the path not exist and resolve=False, it will return None.
:param feed: Name or ID of the feed.
:type feed: str
:param name: Name of the package.
:type name: str
:param version: Version of the package.
:type version: str
:param scope: Scope of the feed: 'project' if the feed was created in a project, and 'organization' otherwise.
:type scope: Literal["project", "organization"]
:param organization: Azure DevOps organization URL.
:type organization: str
:param project: Name or ID of the project.
:type project: str
:param resolve: Whether download package when package does not exist in local.
:type resolve: bool
:return artifact_package_path: Cache path of the artifact package
:rtype: Optional[Path]
"""
if not all([organization, project]):
org_val, project_val = self.get_organization_project_by_git()
organization = organization or org_val
project = project or project_val
artifact_package_path = (
Path(self.DEFAULT_DISK_CACHE_DIRECTORY)
/ self._format_organization_name(organization)
/ project
/ feed
/ name
/ version
)
# Use lock to avoid downloading the same package at the same time.
with self._download_locks[artifact_package_path]:
if self._check_artifacts(artifact_package_path):
# When the cache folder of artifact package exists, it's sure that the package has been downloaded.
return artifact_package_path.absolute().resolve()
if resolve:
check_sum_path = self._get_checksum_path(artifact_package_path)
if Path(check_sum_path).exists():
os.unlink(check_sum_path)
if artifact_package_path.exists():
# Remove invalid artifact package to avoid affecting download artifact.
temp_folder = tempfile.mkdtemp() # nosec B306
os.rename(artifact_package_path, temp_folder)
shutil.rmtree(temp_folder)
# Download artifact
return self.set(
feed=feed,
name=name,
version=version,
organization=organization,
project=project,
scope=scope,
)
return None
def set(
self,
feed: str,
name: str,
version: str,
scope: Literal["project", "organization"],
organization: Optional[str] = None,
project: Optional[str] = None,
) -> Path:
"""Set the artifact package to the cache. The key of the cache is path of artifact packages in local. The value
is the files/folders in this cache folder. If package path exists, directly return package path.
:param feed: Name or ID of the feed.
:type feed: str
:param name: Name of the package.
:type name: str
:param version: Version of the package.
:type version: str
:param scope: Scope of the feed: 'project' if the feed was created in a project, and 'organization' otherwise.
:type scope: Literal["project", "organization"]
:param organization: Azure DevOps organization URL.
:type organization: str
:param project: Name or ID of the project.
:type project: str
:return artifact_package_path: Cache path of the artifact package
:rtype: Path
"""
tempdir = tempfile.mkdtemp() # nosec B306
download_cmd = [
shutil.which("az"),
"artifacts",
"universal",
"download",
"--feed",
feed,
"--name",
name,
"--version",
version,
"--scope",
scope,
"--path",
tempdir,
]
if organization:
download_cmd.extend(["--org", organization])
if project:
download_cmd.extend(["--project", project])
_logger.info("Start downloading artifacts %s:%s from %s.", name, version, feed)
result = subprocess.run(
download_cmd,
capture_output=True,
encoding="utf-8",
check=False,
)
if result.returncode != 0:
artifacts_tool_not_find_error_pattern = "No such file or directory: .*artifacttool"
if re.findall(artifacts_tool_not_find_error_pattern, result.stderr):
# When download artifacts tool failed retry download artifacts command
_logger.warning(
"Download package %s:%s from the feed %s failed: %s", name, version, feed, result.stderr
)
download_cmd.append("--debug")
self._download_artifacts(download_cmd, organization, name, version, feed)
else:
raise RuntimeError(f"Download package {name}:{version} from the feed {feed} failed: {result.stderr}")
try:
# Copy artifact package from temp folder to the cache path.
if not all([organization, project]):
org_val, project_val = self.get_organization_project_by_git()
organization = organization or org_val
project = project or project_val
artifact_package_path = (
Path(self.DEFAULT_DISK_CACHE_DIRECTORY)
/ self._format_organization_name(organization)
/ project
/ feed
/ name
/ version
)
artifact_package_path.parent.mkdir(exist_ok=True, parents=True)
file_list = [os.path.join(root, f) for root, _, files in os.walk(tempdir) for f in files]
artifact_hash = self.hash_files_content(file_list)
os.rename(tempdir, artifact_package_path)
temp_checksum_file = os.path.join(tempfile.mkdtemp(), f"{version}_{self.POSTFIX_CHECKSUM}")
with open(temp_checksum_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
f.write(artifact_hash)
os.rename(
temp_checksum_file,
artifact_package_path.parent / f"{version}_{self.POSTFIX_CHECKSUM}",
)
except (FileExistsError, PermissionError, OSError):
# On Windows, if dst exists a FileExistsError is always raised.
# On Unix, if dst is a non-empty directory, an OSError is raised.
# If dst is being used by another process will raise PermissionError.
# https://docs.python.org/3/library/os.html#os.rename
pass
return artifact_package_path.absolute().resolve()