| author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
|---|---|---|
| committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
| commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 | |
| tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular | |
| parent | cc961e04ba734dd72309fb548a2f97d67d578813 | |
Diffstat (limited to '.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular')
8 files changed, 2560 insertions, 0 deletions
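As orientation for the diff below (this note and sketch are editorial, not part of the commit): the added files define the AutoML tabular entity surface — the abstract `AutoMLTabular` plus the task-specific `ClassificationJob`, `ForecastingJob`, and `RegressionJob`, configured through `TabularFeaturizationSettings`, `TabularLimitSettings`, and `ForecastingSettings`. A minimal, hedged sketch of how that surface is typically driven, using only the keyword signatures visible in `automl_tabular.py` and `classification_job.py` below; the `mltable` path and column names are placeholders:

```python
from azure.ai.ml import Input
from azure.ai.ml.entities._job.automl.tabular import ClassificationJob

# Build the entity, then configure it through the set_* methods defined
# on AutoMLTabular / ClassificationJob in this diff.
job = ClassificationJob(
    primary_metric="accuracy",  # omitted -> ClassificationPrimaryMetrics.ACCURACY
    positive_label="yes",       # only used for binary-classification metrics
)

job.set_data(
    training_data=Input(type="mltable", path="./train"),  # placeholder path
    target_column_name="label",                           # placeholder column
    n_cross_validations=5,
)
job.set_limits(timeout_minutes=60, max_trials=20, max_concurrent_trials=4)
job.set_training(enable_model_explainability=True)
job.set_featurization(mode="auto")
```

Assigning a plain dict to `job.limits`, `job.training`, or `job.featurization` routes through the same `set_*` methods via the property setters shown below, and a value that is neither a dict nor the matching settings class raises `ValidationException`.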
diff --git a/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/__init__.py b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/__init__.py new file mode 100644 index 00000000..c0373010 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/__init__.py @@ -0,0 +1,22 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + +from .automl_tabular import AutoMLTabular +from .classification_job import ClassificationJob +from .featurization_settings import ColumnTransformer, TabularFeaturizationSettings +from .forecasting_job import ForecastingJob +from .forecasting_settings import ForecastingSettings +from .limit_settings import TabularLimitSettings +from .regression_job import RegressionJob + +__all__ = [ + "AutoMLTabular", + "ClassificationJob", + "ColumnTransformer", + "ForecastingJob", + "ForecastingSettings", + "RegressionJob", + "TabularFeaturizationSettings", + "TabularLimitSettings", +] diff --git a/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/automl_tabular.py b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/automl_tabular.py new file mode 100644 index 00000000..5f4ed22b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/automl_tabular.py @@ -0,0 +1,607 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + +# pylint: disable=too-many-instance-attributes + +from abc import ABC +from typing import Any, Dict, List, Optional, Union + +from azure.ai.ml._restclient.v2024_01_01_preview.models import ( + AutoNCrossValidations, + BlockedTransformers, + CustomNCrossValidations, + LogVerbosity, +) +from azure.ai.ml._utils.utils import camel_to_snake +from azure.ai.ml.constants import TabularTrainingMode +from azure.ai.ml.constants._job.automl import AutoMLConstants +from azure.ai.ml.entities._inputs_outputs import Input +from azure.ai.ml.entities._job.automl.automl_vertical import AutoMLVertical +from azure.ai.ml.entities._job.automl.stack_ensemble_settings import StackEnsembleSettings +from azure.ai.ml.entities._job.automl.tabular.featurization_settings import ( + ColumnTransformer, + TabularFeaturizationSettings, +) +from azure.ai.ml.entities._job.automl.tabular.limit_settings import TabularLimitSettings +from azure.ai.ml.entities._job.automl.training_settings import TrainingSettings +from azure.ai.ml.exceptions import ErrorCategory, ErrorTarget, ValidationException + + +class AutoMLTabular(AutoMLVertical, ABC): + """Initialize an AutoML job entity for tabular data. + + Constructor for AutoMLTabular. + + :keyword task_type: The type of task to run. Possible values include: "classification", "regression" + , "forecasting". + :paramtype task_type: str + :keyword featurization: featurization settings. Defaults to None. + :paramtype featurization: typing.Optional[TabularFeaturizationSettings] + :keyword limits: limits settings. Defaults to None. + :paramtype limits: typing.Optional[TabularLimitSettings] + :keyword training: training settings. Defaults to None. + :paramtype training: typing.Optional[TrainingSettings] + :keyword log_verbosity: Verbosity of logging. Possible values include: "debug", "info", "warning", "error", + "critical". 
Defaults to "info". + :paramtype log_verbosity: str + :keyword target_column_name: The name of the target column. Defaults to None. + :paramtype target_column_name: typing.Optional[str] + :keyword weight_column_name: The name of the weight column. Defaults to None. + :paramtype weight_column_name: typing.Optional[str] + :keyword validation_data_size: The size of the validation data. Defaults to None. + :paramtype validation_data_size: typing.Optional[float] + :keyword cv_split_column_names: The names of the columns to use for cross validation. Defaults to None. + :paramtype cv_split_column_names: typing.Optional[List[str]] + :keyword n_cross_validations: The number of cross validations to run. Defaults to None. + :paramtype n_cross_validations: typing.Optional[int] + :keyword test_data_size: The size of the test data. Defaults to None. + :paramtype test_data_size: typing.Optional[float] + :keyword training_data: The training data. Defaults to None. + :paramtype training_data: typing.Optional[azure.ai.ml.entities._inputs_outputs.Input] + :keyword validation_data: The validation data. Defaults to None. + :paramtype validation_data: typing.Optional[azure.ai.ml.entities._inputs_outputs.Input] + :keyword test_data: The test data. Defaults to None. + :paramtype test_data: typing.Optional[azure.ai.ml.entities._inputs_outputs.Input] + """ + + def __init__( + self, + *, + task_type: str, + featurization: Optional[TabularFeaturizationSettings] = None, + limits: Optional[TabularLimitSettings] = None, + training: Optional[Any] = None, + **kwargs: Any, + ) -> None: + """Initialize an AutoML job entity for tabular data. + + Constructor for AutoMLTabular. + + :keyword task_type: The type of task to run. Possible values include: "classification", "regression" + , "forecasting". + :paramtype task_type: str + :keyword featurization: featurization settings. Defaults to None. + :paramtype featurization: typing.Optional[TabularFeaturizationSettings] + :keyword limits: limits settings. Defaults to None. + :paramtype limits: typing.Optional[TabularLimitSettings] + :keyword training: training settings. Defaults to None. + :paramtype training: typing.Optional[TrainingSettings] + :keyword log_verbosity: Verbosity of logging. Possible values include: "debug", "info", "warning", "error", + "critical". Defaults to "info". + :paramtype log_verbosity: str + :keyword target_column_name: The name of the target column. Defaults to None. + :paramtype target_column_name: typing.Optional[str] + :keyword weight_column_name: The name of the weight column. Defaults to None. + :paramtype weight_column_name: typing.Optional[str] + :keyword validation_data_size: The size of the validation data. Defaults to None. + :paramtype validation_data_size: typing.Optional[float] + :keyword cv_split_column_names: The names of the columns to use for cross validation. Defaults to None. + :paramtype cv_split_column_names: typing.Optional[List[str]] + :keyword n_cross_validations: The number of cross validations to run. Defaults to None. + :paramtype n_cross_validations: typing.Optional[int] + :keyword test_data_size: The size of the test data. Defaults to None. + :paramtype test_data_size: typing.Optional[float] + :keyword training_data: The training data. Defaults to None. + :paramtype training_data: typing.Optional[azure.ai.ml.entities._inputs_outputs.Input] + :keyword validation_data: The validation data. Defaults to None. 
+ :paramtype validation_data: typing.Optional[azure.ai.ml.entities._inputs_outputs.Input] + :keyword test_data: The test data. Defaults to None. + :paramtype test_data: typing.Optional[azure.ai.ml.entities._inputs_outputs.Input] + :raises: :class:`azure.ai.ml.exceptions.ValidationException` + """ + self.log_verbosity = kwargs.pop("log_verbosity", LogVerbosity.INFO) + + self.target_column_name = kwargs.pop("target_column_name", None) + self.weight_column_name = kwargs.pop("weight_column_name", None) + self.validation_data_size = kwargs.pop("validation_data_size", None) + self.cv_split_column_names = kwargs.pop("cv_split_column_names", None) + self.n_cross_validations = kwargs.pop("n_cross_validations", None) + self.test_data_size = kwargs.pop("test_data_size", None) + + super().__init__( + task_type=task_type, + training_data=kwargs.pop("training_data", None), + validation_data=kwargs.pop("validation_data", None), + test_data=kwargs.pop("test_data", None), + **kwargs, + ) + + self._featurization = featurization + self._limits = limits + self._training = training + + @property + def log_verbosity(self) -> LogVerbosity: + """Get the log verbosity for the AutoML job. + + :return: log verbosity for the AutoML job + :rtype: LogVerbosity + """ + return self._log_verbosity + + @log_verbosity.setter + def log_verbosity(self, value: Union[str, LogVerbosity]) -> None: + """Set the log verbosity for the AutoML job. + + :param value: str or LogVerbosity + :type value: typing.Union[str, LogVerbosity] + """ + self._log_verbosity = None if value is None else LogVerbosity[camel_to_snake(value).upper()] + + @property + def limits(self) -> Optional[TabularLimitSettings]: + """Get the tabular limits for the AutoML job. + + :return: Tabular limits for the AutoML job + :rtype: TabularLimitSettings + """ + return self._limits + + @limits.setter + def limits(self, value: Union[Dict, TabularLimitSettings]) -> None: + """Set the limits for the AutoML job. + + :param value: typing.Dict or TabularLimitSettings + :type value: typing.Union[typing.Dict, TabularLimitSettings] + :raises ValidationException: Expected a dictionary for limit settings. + """ + if isinstance(value, TabularLimitSettings): + self._limits = value + else: + if not isinstance(value, dict): + msg = "Expected a dictionary for limit settings." + raise ValidationException( + message=msg, + no_personal_data_message=msg, + target=ErrorTarget.AUTOML, + error_category=ErrorCategory.USER_ERROR, + ) + self.set_limits(**value) + + @property + def training(self) -> Any: + """Get the training settings for the AutoML job. + + :return: Training settings for the AutoML job. + :rtype: TrainingSettings + """ + return self._training + + @training.setter + def training(self, value: Union[Dict, TrainingSettings]) -> None: + """Set the training settings for the AutoML job. + + :param value: typing.Dict or TrainingSettings + :type value: typing.Union[typing.Dict, TrainingSettings] + :raises ValidationException: Expected a dictionary for training settings. + """ + if isinstance(value, TrainingSettings): + self._training = value + else: + if not isinstance(value, dict): + msg = "Expected a dictionary for training settings." + raise ValidationException( + message=msg, + no_personal_data_message=msg, + target=ErrorTarget.AUTOML, + error_category=ErrorCategory.USER_ERROR, + ) + self.set_training(**value) + + @property + def featurization(self) -> Optional[TabularFeaturizationSettings]: + """Get the tabular featurization settings for the AutoML job. 
+ + :return: Tabular featurization settings for the AutoML job + :rtype: TabularFeaturizationSettings + """ + return self._featurization + + @featurization.setter + def featurization(self, value: Union[Dict, TabularFeaturizationSettings]) -> None: + """Set the featurization settings for the AutoML job. + + :param value: typing.Dict or TabularFeaturizationSettings + :type value: typing.Union[typing.Dict, TabularFeaturizationSettings] + :raises ValidationException: Expected a dictionary for featurization settings + """ + if isinstance(value, TabularFeaturizationSettings): + self._featurization = value + else: + if not isinstance(value, dict): + msg = "Expected a dictionary for featurization settings." + raise ValidationException( + message=msg, + no_personal_data_message=msg, + target=ErrorTarget.AUTOML, + error_category=ErrorCategory.USER_ERROR, + ) + self.set_featurization(**value) + + def set_limits( + self, + *, + enable_early_termination: Optional[bool] = None, + exit_score: Optional[float] = None, + max_concurrent_trials: Optional[int] = None, + max_cores_per_trial: Optional[int] = None, + max_nodes: Optional[int] = None, + max_trials: Optional[int] = None, + timeout_minutes: Optional[int] = None, + trial_timeout_minutes: Optional[int] = None, + ) -> None: + """Set limits for the job. + + :keyword enable_early_termination: Whether to enable early termination if the score is not improving in the + short term, defaults to None. + + Early stopping logic: + + * No early stopping for first 20 iterations (landmarks). + * Early stopping window starts on the 21st iteration and looks for early_stopping_n_iters iterations + (currently set to 10). This means that the first iteration where stopping can occur is the 31st. + * AutoML still schedules 2 ensemble iterations AFTER early stopping, which might result in higher scores. + * Early stopping is triggered if the absolute value of best score calculated is the same for past + early_stopping_n_iters iterations, that is, if there is no improvement in score for + early_stopping_n_iters iterations. + + :paramtype enable_early_termination: typing.Optional[bool] + :keyword exit_score: Target score for experiment. The experiment terminates after this score is reached. + If not specified (no criteria), the experiment runs until no further progress is made + on the primary metric. For for more information on exit criteria, see this `article + <https://learn.microsoft.com/azure/machine-learning/how-to-configure-auto-train#exit-criteria>`_ + , defaults to None + :paramtype exit_score: typing.Optional[float] + :keyword max_concurrent_trials: This is the maximum number of iterations that would be executed in parallel. + The default value is 1. + + * AmlCompute clusters support one iteration running per node. For multiple AutoML experiment parent runs + executed in parallel on a single AmlCompute cluster, the sum of the ``max_concurrent_trials`` values + for all experiments should be less than or equal to the maximum number of nodes. Otherwise, runs + will be queued until nodes are available. + + * DSVM supports multiple iterations per node. ``max_concurrent_trials`` should + be less than or equal to the number of cores on the DSVM. For multiple experiments + run in parallel on a single DSVM, the sum of the ``max_concurrent_trials`` values for all + experiments should be less than or equal to the maximum number of nodes. + + * Databricks - ``max_concurrent_trials`` should be less than or equal to the number of + worker nodes on Databricks. 
+ + ``max_concurrent_trials`` does not apply to local runs. Formerly, this parameter + was named ``concurrent_iterations``. + :paramtype max_concurrent_trials: typing.Optional[int] + :keyword max_cores_per_trial: The maximum number of threads to use for a given training iteration. + Acceptable values: + + * Greater than 1 and less than or equal to the maximum number of cores on the compute target. + + * Equal to -1, which means to use all the possible cores per iteration per child-run. + + * Equal to 1, the default. + + :paramtype max_cores_per_trial: typing.Optional[int] + :keyword max_nodes: [Experimental] The maximum number of nodes to use for distributed training. + + * For forecasting, each model is trained using max(2, int(max_nodes / max_concurrent_trials)) nodes. + + * For classification/regression, each model is trained using max_nodes nodes. + + Note- This parameter is in public preview and might change in future. + :paramtype max_nodes: typing.Optional[int] + :keyword max_trials: The total number of different algorithm and parameter combinations to test during an + automated ML experiment. If not specified, the default is 1000 iterations. + :paramtype max_trials: typing.Optional[int] + :keyword timeout_minutes: Maximum amount of time in minutes that all iterations combined can take before the + experiment terminates. If not specified, the default experiment timeout is 6 days. To specify a timeout + less than or equal to 1 hour, make sure your dataset's size is not greater than + 10,000,000 (rows times column) or an error results, defaults to None + :paramtype timeout_minutes: typing.Optional[int] + :keyword trial_timeout_minutes: Maximum time in minutes that each iteration can run for before it terminates. + If not specified, a value of 1 month or 43200 minutes is used, defaults to None + :paramtype trial_timeout_minutes: typing.Optional[int] + """ + self._limits = self._limits or TabularLimitSettings() + self._limits.enable_early_termination = ( + enable_early_termination if enable_early_termination is not None else self._limits.enable_early_termination + ) + self._limits.exit_score = exit_score if exit_score is not None else self._limits.exit_score + self._limits.max_concurrent_trials = ( + max_concurrent_trials if max_concurrent_trials is not None else self._limits.max_concurrent_trials + ) + self._limits.max_cores_per_trial = ( + max_cores_per_trial if max_cores_per_trial is not None else self._limits.max_cores_per_trial + ) + self._limits.max_nodes = max_nodes if max_nodes is not None else self._limits.max_nodes + self._limits.max_trials = max_trials if max_trials is not None else self._limits.max_trials + self._limits.timeout_minutes = timeout_minutes if timeout_minutes is not None else self._limits.timeout_minutes + self._limits.trial_timeout_minutes = ( + trial_timeout_minutes if trial_timeout_minutes is not None else self._limits.trial_timeout_minutes + ) + + def set_training( + self, + *, + enable_onnx_compatible_models: Optional[bool] = None, + enable_dnn_training: Optional[bool] = None, + enable_model_explainability: Optional[bool] = None, + enable_stack_ensemble: Optional[bool] = None, + enable_vote_ensemble: Optional[bool] = None, + stack_ensemble_settings: Optional[StackEnsembleSettings] = None, + ensemble_model_download_timeout: Optional[int] = None, + allowed_training_algorithms: Optional[List[str]] = None, + blocked_training_algorithms: Optional[List[str]] = None, + training_mode: Optional[Union[str, TabularTrainingMode]] = None, + ) -> None: + """The method to 
configure training related settings. + + :keyword enable_onnx_compatible_models: Whether to enable or disable enforcing the ONNX-compatible models. + The default is False. For more information about Open Neural Network Exchange (ONNX) and Azure Machine + Learning,see this `article <https://learn.microsoft.com/azure/machine-learning/concept-onnx>`__. + :paramtype enable_onnx_compatible_models: typing.Optional[bool] + :keyword enable_dnn_training: Whether to include DNN based models during model selection. + However, the default is True for DNN NLP tasks, and it's False for all other AutoML tasks. + :paramtype enable_dnn_training: typing.Optional[bool] + :keyword enable_model_explainability: Whether to enable explaining the best AutoML model at the end of all + AutoML training iterations. For more information, see + `Interpretability: model explanations in automated machine learning + <https://learn.microsoft.com/azure/machine-learning/how-to-machine-learning-interpretability-automl>`__. + , defaults to None + :paramtype enable_model_explainability: typing.Optional[bool] + :keyword enable_stack_ensemble: Whether to enable/disable StackEnsemble iteration. + If `enable_onnx_compatible_models` flag is being set, then StackEnsemble iteration will be disabled. + Similarly, for Timeseries tasks, StackEnsemble iteration will be disabled by default, to avoid risks of + overfitting due to small training set used in fitting the meta learner. + For more information about ensembles, see `Ensemble configuration + <https://learn.microsoft.com/azure/machine-learning/how-to-configure-auto-train#ensemble>`__ + , defaults to None + :paramtype enable_stack_ensemble: typing.Optional[bool] + :keyword enable_vote_ensemble: Whether to enable/disable VotingEnsemble iteration. + For more information about ensembles, see `Ensemble configuration + <https://learn.microsoft.com/azure/machine-learning/how-to-configure-auto-train#ensemble>`__ + , defaults to None + :paramtype enable_vote_ensemble: typing.Optional[bool] + :keyword stack_ensemble_settings: Settings for StackEnsemble iteration, defaults to None + :paramtype stack_ensemble_settings: typing.Optional[StackEnsembleSettings] + :keyword ensemble_model_download_timeout: During VotingEnsemble and StackEnsemble model generation, + multiple fitted models from the previous child runs are downloaded. Configure this parameter with a + higher value than 300 secs, if more time is needed, defaults to None + :paramtype ensemble_model_download_timeout: typing.Optional[int] + :keyword allowed_training_algorithms: A list of model names to search for an experiment. If not specified, + then all models supported for the task are used minus any specified in ``blocked_training_algorithms`` + or deprecated TensorFlow models, defaults to None + :paramtype allowed_training_algorithms: typing.Optional[List[str]] + :keyword blocked_training_algorithms: A list of algorithms to ignore for an experiment, defaults to None + :paramtype blocked_training_algorithms: typing.Optional[List[str]] + :keyword training_mode: [Experimental] The training mode to use. + The possible values are- + + * distributed- enables distributed training for supported algorithms. + + * non_distributed- disables distributed training. + + * auto- Currently, it is same as non_distributed. In future, this might change. + + Note: This parameter is in public preview and may change in future. 
+ :paramtype training_mode: typing.Optional[typing.Union[str, azure.ai.ml.constants.TabularTrainingMode]] + """ + # get training object by calling training getter of respective tabular task + self._training = self.training + if self._training is not None: + self._training.enable_onnx_compatible_models = ( + enable_onnx_compatible_models + if enable_onnx_compatible_models is not None + else self._training.enable_onnx_compatible_models + ) + self._training.enable_dnn_training = ( + enable_dnn_training if enable_dnn_training is not None else self._training.enable_dnn_training + ) + self._training.enable_model_explainability = ( + enable_model_explainability + if enable_model_explainability is not None + else self._training.enable_model_explainability + ) + self._training.enable_stack_ensemble = ( + enable_stack_ensemble if enable_stack_ensemble is not None else self._training.enable_stack_ensemble + ) + self._training.enable_vote_ensemble = ( + enable_vote_ensemble if enable_vote_ensemble is not None else self._training.enable_vote_ensemble + ) + self._training.stack_ensemble_settings = ( + stack_ensemble_settings + if stack_ensemble_settings is not None + else self._training.stack_ensemble_settings + ) + self._training.ensemble_model_download_timeout = ( + ensemble_model_download_timeout + if ensemble_model_download_timeout is not None + else self._training.ensemble_model_download_timeout + ) + + self._training.allowed_training_algorithms = allowed_training_algorithms + self._training.blocked_training_algorithms = blocked_training_algorithms + self._training.training_mode = training_mode if training_mode is not None else self._training.training_mode + + def set_featurization( + self, + *, + blocked_transformers: Optional[List[Union[BlockedTransformers, str]]] = None, + column_name_and_types: Optional[Dict[str, str]] = None, + dataset_language: Optional[str] = None, + transformer_params: Optional[Dict[str, List[ColumnTransformer]]] = None, + mode: Optional[str] = None, + enable_dnn_featurization: Optional[bool] = None, + ) -> None: + """Define feature engineering configuration. + + :keyword blocked_transformers: A list of transformer names to be blocked during featurization, defaults to None + :paramtype blocked_transformers: Optional[List[Union[BlockedTransformers, str]]] + :keyword column_name_and_types: A dictionary of column names and feature types used to update column purpose + , defaults to None + :paramtype column_name_and_types: Optional[Dict[str, str]] + :keyword dataset_language: Three character ISO 639-3 code for the language(s) contained in the dataset. + Languages other than English are only supported if you use GPU-enabled compute. The language_code + 'mul' should be used if the dataset contains multiple languages. 
To find ISO 639-3 codes for different + languages, please refer to https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes, defaults to None + :paramtype dataset_language: Optional[str] + :keyword transformer_params: A dictionary of transformer and corresponding customization parameters + , defaults to None + :paramtype transformer_params: Optional[Dict[str, List[ColumnTransformer]]] + :keyword mode: "off", "auto", defaults to "auto", defaults to None + :paramtype mode: Optional[str] + :keyword enable_dnn_featurization: Whether to include DNN based feature engineering methods, defaults to None + :paramtype enable_dnn_featurization: Optional[bool] + """ + self._featurization = self._featurization or TabularFeaturizationSettings() + self._featurization.blocked_transformers = ( + blocked_transformers if blocked_transformers is not None else self._featurization.blocked_transformers + ) + self._featurization.column_name_and_types = ( + column_name_and_types if column_name_and_types is not None else self._featurization.column_name_and_types + ) + self._featurization.dataset_language = ( + dataset_language if dataset_language is not None else self._featurization.dataset_language + ) + self._featurization.transformer_params = ( + transformer_params if transformer_params is not None else self._featurization.transformer_params + ) + self._featurization.mode = mode or self._featurization.mode + self._featurization.enable_dnn_featurization = ( + enable_dnn_featurization + if enable_dnn_featurization is not None + else self._featurization.enable_dnn_featurization + ) + + def set_data( + self, + *, + training_data: Input, + target_column_name: str, + weight_column_name: Optional[str] = None, + validation_data: Optional[Input] = None, + validation_data_size: Optional[float] = None, + n_cross_validations: Optional[Union[str, int]] = None, + cv_split_column_names: Optional[List[str]] = None, + test_data: Optional[Input] = None, + test_data_size: Optional[float] = None, + ) -> None: + """Define data configuration. + + :keyword training_data: Training data. + :paramtype training_data: Input + :keyword target_column_name: Column name of the target column. 
+ :paramtype target_column_name: str + :keyword weight_column_name: Weight column name, defaults to None + :paramtype weight_column_name: typing.Optional[str] + :keyword validation_data: Validation data, defaults to None + :paramtype validation_data: typing.Optional[Input] + :keyword validation_data_size: Validation data size, defaults to None + :paramtype validation_data_size: typing.Optional[float] + :keyword n_cross_validations: n_cross_validations, defaults to None + :paramtype n_cross_validations: typing.Optional[typing.Union[str, int]] + :keyword cv_split_column_names: cv_split_column_names, defaults to None + :paramtype cv_split_column_names: typing.Optional[typing.List[str]] + :keyword test_data: Test data, defaults to None + :paramtype test_data: typing.Optional[Input] + :keyword test_data_size: Test data size, defaults to None + :paramtype test_data_size: typing.Optional[float] + """ + self.target_column_name = target_column_name if target_column_name is not None else self.target_column_name + self.weight_column_name = weight_column_name if weight_column_name is not None else self.weight_column_name + self.training_data = training_data if training_data is not None else self.training_data + self.validation_data = validation_data if validation_data is not None else self.validation_data + self.validation_data_size = ( + validation_data_size if validation_data_size is not None else self.validation_data_size + ) + self.cv_split_column_names = ( + cv_split_column_names if cv_split_column_names is not None else self.cv_split_column_names + ) + self.n_cross_validations = n_cross_validations if n_cross_validations is not None else self.n_cross_validations + self.test_data = test_data if test_data is not None else self.test_data + self.test_data_size = test_data_size if test_data_size is not None else self.test_data_size + + def _validation_data_to_rest(self, rest_obj: "AutoMLTabular") -> None: + """Validation data serialization. + + :param rest_obj: Serialized object + :type rest_obj: AutoMLTabular + """ + if rest_obj.n_cross_validations: + n_cross_val = rest_obj.n_cross_validations + # Convert n_cross_validations int value to CustomNCrossValidations + if isinstance(n_cross_val, int) and n_cross_val > 1: + rest_obj.n_cross_validations = CustomNCrossValidations(value=n_cross_val) + # Convert n_cross_validations str value to AutoNCrossValidations + elif isinstance(n_cross_val, str): + rest_obj.n_cross_validations = AutoNCrossValidations() + + def _validation_data_from_rest(self) -> None: + """Validation data deserialization.""" + if self.n_cross_validations: + n_cross_val = self.n_cross_validations + # Convert n_cross_validations CustomNCrossValidations back into int value + if isinstance(n_cross_val, CustomNCrossValidations): + self.n_cross_validations = n_cross_val.value + # Convert n_cross_validations AutoNCrossValidations to str value + elif isinstance(n_cross_val, AutoNCrossValidations): + self.n_cross_validations = AutoMLConstants.AUTO + + def __eq__(self, other: object) -> bool: + """Return True if both instances have the same values. + + This method check instances equality and returns True if both of + the instances have the same attributes with the same values. 
+ + :param other: Any object + :type other: object + :return: True or False + :rtype: bool + """ + if not isinstance(other, AutoMLTabular): + return NotImplemented + + return ( + self.target_column_name == other.target_column_name + and self.weight_column_name == other.weight_column_name + and self.training_data == other.training_data + and self.validation_data == other.validation_data + and self.validation_data_size == other.validation_data_size + and self.cv_split_column_names == other.cv_split_column_names + and self.n_cross_validations == other.n_cross_validations + and self.test_data == other.test_data + and self.test_data_size == other.test_data_size + and self._featurization == other._featurization + and self._limits == other._limits + and self._training == other._training + ) + + def __ne__(self, other: object) -> bool: + """Check inequality between two AutoMLTabular objects. + + :param other: Any object + :type other: object + :return: True or False + :rtype: bool + """ + return not self.__eq__(other) diff --git a/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/classification_job.py b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/classification_job.py new file mode 100644 index 00000000..6f5ab271 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/classification_job.py @@ -0,0 +1,352 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + +# pylint: disable=protected-access +from typing import Any, Dict, Optional, Union + +from azure.ai.ml._restclient.v2023_04_01_preview.models import AutoMLJob as RestAutoMLJob +from azure.ai.ml._restclient.v2023_04_01_preview.models import Classification as RestClassification +from azure.ai.ml._restclient.v2023_04_01_preview.models import ClassificationPrimaryMetrics, JobBase, TaskType +from azure.ai.ml._utils.utils import camel_to_snake, is_data_binding_expression +from azure.ai.ml.constants._common import BASE_PATH_CONTEXT_KEY +from azure.ai.ml.constants._job.automl import AutoMLConstants +from azure.ai.ml.entities._credentials import _BaseJobIdentityConfiguration +from azure.ai.ml.entities._job._input_output_helpers import from_rest_data_outputs, to_rest_data_outputs +from azure.ai.ml.entities._job.automl.tabular.automl_tabular import AutoMLTabular +from azure.ai.ml.entities._job.automl.tabular.featurization_settings import TabularFeaturizationSettings +from azure.ai.ml.entities._job.automl.tabular.limit_settings import TabularLimitSettings +from azure.ai.ml.entities._job.automl.training_settings import ( # noqa: F401 # pylint: disable=unused-import + ClassificationTrainingSettings, + TrainingSettings, +) +from azure.ai.ml.entities._util import load_from_dict + + +class ClassificationJob(AutoMLTabular): + """Configuration for AutoML Classification Job. + + :keyword primary_metric: The primary metric to use for optimization, defaults to None + :paramtype primary_metric: typing.Optional[str] + :keyword positive_label: Positive label for binary metrics calculation, defaults to None + :paramtype positive_label: typing.Optional[str] + :keyword featurization: Featurization settings. Defaults to None. + :paramtype featurization: typing.Optional[TabularFeaturizationSettings] + :keyword limits: Limits settings. Defaults to None. 
+ :paramtype limits: typing.Optional[TabularLimitSettings] + :keyword training: Training settings. Defaults to None. + :paramtype training: typing.Optional[TrainingSettings] + :return: An instance of ClassificationJob object. + :rtype: ~azure.ai.ml.entities.automl.ClassificationJob + :raises ValueError: If primary_metric is not a valid primary metric + :raises ValueError: If positive_label is not a valid positive label + :raises ValueError: If featurization is not a valid featurization settings + :raises ValueError: If limits is not a valid limits settings + :raises ValueError: If training is not a valid training settings + """ + + _DEFAULT_PRIMARY_METRIC = ClassificationPrimaryMetrics.ACCURACY + + def __init__( + self, + *, + primary_metric: Optional[str] = None, + positive_label: Optional[str] = None, + **kwargs: Any, + ) -> None: + """Initialize a new AutoML Classification task. + + :keyword primary_metric: The primary metric to use for optimization, defaults to None + :paramtype primary_metric: typing.Optional[str] + :keyword positive_label: Positive label for binary metrics calculation, defaults to None + :paramtype positive_label: typing.Optional[str] + :keyword featurization: featurization settings. Defaults to None. + :paramtype featurization: typing.Optional[TabularFeaturizationSettings] + :keyword limits: limits settings. Defaults to None. + :paramtype limits: typing.Optional[TabularLimitSettings] + :keyword training: training settings. Defaults to None. + :paramtype training: typing.Optional[TrainingSettings] + :raises ValueError: If primary_metric is not a valid primary metric + :raises ValueError: If positive_label is not a valid positive label + :raises ValueError: If featurization is not a valid featurization settings + :raises ValueError: If limits is not a valid limits settings + :raises ValueError: If training is not a valid training settings + """ + # Extract any task specific settings + featurization = kwargs.pop("featurization", None) + limits = kwargs.pop("limits", None) + training = kwargs.pop("training", None) + + super().__init__( + task_type=TaskType.CLASSIFICATION, + featurization=featurization, + limits=limits, + training=training, + **kwargs, + ) + + self.primary_metric = primary_metric or ClassificationJob._DEFAULT_PRIMARY_METRIC + self.positive_label = positive_label + + @property + def primary_metric(self) -> Union[str, ClassificationPrimaryMetrics]: + """The primary metric to use for optimization. + + :return: The primary metric to use for optimization. + :rtype: typing.Union[str, ClassificationPrimaryMetrics] + """ + return self._primary_metric + + @primary_metric.setter + def primary_metric(self, value: Union[str, ClassificationPrimaryMetrics]) -> None: + """The primary metric to use for optimization setter. + + :param value: Primary metric to use for optimization. + :type value: typing.Union[str, ClassificationPrimaryMetrics] + """ + # TODO: better way to do this + if is_data_binding_expression(str(value), ["parent"]): + self._primary_metric = value + return + self._primary_metric = ( + ClassificationJob._DEFAULT_PRIMARY_METRIC + if value is None + else ClassificationPrimaryMetrics[camel_to_snake(value).upper()] + ) + + @property # type: ignore + def training(self) -> ClassificationTrainingSettings: + """Training Settings for AutoML Classification Job. + + :return: Training settings used for AutoML Classification Job. 
+ :rtype: ClassificationTrainingSettings + """ + return self._training or ClassificationTrainingSettings() + + @training.setter + def training(self, value: Union[Dict, ClassificationTrainingSettings]) -> None: # pylint: disable=unused-argument + ... + + def _to_rest_object(self) -> JobBase: + """Convert ClassificationJob object to a REST object. + + :return: REST object representation of this object. + :rtype: JobBase + """ + classification_task = RestClassification( + target_column_name=self.target_column_name, + training_data=self.training_data, + validation_data=self.validation_data, + validation_data_size=self.validation_data_size, + weight_column_name=self.weight_column_name, + cv_split_column_names=self.cv_split_column_names, + n_cross_validations=self.n_cross_validations, + test_data=self.test_data, + test_data_size=self.test_data_size, + featurization_settings=self._featurization._to_rest_object() if self._featurization else None, + limit_settings=self._limits._to_rest_object() if self._limits else None, + training_settings=self._training._to_rest_object() if self._training else None, + primary_metric=self.primary_metric, + positive_label=self.positive_label, + log_verbosity=self.log_verbosity, + ) + self._resolve_data_inputs(classification_task) + self._validation_data_to_rest(classification_task) + + properties = RestAutoMLJob( + display_name=self.display_name, + description=self.description, + experiment_name=self.experiment_name, + tags=self.tags, + compute_id=self.compute, + properties=self.properties, + environment_id=self.environment_id, + environment_variables=self.environment_variables, + services=self.services, + outputs=to_rest_data_outputs(self.outputs), + resources=self.resources, + task_details=classification_task, + identity=self.identity._to_job_rest_object() if self.identity else None, + queue_settings=self.queue_settings, + ) + + result = JobBase(properties=properties) + result.name = self.name + return result + + @classmethod + def _from_rest_object(cls, obj: JobBase) -> "ClassificationJob": + """Convert a REST object to ClassificationJob object. + + :param obj: ClassificationJob in Rest format. + :type obj: JobBase + :return: ClassificationJob objects. 
+ :rtype: ClassificationJob + """ + + properties: RestAutoMLJob = obj.properties + task_details: RestClassification = properties.task_details + + job_args_dict = { + "id": obj.id, + "name": obj.name, + "description": properties.description, + "tags": properties.tags, + "properties": properties.properties, + "experiment_name": properties.experiment_name, + "services": properties.services, + "status": properties.status, + "creation_context": obj.system_data, + "display_name": properties.display_name, + "compute": properties.compute_id, + "outputs": from_rest_data_outputs(properties.outputs), + "resources": properties.resources, + "identity": ( + _BaseJobIdentityConfiguration._from_rest_object(properties.identity) if properties.identity else None + ), + "queue_settings": properties.queue_settings, + } + + classification_job = cls( + target_column_name=task_details.target_column_name, + training_data=task_details.training_data, + validation_data=task_details.validation_data, + validation_data_size=task_details.validation_data_size, + weight_column_name=task_details.weight_column_name, + cv_split_column_names=task_details.cv_split_column_names, + n_cross_validations=task_details.n_cross_validations, + test_data=task_details.test_data, + test_data_size=task_details.test_data_size, + featurization=( + TabularFeaturizationSettings._from_rest_object(task_details.featurization_settings) + if task_details.featurization_settings + else None + ), + limits=( + TabularLimitSettings._from_rest_object(task_details.limit_settings) + if task_details.limit_settings + else None + ), + training=( + ClassificationTrainingSettings._from_rest_object(task_details.training_settings) + if task_details.training_settings + else None + ), + primary_metric=task_details.primary_metric, + positive_label=task_details.positive_label, + log_verbosity=task_details.log_verbosity, + **job_args_dict, + ) + + classification_job._restore_data_inputs() + classification_job._validation_data_from_rest() + + return classification_job + + @classmethod + def _load_from_dict( + cls, + data: Dict, + context: Dict, + additional_message: str, + **kwargs: Any, + ) -> "ClassificationJob": + """Load from a dictionary. + + :param data: dictionary representation of the object. + :type data: typing.Dict + :param context: dictionary containing the context. + :type context: typing.Dict + :param additional_message: additional message to be added to the error message. + :type additional_message: str + :return: ClassificationJob object. + :rtype: ClassificationJob + """ + from azure.ai.ml._schema.automl.table_vertical.classification import AutoMLClassificationSchema + from azure.ai.ml._schema.pipeline.automl_node import AutoMLClassificationNodeSchema + + if kwargs.pop("inside_pipeline", False): + loaded_data = load_from_dict( + AutoMLClassificationNodeSchema, + data, + context, + additional_message, + **kwargs, + ) + else: + loaded_data = load_from_dict(AutoMLClassificationSchema, data, context, additional_message, **kwargs) + job_instance = cls._create_instance_from_schema_dict(loaded_data) + return job_instance + + @classmethod + def _create_instance_from_schema_dict(cls, loaded_data: Dict) -> "ClassificationJob": + """Create an instance from a schema dictionary. + + :param loaded_data: dictionary containing the data. + :type loaded_data: typing.Dict + :return: ClassificationJob object. 
+ :rtype: ClassificationJob + """ + loaded_data.pop(AutoMLConstants.TASK_TYPE_YAML, None) + data_settings = { + "training_data": loaded_data.pop("training_data"), + "target_column_name": loaded_data.pop("target_column_name"), + "weight_column_name": loaded_data.pop("weight_column_name", None), + "validation_data": loaded_data.pop("validation_data", None), + "validation_data_size": loaded_data.pop("validation_data_size", None), + "cv_split_column_names": loaded_data.pop("cv_split_column_names", None), + "n_cross_validations": loaded_data.pop("n_cross_validations", None), + "test_data": loaded_data.pop("test_data", None), + "test_data_size": loaded_data.pop("test_data_size", None), + } + job = ClassificationJob(**loaded_data) + job.set_data(**data_settings) + return job + + def _to_dict(self, inside_pipeline: bool = False) -> Dict: + """Convert the object to a dictionary. + + :param inside_pipeline: whether the job is inside a pipeline or not, defaults to False + :type inside_pipeline: bool + :return: dictionary representation of the object. + :rtype: typing.Dict + """ + from azure.ai.ml._schema.automl.table_vertical.classification import AutoMLClassificationSchema + from azure.ai.ml._schema.pipeline.automl_node import AutoMLClassificationNodeSchema + + schema_dict: dict = {} + if inside_pipeline: + schema_dict = AutoMLClassificationNodeSchema(context={BASE_PATH_CONTEXT_KEY: "./"}).dump(self) + else: + schema_dict = AutoMLClassificationSchema(context={BASE_PATH_CONTEXT_KEY: "./"}).dump(self) + + return schema_dict + + def __eq__(self, other: object) -> bool: + """Returns True if both instances have the same values. + + This method check instances equality and returns True if both of + the instances have the same attributes with the same values. + + :param other: Any object + :type other: object + :return: True or False + :rtype: bool + """ + if not isinstance(other, ClassificationJob): + return NotImplemented + + if not super().__eq__(other): + return False + + return self.primary_metric == other.primary_metric + + def __ne__(self, other: object) -> bool: + """Check inequality between two ImageLimitSettings objects. + + :param other: Any object + :type other: object + :return: True or False + :rtype: bool + """ + return not self.__eq__(other) diff --git a/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/featurization_settings.py b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/featurization_settings.py new file mode 100644 index 00000000..6ef2332e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/featurization_settings.py @@ -0,0 +1,170 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# --------------------------------------------------------- + +# pylint: disable=protected-access + +import logging +from typing import Dict, List, Optional, Union + +from azure.ai.ml._restclient.v2023_04_01_preview.models import BlockedTransformers +from azure.ai.ml._restclient.v2023_04_01_preview.models import ColumnTransformer as RestColumnTransformer +from azure.ai.ml._restclient.v2023_04_01_preview.models import ( + TableVerticalFeaturizationSettings as RestTabularFeaturizationSettings, +) +from azure.ai.ml._utils.utils import camel_to_snake +from azure.ai.ml.constants._job.automl import AutoMLTransformerParameterKeys +from azure.ai.ml.entities._job.automl.featurization_settings import FeaturizationSettings, FeaturizationSettingsType +from azure.ai.ml.entities._mixins import RestTranslatableMixin + +module_logger = logging.getLogger(__name__) + + +class ColumnTransformer(RestTranslatableMixin): + """Column transformer settings. + + :param fields: The fields on which to perform custom featurization + :type field: List[str] + :param parameters: parameters used for custom featurization + :type parameters: Dict[str, Optional[str, float]] + """ + + def __init__( + self, + *, + fields: Optional[List[str]] = None, + parameters: Optional[Dict[str, Union[str, float]]] = None, + ): + self.fields = fields + self.parameters = parameters + + def _to_rest_object(self) -> RestColumnTransformer: + return RestColumnTransformer(fields=self.fields, parameters=self.parameters) + + @classmethod + def _from_rest_object(cls, obj: RestColumnTransformer) -> Optional["ColumnTransformer"]: + if obj: + fields = obj.fields + parameters = obj.parameters + return ColumnTransformer(fields=fields, parameters=parameters) + return None + + def __eq__(self, other: object) -> bool: + if not isinstance(other, ColumnTransformer): + return NotImplemented + return self.fields == other.fields and self.parameters == other.parameters + + def __ne__(self, other: object) -> bool: + return not self.__eq__(other) + + +class TabularFeaturizationSettings(FeaturizationSettings): + """Featurization settings for an AutoML Job.""" + + def __init__( + self, + *, + blocked_transformers: Optional[List[Union[BlockedTransformers, str]]] = None, + column_name_and_types: Optional[Dict[str, str]] = None, + dataset_language: Optional[str] = None, + transformer_params: Optional[Dict[str, List[ColumnTransformer]]] = None, + mode: Optional[str] = None, + enable_dnn_featurization: Optional[bool] = None, + ): + """ + :param blocked_transformers: A list of transformers to ignore when featurizing. + :type blocked_transformers: List[Union[BlockedTransformers, str]] + :param column_name_and_types: A dictionary of column names and feature types used to update column purpose. + :type column_name_and_types: Dict[str, str] + :param dataset_language: The language of the dataset. + :type dataset_language: str + :param transformer_params: A dictionary of transformers and their parameters. + :type transformer_params: Dict[str, List[ColumnTransformer]] + :param mode: The mode of the featurization. + :type mode: str + :param enable_dnn_featurization: Whether to enable DNN featurization. + :type enable_dnn_featurization: bool + :ivar type: Specifies the type of FeaturizationSettings. Set automatically to "Tabular" for this class. 
+ :vartype type: str + """ + super().__init__(dataset_language=dataset_language) + self.blocked_transformers = blocked_transformers + self.column_name_and_types = column_name_and_types + self.transformer_params = transformer_params + self.mode = mode + self.enable_dnn_featurization = enable_dnn_featurization + self.type = FeaturizationSettingsType.TABULAR + + @property + def transformer_params(self) -> Optional[Dict[str, List[ColumnTransformer]]]: + """A dictionary of transformers and their parameters.""" + return self._transformer_params + + @transformer_params.setter + def transformer_params(self, value: Dict[str, List[ColumnTransformer]]) -> None: + self._transformer_params = ( + None + if not value + else {(AutoMLTransformerParameterKeys[camel_to_snake(k).upper()].value): v for k, v in value.items()} + ) + + @property + def blocked_transformers(self) -> Optional[List[Union[BlockedTransformers, str]]]: + """A list of transformers to ignore when featurizing.""" + return self._blocked_transformers + + @blocked_transformers.setter + def blocked_transformers(self, blocked_transformers_list: List[Union[BlockedTransformers, str]]) -> None: + self._blocked_transformers = ( + None + if blocked_transformers_list is None + else [BlockedTransformers[camel_to_snake(o)] for o in blocked_transformers_list] + ) + + def _to_rest_object(self) -> RestTabularFeaturizationSettings: + transformer_dict = {} + if self.transformer_params: + for key, settings in self.transformer_params.items(): + transformer_dict[key] = [o._to_rest_object() for o in settings] + return RestTabularFeaturizationSettings( + blocked_transformers=self.blocked_transformers, + column_name_and_types=self.column_name_and_types, + dataset_language=self.dataset_language, + mode=self.mode, + transformer_params=transformer_dict, + enable_dnn_featurization=self.enable_dnn_featurization, + ) + + @classmethod + def _from_rest_object(cls, obj: RestTabularFeaturizationSettings) -> "TabularFeaturizationSettings": + rest_transformers_params = obj.transformer_params + transformer_dict: Optional[Dict] = None + if rest_transformers_params: + transformer_dict = {} + for key, settings in rest_transformers_params.items(): + transformer_dict[key] = [ColumnTransformer._from_rest_object(o) for o in settings] + transformer_params = transformer_dict + + return TabularFeaturizationSettings( + blocked_transformers=obj.blocked_transformers, + column_name_and_types=obj.column_name_and_types, + dataset_language=obj.dataset_language, + transformer_params=transformer_params, + mode=obj.mode, + enable_dnn_featurization=obj.enable_dnn_featurization, + ) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, TabularFeaturizationSettings): + return NotImplemented + return ( + super().__eq__(other) + and self.blocked_transformers == other.blocked_transformers + and self.column_name_and_types == other.column_name_and_types + and self.transformer_params == other.transformer_params + and self.mode == other.mode + and self.enable_dnn_featurization == other.enable_dnn_featurization + ) + + def __ne__(self, other: object) -> bool: + return not self.__eq__(other) diff --git a/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/forecasting_job.py b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/forecasting_job.py new file mode 100644 index 00000000..9bd10b19 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/forecasting_job.py @@ -0,0 +1,686 @@ +# 
--------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + +# pylint: disable=protected-access + +from typing import Any, Dict, List, Optional, Union + +from azure.ai.ml._restclient.v2023_04_01_preview.models import AutoMLJob as RestAutoMLJob +from azure.ai.ml._restclient.v2023_04_01_preview.models import Forecasting as RestForecasting +from azure.ai.ml._restclient.v2023_04_01_preview.models import ForecastingPrimaryMetrics, JobBase, TaskType +from azure.ai.ml._utils.utils import camel_to_snake, is_data_binding_expression +from azure.ai.ml.constants import TabularTrainingMode +from azure.ai.ml.constants._common import BASE_PATH_CONTEXT_KEY +from azure.ai.ml.constants._job.automl import AutoMLConstants +from azure.ai.ml.entities._credentials import _BaseJobIdentityConfiguration +from azure.ai.ml.entities._job._input_output_helpers import from_rest_data_outputs, to_rest_data_outputs +from azure.ai.ml.entities._job.automl.stack_ensemble_settings import StackEnsembleSettings +from azure.ai.ml.entities._job.automl.tabular.automl_tabular import AutoMLTabular +from azure.ai.ml.entities._job.automl.tabular.featurization_settings import TabularFeaturizationSettings +from azure.ai.ml.entities._job.automl.tabular.forecasting_settings import ForecastingSettings +from azure.ai.ml.entities._job.automl.tabular.limit_settings import TabularLimitSettings +from azure.ai.ml.entities._job.automl.training_settings import ForecastingTrainingSettings +from azure.ai.ml.entities._util import load_from_dict + + +class ForecastingJob(AutoMLTabular): + """ + Configuration for AutoML Forecasting Task. + + :param primary_metric: The primary metric to use for model selection. + :type primary_metric: Optional[str] + :param forecasting_settings: The settings for the forecasting task. + :type forecasting_settings: + Optional[~azure.ai.ml.automl.ForecastingSettings] + :param kwargs: Job-specific arguments + :type kwargs: Dict[str, Any] + """ + + _DEFAULT_PRIMARY_METRIC = ForecastingPrimaryMetrics.NORMALIZED_ROOT_MEAN_SQUARED_ERROR + + def __init__( + self, + *, + primary_metric: Optional[str] = None, + forecasting_settings: Optional[ForecastingSettings] = None, + **kwargs: Any, + ) -> None: + """Initialize a new AutoML Forecasting task.""" + # Extract any task specific settings + featurization = kwargs.pop("featurization", None) + limits = kwargs.pop("limits", None) + training = kwargs.pop("training", None) + + super().__init__( + task_type=TaskType.FORECASTING, + featurization=featurization, + limits=limits, + training=training, + **kwargs, + ) + + self.primary_metric = primary_metric or ForecastingJob._DEFAULT_PRIMARY_METRIC + self._forecasting_settings = forecasting_settings + + @property + def primary_metric(self) -> Optional[str]: + """ + Return the primary metric to use for model selection. + + :return: The primary metric for model selection. + :rtype: Optional[str] + """ + return self._primary_metric + + @primary_metric.setter + def primary_metric(self, value: Union[str, ForecastingPrimaryMetrics]) -> None: + """ + Set the primary metric to use for model selection. + + :param value: The primary metric for model selection. 
+        :type value: Union[str, ~azure.ai.ml.automl.ForecastingPrimaryMetrics]
+        """
+        if is_data_binding_expression(str(value), ["parent"]):
+            self._primary_metric = value
+            return
+        self._primary_metric = (
+            ForecastingJob._DEFAULT_PRIMARY_METRIC
+            if value is None
+            else ForecastingPrimaryMetrics[camel_to_snake(value).upper()]
+        )
+
+    @property  # type: ignore
+    def training(self) -> ForecastingTrainingSettings:
+        """
+        Return the forecast training settings.
+
+        :return: training settings.
+        :rtype: ~azure.ai.ml.automl.ForecastingTrainingSettings
+        """
+        return self._training or ForecastingTrainingSettings()
+
+    @training.setter
+    def training(self, value: Union[Dict, ForecastingTrainingSettings]) -> None:  # pylint: disable=unused-argument
+        ...
+
+    @property
+    def forecasting_settings(self) -> Optional[ForecastingSettings]:
+        """
+        Return the forecast settings.
+
+        :return: forecast settings.
+        :rtype: ~azure.ai.ml.automl.ForecastingSettings
+        """
+        return self._forecasting_settings
+
+    def set_forecast_settings(
+        self,
+        *,
+        time_column_name: Optional[str] = None,
+        forecast_horizon: Optional[Union[str, int]] = None,
+        time_series_id_column_names: Optional[Union[str, List[str]]] = None,
+        target_lags: Optional[Union[str, int, List[int]]] = None,
+        feature_lags: Optional[str] = None,
+        target_rolling_window_size: Optional[Union[str, int]] = None,
+        country_or_region_for_holidays: Optional[str] = None,
+        use_stl: Optional[str] = None,
+        seasonality: Optional[Union[str, int]] = None,
+        short_series_handling_config: Optional[str] = None,
+        frequency: Optional[str] = None,
+        target_aggregate_function: Optional[str] = None,
+        cv_step_size: Optional[int] = None,
+        features_unknown_at_forecast_time: Optional[Union[str, List[str]]] = None,
+    ) -> None:
+        """Manage parameters used by forecasting tasks.
+
+        :keyword time_column_name:
+            The name of the time column. This parameter is required when forecasting to specify the datetime
+            column in the input data used for building the time series and inferring its frequency.
+        :paramtype time_column_name: Optional[str]
+        :keyword forecast_horizon:
+            The desired maximum forecast horizon in units of time-series frequency. The default value is 1.
+
+            Units are based on the time interval of your training data (for example, monthly or weekly) that
+            the forecaster should predict out. When the task type is forecasting, this parameter is required.
+            For more information on setting forecasting parameters, see `Auto-train a time-series forecast model
+            <https://learn.microsoft.com/azure/machine-learning/how-to-auto-train-forecast>`_.
+        :type forecast_horizon: Optional[Union[int, str]]
+        :keyword time_series_id_column_names:
+            The names of columns used to group a time series.
+            They can be used to create multiple series. If the time series ID column names are not defined, or
+            the identifier columns specified do not identify all the series in the dataset, time series
+            identifiers will be automatically created for your dataset.
+        :paramtype time_series_id_column_names: Optional[Union[str, List[str]]]
+        :keyword target_lags: The number of past periods to lag from the target column. By default the lags
+            are turned off.
+
+            When forecasting, this parameter represents the number of rows to lag the target values based
+            on the frequency of the data. This is represented as a list or a single integer. Lags should be
+            used when the relationship between the independent variables and the dependent variable does not
+            match up or correlate by default. For example, when trying to forecast demand for a product, the
+            demand in any month may depend on the price of specific commodities 3 months prior. In this
+            example, you may want to lag the target (demand) negatively by 3 months so that the model is
+            trained on the correct relationship. For more information, see `Auto-train a time-series forecast
+            model <https://learn.microsoft.com/azure/machine-learning/how-to-auto-train-forecast>`_.
+
+            **Note on the automatic detection of target lags and rolling window size:
+            see the corresponding comments in the rolling window section.**
+            We use the following algorithm to detect the optimal target lag and rolling window size.
+
+            #. Estimate the maximum lag order for the look-back feature selection. In our case it is the
+               number of periods until the next date frequency granularity, i.e. if the frequency is daily,
+               it will be a week (7); if it is weekly, it will be a month (4). Those values multiplied by two
+               give the largest possible values of lags/rolling windows. In our examples, the maximum lag
+               orders would be 14 and 8, respectively.
+            #. Create a de-seasonalized series by adding the trend and residual components. This will be used
+               in the next step.
+            #. Estimate the PACF (Partial Auto-Correlation Function) on the data from (2)
+               and search for points where the autocorrelation is significant, i.e. its absolute
+               value is more than 1.96/sqrt(maximal lag value), which corresponds to a significance of 95%.
+            #. If all points are significant, we consider it strong seasonality
+               and do not create look-back features.
+            #. We scan the PACF values from the beginning, and the value before the first insignificant
+               autocorrelation designates the lag. If the first significant element (the value correlated
+               with itself) is followed by an insignificant one, the lag is 0 and we do not use look-back
+               features.
+
+        :type target_lags: Optional[Union[str, int, List[int]]]
+        :keyword feature_lags: Flag for generating lags for the numeric features, with 'auto' or None.
+        :paramtype feature_lags: Optional[str]
+        :keyword target_rolling_window_size: The number of past periods used to create a rolling window
+            average of the target column.
+
+            When forecasting, this parameter represents `n` historical periods to use to generate forecasted
+            values, <= training set size. If omitted, `n` is the full training set size. Specify this
+            parameter when you only want to consider a certain amount of history when training the model.
+            If set to 'auto', the rolling window is estimated as the last value where the PACF is more than
+            the significance threshold. Please see the target_lags section for details.
+        :paramtype target_rolling_window_size: Optional[Union[str, int]]
+        :keyword country_or_region_for_holidays: The country/region used to generate holiday features.
+            These should be ISO 3166 two-letter country/region codes, for example 'US' or 'GB'.
+        :paramtype country_or_region_for_holidays: Optional[str]
+        :keyword use_stl: Configure STL decomposition of the time-series target column.
+            use_stl can take three values: None (default) - no STL decomposition, 'season' - only generate the
+            season component, and 'season_trend' - generate both season and trend components.
+        :type use_stl: Optional[str]
+        :keyword seasonality: Set time series seasonality as an integer multiple of the series frequency.
+            If seasonality is set to 'auto', it will be inferred.
+            If set to None, the time series is assumed non-seasonal, which is equivalent to seasonality=1.
+        :paramtype seasonality: Optional[Union[int, str]]
+        :keyword short_series_handling_config:
+            The parameter defining how AutoML should handle short time series.
+
+            Possible values: 'auto' (default), 'pad', 'drop' and None.
+
+            * **auto**: short series will be padded if there are no long series;
+              otherwise short series will be dropped.
+            * **pad**: all the short series will be padded.
+            * **drop**: all the short series will be dropped.
+            * **None**: the short series will not be modified.
+
+            If set to 'pad', the table will be padded with zeroes and empty values for the regressors, and
+            random values for the target, with mean equal to the median target value for the given time
+            series ID. If the median is greater than or equal to zero, the minimal padded value will be
+            clipped by zero:
+            Input:
+
+            +------------+---------------+----------+--------+
+            | Date       | numeric_value | string   | target |
+            +============+===============+==========+========+
+            | 2020-01-01 | 23            | green    | 55     |
+            +------------+---------------+----------+--------+
+
+            Output, assuming the minimal number of values is four:
+
+            +------------+---------------+----------+--------+
+            | Date       | numeric_value | string   | target |
+            +============+===============+==========+========+
+            | 2019-12-29 | 0             | NA       | 55.1   |
+            +------------+---------------+----------+--------+
+            | 2019-12-30 | 0             | NA       | 55.6   |
+            +------------+---------------+----------+--------+
+            | 2019-12-31 | 0             | NA       | 54.5   |
+            +------------+---------------+----------+--------+
+            | 2020-01-01 | 23            | green    | 55     |
+            +------------+---------------+----------+--------+
+
+            **Note:** We have two parameters, short_series_handling_configuration and the legacy
+            short_series_handling. When both parameters are set, we synchronize them as shown in the table
+            below (for brevity, short_series_handling_configuration and short_series_handling are marked as
+            handling_configuration and handling, respectively).
+
+            +------------+--------------------------+----------------------+-----------------------------+
+            | handling   | handling                 | resulting handling   | resulting handling          |
+            |            | configuration            |                      | configuration               |
+            +============+==========================+======================+=============================+
+            | True       | auto                     | True                 | auto                        |
+            +------------+--------------------------+----------------------+-----------------------------+
+            | True       | pad                      | True                 | auto                        |
+            +------------+--------------------------+----------------------+-----------------------------+
+            | True       | drop                     | True                 | auto                        |
+            +------------+--------------------------+----------------------+-----------------------------+
+            | True       | None                     | False                | None                        |
+            +------------+--------------------------+----------------------+-----------------------------+
+            | False      | auto                     | False                | None                        |
+            +------------+--------------------------+----------------------+-----------------------------+
+            | False      | pad                      | False                | None                        |
+            +------------+--------------------------+----------------------+-----------------------------+
+            | False      | drop                     | False                | None                        |
+            +------------+--------------------------+----------------------+-----------------------------+
+            | False      | None                     | False                | None                        |
+            +------------+--------------------------+----------------------+-----------------------------+
+
+        :type short_series_handling_config: Optional[str]
+        :keyword frequency: Forecast frequency.
+
+            When forecasting, this parameter represents the period with which the forecast is desired,
+            for example daily, weekly, yearly, etc. The forecast frequency is the dataset frequency by
+            default. You can optionally set it to a value greater (but not lesser) than the dataset
+            frequency. We'll aggregate the data and generate the results at the forecast frequency. For
+            example, for daily data, you can set the frequency to be daily, weekly or monthly, but not
+            hourly. The frequency needs to be a pandas offset alias.
+            Please refer to the pandas documentation for more information:
+            https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
+        :type frequency: Optional[str]
+        :keyword target_aggregate_function: The function to be used to aggregate the time series target
+            column to conform to a user-specified frequency. If target_aggregation_function is set but the
+            freq parameter is not set, an error is raised. The possible target aggregation
+            functions are: "sum", "max", "min" and "mean".
+
+            * The target column values are aggregated based on the specified operation.
+              Typically, sum is appropriate for most scenarios.
+            * Numerical predictor columns in your data are aggregated by sum, mean, minimum value,
+              and maximum value. As a result, automated ML generates new columns suffixed with the
+              aggregation function name and applies the selected aggregate operation.
+            * For categorical predictor columns, the data is aggregated by mode,
+              the most prominent category in the window.
+            * Date predictor columns are aggregated by minimum value, maximum value and mode.
+
+            +----------------+-------------------------------+--------------------------------------+
+            | freq           | target_aggregation_function   | Data regularity fixing mechanism     |
+            +================+===============================+======================================+
+            | None (Default) | None (Default)                | The aggregation is not applied. If   |
+            |                |                               | the valid frequency cannot be        |
+            |                |                               | determined, an error is raised.      |
+            +----------------+-------------------------------+--------------------------------------+
+            | Some Value     | None (Default)                | The aggregation is not applied. If   |
+            |                |                               | the number of data points compliant  |
+            |                |                               | with the given frequency grid is     |
+            |                |                               | less than 90%, these points are      |
+            |                |                               | removed; otherwise, an error is      |
+            |                |                               | raised.                              |
+            +----------------+-------------------------------+--------------------------------------+
+            | None (Default) | Aggregation function          | An error about the missing frequency |
+            |                |                               | parameter is raised.                 |
+            +----------------+-------------------------------+--------------------------------------+
+            | Some Value     | Aggregation function          | Aggregate to frequency using the     |
+            |                |                               | provided aggregation function.       |
+            +----------------+-------------------------------+--------------------------------------+
+
+        :type target_aggregate_function: Optional[str]
+        :keyword cv_step_size: Number of periods between the origin_time of one CV fold and the next fold.
+            For example, if `n_step` = 3 for daily data, the origin time for each fold will be three days
+            apart.
+        :paramtype cv_step_size: Optional[int]
+        :keyword features_unknown_at_forecast_time: The feature columns that are available for training but
+            unknown at the time of forecast/inference.
If features_unknown_at_forecast_time is set to an empty + list, it is assumed that all the feature columns in the dataset are known at inference time. If this + parameter is not set the support for future features is not enabled. + :paramtype features_unknown_at_forecast_time: Optional[Union[str, List[str]]] + """ + self._forecasting_settings = self._forecasting_settings or ForecastingSettings() + + self._forecasting_settings.country_or_region_for_holidays = ( + country_or_region_for_holidays + if country_or_region_for_holidays is not None + else self._forecasting_settings.country_or_region_for_holidays + ) + self._forecasting_settings.cv_step_size = ( + cv_step_size if cv_step_size is not None else self._forecasting_settings.cv_step_size + ) + self._forecasting_settings.forecast_horizon = ( + forecast_horizon if forecast_horizon is not None else self._forecasting_settings.forecast_horizon + ) + self._forecasting_settings.target_lags = ( + target_lags if target_lags is not None else self._forecasting_settings.target_lags + ) + self._forecasting_settings.target_rolling_window_size = ( + target_rolling_window_size + if target_rolling_window_size is not None + else self._forecasting_settings.target_rolling_window_size + ) + self._forecasting_settings.frequency = ( + frequency if frequency is not None else self._forecasting_settings.frequency + ) + self._forecasting_settings.feature_lags = ( + feature_lags if feature_lags is not None else self._forecasting_settings.feature_lags + ) + self._forecasting_settings.seasonality = ( + seasonality if seasonality is not None else self._forecasting_settings.seasonality + ) + self._forecasting_settings.use_stl = use_stl if use_stl is not None else self._forecasting_settings.use_stl + self._forecasting_settings.short_series_handling_config = ( + short_series_handling_config + if short_series_handling_config is not None + else self._forecasting_settings.short_series_handling_config + ) + self._forecasting_settings.target_aggregate_function = ( + target_aggregate_function + if target_aggregate_function is not None + else self._forecasting_settings.target_aggregate_function + ) + self._forecasting_settings.time_column_name = ( + time_column_name if time_column_name is not None else self._forecasting_settings.time_column_name + ) + self._forecasting_settings.time_series_id_column_names = ( + time_series_id_column_names + if time_series_id_column_names is not None + else self._forecasting_settings.time_series_id_column_names + ) + self._forecasting_settings.features_unknown_at_forecast_time = ( + features_unknown_at_forecast_time + if features_unknown_at_forecast_time is not None + else self._forecasting_settings.features_unknown_at_forecast_time + ) + + # override + def set_training( + self, + *, + enable_onnx_compatible_models: Optional[bool] = None, + enable_dnn_training: Optional[bool] = None, + enable_model_explainability: Optional[bool] = None, + enable_stack_ensemble: Optional[bool] = None, + enable_vote_ensemble: Optional[bool] = None, + stack_ensemble_settings: Optional[StackEnsembleSettings] = None, + ensemble_model_download_timeout: Optional[int] = None, + allowed_training_algorithms: Optional[List[str]] = None, + blocked_training_algorithms: Optional[List[str]] = None, + training_mode: Optional[Union[str, TabularTrainingMode]] = None, + ) -> None: + """ + The method to configure forecast training related settings. + + :keyword enable_onnx_compatible_models: + Whether to enable or disable enforcing the ONNX-compatible models. + The default is False. 
For more information about Open Neural Network Exchange (ONNX) and Azure Machine
+            Learning, see this `article <https://learn.microsoft.com/azure/machine-learning/concept-onnx>`__.
+        :type enable_onnx_compatible_models: Optional[bool]
+        :keyword enable_dnn_training:
+            Whether to include DNN-based models during model selection.
+            The default is True for DNN NLP tasks and False for all other AutoML tasks.
+        :paramtype enable_dnn_training: Optional[bool]
+        :keyword enable_model_explainability:
+            Whether to enable explaining the best AutoML model at the end of all AutoML training iterations.
+            For more information, see `Interpretability: model explanations in automated machine learning
+            <https://learn.microsoft.com/azure/machine-learning/how-to-machine-learning-interpretability-automl>`__.
+            Defaults to None.
+        :type enable_model_explainability: Optional[bool]
+        :keyword enable_stack_ensemble:
+            Whether to enable/disable the StackEnsemble iteration.
+            If the `enable_onnx_compatible_models` flag is set, the StackEnsemble iteration will be disabled.
+            Similarly, for time-series tasks, the StackEnsemble iteration is disabled by default to avoid the
+            risk of overfitting due to the small training set used in fitting the meta learner.
+            For more information about ensembles, see `Ensemble configuration
+            <https://learn.microsoft.com/azure/machine-learning/how-to-configure-auto-train#ensemble>`__.
+            Defaults to None.
+        :type enable_stack_ensemble: Optional[bool]
+        :keyword enable_vote_ensemble:
+            Whether to enable/disable the VotingEnsemble iteration.
+            For more information about ensembles, see `Ensemble configuration
+            <https://learn.microsoft.com/azure/machine-learning/how-to-configure-auto-train#ensemble>`__.
+            Defaults to None.
+        :type enable_vote_ensemble: Optional[bool]
+        :keyword stack_ensemble_settings:
+            Settings for the StackEnsemble iteration. Defaults to None.
+        :paramtype stack_ensemble_settings: Optional[StackEnsembleSettings]
+        :keyword ensemble_model_download_timeout:
+            During VotingEnsemble and StackEnsemble model generation,
+            multiple fitted models from the previous child runs are downloaded. Configure this parameter
+            with a value higher than 300 seconds if more time is needed. Defaults to None.
+        :paramtype ensemble_model_download_timeout: Optional[int]
+        :keyword allowed_training_algorithms:
+            A list of model names to search for an experiment. If not specified,
+            all models supported for the task are used, minus any specified in ``blocked_training_algorithms``
+            and deprecated TensorFlow models. Defaults to None.
+        :paramtype allowed_training_algorithms: Optional[List[str]]
+        :keyword blocked_training_algorithms:
+            A list of algorithms to ignore for an experiment. Defaults to None.
+        :paramtype blocked_training_algorithms: Optional[List[str]]
+        :keyword training_mode:
+            [Experimental] The training mode to use.
+            The possible values are:
+
+            * distributed: enables distributed training for supported algorithms.
+
+            * non_distributed: disables distributed training.
+
+            * auto: currently the same as non_distributed; this might change in the future.
+
+            Note: This parameter is in public preview and may change in the future.
+        :type training_mode: Optional[Union[~azure.ai.ml.constants.TabularTrainingMode, str]]
+        """
+        super().set_training(
+            enable_onnx_compatible_models=enable_onnx_compatible_models,
+            enable_dnn_training=enable_dnn_training,
+            enable_model_explainability=enable_model_explainability,
+            enable_stack_ensemble=enable_stack_ensemble,
+            enable_vote_ensemble=enable_vote_ensemble,
+            stack_ensemble_settings=stack_ensemble_settings,
+            ensemble_model_download_timeout=ensemble_model_download_timeout,
+            allowed_training_algorithms=allowed_training_algorithms,
+            blocked_training_algorithms=blocked_training_algorithms,
+            training_mode=training_mode,
+        )
+
+        # Stack ensemble is disabled by default for forecasting tasks, to avoid the
+        # risk of overfitting due to the small training set used in fitting the meta learner.
+        if enable_stack_ensemble is None and self._training is not None:
+            self._training.enable_stack_ensemble = False
+
+    def _to_rest_object(self) -> JobBase:
+        # A single construction covers both cases: forecasting_settings is passed
+        # through when configured and sent as None otherwise.
+        forecasting_task = RestForecasting(
+            target_column_name=self.target_column_name,
+            training_data=self.training_data,
+            validation_data=self.validation_data,
+            validation_data_size=self.validation_data_size,
+            weight_column_name=self.weight_column_name,
+            cv_split_column_names=self.cv_split_column_names,
+            n_cross_validations=self.n_cross_validations,
+            test_data=self.test_data,
+            test_data_size=self.test_data_size,
+            featurization_settings=self._featurization._to_rest_object() if self._featurization else None,
+            limit_settings=self._limits._to_rest_object() if self._limits else None,
+            training_settings=self._training._to_rest_object() if self._training else None,
+            primary_metric=self.primary_metric,
+            log_verbosity=self.log_verbosity,
+            forecasting_settings=(
+                self._forecasting_settings._to_rest_object() if self._forecasting_settings else None
+            ),
+        )
+
+        self._resolve_data_inputs(forecasting_task)
+        self._validation_data_to_rest(forecasting_task)
+
+        properties = RestAutoMLJob(
+            display_name=self.display_name,
+            description=self.description,
+            experiment_name=self.experiment_name,
+            tags=self.tags,
+            compute_id=self.compute,
+            properties=self.properties,
+            environment_id=self.environment_id,
+            environment_variables=self.environment_variables,
+            services=self.services,
+            outputs=to_rest_data_outputs(self.outputs),
+            resources=self.resources,
+            task_details=forecasting_task,
+            identity=self.identity._to_job_rest_object() if self.identity else None,
+            queue_settings=self.queue_settings,
+        )
+
+        result = JobBase(properties=properties)
+        result.name = self.name
+        return result
+
+    @classmethod
+    def _from_rest_object(cls, obj: JobBase) -> "ForecastingJob":
+        properties: RestAutoMLJob = obj.properties
+        task_details: RestForecasting = properties.task_details
+
+        job_args_dict = {
+            "id": obj.id,
+            "name": obj.name,
"description": properties.description, + "tags": properties.tags, + "properties": properties.properties, + "experiment_name": properties.experiment_name, + "services": properties.services, + "status": properties.status, + "creation_context": obj.system_data, + "display_name": properties.display_name, + "compute": properties.compute_id, + "outputs": from_rest_data_outputs(properties.outputs), + "resources": properties.resources, + "identity": ( + _BaseJobIdentityConfiguration._from_rest_object(properties.identity) if properties.identity else None + ), + "queue_settings": properties.queue_settings, + } + + forecasting_job = cls( + target_column_name=task_details.target_column_name, + training_data=task_details.training_data, + validation_data=task_details.validation_data, + validation_data_size=task_details.validation_data_size, + weight_column_name=task_details.weight_column_name, + cv_split_column_names=task_details.cv_split_column_names, + n_cross_validations=task_details.n_cross_validations, + test_data=task_details.test_data, + test_data_size=task_details.test_data_size, + featurization=( + TabularFeaturizationSettings._from_rest_object(task_details.featurization_settings) + if task_details.featurization_settings + else None + ), + limits=( + TabularLimitSettings._from_rest_object(task_details.limit_settings) + if task_details.limit_settings + else None + ), + training=( + ForecastingTrainingSettings._from_rest_object(task_details.training_settings) + if task_details.training_settings + else None + ), + primary_metric=task_details.primary_metric, + forecasting_settings=( + ForecastingSettings._from_rest_object(task_details.forecasting_settings) + if task_details.forecasting_settings + else None + ), + log_verbosity=task_details.log_verbosity, + **job_args_dict, + ) + + forecasting_job._restore_data_inputs() + forecasting_job._validation_data_from_rest() + + return forecasting_job + + @classmethod + def _load_from_dict( + cls, + data: Dict, + context: Dict, + additional_message: str, + **kwargs: Any, + ) -> "ForecastingJob": + from azure.ai.ml._schema.automl.table_vertical.forecasting import AutoMLForecastingSchema + from azure.ai.ml._schema.pipeline.automl_node import AutoMLForecastingNodeSchema + + if kwargs.pop("inside_pipeline", False): + loaded_data = load_from_dict(AutoMLForecastingNodeSchema, data, context, additional_message, **kwargs) + else: + loaded_data = load_from_dict(AutoMLForecastingSchema, data, context, additional_message, **kwargs) + job_instance = cls._create_instance_from_schema_dict(loaded_data) + return job_instance + + @classmethod + def _create_instance_from_schema_dict(cls, loaded_data: Dict) -> "ForecastingJob": + loaded_data.pop(AutoMLConstants.TASK_TYPE_YAML, None) + data_settings = { + "training_data": loaded_data.pop("training_data"), + "target_column_name": loaded_data.pop("target_column_name"), + "weight_column_name": loaded_data.pop("weight_column_name", None), + "validation_data": loaded_data.pop("validation_data", None), + "validation_data_size": loaded_data.pop("validation_data_size", None), + "cv_split_column_names": loaded_data.pop("cv_split_column_names", None), + "n_cross_validations": loaded_data.pop("n_cross_validations", None), + "test_data": loaded_data.pop("test_data", None), + "test_data_size": loaded_data.pop("test_data_size", None), + } + job = ForecastingJob(**loaded_data) + job.set_data(**data_settings) + return job + + def _to_dict(self, inside_pipeline: bool = False) -> Dict: + from azure.ai.ml._schema.automl.table_vertical.forecasting 
import AutoMLForecastingSchema
+        from azure.ai.ml._schema.pipeline.automl_node import AutoMLForecastingNodeSchema
+
+        schema_dict: dict = {}
+        if inside_pipeline:
+            schema_dict = AutoMLForecastingNodeSchema(context={BASE_PATH_CONTEXT_KEY: "./"}).dump(self)
+        else:
+            schema_dict = AutoMLForecastingSchema(context={BASE_PATH_CONTEXT_KEY: "./"}).dump(self)
+        return schema_dict
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, ForecastingJob):
+            return NotImplemented
+
+        if not super(ForecastingJob, self).__eq__(other):
+            return False
+
+        return self.primary_metric == other.primary_metric and self._forecasting_settings == other._forecasting_settings
+
+    def __ne__(self, other: object) -> bool:
+        return not self.__eq__(other)
diff --git a/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/forecasting_settings.py b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/forecasting_settings.py
new file mode 100644
index 00000000..09439483
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/forecasting_settings.py
@@ -0,0 +1,383 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+# pylint: disable=too-many-instance-attributes
+
+from typing import List, Optional, Union
+
+from azure.ai.ml._restclient.v2023_04_01_preview.models import (
+    AutoForecastHorizon,
+    AutoSeasonality,
+    AutoTargetLags,
+    AutoTargetRollingWindowSize,
+    CustomForecastHorizon,
+    CustomSeasonality,
+    CustomTargetLags,
+    CustomTargetRollingWindowSize,
+    ForecastHorizonMode,
+)
+from azure.ai.ml._restclient.v2023_04_01_preview.models import (
+    ForecastingSettings as RestForecastingSettings,
+)
+from azure.ai.ml._restclient.v2023_04_01_preview.models import (
+    SeasonalityMode,
+    TargetLagsMode,
+    TargetRollingWindowSizeMode,
+)
+from azure.ai.ml.entities._mixins import RestTranslatableMixin
+
+
+class ForecastingSettings(RestTranslatableMixin):
+    """Forecasting settings for an AutoML Job.
+
+    :param country_or_region_for_holidays: The country/region used to generate holiday features. These should be ISO
+        3166 two-letter country/region codes, for example 'US' or 'GB'.
+    :type country_or_region_for_holidays: Optional[str]
+    :param cv_step_size:
+        Number of periods between the origin_time of one CV fold and the next fold. For
+        example, if `n_step` = 3 for daily data, the origin time for each fold will be
+        three days apart.
+    :type cv_step_size: Optional[int]
+    :param forecast_horizon:
+        The desired maximum forecast horizon in units of time-series frequency. The default value is 1.
+
+        Units are based on the time interval of your training data (for example, monthly or weekly) that
+        the forecaster should predict out. When the task type is forecasting, this parameter is required.
+        For more information on setting forecasting parameters, see `Auto-train a time-series forecast model
+        <https://learn.microsoft.com/azure/machine-learning/how-to-auto-train-forecast>`_.
+    :type forecast_horizon: Optional[Union[int, str]]
+    :param target_lags:
+        The number of past periods to lag from the target column. By default the lags are turned off.
+
+        When forecasting, this parameter represents the number of rows to lag the target values based
+        on the frequency of the data. This is represented as a list or a single integer. Lags should be
+        used when the relationship between the independent variables and the dependent variable does not
+        match up or correlate by default. For example, when trying to forecast demand for a product, the
+        demand in any month may depend on the price of specific commodities 3 months prior. In this
+        example, you may want to lag the target (demand) negatively by 3 months so that the model is
+        trained on the correct relationship. For more information, see `Auto-train a time-series forecast
+        model <https://learn.microsoft.com/azure/machine-learning/how-to-auto-train-forecast>`_.
+
+        **Note on the automatic detection of target lags and rolling window size:
+        see the corresponding comments in the rolling window section.**
+        We use the following algorithm to detect the optimal target lag and rolling window size.
+
+        #. Estimate the maximum lag order for the look-back feature selection. In our case it is the number
+           of periods until the next date frequency granularity, i.e. if the frequency is daily, it will be
+           a week (7); if it is weekly, it will be a month (4). Those values multiplied by two give the
+           largest possible values of lags/rolling windows. In our examples, the maximum lag orders would be
+           14 and 8, respectively.
+        #. Create a de-seasonalized series by adding the trend and residual components. This will be used
+           in the next step.
+        #. Estimate the PACF (Partial Auto-Correlation Function) on the data from (2)
+           and search for points where the autocorrelation is significant, i.e. its absolute
+           value is more than 1.96/sqrt(maximal lag value), which corresponds to a significance of 95%.
+        #. If all points are significant, we consider it strong seasonality
+           and do not create look-back features.
+        #. We scan the PACF values from the beginning, and the value before the first insignificant
+           autocorrelation designates the lag. If the first significant element (the value correlated with
+           itself) is followed by an insignificant one, the lag is 0 and we do not use look-back features.
+    :type target_lags: Optional[Union[str, int, List[int]]]
+    :param target_rolling_window_size:
+        The number of past periods used to create a rolling window average of the target column.
+
+        When forecasting, this parameter represents `n` historical periods to use to generate forecasted
+        values, <= training set size. If omitted, `n` is the full training set size. Specify this parameter
+        when you only want to consider a certain amount of history when training the model.
+        If set to 'auto', the rolling window is estimated as the last value where the PACF is more than the
+        significance threshold. Please see the target_lags section for details.
+    :type target_rolling_window_size: Optional[Union[str, int]]
+    :param frequency: Forecast frequency.
+
+        When forecasting, this parameter represents the period with which the forecast is desired,
+        for example daily, weekly, yearly, etc. The forecast frequency is the dataset frequency by default.
+        You can optionally set it to a value greater (but not lesser) than the dataset frequency.
+        We'll aggregate the data and generate the results at the forecast frequency. For example,
+        for daily data, you can set the frequency to be daily, weekly or monthly, but not hourly.
+        The frequency needs to be a pandas offset alias.
+        Please refer to the pandas documentation for more information:
+        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
+    :type frequency: Optional[str]
+    :param feature_lags: Flag for generating lags for the numeric features, with 'auto' or None.
+    :type feature_lags: Optional[str]
+    :param seasonality: Set time series seasonality as an integer multiple of the series frequency.
+        If seasonality is set to 'auto', it will be inferred.
+        If set to None, the time series is assumed non-seasonal, which is equivalent to seasonality=1.
+    :type seasonality: Optional[Union[int, str]]
+    :param use_stl: Configure STL decomposition of the time-series target column.
+        use_stl can take three values: None (default) - no STL decomposition, 'season' - only generate the
+        season component, and 'season_trend' - generate both season and trend components.
+    :type use_stl: Optional[str]
+    :param short_series_handling_config:
+        The parameter defining how AutoML should handle short time series.
+
+        Possible values: 'auto' (default), 'pad', 'drop' and None.
+
+        * **auto**: short series will be padded if there are no long series;
+          otherwise short series will be dropped.
+        * **pad**: all the short series will be padded.
+        * **drop**: all the short series will be dropped.
+        * **None**: the short series will not be modified.
+
+        If set to 'pad', the table will be padded with zeroes and empty values for the regressors, and
+        random values for the target, with mean equal to the median target value for the given time
+        series ID. If the median is greater than or equal to zero, the minimal padded value will be
+        clipped by zero.
+        Input:
+
+        +------------+---------------+----------+--------+
+        | Date       | numeric_value | string   | target |
+        +============+===============+==========+========+
+        | 2020-01-01 | 23            | green    | 55     |
+        +------------+---------------+----------+--------+
+
+        Output, assuming the minimal number of values is four:
+
+        +------------+---------------+----------+--------+
+        | Date       | numeric_value | string   | target |
+        +============+===============+==========+========+
+        | 2019-12-29 | 0             | NA       | 55.1   |
+        +------------+---------------+----------+--------+
+        | 2019-12-30 | 0             | NA       | 55.6   |
+        +------------+---------------+----------+--------+
+        | 2019-12-31 | 0             | NA       | 54.5   |
+        +------------+---------------+----------+--------+
+        | 2020-01-01 | 23            | green    | 55     |
+        +------------+---------------+----------+--------+
+
+        **Note:** We have two parameters, short_series_handling_configuration and the legacy
+        short_series_handling. When both parameters are set, we synchronize them as shown in the table
+        below (for brevity, short_series_handling_configuration and short_series_handling are marked as
+        handling_configuration and handling, respectively).
+
+        +------------+--------------------------+----------------------+-----------------------------+
+        | handling   | handling                 | resulting handling   | resulting handling          |
+        |            | configuration            |                      | configuration               |
+        +============+==========================+======================+=============================+
+        | True       | auto                     | True                 | auto                        |
+        +------------+--------------------------+----------------------+-----------------------------+
+        | True       | pad                      | True                 | auto                        |
+        +------------+--------------------------+----------------------+-----------------------------+
+        | True       | drop                     | True                 | auto                        |
+        +------------+--------------------------+----------------------+-----------------------------+
+        | True       | None                     | False                | None                        |
+        +------------+--------------------------+----------------------+-----------------------------+
+        | False      | auto                     | False                | None                        |
+        +------------+--------------------------+----------------------+-----------------------------+
+        | False      | pad                      | False                | None                        |
+        +------------+--------------------------+----------------------+-----------------------------+
+        | False      | drop                     | False                | None                        |
+        +------------+--------------------------+----------------------+-----------------------------+
+        | False      | None                     | False                | None                        |
+        +------------+--------------------------+----------------------+-----------------------------+
+
+    :type short_series_handling_config: Optional[str]
+    :param target_aggregate_function: The function to be used to aggregate the time series target
+        column to conform to a user-specified frequency. If target_aggregation_function is set
+        but the freq parameter is not set, an error is raised. The possible target
+        aggregation functions are: "sum", "max", "min" and "mean".
+
+        * The target column values are aggregated based on the specified operation.
+          Typically, sum is appropriate for most scenarios.
+        * Numerical predictor columns in your data are aggregated by sum, mean, minimum value,
+          and maximum value. As a result, automated ML generates new columns suffixed with the
+          aggregation function name and applies the selected aggregate operation.
+        * For categorical predictor columns, the data is aggregated by mode,
+          the most prominent category in the window.
+        * Date predictor columns are aggregated by minimum value, maximum value and mode.
+
+        +----------------+-------------------------------+--------------------------------------+
+        | freq           | target_aggregation_function   | Data regularity fixing mechanism     |
+        +================+===============================+======================================+
+        | None (Default) | None (Default)                | The aggregation is not applied. If   |
+        |                |                               | the valid frequency cannot be        |
+        |                |                               | determined, an error is raised.      |
+        +----------------+-------------------------------+--------------------------------------+
+        | Some Value     | None (Default)                | The aggregation is not applied. If   |
+        |                |                               | the number of data points compliant  |
+        |                |                               | with the given frequency grid is     |
+        |                |                               | less than 90%, these points are      |
+        |                |                               | removed; otherwise, an error is      |
+        |                |                               | raised.                              |
+        +----------------+-------------------------------+--------------------------------------+
+        | None (Default) | Aggregation function          | An error about the missing frequency |
+        |                |                               | parameter is raised.                 |
+        +----------------+-------------------------------+--------------------------------------+
+        | Some Value     | Aggregation function          | Aggregate to frequency using the     |
+        |                |                               | provided aggregation function.       |
+        +----------------+-------------------------------+--------------------------------------+
+    :type target_aggregate_function: Optional[str]
+    :param time_column_name:
+        The name of the time column. This parameter is required when forecasting to specify the datetime
+        column in the input data used for building the time series and inferring its frequency.
+    :type time_column_name: Optional[str]
+    :param time_series_id_column_names:
+        The names of columns used to group a time series.
+        They can be used to create multiple series. If the time series ID column names are not defined, or
+        the identifier columns specified do not identify all the series in the dataset, time series
+        identifiers will be automatically created for your dataset.
+    :type time_series_id_column_names: Optional[Union[str, List[str]]]
+    :param features_unknown_at_forecast_time:
+        The feature columns that are available for training but unknown at the time of forecast/inference.
+        If features_unknown_at_forecast_time is set to an empty list, it is assumed that
+        all the feature columns in the dataset are known at inference time. If this parameter is not set,
+        the support for future features is not enabled.
+    :type features_unknown_at_forecast_time: Optional[Union[str, List[str]]]
+    """
+
+    def __init__(
+        self,
+        *,
+        country_or_region_for_holidays: Optional[str] = None,
+        cv_step_size: Optional[int] = None,
+        forecast_horizon: Optional[Union[str, int]] = None,
+        target_lags: Optional[Union[str, int, List[int]]] = None,
+        target_rolling_window_size: Optional[Union[str, int]] = None,
+        frequency: Optional[str] = None,
+        feature_lags: Optional[str] = None,
+        seasonality: Optional[Union[str, int]] = None,
+        use_stl: Optional[str] = None,
+        short_series_handling_config: Optional[str] = None,
+        target_aggregate_function: Optional[str] = None,
+        time_column_name: Optional[str] = None,
+        time_series_id_column_names: Optional[Union[str, List[str]]] = None,
+        features_unknown_at_forecast_time: Optional[Union[str, List[str]]] = None,
+    ):
+        self.country_or_region_for_holidays = country_or_region_for_holidays
+        self.cv_step_size = cv_step_size
+        self.forecast_horizon = forecast_horizon
+        self.target_lags = target_lags
+        self.target_rolling_window_size = target_rolling_window_size
+        self.frequency = frequency
+        self.feature_lags = feature_lags
+        self.seasonality = seasonality
+        self.use_stl = use_stl
+        self.short_series_handling_config = short_series_handling_config
+        self.target_aggregate_function = target_aggregate_function
+        self.time_column_name = time_column_name
+        self.time_series_id_column_names = time_series_id_column_names
+        self.features_unknown_at_forecast_time = features_unknown_at_forecast_time
+
+    def _to_rest_object(self) -> RestForecastingSettings:
+        forecast_horizon = None
+        if isinstance(self.forecast_horizon, str):
+            forecast_horizon = AutoForecastHorizon()
+        elif self.forecast_horizon:
+            forecast_horizon = CustomForecastHorizon(value=self.forecast_horizon)
+
+        target_lags = None
+        if isinstance(self.target_lags, str):
+            target_lags = AutoTargetLags()
+        elif self.target_lags:
+            lags = [self.target_lags] if not isinstance(self.target_lags, list) else self.target_lags
+            target_lags = CustomTargetLags(values=lags)
+
+        target_rolling_window_size = None
+        if isinstance(self.target_rolling_window_size, str):
+            target_rolling_window_size =
AutoTargetRollingWindowSize() + elif self.target_rolling_window_size: + target_rolling_window_size = CustomTargetRollingWindowSize(value=self.target_rolling_window_size) + + seasonality = None + if isinstance(self.seasonality, str): + seasonality = AutoSeasonality() + elif self.seasonality: + seasonality = CustomSeasonality(value=self.seasonality) + + time_series_id_column_names = self.time_series_id_column_names + if isinstance(self.time_series_id_column_names, str) and self.time_series_id_column_names: + time_series_id_column_names = [self.time_series_id_column_names] + + features_unknown_at_forecast_time = self.features_unknown_at_forecast_time + if isinstance(self.features_unknown_at_forecast_time, str) and self.features_unknown_at_forecast_time: + features_unknown_at_forecast_time = [self.features_unknown_at_forecast_time] + + return RestForecastingSettings( + country_or_region_for_holidays=self.country_or_region_for_holidays, + cv_step_size=self.cv_step_size, + forecast_horizon=forecast_horizon, + time_column_name=self.time_column_name, + target_lags=target_lags, + target_rolling_window_size=target_rolling_window_size, + seasonality=seasonality, + frequency=self.frequency, + feature_lags=self.feature_lags, + use_stl=self.use_stl, + short_series_handling_config=self.short_series_handling_config, + target_aggregate_function=self.target_aggregate_function, + time_series_id_column_names=time_series_id_column_names, + features_unknown_at_forecast_time=features_unknown_at_forecast_time, + ) + + @classmethod + def _from_rest_object(cls, obj: RestForecastingSettings) -> "ForecastingSettings": + forecast_horizon = None + if obj.forecast_horizon and obj.forecast_horizon.mode == ForecastHorizonMode.AUTO: + forecast_horizon = obj.forecast_horizon.mode.lower() + elif obj.forecast_horizon: + forecast_horizon = obj.forecast_horizon.value + + rest_target_lags = obj.target_lags + target_lags = None + if rest_target_lags and rest_target_lags.mode == TargetLagsMode.AUTO: + target_lags = rest_target_lags.mode.lower() + elif rest_target_lags: + target_lags = rest_target_lags.values + + target_rolling_window_size = None + if obj.target_rolling_window_size and obj.target_rolling_window_size.mode == TargetRollingWindowSizeMode.AUTO: + target_rolling_window_size = obj.target_rolling_window_size.mode.lower() + elif obj.target_rolling_window_size: + target_rolling_window_size = obj.target_rolling_window_size.value + + seasonality = None + if obj.seasonality and obj.seasonality.mode == SeasonalityMode.AUTO: + seasonality = obj.seasonality.mode.lower() + elif obj.seasonality: + seasonality = obj.seasonality.value + + return cls( + country_or_region_for_holidays=obj.country_or_region_for_holidays, + cv_step_size=obj.cv_step_size, + forecast_horizon=forecast_horizon, + target_lags=target_lags, + target_rolling_window_size=target_rolling_window_size, + frequency=obj.frequency, + feature_lags=obj.feature_lags, + seasonality=seasonality, + use_stl=obj.use_stl, + short_series_handling_config=obj.short_series_handling_config, + target_aggregate_function=obj.target_aggregate_function, + time_column_name=obj.time_column_name, + time_series_id_column_names=obj.time_series_id_column_names, + features_unknown_at_forecast_time=obj.features_unknown_at_forecast_time, + ) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, ForecastingSettings): + return NotImplemented + return ( + self.country_or_region_for_holidays == other.country_or_region_for_holidays + and self.cv_step_size == other.cv_step_size + and 
self.forecast_horizon == other.forecast_horizon
+            and self.target_lags == other.target_lags
+            and self.target_rolling_window_size == other.target_rolling_window_size
+            and self.frequency == other.frequency
+            and self.feature_lags == other.feature_lags
+            and self.seasonality == other.seasonality
+            and self.use_stl == other.use_stl
+            and self.short_series_handling_config == other.short_series_handling_config
+            and self.target_aggregate_function == other.target_aggregate_function
+            and self.time_column_name == other.time_column_name
+            and self.time_series_id_column_names == other.time_series_id_column_names
+            and self.features_unknown_at_forecast_time == other.features_unknown_at_forecast_time
+        )
+
+    def __ne__(self, other: object) -> bool:
+        return not self.__eq__(other)
diff --git a/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/limit_settings.py b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/limit_settings.py
new file mode 100644
index 00000000..1024f504
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/limit_settings.py
@@ -0,0 +1,101 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from typing import Optional
+
+from azure.ai.ml._restclient.v2023_04_01_preview.models import TableVerticalLimitSettings as RestTabularLimitSettings
+from azure.ai.ml._utils.utils import from_iso_duration_format_mins, to_iso_duration_format_mins
+from azure.ai.ml.entities._mixins import RestTranslatableMixin
+
+
+class TabularLimitSettings(RestTranslatableMixin):
+    """Limit settings for AutoML Table Verticals.
+
+    :param enable_early_termination: Whether to enable early termination if the score is not improving in
+        the short term. The default is True.
+    :type enable_early_termination: bool
+    :param exit_score: Target score for the experiment. The experiment terminates after this score is reached.
+    :type exit_score: float
+    :param max_concurrent_trials: Maximum number of concurrent AutoML iterations.
+    :type max_concurrent_trials: int
+    :param max_cores_per_trial: The maximum number of threads to use for a given training iteration.
+    :type max_cores_per_trial: int
+    :param max_nodes: [Experimental] The maximum number of nodes to use for distributed training.
+
+        * For forecasting, each model is trained using max(2, int(max_nodes / max_concurrent_trials)) nodes.
+
+        * For classification/regression, each model is trained using max_nodes nodes.
+
+        Note: This parameter is in public preview and might change in the future.
+    :type max_nodes: int
+    :param max_trials: Maximum number of AutoML iterations.
+    :type max_trials: int
+    :param timeout_minutes: AutoML job timeout, in minutes.
+    :type timeout_minutes: int
+    :param trial_timeout_minutes: Timeout for each individual AutoML trial, in minutes.
+ :type trial_timeout_minutes: int + """ + + def __init__( + self, + *, + enable_early_termination: Optional[bool] = None, + exit_score: Optional[float] = None, + max_concurrent_trials: Optional[int] = None, + max_cores_per_trial: Optional[int] = None, + max_nodes: Optional[int] = None, + max_trials: Optional[int] = None, + timeout_minutes: Optional[int] = None, + trial_timeout_minutes: Optional[int] = None, + ): + self.enable_early_termination = enable_early_termination + self.exit_score = exit_score + self.max_concurrent_trials = max_concurrent_trials + self.max_cores_per_trial = max_cores_per_trial + self.max_nodes = max_nodes + self.max_trials = max_trials + self.timeout_minutes = timeout_minutes + self.trial_timeout_minutes = trial_timeout_minutes + + def _to_rest_object(self) -> RestTabularLimitSettings: + return RestTabularLimitSettings( + enable_early_termination=self.enable_early_termination, + exit_score=self.exit_score, + max_concurrent_trials=self.max_concurrent_trials, + max_cores_per_trial=self.max_cores_per_trial, + max_nodes=self.max_nodes, + max_trials=self.max_trials, + timeout=to_iso_duration_format_mins(self.timeout_minutes), + trial_timeout=to_iso_duration_format_mins(self.trial_timeout_minutes), + ) + + @classmethod + def _from_rest_object(cls, obj: RestTabularLimitSettings) -> "TabularLimitSettings": + return cls( + enable_early_termination=obj.enable_early_termination, + exit_score=obj.exit_score, + max_concurrent_trials=obj.max_concurrent_trials, + max_cores_per_trial=obj.max_cores_per_trial, + max_nodes=obj.max_nodes, + max_trials=obj.max_trials, + timeout_minutes=from_iso_duration_format_mins(obj.timeout), + trial_timeout_minutes=from_iso_duration_format_mins(obj.trial_timeout), + ) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, TabularLimitSettings): + return NotImplemented + return ( + self.enable_early_termination == other.enable_early_termination + and self.exit_score == other.exit_score + and self.max_concurrent_trials == other.max_concurrent_trials + and self.max_cores_per_trial == other.max_cores_per_trial + and self.max_nodes == other.max_nodes + and self.max_trials == other.max_trials + and self.timeout_minutes == other.timeout_minutes + and self.trial_timeout_minutes == other.trial_timeout_minutes + ) + + def __ne__(self, other: object) -> bool: + return not self.__eq__(other) diff --git a/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/regression_job.py b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/regression_job.py new file mode 100644 index 00000000..3531e52c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/azure/ai/ml/entities/_job/automl/tabular/regression_job.py @@ -0,0 +1,239 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# --------------------------------------------------------- + +# pylint: disable=protected-access + +from typing import Any, Dict, Optional, Union + +from azure.ai.ml._restclient.v2023_04_01_preview.models import AutoMLJob as RestAutoMLJob +from azure.ai.ml._restclient.v2023_04_01_preview.models import JobBase +from azure.ai.ml._restclient.v2023_04_01_preview.models import Regression as RestRegression +from azure.ai.ml._restclient.v2023_04_01_preview.models import RegressionPrimaryMetrics, TaskType +from azure.ai.ml._utils.utils import camel_to_snake, is_data_binding_expression +from azure.ai.ml.constants._common import BASE_PATH_CONTEXT_KEY +from azure.ai.ml.constants._job.automl import AutoMLConstants +from azure.ai.ml.entities._credentials import _BaseJobIdentityConfiguration +from azure.ai.ml.entities._job._input_output_helpers import from_rest_data_outputs, to_rest_data_outputs +from azure.ai.ml.entities._job.automl.tabular import AutoMLTabular, TabularFeaturizationSettings, TabularLimitSettings +from azure.ai.ml.entities._job.automl.training_settings import RegressionTrainingSettings +from azure.ai.ml.entities._util import load_from_dict + + +class RegressionJob(AutoMLTabular): + """Configuration for AutoML Regression Job.""" + + _DEFAULT_PRIMARY_METRIC = RegressionPrimaryMetrics.NORMALIZED_ROOT_MEAN_SQUARED_ERROR + + def __init__( + self, + *, + primary_metric: Optional[str] = None, + **kwargs: Any, + ) -> None: + """Initialize a new AutoML Regression task. + + :param primary_metric: The primary metric to use for optimization + :type primary_metric: str + :param kwargs: Job-specific arguments + :type kwargs: dict + """ + # Extract any task specific settings + featurization = kwargs.pop("featurization", None) + limits = kwargs.pop("limits", None) + training = kwargs.pop("training", None) + + super().__init__( + task_type=TaskType.REGRESSION, + featurization=featurization, + limits=limits, + training=training, + **kwargs, + ) + + self.primary_metric = primary_metric or RegressionJob._DEFAULT_PRIMARY_METRIC + + @property + def primary_metric(self) -> Union[str, RegressionPrimaryMetrics]: + return self._primary_metric + + @primary_metric.setter + def primary_metric(self, value: Union[str, RegressionPrimaryMetrics]) -> None: + # TODO: better way to do this + if is_data_binding_expression(str(value), ["parent"]): + self._primary_metric = value + return + self._primary_metric = ( + RegressionJob._DEFAULT_PRIMARY_METRIC + if value is None + else RegressionPrimaryMetrics[camel_to_snake(value).upper()] + ) + + @property + def training(self) -> RegressionTrainingSettings: + return self._training or RegressionTrainingSettings() + + @training.setter + def training(self, value: Union[Dict, RegressionTrainingSettings]) -> None: # pylint: disable=unused-argument + ... 
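+    # Note: the primary_metric setter above normalizes its input via
+    # camel_to_snake(...).upper(), so for example
+    #     job.primary_metric = "NormalizedRootMeanSquaredError"
+    # and
+    #     job.primary_metric = RegressionPrimaryMetrics.NORMALIZED_ROOT_MEAN_SQUARED_ERROR
+    # resolve to the same enum member, while "${{parent.*}}" data-binding
+    # expressions are stored verbatim.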
+ + def _to_rest_object(self) -> JobBase: + regression_task = RestRegression( + target_column_name=self.target_column_name, + training_data=self.training_data, + validation_data=self.validation_data, + validation_data_size=self.validation_data_size, + weight_column_name=self.weight_column_name, + cv_split_column_names=self.cv_split_column_names, + n_cross_validations=self.n_cross_validations, + test_data=self.test_data, + test_data_size=self.test_data_size, + featurization_settings=self._featurization._to_rest_object() if self._featurization else None, + limit_settings=self._limits._to_rest_object() if self._limits else None, + training_settings=self._training._to_rest_object() if self._training else None, + primary_metric=self.primary_metric, + log_verbosity=self.log_verbosity, + ) + self._resolve_data_inputs(regression_task) + self._validation_data_to_rest(regression_task) + + properties = RestAutoMLJob( + display_name=self.display_name, + description=self.description, + experiment_name=self.experiment_name, + tags=self.tags, + compute_id=self.compute, + properties=self.properties, + environment_id=self.environment_id, + environment_variables=self.environment_variables, + services=self.services, + outputs=to_rest_data_outputs(self.outputs), + resources=self.resources, + task_details=regression_task, + identity=self.identity._to_job_rest_object() if self.identity else None, + queue_settings=self.queue_settings, + ) + + result = JobBase(properties=properties) + result.name = self.name + return result + + @classmethod + def _from_rest_object(cls, obj: JobBase) -> "RegressionJob": + properties: RestAutoMLJob = obj.properties + task_details: RestRegression = properties.task_details + + job_args_dict = { + "id": obj.id, + "name": obj.name, + "description": properties.description, + "tags": properties.tags, + "properties": properties.properties, + "experiment_name": properties.experiment_name, + "services": properties.services, + "status": properties.status, + "creation_context": obj.system_data, + "display_name": properties.display_name, + "compute": properties.compute_id, + "outputs": from_rest_data_outputs(properties.outputs), + "resources": properties.resources, + "identity": ( + _BaseJobIdentityConfiguration._from_rest_object(properties.identity) if properties.identity else None + ), + "queue_settings": properties.queue_settings, + } + + regression_job = cls( + target_column_name=task_details.target_column_name, + training_data=task_details.training_data, + validation_data=task_details.validation_data, + validation_data_size=task_details.validation_data_size, + weight_column_name=task_details.weight_column_name, + cv_split_column_names=task_details.cv_split_column_names, + n_cross_validations=task_details.n_cross_validations, + test_data=task_details.test_data, + test_data_size=task_details.test_data_size, + featurization=( + TabularFeaturizationSettings._from_rest_object(task_details.featurization_settings) + if task_details.featurization_settings + else None + ), + limits=( + TabularLimitSettings._from_rest_object(task_details.limit_settings) + if task_details.limit_settings + else None + ), + training=( + RegressionTrainingSettings._from_rest_object(task_details.training_settings) + if task_details.training_settings + else None + ), + primary_metric=task_details.primary_metric, + log_verbosity=task_details.log_verbosity, + **job_args_dict, + ) + + regression_job._restore_data_inputs() + regression_job._validation_data_from_rest() + + return regression_job + + @classmethod + def 
_load_from_dict( + cls, + data: Dict, + context: Dict, + additional_message: str, + **kwargs: Any, + ) -> "RegressionJob": + from azure.ai.ml._schema.automl.table_vertical.regression import AutoMLRegressionSchema + from azure.ai.ml._schema.pipeline.automl_node import AutoMLRegressionNodeSchema + + if kwargs.pop("inside_pipeline", False): + loaded_data = load_from_dict(AutoMLRegressionNodeSchema, data, context, additional_message, **kwargs) + else: + loaded_data = load_from_dict(AutoMLRegressionSchema, data, context, additional_message, **kwargs) + job_instance = cls._create_instance_from_schema_dict(loaded_data) + return job_instance + + @classmethod + def _create_instance_from_schema_dict(cls, loaded_data: Dict) -> "RegressionJob": + loaded_data.pop(AutoMLConstants.TASK_TYPE_YAML, None) + data_settings = { + "training_data": loaded_data.pop("training_data"), + "target_column_name": loaded_data.pop("target_column_name"), + "weight_column_name": loaded_data.pop("weight_column_name", None), + "validation_data": loaded_data.pop("validation_data", None), + "validation_data_size": loaded_data.pop("validation_data_size", None), + "cv_split_column_names": loaded_data.pop("cv_split_column_names", None), + "n_cross_validations": loaded_data.pop("n_cross_validations", None), + "test_data": loaded_data.pop("test_data", None), + "test_data_size": loaded_data.pop("test_data_size", None), + } + job = RegressionJob(**loaded_data) + job.set_data(**data_settings) + return job + + def _to_dict(self, inside_pipeline: bool = False) -> Dict: + from azure.ai.ml._schema.automl.table_vertical.regression import AutoMLRegressionSchema + from azure.ai.ml._schema.pipeline.automl_node import AutoMLRegressionNodeSchema + + schema_dict: dict = {} + if inside_pipeline: + schema_dict = AutoMLRegressionNodeSchema(context={BASE_PATH_CONTEXT_KEY: "./"}).dump(self) + else: + schema_dict = AutoMLRegressionSchema(context={BASE_PATH_CONTEXT_KEY: "./"}).dump(self) + + return schema_dict + + def __eq__(self, other: object) -> bool: + if not isinstance(other, RegressionJob): + return NotImplemented + + if not super(RegressionJob, self).__eq__(other): + return False + + return self.primary_metric == other.primary_metric + + def __ne__(self, other: object) -> bool: + return not self.__eq__(other) |
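Taken together, the entities in this diff compose into a single job definition. The following is a minimal usage sketch, not part of the diff itself: it assumes the vendored package is importable and that an MLTable folder exists at the placeholder path; the column names and limit values are illustrative, and `set_data`/`set_limits` come from the AutoMLTabular base class (the same `set_data` pattern is used by `_create_instance_from_schema_dict` above).

from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities._job.automl.tabular import ForecastingJob

# primary_metric defaults to NORMALIZED_ROOT_MEAN_SQUARED_ERROR when omitted.
job = ForecastingJob()
job.set_data(
    training_data=Input(type=AssetTypes.MLTABLE, path="./train-mltable-folder"),
    target_column_name="demand",
)
job.set_forecast_settings(
    time_column_name="timestamp",
    forecast_horizon=14,  # 14 periods of the dataset frequency
    frequency="D",        # a pandas offset alias
    target_lags="auto",   # mapped to AutoTargetLags() by ForecastingSettings._to_rest_object
)
job.set_training(enable_model_explainability=True)  # stack ensemble stays disabled by default
job.set_limits(timeout_minutes=120, trial_timeout_minutes=30, max_trials=40)

# Round trip through the internal REST converters shown in this diff:
# TabularLimitSettings serializes minutes as ISO-8601 durations (e.g. 120 -> "PT2H"),
# and ForecastingSettings maps 'auto' strings to the Auto* REST models and back.
rest_job = job._to_rest_object()
restored = ForecastingJob._from_rest_object(rest_job)
assert restored.forecasting_settings == job.forecasting_settings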
