AD | Added AutoTS/Sklearn Models as Operator Models with Contamination Parameter Support (#955)

codeloop · web-flow · commit c00e529ad214 · 2024-10-10T16:06:08.000+05:30
diff --git a/ads/opctl/operator/lowcode/anomaly/const.py b/ads/opctl/operator/lowcode/anomaly/const.py
@@ -11,10 +11,15 @@
 class SupportedModels(str, metaclass=ExtendedEnumMeta):
     """Supported anomaly models."""
 
-    AutoMLX = "automlx"
     AutoTS = "autots"
     Auto = "auto"
-    # TODS = "tods"
+    IQR = "iqr"
+    LOF = "lof"
+    ZSCORE = "zscore"
+    ROLLING_ZSCORE = "rolling_zscore"
+    MAD = "mad"
+    EE = "ee"
+    ISOLATIONFOREST = "isolationforest"
 
 class NonTimeADSupportedModels(str, metaclass=ExtendedEnumMeta):
     """Supported non time-based anomaly detection models."""
diff --git a/ads/opctl/operator/lowcode/anomaly/model/autots.py b/ads/opctl/operator/lowcode/anomaly/model/autots.py
@@ -4,80 +4,75 @@
 # Copyright (c) 2023, 2024 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
-import pandas as pd
-
 from ads.common.decorator.runtime_dependency import runtime_dependency
-
-from .base_model import AnomalyOperatorBaseModel
-from .anomaly_dataset import AnomalyOutput
 from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
+from .anomaly_dataset import AnomalyOutput
+from .base_model import AnomalyOperatorBaseModel
+from ..const import SupportedModels
+from ads.opctl import logger
 
 
 class AutoTSOperatorModel(AnomalyOperatorBaseModel):
     """Class representing AutoTS Anomaly Detection operator model."""
+    model_mapping = {
+        "isolationforest": "IsolationForest",
+        "lof": "LOF",
+        "ee": "EE",
+        "zscore": "zscore",
+        "rolling_zscore": "rolling_zscore",
+        "mad": "mad",
+        "minmax": "minmax",
+        "iqr": "IQR"
+    }
 
     @runtime_dependency(
         module="autots",
         err_msg=(
-            "Please run `pip3 install autots` to "
-            "install the required dependencies for AutoTS."
+                "Please run `pip3 install autots` to "
+                "install the required dependencies for AutoTS."
         ),
     )
     def _build_model(self) -> AnomalyOutput:
         from autots.evaluator.anomaly_detector import AnomalyDetector
 
-        method = self.spec.model_kwargs.get("method")
-        transform_dict = self.spec.model_kwargs.get("transform_dict", {})
-
-        if method == "random" or method == "deep" or method == "fast":
-            new_params = AnomalyDetector.get_new_params(method=method)
-            transform_dict = new_params.pop("transform_dict")
-
-            for key, value in new_params.items():
-                self.spec.model_kwargs[key] = value
-
-        if self.spec.model_kwargs.get("output") is None:
-            self.spec.model_kwargs["output"] = "univariate"
-
-        if "transform_dict" not in self.spec.model_kwargs:
-            self.spec.model_kwargs["transform_dict"] = transform_dict
-
-        if self.spec.contamination != 0.1:  # TODO: remove hard-coding
-            self.spec.model_kwargs.get("method_params", {})[
-                "contamination"
-            ] = self.spec.contamination
-
-        model = AnomalyDetector(**self.spec.model_kwargs)
+        method = SupportedModels.ISOLATIONFOREST if self.spec.model == SupportedModels.AutoTS else self.spec.model
+        model_params = {"method": self.model_mapping[method],
+                        "transform_dict": self.spec.model_kwargs.get("transform_dict", {}),
+                        "output": self.spec.model_kwargs.get("output", "univariate"), "method_params": {}}
+        # Supported methods with contamination param
+        if method in [SupportedModels.ISOLATIONFOREST, SupportedModels.LOF, SupportedModels.EE]:
+            model_params["method_params"][
+                "contamination"] = self.spec.contamination if self.spec.contamination else 0.01
+        else:
+            if self.spec.contamination:
+                raise ValueError(f"The contamination parameter is not supported for the selected model \"{method}\"")
+        logger.info(f"model params: {model_params}")
+
+        model = AnomalyDetector(**model_params)
 
         date_column = self.spec.datetime_column.name
 
         anomaly_output = AnomalyOutput(date_column=date_column)
 
         for target, df in self.datasets.full_data_dict.items():
             data = df.set_index(date_column)
-
             (anomaly, score) = model.detect(data)
-
             if len(anomaly.columns) == 1:
                 score.rename(
                     columns={score.columns.values[0]: OutputColumns.SCORE_COL},
                     inplace=True,
                 )
                 score = 1 - score
                 score = score.reset_index(drop=False)
-
                 col = anomaly.columns.values[0]
                 anomaly[col] = anomaly[col].replace({1: 0, -1: 1})
                 anomaly.rename(columns={col: OutputColumns.ANOMALY_COL}, inplace=True)
                 anomaly = anomaly.reset_index(drop=False)
-
                 anomaly_output.add_output(target, anomaly, score)
-
             else:
                 raise NotImplementedError(
                     "Multi-Output Anomaly Detection is not yet supported in autots"
                 )
-
         return anomaly_output
 
     def _generate_report(self):
diff --git a/ads/opctl/operator/lowcode/anomaly/model/factory.py b/ads/opctl/operator/lowcode/anomaly/model/factory.py
@@ -4,18 +4,14 @@
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 from ads.opctl.operator.lowcode.anomaly.utils import select_auto_model
-
-from ..const import NonTimeADSupportedModels, SupportedModels
-from ..operator_config import AnomalyOperatorConfig
 from .anomaly_dataset import AnomalyDatasets
-from .automlx import AutoMLXOperatorModel
 from .autots import AutoTSOperatorModel
-
-# from .tods import TODSOperatorModel
 from .base_model import AnomalyOperatorBaseModel
 from .isolationforest import IsolationForestOperatorModel
 from .oneclasssvm import OneClassSVMOperatorModel
 from .randomcutforest import RandomCutForestOperatorModel
+from ..const import NonTimeADSupportedModels, SupportedModels
+from ..operator_config import AnomalyOperatorConfig
 
 
 class UnSupportedModelError(Exception):
@@ -45,9 +41,14 @@ class AnomalyOperatorModelFactory:
     """
 
     _MAP = {
-        SupportedModels.AutoMLX: AutoMLXOperatorModel,
-        # SupportedModels.TODS: TODSOperatorModel,
         SupportedModels.AutoTS: AutoTSOperatorModel,
+        SupportedModels.IQR: AutoTSOperatorModel,
+        SupportedModels.LOF: AutoTSOperatorModel,
+        SupportedModels.ISOLATIONFOREST: AutoTSOperatorModel,
+        SupportedModels.ZSCORE: AutoTSOperatorModel,
+        SupportedModels.ROLLING_ZSCORE: AutoTSOperatorModel,
+        SupportedModels.EE: AutoTSOperatorModel,
+        SupportedModels.MAD: AutoTSOperatorModel
     }
 
     _NonTime_MAP = {
diff --git a/ads/opctl/operator/lowcode/anomaly/schema.yaml b/ads/opctl/operator/lowcode/anomaly/schema.yaml
@@ -364,15 +364,21 @@ spec:
         - oneclasssvm
         - isolationforest
         - randomcutforest
+        - iqr
+        - lof
+        - zscore
+        - rolling_zscore
+        - mad
+        - ee
       meta:
         description: "The model to be used for anomaly detection"
 
     contamination:
       required: false
-      default: 0.1
+      default: 0.01
       type: float
       meta:
-        description: "Fraction of training dataset corresponding to anomalies (between 0.0 and 0.5)"
+        description: "The proportion of outliers in the data set. The contamination should be in the range (0, 0.5]"
 
     model_kwargs:
       type: dict
diff --git a/docs/source/user_guide/operators/anomaly_detection_operator/advanced_use_cases.rst b/docs/source/user_guide/operators/anomaly_detection_operator/advanced_use_cases.rst
@@ -8,8 +8,7 @@ The Science of Anomaly Detection
 Anomaly Detection comes in many forms. We will go through some of these and give guidance as to whether this Operator is going to be helpful for each use case.
 
 * Constructive v Destructive v Pre-Processing: This Operator focuses on the Constructive and Pre-Processing use cases. Destructive can work, but more specific parameters may be required.
-* Supervised v Semi-Supervised v Unsupervised: All 3 of these approaches are supported by AutoMLX. AutoTS supports only Unsupervised at this time.
-* Time Series. This Operator requires time-series data.
+* The operator currently supports only unsupervised learning and works with both time-series and non-time-series data.
 
 
 Data Parameterization
@@ -51,40 +50,3 @@ Data Parameterization
         datetime_column:
             name: ds
         target_column: y
-
-
-Model Parameterization
-----------------------
-
-**Specify Model Type**
-
-Sometimes users will know which models they want to use. When users know this in advance, they can specify using the ``model_kwargs`` dictionary. In the following example, we will instruct the model to *only* use the ``IsolationForestOD`` model.
-
-.. code-block:: yaml
-
-  kind: operator
-  type: anomaly
-  version: v1
-  spec:
-    model: automlx
-    model_kwargs:
-      model_list:
-        - IsolationForestOD
-      search_space:
-        IsolationForestOD:
-          n_estimators:
-            range': [10, 50]
-            type': 'discrete'
-
-
-AutoTS offers the same extensibility:
-
-.. code-block:: yaml
-
-  kind: operator
-  type: anomaly
-  version: v1
-  spec:
-    model: autots
-    model_kwargs:
-      method: IQR
diff --git a/docs/source/user_guide/operators/anomaly_detection_operator/index.rst b/docs/source/user_guide/operators/anomaly_detection_operator/index.rst
@@ -27,7 +27,7 @@ If you have additional variables that you think might be related, then you shoul
 
 **Auto Model Selection**
 
-Operators users don't need to know anything about the underlying models in order to use them. By default we set ``model: auto``. However, some users want more control over the modeling parameters. These users can set the ``model`` parameter to either ``autots`` or ``automlx`` and then pass parameters directly into ``model_kwargs``. See :doc:`Advanced Examples <./advanced_use_cases>`
+Operator users don't need to know anything about the underlying models in order to use them. By default we set ``model: auto``. However, some users want more control over the modeling parameters. These users can set the ``model`` parameter to ``isolationforest``, ``lof``, ``ee``, ``zscore``, ``rolling_zscore``, ``mad``, ``minmax``, ``iqr`` or ``autots`` and then pass parameters directly into ``model_kwargs``. See :doc:`Advanced Examples <./advanced_use_cases>`
 
 **Anomaly Detection Documentation**
 
diff --git a/docs/source/user_guide/operators/anomaly_detection_operator/productionize.rst b/docs/source/user_guide/operators/anomaly_detection_operator/productionize.rst
@@ -156,9 +156,7 @@ The yaml can also be maximally stated as follows:
         datetime_column:
             format: "%d/%m/%y"
             name: Date
-        model: automlx
-        model_kwargs:
-            time_budget: 100 
+        model: ee
         preprocessing: true
         generate_metrics: true
         generate_report: true
diff --git a/docs/source/user_guide/operators/anomaly_detection_operator/yaml_schema.rst b/docs/source/user_guide/operators/anomaly_detection_operator/yaml_schema.rst
@@ -16,8 +16,10 @@ Here is an example anomaly.yaml with every parameter specified:
             name: Date
         input_data:
             url: data.csv
-        model: auto
+        model: isolationforest
+        contamination: 0.005
         target_column: target
+        target_category_columns: ['series']
 
 
 * **Kind**: The yaml file always starts with ``kind: operator``. There are many other kinds of yaml files that can be run by ``ads opctl``, so we need to specify this is an operator.
@@ -39,7 +41,8 @@ Here is an example anomaly.yaml with every parameter specified:
     * **output_directory**: (optional) This dictionary contains the details for where to put the output artifacts. The directory need not exist, but must be accessible by the Operator during runtime.
         * **url**: Insert the uri for the dataset if it's on object storage or Data Lake using the URI pattern ``oci://<bucket>@<namespace>/subfolder/``.
         * **kwargs**: Insert any other args for pandas to load the data (``format``, ``options``, etc.) See full list in ``YAML Schema`` section.
-    * **model**: (optional) The name of the model framework you want to use. Defaults to "auto". Other options are: ``autots``, and ``auto``.
+    * **model**: (optional) The name of the model framework you want to use. Defaults to ``auto``. Other options are: ``iqr``, ``lof``, ``zscore``, ``rolling_zscore``, ``isolationforest``, ``mad``, ``ee`` and ``autots``.
+    * **contamination**: (optional) The proportion of outliers in the data set. The contamination should be in the range (0, 0.5] and defaults to 0.01. This parameter is supported only by specific methods, i.e. ``isolationforest``, ``lof`` and ``ee`` (``autots`` runs ``isolationforest`` and therefore supports it as well). Setting it together with any other model raises a ``ValueError``.
     * **model_kwargs**: (optional) This kwargs dict passes straight through to the model framework. If you want to take direct control of the modeling, this is the best way.
     * **test_data**: (optional) This dictionary contains the details for how to read the test data. Test data should contain every datetime value of the input_data, (optionally) all of the series from target_category_columns, and a column titles "anomaly" with either a 1 (non-anomalous) or 0 (anomalous).
         * **url**: Insert the uri for the dataset if it's on object storage or Data Lake using the URI pattern ``oci://<bucket>@<namespace>/path/to/data.csv``.
diff --git a/tests/operators/anomaly/test_anomaly_simple.py b/tests/operators/anomaly/test_anomaly_simple.py
@@ -3,7 +3,7 @@
 # Copyright (c) 2023, 2024 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
-from ads.opctl.operator.lowcode.anomaly.const import NonTimeADSupportedModels
+from ads.opctl.operator.lowcode.anomaly.const import NonTimeADSupportedModels, SupportedModels
 import yaml
 import subprocess
 import pandas as pd
@@ -16,8 +16,7 @@
 from datetime import datetime
 from ads.opctl.operator.cmd import run
 
-
-MODELS = ["autots"]  # "automlx",
+MODELS = ["autots", "iqr", "lof", "zscore", "rolling_zscore", "mad", "ee", "isolationforest"]
 
 # Mandatory YAML parameters
 TEMPLATE_YAML = {
@@ -218,7 +217,7 @@ def test_load_datasets(model, data_dict):
         yaml_i = deepcopy(TEMPLATE_YAML)
         yaml_i["spec"]["model"] = model
         yaml_i["spec"]["input_data"]["url"] = data_dict["url"]
-        if model in NonTimeADSupportedModels.values():
+        if model in set(NonTimeADSupportedModels.values()) - set(SupportedModels.values()):
             del yaml_i["spec"]["datetime_column"]
         else:
             yaml_i["spec"]["datetime_column"]["name"] = data_dict["dt_col"]