From 66b24f9e8a68a89152dd572ebd4866bb0ac324e8 Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" <33659563+V1NAY8@users.noreply.github.com> Date: Mon, 17 Aug 2020 22:40:27 +0530 Subject: [PATCH] Replace MLModel(overwrite) with es_if_exists --- CONTRIBUTING.md | 3 +- README.md | 2 +- eland/ml/imported_ml_model.py | 41 +++++- eland/ml/ml_model.py | 12 ++ .../tests/ml/test_imported_ml_model_pytest.py | 129 ++++++++++++++++-- eland/utils.py | 1 + 6 files changed, 170 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3962c07..7d85db3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -151,7 +151,8 @@ currently using a minimum version of PyCharm 2019.2.4. Tools\'-\>\'Docstring format\' to `numpy` - Install development requirements. Open terminal in virtual environment and run `pip install -r requirements-dev.txt` -- Setup Elasticsearch instance (assumes `localhost:9200`), and run +- Setup Elasticsearch instance with docker `ELASTICSEARCH_VERSION=elasticsearch:7.x-SNAPSHOT .ci/run-elasticsearch.sh` and check `http://localhost:9200` +- Run `python -m eland.tests.setup_tests` to setup test environment -*note this modifies Elasticsearch indices* - Install local `eland` module (required to execute notebook tests) diff --git a/README.md b/README.md index 46c724f..48edff9 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ dtype: int64 13057 20819.488281 13058 18315.431274 Length: 13059, dtype: float64 ->>> print(s.info_es()) +>>> print(s.es_info()) index_pattern: flights Index: index_field: _id diff --git a/eland/ml/imported_ml_model.py b/eland/ml/imported_ml_model.py index 23418fe..9129b81 100644 --- a/eland/ml/imported_ml_model.py +++ b/eland/ml/imported_ml_model.py @@ -22,6 +22,7 @@ import numpy as np # type: ignore from .ml_model import MLModel from .transformers import get_model_transformer from ..common import es_version +import warnings if TYPE_CHECKING: @@ -100,7 +101,13 @@ class ImportedMLModel(MLModel): classification_weights: List[str] Weights of the classification targets - overwrite: bool + es_if_exists: {'fail', 'replace'} default 'fail' + How to behave if model already exists + + - fail: Raise a Value Error + - replace: Overwrite existing model + + overwrite: **DEPRECATED** - bool Delete and overwrite existing model (if exists) es_compress_model_definition: bool @@ -127,7 +134,7 @@ class ImportedMLModel(MLModel): >>> # Serialise the model to Elasticsearch >>> feature_names = ["f0", "f1", "f2", "f3", "f4"] >>> model_id = "test_decision_tree_classifier" - >>> es_model = ImportedMLModel('localhost', model_id, classifier, feature_names, overwrite=True) + >>> es_model = ImportedMLModel('localhost', model_id, classifier, feature_names, es_if_exists='replace') >>> # Get some test results from Elasticsearch model >>> es_model.predict(test_data) @@ -155,7 +162,8 @@ class ImportedMLModel(MLModel): feature_names: List[str], classification_labels: Optional[List[str]] = None, classification_weights: Optional[List[float]] = None, - overwrite: bool = False, + es_if_exists: Optional[str] = None, + overwrite: Optional[bool] = None, es_compress_model_definition: bool = True, ): super().__init__(es_client, model_id) @@ -171,7 +179,30 @@ class ImportedMLModel(MLModel): self._model_type = transformer.model_type serializer = transformer.transform() - if overwrite: + # Verify if both parameters are given + if overwrite is not None and es_if_exists is not None: + raise ValueError( + "Using 'overwrite' and 'es_if_exists' together is invalid, use only 'es_if_exists'" + ) + + if overwrite is not None: + warnings.warn( + "'overwrite' parameter is deprecated, use 'es_if_exists' instead", + DeprecationWarning, + stacklevel=2, + ) + es_if_exists = "replace" if overwrite else "fail" + elif es_if_exists is None: + es_if_exists = "fail" + + if es_if_exists not in ("fail", "replace"): + raise ValueError("'es_if_exists' must be either 'fail' or 'replace'") + elif es_if_exists == "fail": + if self.check_existing_model(): + raise ValueError( + f"Trained machine learning model {model_id} already exists" + ) + elif es_if_exists == "replace": self.delete_model() body: Dict[str, Any] = { @@ -224,7 +255,7 @@ class ImportedMLModel(MLModel): >>> # Serialise the model to Elasticsearch >>> feature_names = ["f0", "f1", "f2", "f3", "f4", "f5"] >>> model_id = "test_xgb_regressor" - >>> es_model = ImportedMLModel('localhost', model_id, regressor, feature_names, overwrite=True) + >>> es_model = ImportedMLModel('localhost', model_id, regressor, feature_names, es_if_exists='replace') >>> # Get some test results from Elasticsearch model >>> es_model.predict(test_data) diff --git a/eland/ml/ml_model.py b/eland/ml/ml_model.py index a865019..bb93806 100644 --- a/eland/ml/ml_model.py +++ b/eland/ml/ml_model.py @@ -57,3 +57,15 @@ class MLModel: self._client.ml.delete_trained_model(model_id=self._model_id, ignore=(404,)) except elasticsearch.NotFoundError: pass + + def check_existing_model(self) -> bool: + """ + Check If model exists in Elastic + """ + try: + self._client.ml.get_trained_models( + model_id=self._model_id, include_model_definition=False + ) + except elasticsearch.NotFoundError: + return False + return True diff --git a/eland/tests/ml/test_imported_ml_model_pytest.py b/eland/tests/ml/test_imported_ml_model_pytest.py index c3a9543..99d87ed 100644 --- a/eland/tests/ml/test_imported_ml_model_pytest.py +++ b/eland/tests/ml/test_imported_ml_model_pytest.py @@ -103,7 +103,7 @@ class TestImportedMLModel: model_id, classifier, feature_names, - overwrite=True, + es_if_exists="replace", es_compress_model_definition=True, ) @@ -147,7 +147,7 @@ class TestImportedMLModel: model_id, classifier, feature_names, - overwrite=True, + es_if_exists="replace", es_compress_model_definition=compress_model_definition, ) @@ -176,7 +176,7 @@ class TestImportedMLModel: model_id, regressor, feature_names, - overwrite=True, + es_if_exists="replace", es_compress_model_definition=compress_model_definition, ) # Get some test results @@ -204,7 +204,7 @@ class TestImportedMLModel: model_id, classifier, feature_names, - overwrite=True, + es_if_exists="replace", es_compress_model_definition=compress_model_definition, ) # Get some test results @@ -232,7 +232,7 @@ class TestImportedMLModel: model_id, regressor, feature_names, - overwrite=True, + es_if_exists="replace", es_compress_model_definition=compress_model_definition, ) # Get some test results @@ -270,7 +270,7 @@ class TestImportedMLModel: model_id, classifier, feature_names, - overwrite=True, + es_if_exists="replace", es_compress_model_definition=compress_model_definition, ) # Get some test results @@ -306,7 +306,7 @@ class TestImportedMLModel: model_id = "test_xgb_classifier" es_model = ImportedMLModel( - ES_TEST_CLIENT, model_id, classifier, feature_names, overwrite=True + ES_TEST_CLIENT, model_id, classifier, feature_names, es_if_exists="replace" ) # Get some test results check_prediction_equality( @@ -342,7 +342,7 @@ class TestImportedMLModel: model_id, regressor, feature_names, - overwrite=True, + es_if_exists="replace", es_compress_model_definition=compress_model_definition, ) # Get some test results @@ -369,7 +369,7 @@ class TestImportedMLModel: model_id = "test_xgb_regressor" es_model = ImportedMLModel( - ES_TEST_CLIENT, model_id, regressor, feature_names, overwrite=True + ES_TEST_CLIENT, model_id, regressor, feature_names, es_if_exists="replace" ) # Single feature @@ -410,7 +410,7 @@ class TestImportedMLModel: model_id, regressor, feature_names, - overwrite=True, + es_if_exists="replace", es_compress_model_definition=compress_model_definition, ) # Get some test results @@ -451,7 +451,7 @@ class TestImportedMLModel: model_id, classifier, feature_names, - overwrite=True, + es_if_exists="replace", es_compress_model_definition=compress_model_definition, ) @@ -461,3 +461,110 @@ class TestImportedMLModel: # Clean up es_model.delete_model() + + # If both overwrite and es_if_exists is given. + @requires_sklearn + @pytest.mark.parametrize("compress_model_definition", [True, False]) + @pytest.mark.parametrize("es_if_exists", ["fail", "replace"]) + @pytest.mark.parametrize("overwrite", [True, False]) + def test_imported_mlmodel_bothparams( + self, compress_model_definition, es_if_exists, overwrite + ): + # Train model + training_data = datasets.make_regression(n_features=5) + regressor = RandomForestRegressor() + regressor.fit(training_data[0], training_data[1]) + + feature_names = ["f0", "f1", "f2", "f3", "f4"] + model_id = "test_random_forest_regressor" + + match = "Using 'overwrite' and 'es_if_exists' together is invalid, use only 'es_if_exists'" + with pytest.raises(ValueError, match=match): + ImportedMLModel( + ES_TEST_CLIENT, + model_id, + regressor, + feature_names, + es_if_exists=es_if_exists, + overwrite=overwrite, + es_compress_model_definition=compress_model_definition, + ) + + # Deprecation warning for overwrite parameter + @requires_sklearn + @pytest.mark.parametrize("compress_model_definition", [True, False]) + @pytest.mark.parametrize("overwrite", [True]) + def test_imported_mlmodel_overwrite_true( + self, compress_model_definition, overwrite + ): + # Train model + training_data = datasets.make_regression(n_features=5) + regressor = RandomForestRegressor() + regressor.fit(training_data[0], training_data[1]) + + feature_names = ["f0", "f1", "f2", "f3", "f4"] + model_id = "test_random_forest_regressor" + + match = "'overwrite' parameter is deprecated, use 'es_if_exists' instead" + with pytest.warns(DeprecationWarning, match=match): + ImportedMLModel( + ES_TEST_CLIENT, + model_id, + regressor, + feature_names, + overwrite=overwrite, + es_compress_model_definition=compress_model_definition, + ) + + @requires_sklearn + @pytest.mark.parametrize("compress_model_definition", [True, False]) + @pytest.mark.parametrize("overwrite", [False]) + def test_imported_mlmodel_overwrite_false( + self, compress_model_definition, overwrite + ): + # Train model + training_data = datasets.make_regression(n_features=5) + regressor = RandomForestRegressor() + regressor.fit(training_data[0], training_data[1]) + + feature_names = ["f0", "f1", "f2", "f3", "f4"] + model_id = "test_random_forest_regressor" + + match_error = f"Trained machine learning model {model_id} already exists" + match_warning = ( + "'overwrite' parameter is deprecated, use 'es_if_exists' instead" + ) + with pytest.raises(ValueError, match=match_error): + with pytest.warns(DeprecationWarning, match=match_warning): + ImportedMLModel( + ES_TEST_CLIENT, + model_id, + regressor, + feature_names, + overwrite=overwrite, + es_compress_model_definition=compress_model_definition, + ) + + # Raise ValueError if Model exists when es_if_exists = 'fail' + @requires_sklearn + @pytest.mark.parametrize("compress_model_definition", [True, False]) + def test_es_if_exists_fail(self, compress_model_definition): + # Train model + training_data = datasets.make_regression(n_features=5) + regressor = RandomForestRegressor() + regressor.fit(training_data[0], training_data[1]) + + feature_names = ["f0", "f1", "f2", "f3", "f4"] + model_id = "test_random_forest_regressor" + + # If both overwrite and es_if_exists is given. + match = f"Trained machine learning model {model_id} already exists" + with pytest.raises(ValueError, match=match): + ImportedMLModel( + ES_TEST_CLIENT, + model_id, + regressor, + feature_names, + es_if_exists="fail", + es_compress_model_definition=compress_model_definition, + ) diff --git a/eland/utils.py b/eland/utils.py index e643b48..330db9a 100644 --- a/eland/utils.py +++ b/eland/utils.py @@ -36,6 +36,7 @@ def deprecated_api( warnings.warn( f"{f.__name__} is deprecated, use {replace_with} instead", DeprecationWarning, + stacklevel=2, ) return f(*args, **kwargs)