diff --git a/eland/ml/ml_model.py b/eland/ml/ml_model.py
index 42aa424..c1333dc 100644
--- a/eland/ml/ml_model.py
+++ b/eland/ml/ml_model.py
@@ -269,6 +269,9 @@ class MLModel:
         model: An instance of a supported python model. We support the following model types:
             - sklearn.tree.DecisionTreeClassifier
+              - NOTE: When calculating the probabilities of a given classification label, Elasticsearch
+                applies softmax while scikit-learn normalizes the results. We try to account for this
+                during model serialization, but predicted probabilities may differ slightly.
             - sklearn.tree.DecisionTreeRegressor
             - sklearn.ensemble.RandomForestRegressor
             - sklearn.ensemble.RandomForestClassifier
diff --git a/eland/ml/transformers/sklearn.py b/eland/ml/transformers/sklearn.py
index 726e044..71d55ab 100644
--- a/eland/ml/transformers/sklearn.py
+++ b/eland/ml/transformers/sklearn.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
+import math
 from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union

 import numpy as np
@@ -86,8 +86,12 @@ class SKLearnTransformer(ModelTransformer):
         ):  # classification requires more than one value, so assume regression
             leaf_value = [float(value[0][0])]
         else:
-            # the classification value, which is the index of the largest value
-            leaf_value = [float(np.argmax(value))]
+            # the classification value
+            # DecisionTreeClassifier normalizes leaf values (divides predicted counts by their sum),
+            # while Elasticsearch applies softmax, so store log values to keep probabilities close
+            leaf_value = [
+                -10000000 if n <= 0 else math.log(float(n)) for n in value[0]
+            ]
         return TreeNode(
             node_index,
             decision_type=self._node_decision_type,
diff --git a/tests/ml/test_ml_model_pytest.py b/tests/ml/test_ml_model_pytest.py
index 2c10bd1..7d28018 100644
--- a/tests/ml/test_ml_model_pytest.py
+++ b/tests/ml/test_ml_model_pytest.py
@@ -73,7 +73,7 @@ def random_rows(data, size):
     return data[np.random.randint(data.shape[0], size=size), :]


-def check_prediction_equality(es_model, py_model, test_data):
+def check_prediction_equality(es_model: MLModel, py_model, test_data):
     # Get some test results
     test_results = py_model.predict(np.asarray(test_data))
     es_results = es_model.predict(test_data)
@@ -131,14 +131,25 @@ class TestMLModel:

     @requires_sklearn
     @pytest.mark.parametrize("compress_model_definition", [True, False])
-    def test_decision_tree_classifier(self, compress_model_definition):
+    @pytest.mark.parametrize("multi_class", [True, False])
+    def test_decision_tree_classifier(self, compress_model_definition, multi_class):
         # Train model
-        training_data = datasets.make_classification(n_features=5)
+        training_data = (
+            datasets.make_classification(
+                n_features=7,
+                n_classes=3,
+                n_clusters_per_class=2,
+                n_informative=6,
+                n_redundant=1,
+            )
+            if multi_class
+            else datasets.make_classification(n_features=7)
+        )
         classifier = DecisionTreeClassifier()
         classifier.fit(training_data[0], training_data[1])

         # Serialise the models to Elasticsearch
-        feature_names = ["f0", "f1", "f2", "f3", "f4"]
+        feature_names = ["f0", "f1", "f2", "f3", "f4", "f5", "f6"]
         model_id = "test_decision_tree_classifier"
         es_model = MLModel.import_model(
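
For reviewers, a quick sanity check on the leaf-value change above (not part of the patch; the leaf counts below are made up for illustration). Elasticsearch computes class probabilities with softmax over the stored leaf values, while scikit-learn divides the class counts at a leaf by their sum. Storing `log(count)` makes the two agree, since `softmax(log c)_i = exp(log c_i) / Σ exp(log c_j) = c_i / Σ c_j`; the `-10000000` sentinel stands in for `log(0)`, so zero counts underflow to zero probability:

```python
import math

# Hypothetical class counts at one decision-tree leaf
leaf_counts = [3.0, 0.0, 7.0]

# scikit-learn: normalize counts by their sum
normalized = [c / sum(leaf_counts) for c in leaf_counts]

# Serialized leaf values, as in the patch: log of each count,
# with a large negative sentinel for counts <= 0
leaf_values = [-10000000 if c <= 0 else math.log(c) for c in leaf_counts]

# Elasticsearch: softmax over the stored leaf values
exps = [math.exp(v) for v in leaf_values]  # math.exp(-10000000) underflows to 0.0
softmaxed = [e / sum(exps) for e in exps]

print(normalized)  # [0.3, 0.0, 0.7]
print(softmaxed)   # [0.3, 0.0, 0.7], up to floating-point rounding
```

Mathematically the two are identical for positive counts, so the "slightly different" probabilities mentioned in the docstring presumably come from the log/exp round trip in floating point and from the zero-count sentinel.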