[ML] fixes decision tree classifier upload to account for probabilities (#465)

This switches our sklearn.DecisionTreeClassifier serialization logic to account for multi-valued leaves in the tree.

The key difference between our inference and DecisionTreeClassifier is that we run a softMax over the leaf values, whereas sklearn simply normalizes the results.

This means that the "probabilities" we return may differ slightly from sklearn's.
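For intuition, here is a minimal sketch (plain numpy with hypothetical leaf counts, not eland code) of why storing log values lets the softMax reproduce sklearn's normalization:

```python
import numpy as np

# Hypothetical class counts at a single tree leaf.
counts = np.array([3.0, 1.0, 6.0])

# sklearn's predict_proba: plain normalization of the leaf counts.
sklearn_probs = counts / counts.sum()

# Elasticsearch applies softMax to the stored leaf values. If we store
# log(count), then softmax(log(x)) == x / sum(x), so the two agree.
stored = np.log(counts)
es_probs = np.exp(stored) / np.exp(stored).sum()

assert np.allclose(sklearn_probs, es_probs)
```

The identity softmax(log(x)) == x / sum(x) only holds when every count is strictly positive; zero counts get a sentinel value in the serialization change below.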
Benjamin Trent 2022-05-17 08:11:20 -04:00 committed by GitHub
parent 5bbb8e484a
commit fa30246937

3 changed files with 25 additions and 7 deletions


@@ -269,6 +269,9 @@ class MLModel:
         model: An instance of a supported python model. We support the following model types:
             - sklearn.tree.DecisionTreeClassifier
+              - NOTE: When calculating the probabilities of a given classification label, Elasticsearch utilizes
+                softMax. SKLearn instead normalizes the results. We try to account for this during model
+                serialization, but probabilities may be slightly different in the predictions.
             - sklearn.tree.DecisionTreeRegressor
             - sklearn.ensemble.RandomForestRegressor
             - sklearn.ensemble.RandomForestClassifier


@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import math
 from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union
 import numpy as np
@@ -86,8 +86,12 @@ class SKLearnTransformer(ModelTransformer):
         ):  # classification requires more than one value, so assume regression
             leaf_value = [float(value[0][0])]
         else:
-            # the classification value, which is the index of the largest value
-            leaf_value = [float(np.argmax(value))]
+            # the classification value
+            # DecisionTreeClassifier simply normalizes (divides the predicted values by their sum)
+            # We apply softMax, so to keep the probabilities as close as possible we store the log of each value
+            leaf_value = [
+                -10000000 if n <= 0 else math.log(float(n)) for n in value[0]
+            ]
         return TreeNode(
             node_index,
             decision_type=self._node_decision_type,
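The `-10000000` above is a stand-in for log(0), which is undefined; after the softMax it contributes exp(-10000000), which underflows to zero. A small standalone sketch of the transformation, assuming plain numpy and hypothetical counts:

```python
import math
import numpy as np

def to_leaf_value(counts):
    # Mirror of the serialization logic above: log of each class count,
    # with a large negative sentinel where log(0) would be undefined.
    return [-10000000 if n <= 0 else math.log(float(n)) for n in counts]

counts = [0.0, 2.0, 8.0]  # a leaf that never saw class 0
leaf_value = np.array(to_leaf_value(counts))

# The softMax Elasticsearch applies over the stored leaf values.
probs = np.exp(leaf_value) / np.exp(leaf_value).sum()
print(probs)  # ~[0.0, 0.2, 0.8], matching counts / sum(counts)
```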


@@ -73,7 +73,7 @@ def random_rows(data, size):
     return data[np.random.randint(data.shape[0], size=size), :]
-def check_prediction_equality(es_model, py_model, test_data):
+def check_prediction_equality(es_model: MLModel, py_model, test_data):
     # Get some test results
     test_results = py_model.predict(np.asarray(test_data))
     es_results = es_model.predict(test_data)
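Since the softMax round-trip can leave tiny numerical differences, any check on the returned probabilities needs a tolerance rather than exact equality. A hypothetical helper in the same spirit (`check_proba_equality` and its `atol` default are assumptions, not the repo's actual code):

```python
import numpy as np

def check_proba_equality(es_probs, py_probs, atol=1e-6):
    # Hypothetical: probabilities from the two backends should agree
    # only up to numerical tolerance, never bit-for-bit.
    np.testing.assert_allclose(
        np.asarray(es_probs), np.asarray(py_probs), atol=atol
    )
```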
@@ -131,14 +131,25 @@ class TestMLModel:
     @requires_sklearn
     @pytest.mark.parametrize("compress_model_definition", [True, False])
-    def test_decision_tree_classifier(self, compress_model_definition):
+    @pytest.mark.parametrize("multi_class", [True, False])
+    def test_decision_tree_classifier(self, compress_model_definition, multi_class):
         # Train model
-        training_data = datasets.make_classification(n_features=5)
+        training_data = (
+            datasets.make_classification(
+                n_features=7,
+                n_classes=3,
+                n_clusters_per_class=2,
+                n_informative=6,
+                n_redundant=1,
+            )
+            if multi_class
+            else datasets.make_classification(n_features=7)
+        )
         classifier = DecisionTreeClassifier()
         classifier.fit(training_data[0], training_data[1])
         # Serialise the models to Elasticsearch
-        feature_names = ["f0", "f1", "f2", "f3", "f4"]
+        feature_names = ["f0", "f1", "f2", "f3", "f4", "f5", "f6"]
         model_id = "test_decision_tree_classifier"
         es_model = MLModel.import_model(