[ML] fixes decision tree classifier upload to account for probabilities (#465)

This switches our sklearn.DecisionTreeClassifier serialization logic to account for multi-valued leaves in the tree.

The key difference between our inference and DecisionTreeClassifier is that we run a softMax over the leaf values, whereas sklearn simply normalizes the results.

This means that the "probabilities" we return may differ slightly from sklearn's.
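For intuition, here is a minimal sketch (plain numpy with hypothetical leaf counts, not eland code) of why storing log values lets the softMax reproduce sklearn's normalization:

```python
import numpy as np

# Hypothetical class counts at a single tree leaf.
counts = np.array([3.0, 1.0, 6.0])

# sklearn's predict_proba: plain normalization of the leaf counts.
sklearn_probs = counts / counts.sum()

# Elasticsearch applies softMax to the stored leaf values. If we store
# log(count), then softmax(log(x)) == x / sum(x), so the two agree.
stored = np.log(counts)
es_probs = np.exp(stored) / np.exp(stored).sum()

assert np.allclose(sklearn_probs, es_probs)
```

The identity softmax(log(x)) == x / sum(x) only holds when every count is strictly positive; zero counts get a sentinel value in the serialization change below.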
Benjamin Trent 2022-05-17 08:11:20 -04:00 committed by GitHub
parent 5bbb8e484a
commit fa30246937

3 changed files with 25 additions and 7 deletions


@@ -269,6 +269,9 @@ class MLModel:
         model: An instance of a supported python model. We support the following model types:
             - sklearn.tree.DecisionTreeClassifier
+              - NOTE: When calculating the probabilities of a given classification label, Elasticsearch utilizes
+                softMax. SKLearn instead normalizes the results. We try to account for this during model
+                serialization, but probabilities may be slightly different in the predictions.
             - sklearn.tree.DecisionTreeRegressor
             - sklearn.ensemble.RandomForestRegressor
             - sklearn.ensemble.RandomForestClassifier


@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import math
 from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union
 import numpy as np
@@ -86,8 +86,12 @@ class SKLearnTransformer(ModelTransformer):
         ):  # classification requires more than one value, so assume regression
             leaf_value = [float(value[0][0])]
         else:
-            # the classification value, which is the index of the largest value
-            leaf_value = [float(np.argmax(value))]
+            # the classification value
+            # DecisionTreeClassifier simply normalizes (divides the predicted values by their sum)
+            # We apply softMax, so to keep the probabilities as close as possible we store the log of each value
+            leaf_value = [
+                -10000000 if n <= 0 else math.log(float(n)) for n in value[0]
+            ]
         return TreeNode(
             node_index,
             decision_type=self._node_decision_type,
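The `-10000000` above is a stand-in for log(0), which is undefined; after the softMax it contributes exp(-10000000), which underflows to zero. A small standalone sketch of the transformation, assuming plain numpy and hypothetical counts:

```python
import math
import numpy as np

def to_leaf_value(counts):
    # Mirror of the serialization logic above: log of each class count,
    # with a large negative sentinel where log(0) would be undefined.
    return [-10000000 if n <= 0 else math.log(float(n)) for n in counts]

counts = [0.0, 2.0, 8.0]  # a leaf that never saw class 0
leaf_value = np.array(to_leaf_value(counts))

# The softMax Elasticsearch applies over the stored leaf values.
probs = np.exp(leaf_value) / np.exp(leaf_value).sum()
print(probs)  # ~[0.0, 0.2, 0.8], matching counts / sum(counts)
```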


@@ -73,7 +73,7 @@ def random_rows(data, size):
     return data[np.random.randint(data.shape[0], size=size), :]
-def check_prediction_equality(es_model, py_model, test_data):
+def check_prediction_equality(es_model: MLModel, py_model, test_data):
     # Get some test results
     test_results = py_model.predict(np.asarray(test_data))
     es_results = es_model.predict(test_data)
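Since the softMax round-trip can leave tiny numerical differences, any check on the returned probabilities needs a tolerance rather than exact equality. A hypothetical helper in the same spirit (`check_proba_equality` and its `atol` default are assumptions, not the repo's actual code):

```python
import numpy as np

def check_proba_equality(es_probs, py_probs, atol=1e-6):
    # Hypothetical: probabilities from the two backends should agree
    # only up to numerical tolerance, never bit-for-bit.
    np.testing.assert_allclose(
        np.asarray(es_probs), np.asarray(py_probs), atol=atol
    )
```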
@@ -131,14 +131,25 @@ class TestMLModel:
     @requires_sklearn
     @pytest.mark.parametrize("compress_model_definition", [True, False])
-    def test_decision_tree_classifier(self, compress_model_definition):
+    @pytest.mark.parametrize("multi_class", [True, False])
+    def test_decision_tree_classifier(self, compress_model_definition, multi_class):
         # Train model
-        training_data = datasets.make_classification(n_features=5)
+        training_data = (
+            datasets.make_classification(
+                n_features=7,
+                n_classes=3,
+                n_clusters_per_class=2,
+                n_informative=6,
+                n_redundant=1,
+            )
+            if multi_class
+            else datasets.make_classification(n_features=7)
+        )
         classifier = DecisionTreeClassifier()
         classifier.fit(training_data[0], training_data[1])
         # Serialise the models to Elasticsearch
-        feature_names = ["f0", "f1", "f2", "f3", "f4"]
+        feature_names = ["f0", "f1", "f2", "f3", "f4", "f5", "f6"]
         model_id = "test_decision_tree_classifier"
         es_model = MLModel.import_model(