[ML] fixes decision tree classifier upload to account for probabilities (#465)

This switches our sklearn.DecisionTreeClassifier serialization logic to account for multi-valued leaves in the tree.

The key difference between our inference and DecisionTreeClassifier is that we run a softmax over the leaf values, where sklearn simply normalizes the results.

This means that the "probabilities" we return will differ slightly from sklearn's.
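Concretely, we now store the log of each leaf value (the per-class sample counts in sklearn's tree), so Elasticsearch's softmax reproduces sklearn's normalization: softmax(log(v)) == v / sum(v) in exact arithmetic. The small remaining differences come from the clamp used for zero counts (see the transformer diff below) and floating-point rounding. A minimal sketch of the equivalence, using illustrative counts rather than eland code:

    import numpy as np

    # Hypothetical class counts at a single leaf.
    counts = np.array([3.0, 1.0, 6.0])

    # sklearn's DecisionTreeClassifier.predict_proba: plain normalization.
    sklearn_probs = counts / counts.sum()

    # Elasticsearch runs softmax over the stored leaf values; storing
    # log(count) makes softmax collapse back to the normalized counts.
    leaf_values = np.log(counts)
    es_probs = np.exp(leaf_values) / np.exp(leaf_values).sum()

    np.testing.assert_allclose(sklearn_probs, es_probs)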
Benjamin Trent 2022-05-17 08:11:20 -04:00 committed by GitHub
parent 5bbb8e484a
commit fa30246937
3 changed files with 25 additions and 7 deletions

@@ -269,6 +269,9 @@ class MLModel:
         model: An instance of a supported python model. We support the following model types:
             - sklearn.tree.DecisionTreeClassifier
+              - NOTE: When calculating the probabilities of a given classification label, Elasticsearch utilizes
+                softMax. SKLearn instead normalizes the results. We try to account for this during model
+                serialization, but probabilities may be slightly different in the predictions.
             - sklearn.tree.DecisionTreeRegressor
             - sklearn.ensemble.RandomForestRegressor
             - sklearn.ensemble.RandomForestClassifier

@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import math
 from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union
 import numpy as np
@@ -86,8 +86,12 @@ class SKLearnTransformer(ModelTransformer):
         ):  # classification requires more than one value, so assume regression
             leaf_value = [float(value[0][0])]
         else:
-            # the classification value, which is the index of the largest value
-            leaf_value = [float(np.argmax(value))]
+            # the classification value
+            # DecisionTreeClassifiers simply use normalize (dividing predicted values by sum)
+            # We use softMax, to get our probabilities as close as possible, store log value here
+            leaf_value = [
+                -10000000 if n <= 0 else math.log(float(n)) for n in value[0]
+            ]
         return TreeNode(
             node_index,
             decision_type=self._node_decision_type,
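The -10000000 sentinel stands in for log(0), which is undefined: after Elasticsearch's softmax, a class with no training samples in the leaf ends up with an effectively zero probability. A small sketch of that behavior, with illustrative counts rather than the eland implementation:

    import math

    # Hypothetical leaf with no samples of class 0.
    counts = [0.0, 2.0, 8.0]
    leaf_value = [-10000000 if n <= 0 else math.log(float(n)) for n in counts]

    # Softmax over the stored values, as Elasticsearch computes it.
    exps = [math.exp(v) for v in leaf_value]  # exp(-10000000) underflows to 0.0
    probs = [e / sum(exps) for e in exps]
    # probs == [0.0, 0.2, 0.8]: the zero-count class vanishes under softmax.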

@@ -73,7 +73,7 @@ def random_rows(data, size):
     return data[np.random.randint(data.shape[0], size=size), :]
-def check_prediction_equality(es_model, py_model, test_data):
+def check_prediction_equality(es_model: MLModel, py_model, test_data):
     # Get some test results
     test_results = py_model.predict(np.asarray(test_data))
     es_results = es_model.predict(test_data)
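Because softmax-over-logs only approximates sklearn's normalization, prediction comparisons in these tests are best made to a limited precision rather than exactly. A sketch of such a check (the two-decimal tolerance is an assumption for illustration, not necessarily what this test file uses):

    import numpy as np

    def check_prediction_equality(es_model, py_model, test_data):
        test_results = py_model.predict(np.asarray(test_data))
        es_results = es_model.predict(test_data)
        # Compare to two decimal places; exact equality can fail due to
        # the softmax/normalization mismatch described above.
        np.testing.assert_almost_equal(test_results, es_results, decimal=2)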
@@ -131,14 +131,25 @@ class TestMLModel:
     @requires_sklearn
     @pytest.mark.parametrize("compress_model_definition", [True, False])
-    def test_decision_tree_classifier(self, compress_model_definition):
+    @pytest.mark.parametrize("multi_class", [True, False])
+    def test_decision_tree_classifier(self, compress_model_definition, multi_class):
         # Train model
-        training_data = datasets.make_classification(n_features=5)
+        training_data = (
+            datasets.make_classification(
+                n_features=7,
+                n_classes=3,
+                n_clusters_per_class=2,
+                n_informative=6,
+                n_redundant=1,
+            )
+            if multi_class
+            else datasets.make_classification(n_features=7)
+        )
         classifier = DecisionTreeClassifier()
         classifier.fit(training_data[0], training_data[1])
         # Serialise the models to Elasticsearch
-        feature_names = ["f0", "f1", "f2", "f3", "f4"]
+        feature_names = ["f0", "f1", "f2", "f3", "f4", "f5", "f6"]
         model_id = "test_decision_tree_classifier"
         es_model = MLModel.import_model(