Mirror of https://github.com/elastic/eland.git, synced 2025-07-11 00:02:14 +08:00
[ML] fixes decision tree classifier upload to account for probabilities (#465)
This switches our sklearn.DecisionTreeClassifier serialization logic to account for multi-valued leaves in the tree. The key difference between our inference and DecisionTreeClassifier is that we run a softMax over the leaf values, whereas sklearn simply normalizes them. This means that the "probabilities" we return will differ slightly from sklearn's.
Parent: 5bbb8e484a
Commit: fa30246937
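The serializer reconciles the two by storing log values in the leaves (see the sklearn.py hunk below): softMax inverts the logarithm, so softMax(log(c))_i = c_i / sum_j(c_j), which is exactly sklearn's per-leaf normalization. A minimal sketch of that identity (the leaf counts are invented for illustration):

```python
import numpy as np

def softmax(x):
    # Shift by the max for numerical stability before exponentiating.
    e = np.exp(x - np.max(x))
    return e / e.sum()

# Hypothetical per-class sample counts stored in one decision-tree leaf.
leaf_counts = np.array([3.0, 12.0, 5.0])

# sklearn's predict_proba for this leaf: plain normalization.
sklearn_probs = leaf_counts / leaf_counts.sum()

# Elasticsearch applies softmax to the stored leaf values, so serializing
# log(count) instead of the raw count reproduces sklearn's result.
es_probs = softmax(np.log(leaf_counts))

print(np.allclose(sklearn_probs, es_probs))  # True
```

In exact arithmetic the identity holds exactly; the small differences the commit message warns about come from the zero-count sentinel in the serializer (shown in the second hunk below) and from floating-point rounding.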
@@ -269,6 +269,9 @@ class MLModel:
         model: An instance of a supported python model. We support the following model types:
             - sklearn.tree.DecisionTreeClassifier
+                - NOTE: When calculating the probabilities of a given classification label, Elasticsearch utilizes
+                  softMax. SKLearn instead normalizes the results. We try to account for this during model
+                  serialization, but probabilities may be slightly different in the predictions.
             - sklearn.tree.DecisionTreeRegressor
             - sklearn.ensemble.RandomForestRegressor
             - sklearn.ensemble.RandomForestClassifier
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
 
+import math
 from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union
 
 import numpy as np
@@ -86,8 +86,12 @@ class SKLearnTransformer(ModelTransformer):
         ):  # classification requires more than one value, so assume regression
             leaf_value = [float(value[0][0])]
         else:
-            # the classification value, which is the index of the largest value
-            leaf_value = [float(np.argmax(value))]
+            # the classification value
+            # DecisionTreeClassifiers simply use normalize (dividing predicted values by sum)
+            # We use softMax, to get our probabilities as close as possible, store log value here
+            leaf_value = [
+                -10000000 if n <= 0 else math.log(float(n)) for n in value[0]
+            ]
         return TreeNode(
             node_index,
             decision_type=self._node_decision_type,
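One detail in the new branch: a class with zero samples in a leaf would need log(0), which math.log cannot produce (it raises ValueError), so the serializer stores -10000000 as a stand-in for negative infinity; after softMax, that class's probability underflows to exactly zero. A small illustration:

```python
import math

# Per-class sample counts for a leaf in which class 0 never occurred.
leaf_counts = [0.0, 12.0, 5.0]

# The substitution from the hunk above: a huge negative constant replaces log(0).
log_values = [-10000000 if n <= 0 else math.log(float(n)) for n in leaf_counts]
print(log_values)  # [-10000000, 2.4849066497880004, 1.6094379124341003]
```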
@@ -73,7 +73,7 @@ def random_rows(data, size):
     return data[np.random.randint(data.shape[0], size=size), :]
 
 
-def check_prediction_equality(es_model, py_model, test_data):
+def check_prediction_equality(es_model: MLModel, py_model, test_data):
     # Get some test results
     test_results = py_model.predict(np.asarray(test_data))
     es_results = es_model.predict(test_data)
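Since probabilities can drift slightly (per the NOTE above), a tolerance-based comparison is the natural shape for this helper. A sketch of how the body might continue (the diff shows only its opening lines; the tolerance is an assumption, not taken from the repo):

```python
import numpy as np

def check_prediction_equality(es_model, py_model, test_data):
    # Predictions from the local sklearn model and from the uploaded model.
    test_results = py_model.predict(np.asarray(test_data))
    es_results = es_model.predict(test_data)
    # decimal=2 is an assumed tolerance, loose enough to absorb the
    # softMax-vs-normalization drift described in the commit message.
    np.testing.assert_almost_equal(test_results, es_results, decimal=2)
```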
@@ -131,14 +131,25 @@ class TestMLModel:
 
     @requires_sklearn
     @pytest.mark.parametrize("compress_model_definition", [True, False])
-    def test_decision_tree_classifier(self, compress_model_definition):
+    @pytest.mark.parametrize("multi_class", [True, False])
+    def test_decision_tree_classifier(self, compress_model_definition, multi_class):
         # Train model
-        training_data = datasets.make_classification(n_features=5)
+        training_data = (
+            datasets.make_classification(
+                n_features=7,
+                n_classes=3,
+                n_clusters_per_class=2,
+                n_informative=6,
+                n_redundant=1,
+            )
+            if multi_class
+            else datasets.make_classification(n_features=7)
+        )
         classifier = DecisionTreeClassifier()
         classifier.fit(training_data[0], training_data[1])
 
         # Serialise the models to Elasticsearch
-        feature_names = ["f0", "f1", "f2", "f3", "f4"]
+        feature_names = ["f0", "f1", "f2", "f3", "f4", "f5", "f6"]
         model_id = "test_decision_tree_classifier"
 
         es_model = MLModel.import_model(
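Putting the pieces together, the round trip this test exercises looks roughly like the following; the Elasticsearch endpoint and the import_model keyword arguments are assumptions for illustration, not copied from this diff:

```python
import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

from eland.ml import MLModel

# A multi-class training set matching the parametrized test above.
X, y = datasets.make_classification(
    n_features=7,
    n_classes=3,
    n_clusters_per_class=2,
    n_informative=6,
    n_redundant=1,
)
classifier = DecisionTreeClassifier()
classifier.fit(X, y)

# Serialize and upload the fitted tree (endpoint assumed).
es_model = MLModel.import_model(
    "http://localhost:9200",
    model_id="test_decision_tree_classifier",
    model=classifier,
    feature_names=["f0", "f1", "f2", "f3", "f4", "f5", "f6"],
)

# Class predictions should agree; per-class probabilities may differ
# slightly because of the softMax-vs-normalization difference.
np.testing.assert_array_equal(classifier.predict(X), es_model.predict(X))
```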