[ML] fixes decision tree classifier upload to account for probabilities (#465)
This switches our sklearn.DecisionTreeClassifier serialization logic to account for multi-valued leaves in the tree. The key difference between our inference and DecisionTreeClassifier is that we run a softMax over the leaf values, whereas sklearn simply normalizes them. This means the "probabilities" we return will differ slightly from sklearn's.
parent 5bbb8e484a · commit fa30246937
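For context, here is a minimal sketch (not part of this commit) of the two probability schemes. sklearn's DecisionTreeClassifier normalizes the per-class sample counts stored at a leaf, while Elasticsearch applies a softMax to the serialized leaf values; serializing log(count) makes the softMax reproduce the normalized ratios almost exactly, since exp(log(n)) == n:

    import math
    import numpy as np

    leaf_counts = np.array([3.0, 1.0, 0.0])  # per-class sample counts at one leaf

    # sklearn: plain normalization of the counts
    sklearn_proba = leaf_counts / leaf_counts.sum()  # [0.75, 0.25, 0.0]

    # Elasticsearch: softMax over the serialized leaf values. Storing
    # log(count), with a large negative sentinel for zero counts, makes
    # softMax recover the same ratios (up to floating-point error).
    stored = np.array(
        [-10000000.0 if n <= 0 else math.log(n) for n in leaf_counts]
    )
    es_proba = np.exp(stored) / np.exp(stored).sum()  # ~[0.75, 0.25, 0.0]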
@@ -269,6 +269,9 @@ class MLModel:
         model: An instance of a supported python model. We support the following model types:
             - sklearn.tree.DecisionTreeClassifier
+              - NOTE: When calculating the probabilities of a given classification label, Elasticsearch utilizes
+                softMax. SKLearn instead normalizes the results. We try to account for this during model
+                serialization, but probabilities may be slightly different in the predictions.
             - sklearn.tree.DecisionTreeRegressor
             - sklearn.ensemble.RandomForestRegressor
             - sklearn.ensemble.RandomForestClassifier
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
+import math
 from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union

 import numpy as np
@@ -86,8 +86,12 @@ class SKLearnTransformer(ModelTransformer):
         ):  # classification requires more than one value, so assume regression
             leaf_value = [float(value[0][0])]
         else:
-            # the classification value, which is the index of the largest value
-            leaf_value = [float(np.argmax(value))]
+            # the classification value
+            # DecisionTreeClassifiers simply use normalize (dividing predicted values by sum)
+            # We use softMax, to get our probabilities as close as possible, store log value here
+            leaf_value = [
+                -10000000 if n <= 0 else math.log(float(n)) for n in value[0]
+            ]
         return TreeNode(
             node_index,
             decision_type=self._node_decision_type,
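A quick sanity check (again, not from the repo) that this serialization round-trips through a softMax back to sklearn's normalized probabilities, with zero-count classes pushed to effectively zero probability by the -10000000 sentinel:

    import math
    import numpy as np

    def serialize_leaf(value):
        # mirrors the comprehension in the diff above
        return [-10000000 if n <= 0 else math.log(float(n)) for n in value[0]]

    def softmax(v):
        e = np.exp(np.asarray(v) - np.max(v))  # shift by the max for stability
        return e / e.sum()

    value = np.array([[5.0, 2.0, 0.0]])  # counts as sklearn stores them at a leaf
    assert np.allclose(softmax(serialize_leaf(value)), value[0] / value[0].sum())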
@@ -73,7 +73,7 @@ def random_rows(data, size):
     return data[np.random.randint(data.shape[0], size=size), :]


-def check_prediction_equality(es_model, py_model, test_data):
+def check_prediction_equality(es_model: MLModel, py_model, test_data):
     # Get some test results
     test_results = py_model.predict(np.asarray(test_data))
     es_results = es_model.predict(test_data)
@@ -131,14 +131,25 @@ class TestMLModel:

     @requires_sklearn
     @pytest.mark.parametrize("compress_model_definition", [True, False])
-    def test_decision_tree_classifier(self, compress_model_definition):
+    @pytest.mark.parametrize("multi_class", [True, False])
+    def test_decision_tree_classifier(self, compress_model_definition, multi_class):
         # Train model
-        training_data = datasets.make_classification(n_features=5)
+        training_data = (
+            datasets.make_classification(
+                n_features=7,
+                n_classes=3,
+                n_clusters_per_class=2,
+                n_informative=6,
+                n_redundant=1,
+            )
+            if multi_class
+            else datasets.make_classification(n_features=7)
+        )
         classifier = DecisionTreeClassifier()
         classifier.fit(training_data[0], training_data[1])

         # Serialise the models to Elasticsearch
-        feature_names = ["f0", "f1", "f2", "f3", "f4"]
+        feature_names = ["f0", "f1", "f2", "f3", "f4", "f5", "f6"]
         model_id = "test_decision_tree_classifier"

         es_model = MLModel.import_model(