mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
I updated the tree serialization format for the new scikit-learn versions. I also updated the minimum requirement of scikit-learn to 1.3 to ensure compatibility. Fixes #555
223 lines
8.1 KiB
Python
223 lines
8.1 KiB
Python
# Licensed to Elasticsearch B.V. under one or more contributor
|
|
# license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright
|
|
# ownership. Elasticsearch B.V. licenses this file to you under
|
|
# the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from typing import Any, Dict
|
|
|
|
import numpy as np
|
|
|
|
from .._optional import import_optional_dependency
|
|
|
|
import_optional_dependency("sklearn", on_version="warn")
|
|
|
|
import sklearn
|
|
from sklearn.preprocessing import FunctionTransformer
|
|
|
|
|
|
class Tree:
    """Wrapper to create sklearn Tree objects from Elastic ML tree
    description in JSON format.
    """

    def __init__(
        self,
        json_tree: Dict[str, Any],
        feature_names_map: Dict[str, int],
    ) -> None:
        """Deserialize an Elastic ML JSON tree into an sklearn ``Tree``.

        Parameters
        ----------
        json_tree:
            Elastic ML tree description; must contain ``"tree_structure"``
            (a list of node objects indexed by ``"node_index"``) and
            ``"feature_names"``.
        feature_names_map:
            Maps a feature name to its column index in the model-wide
            feature matrix.
        """
        # Sentinel for "no child": leaves keep this value in the children
        # arrays, and _compute_expectations() stops recursing on it.
        tree_leaf = -1

        node_count = len(json_tree["tree_structure"])
        children_left = np.ones((node_count,), dtype=int) * tree_leaf
        children_right = np.ones((node_count,), dtype=int) * tree_leaf
        # -2 marks "no split here" for leaf nodes in the feature/threshold
        # arrays (sklearn's undefined-node convention).
        feature = np.ones((node_count,), dtype=int) * -2
        threshold = np.ones((node_count,), dtype=float) * -2
        impurity = np.zeros((node_count,), dtype=float)
        # value works only for regression and binary classification
        # (shape is (n_nodes, n_outputs=1, n_classes=1)).
        value = np.zeros((node_count, 1, 1), dtype="<f8")
        n_node_samples = np.zeros((node_count,), dtype=int)

        # parse values from the JSON tree
        feature_names = json_tree["feature_names"]
        for json_node in json_tree["tree_structure"]:
            node_id = json_node["node_index"]
            # "number_samples" is optional in the JSON; default to 0.
            if "number_samples" in json_node:
                n_node_samples[node_id] = json_node["number_samples"]
            else:
                n_node_samples[node_id] = 0

            if "leaf_value" not in json_node:
                # Internal (split) node: wire up children and split params.
                children_left[node_id] = json_node["left_child"]
                children_right[node_id] = json_node["right_child"]
                # "split_feature" indexes this tree's local feature_names;
                # translate to the global column index via the map.
                feature[node_id] = feature_names_map[
                    feature_names[json_node["split_feature"]]
                ]
                threshold[node_id] = json_node["threshold"]
                if "split_gain" in json_node:
                    impurity[node_id] = json_node["split_gain"]
                else:
                    # -1 signals "gain unknown" when the JSON omits it.
                    impurity[node_id] = -1
            else:
                # Leaf node: only its output value is recorded.
                value[node_id, 0, 0] = json_node["leaf_value"]

        # iterate through tree to get max depth and expected values
        weighted_n_node_samples = n_node_samples.copy()
        self.max_depth = Tree._compute_expectations(
            children_left=children_left,
            children_right=children_right,
            node_sample_weight=weighted_n_node_samples,
            values=value,
            node_index=0,
        )
        self.n_outputs = value.shape[-1]

        # initialize the sklearn tree
        # NOTE(review): positional args appear to be (n_features,
        # n_classes-per-output array, n_outputs) of the internal
        # sklearn.tree._tree.Tree ctor — verify against the pinned
        # scikit-learn version.
        self.tree = sklearn.tree._tree.Tree(
            len(feature_names), np.array([1], dtype=int), 1
        )
        # One structured record per node, in the exact field order and
        # formats sklearn's Tree.__setstate__ expects. The trailing
        # "missing_go_to_left" (u1) field is required by the node dtype of
        # scikit-learn >= 1.3; every node defaults it to True here.
        node_state = np.array(
            [
                (
                    children_left[i],
                    children_right[i],
                    feature[i],
                    threshold[i],
                    impurity[i],
                    n_node_samples[i],
                    weighted_n_node_samples[i],
                    True,
                )
                for i in range(node_count)
            ],
            dtype={
                "names": [
                    "left_child",
                    "right_child",
                    "feature",
                    "threshold",
                    "impurity",
                    "n_node_samples",
                    "weighted_n_node_samples",
                    "missing_go_to_left",
                ],
                "formats": ["<i8", "<i8", "<i8", "<f8", "<f8", "<i8", "<f8", "u1"],
            },
        )
        state = {
            "max_depth": self.max_depth,
            "node_count": node_count,
            "nodes": node_state,
            "values": value,
        }
        # Populate the C-level tree directly from the assembled state.
        self.tree.__setstate__(state)

    @staticmethod
    def _compute_expectations(
        children_left, children_right, node_sample_weight, values, node_index
    ) -> int:
        """Recursively fill internal-node entries of ``values`` with the
        sample-weight-averaged values of their children, and return the
        depth of the subtree rooted at ``node_index`` (0 for a leaf).

        Mutates ``values`` in place; leaves (detected by
        ``children_right[node_index] == -1``) keep their stored value.
        """
        if children_right[node_index] == -1:
            return 0

        left_index = children_left[node_index]
        right_index = children_right[node_index]
        depth_left = Tree._compute_expectations(
            children_left, children_right, node_sample_weight, values, left_index
        )
        depth_right = Tree._compute_expectations(
            children_left, children_right, node_sample_weight, values, right_index
        )
        left_weight = node_sample_weight[left_index]
        right_weight = node_sample_weight[right_index]

        # Weighted average of the children's values; fall back to 0 when
        # both subtrees carry zero samples to avoid division by zero.
        v = (
            (
                left_weight * values[left_index, :]
                + right_weight * values[right_index, :]
            )
            / (left_weight + right_weight)
            if left_weight + right_weight > 0
            else 0
        )
        values[node_index, :] = v
        return max(depth_left, depth_right) + 1
class TargetMeanEncoder(FunctionTransformer):
    """FunctionTransformer implementation of the target mean encoder, which is
    deserialized from the Elastic ML preprocessor description in JSON format.
    """

    def __init__(self, preprocessor: Dict[str, Any]):
        """Build the encoder from an Elastic ML preprocessor description.

        Parameters
        ----------
        preprocessor:
            JSON object with a ``"target_mean_encoding"`` entry containing
            ``"target_map"``, ``"feature_name"``, ``"field"`` and
            ``"default_value"``.
        """
        self.preprocessor = preprocessor
        target_map = self.preprocessor["target_mean_encoding"]["target_map"]
        feature_name_out = self.preprocessor["target_mean_encoding"]["feature_name"]
        self.field_name_in = self.preprocessor["target_mean_encoding"]["field"]
        fallback_value = self.preprocessor["target_mean_encoding"]["default_value"]

        def func(column):
            # JSON object keys are always strings, so look categories up by
            # their string form. (Previously the membership test used the raw
            # category while the lookup used str(category), so non-string
            # categories could never match and always hit the fallback.)
            return np.array(
                [target_map.get(str(category), fallback_value) for category in column]
            ).reshape(-1, 1)

        def feature_names_out(ft, carr):
            # Replace the input field's name with the encoded feature's name;
            # all other column names pass through unchanged.
            return [feature_name_out if c == self.field_name_in else c for c in carr]

        super().__init__(func=func, feature_names_out=feature_names_out)
class FrequencyEncoder(FunctionTransformer):
    """FunctionTransformer implementation of the frequency encoder, which is
    deserialized from the Elastic ML preprocessor description in JSON format.
    """

    def __init__(self, preprocessor: Dict[str, Any]):
        """Build the encoder from an Elastic ML preprocessor description.

        Parameters
        ----------
        preprocessor:
            JSON object with a ``"frequency_encoding"`` entry containing
            ``"frequency_map"``, ``"feature_name"`` and ``"field"``.
        """
        self.preprocessor = preprocessor
        frequency_map = self.preprocessor["frequency_encoding"]["frequency_map"]
        feature_name_out = self.preprocessor["frequency_encoding"]["feature_name"]
        self.field_name_in = self.preprocessor["frequency_encoding"]["field"]
        # Unknown categories encode to frequency 0.
        fallback_value = 0.0

        def func(column):
            # JSON object keys are always strings, so look categories up by
            # their string form. (Previously the membership test used the raw
            # category while the lookup used str(category), so non-string
            # categories could never match and always hit the fallback.)
            return np.array(
                [
                    frequency_map.get(str(category), fallback_value)
                    for category in column
                ]
            ).reshape(-1, 1)

        def feature_names_out(ft, carr):
            # Replace the input field's name with the encoded feature's name;
            # all other column names pass through unchanged.
            return [feature_name_out if c == self.field_name_in else c for c in carr]

        super().__init__(func=func, feature_names_out=feature_names_out)
class OneHotEncoder(sklearn.preprocessing.OneHotEncoder):
    """Wrapper for sklearn one-hot encoder, which is deserialized from the
    Elastic ML preprocessor description in JSON format.
    """

    def __init__(self, preprocessor: Dict[str, Any]):
        """Configure the sklearn encoder from the ``"one_hot_encoding"``
        entry of *preprocessor*: the encoded field name and the known
        categories (keys of ``"hot_map"``); unknown categories are ignored.
        """
        self.preprocessor = preprocessor
        encoding_config = self.preprocessor["one_hot_encoding"]
        self.field_name_in = encoding_config["field"]
        # One category list per encoded column (a single column here).
        self.cats = [list(encoding_config["hot_map"].keys())]
        super().__init__(categories=self.cats, handle_unknown="ignore")
|