# Licensed to Elasticsearch B.V. under one or more contributor # license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright # ownership. Elasticsearch B.V. licenses this file to you under # the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from typing import Any, Dict import numpy as np from .._optional import import_optional_dependency import_optional_dependency("sklearn", on_version="warn") import sklearn from sklearn.preprocessing import FunctionTransformer class Tree: """Wrapper to create sklearn Tree objects from Elastic ML tree description in JSON format. """ def __init__( self, json_tree: Dict[str, Any], feature_names_map: Dict[str, int], ): tree_leaf = -1 node_count = len(json_tree["tree_structure"]) children_left = np.ones((node_count,), dtype=int) * tree_leaf children_right = np.ones((node_count,), dtype=int) * tree_leaf feature = np.ones((node_count,), dtype=int) * -2 threshold = np.ones((node_count,), dtype=float) * -2 impurity = np.zeros((node_count,), dtype=float) # value works only for regression and binary classification value = np.zeros((node_count, 1, 1), dtype=" int: if children_right[node_index] == -1: return 0 left_index = children_left[node_index] right_index = children_right[node_index] depth_left = Tree._compute_expectations( children_left, children_right, node_sample_weight, values, left_index ) depth_right = Tree._compute_expectations( children_left, children_right, node_sample_weight, values, right_index ) left_weight = node_sample_weight[left_index] right_weight = node_sample_weight[right_index] v = ( ( left_weight * values[left_index, :] + right_weight * values[right_index, :] ) / (left_weight + right_weight) if left_weight + right_weight > 0 else 0 ) values[node_index, :] = v return max(depth_left, depth_right) + 1 class TargetMeanEncoder(FunctionTransformer): """FunctionTransformer implementation of the target mean encoder, which is deserialized from the Elastic ML preprocessor description in JSON formats. """ def __init__(self, preprocessor: Dict[str, Any]): self.preprocessor = preprocessor target_map = self.preprocessor["target_mean_encoding"]["target_map"] feature_name_out = self.preprocessor["target_mean_encoding"]["feature_name"] self.field_name_in = self.preprocessor["target_mean_encoding"]["field"] fallback_value = self.preprocessor["target_mean_encoding"]["default_value"] def func(column): return np.array( [ target_map[str(category)] if category in target_map else fallback_value for category in column ] ).reshape(-1, 1) def feature_names_out(ft, carr): return [feature_name_out if c == self.field_name_in else c for c in carr] super().__init__(func=func, feature_names_out=feature_names_out) class FrequencyEncoder(FunctionTransformer): """FunctionTransformer implementation of the frequency encoder, which is deserialized from the Elastic ML preprocessor description in JSON format. """ def __init__(self, preprocessor: Dict[str, Any]): self.preprocessor = preprocessor frequency_map = self.preprocessor["frequency_encoding"]["frequency_map"] feature_name_out = self.preprocessor["frequency_encoding"]["feature_name"] self.field_name_in = self.preprocessor["frequency_encoding"]["field"] fallback_value = 0.0 def func(column): return np.array( [ frequency_map[str(category)] if category in frequency_map else fallback_value for category in column ] ).reshape(-1, 1) def feature_names_out(ft, carr): return [feature_name_out if c == self.field_name_in else c for c in carr] super().__init__(func=func, feature_names_out=feature_names_out) class OneHotEncoder(sklearn.preprocessing.OneHotEncoder): """Wrapper for sklearn one-hot encoder, which is deserialized from the Elastic ML preprocessor description in JSON format. """ def __init__(self, preprocessor: Dict[str, Any]): self.preprocessor = preprocessor self.field_name_in = self.preprocessor["one_hot_encoding"]["field"] self.cats = [list(self.preprocessor["one_hot_encoding"]["hot_map"].keys())] super().__init__(categories=self.cats, handle_unknown="ignore")