[ML] Add support for multi:softmax|softprob XGBClassifier

Benjamin Trent 2020-08-06 13:04:10 -04:00 committed by GitHub
parent 5c901e8f1b
commit efb9e3b4c4
5 changed files with 59 additions and 11 deletions


@@ -19,7 +19,7 @@ import base64
 import gzip
 import json
 from abc import ABC
-from typing import Sequence, Dict, Any, Optional
+from typing import Sequence, Dict, Any, Optional, List


 def add_if_exists(d: Dict[str, Any], k: str, v: Any) -> None:
@@ -69,7 +69,7 @@ class TreeNode:
         right_child: Optional[int] = None,
         split_feature: Optional[int] = None,
         threshold: Optional[float] = None,
-        leaf_value: Optional[float] = None,
+        leaf_value: Optional[List[float]] = None,
     ):
         self._node_idx = node_idx
         self._decision_type = decision_type
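
The type change above widens leaf_value from a single float to a vector, so regression/binary leaves and multi-class leaves share one representation. A minimal illustration (not part of the commit):

    from typing import List, Optional

    # regression or binary leaf: a single value
    binary_leaf: Optional[List[float]] = [0.73]
    # multi-class leaf: one slot per class; only this tree's class is non-zero
    multiclass_leaf: Optional[List[float]] = [0.0, 0.0, 1.2]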


@@ -60,7 +60,17 @@ class ImportedMLModel(MLModel):
         - sklearn.ensemble.RandomForestRegressor
         - sklearn.ensemble.RandomForestClassifier
         - xgboost.XGBClassifier
+            - only the following objectives are supported:
+                - "binary:logistic"
+                - "binary:hinge"
+                - "multi:softmax"
+                - "multi:softprob"
         - xgboost.XGBRegressor
+            - only the following objectives are supported:
+                - "reg:squarederror"
+                - "reg:linear"
+                - "reg:squaredlogerror"
+                - "reg:logistic"
     feature_names: List[str]
         Names of the features (required)
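
A hedged usage sketch of importing a model under the newly supported multi-class objectives; the ImportedMLModel argument order, the Elasticsearch endpoint, and the model_id below are assumptions for illustration, not confirmed by this commit:

    from sklearn import datasets
    from xgboost import XGBClassifier
    from eland.ml import ImportedMLModel

    X, y = datasets.make_classification(n_features=5, n_classes=3, n_informative=3)
    model = XGBClassifier(objective="multi:softmax")
    model.fit(X, y)

    es_model = ImportedMLModel(
        "localhost:9200",               # Elasticsearch client/endpoint (assumed)
        "my-xgb-multiclass",            # model_id (hypothetical)
        model,
        feature_names=["f0", "f1", "f2", "f3", "f4"],
        overwrite=True,
    )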


@@ -79,10 +79,10 @@ class SKLearnTransformer(ModelTransformer):
             if (
                 value.shape[1] == 1
             ):  # classification requires more than one value, so assume regression
-                leaf_value = float(value[0][0])
+                leaf_value = [float(value[0][0])]
             else:
                 # the classification value, which is the index of the largest value
-                leaf_value = int(np.argmax(value))
+                leaf_value = [float(np.argmax(value))]
             return TreeNode(
                 node_index,
                 decision_type=self._node_decision_type,
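
For context on the branch above: scikit-learn decision-tree leaves store value arrays of shape (1, 1) for regressors and (1, n_classes) for classifiers, so the class index is the argmax over the per-class counts. A standalone sketch of that logic:

    import numpy as np

    reg_value = np.array([[3.25]])            # regression leaf: one value
    clf_value = np.array([[2.0, 7.0, 1.0]])   # classification leaf: class counts

    def to_leaf_value(value: np.ndarray) -> list:
        if value.shape[1] == 1:
            return [float(value[0][0])]       # regression: the value itself
        return [float(np.argmax(value))]      # classification: majority class index

    print(to_leaf_value(reg_value))   # [3.25]
    print(to_leaf_value(clf_value))   # [1.0]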


@@ -49,6 +49,7 @@ class XGBoostForestTransformer(ModelTransformer):
         self._node_decision_type = "lt"
         self._base_score = base_score
         self._objective = objective
+        self._feature_dict = dict(zip(feature_names, range(len(feature_names))))

     def get_feature_id(self, feature_id: str) -> int:
         if feature_id[0] == "f":
@@ -56,6 +57,9 @@ class XGBoostForestTransformer(ModelTransformer):
                 return int(feature_id[1:])
             except ValueError:
                 raise RuntimeError(f"Unable to interpret '{feature_id}'")
+        f_id = self._feature_dict.get(feature_id)
+        if f_id is not None:  # explicit None check so feature index 0 is not treated as missing
+            return f_id
         else:
             try:
                 return int(feature_id)
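
The dict lookup above lets models trained with real feature names (not just "f0", "f1", ...) resolve to column indices. A standalone sketch of the lookup, including why the None check matters:

    feature_names = ["age", "height", "weight"]
    feature_dict = dict(zip(feature_names, range(len(feature_names))))

    def get_feature_id(feature_id: str) -> int:
        # "f12"-style ids encode the column index directly
        if feature_id[0] == "f":
            try:
                return int(feature_id[1:])
            except ValueError:
                raise RuntimeError(f"Unable to interpret '{feature_id}'")
        # otherwise resolve a named feature via the lookup table
        f_id = feature_dict.get(feature_id)
        if f_id is not None:  # a bare `if f_id:` would wrongly reject index 0
            return f_id
        return int(feature_id)

    assert get_feature_id("age") == 0   # index 0 survives the None check
    assert get_feature_id("f2") == 2
    assert get_feature_id("1") == 1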
@@ -81,10 +85,13 @@ class XGBoostForestTransformer(ModelTransformer):
                 f"cannot determine node index or tree from '{node_id}' for tree {curr_tree}"
             )

+    def build_leaf_node(self, row: pd.Series, curr_tree: int) -> TreeNode:
+        return TreeNode(node_idx=row["Node"], leaf_value=[float(row["Gain"])])
+
     def build_tree_node(self, row: pd.Series, curr_tree: int) -> TreeNode:
         node_index = row["Node"]
         if row["Feature"] == "Leaf":
-            return TreeNode(node_idx=node_index, leaf_value=float(row["Gain"]))
+            return self.build_leaf_node(row, curr_tree)
         else:
             return TreeNode(
                 node_idx=node_index,
@@ -96,12 +103,16 @@ class XGBoostForestTransformer(ModelTransformer):
             )

     def build_tree(self, nodes: List[TreeNode]) -> Tree:
-        return Tree(feature_names=self._feature_names, tree_structure=nodes)
+        return Tree(
+            feature_names=self._feature_names,
+            tree_structure=nodes,
+            target_type=self.determine_target_type(),
+        )

     def build_base_score_stump(self) -> Tree:
         return Tree(
             feature_names=self._feature_names,
-            tree_structure=[TreeNode(0, leaf_value=self._base_score)],
+            tree_structure=[TreeNode(0, leaf_value=[self._base_score])],
         )

     def build_forest(self) -> List[Tree]:
def build_forest(self) -> List[Tree]: def build_forest(self) -> List[Tree]:
@@ -209,12 +220,30 @@ class XGBoostClassifierTransformer(XGBoostForestTransformer):
             model.objective,
             classification_labels,
         )
+        if model.classes_ is None:
+            n_estimators = model.get_params()["n_estimators"]
+            num_trees = model.get_booster().trees_to_dataframe()["Tree"].max() + 1
+            self._num_classes = num_trees // n_estimators
+        else:
+            self._num_classes = len(model.classes_)
+
+    def build_leaf_node(self, row: pd.Series, curr_tree: int) -> TreeNode:
+        if self._num_classes <= 2:
+            return super().build_leaf_node(row, curr_tree)
+        leaf_val = [0.0] * self._num_classes
+        leaf_val[curr_tree % self._num_classes] = float(row["Gain"])
+        return TreeNode(node_idx=row["Node"], leaf_value=leaf_val)

     def determine_target_type(self) -> str:
         return "classification"

     def is_objective_supported(self) -> bool:
-        return self._objective in {"binary:logistic", "binary:hinge"}
+        return self._objective in {
+            "binary:logistic",
+            "binary:hinge",
+            "multi:softmax",
+            "multi:softprob",
+        }

     def build_aggregator_output(self) -> Dict[str, Any]:
         return {"logistic_regression": {}}


@@ -226,10 +226,19 @@ class TestImportedMLModel:
     @requires_xgboost
     @pytest.mark.parametrize("compress_model_definition", [True, False])
-    def test_xgb_classifier(self, compress_model_definition):
+    @pytest.mark.parametrize("multi_class", [True, False])
+    def test_xgb_classifier(self, compress_model_definition, multi_class):
+        # test both multi-class and binary classification
+        if multi_class:
+            training_data = datasets.make_classification(
+                n_features=5, n_classes=3, n_informative=3
+            )
+            classifier = XGBClassifier(booster="gbtree", objective="multi:softmax")
+        else:
+            training_data = datasets.make_classification(n_features=5)
+            classifier = XGBClassifier(booster="gbtree")
+
         # Train model
-        training_data = datasets.make_classification(n_features=5)
-        classifier = XGBClassifier(booster="gbtree")
         classifier.fit(training_data[0], training_data[1])

         # Get some test results
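
One nuance in the test data above: make_classification enforces n_classes * n_clusters_per_class <= 2 ** n_informative, so the default n_informative=2 cannot host three classes (3 * 2 = 6 > 4) and the multi-class branch must raise it to 3:

    from sklearn import datasets

    # n_informative=3 gives 2**3 = 8 >= 3 classes * 2 clusters per class
    X, y = datasets.make_classification(n_features=5, n_classes=3, n_informative=3)
    print(sorted(set(y)))  # [0, 1, 2]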