Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions include/xgboost/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1698,6 +1698,34 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config,
bst_ulong *out_n_features, char const ***out_features,
bst_ulong *out_dim, bst_ulong const **out_shape,
float const **out_scores);

/**
 * @brief Get per-tree weights for leaf similarity computation.
 *
 * @param handle Handle to the booster object.
 * @param config A JSON string with the following format:
 *
 *     {
 *       "weight_type": str,
 *       "iteration_begin": int,
 *       "iteration_end": int
 *     }
 *
 *   - weight_type: A JSON string with following possible values:
 *       * 'uniform': assign equal weight to each tree.
 *       * 'gain': sum split gain for each tree.
 *       * 'cover': sum split cover for each tree.
 *   - iteration_begin: Beginning iteration used when extracting tree weights.
 *   - iteration_end: End iteration used when extracting tree weights. 0 means
 *     using all remaining iterations.
 *
 * @param out_len Length of output tree weight array.
 * @param out_weights Pointer to the output tree weight array. The memory is
 *                    owned by the booster and is invalidated by the next call
 *                    into the library on the same thread.
 *
 * @return 0 when success, -1 when failure happens
 */
XGB_DLL int XGBoosterGetLeafSimilarityWeights(BoosterHandle handle, const char *config,
bst_ulong *out_len,
float const **out_weights);
/**@}*/ // End of Booster

/**
Expand Down
5 changes: 5 additions & 0 deletions include/xgboost/gbm.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,11 @@ class GradientBooster : public Model, public Configurable {
common::Span<int32_t const> trees,
std::vector<bst_feature_t>* features,
std::vector<float>* scores) const = 0;

/**
 * @brief Compute one weight per tree for leaf-similarity computation.
 *
 * @param weight_type One of "uniform", "gain" or "cover".
 * @param iteration_begin Beginning iteration (layer) of the tree range.
 * @param iteration_end End iteration; 0 means all remaining iterations.
 * @param weights Output vector, resized to one entry per selected tree.
 *                Not defined for non-tree boosters (implementations may abort).
 */
virtual void LeafSimilarityWeights(std::string const& weight_type,
bst_layer_t iteration_begin,
bst_layer_t iteration_end,
std::vector<float>* weights) const = 0;
/**
* @brief Getter for categories.
*/
Expand Down
5 changes: 5 additions & 0 deletions include/xgboost/learner.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,11 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
std::vector<bst_feature_t>* features,
std::vector<float>* scores) = 0;

/**
 * @brief Calculate per-tree weights used for leaf similarity.
 *
 * @param weight_type One of "uniform", "gain" or "cover".
 * @param iteration_begin Beginning iteration (layer) of the tree range.
 * @param iteration_end End iteration; 0 means all remaining iterations.
 * @param weights Output vector, one entry per selected tree.
 */
virtual void CalcLeafSimilarityWeights(std::string const& weight_type,
bst_layer_t iteration_begin,
bst_layer_t iteration_end,
std::vector<float>* weights) = 0;

/*
* \brief Get number of boosted rounds from gradient booster.
*/
Expand Down
121 changes: 121 additions & 0 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2981,6 +2981,127 @@ def inplace_predict(
"Data type:" + str(type(data)) + " not supported by inplace prediction."
)

def compute_leaf_similarity(
    self,
    data: DMatrix,
    reference: DMatrix,
    weight_type: str = "uniform",
) -> np.ndarray:
    """Compute similarity between observations based on leaf node co-occurrence.

    Two samples are similar if they land in the same leaf nodes across trees.
    This is similar to Random Forest proximity matrices.

    Parameters
    ----------
    data :
        Query dataset (m samples).
    reference :
        Reference dataset (n samples).
    weight_type :
        How to weight trees: "uniform" (equal tree weights), "gain"
        (by loss improvement), or "cover" (by hessian sum, approximately
        sample count for regression).

    Returns
    -------
    similarity : ndarray of shape (m, n)
        Similarity scores in [0, 1].
    """
    if weight_type not in ("uniform", "gain", "cover"):
        raise ValueError(
            "weight_type must be 'uniform', 'gain', or 'cover', "
            f"got '{weight_type}'"
        )

    config = json.loads(self.save_config())["learner"]
    booster = config["gradient_booster"]["name"]
    if booster == "gblinear":
        raise XGBoostError(
            "Leaf similarity is only defined for tree boosters, got gblinear."
        )

    if config["learner_train_param"]["multi_strategy"] == "multi_output_tree":
        raise XGBoostError(
            "Leaf similarity does not support multi_output_tree."
        )

    # Leaf indices per sample; flatten trailing axes so each column maps to
    # exactly one tree.
    query_leaves = self.predict(data, pred_leaf=True, strict_shape=True)
    ref_leaves = self.predict(reference, pred_leaf=True, strict_shape=True)

    query_leaves = np.asarray(query_leaves, dtype=np.int64).reshape(
        query_leaves.shape[0], -1
    )
    ref_leaves = np.asarray(ref_leaves, dtype=np.int64).reshape(
        ref_leaves.shape[0], -1
    )

    m, n = query_leaves.shape[0], ref_leaves.shape[0]
    if query_leaves.shape[1] != ref_leaves.shape[1]:
        raise ValueError("Query and reference leaf predictions have different shapes.")

    n_trees = query_leaves.shape[1]
    if m == 0 or n == 0 or n_trees == 0:
        return np.zeros((m, n), dtype=np.float32)

    if weight_type == "uniform":
        weights = np.ones(n_trees, dtype=np.float32)
    else:
        # Ask the native library for per-tree gain/cover sums.
        out_len = c_bst_ulong()
        out_weights = ctypes.POINTER(ctypes.c_float)()
        _check_call(
            _LIB.XGBoosterGetLeafSimilarityWeights(
                self.handle,
                make_jcargs(
                    weight_type=weight_type,
                    iteration_begin=0,
                    iteration_end=0,
                ),
                ctypes.byref(out_len),
                ctypes.byref(out_weights),
            )
        )
        weights = ctypes2numpy(out_weights, out_len.value, np.float32)
        if weights.shape[0] != n_trees:
            raise ValueError(
                "Tree weight count does not match leaf prediction shape: "
                f"{weights.shape[0]} != {n_trees}"
            )

    # Degenerate models (e.g. stump-only trees with zero total gain) would
    # divide by zero below; fall back to uniform weights instead.
    total_weight = weights.sum()
    if total_weight == 0:
        weights = np.ones(n_trees, dtype=np.float32)
        total_weight = weights.sum()

    # Offset leaf ids per tree so every (tree, leaf) pair gets a unique
    # column in the one-hot encoding built below.
    leaf_upper = np.maximum(query_leaves.max(axis=0), ref_leaves.max(axis=0)) + 1
    offsets = np.zeros(n_trees, dtype=np.int64)
    if n_trees > 1:
        offsets[1:] = np.cumsum(leaf_upper[:-1], dtype=np.int64)

    # sqrt of the normalized weight: the inner product of two encodings then
    # sums normalized tree weights over co-occurring leaves, giving a score
    # in [0, 1].  Cast explicitly instead of passing dtype= to the ufunc.
    weight_values = np.sqrt(weights / total_weight).astype(np.float32, copy=False)
    q_cols = (query_leaves + offsets).reshape(-1)
    r_cols = (ref_leaves + offsets).reshape(-1)
    q_rows = np.repeat(np.arange(m), n_trees)
    r_rows = np.repeat(np.arange(n), n_trees)
    feature_dim = int(offsets[-1] + leaf_upper[-1])

    query_matrix = scipy.sparse.csr_matrix(
        (np.tile(weight_values, m), (q_rows, q_cols)),
        shape=(m, feature_dim),
        dtype=np.float32,
    )
    ref_matrix = scipy.sparse.csr_matrix(
        (np.tile(weight_values, n), (r_rows, r_cols)),
        shape=(n, feature_dim),
        dtype=np.float32,
    )

    similarity = query_matrix @ ref_matrix.T
    return similarity.toarray()

def save_model(self, fname: PathLike) -> None:
"""Save the model to a file.

Expand Down
23 changes: 23 additions & 0 deletions src/c_api/c_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2023,3 +2023,26 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *config,
*out_features = dmlc::BeginPtr(feature_names_c);
API_END();
}

XGB_DLL int XGBoosterGetLeafSimilarityWeights(BoosterHandle handle, char const *config,
                                              bst_ulong *out_len,
                                              float const **out_weights) {
  API_BEGIN();
  CHECK_HANDLE();
  xgboost_CHECK_C_ARG_PTR(config);
  auto *learner = static_cast<Learner *>(handle);

  // Parse the JSON configuration; all three fields are mandatory.
  auto parsed = Json::Load(StringView{config});
  auto type = RequiredArg<String>(parsed, "weight_type", __func__);
  auto begin_it = RequiredArg<Integer>(parsed, "iteration_begin", __func__);
  auto end_it = RequiredArg<Integer>(parsed, "iteration_end", __func__);

  // The result buffer is thread-local and owned by the learner.
  auto &ret_weights = learner->GetThreadLocal().ret_vec_float;
  learner->CalcLeafSimilarityWeights(type, begin_it, end_it, &ret_weights);

  xgboost_CHECK_C_ARG_PTR(out_len);
  xgboost_CHECK_C_ARG_PTR(out_weights);
  *out_len = ret_weights.size();
  *out_weights = dmlc::BeginPtr(ret_weights);
  API_END();
}
10 changes: 10 additions & 0 deletions src/gbm/gblinear.cc
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,16 @@ class GBLinear : public GradientBooster {
}
}

// Leaf similarity requires tree structure, which gblinear does not have.
// Parameters are intentionally unnamed (idiomatic replacement for `(void)`
// casts) since this override unconditionally aborts.
void LeafSimilarityWeights(std::string const& /*weight_type*/,
                           bst_layer_t /*iteration_begin*/,
                           bst_layer_t /*iteration_end*/,
                           std::vector<float>* /*weights*/) const override {
  LOG(FATAL) << "Leaf similarity weights are not defined for gblinear booster.";
}

protected:
void PredictBatchInternal(DMatrix *p_fmat,
std::vector<bst_float> *out_preds) {
Expand Down
39 changes: 39 additions & 0 deletions src/gbm/gbtree.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,45 @@ class GBTree : public GradientBooster {
}
}

/**
 * @brief Compute one weight per tree in the layer range for leaf similarity.
 *
 * @param weight_type "uniform" (1.0 per tree), "gain" (sum of split loss
 *        change), or "cover" (sum of split hessians).
 * @param iteration_begin Beginning layer of the tree range.
 * @param iteration_end End layer; 0 means all remaining layers.
 * @param weights Output, one entry per selected tree.
 */
void LeafSimilarityWeights(std::string const& weight_type,
                           bst_layer_t iteration_begin,
                           bst_layer_t iteration_end,
                           std::vector<float>* weights) const override {
  // Validate up front.  The previous per-node check only fired for trees
  // with at least one split, so an invalid type slipped through silently
  // for stump-only models or an empty tree range.
  if (weight_type != "uniform" && weight_type != "gain" && weight_type != "cover") {
    LOG(FATAL) << "Unknown leaf similarity weight type, expected one of: "
               << R"({"uniform", "gain", "cover"}, got: )" << weight_type;
  }

  auto [tree_begin, tree_end] = detail::LayerToTree(model_, iteration_begin, iteration_end);
  weights->clear();
  weights->reserve(tree_end - tree_begin);

  bool const use_gain = weight_type == "gain";
  auto const get_weight = [&](RegTree const& tree) {
    CHECK(!tree.IsMultiTarget()) << "Leaf similarity weights for multi-target tree "
                                 << MTNotImplemented();
    if (weight_type == "uniform") {
      return 1.0f;  // No need to walk the tree.
    }

    // Sum the chosen statistic over all internal (split) nodes.
    tree::ScalarTreeView view{&tree};
    float weight = 0.0f;
    for (bst_node_t nidx = 0; nidx < view.Size(); ++nidx) {
      if (view.IsLeaf(nidx)) {
        continue;
      }
      weight += use_gain ? view.LossChg(nidx) : view.SumHess(nidx);
    }
    return weight;
  };

  for (bst_tree_t tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
    auto const& tree = *model_.trees.at(tree_idx);
    weights->push_back(get_weight(tree));
  }
}

[[nodiscard]] CatContainer const* Cats() const override { return this->model_.Cats(); }

void PredictLeaf(DMatrix* p_fmat,
Expand Down
10 changes: 10 additions & 0 deletions src/learner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1266,6 +1266,16 @@ class LearnerImpl : public LearnerIO {
gbm_->FeatureScore(importance_type, trees, features, scores);
}

// Compute per-tree weights ("uniform"/"gain"/"cover") for leaf similarity
// by delegating to the configured gradient booster.
void CalcLeafSimilarityWeights(std::string const& weight_type,
bst_layer_t iteration_begin,
bst_layer_t iteration_end,
std::vector<float>* weights) override {
// Apply any pending hyper-parameters and ensure a model exists before
// touching the booster.
this->Configure();
this->CheckModelInitialized();

// Non-tree boosters (gblinear) abort here with a fatal error.
gbm_->LeafSimilarityWeights(weight_type, iteration_begin, iteration_end, weights);
}

const std::map<std::string, std::string>& GetConfigurationArguments() const override {
return cfg_;
}
Expand Down
Loading
Loading