Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ class MetaInfo {
[[nodiscard]] CatContainer const* Cats() const;
[[nodiscard]] CatContainer* Cats();
[[nodiscard]] std::shared_ptr<CatContainer const> CatsShared() const;
[[nodiscard]] std::shared_ptr<CatContainer> CatsShared();
/**
* @brief Setter for categories.
*/
Expand Down Expand Up @@ -726,6 +727,7 @@ class DMatrix {
[[nodiscard]] std::shared_ptr<CatContainer const> CatsShared() const {
  // Read-only flavour: forward to the const CatsShared() of whatever Info() returns.
  auto const& info = this->Info();
  return info.CatsShared();
}
[[nodiscard]] std::shared_ptr<CatContainer> CatsShared() {
  // Mutable flavour: forward to the non-const CatsShared() of whatever Info() returns.
  auto&& info = this->Info();
  return info.CatsShared();
}

protected:
virtual BatchSet<SparsePage> GetRowBatches() = 0;
Expand Down
1 change: 1 addition & 0 deletions ops/conda_env/aarch64_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies:
- llvmlite
- loky>=3.5.1
- pyarrow
- polars
- pyspark>=4.0.0
- cloudpickle
- pip:
Expand Down
1 change: 1 addition & 0 deletions ops/conda_env/macos_cpu_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,5 @@ dependencies:
- awscli
- loky>=3.5.1
- pyarrow
- polars
- cloudpickle
1 change: 1 addition & 0 deletions ops/conda_env/win64_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ dependencies:
- py-ubjson
- loky>=3.5.1
- pyarrow
- polars
13 changes: 13 additions & 0 deletions src/common/categorical.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,19 @@ XGBOOST_DEVICE bst_cat_t AsCat(T const& v) {
return static_cast<bst_cat_t>(v);
}

/**
 * @brief Storage needed by a CatBitField in which bit @p max_code is the last valid bit.
 *
 * The bit count is @p max_code + 1.  The addition is performed in @c std::size_t rather
 * than in @c bst_cat_t (int32_t), so a code close to INT32_MAX cannot hit signed-overflow
 * UB.
 *
 * @param max_code Largest valid category code; checked to be non-negative.
 *
 * @return Storage size in @c CatBitField::value_type units.
 */
[[nodiscard]] inline std::size_t SizeCatBitsForMaxCode(bst_cat_t max_code) {
  CHECK_GE(max_code, 0);
  auto const n_bits = static_cast<std::size_t>(max_code) + 1;
  return CatBitField::ComputeStorageSize(n_bits);
}

/* \brief Whether is fidx a categorical feature.
*
* \param ft Feature type for all features.
Expand Down
16 changes: 10 additions & 6 deletions src/common/quantile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -533,14 +533,18 @@ void AddCategories(std::set<float> const &categories, float *max_cat, HistogramC
InvalidCategory();
}
auto &cut_values = cuts->cut_values_.HostVector();
// With column-wise data split, the categories may be empty.
auto feature_max_cat =
categories.empty() ? 0.0f : *std::max_element(categories.cbegin(), categories.cend());
if (categories.empty()) {
// column-wise split: emit a placeholder cut and treat the synthetic 0.0f as the
// observed max so downstream sizing (evaluator.cu MaxCategory()+1) does not see -1
cut_values.push_back(0.0f);
*max_cat = std::max(*max_cat, 0.0f);
return;
}
auto feature_max_cat = *std::max_element(categories.cbegin(), categories.cend());
CheckMaxCat(feature_max_cat, categories.size());
*max_cat = std::max(*max_cat, feature_max_cat);
for (bst_cat_t i = 0; i <= AsCat(feature_max_cat); ++i) {
cut_values.push_back(i);
}
// one cut per observed code; categories is sorted ascending
cut_values.insert(cut_values.end(), categories.cbegin(), categories.cend());
}

HistogramCuts HostSketchContainer::MakeCuts(Context const *ctx, MetaInfo const &info) {
Expand Down
27 changes: 17 additions & 10 deletions src/common/quantile.cu
Original file line number Diff line number Diff line change
Expand Up @@ -705,16 +705,23 @@ HistogramCuts SketchContainer::MakeCuts(Context const *ctx, bool is_column_split
auto column = Span<SketchEntry const>{h_entries.data() + begin, end - begin};

if (IsCat(h_feature_types, i)) {
auto column_size = std::max(static_cast<std::size_t>(1), column.size());
auto feature_max = column.empty() ? 0.0f : column.back().value;
if (std::any_of(column.cbegin(), column.cend(),
[](auto const &entry) { return InvalidCat(entry.value); })) {
InvalidCategory();
}
CheckMaxCat(feature_max, column_size);
max_cat = std::max(max_cat, feature_max);
for (std::size_t cat = 0; cat <= static_cast<std::size_t>(feature_max); ++cat) {
h_out_cut_values.push_back(cat);
if (column.empty()) {
// column-split worker with no rows: emit a placeholder cut and treat the
// synthetic 0.0f as observed max so MaxCategory() is never -1
h_out_cut_values.push_back(0.0f);
max_cat = std::max(max_cat, 0.0f);
} else {
auto feature_max = column.back().value;
if (std::any_of(column.cbegin(), column.cend(),
[](auto const &entry) { return InvalidCat(entry.value); })) {
InvalidCategory();
}
CheckMaxCat(feature_max, column.size());
max_cat = std::max(max_cat, feature_max);
// one cut per observed physical code; column sorted ascending
for (auto const &entry : column) {
h_out_cut_values.push_back(entry.value);
}
}
} else {
summary.Reserve(column.size());
Expand Down
11 changes: 7 additions & 4 deletions src/data/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@

namespace xgboost::data {
namespace {
auto GetRefCats(Json handle) {
auto cats = reinterpret_cast<CatContainer const*>(get<Integer const>(handle));
// Pair the owning ref CatContainer pointer with its host view
[[nodiscard]] std::pair<CatContainer*, enc::HostColumnsView> GetRefCats(Json handle) {
auto cats = reinterpret_cast<CatContainer*>(get<Integer const>(handle));
CHECK(cats);
auto h_cats = cats->HostView();
return h_cats;
return {cats, h_cats};
}
} // anonymous namespace

Expand All @@ -32,7 +33,9 @@ ColumnarAdapter::ColumnarAdapter(StringView columns) {

if (IsA<Object>(jdf)) {
// Has reference categories.
this->ref_cats_ = GetRefCats(jdf["ref_categories"]);
auto [ref_cats_ptr, ref_cats_view] = GetRefCats(jdf["ref_categories"]);
this->ref_cats_ptr_ = ref_cats_ptr;
this->ref_cats_ = ref_cats_view;
jdf = jdf["columns"];
}

Expand Down
54 changes: 46 additions & 8 deletions src/data/adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
#include <cstdint> // for uint8_t
#include <limits> // for numeric_limits
#include <memory> // for unique_ptr, make_unique
#include <utility> // for move
#include <mutex> // for once_flag, call_once
#include <utility> // for move, forward
#include <variant> // for variant
#include <vector> // for vector

Expand Down Expand Up @@ -437,6 +438,11 @@ using EncColumnarAdapterBatch = EncColumnarAdapterBatchImpl<CatAccessor>;
class ColumnarAdapter : public detail::SingleBatchDataIter<ColumnarAdapterBatch> {
std::vector<ArrayInterface<1>> columns_;
enc::HostColumnsView ref_cats_;
// non-owning; pointee outlives the adapter (caller keeps the ref DMatrix alive)
CatContainer* ref_cats_ptr_{nullptr};
// cached Recode mapping; write-once via CachedRefMapping()
mutable std::once_flag cache_once_;
mutable std::vector<std::int32_t> cached_ref_mapping_;
std::vector<enc::HostCatIndexView> cats_;
std::vector<std::int32_t> cat_segments_;
ColumnarAdapterBatch batch_;
Expand All @@ -454,6 +460,12 @@ class ColumnarAdapter : public detail::SingleBatchDataIter<ColumnarAdapterBatch>
*/
explicit ColumnarAdapter(StringView columns);

// non-copyable and non-movable (owns std::once_flag)
ColumnarAdapter(ColumnarAdapter const&) = delete;
ColumnarAdapter& operator=(ColumnarAdapter const&) = delete;
ColumnarAdapter(ColumnarAdapter&&) = delete;
ColumnarAdapter& operator=(ColumnarAdapter&&) = delete;

[[nodiscard]] ColumnarAdapterBatch const& Value() const override {
  // Hands out a reference to the batch_ member; no copy is made.
  return this->batch_;
}

[[nodiscard]] bst_idx_t NumRows() const {
Expand All @@ -474,18 +486,44 @@ class ColumnarAdapter : public detail::SingleBatchDataIter<ColumnarAdapterBatch>
static_cast<std::int32_t>(this->cat_segments_.back())};
}
[[nodiscard]] enc::HostColumnsView RefCats() const {
  // Returned by value: a copy of the stored view of the reference categories.
  return ref_cats_;
}
// Non-owning handle to the reference CatContainer; the pointee is expected to outlive
// the adapter.  The pointee type is deliberately non-const even though this accessor is
// const, so that dispatchers holding a const adapter can still call Sort() on it.
[[nodiscard]] CatContainer* RefCatsPtr() const {
  return ref_cats_ptr_;
}
[[nodiscard]] common::Span<ArrayInterface<1> const> Columns() const {
  // Read-only span over the adapter's column array interfaces.
  return columns_;
}

/**
 * @brief Return the cached Recode mapping, building it on first use.
 *
 * The cache is filled through @c std::call_once: the first call that runs @p builder to
 * completion stores its result, and every later call returns that stored result without
 * invoking its own @p builder.
 *
 * @param builder Callable taking no arguments whose result is assigned to the cache.
 *
 * @warning The returned span aliases adapter storage; lifetime <= adapter.
 */
template <typename Fn>
[[nodiscard]] common::Span<std::int32_t const> CachedRefMapping(Fn&& builder) const {
  auto fill_cache = [this, &builder] {
    this->cached_ref_mapping_ = std::forward<Fn>(builder)();
  };
  std::call_once(this->cache_once_, fill_cache);
  return common::Span<std::int32_t const>{this->cached_ref_mapping_};
}
};

inline auto MakeEncColumnarBatch(Context const* ctx, ColumnarAdapter const* adapter) {
auto cats = std::make_unique<CatContainer>(adapter->RefCats(), true);
cats->Sort(ctx);
auto [acc, mapping] = cpu_impl::MakeCatAccessor(ctx, adapter->Cats(), cats.get());
return std::tuple{EncColumnarAdapterBatch{adapter->Columns(), acc}, std::move(mapping)};
/**
 * @brief Assemble the encoding-aware batch for a columnar adapter.
 *
 * When the adapter carries a pointer to a reference CatContainer, that container is
 * sorted and a Recode mapping from the adapter's categories to it is obtained through
 * the adapter's cache.  Without a reference container the batch gets a
 * default-constructed CatAccessor.
 *
 * @param ctx     Context forwarded to Sort() and MakeCatAccessor().
 * @param adapter Adapter to wrap; dereferenced unconditionally, so it must be non-null.
 *
 * @return Batch over the adapter's columns.  NOTE(review): with a reference container
 *         the accessor views the mapping cached inside the adapter, so the batch
 *         presumably must not outlive the adapter -- confirm against callers.
 */
inline EncColumnarAdapterBatch MakeEncColumnarBatch(Context const* ctx,
                                                    ColumnarAdapter const* adapter) {
  auto* ref = adapter->RefCatsPtr();
  if (ref == nullptr) {
    // No reference container: the adapter must not carry a reference view either.
    CHECK(!adapter->HasRefCategorical())
        << "ColumnarAdapter has reference categorical view but no CatContainer pointer.";
    return EncColumnarAdapterBatch{adapter->Columns(), CatAccessor{}};
  }

  // The reference dictionary is aliased rather than copied.  Sort() is requested on
  // every call; it is idempotent under sort_mu_, so repeats are harmless.
  ref->Sort(ctx);

  // Only the mapping half of MakeCatAccessor's result is kept.  Structured bindings are
  // not implicitly moved on return, hence the explicit std::move.
  auto build_mapping = [&ctx, &adapter, &ref] {
    [[maybe_unused]] auto [accessor, recode] =
        cpu_impl::MakeCatAccessor(ctx, adapter->Cats(), ref);
    return std::move(recode);
  };
  auto mapping = adapter->CachedRefMapping(build_mapping);

  auto view = enc::MappingView{adapter->Cats().feature_segments, mapping};
  return EncColumnarAdapterBatch{adapter->Columns(), CatAccessor{view}};
}

inline auto MakeEncColumnarBatch(Context const* ctx,
std::shared_ptr<ColumnarAdapter> const& adapter) {
inline EncColumnarAdapterBatch MakeEncColumnarBatch(
    Context const* ctx, std::shared_ptr<ColumnarAdapter> const& adapter) {
  // Convenience overload: unwrap the shared_ptr and defer to the raw-pointer overload.
  auto* raw = adapter.get();
  return MakeEncColumnarBatch(ctx, raw);
}

Expand Down
66 changes: 66 additions & 0 deletions src/data/cat_container.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <algorithm> // for copy
#include <cstddef> // for size_t
#include <memory> // for make_unique
#include <mutex> // for lock_guard, scoped_lock
#include <utility> // for move
#include <vector> // for vector

Expand All @@ -16,6 +17,37 @@
#include "xgboost/json.h" // for Json

namespace xgboost {
namespace {
// Sanity-check the offsets buffer of an Arrow string dictionary before it is copied.
// Offsets that do not start at 0, are negative, decrease, or run past the values buffer
// would make SortNames form out-of-bounds substrings and break the strict-weak-ordering
// stable_sort relies on.  An empty offsets buffer is accepted as-is.
void ValidateCatStrArrayOffsets(enc::CatStrArrayView const& str) {
  auto const& offsets = str.offsets;
  auto const n_offsets = offsets.size();
  if (n_offsets == 0) {
    return;
  }

  // Appended to every failure message below.
  constexpr auto kHint =
      " The producing dataframe library is emitting inconsistent Arrow data; update it"
      " to the latest version.";

  CHECK_EQ(offsets.front(), 0)
      << "Malformed Arrow categorical dictionary: offsets[0] must be 0." << kHint;

  for (std::size_t idx = 0; idx < n_offsets; ++idx) {
    auto const cur = offsets[idx];
    CHECK_GE(cur, 0)
        << "Malformed Arrow categorical dictionary: offsets[" << idx << "] = " << cur
        << " is negative." << kHint;

    auto const next_idx = idx + 1;
    if (next_idx == n_offsets) {
      break;  // the final entry has no successor to compare against
    }
    CHECK_LE(cur, offsets[next_idx])
        << "Malformed Arrow categorical dictionary: offsets not monotonic at i=" << idx
        << "." << kHint;
  }

  // The final offset marks the end of the last string; it must lie inside the values.
  auto last = static_cast<std::size_t>(offsets.back());
  CHECK_LE(last, str.values.size())
      << "Malformed Arrow categorical dictionary: last offset " << last
      << " exceeds values buffer size " << str.values.size() << "." << kHint;
}
} // namespace

CatContainer::CatContainer(enc::HostColumnsView const& df, bool is_ref) : CatContainer{} {
this->is_ref_ = is_ref;
this->n_total_cats_ = df.n_total_cats;
Expand All @@ -30,6 +62,7 @@ CatContainer::CatContainer(enc::HostColumnsView const& df, bool is_ref) : CatCon
for (auto const& col : df.columns) {
std::visit(enc::Overloaded{
[this](enc::CatStrArrayView str) {
ValidateCatStrArrayOffsets(str);
using T = typename cpu_impl::ViewToStorageImpl<enc::CatStrArrayView>::Type;
this->cpu_impl_->columns.emplace_back();
this->cpu_impl_->columns.back().emplace<T>();
Expand Down Expand Up @@ -116,6 +149,8 @@ struct PrimToUbj<double> {
} // anonymous namespace

void CatContainer::Save(Json* p_out) const {
// serializes the full container snapshot against Sort()/Copy()
std::lock_guard guard{sort_mu_};
[[maybe_unused]] auto _ = this->HostView();
auto& out = *p_out;

Expand Down Expand Up @@ -166,6 +201,9 @@ void CatContainer::Save(Json* p_out) const {
out["sorted_idx"] = std::move(jsorted_index);
out["feature_segments"] = std::move(jf_segments);
out["enc"] = arr;
// persist is_ref_ and sorted_; optional fields for back-compat with pre-field models
out["is_ref"] = Boolean{this->is_ref_};
out["sorted"] = Boolean{this->sorted_};
}

namespace {
Expand All @@ -187,6 +225,8 @@ void LoadJson(Json jvalues, Vec* p_out) {
} // namespace

void CatContainer::Load(Json const& in) {
// serializes the full container snapshot against Sort()/Copy()
std::lock_guard guard{sort_mu_};
auto array = get<Array const>(in["enc"]);
auto n_features = array.size();

Expand Down Expand Up @@ -266,6 +306,19 @@ void CatContainer::Load(Json const& in) {
auto& h_sorted_idx = this->sorted_idx_.HostVector();
LoadJson<std::int32_t>(in["sorted_idx"], &h_sorted_idx);

// back-compat: missing fields default to is_ref=false, sorted=!sorted_idx.empty()
auto const& obj = get<Object const>(in);
if (auto it = obj.find("is_ref"); it != obj.cend()) {
this->is_ref_ = get<Boolean const>(it->second);
} else {
this->is_ref_ = false;
}
if (auto it = obj.find("sorted"); it != obj.cend()) {
this->sorted_ = get<Boolean const>(it->second);
} else {
this->sorted_ = !h_sorted_idx.empty();
}

this->cpu_impl_->Finalize();
}

Expand All @@ -275,6 +328,12 @@ CatContainer::CatContainer() : cpu_impl_{std::make_unique<cpu_impl::CatContainer
CatContainer::~CatContainer() = default;

void CatContainer::Copy(Context const* ctx, CatContainer const& that) {
if (&that == this) {
return;
}
// scoped_lock serializes concurrent a.Copy(b)+b.Copy(a); this->device_mu_ guards
// destination writes against a concurrent this->HostView() on another thread
std::scoped_lock guard{this->sort_mu_, that.sort_mu_, this->device_mu_};
[[maybe_unused]] auto h_view = that.HostView();
this->CopyCommon(ctx, that);
this->cpu_impl_->Copy(that.cpu_impl_.get());
Expand All @@ -290,9 +349,16 @@ void CatContainer::Copy(Context const* ctx, CatContainer const& that) {

// Non-CUDA build: compute the sorted index over the category names.  Does nothing when
// the container has no categorical data or has already been sorted.
void CatContainer::Sort(Context const* ctx) {
  CHECK(ctx->IsCPU());
  // sort_mu_ serializes Sort() against Copy().  HasCategorical() reads n_total_cats_,
  // which Copy() writes while holding sort_mu_, so the checks must come after the lock.
  std::lock_guard guard{sort_mu_};
  if (!this->HasCategorical()) {
    return;
  }
  if (this->sorted_) {
    return;
  }

  auto h_cats = this->HostView();
  auto& h_sorted_idx = this->sorted_idx_.HostVector();
  h_sorted_idx.resize(h_cats.n_total_cats);
  enc::SortNames(enc::Policy<EncErrorPolicy>{}, h_cats, this->sorted_idx_.HostSpan());
  this->sorted_ = true;
}
#endif // !defined(XGBOOST_USE_CUDA)

Expand Down
Loading